├── LICENSE.md ├── README.md ├── data └── foxdata.txt ├── howto ├── README.md ├── download_install_run_spark.md └── minimize_verbosity.md ├── images ├── Data-Algorithms-with-Spark_mech2.pdf ├── Data-Algorithms-with-Spark_mech2.png ├── Data_Algorithms_with_Spark_COVER_9781492082385.png ├── data_algorithms_image.jpg ├── data_algorithms_with_spark.jpg └── pyspark_algorithms2.jpg └── tutorial ├── .DS_Store ├── add-indices └── add-indices.txt ├── basic-average └── basic-average.txt ├── basic-filter └── basic-filter.txt ├── basic-join └── basicjoin.txt ├── basic-map └── basic-map.txt ├── basic-multiply └── basic-multiply.txt ├── basic-sort └── sort-by-key.txt ├── basic-sum └── basic-sum.txt ├── basic-union └── basic-union.txt ├── bigrams └── bigrams.txt ├── cartesian └── cartesian.txt ├── combine-by-key ├── README.md ├── combine-by-key.txt ├── distributed_computing_with_spark_by_Javier_Santos_Paniego.pdf ├── spark-combineByKey.md ├── spark-combineByKey.txt └── standard_deviation_by_combineByKey.md ├── dna-basecount ├── README.md ├── basemapper.py ├── dna-basecount.md ├── dna-basecount2.md ├── dna-basecount3.md └── dna_seq.txt ├── map-partitions └── README.md ├── pyspark-examples ├── dataframes │ ├── VIDEO-DataFrames.txt │ ├── dataframe-examples.md │ ├── dataframe-session-2018-04-26.txt │ ├── dataframe-session-2018-05-15.txt │ ├── dataframe-session-2018-10-30.txt │ ├── dataframe-session-2019-02-14.txt │ ├── dataframe-session-2020-11-04.txt │ ├── dataframe-session-2021-05-12-intro.txt │ ├── dataframe-session-2022-05-12.txt │ └── dataframe-session-2022-05-19-Converting-DataFrame-to-RDD.txt └── rdds │ ├── combineByKey_example.py │ ├── count_min_max.py │ ├── groupbykey_and_reducebykey_example.ipynb │ ├── pyspark-session-2015-02-23.txt │ ├── pyspark-session-2015-03-13.txt │ ├── pyspark-session-2015-04-10.txt │ ├── pyspark-session-2018-01-18.txt │ ├── pyspark-session-2018-04-12.txt │ ├── pyspark-session-2018-10-02.txt │ ├── pyspark-session-2018-10-09.txt │ ├── pyspark-session-2019-01-22.txt │ ├── pyspark-session-2019-01-30.txt │ ├── pyspark-session-2019-04-16.txt │ ├── pyspark-session-2019-04-18.txt │ ├── pyspark-session-2019-04-26.txt │ ├── pyspark-session-2019-05-09.txt │ ├── pyspark-session-2019-10-09.txt │ ├── pyspark-session-2019-10-16.txt │ ├── pyspark-session-2020-01-22.txt │ ├── pyspark-session-2020-01-24.txt │ ├── pyspark-session-2020-02-03.txt │ ├── pyspark-session-2020-04-16.txt │ ├── pyspark-session-2020-04-23.txt │ ├── pyspark-session-2020-07-06-word-count.txt │ ├── pyspark-session-2020-10-05.txt │ ├── pyspark-session-2020-10-07.txt │ ├── pyspark-session-2020-10-12.txt │ ├── pyspark-session-2020-10-15.txt │ ├── pyspark-session-2020-10-19.txt │ ├── pyspark-session-2021-01-19.txt │ ├── pyspark-session-2021-01-21.ipynb │ ├── pyspark-session-2021-01-26.txt │ ├── pyspark-session-2021-04-12.txt │ ├── pyspark-session-2021-04-14.txt │ ├── pyspark-session-2021-04-19.txt │ ├── pyspark-session-2021-04-21-mapPartitions.txt │ ├── pyspark-session-2021-04-29-min-max-avg.txt │ ├── pyspark-session-2021-05-05-join.txt │ ├── pyspark-session-2021-10-06.txt │ ├── pyspark-session-2021-10-11-filter-map-flatMap.txt │ ├── pyspark-session-2021-10-20-understanding-partitions.txt │ ├── pyspark-session-2021-10-25-RDD-join.txt │ ├── pyspark-session-2022-04-12.txt │ ├── pyspark-session-2022-04-14-mappers-and-filters-and-reduce.txt │ ├── pyspark-session-2022-04-19-read-text-groupbykey-mapvalues-filter.txt │ ├── pyspark-session_2019-10-07.txt │ ├── pyspark-session_2020-07-01.txt │ └── understanding_partitions.txt ├── 
pyspark-udf └── pyspark_udf_maptype.txt ├── ranking ├── README.md └── ranking_functions_in_pyspark.md ├── split-function └── README.md ├── top-N └── top-N.txt └── wordcount ├── README.md ├── run_word_count.sh ├── run_word_count_ver2.sh ├── word_count.py ├── word_count_ver2.py ├── wordcount-shorthand.txt └── wordcount.txt /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright [2019] [Mahmoud Parsian] 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PySpark Tutorial 2 | 3 | * PySpark is the Python API for Spark. 4 | 5 | * The purpose of this PySpark tutorial is to present 6 | basic distributed algorithms using PySpark. 7 | 8 | * PySpark supports two types of Data Abstractions: 9 | * RDDs 10 | * DataFrames 11 | 12 | * **PySpark Interactive Mode**: provides an interactive shell 13 | (`$SPARK_HOME/bin/pyspark`) for basic testing 14 | and debugging; it is not intended for 15 | production environments. 16 | 17 | * **PySpark Batch Mode**: you may use the `$SPARK_HOME/bin/spark-submit` 18 | command for running PySpark programs (suitable for both 19 | testing and production environments). 20 | 21 | ------ 22 | 23 | # [Glossary: big data, MapReduce, Spark](https://github.com/mahmoudparsian/big-data-mapreduce-course/blob/master/slides/glossary/README.md) 24 | 25 | ------ 26 | 27 | # [Basics of PySpark with Examples](./howto/README.md) 28 | 29 | ------ 30 | 31 | # PySpark Examples and Tutorials 32 | 33 | * [PySpark Examples: RDDs](./tutorial/pyspark-examples/rdds/) 34 | * [PySpark Examples: DataFrames](./tutorial/pyspark-examples/dataframes/) 35 | * [DNA Base Counting](./tutorial/dna-basecount/README.md) 36 | * [Classic Word Count](./tutorial/wordcount) 37 | * [Find Frequency of Bigrams](./tutorial/bigrams) 38 | * [Join of Two Relations R(K, V1), S(K, V2)](./tutorial/basic-join) 39 | * [Basic Mapping of RDD Elements](./tutorial/basic-map) 40 | * [How to add all RDD elements together](./tutorial/basic-sum) 41 | * [How to multiply all RDD elements together](./tutorial/basic-multiply) 42 | * [Find Top-N and Bottom-N](./tutorial/top-N) 43 | * [Find average by using combineByKey()](./tutorial/combine-by-key) 44 | * [How to filter RDD elements](./tutorial/basic-filter) 45 | * [How to find average](./tutorial/basic-average) 46 | * [Cartesian Product: rdd1.cartesian(rdd2)](./tutorial/cartesian) 47 | * [Sort By Key: sortByKey() ascending/descending](./tutorial/basic-sort) 48 | * [How to Add Indices](./tutorial/add-indices) 49 | * [Map Partitions: mapPartitions() by Examples](./tutorial/map-partitions/README.md) 50 | * [Monoid: Design Principle](https://github.com/mahmoudparsian/data-algorithms-with-spark/blob/master/wiki-spark/docs/monoid/README.md) 51 | * [Ranking Functions by Examples](./tutorial/ranking/README.md) 52 | 53 | ------ 54 | 55 | # Books 56 | 57 | ### [Data 
Algorithms with Spark](https://github.com/mahmoudparsian/data-algorithms-with-spark/) 58 | 59 | ### [Data Algorithms](https://github.com/mahmoudparsian/data-algorithms-book/) 60 | 61 | ### [PySpark Algorithms](https://github.com/mahmoudparsian/pyspark-algorithms/) 62 | 63 | ----- 64 | 65 | # Miscellaneous 66 | 67 | ### [Download, Install Spark and Run PySpark](./howto/download_install_run_spark.md) 68 | 69 | ### [How to Minimize the Verbosity of Spark](./howto/minimize_verbosity.md) 70 | 71 | ------- 72 | 73 | # PySpark Tutorial and References... 74 | * [Getting started with PySpark - Part 1](http://www.mccarroll.net/blog/pyspark/) 75 | * [Getting started with PySpark - Part 2](http://www.mccarroll.net/blog/pyspark2/index.html) 76 | * [A really really fast introduction to PySpark](http://www.slideshare.net/hkarau/a-really-really-fast-introduction-to-py-spark-lightning-fast-cluster-computing-with-python-1) 77 | * [PySpark](http://www.slideshare.net/thegiivee/pysaprk?qid=81cf1b31-8b19-4570-89a5-21d03cad6ecd&v=default&b=&from_search=9) 78 | * [Basic Big Data Manipulation with PySpark](http://bigdatasciencebootcamp.com/posts/Part_3/basic_big_data.html) 79 | * [Working in Pyspark: Basics of Working with Data and RDDs](http://www.learnbymarketing.com/618/pyspark-rdd-basics-examples/) 80 | 81 | ------- 82 | 83 | # Questions/Comments 84 | * [View Mahmoud Parsian's profile on LinkedIn](http://www.linkedin.com/in/mahmoudparsian) 85 | * Please send me an email: mahmoud.parsian@yahoo.com 86 | * [Twitter: @mahmoudparsian](http://twitter.com/mahmoudparsian) 87 | 88 | Thank you! 89 | 90 | ```` 91 | best regards, 92 | Mahmoud Parsian 93 | ```` 94 | 95 | ----- 96 | 97 | [//]: # (metadata:) 98 | [//]: # (Spark, PySpark, Python) 99 | [//]: # (MapReduce, Distributed Algorithms, mappers, reducers, partitioners) 100 | [//]: # (Transformations, Actions, RDDs, DataFrames, SQL) 101 | -------------------------------------------------------------------------------- /data/foxdata.txt: -------------------------------------------------------------------------------- 1 | red fox jumped high 2 | fox jumped over high fence 3 | red fox jumped 4 | -------------------------------------------------------------------------------- /howto/README.md: -------------------------------------------------------------------------------- 1 | # PySpark Tutorial 2 | 3 | * Spark is a multi-language engine for executing data engineering, 4 | data science, and machine learning on single-node machines or clusters. 5 | 6 | * PySpark is the Python API for Spark. 7 | 8 | # Start PySpark 9 | 10 | First make sure that you have started the Spark cluster. 11 | To start Spark, execute the following. Note that if you 12 | are going to run the PySpark shell on your laptop/macbook, 13 | then you do not need to start any cluster -- your 14 | laptop/macbook acts as a cluster of a single node: 15 | 16 | export SPARK_HOME=<spark-install-dir> 17 | cd $SPARK_HOME 18 | ./sbin/start-all.sh 19 | 20 | 21 | To start PySpark, execute the following: 22 | 23 | 24 | cd $SPARK_HOME 25 | ./bin/pyspark 26 | 27 | 28 | Successful execution will give you the PySpark prompt: 29 | 30 | 31 | ~ % ./spark-3.3.0/bin/pyspark 32 | Python 3.10.5 (v3.10.5:f377153967, Jun 6 2022, 12:36:10) [Clang 13.0.0 (clang-1300.0.29.30)] on darwin 33 | Type "help", "copyright", "credits" or "license" for more information. 
34 | Setting default log level to "WARN". 35 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 36 | Welcome to 37 | ____ __ 38 | / __/__ ___ _____/ /__ 39 | _\ \/ _ \/ _ `/ __/ '_/ 40 | /__ / .__/\_,_/_/ /_/\_\ version 3.3.0 41 | /_/ 42 | 43 | Using Python version 3.10.5 (v3.10.5:f377153967, Jun 6 2022 12:36:10) 44 | Spark context Web UI available at http://10.0.0.232:4040 45 | Spark context available as 'sc' (master = local[*], app id = local-1656268371486). 46 | SparkSession available as 'spark'. 47 | >>> 48 | 49 | 50 | Note that the shell has already created two objects: 51 | * a SparkContext (`sc`) object, which you may use to create RDDs. 52 | * a SparkSession (`spark`) object, which you may use to create DataFrames. 53 | 54 | # Creating RDDs 55 | 56 | You may create RDDs from: 57 | * text files, 58 | * Python collections and data structures, 59 | * the local file system, 60 | * S3 and HDFS, 61 | * and other data sources. 62 | 63 | 64 | ## Create RDD from a Data Structure (or Collection) 65 | 66 | * Example-1 67 | 68 | >>> data = [1, 2, 3, 4, 5, 8, 9] 69 | >>> data 70 | [1, 2, 3, 4, 5, 8, 9] 71 | >>> myRDD = sc.parallelize(data) 72 | >>> myRDD.collect() 73 | [1, 2, 3, 4, 5, 8, 9] 74 | >>> myRDD.count() 75 | 7 76 | >>> 77 | 78 | 79 | * Example-2 80 | 81 | >>> kv = [('a',7), ('a', 2), ('b', 2), ('b',4), ('c',1), ('c',2), ('c',3), ('c',4)] 82 | >>> kv 83 | [('a', 7), ('a', 2), ('b', 2), ('b', 4), ('c', 1), ('c', 2), ('c', 3), ('c', 4)] 84 | >>> rdd2 = sc.parallelize(kv) 85 | >>> rdd2.collect() 86 | [('a', 7), ('a', 2), ('b', 2), ('b', 4), ('c', 1), ('c', 2), ('c', 3), ('c', 4)] 87 | >>> 88 | >>> rdd3 = rdd2.reduceByKey(lambda x, y : x+y) 89 | >>> rdd3.collect() 90 | [('a', 9), ('c', 10), ('b', 6)] 91 | >>> 92 | 93 | 94 | * Example-3 95 | 96 | 97 | >>> kv = [('a',7), ('a', 2), ('b', 2), ('b',4), ('c',1), ('c',2), ('c',3), ('c',4)] 98 | >>> kv 99 | [('a', 7), ('a', 2), ('b', 2), ('b', 4), ('c', 1), ('c', 2), ('c', 3), ('c', 4)] 100 | >>> rdd2 = sc.parallelize(kv) 101 | >>> rdd2.collect() 102 | [('a', 7), ('a', 2), ('b', 2), ('b', 4), ('c', 1), ('c', 2), ('c', 3), ('c', 4)] 103 | 104 | >>> rdd3 = rdd2.groupByKey() 105 | >>> rdd3.collect() 106 | [ 107 | ('a', <pyspark.resultiterable.ResultIterable object at 0x...>), 108 | ('c', <pyspark.resultiterable.ResultIterable object at 0x...>), 109 | ('b', <pyspark.resultiterable.ResultIterable object at 0x...>) 110 | ] 111 | 112 | >>> rdd3.map(lambda x : (x[0], list(x[1]))).collect() 113 | [ 114 | ('a', [7, 2]), 115 | ('c', [1, 2, 3, 4]), 116 | ('b', [2, 4]) 117 | ] 118 | >>> 119 | 120 | 121 | 122 | # Create RDD from a Local File System (Java Example) 123 | 124 | import org.apache.spark.api.java.JavaRDD; 125 | import org.apache.spark.api.java.JavaSparkContext; 126 | ... 127 | JavaSparkContext context = new JavaSparkContext(); 128 | ... 129 | final String inputPath ="file:///dir1/dir2/myinputfile.txt"; 130 | JavaRDD<String> rdd = context.textFile(inputPath); 131 | ... 132 | 133 | 134 | # Create RDD from HDFS (Java Example) 135 | 136 | * Example-1: 137 | 138 | import org.apache.spark.api.java.JavaRDD; 139 | import org.apache.spark.api.java.JavaSparkContext; 140 | ... 141 | JavaSparkContext context = new JavaSparkContext(); 142 | ... 143 | final String inputPath ="hdfs://myhadoopserver:9000/dir1/dir2/myinputfile.txt"; 144 | JavaRDD<String> rdd = context.textFile(inputPath); 145 | ... 146 | 147 | * Example-2: 148 | 149 | 150 | import org.apache.spark.api.java.JavaRDD; 151 | import org.apache.spark.api.java.JavaSparkContext; 152 | ... 153 | JavaSparkContext context = new JavaSparkContext(); 154 | ... 
155 | final String inputPath ="/dir1/dir2/myinputfile.txt"; 156 | JavaRDD<String> rdd = context.textFile(inputPath); 157 | ... 158 | 159 | 160 | # Questions/Comments 161 | 162 | * [View Mahmoud Parsian's profile on LinkedIn](http://www.linkedin.com/in/mahmoudparsian) 163 | * Please send me an email: mahmoud.parsian@yahoo.com 164 | * [Twitter: @mahmoudparsian](http://twitter.com/mahmoudparsian) 165 | 166 | 167 | Thank you! 168 | 169 | ```` 170 | best regards, 171 | Mahmoud Parsian 172 | ```` 173 | 174 | ----- 175 | -------------------------------------------------------------------------------- /howto/download_install_run_spark.md: -------------------------------------------------------------------------------- 1 | # Download, Install, and Run PySpark 2 | 3 | # 1. For macbook users: Enable "Remote Login" 4 | 5 | 6 | System Preferences --> Sharing --> enable "Remote Login" service 7 | 8 | 9 | 10 | # 2. Make Sure Java 8 is Installed Properly 11 | 12 | java -version 13 | java version "1.8.0_72" 14 | Java(TM) SE Runtime Environment (build 1.8.0_72-b15) 15 | Java HotSpot(TM) 64-Bit Server VM (build 25.72-b15, mixed mode) 16 | 17 | 18 | # 3. Download 19 | 20 | Download the latest binary Spark from the following URL: 21 | 22 | https://www.apache.org/dyn/closer.lua/spark/spark-3.3.0/spark-3.3.0-bin-hadoop3.tgz 23 | 24 | 25 | # 4. Unpack the Downloaded File 26 | 27 | Assuming that the file has been downloaded as 28 | `/home/mparsian/spark-3.3.0-bin-hadoop3.tgz`: 29 | 30 | 31 | cd /home/mparsian 32 | 33 | tar zvfx spark-3.3.0-bin-hadoop3.tgz 34 | x spark-3.3.0-bin-hadoop3/ 35 | x spark-3.3.0-bin-hadoop3/NOTICE 36 | x spark-3.3.0-bin-hadoop3/CHANGES.txt 37 | ... 38 | 39 | 40 | # 5. Start the Spark Cluster 41 | 42 | cd /home/mparsian/spark-3.3.0-bin-hadoop3/ 43 | 44 | ./sbin/start-all.sh 45 | 46 | NOTE: If you are going to run Spark on your pc/macbook/windows machine, 47 | then you do NOT need to start a cluster at all. When you invoke 48 | ./bin/pyspark, your laptop acts as a single-node cluster. 49 | 50 | 51 | # 6. Check Master and Worker 52 | 53 | Make sure that the Master and Worker processes are running: 54 | 55 | 56 | jps 57 | 1347 Master 58 | 1390 Worker 59 | 60 | 61 | # 7. Check The Spark URL 62 | 63 | http://localhost:8080 64 | 65 | 66 | # 8. Define Very Basic Python Programs 67 | 68 | * Python program: `/home/mparsian/spark-3.3.0-bin-hadoop3/test.py` 69 | 70 | #!/usr/bin/python 71 | import sys 72 | 73 | for line in sys.stdin: 74 | print("hello " + line) 75 | 76 | 77 | * Python program: `/home/mparsian/spark-3.3.0-bin-hadoop3/test2.py` 78 | 79 | #!/usr/bin/python 80 | def fun2(str): 81 | str2 = str + " zaza" 82 | return str2 83 | 84 | 85 | # 9. Start and Run pyspark 86 | 87 | cd /home/mparsian/spark-3.3.0-bin-hadoop3/ 88 | ./bin/pyspark 89 | ... 90 | ... 
91 | Welcome to 92 | ____ __ 93 | / __/__ ___ _____/ /__ 94 | _\ \/ _ \/ _ `/ __/ '_/ 95 | /__ / .__/\_,_/_/ /_/\_\ version 3.3.0 96 | /_/ 97 | 98 | >>> data = ["john","paul","george","ringo"] 99 | >>> data 100 | ['john', 'paul', 'george', 'ringo'] 101 | 102 | >>> rdd = sc.parallelize(data) 103 | >>> rdd.collect() 104 | ['john', 'paul', 'george', 'ringo'] 105 | 106 | 107 | >>> test = "/home/mparsian/spark-3.3.0-bin-hadoop3/test.py" 108 | >>> test2 = "/home/mparsian/spark-3.3.0-bin-hadoop3/test2.py" 109 | >>> import test 110 | >>> import test2 111 | 112 | 113 | >>> pipeRDD = rdd.pipe(test) 114 | >>> pipeRDD.collect() 115 | [u'hello john', u'', u'hello paul', u'', u'hello george', u'', u'hello ringo', u''] 116 | 117 | 118 | >>> rdd.collect() 119 | ['john', 'paul', 'george', 'ringo'] 120 | 121 | 122 | >>> rdd2 = rdd.map(lambda x : test2.fun2(x)) 123 | >>> rdd2.collect() 124 | ['john zaza', 'paul zaza', 'george zaza', 'ringo zaza'] 125 | >>> 126 | 127 | -------------------------------------------------------------------------------- /howto/minimize_verbosity.md: -------------------------------------------------------------------------------- 1 | How to Minimize the Verbosity of Spark 2 | ====================================== 3 | * Step-1: create a log4j.properties file 4 | ```` 5 | cp $SPARK_HOME/conf/log4j.properties.template $SPARK_HOME/conf/log4j.properties 6 | ```` 7 | * Step-2: Edit $SPARK_HOME/conf/log4j.properties file: replace "INFO" with "WARN" 8 | 9 | * Now your file should look like: 10 | ```` 11 | cat $SPARK_HOME/conf/log4j.properties 12 | # Set everything to be logged to the console 13 | log4j.rootCategory=WARN, console 14 | log4j.appender.console=org.apache.log4j.ConsoleAppender 15 | log4j.appender.console.target=System.err 16 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 17 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 18 | 19 | # Settings to quiet third party logs that are too verbose 20 | log4j.logger.org.eclipse.jetty=WARN 21 | log4j.logger.org.eclipse.jetty.util.component.AbstractLifeCycle=ERROR 22 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=WARN 23 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=WARN 24 | ```` -------------------------------------------------------------------------------- /images/Data-Algorithms-with-Spark_mech2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-tutorial/6d2df229246081db2e2087084f31fdecd0e72a5e/images/Data-Algorithms-with-Spark_mech2.pdf -------------------------------------------------------------------------------- /images/Data-Algorithms-with-Spark_mech2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-tutorial/6d2df229246081db2e2087084f31fdecd0e72a5e/images/Data-Algorithms-with-Spark_mech2.png -------------------------------------------------------------------------------- /images/Data_Algorithms_with_Spark_COVER_9781492082385.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-tutorial/6d2df229246081db2e2087084f31fdecd0e72a5e/images/Data_Algorithms_with_Spark_COVER_9781492082385.png -------------------------------------------------------------------------------- /images/data_algorithms_image.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-tutorial/6d2df229246081db2e2087084f31fdecd0e72a5e/images/data_algorithms_image.jpg -------------------------------------------------------------------------------- /images/data_algorithms_with_spark.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-tutorial/6d2df229246081db2e2087084f31fdecd0e72a5e/images/data_algorithms_with_spark.jpg -------------------------------------------------------------------------------- /images/pyspark_algorithms2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-tutorial/6d2df229246081db2e2087084f31fdecd0e72a5e/images/pyspark_algorithms2.jpg -------------------------------------------------------------------------------- /tutorial/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-tutorial/6d2df229246081db2e2087084f31fdecd0e72a5e/tutorial/.DS_Store -------------------------------------------------------------------------------- /tutorial/add-indices/add-indices.txt: -------------------------------------------------------------------------------- 1 | # ./pyspark 2 | Welcome to 3 | ____ __ 4 | / __/__ ___ _____/ /__ 5 | _\ \/ _ \/ _ `/ __/ '_/ 6 | /__ / .__/\_,_/_/ /_/\_\ version 1.4.0 7 | /_/ 8 | 9 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12) 10 | SparkContext available as sc, SQLContext available as sqlContext. 11 | >>> a = [('g1', 2), ('g2', 4), ('g3', 3), ('g4', 8)] 12 | >>> a 13 | [('g1', 2), ('g2', 4), ('g3', 3), ('g4', 8)] 14 | 15 | >>> rdd = sc.parallelize(a); 16 | >>> rdd.collect() 17 | [('g1', 2), ('g2', 4), ('g3', 3), ('g4', 8)] 18 | 19 | >>> sorted = rdd.sortByKey() 20 | >>> sorted.collect() 21 | [('g1', 2), ('g2', 4), ('g3', 3), ('g4', 8)] 22 | 23 | 24 | >>> rdd2 = rdd.map(lambda (x,y) : (y,x)) 25 | >>> rdd2.collect() 26 | [(2, 'g1'), (4, 'g2'), (3, 'g3'), (8, 'g4')] 27 | 28 | >>> sorted = rdd2.sortByKey() 29 | >>> sorted.collect() 30 | [(2, 'g1'), (3, 'g3'), (4, 'g2'), (8, 'g4')] 31 | 32 | 33 | >>> sorted = rdd2.sortByKey(False) 34 | >>> sorted.collect() 35 | [(8, 'g4'), (4, 'g2'), (3, 'g3'), (2, 'g1')] 36 | 37 | >>> sorted = rdd2.sortByKey() 38 | >>> sorted.collect() 39 | [(2, 'g1'), (3, 'g3'), (4, 'g2'), (8, 'g4')] 40 | >>> 41 | >>> list 42 | [(2, 'g1'), (3, 'g3'), (4, 'g2'), (8, 'g4')] 43 | 44 | >>> 45 | >>> sorted.collect() 46 | [(2, 'g1'), (3, 'g3'), (4, 'g2'), (8, 'g4')] 47 | 48 | >>> indices = sorted.zipWithIndex() 49 | >>> indices.collect() 50 | [((2, 'g1'), 0), ((3, 'g3'), 1), ((4, 'g2'), 2), ((8, 'g4'), 3)] 51 | >>> -------------------------------------------------------------------------------- /tutorial/basic-average/basic-average.txt: -------------------------------------------------------------------------------- 1 | # ./pyspark 2 | Python 2.6.9 (unknown, Sep 9 2014, 15:05:12) 3 | [GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.39)] on darwin 4 | Welcome to 5 | ____ __ 6 | / __/__ ___ _____/ /__ 7 | _\ \/ _ \/ _ `/ __/ '_/ 8 | /__ / .__/\_,_/_/ /_/\_\ version 1.2.0 9 | /_/ 10 | 11 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12) 12 | SparkContext available as sc. 
13 | >>> sc 14 | 15 | >>> 16 | >>> nums = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8, 20]) 17 | >>> nums.collect() 18 | [1, 2, 3, 4, 5, 6, 7, 8, 20] 19 | >>> sumAndCount = nums.map(lambda x: (x, 1)).fold((0, 0), (lambda x, y: (x[0] + y[0], x[1] + y[1]))) 20 | >>> sumAndCount 21 | (56, 9) 22 | >>> 23 | >>> avg = float(sumAndCount[0]) / float(sumAndCount[1]) 24 | >>> avg 25 | 6.2222222222222223 26 | >>> 27 | -------------------------------------------------------------------------------- /tutorial/basic-filter/basic-filter.txt: -------------------------------------------------------------------------------- 1 | # ./pyspark 2 | Python 2.6.9 (unknown, Sep 9 2014, 15:05:12) 3 | [GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.39)] on darwin 4 | Welcome to 5 | ____ __ 6 | / __/__ ___ _____/ /__ 7 | _\ \/ _ \/ _ `/ __/ '_/ 8 | /__ / .__/\_,_/_/ /_/\_\ version 1.2.0 9 | /_/ 10 | 11 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12) 12 | SparkContext available as sc. 13 | >>> sc 14 | 15 | 16 | >>> 17 | >>> nums = sc.parallelize([1, 2, 3, 4, 5, 6, 7]) 18 | >>> nums.collect() 19 | [1, 2, 3, 4, 5, 6, 7] 20 | 21 | >>> filtered1 = nums.filter(lambda x : x % 2 == 1) 22 | >>> filtered1.collect() 23 | [1, 3, 5, 7] 24 | >>> 25 | >>> filtered2 = nums.filter(lambda x : x % 2 == 0) 26 | >>> filtered2.collect() 27 | [2, 4, 6] 28 | >>> 29 | -------------------------------------------------------------------------------- /tutorial/basic-join/basicjoin.txt: -------------------------------------------------------------------------------- 1 | # cat > R.txt 2 | k1,v1 3 | k1,v2 4 | k2,v3 5 | k2,v4 6 | k3,v7 7 | k3,v8 8 | k3,v9 9 | 10 | # cat > S.txt 11 | k1,v11 12 | k1,v22 13 | k1,v33 14 | k2,v55 15 | k4,v77 16 | k5,v88 17 | 18 | # ./pyspark 19 | Python 2.6.9 (unknown, Sep 9 2014, 15:05:12) 20 | [GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.39)] on darwin 21 | Type "help", "copyright", "credits" or "license" for more information. 22 | Welcome to 23 | ____ __ 24 | / __/__ ___ _____/ /__ 25 | _\ \/ _ \/ _ `/ __/ '_/ 26 | /__ / .__/\_,_/_/ /_/\_\ version 1.2.0 27 | /_/ 28 | 29 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12) 30 | SparkContext available as sc. 
31 | >>> R = sc.textFile("R.txt"); 32 | >>> R.collect() 33 | [u'k1,v1', 34 | u'k1,v2', 35 | u'k2,v3', 36 | u'k2,v4', 37 | u'k3,v7', 38 | u'k3,v8', 39 | u'k3,v9'] 40 | 41 | >>> S = sc.textFile("S.txt"); 42 | >>> S.collect() 43 | [u'k1,v11', 44 | u'k1,v22', 45 | u'k1,v33', 46 | u'k2,v55', 47 | u'k4,v77', 48 | u'k5,v88' 49 | ] 50 | 51 | >>> r1 = R.map(lambda s: s.split(",")) 52 | >>> r1.collect() 53 | [ 54 | [u'k1', u'v1'], 55 | [u'k1', u'v2'], 56 | [u'k2', u'v3'], 57 | [u'k2', u'v4'], 58 | [u'k3', u'v7'], 59 | [u'k3', u'v8'], 60 | [u'k3', u'v9'] 61 | ] 62 | >>> r2 = r1.flatMap(lambda s: [(s[0], s[1])]) 63 | >>> r2.collect() 64 | [ 65 | (u'k1', u'v1'), 66 | (u'k1', u'v2'), 67 | (u'k2', u'v3'), 68 | (u'k2', u'v4'), 69 | (u'k3', u'v7'), 70 | (u'k3', u'v8'), 71 | (u'k3', u'v9') 72 | ] 73 | >>> 74 | >>> s1 = S.map(lambda s: s.split(",")) 75 | >>> s1.collect() 76 | [ 77 | [u'k1', u'v11'], 78 | [u'k1', u'v22'], 79 | [u'k1', u'v33'], 80 | [u'k2', u'v55'], 81 | [u'k4', u'v77'], 82 | [u'k5', u'v88'] 83 | ] 84 | >>> s2 = s1.flatMap(lambda s: [(s[0], s[1])]) 85 | >>> s2.collect() 86 | [ 87 | (u'k1', u'v11'), 88 | (u'k1', u'v22'), 89 | (u'k1', u'v33'), 90 | (u'k2', u'v55'), 91 | (u'k4', u'v77'), 92 | (u'k5', u'v88') 93 | ] 94 | >>> RjoinedS = r2.join(s2) 95 | >>> RjoinedS.collect() 96 | [ 97 | (u'k2', (u'v3', u'v55')), 98 | (u'k2', (u'v4', u'v55')), 99 | (u'k1', (u'v1', u'v11')), 100 | (u'k1', (u'v1', u'v22')), 101 | (u'k1', (u'v1', u'v33')), 102 | (u'k1', (u'v2', u'v11')), 103 | (u'k1', (u'v2', u'v22')), 104 | (u'k1', (u'v2', u'v33')) 105 | ] 106 | >>> -------------------------------------------------------------------------------- /tutorial/basic-map/basic-map.txt: -------------------------------------------------------------------------------- 1 | # ./pyspark 2 | Python 2.6.9 (unknown, Sep 9 2014, 15:05:12) 3 | Welcome to 4 | ____ __ 5 | / __/__ ___ _____/ /__ 6 | _\ \/ _ \/ _ `/ __/ '_/ 7 | /__ / .__/\_,_/_/ /_/\_\ version 1.2.0 8 | /_/ 9 | 10 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12) 11 | SparkContext available as sc. 12 | >>> sc 13 | 14 | >>> 15 | >>> nums = sc.parallelize([1, 2, 3, 4, 5]) 16 | >>> nums.collect() 17 | [1, 2, 3, 4, 5] 18 | >>> 19 | >>> bytwo = nums.map(lambda x: x + 2) 20 | >>> bytwo.collect() 21 | [3, 4, 5, 6, 7] 22 | >>> 23 | >>> squared = nums.map(lambda x: x * x) 24 | >>> squared.collect() 25 | [1, 4, 9, 16, 25] 26 | >>> 27 | -------------------------------------------------------------------------------- /tutorial/basic-multiply/basic-multiply.txt: -------------------------------------------------------------------------------- 1 | # ./pyspark 2 | Python 2.6.9 (unknown, Sep 9 2014, 15:05:12) 3 | [GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.39)] on darwin 4 | Type "help", "copyright", "credits" or "license" for more information. 5 | Welcome to 6 | ____ __ 7 | / __/__ ___ _____/ /__ 8 | _\ \/ _ \/ _ `/ __/ '_/ 9 | /__ / .__/\_,_/_/ /_/\_\ version 1.2.0 10 | /_/ 11 | 12 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12) 13 | SparkContext available as sc. 
14 | >>> sc 15 | 16 | >>> numbers = sc.parallelize([1, 2, 3, 4]) 17 | >>> mult = numbers.fold(1, (lambda x, y: x * y)) 18 | 19 | >>> mult 20 | 24 21 | -------------------------------------------------------------------------------- /tutorial/basic-sort/sort-by-key.txt: -------------------------------------------------------------------------------- 1 | # cat data.txt 2 | crazy crazy fox jumped 3 | crazy fox jumped 4 | fox is fast 5 | fox is smart 6 | dog is smart 7 | 8 | # ./bin/pyspark 9 | Welcome to 10 | ____ __ 11 | / __/__ ___ _____/ /__ 12 | _\ \/ _ \/ _ `/ __/ '_/ 13 | /__ / .__/\_,_/_/ /_/\_\ version 1.4.0 14 | /_/ 15 | 16 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12) 17 | SparkContext available as sc, SQLContext available as sqlContext. 18 | >>> 19 | >>> lines = sc.textFile('data.txt', 1); 20 | >>> lines.collect() 21 | [ 22 | u'crazy crazy fox jumped', 23 | u'crazy fox jumped', 24 | u'fox is fast', 25 | u'fox is smart', 26 | u'dog is smart' 27 | ] 28 | 29 | >>> frequencies = lines.flatMap(lambda x: x.split(' ')).map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y) 30 | >>> frequencies.collect() 31 | [ 32 | (u'crazy', 3), 33 | (u'jumped', 2), 34 | (u'is', 3), 35 | (u'fox', 4), 36 | (u'dog', 1), 37 | (u'fast', 1), 38 | (u'smart', 2) 39 | ] 40 | 41 | >>> frequencies.count() 42 | 7 43 | 44 | >>> sorted = frequencies.sortByKey() 45 | >>> sorted.collect() 46 | [ 47 | (u'crazy', 3), 48 | (u'dog', 1), 49 | (u'fast', 1), 50 | (u'fox', 4), 51 | (u'is', 3), 52 | (u'jumped', 2), 53 | (u'smart', 2) 54 | ] 55 | >>> 56 | >>> sortedDescending = frequencies.sortByKey(False) 57 | >>> sortedDescending.collect() 58 | [ 59 | (u'smart', 2), 60 | (u'jumped', 2), 61 | (u'is', 3), 62 | (u'fox', 4), 63 | (u'fast', 1), 64 | (u'dog', 1), 65 | (u'crazy', 3) 66 | ] 67 | -------------------------------------------------------------------------------- /tutorial/basic-sum/basic-sum.txt: -------------------------------------------------------------------------------- 1 | # ./pyspark 2 | Python 2.6.9 (unknown, Sep 9 2014, 15:05:12) 3 | [GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.39)] on darwin 4 | Type "help", "copyright", "credits" or "license" for more information. 5 | Welcome to 6 | ____ __ 7 | / __/__ ___ _____/ /__ 8 | _\ \/ _ \/ _ `/ __/ '_/ 9 | /__ / .__/\_,_/_/ /_/\_\ version 1.2.0 10 | /_/ 11 | 12 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12) 13 | SparkContext available as sc. 14 | >>> sc 15 | 16 | >>> numbers = sc.parallelize([1, 2, 3, 4]) 17 | >>> sum = numbers.fold(0, (lambda x, y: x + y)) 18 | 19 | >>> sum 20 | 10 21 | -------------------------------------------------------------------------------- /tutorial/basic-union/basic-union.txt: -------------------------------------------------------------------------------- 1 | # ./pyspark 2 | Welcome to 3 | ____ __ 4 | / __/__ ___ _____/ /__ 5 | _\ \/ _ \/ _ `/ __/ '_/ 6 | /__ / .__/\_,_/_/ /_/\_\ version 1.4.0 7 | /_/ 8 | 9 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12) 10 | SparkContext available as sc, SQLContext available as sqlContext. 
11 | 12 | >>> d1= [('k1', 1), ('k2', 2), ('k3', 5)] 13 | >>> d1 14 | [('k1', 1), ('k2', 2), ('k3', 5)] 15 | 16 | >>> d2= [('k1', 3), ('k2',4), ('k4', 8)] 17 | >>> d2 18 | [('k1', 3), ('k2', 4), ('k4', 8)] 19 | 20 | >>> rdd1 = sc.parallelize(d1) 21 | >>> rdd1.collect() 22 | [('k1', 1), ('k2', 2), ('k3', 5)] 23 | 24 | >>> rdd2 = sc.parallelize(d2) 25 | >>> rdd2.collect(); 26 | [('k1', 3), ('k2', 4), ('k4', 8)] 27 | 28 | >>> rdd3 = rdd1.union(rdd2) 29 | >>> rdd3.collect() 30 | [('k1', 1), ('k2', 2), ('k3', 5), ('k1', 3), ('k2', 4), ('k4', 8)] 31 | 32 | >>> rdd4 = rdd3.reduceByKey(lambda x,y: x+y) 33 | >>> rdd4.collect() 34 | [('k3', 5), ('k2', 6), ('k1', 4), ('k4', 8)] -------------------------------------------------------------------------------- /tutorial/bigrams/bigrams.txt: -------------------------------------------------------------------------------- 1 | 1. Prepare Input 2 | 3 | # cat data.txt 4 | crazy crazy fox jumped over the fence 5 | crazy fox jumped 6 | the fence is high for fox 7 | crazy fox is smart 8 | fox jumped very high 9 | 10 | 2. Invoke pyspark 11 | 12 | # export SPARK_HOME=... 13 | # SPARK_HOME/bin/pyspark 14 | Python 2.6.9 (unknown, Sep 9 2014, 15:05:12) 15 | [GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.39)] on darwin 16 | Type "help", "copyright", "credits" or "license" for more information. 17 | Welcome to 18 | ____ __ 19 | / __/__ ___ _____/ /__ 20 | _\ \/ _ \/ _ `/ __/ '_/ 21 | /__ / .__/\_,_/_/ /_/\_\ version 1.2.0 22 | /_/ 23 | 24 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12) 25 | SparkContext available as sc. 26 | >>> sc 27 | 28 | >>> lines = sc.textFile("data.txt") 29 | >>> lines.collect() 30 | 31 | [u'crazy crazy fox jumped over the fence', 32 | u'crazy fox jumped', 33 | u'the fence is high for fox', 34 | u'crazy fox is smart', 35 | u'fox jumped very high' 36 | ] 37 | >>> bigrams = lines.map(lambda s : s.split(" ")).flatMap(lambda s: [((s[i],s[i+1]),1) for i in range (0, len(s)-1)]) 38 | >>> bigrams.collect() 39 | [((u'crazy', u'crazy'), 1), 40 | ((u'crazy', u'fox'), 1), 41 | ((u'fox', u'jumped'), 1), 42 | ((u'jumped', u'over'), 1), 43 | ((u'over', u'the'), 1), 44 | ((u'the', u'fence'), 1), 45 | ((u'crazy', u'fox'), 1), 46 | ((u'fox', u'jumped'), 1), 47 | ((u'the', u'fence'), 1), 48 | ((u'fence', u'is'), 1), 49 | ((u'is', u'high'), 1), 50 | ((u'high', u'for'), 1), 51 | ((u'for', u'fox'), 1), 52 | ((u'crazy', u'fox'), 1), 53 | ((u'fox', u'is'), 1), 54 | ((u'is', u'smart'), 1), 55 | ((u'fox', u'jumped'), 1), 56 | ((u'jumped', u'very'), 1), 57 | ((u'very', u'high'), 1) 58 | ] 59 | >>> 60 | >>> counts = bigrams.reduceByKey(lambda x, y : x+y) 61 | >>> counts.collect() 62 | [ 63 | ((u'high', u'for'), 1), 64 | ((u'fox', u'is'), 1), 65 | ((u'is', u'smart'), 1), 66 | ((u'is', u'high'), 1), 67 | ((u'fence', u'is'), 1), 68 | ((u'very', u'high'), 1), 69 | ((u'crazy', u'fox'), 3), 70 | ((u'over', u'the'), 1), 71 | ((u'for', u'fox'), 1), 72 | ((u'the', u'fence'), 2), 73 | ((u'crazy', u'crazy'), 1), 74 | ((u'jumped', u'over'), 1), 75 | ((u'jumped', u'very'), 1), 76 | ((u'fox', u'jumped'), 3) 77 | ] 78 | -------------------------------------------------------------------------------- /tutorial/cartesian/cartesian.txt: -------------------------------------------------------------------------------- 1 | # ./pyspark 2 | Python 2.6.9 (unknown, Sep 9 2014, 15:05:12) 3 | ... 
4 | Welcome to 5 | ____ __ 6 | / __/__ ___ _____/ /__ 7 | _\ \/ _ \/ _ `/ __/ '_/ 8 | /__ / .__/\_,_/_/ /_/\_\ version 1.3.0 9 | /_/ 10 | 11 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12) 12 | SparkContext available as sc, SQLContext available as sqlCtx. 13 | >>> a = [('k1','v1'), ('k2', 'v2')] 14 | >>> a 15 | [('k1', 'v1'), ('k2', 'v2')] 16 | >>> b = [('k3','v3'), ('k4', 'v4'), ('k5', 'v5') ] 17 | >>> b 18 | [('k3', 'v3'), ('k4', 'v4'), ('k5', 'v5')] 19 | >>> rdd1= sc.parallelize(a) 20 | >>> rdd1.collect() 21 | [('k1', 'v1'), ('k2', 'v2')] 22 | >>> rdd2= sc.parallelize(b) 23 | >>> rdd2.collect() 24 | [('k3', 'v3'), ('k4', 'v4'), ('k5', 'v5')] 25 | >>> rdd3 = rdd1.cartesian(rdd2) 26 | >>> rdd3.collect() 27 | [ 28 | (('k1', 'v1'), ('k3', 'v3')), 29 | (('k1', 'v1'), ('k4', 'v4')), 30 | (('k1', 'v1'), ('k5', 'v5')), 31 | (('k2', 'v2'), ('k3', 'v3')), 32 | (('k2', 'v2'), ('k4', 'v4')), 33 | (('k2', 'v2'), ('k5', 'v5')) 34 | ] 35 | >>> 36 | -------------------------------------------------------------------------------- /tutorial/combine-by-key/README.md: -------------------------------------------------------------------------------- 1 | Spark's combineByKey() Examples and Tutorial 2 | ============================================ 3 | 4 | * [Mean Calculation by combineByKey()](./spark-combineByKey.md) 5 | * [Standard Deviation and Mean Calculation by combineByKey()](./standard_deviation_by_combineByKey.md) 6 | 7 | 8 | [![Data Algorithms Book](https://github.com/mahmoudparsian/data-algorithms-book/blob/master/misc/data_algorithms_image.jpg)](http://shop.oreilly.com/product/0636920033950.do) 9 | -------------------------------------------------------------------------------- /tutorial/combine-by-key/combine-by-key.txt: -------------------------------------------------------------------------------- 1 | # export SPARK_HOME=... 2 | # SPARK_HOME/bin/pyspark 3 | Python 2.6.9 (unknown, Sep 9 2014, 15:05:12) 4 | Welcome to 5 | ____ __ 6 | / __/__ ___ _____/ /__ 7 | _\ \/ _ \/ _ `/ __/ '_/ 8 | /__ / .__/\_,_/_/ /_/\_\ version 1.2.0 9 | /_/ 10 | 11 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12) 12 | SparkContext available as sc. 
13 | >>> sc 14 | 15 | 16 | >>> input = [("k1", 1), ("k1", 2), ("k1", 3), ("k1", 4), ("k1", 5), 17 | ("k2", 6), ("k2", 7), ("k2", 8), 18 | ("k3", 10), ("k3", 12)] 19 | >>> rdd = sc.parallelize(input) 20 | >>> sumCount = rdd.combineByKey( 21 | (lambda x: (x, 1)), 22 | (lambda x, y: (x[0] + y, x[1] + 1)), 23 | (lambda x, y: (x[0] + y[0], x[1] + y[1])) 24 | ) 25 | >>> sumCount.collect() 26 | [('k3', (22, 2)), ('k2', (21, 3)), ('k1', (15, 5))] 27 | >>> 28 | >>> avg = sumCount.mapValues( lambda v : v[0] / v[1]) 29 | >>> avg.collect() 30 | [('k3', 11), ('k2', 7), ('k1', 3)] 31 | >>> -------------------------------------------------------------------------------- /tutorial/combine-by-key/distributed_computing_with_spark_by_Javier_Santos_Paniego.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-tutorial/6d2df229246081db2e2087084f31fdecd0e72a5e/tutorial/combine-by-key/distributed_computing_with_spark_by_Javier_Santos_Paniego.pdf -------------------------------------------------------------------------------- /tutorial/combine-by-key/standard_deviation_by_combineByKey.md: -------------------------------------------------------------------------------- 1 | Mean and Standard Deviation by Spark's combineByKey() 2 | ===================================================== 3 | 4 | ```` 5 | # ./bin/pyspark 6 | Python 2.7.10 (default, Oct 23 2015, 19:19:21) 7 | ... 8 | Welcome to 9 | ____ __ 10 | / __/__ ___ _____/ /__ 11 | _\ \/ _ \/ _ `/ __/ '_/ 12 | /__ / .__/\_,_/_/ /_/\_\ version 1.6.1 13 | /_/ 14 | 15 | Using Python version 2.7.10 (default, Oct 23 2015 19:19:21) 16 | SparkContext available as sc, HiveContext available as sqlContext. 17 | >>> data = [ 18 | ... ("A", 2.), ("A", 4.), ("A", 9.), 19 | ... ("B", 10.), ("B", 20.), 20 | ... ("Z", 3.), ("Z", 5.), ("Z", 8.), ("Z", 12.) 21 | ... ] 22 | >>> data 23 | [ 24 | ('A', 2.0), 25 | ('A', 4.0), 26 | ('A', 9.0), 27 | ('B', 10.0), 28 | ('B', 20.0), 29 | ('Z', 3.0), 30 | ('Z', 5.0), 31 | ('Z', 8.0), 32 | ('Z', 12.0) 33 | ] 34 | >>> rdd = sc.parallelize( data ) 35 | >>> rdd.collect() 36 | [ 37 | ('A', 2.0), 38 | ('A', 4.0), 39 | ('A', 9.0), 40 | ('B', 10.0), 41 | ('B', 20.0), 42 | ('Z', 3.0), 43 | ('Z', 5.0), 44 | ('Z', 8.0), 45 | ('Z', 12.0) 46 | ] 47 | >>> rdd.count() 48 | 9 49 | >>> sumCount = rdd.combineByKey(lambda value: (value, value*value, 1), 50 | ... lambda x, value: (x[0] + value, x[1] + value*value, x[2] + 1), 51 | ... lambda x, y: (x[0] + y[0], x[1] + y[1], x[2] + y[2]) 52 | ... ) 53 | 54 | >>> sumCount.collect() 55 | [ 56 | ('A', (15.0, 101.0, 3)), 57 | ('Z', (28.0, 242.0, 4)), 58 | ('B', (30.0, 500.0, 2)) 59 | ] 60 | 61 | >>> import math 62 | >>> def stdDev( sumX, sumSquared, n ): 63 | ... mean = sumX / n 64 | ... stdDeviation = math.sqrt ((sumSquared - n*mean*mean) /n) 65 | ... return (mean, stdDeviation) 66 | ... ^D 67 | 68 | >>> meanAndStdDev = sumCount.mapValues(lambda x : stdDev(x[0], x[1], x[2])) 69 | >>> meanAndStdDev.collect() 70 | [ 71 | ('A', (5.0, 2.943920288775949)), 72 | ('Z', (7.0, 3.391164991562634)), 73 | ('B', (15.0, 5.0)) 74 | ] 75 | >>> 76 | ```` -------------------------------------------------------------------------------- /tutorial/dna-basecount/README.md: -------------------------------------------------------------------------------- 1 | DNA Base Counting 2 | ================= 3 | 4 | The following examples demostrates the usage of PySpark to count DNA bases. 
5 | In a nutshell, ````DNA Base Counting```` counts the number of A's, T's, C's, G's, 6 | and N's (N refers to undefined code). 7 | 8 | 9 | * [DNA Base Counting Without In-Mapper Combiner](./dna-basecount.md) 10 | 11 | * [DNA Base Counting With In-Mapper Combiner](./dna-basecount2.md) 12 | 13 | * [DNA Base Counting With External Python Function](./dna-basecount3.md) 14 | 15 | 16 | [![Data Algorithms Book](https://github.com/mahmoudparsian/data-algorithms-book/blob/master/misc/data_algorithms_image.jpg)](http://shop.oreilly.com/product/0636920033950.do) 17 | -------------------------------------------------------------------------------- /tutorial/dna-basecount/basemapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | def mapper(seq): 4 | freq = dict() 5 | for x in list(seq): 6 | if x in freq: 7 | freq[x] +=1 8 | else: 9 | freq[x] = 1 10 | # 11 | kv = [(x, freq[x]) for x in freq] 12 | return kv 13 | # 14 | #print mapper("ATCGATCGATAT") 15 | -------------------------------------------------------------------------------- /tutorial/dna-basecount/dna-basecount.md: -------------------------------------------------------------------------------- 1 | DNA Base Counting using PySpark 2 | =============================== 3 | 4 | DNA Base Count Definition 5 | ------------------------- 6 | [DNA Base Counting is defined here.](https://www.safaribooksonline.com/library/view/data-algorithms/9781491906170/ch24.html) 7 | 8 | Solution in PySpark 9 | ------------------- 10 | This solution assumes that each record is a DNA sequence. 11 | This solution emits a ````(base, 1)```` for every base in 12 | a given sequence and then aggregates all frequencies for 13 | unique bases. 14 | 15 | 16 | ```` 17 | $ cat /home/mparsian/dna_seq.txt 18 | ATATCCCCGGGAT 19 | ATCGATCGATAT 20 | 21 | 22 | # ./bin/pyspark 23 | Python 2.7.10 (default, Aug 22 2015, 20:33:39) 24 | Welcome to 25 | ____ __ 26 | / __/__ ___ _____/ /__ 27 | _\ \/ _ \/ _ `/ __/ '_/ 28 | /__ / .__/\_,_/_/ /_/\_\ version 1.6.0 29 | /_/ 30 | 31 | SparkContext available as sc, HiveContext available as sqlContext. 
32 | >>> recs = sc.textFile('file:///home/mparsian/dna_seq.txt') 33 | 34 | >>> recs.collect() 35 | [ 36 | u'ATATCCCCGGGAT', 37 | u'ATCGATCGATAT' 38 | ] 39 | 40 | >>> rdd = recs.flatMap(lambda x : [(c,1) for c in list(x)]) 41 | >>> rdd.collect() 42 | [ 43 | (u'A', 1), 44 | (u'T', 1), 45 | (u'A', 1), 46 | (u'T', 1), 47 | (u'C', 1), 48 | (u'C', 1), 49 | (u'C', 1), 50 | (u'C', 1), 51 | (u'G', 1), 52 | (u'G', 1), 53 | (u'G', 1), 54 | (u'A', 1), 55 | (u'T', 1), 56 | (u'A', 1), 57 | (u'T', 1), 58 | (u'C', 1), 59 | (u'G', 1), 60 | (u'A', 1), 61 | (u'T', 1), 62 | (u'C', 1), 63 | (u'G', 1), 64 | (u'A', 1), 65 | (u'T', 1), 66 | (u'A', 1), 67 | (u'T', 1) 68 | ] 69 | >>> baseCount = rdd.reduceByKey(lambda x,y : x+y) 70 | >>> baseCount.collect() 71 | [ 72 | (u'A', 7), 73 | (u'C', 6), 74 | (u'G', 5), 75 | (u'T', 7) 76 | ] 77 | >>> 78 | ```` 79 | 80 | 81 | -------------------------------------------------------------------------------- /tutorial/dna-basecount/dna-basecount2.md: -------------------------------------------------------------------------------- 1 | DNA Base Counting using PySpark Using In-Mapper Combiner 2 | ======================================================== 3 | 4 | DNA Base Count Definition 5 | ------------------------- 6 | [DNA Base Counting is defined here.](https://www.safaribooksonline.com/library/view/data-algorithms/9781491906170/ch24.html) 7 | 8 | Solution in PySpark 9 | ------------------- 10 | This solution assumes that each record is a DNA sequence. 11 | This solution uses "In-Mapper Combiner" design pattern 12 | and aggregates bases for each sequence before full 13 | aggregation of all frequencies for unique bases. 14 | 15 | 16 | ```` 17 | $ cat /home/mparsian/dna_seq.txt 18 | ATATCCCCGGGAT 19 | ATCGATCGATAT 20 | 21 | 22 | # ./bin/pyspark 23 | Python 2.7.10 (default, Aug 22 2015, 20:33:39) 24 | Welcome to 25 | ____ __ 26 | / __/__ ___ _____/ /__ 27 | _\ \/ _ \/ _ `/ __/ '_/ 28 | /__ / .__/\_,_/_/ /_/\_\ version 1.6.0 29 | /_/ 30 | 31 | SparkContext available as sc, HiveContext available as sqlContext. 32 | >>> recs = sc.texFile('file:///home/mparsian/dna_seq.txt') 33 | 34 | >>> recs.collect() 35 | [ 36 | u'ATATCCCCGGGAT', 37 | u'ATCGATCGATAT' 38 | ] 39 | 40 | >>> def mapper(seq): 41 | ... freq = dict() 42 | ... for x in list(seq): 43 | ... if x in freq: 44 | ... freq[x] +=1 45 | ... else: 46 | ... freq[x] = 1 47 | ... # 48 | ... kv = [(x, freq[x]) for x in freq] 49 | ... return kv 50 | ... ^D 51 | 52 | 53 | >>> rdd = recs.flatMap(mapper) 54 | >>> rdd.collect() 55 | [ 56 | (u'A', 3), 57 | (u'C', 4), 58 | (u'T', 3), 59 | (u'G', 3), 60 | (u'A', 4), 61 | (u'C', 2), 62 | (u'T', 4), 63 | (u'G', 2) 64 | ] 65 | >>> baseCount = rdd.reduceByKey(lambda x,y : x+y) 66 | >>> baseCount.collect() 67 | [ 68 | (u'A', 7), 69 | (u'C', 6), 70 | (u'G', 5), 71 | (u'T', 7) 72 | ] 73 | >>> 74 | ```` 75 | 76 | 77 | -------------------------------------------------------------------------------- /tutorial/dna-basecount/dna-basecount3.md: -------------------------------------------------------------------------------- 1 | DNA Base Counting using PySpark 2 | =============================== 3 | 4 | DNA Base Count Definition 5 | ------------------------- 6 | [DNA Base Counting is defined here.](https://www.safaribooksonline.com/library/view/data-algorithms/9781491906170/ch24.html) 7 | 8 | Solution in PySpark 9 | ------------------- 10 | This solution assumes that each record is a DNA sequence. 
11 | This solution emits a ````(base, 1)```` for every base in 12 | a given sequence and then aggregates all frequencies for 13 | unique bases. For this solution we use an external Python 14 | function defined in ````basemapper.py```` 15 | 16 | * Define Python Function 17 | 18 | ```` 19 | $ export SPARK_HOME=/home/mparsian/spark-1.6.1-bin-hadoop2.6 20 | $ cat $SPARK_HOME/basemapper.py 21 | #!/usr/bin/python 22 | 23 | def mapper(seq): 24 | freq = dict() 25 | for x in list(seq): 26 | if x in freq: 27 | freq[x] +=1 28 | else: 29 | freq[x] = 1 30 | # 31 | kv = [(x, freq[x]) for x in freq] 32 | return kv 33 | # 34 | #for testing: 35 | #print mapper("ATCGATCGATAT") 36 | ```` 37 | * Define Very Basic Sample Input 38 | 39 | ```` 40 | $ cat /home/mparsian/dna_seq.txt 41 | ATATCCCCGGGAT 42 | ATCGATCGATAT 43 | ```` 44 | 45 | * Sample PySpark Run 46 | 47 | ```` 48 | # ./bin/pyspark 49 | Welcome to 50 | ____ __ 51 | / __/__ ___ _____/ /__ 52 | _\ \/ _ \/ _ `/ __/ '_/ 53 | /__ / .__/\_,_/_/ /_/\_\ version 1.6.1 54 | /_/ 55 | 56 | SparkContext available as sc, HiveContext available as sqlContext. 57 | >>> recs = sc.texFile('file:///home/mparsian/dna_seq.txt') 58 | 59 | >>> recs.collect() 60 | [ 61 | u'ATATCCCCGGGAT', 62 | u'ATCGATCGATAT' 63 | ] 64 | 65 | >>> basemapper = "/Users/mparsian/spark-1.6.1-bin-hadoop2.6/basemapper.py" 66 | >>> import basemapper 67 | >>> basemapper 68 | 69 | >>> 70 | >>> recs = sc.textFile('file:////Users/mparsian/zmp/github/pyspark-tutorial/tutorial/dna-basecount/dna_seq.txt') 71 | >>> rdd = recs.flatMap(basemapper.mapper) 72 | >>> rdd.collect() 73 | [(u'A', 3), (u'C', 4), (u'T', 3), (u'G', 3), (u'A', 4), (u'C', 2), (u'T', 4), (u'G', 2)] 74 | 75 | >>> baseCount = rdd.reduceByKey(lambda x,y : x+y) 76 | >>> baseCount.collect() 77 | [(u'A', 7), (u'C', 6), (u'G', 5), (u'T', 7)] 78 | >>> 79 | ```` -------------------------------------------------------------------------------- /tutorial/dna-basecount/dna_seq.txt: -------------------------------------------------------------------------------- 1 | ATATCCCCGGGAT 2 | ATCGATCGATAT 3 | -------------------------------------------------------------------------------- /tutorial/map-partitions/README.md: -------------------------------------------------------------------------------- 1 | Spark's mapPartitions() 2 | ======================= 3 | 4 | According to Spark API: ````mapPartitions(func)```` transformation is 5 | similar to ````map()````, but runs separately on each partition (block) 6 | of the RDD, so ````func```` must be of type ````Iterator => Iterator```` 7 | when running on an RDD of type T. 8 | 9 | 10 | The ````mapPartitions()```` transformation should be used when you want to 11 | extract some condensed information (such as finding the minimum and maximum 12 | of numbers) from each partition. For example, if you want to find the minimum 13 | and maximum of all numbers in your input, then using ````map()```` can be 14 | pretty inefficient, since you will be generating tons of intermediate 15 | (K,V) pairs, but the bottom line is you just want to find two numbers: the 16 | minimum and maximum of all numbers in your input. Another example can be if 17 | you want to find top-10 (or bottom-10) for your input, then mapPartitions() 18 | can work very well: find the top-10 (or bottom-10) per partition, then find 19 | the top-10 (or bottom-10) for all partitions: this way you are limiting 20 | emitting too many intermediate (K,V) pairs. 
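The top-N use case mentioned above is not worked out in the examples that follow, so here is a minimal sketch (an assumed illustration, not one of the original examples) of finding the global top-3 with ````mapPartitions()````: each partition emits only its local top-3, and the driver merges the small per-partition lists. It reuses the small data set of Example-2 below.

````
# minimal sketch (assumed example): global top-3 via per-partition top-3
import heapq

data = [10, 20, 3, 4, 5, 2, 2, 20, 20, 10]
rdd = sc.parallelize(data, 3)

def top3(iterator):
    # emit at most 3 numbers per partition
    yield heapq.nlargest(3, iterator)

# each partition contributes a tiny list; merge the lists in the driver
per_partition = rdd.mapPartitions(top3).collect()
top_3 = heapq.nlargest(3, (x for tops in per_partition for x in tops))
# for this data set, top_3 is [20, 20, 20]
````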
21 | 22 | 23 | Example-1: Sum Each Partition 24 | ============================= 25 | ```` 26 | >>> numbers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] 27 | >>> numbers 28 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] 29 | 30 | >>> rdd = sc.parallelize(numbers, 3) 31 | 32 | >>> rdd.collect() 33 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] 34 | 35 | >>> rdd.getNumPartitions() 36 | 3 37 | 38 | >>> def f(iterator): 39 | ... for x in iterator: 40 | ... print(x) 41 | ... print "===" 42 | ... 43 | >>> rdd.mapPartitions(f).collect() 44 | 1 45 | 2 46 | 3 47 | === 48 | 7 49 | 8 50 | 9 51 | 10 52 | === 53 | 4 54 | 5 55 | 6 56 | === 57 | 58 | >>> def adder(iterator): 59 | ... yield sum(iterator) 60 | ... 61 | >>> rdd.mapPartitions(adder).collect() 62 | [6, 15, 34] 63 | 64 | ```` 65 | 66 | 67 | Example-2: Find Minimum and Maximum 68 | =================================== 69 | Use ````mapPartitions()```` and find the minimum and maximum from each partition. 70 | 71 | To make it a cleaner solution, we define a python function to return the minimum and maximum 72 | for a given iteration. 73 | 74 | ```` 75 | $ cat minmax.py 76 | #!/usr/bin/python 77 | 78 | def minmax(iterator): 79 | firsttime = 0 80 | #min = 0; 81 | #max = 0; 82 | for x in iterator: 83 | if (firsttime == 0): 84 | min = x; 85 | max = x; 86 | firsttime = 1 87 | else: 88 | if x > max: 89 | max = x 90 | if x < min: 91 | min = x 92 | # 93 | return [(min, max)] 94 | # 95 | #data = [10, 20, 3, 4, 5, 2, 2, 20, 20, 10] 96 | #print minmax(data) 97 | ```` 98 | Then we use the minmax function for the ````mapPartitions()````: 99 | 100 | >>> rdd = spark.sparkContext.parallelize(data, 3) 101 | >>> mapped = rdd.mapPartitions(minmax) 102 | >>> mapped.collect() 103 | [(3, 20), (2, 5), (2, 20)] 104 | >>> minmax_list = mapped.collect() 105 | >>> minimum = min(minmax_list[0]) 106 | >>> minimum 107 | 3 108 | >>> maximum = max(minmax_list[0]) 109 | >>> maximum 110 | 20 111 | 112 | ```` 113 | ### NOTE: data can be huge, but for understanding 114 | ### the mapPartitions() we use a very small data set 115 | 116 | >>> data = [10, 20, 3, 4, 5, 2, 2, 20, 20, 10] 117 | >>> rdd = sc.parallelize(data, 3) 118 | 119 | >>> rdd.getNumPartitions() 120 | 3 121 | 122 | >>> rdd.collect() 123 | [10, 20, 3, 4, 5, 2, 2, 20, 20, 10] 124 | 125 | >>> def f(iterator): 126 | ... for x in iterator: 127 | ... print(x) 128 | ... print "===" 129 | ... ^D 130 | 131 | >>> rdd.foreachPartition(f) 132 | 10 133 | 20 134 | 3 135 | === 136 | 4 137 | 5 138 | 2 139 | === 140 | 2 141 | 20 142 | 20 143 | 10 144 | === 145 | >>> 146 | 147 | >>> minmax = "/Users/mparsian/spark-1.6.1-bin-hadoop2.6/minmax.py" 148 | >>> import minmax 149 | 150 | ### NOTE: the minmaxlist is a small list of numbers 151 | ### two mumbers (min and max) are generated per partition 152 | >>> minmaxlist = rdd.mapPartitions(minmax.minmax).collect() 153 | >>> minmaxlist 154 | [3, 20, 2, 5, 2, 20] 155 | 156 | >>> min(minmaxlist) 157 | 2 158 | >>> max(minmaxlist) 159 | 20 160 | ```` 161 | 162 | Questions/Comments 163 | ================== 164 | * [View Mahmoud Parsian's profile on LinkedIn](http://www.linkedin.com/in/mahmoudparsian) 165 | * Please send me an email: mahmoud.parsian@yahoo.com 166 | * [Twitter: @mahmoudparsian](http://twitter.com/mahmoudparsian) 167 | 168 | Thank you! 
169 | 170 | ```` 171 | best regards, 172 | Mahmoud Parsian 173 | ```` 174 | 175 | [![Data Algorithms Book](https://github.com/mahmoudparsian/data-algorithms-book/blob/master/misc/data_algorithms_image.jpg)](http://shop.oreilly.com/product/0636920033950.do) 176 | -------------------------------------------------------------------------------- /tutorial/pyspark-examples/dataframes/VIDEO-DataFrames.txt: -------------------------------------------------------------------------------- 1 | Structuring Apache Spark 2.0: SQL, DataFrames, Datasets And Streaming - by Michael Armbrust 2 | https://www.youtube.com/watch?v=1a4pgYzeFwE 3 | 28 mins 4 | 5 | AWS Tutorial - AWS Athena + S3 6 | 20 mins 7 | https://www.youtube.com/watch?v=SiUDN95sJIo 8 | 9 | -------------------------------------------------------------------------------- /tutorial/pyspark-examples/dataframes/dataframe-examples.md: -------------------------------------------------------------------------------- 1 | ## Spark DataFrame Examples (using PySpark): 2 | 3 | 1. [Introduction to PySpark DataFrames (slides)](https://projector-video-pdf-converter.datacamp.com/13023/chapter3.pdf) 4 | 5 | 2. [Apache Spark's DataFrame Examples](http://spark.apache.org/examples.html) 6 | 7 | 3. [PySpark Dataframe Basics](https://changhsinlee.com/pyspark-dataframe-basics/) 8 | 9 | 4. [PySpark Dataframe Basics -- notebook](https://github.com/changhsinlee/changhsinlee.github.io/blob/master/notebook/2018-03-04-pyspark-dataframe-basics/dataframe-basics.ipynb) 10 | 11 | 5. [My Tutorial/Spark SQL Tutorial (PySpark)](https://www.zepl.com/viewer/notebooks/bm90ZTovL3pqZmZkdS8wN2M3YmI0MmJjMWI0YmE0OTc1M2IzMzZkMjA2MTk4Ny9ub3RlLmpzb24) 12 | 13 | 6. [Complete Guide on DataFrame Operations in PySpark](https://www.analyticsvidhya.com/blog/2016/10/spark-dataframe-and-operations/) 14 | 15 | 7. [Introduction to DataFrame Operations in PySpark](https://docs.databricks.com/spark/latest/dataframes-datasets/introduction-to-dataframes-python.html) 16 | 17 | 8. [PySpark DataFrame Tutorial: Introduction to DataFrames](https://dzone.com/articles/pyspark-dataframe-tutorial-introduction-to-datafra) 18 | 19 | 9. [Introduction to DataFrames - Python](https://docs.databricks.com/spark/latest/dataframes-datasets/introduction-to-dataframes-python.html) 20 | 21 | 10. [How to use Spark SQL: A hands-on tutorial](https://opensource.com/article/19/3/apache-spark-and-dataframes-tutorial) 22 | -------------------------------------------------------------------------------- /tutorial/pyspark-examples/dataframes/dataframe-session-2019-02-14.txt: -------------------------------------------------------------------------------- 1 | $ cat /Users/mparsian/tmp/emps_no_header.txt 2 | 1001,alex,67000,SALES 3 | 1002,bob,24000,SALES 4 | 1003,boby,24000,SALES 5 | 1004,jane,69000,SOFTWARE 6 | 1005,betty,55000,SOFTWARE 7 | 1006,jeff,59000,SOFTWARE 8 | 1007,dara,72000,SOFTWARE 9 | 10 | 11 | $ ./bin/pyspark 12 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 13 | [Clang 6.0 (clang-600.0.57)] on darwin 14 | Type "help", "copyright", "credits" or "license" for more information. 15 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 16 | Welcome to 17 | ____ __ 18 | / __/__ ___ _____/ /__ 19 | _\ \/ _ \/ _ `/ __/ '_/ 20 | /__ / .__/\_,_/_/ /_/\_\ version 2.4.0 21 | /_/ 22 | 23 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 24 | SparkSession available as 'spark'. 
25 | >>> 26 | >>> 27 | >>> 28 | >>> 29 | >>> 30 | >>> 31 | >>> 32 | >>> input_path = "/Users/mparsian/tmp/emps_no_header.txt" 33 | >>> df = spark.read.csv(input_path) 34 | >>> df.show() 35 | +----+-----+-----+--------+ 36 | | _c0| _c1| _c2| _c3| 37 | +----+-----+-----+--------+ 38 | |1001| alex|67000| SALES| 39 | |1002| bob|24000| SALES| 40 | |1003| boby|24000| SALES| 41 | |1004| jane|69000|SOFTWARE| 42 | |1005|betty|55000|SOFTWARE| 43 | |1006| jeff|59000|SOFTWARE| 44 | |1007| dara|72000|SOFTWARE| 45 | +----+-----+-----+--------+ 46 | 47 | >>> df.collect() 48 | [ 49 | Row(_c0='1001', _c1='alex', _c2='67000', _c3='SALES'), 50 | Row(_c0='1002', _c1='bob', _c2='24000', _c3='SALES'), 51 | Row(_c0='1003', _c1='boby', _c2='24000', _c3='SALES'), 52 | Row(_c0='1004', _c1='jane', _c2='69000', _c3='SOFTWARE'), 53 | Row(_c0='1005', _c1='betty', _c2='55000', _c3='SOFTWARE'), 54 | Row(_c0='1006', _c1='jeff', _c2='59000', _c3='SOFTWARE'), 55 | Row(_c0='1007', _c1='dara', _c2='72000', _c3='SOFTWARE') 56 | ] 57 | >>> 58 | >>> 59 | 60 | >>> 61 | >>> df2 = df.selectExpr("_c0 as id", "_c1 as name", "_c2 as salary", "_c3 as dept") 62 | >>> df2.show() 63 | +----+-----+------+--------+ 64 | | id| name|salary| dept| 65 | +----+-----+------+--------+ 66 | |1001| alex| 67000| SALES| 67 | |1002| bob| 24000| SALES| 68 | |1003| boby| 24000| SALES| 69 | |1004| jane| 69000|SOFTWARE| 70 | |1005|betty| 55000|SOFTWARE| 71 | |1006| jeff| 59000|SOFTWARE| 72 | |1007| dara| 72000|SOFTWARE| 73 | +----+-----+------+--------+ 74 | 75 | >>> df2.printSchema() 76 | root 77 | |-- id: string (nullable = true) 78 | |-- name: string (nullable = true) 79 | |-- salary: string (nullable = true) 80 | |-- dept: string (nullable = true) 81 | 82 | >>> df2.createOrReplaceTempView("emp_table") 83 | >>> 84 | >>> 85 | >>> df3 = spark.sql("SELECT * FROM emp_table WHERE id > 1002") 86 | >>> df3.show() 87 | +----+-----+------+--------+ 88 | | id| name|salary| dept| 89 | +----+-----+------+--------+ 90 | |1003| boby| 24000| SALES| 91 | |1004| jane| 69000|SOFTWARE| 92 | |1005|betty| 55000|SOFTWARE| 93 | |1006| jeff| 59000|SOFTWARE| 94 | |1007| dara| 72000|SOFTWARE| 95 | +----+-----+------+--------+ 96 | 97 | >>> -------------------------------------------------------------------------------- /tutorial/pyspark-examples/dataframes/dataframe-session-2020-11-04.txt: -------------------------------------------------------------------------------- 1 | $ cat /tmp/emps_no_header.txt 2 | 1001,alex,67000,SALES 3 | 1002,bob,24000,SALES 4 | 1003,boby,24000,SALES 5 | 1004,jane,69000,SOFTWARE 6 | 1005,betty,55000,SOFTWARE 7 | 1006,jeff,59000,SOFTWARE 8 | 1007,dara,72000,SOFTWARE 9 | 1001,al,69000,SALES 10 | 1002,bobby,24900,BUSINESS 11 | 12 | $ ./bin/pyspark 13 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 14 | [Clang 6.0 (clang-600.0.57)] on darwin 15 | Type "help", "copyright", "credits" or "license" for more information. 16 | Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties 17 | Setting default log level to "WARN". 18 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 19 | Welcome to 20 | ____ __ 21 | / __/__ ___ _____/ /__ 22 | _\ \/ _ \/ _ `/ __/ '_/ 23 | /__ / .__/\_,_/_/ /_/\_\ version 3.0.0 24 | /_/ 25 | 26 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 27 | SparkSession available as 'spark'. 
28 | >>> input_path = '/tmp/emps_no_header.txt' 29 | >>> df = spark.read.csv(input_path) 30 | >>> df.show() 31 | +----+-----+-----+--------+ 32 | | _c0| _c1| _c2| _c3| 33 | +----+-----+-----+--------+ 34 | |1001| alex|67000| SALES| 35 | |1002| bob|24000| SALES| 36 | |1003| boby|24000| SALES| 37 | |1004| jane|69000|SOFTWARE| 38 | |1005|betty|55000|SOFTWARE| 39 | |1006| jeff|59000|SOFTWARE| 40 | |1007| dara|72000|SOFTWARE| 41 | |1001| al|69000| SALES| 42 | |1002|bobby|24900|BUSINESS| 43 | +----+-----+-----+--------+ 44 | 45 | >>> df.count() 46 | 9 47 | >>> df.printSchema() 48 | root 49 | |-- _c0: string (nullable = true) 50 | |-- _c1: string (nullable = true) 51 | |-- _c2: string (nullable = true) 52 | |-- _c3: string (nullable = true) 53 | 54 | >>> df2 = df.selectExpr("_c0 as id", "_c1 as name", "_c2 as salary", "_c3 as dept") 55 | >>> df2.show() 56 | +----+-----+------+--------+ 57 | | id| name|salary| dept| 58 | +----+-----+------+--------+ 59 | |1001| alex| 67000| SALES| 60 | |1002| bob| 24000| SALES| 61 | |1003| boby| 24000| SALES| 62 | |1004| jane| 69000|SOFTWARE| 63 | |1005|betty| 55000|SOFTWARE| 64 | |1006| jeff| 59000|SOFTWARE| 65 | |1007| dara| 72000|SOFTWARE| 66 | |1001| al| 69000| SALES| 67 | |1002|bobby| 24900|BUSINESS| 68 | +----+-----+------+--------+ 69 | 70 | >>> df2.createOrReplaceTempView("emp_table") 71 | >>> df3 = spark.sql("SELECT * FROM emp_table WHERE id > 1002") 72 | >>> df3.show() 73 | +----+-----+------+--------+ 74 | | id| name|salary| dept| 75 | +----+-----+------+--------+ 76 | |1003| boby| 24000| SALES| 77 | |1004| jane| 69000|SOFTWARE| 78 | |1005|betty| 55000|SOFTWARE| 79 | |1006| jeff| 59000|SOFTWARE| 80 | |1007| dara| 72000|SOFTWARE| 81 | +----+-----+------+--------+ 82 | 83 | >>> df3.printSchema() 84 | root 85 | |-- id: string (nullable = true) 86 | |-- name: string (nullable = true) 87 | |-- salary: string (nullable = true) 88 | |-- dept: string (nullable = true) 89 | 90 | >>> df4 = df2.filter(df2.id > 1002) 91 | >>> df4.show() 92 | +----+-----+------+--------+ 93 | | id| name|salary| dept| 94 | +----+-----+------+--------+ 95 | |1003| boby| 24000| SALES| 96 | |1004| jane| 69000|SOFTWARE| 97 | |1005|betty| 55000|SOFTWARE| 98 | |1006| jeff| 59000|SOFTWARE| 99 | |1007| dara| 72000|SOFTWARE| 100 | +----+-----+------+--------+ 101 | 102 | >>> df5 = spark.sql("SELECT id, salary FROM emp_table WHERE id > 1002") 103 | >>> df5.show() 104 | +----+------+ 105 | | id|salary| 106 | +----+------+ 107 | |1003| 24000| 108 | |1004| 69000| 109 | |1005| 55000| 110 | |1006| 59000| 111 | |1007| 72000| 112 | +----+------+ 113 | 114 | >>> 115 | >>> df6 = spark.sql("SELECT name, salary FROM emp_table WHERE salary > 55000 ORDER BY salary") 116 | >>> df6.show() 117 | +----+------+ 118 | |name|salary| 119 | +----+------+ 120 | |jeff| 59000| 121 | |alex| 67000| 122 | |jane| 69000| 123 | | al| 69000| 124 | |dara| 72000| 125 | +----+------+ 126 | 127 | >>> df6 = spark.sql("SELECT name, salary FROM emp_table WHERE salary > 55000 ORDER BY salary DESC") 128 | >>> df6.show() 129 | +----+------+ 130 | |name|salary| 131 | +----+------+ 132 | |dara| 72000| 133 | | al| 69000| 134 | |jane| 69000| 135 | |alex| 67000| 136 | |jeff| 59000| 137 | +----+------+ 138 | 139 | >>> df7 = spark.sql("SELECT dept, COUNT(*) as count FROM emp_table GROUP BY dept") 140 | >>> df7.show() 141 | +--------+-----+ 142 | | dept|count| 143 | +--------+-----+ 144 | | SALES| 4| 145 | |BUSINESS| 1| 146 | |SOFTWARE| 4| 147 | +--------+-----+ 148 | 149 | >>> 
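NOTE: instead of renaming _c0.._c3 with selectExpr(), the same file can be read with an explicit schema.
A minimal sketch, assuming the same /tmp/emps_no_header.txt layout; the column names and types below are illustrative choices:

    from pyspark.sql.types import StructType, StructField, StringType, IntegerType

    # id, name, salary, dept -- matching the four CSV columns above
    emp_schema = StructType([
        StructField("id", IntegerType(), True),
        StructField("name", StringType(), True),
        StructField("salary", IntegerType(), True),
        StructField("dept", StringType(), True)
    ])

    emps = spark.read.csv(input_path, schema=emp_schema)
    emps.printSchema()                         # id and salary come back as integers, not strings
    emps.createOrReplaceTempView("emp_table")  # SQL queries like the ones above work unchanged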
-------------------------------------------------------------------------------- /tutorial/pyspark-examples/dataframes/dataframe-session-2021-05-12-intro.txt: -------------------------------------------------------------------------------- 1 | #-------------------- 2 | # DataFrame Tutorial: 3 | #-------------------- 4 | https://dzone.com/articles/pyspark-dataframe-tutorial-introduction-to-datafra 5 | 6 | 7 | #--------------------- 8 | # Demo of DataFrames 9 | #--------------------- 10 | 11 | $ cat /tmp/cats.csv 12 | name,age,gender,weight 13 | cuttie,2,female,6 14 | mono,3,male,9 15 | pishi,2,female,4 16 | zazo,1,male,4 17 | fuzzy,1,female,4 18 | 19 | $ ./bin/pyspark 20 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 21 | Welcome to 22 | ____ __ 23 | / __/__ ___ _____/ /__ 24 | _\ \/ _ \/ _ `/ __/ '_/ 25 | /__ / .__/\_,_/_/ /_/\_\ version 3.1.1 26 | /_/ 27 | 28 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 29 | Spark context Web UI available at http://10.0.0.93:4040 30 | Spark context available as 'sc' (master = local[*], app id = local-1620755686906). 31 | SparkSession available as 'spark'. 32 | 33 | >>> 34 | >>> input_path = '/tmp/cats.csv' 35 | >>> input_path 36 | '/tmp/cats.csv' 37 | >>> cats = spark.read.csv(input_path, inferSchema = True, header = True) 38 | 39 | >>> 40 | >>> cats.show(truncate=False) 41 | +------+---+------+------+ 42 | |name |age|gender|weight| 43 | +------+---+------+------+ 44 | |cuttie|2 |female|6 | 45 | |mono |3 |male |9 | 46 | |pishi |2 |female|4 | 47 | |zazo |1 |male |4 | 48 | |fuzzy |1 |female|4 | 49 | +------+---+------+------+ 50 | 51 | >>> cats.printSchema() 52 | root 53 | |-- name: string (nullable = true) 54 | |-- age: integer (nullable = true) 55 | |-- gender: string (nullable = true) 56 | |-- weight: integer (nullable = true) 57 | 58 | >>> cats.count() 59 | 5 60 | >>> cats.columns 61 | ['name', 'age', 'gender', 'weight'] 62 | >>> cats.describe('weight').show() 63 | +-------+------------------+ 64 | |summary| weight| 65 | +-------+------------------+ 66 | | count| 5| 67 | | mean| 5.4| 68 | | stddev|2.1908902300206643| 69 | | min| 4| 70 | | max| 9| 71 | +-------+------------------+ 72 | 73 | >>> name_age = cats.select("name", "age") 74 | >>> name_age.show(truncate=False) 75 | +------+---+ 76 | |name |age| 77 | +------+---+ 78 | |cuttie|2 | 79 | |mono |3 | 80 | |pishi |2 | 81 | |zazo |1 | 82 | |fuzzy |1 | 83 | +------+---+ 84 | 85 | >>> name_age.printSchema() 86 | root 87 | |-- name: string (nullable = true) 88 | |-- age: integer (nullable = true) 89 | 90 | >>> cats.select('age').distinct().show() 91 | +---+ 92 | |age| 93 | +---+ 94 | | 1| 95 | | 3| 96 | | 2| 97 | +---+ 98 | 99 | >>> cats.select('name', 'age').distinct().show() 100 | +------+---+ 101 | | name|age| 102 | +------+---+ 103 | | zazo| 1| 104 | |cuttie| 2| 105 | | fuzzy| 1| 106 | | mono| 3| 107 | | pishi| 2| 108 | +------+---+ 109 | 110 | >>> cats.filter(cats.age > 1).show() 111 | +------+---+------+------+ 112 | | name|age|gender|weight| 113 | +------+---+------+------+ 114 | |cuttie| 2|female| 6| 115 | | mono| 3| male| 9| 116 | | pishi| 2|female| 4| 117 | +------+---+------+------+ 118 | 119 | 120 | >>> cats.orderBy(cats.age).show() 121 | +------+---+------+------+ 122 | | name|age|gender|weight| 123 | +------+---+------+------+ 124 | | zazo| 1| male| 4| 125 | | fuzzy| 1|female| 4| 126 | |cuttie| 2|female| 6| 127 | | pishi| 2|female| 4| 128 | | mono| 3| male| 9| 129 | +------+---+------+------+ 130 | 131 | >>> age_df = cats.groupby("age").count() 132 | >>> 
age_df.show() 133 | +---+-----+ 134 | |age|count| 135 | +---+-----+ 136 | | 1| 2| 137 | | 3| 1| 138 | | 2| 2| 139 | +---+-----+ 140 | 141 | >>> cats.show() 142 | +------+---+------+------+ 143 | | name|age|gender|weight| 144 | +------+---+------+------+ 145 | |cuttie| 2|female| 6| 146 | | mono| 3| male| 9| 147 | | pishi| 2|female| 4| 148 | | zazo| 1| male| 4| 149 | | fuzzy| 1|female| 4| 150 | +------+---+------+------+ 151 | 152 | >>> cats.registerTempTable('cats_table') 153 | >>> spark.sql("select * from cats_table").show() 154 | +------+---+------+------+ 155 | | name|age|gender|weight| 156 | +------+---+------+------+ 157 | |cuttie| 2|female| 6| 158 | | mono| 3| male| 9| 159 | | pishi| 2|female| 4| 160 | | zazo| 1| male| 4| 161 | | fuzzy| 1|female| 4| 162 | +------+---+------+------+ 163 | 164 | >>> spark.sql("select * from cats_table where age > 1").show() 165 | +------+---+------+------+ 166 | | name|age|gender|weight| 167 | +------+---+------+------+ 168 | |cuttie| 2|female| 6| 169 | | mono| 3| male| 9| 170 | | pishi| 2|female| 4| 171 | +------+---+------+------+ 172 | 173 | >>> spark.sql("select age, count(*) from cats_table group by age").show() 174 | +---+--------+ 175 | |age|count(1)| 176 | +---+--------+ 177 | | 1| 2| 178 | | 3| 1| 179 | | 2| 2| 180 | +---+--------+ 181 | 182 | >>> def exec_sql(query): 183 | ... spark.sql(query).show() 184 | ... 185 | >>> 186 | >>> exec_sql("select age, count(*) from cats_table group by age") 187 | +---+--------+ 188 | |age|count(1)| 189 | +---+--------+ 190 | | 1| 2| 191 | | 3| 1| 192 | | 2| 2| 193 | +---+--------+ 194 | -------------------------------------------------------------------------------- /tutorial/pyspark-examples/dataframes/dataframe-session-2022-05-12.txt: -------------------------------------------------------------------------------- 1 | >>> spark 2 | 3 | 4 | >>> spark.version 5 | '3.2.0' 6 | 7 | >>> # create a Python collection as data 8 | >>> data = 9 | [ 10 | ('alex', 20, 12000), 11 | ('jane', 30, 45000), 12 | ('rafa', 40, 56000), 13 | ('ted', 30, 145000), 14 | ('xo2', 10, 1332000), 15 | ('mary', 44, 555000) 16 | ] 17 | 18 | >>> data 19 | [ 20 | ('alex', 20, 12000), 21 | ('jane', 30, 45000), 22 | ('rafa', 40, 56000), 23 | ('ted', 30, 145000), 24 | ('xo2', 10, 1332000), 25 | ('mary', 44, 555000) 26 | ] 27 | 28 | >>> #define column names 29 | >>> column_names = ['name', 'age', 'salary'] 30 | >>> column_names 31 | ['name', 'age', 'salary'] 32 | 33 | >>> # create a DataFrame as df 34 | >>> df = spark.createDataFrame(data, column_names) 35 | >>> 36 | >>> # inspect created DataFrame 37 | >>> df 38 | DataFrame[name: string, age: bigint, salary: bigint] 39 | 40 | >>> # inspect created DataFrame's Schema 41 | >>> df.printSchema() 42 | root 43 | |-- name: string (nullable = true) 44 | |-- age: long (nullable = true) 45 | |-- salary: long (nullable = true) 46 | 47 | >>> # display the first 20 rows of a DataFrame 48 | >>> df.show() 49 | +----+---+-------+ 50 | |name|age| salary| 51 | +----+---+-------+ 52 | |alex| 20| 12000| 53 | |jane| 30| 45000| 54 | |rafa| 40| 56000| 55 | | ted| 30| 145000| 56 | | xo2| 10|1332000| 57 | |mary| 44| 555000| 58 | +----+---+-------+ 59 | 60 | >>> # count the number of rows 61 | >>> df.count() 62 | 6 63 | 64 | 65 | >>> # Creates or replaces a local temporary view with this DataFrame 66 | >>> df.createOrReplaceTempView("people") 67 | 68 | >>> df2 = spark.sql("select * from people where salary > 67000") 69 | >>> df2.show() 70 | +----+---+-------+ 71 | |name|age| salary| 72 | +----+---+-------+ 73 | | ted| 
30| 145000| 74 | | xo2| 10|1332000| 75 | |mary| 44| 555000| 76 | +----+---+-------+ 77 | 78 | >>> df3 = spark.sql("select * from people where salary > 67000 and age > 11") 79 | >>> df3.show() 80 | +----+---+------+ 81 | |name|age|salary| 82 | +----+---+------+ 83 | | ted| 30|145000| 84 | |mary| 44|555000| 85 | +----+---+------+ 86 | 87 | >>> df.show() 88 | +----+---+-------+ 89 | |name|age| salary| 90 | +----+---+-------+ 91 | |alex| 20| 12000| 92 | |jane| 30| 45000| 93 | |rafa| 40| 56000| 94 | | ted| 30| 145000| 95 | | xo2| 10|1332000| 96 | |mary| 44| 555000| 97 | +----+---+-------+ 98 | 99 | >>> df4 = spark.sql("select * from people") 100 | >>> df4.show() 101 | +----+---+-------+ 102 | |name|age| salary| 103 | +----+---+-------+ 104 | |alex| 20| 12000| 105 | |jane| 30| 45000| 106 | |rafa| 40| 56000| 107 | | ted| 30| 145000| 108 | | xo2| 10|1332000| 109 | |mary| 44| 555000| 110 | +----+---+-------+ 111 | 112 | >>> cart = spark.sql("select * from people p1, people p2") 113 | >>> cart.show() 114 | +----+---+------+----+---+-------+ 115 | |name|age|salary|name|age| salary| 116 | +----+---+------+----+---+-------+ 117 | |alex| 20| 12000|alex| 20| 12000| 118 | |alex| 20| 12000|jane| 30| 45000| 119 | |alex| 20| 12000|rafa| 40| 56000| 120 | |alex| 20| 12000| ted| 30| 145000| 121 | |alex| 20| 12000| xo2| 10|1332000| 122 | |alex| 20| 12000|mary| 44| 555000| 123 | |jane| 30| 45000|alex| 20| 12000| 124 | |jane| 30| 45000|jane| 30| 45000| 125 | |jane| 30| 45000|rafa| 40| 56000| 126 | |jane| 30| 45000| ted| 30| 145000| 127 | |jane| 30| 45000| xo2| 10|1332000| 128 | |jane| 30| 45000|mary| 44| 555000| 129 | |rafa| 40| 56000|alex| 20| 12000| 130 | |rafa| 40| 56000|jane| 30| 45000| 131 | |rafa| 40| 56000|rafa| 40| 56000| 132 | |rafa| 40| 56000| ted| 30| 145000| 133 | |rafa| 40| 56000| xo2| 10|1332000| 134 | |rafa| 40| 56000|mary| 44| 555000| 135 | | ted| 30|145000|alex| 20| 12000| 136 | | ted| 30|145000|jane| 30| 45000| 137 | +----+---+------+----+---+-------+ 138 | only showing top 20 rows 139 | 140 | >>> cart 141 | >>> Frame[name: string, age: bigint, salary: bigint, name: string, age: bigint, salary: bigint] 142 | >>> 143 | 144 | >>> cart2 = spark.sql("select p1.name as name, p2.age as age, p1.salary as salary, p2.name as name2, p2.age as age2, p2.salary as salary2 from people p1, people p2") 145 | >>> cart2.show() 146 | +----+---+------+-----+----+-------+ 147 | |name|age|salary|name2|age2|salary2| 148 | +----+---+------+-----+----+-------+ 149 | |alex| 20| 12000| alex| 20| 12000| 150 | |alex| 30| 12000| jane| 30| 45000| 151 | |alex| 40| 12000| rafa| 40| 56000| 152 | |alex| 30| 12000| ted| 30| 145000| 153 | |alex| 10| 12000| xo2| 10|1332000| 154 | |alex| 44| 12000| mary| 44| 555000| 155 | |jane| 20| 45000| alex| 20| 12000| 156 | |jane| 30| 45000| jane| 30| 45000| 157 | |jane| 40| 45000| rafa| 40| 56000| 158 | |jane| 30| 45000| ted| 30| 145000| 159 | |jane| 10| 45000| xo2| 10|1332000| 160 | |jane| 44| 45000| mary| 44| 555000| 161 | |rafa| 20| 56000| alex| 20| 12000| 162 | |rafa| 30| 56000| jane| 30| 45000| 163 | |rafa| 40| 56000| rafa| 40| 56000| 164 | |rafa| 30| 56000| ted| 30| 145000| 165 | |rafa| 10| 56000| xo2| 10|1332000| 166 | |rafa| 44| 56000| mary| 44| 555000| 167 | | ted| 20|145000| alex| 20| 12000| 168 | | ted| 30|145000| jane| 30| 45000| 169 | +----+---+------+-----+----+-------+ 170 | only showing top 20 rows 171 | 172 | >>> 173 | >>> cart2 174 | DataFrame[name: string, age: bigint, salary: bigint, name2: string, age2: bigint, salary2: bigint] 
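NOTE: the same cartesian product can be built with the DataFrame API instead of SQL.
A minimal sketch using the df defined above; crossJoin() plus the "p1"/"p2" aliases are the only new names introduced here:

    from pyspark.sql.functions import col

    # cross join df with itself; alias the two sides so the columns can be told apart
    cart3 = df.alias("p1").crossJoin(df.alias("p2")).select(
        col("p1.name").alias("name"),
        col("p1.age").alias("age"),
        col("p1.salary").alias("salary"),
        col("p2.name").alias("name2"),
        col("p2.age").alias("age2"),
        col("p2.salary").alias("salary2"))

    # cart3.count() == df.count() * df.count()  (6 * 6 = 36 rows)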
-------------------------------------------------------------------------------- /tutorial/pyspark-examples/dataframes/dataframe-session-2022-05-19-Converting-DataFrame-to-RDD.txt: -------------------------------------------------------------------------------- 1 | This demo shows how to convert 2 | 1. a DataFrame to an RDD 3 | 2. an RDD to a DataFrame 4 | 5 | 6 | ~ % /Users/mparsian/spark-3.2.1/bin/pyspark 7 | Python 3.8.9 (default, Jul 19 2021, 09:37:32) 8 | Welcome to Spark version 3.2.1 9 | 10 | Spark context Web UI available at http://10.0.0.234:4041 11 | Spark context available as 'sc' (master = local[*], app id = local-1653016254174). 12 | SparkSession available as 'spark'. 13 | >>> data = [('alex', 'sales', 23000), ('jane', 'HR', 29000), ('bob', 'sales', 43000),('mary', 'HR', 93000)] 14 | >>> data 15 | [('alex', 'sales', 23000), ('jane', 'HR', 29000), ('bob', 'sales', 43000), ('mary', 'HR', 93000)] 16 | >>> df = spark.createDataFrame(data, ['name', 'dept', 'salary']) 17 | >>> df.show() 18 | +----+-----+------+ 19 | |name| dept|salary| 20 | +----+-----+------+ 21 | |alex|sales| 23000| 22 | |jane| HR| 29000| 23 | | bob|sales| 43000| 24 | |mary| HR| 93000| 25 | +----+-----+------+ 26 | 27 | >>> df.printSchema() 28 | root 29 | |-- name: string (nullable = true) 30 | |-- dept: string (nullable = true) 31 | |-- salary: long (nullable = true) 32 | 33 | >>> rdd5 = df.rdd 34 | >>> rdd5.collect() 35 | [ 36 | Row(name='alex', dept='sales', salary=23000), 37 | Row(name='jane', dept='HR', salary=29000), 38 | Row(name='bob', dept='sales', salary=43000), 39 | Row(name='mary', dept='HR', salary=93000) 40 | ] 41 | >>> 42 | >>> df2 = rdd5.toDF() 43 | >>> df2.show() 44 | +----+-----+------+ 45 | |name| dept|salary| 46 | +----+-----+------+ 47 | |alex|sales| 23000| 48 | |jane| HR| 29000| 49 | | bob|sales| 43000| 50 | |mary| HR| 93000| 51 | +----+-----+------+ 52 | 53 | >>> from pyspark.sql import Row 54 | >>> # NOTE: to convert an RDD into a DataFrame, 55 | >>> # each Row() must have the same column names: 56 | >>> rows = 57 | [ 58 | Row(name='alex', dept='sales', salary=23000), 59 | Row(name='jane', dept='HR', salary=29000, address='123 main street') 60 | ] 61 | >>> rdd = sc.parallelize(rows) 62 | >>> rdd.collect() 63 | [Row(name='alex', dept='sales', salary=23000), Row(name='jane', dept='HR', salary=29000, address='123 main street')] 64 | >>> df44 = rdd.toDF() 65 | >>> df44.show() 66 | 22/05/19 20:21:51 ERROR Executor: Exception in task 10.0 in stage 15.0 (TID 100) 67 | java.lang.IllegalStateException: Input row doesn't have expected number of values required by the schema. 3 fields are required while 4 values are provided. 68 | ... 
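>>> # the IllegalStateException above is expected: toDF() infers the schema from the
>>> # first Row(s), so a Row with a different set of fields cannot fit that schema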
69 | >>> # create Row()'s which have the same columns 70 | >>> rows = 71 | [ 72 | Row(name='alex', dept='sales', salary=23000, address=None), 73 | Row(name='jane', dept='HR', salary=29000, address='123 main street') 74 | ] 75 | >>> rdd = sc.parallelize(rows) 76 | >>> df44 = rdd.toDF() 77 | >>> df44.show() 78 | +----+-----+------+---------------+ 79 | |name| dept|salary| address| 80 | +----+-----+------+---------------+ 81 | |alex|sales| 23000| null| 82 | |jane| HR| 29000|123 main street| 83 | +----+-----+------+---------------+ 84 | 85 | >>> 86 | >>> some_data = [('alex', 10), ('jane', 20)] 87 | >>> rdd3 = sc.parallelize(some_data) 88 | >>> rdd3.collect() 89 | [('alex', 10), ('jane', 20)] 90 | >>> rdd3_with_rows = rdd3.map(lambda x: Row(name=x[0], age=x[1])) 91 | >>> rdd3_with_rows.collect() 92 | [Row(name='alex', age=10), Row(name='jane', age=20)] 93 | >>> df3 = rdd3_with_rows.toDF() 94 | >>> df3.show() 95 | +----+---+ 96 | |name|age| 97 | +----+---+ 98 | |alex| 10| 99 | |jane| 20| 100 | +----+---+ 101 | -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/combineByKey_example.py: -------------------------------------------------------------------------------- 1 | Problem: Given a set of (K, V) pairs, 2 | find (sum, count, min, max) per key using 3 | the combineByKey() transformation. 4 | 5 | ~/spark-2.4.4 $ ./bin/pyspark 6 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 7 | [Clang 6.0 (clang-600.0.57)] on darwin 8 | Type "help", "copyright", "credits" or "license" for more information. 9 | Welcome to 10 | ____ __ 11 | / __/__ ___ _____/ /__ 12 | _\ \/ _ \/ _ `/ __/ '_/ 13 | /__ / .__/\_,_/_/ /_/\_\ version 2.4.4 14 | /_/ 15 | 16 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 17 | SparkSession available as 'spark'. 18 | >>> 19 | 20 | >>> 21 | >>> spark 22 | 23 | >>> data = [('A', 2), ('A', 3), ('A', 4), ('A', 5), ('B', 6), ('B', 7), ('B', 8) ] 24 | >>> data 25 | [('A', 2), ('A', 3), ('A', 4), ('A', 5), ('B', 6), ('B', 7), ('B', 8)] 26 | >>> rdd = spark.sparkContext.parallelize(data) 27 | >>> 28 | >>> 29 | >>> rdd.count() 30 | 7 31 | >>> rdd.collect() 32 | [('A', 2), ('A', 3), ('A', 4), ('A', 5), ('B', 6), ('B', 7), ('B', 8)] 33 | >>> # (K, (sum, count, min, max)) 34 | ... 35 | >>> def single(v): 36 | ... return (v, 1, v, v) 37 | ... 38 | >>> def merge(C, v): 39 | ... return (C[0]+v, C[1]+1, min(C[2],v), max(C[3],v)) 40 | ... 41 | >>> def combine(C1, C2): 42 | ... return (C1[0]+C2[0], C1[1]+C2[1], min(C1[2], C2[2]), max(C1[3], C2[3]) ) 43 | ... 
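>>> # single(v)      is the createCombiner: turns the first value v seen for a key into (sum, count, min, max)
>>> # merge(C, v)    is the mergeValue: folds one more value v for the same key into an existing combiner C
>>> # combine(C1, C2) is the mergeCombiners: merges two combiners built on different partitions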
44 | >>> rdd2 = rdd.combineByKey(single, merge, combine) 45 | >>> rdd2.collect() 46 | [ 47 | ('B', (21, 3, 6, 8)), 48 | ('A', (14, 4, 2, 5)) 49 | ] 50 | 51 | -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/count_min_max.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import sys 4 | 5 | from pyspark.sql import SparkSession 6 | 7 | # 8 | print ("This is the name of the script: ", sys.argv[0]) 9 | print ("Number of arguments: ", len(sys.argv)) 10 | print ("The arguments are: " , str(sys.argv)) 11 | # 12 | 13 | # DEFINE your input path 14 | input_path = sys.argv[1] 15 | print("input_path: ", input_path) 16 | 17 | 18 | # CREATE an instance of a SparkSession object 19 | spark = SparkSession\ 20 | .builder\ 21 | .appName("PythonWordCount")\ 22 | .getOrCreate() 23 | 24 | # CREATE a new RDD[String] 25 | #lines = spark.sparkContext.textFile(input_path) 26 | # APPLY a SET of TRANSFORMATIONS... 27 | 28 | #------------------------------------------- 29 | def minmax(partition): 30 | first_time = False 31 | #count 32 | #min2 33 | #max2 34 | for x in partition: 35 | if (first_time == False): 36 | count = 1 37 | min2 = x 38 | max2 = x 39 | first_time = True 40 | else: 41 | count = count + 1 42 | max2 = max(x, max2) 43 | min2 = min(x, min2) 44 | #end-for 45 | # 46 | return [(count, min2, max2)] 47 | #end-def 48 | #--------------------- 49 | def iterate_partition(partition): 50 | elements = [] 51 | for x in partition: 52 | elements.append(x) 53 | print("elements=", elements) 54 | #print ("==================") 55 | #end-def 56 | #------------------------- 57 | def add3(t1, t2): 58 | count = t1[0] + t2[0] 59 | min2 = min(t1[1], t2[1]) 60 | max2 = max(t1[2], t2[2]) 61 | return (count, min2, max2) 62 | #end-def 63 | 64 | data = [10, 20, 30, 44, 55, 3, 4, 60, 50, 5, 2, 2, 20, 20, 10, 30, 70] 65 | print("data=", data) 66 | print("==============") 67 | 68 | # 69 | rdd = spark.sparkContext.parallelize(data, 4) 70 | print("rdd.collect()=", rdd.collect()) 71 | print("==============") 72 | # 73 | rdd.foreachPartition(iterate_partition) 74 | print("==============") 75 | # 76 | 77 | count_min_max_rdd = rdd.mapPartitions(minmax) 78 | print("minmax_rdd.collect()=", count_min_max_rdd.collect()) 79 | 80 | final_triplet = count_min_max_rdd.reduce(add3) 81 | print("final_triplet=", final_triplet) 82 | 83 | spark.stop() 84 | -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2015-03-13.txt: -------------------------------------------------------------------------------- 1 | pyspark-tutorial- 2 | pyspark-tutorial provides basic algorithms using pyspark 3 | 4 | interactive session: valid and tested: Feb. 23, 2015 5 | 6 | mparsian@Mahmouds-MacBook:~/zmp/BigData-MapReduce-Course/pyspark# cat data.txt 7 | crazy crazy fox jumped 8 | crazy fox jumped 9 | fox is fast 10 | fox is smart 11 | dog is smart 12 | 13 | SPARK_HOME=~/zmp/zs/spark-1.2.0 14 | mparsian@Mahmouds-MacBook:~/zmp/BigData-MapReduce-Course/pyspark# ~/zmp/zs/spark-1.2.0/bin/pyspark 15 | Python 2.6.9 (unknown, Sep 9 2014, 15:05:12) 16 | [GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.39)] on darwin 17 | Type "help", "copyright", "credits" or "license" for more information. 
18 | 19 | Welcome to 20 | ____ __ 21 | / __/__ ___ _____/ /__ 22 | _\ \/ _ \/ _ `/ __/ '_/ 23 | /__ / .__/\_,_/_/ /_/\_\ version 1.2.0 24 | /_/ 25 | 26 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12) 27 | SparkContext available as sc. 28 | >>> sc 29 | 30 | >>> lines = sc.textFile("data.txt", 1) 31 | >>> debuglines = lines.collect(); 32 | >>> debuglines 33 | [u'crazy crazy fox jumped', u'crazy fox jumped', u'fox is fast', u'fox is smart', u'dog is smart'] 34 | >>> words = lines.flatMap(lambda x: x.split(' ')) 35 | >>> debugwords = words.collect(); 36 | >>> debugwords 37 | [u'crazy', u'crazy', u'fox', u'jumped', u'crazy', u'fox', u'jumped', u'fox', u'is', u'fast', u'fox', u'is', u'smart', u'dog', u'is', u'smart'] 38 | >>> ones = words.map(lambda x: (x, 1)) 39 | >>> debugones = ones.collect() 40 | >>> debugones 41 | [(u'crazy', 1), (u'crazy', 1), (u'fox', 1), (u'jumped', 1), (u'crazy', 1), (u'fox', 1), (u'jumped', 1), (u'fox', 1), (u'is', 1), (u'fast', 1), (u'fox', 1), (u'is', 1), (u'smart', 1), (u'dog', 1), (u'is', 1), (u'smart', 1)] 42 | >>> counts = ones.reduceByKey(lambda x, y: x + y) 43 | >>> debugcounts = counts.collect() 44 | >>> debugcounts 45 | [(u'crazy', 3), (u'jumped', 2), (u'is', 3), (u'fox', 4), (u'dog', 1), (u'fast', 1), (u'smart', 2)] 46 | >>> 47 | 48 | >>> grouped = ones.groupByKey(); 49 | >>> debuggrouped = grouped.collect(); 50 | 51 | >>> counts.saveAsTextFile("output.txt") 52 | 53 | mparsian@Mahmouds-MacBook:~/zmp/BigData-MapReduce-Course/pyspark# cat output.txt/part* 54 | (u'crazy', 3) 55 | (u'jumped', 2) 56 | (u'is', 3) 57 | (u'fox', 4) 58 | (u'dog', 1) 59 | (u'fast', 1) 60 | (u'smart', 2) -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2015-04-10.txt: -------------------------------------------------------------------------------- 1 | First session on PySpark 2 | 3 | mparsian@Mahmouds-MacBook-2:~/spark-1.3.0/bin# cat zfox_data.txt 4 | crazy red fox ran fast 5 | red fox jumped very very high 6 | red fox is very crazy 7 | red fox ran very fast 8 | mparsian@Mahmouds-MacBook-2:~/spark-1.3.0/bin# 9 | 10 | mparsian@Mahmouds-MacBook-2:~/spark-1.3.0/bin# ./pyspark 11 | Python 2.6.9 (unknown, Sep 9 2014, 15:05:12) 12 | Welcome to 13 | ____ __ 14 | / __/__ ___ _____/ /__ 15 | _\ \/ _ \/ _ `/ __/ '_/ 16 | /__ / .__/\_,_/_/ /_/\_\ version 1.3.0 17 | /_/ 18 | 19 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12) 20 | SparkContext available as sc, SQLContext available as sqlCtx. 
21 | >>> 22 | >>> sc 23 | 24 | >>> 25 | >>> lines = sc.textFile("zfox_data.txt") 26 | >>> 27 | >>> lines.collect() 28 | [u'crazy red fox ran fast', u'red fox jumped very very high', u'red fox is very crazy', u'red fox ran very fast'] 29 | >>> 30 | >>> lines.count() 31 | 4 32 | >>> 33 | >>> words = lines.flatMap(lambda x: x.split(' ')) 34 | >>> 35 | >>> words.collect() 36 | [u'crazy', u'red', u'fox', u'ran', u'fast', u'red', u'fox', u'jumped', u'very', u'very', u'high', u'red', u'fox', u'is', u'very', u'crazy', u'red', u'fox', u'ran', u'very', u'fast'] 37 | >>> 38 | >>> words.count() 39 | 21 40 | >>> 41 | >>> ones = words.map(lambda x: (x, 1)) 42 | >>> 43 | >>> ones.collect() 44 | [(u'crazy', 1), (u'red', 1), (u'fox', 1), (u'ran', 1), (u'fast', 1), (u'red', 1), (u'fox', 1), (u'jumped', 1), (u'very', 1), (u'very', 1), (u'high', 1), (u'red', 1), (u'fox', 1), (u'is', 1), (u'very', 1), (u'crazy', 1), (u'red', 1), (u'fox', 1), (u'ran', 1), (u'very', 1), (u'fast', 1)] 45 | >>> 46 | >>> counts = ones.reduceByKey(lambda x, y: x + y) 47 | >>> 48 | >>> 49 | >>> counts.collect() 50 | [(u'crazy', 2), (u'ran', 2), (u'is', 1), (u'fox', 4), (u'fast', 2), (u'high', 1), (u'very', 4), (u'red', 4), (u'jumped', 1)] 51 | >>> 52 | -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2018-01-18.txt: -------------------------------------------------------------------------------- 1 | mparsian@mahmoudsmacbook ~/spark-2.2.1 $ source zbin/zenv_setup.sh 2 | mparsian@mahmoudsmacbook ~/spark-2.2.1 $ ./bin/pyspark 3 | Python 2.7.10 (default, Feb 7 2017, 00:08:15) 4 | [GCC 4.2.1 Compatible Apple LLVM 8.0.0 (clang-800.0.34)] on darwin 5 | Type "help", "copyright", "credits" or "license" for more information. 6 | Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties 7 | Setting default log level to "WARN". 8 | Welcome to 9 | ____ __ 10 | / __/__ ___ _____/ /__ 11 | _\ \/ _ \/ _ `/ __/ '_/ 12 | /__ / .__/\_,_/_/ /_/\_\ version 2.2.1 13 | /_/ 14 | 15 | Using Python version 2.7.10 (default, Feb 7 2017 00:08:15) 16 | SparkSession available as 'spark'. 
17 | >>> spark 18 | 19 | >>> 20 | >>> 21 | >>> 22 | >>> 23 | >>> spark 24 | 25 | >>> 26 | >>> 27 | >>> sc = spark.sparkContext 28 | >>> 29 | >>> sc 30 | 31 | >>> 32 | >>> 33 | >>> rdd = sc.textFile("file:///Users/mparsian/zmp/github/pyspark-tutorial/data/foxdata.txt") 34 | >>> 35 | >>> 36 | >>> rdd 37 | file:///Users/mparsian/zmp/github/pyspark-tutorial/data/foxdata.txt MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0 38 | >>> 39 | >>> 40 | >>> rdd.count() 41 | 3 42 | >>> rdd.collect() 43 | [u'red fox jumped high', u'fox jumped over high fence', u'red fox jumped'] 44 | >>> rdd.take(2) 45 | [u'red fox jumped high', u'fox jumped over high fence'] 46 | >>> rdd.take(1) 47 | [u'red fox jumped high'] 48 | >>> rdd.collect() 49 | [u'red fox jumped high', u'fox jumped over high fence', u'red fox jumped'] 50 | >>> 51 | >>> 52 | 53 | >>> rdd2 = rdd.map(lambda x : (x, len(x))) 54 | >>> rdd2.collect() 55 | [(u'red fox jumped high', 19), (u'fox jumped over high fence', 26), (u'red fox jumped', 14)] 56 | >>> rdd2 = rdd.map(lambda x : (x, len(x), len(x)-2)) 57 | >>> 58 | >>> rdd2.collect() 59 | [(u'red fox jumped high', 19, 17), (u'fox jumped over high fence', 26, 24), (u'red fox jumped', 14, 12)] 60 | >>> rdd3 = rdd.map(lambda x : (x, len(x), len(x)-2)) 61 | >>> 62 | >>> 63 | >>> rdd3.collect() 64 | [(u'red fox jumped high', 19, 17), (u'fox jumped over high fence', 26, 24), (u'red fox jumped', 14, 12)] 65 | >>> 66 | >>> 67 | >>> rdd4 = rdd.map(lambda x : (len(x), x, x)) 68 | >>> rdd4.collect() 69 | [(19, u'red fox jumped high', u'red fox jumped high'), (26, u'fox jumped over high fence', u'fox jumped over high fence'), (14, u'red fox jumped', u'red fox jumped')] 70 | >>> 71 | >>> 72 | >>> 73 | >>> rdd.collect() 74 | [u'red fox jumped high', u'fox jumped over high fence', u'red fox jumped'] 75 | >>> rdd2 = rdd.flatMap(lambda x: x.split(" ")) 76 | >>> rdd2.collect() 77 | [u'red', u'fox', u'jumped', u'high', u'fox', u'jumped', u'over', u'high', u'fence', u'red', u'fox', u'jumped'] 78 | >>> rdd2.count() 79 | 12 80 | >>> 81 | >>> 82 | >>> pairs = rdd2.map(lambda w : (w, 1)) 83 | >>> pairs.count() 84 | 12 85 | >>> pairs.collect() 86 | [(u'red', 1), (u'fox', 1), (u'jumped', 1), (u'high', 1), (u'fox', 1), (u'jumped', 1), (u'over', 1), (u'high', 1), (u'fence', 1), (u'red', 1), (u'fox', 1), (u'jumped', 1)] 87 | >>> 88 | >>> 89 | 90 | >>> 91 | >>> pairs.collect() 92 | [(u'red', 1), (u'fox', 1), (u'jumped', 1), (u'high', 1), (u'fox', 1), (u'jumped', 1), (u'over', 1), (u'high', 1), (u'fence', 1), (u'red', 1), (u'fox', 1), (u'jumped', 1)] 93 | >>> grouped = pairs.groupByKey() 94 | >>> grouped.collect() 95 | [(u'high', ), (u'over', ), (u'fox', ), (u'red', ), (u'fence', ), (u'jumped', )] 96 | >>> grouped.mapValues(lambda iter : list(iter)).collect() 97 | [(u'high', [1, 1]), (u'over', [1]), (u'fox', [1, 1, 1]), (u'red', [1, 1]), (u'fence', [1]), (u'jumped', [1, 1, 1])] 98 | >>> 99 | >>> freq = grouped.mapValues(lambda iter: sum(iter)) 100 | >>> freq.collect() 101 | [(u'high', 2), (u'over', 1), (u'fox', 3), (u'red', 2), (u'fence', 1), (u'jumped', 3)] 102 | >>> freq.collectAsHashMap() 103 | Traceback (most recent call last): 104 | File "", line 1, in 105 | AttributeError: 'PipelinedRDD' object has no attribute 'collectAsHashMap' 106 | >>> freq.collectAsHashMap 107 | Traceback (most recent call last): 108 | File "", line 1, in 109 | AttributeError: 'PipelinedRDD' object has no attribute 'collectAsHashMap' 110 | >>> freq.collectAsMap 111 | :1> 112 | >>> freq.collectAsMap() 113 | {u'fence': 1, u'jumped': 3, 
u'over': 1, u'fox': 3, u'high': 2, u'red': 2} 114 | >>> 115 | >>> 116 | >>> 117 | >>> 118 | >>> 119 | >>> pairs.collect() 120 | [(u'red', 1), (u'fox', 1), (u'jumped', 1), (u'high', 1), (u'fox', 1), (u'jumped', 1), (u'over', 1), (u'high', 1), (u'fence', 1), (u'red', 1), (u'fox', 1), (u'jumped', 1)] 121 | >>> freq = pairs.reduceByKey(lambda x, y: x+y) 122 | >>> freq.collectAsMap() 123 | {u'fence': 1, u'jumped': 3, u'over': 1, u'fox': 3, u'high': 2, u'red': 2} 124 | >>> -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2018-10-02.txt: -------------------------------------------------------------------------------- 1 | mparsian@Mahmouds-MacBook ~/spark-2.3.0 $ ./zbin/zenv_setup.sh 2 | mparsian@Mahmouds-MacBook ~/spark-2.3.0 $ ./bin/pyspark 3 | Python 2.7.10 (default, Oct 6 2017, 22:29:07) 4 | [GCC 4.2.1 Compatible Apple LLVM 9.0.0 (clang-900.0.31)] on darwin 5 | Type "help", "copyright", "credits" or "license" for more information. 6 | 18/10/02 15:50:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable 7 | Welcome to 8 | ____ __ 9 | / __/__ ___ _____/ /__ 10 | _\ \/ _ \/ _ `/ __/ '_/ 11 | /__ / .__/\_,_/_/ /_/\_\ version 2.3.0 12 | /_/ 13 | 14 | Using Python version 2.7.10 (default, Oct 6 2017 22:29:07) 15 | SparkSession available as 'spark'. 16 | >>> 17 | >>> 18 | >>> 19 | >>> 20 | >>> spark 21 | 22 | >>> spark.sparkContext 23 | 24 | >>> 25 | >>> spark.version 26 | u'2.3.0' 27 | >>> 28 | >>> 29 | >>> 30 | >>> 31 | >>> 32 | >>> 33 | >>> input_path = "/Users/mparsian/spark-2.3.0/myinput.txt" 34 | >>> myrdd = spark.sparkContext.textFile(input_path) 35 | >>> rdd.count() 36 | Traceback (most recent call last): 37 | File "", line 1, in 38 | NameError: name 'rdd' is not defined 39 | >>> myrdd.count() 40 | 3 41 | >>> myrdd.collect() 42 | [u'this is record 1', u'this is record 2', u'this is record 3'] 43 | >>> 44 | >>> 45 | >>> def tokenize(rec): 46 | ... tokens = rec.split() 47 | ... return tokens 48 | ... 49 | >>> 50 | >>> rec33 = "this is it" 51 | >>> mytokens = tokenize(rec33) 52 | >>> mytokens 53 | ['this', 'is', 'it'] 54 | >>> 55 | >>> 56 | >>> words = myrdd.flatMap(lambda record: tokenize(record)) 57 | >>> words.collect() 58 | [u'this', u'is', u'record', u'1', u'this', u'is', u'record', u'2', u'this', u'is', u'record', u'3'] 59 | >>> words.count() 60 | 12 61 | >>> 62 | >>> duplicated = myrdd.map(lambda rec: rec + ";" rec) 63 | File "", line 1 64 | duplicated = myrdd.map(lambda rec: rec + ";" rec) 65 | ^ 66 | SyntaxError: invalid syntax 67 | >>> duplicated = myrdd.map(lambda rec: rec + ";" + rec) 68 | >>> duplicated.count() 69 | 3 70 | >>> duplicated.collect() 71 | [u'this is record 1;this is record 1', u'this is record 2;this is record 2', u'this is record 3;this is record 3'] 72 | >>> 73 | >>> def myconcat(rec): 74 | ... return rec + ";" + rec 75 | ... 
76 | >>> 77 | >>> z = myconcat("testing") 78 | >>> z 79 | 'testing;testing' 80 | >>> duplicated2 = myrdd.map(myconcat) 81 | >>> duplicated2.count() 82 | 3 83 | >>> duplicated2.collect() 84 | [u'this is record 1;this is record 1', u'this is record 2;this is record 2', u'this is record 3;this is record 3'] 85 | >>> 86 | >>> 87 | >>> 88 | >>> words.collect() 89 | [u'this', u'is', u'record', u'1', u'this', u'is', u'record', u'2', u'this', u'is', u'record', u'3'] 90 | >>> words.count() 91 | 12 92 | >>> pairs = words.map(lambda w: (w, 1)) 93 | >>> pairs.collect() 94 | [(u'this', 1), (u'is', 1), (u'record', 1), (u'1', 1), (u'this', 1), (u'is', 1), (u'record', 1), (u'2', 1), (u'this', 1), (u'is', 1), (u'record', 1), (u'3', 1)] 95 | >>> pairs.count() 96 | 12 97 | >>> freq = pairs.reduceByKey(lambda x, y : x+y) 98 | >>> freq.collect() 99 | [(u'this', 3), (u'1', 1), (u'is', 3), (u'3', 1), (u'record', 3), (u'2', 1)] 100 | >>> -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2018-10-09.txt: -------------------------------------------------------------------------------- 1 | mparsian@Mahmouds-MacBook ~/spark-2.3.0 $ ./zbin/zenv_setup.sh 2 | mparsian@Mahmouds-MacBook ~/spark-2.3.0 $ ./bin/pyspark 3 | Python 2.7.10 (default, Oct 6 2017, 22:29:07) 4 | [GCC 4.2.1 Compatible Apple LLVM 9.0.0 (clang-900.0.31)] on darwin 5 | Type "help", "copyright", "credits" or "license" for more information. 6 | 18/10/09 18:04:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable 7 | Welcome to 8 | ____ __ 9 | / __/__ ___ _____/ /__ 10 | _\ \/ _ \/ _ `/ __/ '_/ 11 | /__ / .__/\_,_/_/ /_/\_\ version 2.3.0 12 | /_/ 13 | 14 | Using Python version 2.7.10 (default, Oct 6 2017 22:29:07) 15 | SparkSession available as 'spark'. 16 | >>> 17 | >>> 18 | >>> spark 19 | 20 | >>> 21 | >>> 22 | >>> 23 | >>> 24 | >>> data = [1, -3, 4, 2, -5, 2] 25 | >>> data 26 | [1, -3, 4, 2, -5, 2] 27 | >>> rdd = spark.sparkContext.parallalize(data) 28 | Traceback (most recent call last): 29 | File "", line 1, in 30 | AttributeError: 'SparkContext' object has no attribute 'parallalize' 31 | >>> rdd = spark.sparkContext.parallelize(data) 32 | >>> rdd.count() 33 | 6 34 | >>> rdd.collect() 35 | [1, -3, 4, 2, -5, 2] 36 | >>> 37 | >>> def myfun(n): 38 | ... mylist = [] 39 | ... if n > 0: 40 | ... mylist.append(100) 41 | ... mylist.append(200) 42 | ... else: 43 | ... mylist.append(0) 44 | ... # 45 | ... return mylist 46 | ... 47 | >>> 48 | >>> x = myfun(3) 49 | >>> x 50 | [100, 200] 51 | >>> y = myfun(-55) 52 | >>> y 53 | [0] 54 | >>> 55 | >>> rdd2 = rdd.flatMap(myfun) 56 | >>> rdd.collect() 57 | [1, -3, 4, 2, -5, 2] 58 | >>> rdd2.collect() 59 | [100, 200, 0, 100, 200, 100, 200, 0, 100, 200] 60 | >>> rdd2.count() 61 | 10 62 | >>> 63 | >>> 64 | >>> 65 | >>> rdd3 = rdd2.filter(lambda x : x > 100) 66 | >>> rdd3.collect() 67 | [200, 200, 200, 200] 68 | >>> 69 | >>> rdd4 = rdd2.filter(lambda x : x > 10) 70 | >>> rdd4.collect() 71 | [100, 200, 100, 200, 100, 200, 100, 200] 72 | >>> 73 | >>> 74 | >>> def keep100(n): 75 | ... if n > 100: 76 | ... return True 77 | ... else: 78 | ... return False 79 | ... 
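>>> # keep100() is a named predicate: filter() keeps the elements for which it returns True,
>>> # exactly like the lambda-based filters above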
80 | >>> 81 | >>> rdd5 = rdd2.filter(keep100) 82 | >>> rdd5.collect() 83 | [200, 200, 200, 200] 84 | >>> 85 | >>> 86 | >>> rdd2.collect() 87 | [100, 200, 0, 100, 200, 100, 200, 0, 100, 200] 88 | >>> rdd6 = rdd.map(lambda x : x+1000) 89 | >>> rdd6.collect() 90 | [1001, 997, 1004, 1002, 995, 1002] 91 | >>> 92 | >>> def myadder(n): 93 | ... if n > 0: 94 | ... return n+1000 95 | ... else: 96 | ... return n 97 | ... 98 | >>> 99 | >>> rdd2.collect() 100 | [100, 200, 0, 100, 200, 100, 200, 0, 100, 200] 101 | >>> rdd7 = rdd2.map(myadder) 102 | >>> rdd7.collect() 103 | [1100, 1200, 0, 1100, 1200, 1100, 1200, 0, 1100, 1200] 104 | >>> 105 | >>> 106 | >>> 107 | >>> 108 | >>> 109 | >>> 110 | >>> rdd2.collect() 111 | [100, 200, 0, 100, 200, 100, 200, 0, 100, 200] 112 | >>> mysum = rdd2.reduce(lambda x,y: x+y) 113 | >>> mysum 114 | 1200 115 | >>> 116 | >>> 117 | >>> 118 | >>> 119 | >>> 120 | >>> pairs = [("a", 2), ("b", 3), ("a", 3), ("b", 4), ("a", 7), ("b", 10), ("c", 7), ("c", 1)] 121 | >>> 122 | >>> pairs 123 | [('a', 2), ('b', 3), ('a', 3), ('b', 4), ('a', 7), ('b', 10), ('c', 7), ('c', 1)] 124 | >>> 125 | >>> pairs_rdd = spark.sparkContext.parallelize(pairs) 126 | >>> pairs_rdd.count() 127 | 8 128 | >>> pairs_rdd.collect() 129 | [('a', 2), ('b', 3), ('a', 3), ('b', 4), ('a', 7), ('b', 10), ('c', 7), ('c', 1)] 130 | >>> 131 | >>> 132 | >>> grouped = pairs_rdd.groupByKey() 133 | >>> grouped.collect() 134 | [('a', ), ('c', ), ('b', )] 135 | >>> grouped.mapValues(lambda it: list(it)).collect() 136 | [('a', [2, 3, 7]), ('c', [7, 1]), ('b', [3, 4, 10])] 137 | >>> 138 | >>> incby100 = pairs_rdd.mapValues(lambda x : x+100) 139 | >>> incby100.collect() 140 | [('a', 102), ('b', 103), ('a', 103), ('b', 104), ('a', 107), ('b', 110), ('c', 107), ('c', 101)] 141 | >>> incby1000 = pairs_rdd.map(lambda (k,v) : (k, v+1000)) 142 | >>> incby1000.collect() 143 | [('a', 1002), ('b', 1003), ('a', 1003), ('b', 1004), ('a', 1007), ('b', 1010), ('c', 1007), ('c', 1001)] 144 | >>> 145 | >>> 146 | >>> grouped.collect() 147 | [('a', ), ('c', ), ('b', )] 148 | >>> 149 | >>> average = grouped.mapValues(lambda it: sum(it)/len(it)) 150 | >>> average.collect() 151 | [('a', 4), ('c', 4), ('b', 5)] 152 | >>> average = grouped.mapValues(lambda it: float(sum(it))/float(len(it))) 153 | >>> average.collect() 154 | [('a', 4.0), ('c', 4.0), ('b', 5.666666666666667)] 155 | >>> -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2019-01-30.txt: -------------------------------------------------------------------------------- 1 | $ ./bin/pyspark 2 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 3 | [Clang 6.0 (clang-600.0.57)] on darwin 4 | Setting default log level to "WARN". 5 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 6 | Welcome to 7 | ____ __ 8 | / __/__ ___ _____/ /__ 9 | _\ \/ _ \/ _ `/ __/ '_/ 10 | /__ / .__/\_,_/_/ /_/\_\ version 2.4.0 11 | /_/ 12 | 13 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 14 | SparkSession available as 'spark'. 15 | >>> 16 | >>> spark 17 | 18 | >>> 19 | >>> 20 | >>> pairs = [("alex", 100, 1), ("jane", 200, 3), ("ted", 300, 3)] 21 | >>> pairs 22 | [('alex', 100, 1), ('jane', 200, 3), ('ted', 300, 3)] 23 | >>> 24 | >>> rdd = spark.sparkContext.parallelize(pairs) 25 | >>> rdd.collect() 26 | [('alex', 100, 1), ('jane', 200, 3), ('ted', 300, 3)] 27 | >>> rdd.count() 28 | 3 29 | >>> def find_average(record): 30 | ... return record[1]/record[2] 31 | ... 
32 | >>> 33 | >>> x = ('jane', 200, 3) 34 | >>> y = find_average(x) 35 | >>> y 36 | 66.66666666666667 37 | >>> x = ('ted', 300, 3) 38 | >>> y = find_average(x) 39 | >>> y 40 | 100.0 41 | >>> rdd.collect() 42 | [('alex', 100, 1), ('jane', 200, 3), ('ted', 300, 3)] 43 | >>> rdd2 = rdd.map(find_average) 44 | >>> rdd2.collect() 45 | [100.0, 66.66666666666667, 100.0] 46 | >>> def find_average(record): 47 | ... return (record[0], record[1]/record[2]) 48 | ... 49 | >>> 50 | >>> x = ('jane', 200, 3) 51 | >>> y = find_average(x) 52 | >>> y 53 | ('jane', 66.66666666666667) 54 | >>> rdd2 = rdd.map(find_average) 55 | >>> rdd2.collect() 56 | [('alex', 100.0), ('jane', 66.66666666666667), ('ted', 100.0)] 57 | >>> def find_average22(record): 58 | ... return [(record[0], record[1]/record[2])] 59 | ... 60 | >>> x = ('ted', 300, 3) 61 | >>> y = find_average22(x) 62 | >>> y 63 | [('ted', 100.0)] 64 | >>> 65 | >>> 66 | >>> rdd3 = rdd.flatMap(find_average22) 67 | >>> rdd3.collect() 68 | [('alex', 100.0), ('jane', 66.66666666666667), ('ted', 100.0)] 69 | >>> 70 | >>> 71 | >>> 72 | >>> numbers = [1, 2, 3, 4, 5, 6] 73 | >>> rdd4 = spark.sparkContext.parallelize(numbers) 74 | >>> rdd4.count() 75 | 6 76 | >>> rdd.collect() 77 | [('alex', 100, 1), ('jane', 200, 3), ('ted', 300, 3)] 78 | >>> rdd4.collect() 79 | [1, 2, 3, 4, 5, 6] 80 | >>> 81 | >>> 82 | >>> mysum = rdd4.reduce(lambda x, y: x+7) 83 | >>> mysum 84 | 36 85 | >>> rdd5 = rdd4.map(lambda x : x +7) 86 | >>> rdd5.collect() 87 | [8, 9, 10, 11, 12, 13] 88 | >>> rdd5 89 | PythonRDD[8] at collect at :1 90 | >>> rdd4 91 | ParallelCollectionRDD[5] at parallelize at PythonRDD.scala:195 92 | >>> 93 | -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2019-04-16.txt: -------------------------------------------------------------------------------- 1 | mparsian@Mahmouds-MacBook ~/spark-2.4.0 $ pwd 2 | /Users/mparsian/spark-2.4.0 3 | 4 | mparsian@Mahmouds-MacBook ~/spark-2.4.0 $ ls -l 5 | -rw-r--r--@ 1 mparsian 897801646 21357 Oct 28 23:36 LICENSE 6 | -rw-r--r--@ 1 mparsian 897801646 42919 Oct 28 23:36 NOTICE 7 | drwxr-xr-x@ 3 mparsian 897801646 96 Oct 28 23:36 R 8 | -rw-r--r--@ 1 mparsian 897801646 3952 Oct 28 23:36 README.md 9 | -rw-r--r--@ 1 mparsian 897801646 156 Oct 28 23:36 RELEASE 10 | drwxr-xr-x@ 29 mparsian 897801646 928 Oct 28 23:36 bin 11 | drwxr-xr-x@ 9 mparsian 897801646 288 Oct 28 23:36 conf 12 | drwxr-xr-x@ 5 mparsian 897801646 160 Oct 28 23:36 data 13 | drwxr-xr-x@ 4 mparsian 897801646 128 Oct 28 23:36 examples 14 | drwxr-xr-x@ 227 mparsian 897801646 7264 Oct 28 23:36 jars 15 | drwxr-xr-x@ 4 mparsian 897801646 128 Oct 28 23:36 kubernetes 16 | drwxr-xr-x@ 48 mparsian 897801646 1536 Oct 28 23:36 licenses 17 | drwxr-xr-x 16 mparsian 897801646 512 Mar 25 12:29 logs 18 | drwxr-xr-x@ 19 mparsian 897801646 608 Oct 28 23:36 python 19 | drwxr-xr-x@ 24 mparsian 897801646 768 Oct 28 23:36 sbin 20 | drwxr-xr-x 2 mparsian 897801646 64 Jan 8 03:00 work 21 | drwxr-xr-x@ 3 mparsian 897801646 96 Oct 28 23:36 yarn 22 | 23 | 24 | mparsian@Mahmouds-MacBook ~/spark-2.4.0 $ ls -l bin 25 | total 224 26 | -rwxr-xr-x@ 1 mparsian 897801646 1089 Oct 28 23:36 beeline 27 | -rw-r--r--@ 1 mparsian 897801646 1064 Oct 28 23:36 beeline.cmd 28 | -rwxr-xr-x@ 1 mparsian 897801646 5427 Oct 28 23:36 docker-image-tool.sh 29 | -rwxr-xr-x@ 1 mparsian 897801646 1933 Oct 28 23:36 find-spark-home 30 | -rw-r--r--@ 1 mparsian 897801646 2681 Oct 28 23:36 find-spark-home.cmd 31 | -rw-r--r--@ 1 mparsian 897801646 1892 Oct 28 
23:36 load-spark-env.cmd 32 | -rw-r--r--@ 1 mparsian 897801646 2025 Oct 28 23:36 load-spark-env.sh 33 | -rwxr-xr-x@ 1 mparsian 897801646 2987 Oct 28 23:36 pyspark 34 | ... 35 | 36 | mparsian@Mahmouds-MacBook ~/spark-2.4.0 $ ./bin/pyspark 37 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 38 | [Clang 6.0 (clang-600.0.57)] on darwin 39 | Type "help", "copyright", "credits" or "license" for more information. 40 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 41 | Welcome to 42 | ____ __ 43 | / __/__ ___ _____/ /__ 44 | _\ \/ _ \/ _ `/ __/ '_/ 45 | /__ / .__/\_,_/_/ /_/\_\ version 2.4.0 46 | /_/ 47 | 48 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 49 | SparkSession available as 'spark'. 50 | 51 | >>> spark 52 | 53 | >>> 54 | >>> 55 | >>> 56 | >>> data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] 57 | >>> data 58 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] 59 | >>> 60 | >>> rdd = spark.sparkContext.parallelize(data) 61 | >>> rdd.count() 62 | 12 63 | >>> rdd.collect() 64 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] 65 | >>> rdd.getNumPartitions() 66 | 8 67 | >>> rdd2 = spark.sparkContext.parallelize(data, 3) 68 | >>> rdd2.collect() 69 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] 70 | >>> rdd2.getNumPartitions() 71 | 3 72 | >>> rdd3 = rdd.map(lambda x : x+100) 73 | >>> rdd3.collect() 74 | [101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112] 75 | >>> 76 | >>> def myfun(x): 77 | ... return x+100 78 | ... 79 | >>> 80 | >>> 81 | >>> y = myfun(4) 82 | >>> y 83 | 104 84 | >>> z = myfun(60) 85 | >>> z 86 | 160 87 | >>> rdd4 = rdd.map(myfun) 88 | >>> rdd4.collect() 89 | [101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112] 90 | >>> rdd5 = rdd.map(lambda x: (x, 1)) 91 | >>> rdd5.collect() 92 | [(1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1)] 93 | >>> rdd2.collect() 94 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] 95 | >>> N = rdd.reduce(lambda x, y: x+y) 96 | >>> N 97 | 78 98 | >>> exit() 99 | -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2019-04-18.txt: -------------------------------------------------------------------------------- 1 | 2 | mparsian@Mahmouds-MacBook ~/spark-2.4.0 $ cat > fox.txt 3 | a fox jumped 4 | a red fox jumped and jumped 5 | a blue and red fox jumped 6 | fox is blue red 7 | 8 | $ cat fox.txt 9 | a fox jumped 10 | a red fox jumped and jumped 11 | a blue and red fox jumped 12 | fox is blue red 13 | 14 | ~/spark-2.4.0 $ ./bin/pyspark 15 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 16 | [Clang 6.0 (clang-600.0.57)] on darwin 17 | Type "help", "copyright", "credits" or "license" for more information. 18 | 2019-04-18 18:02:14 WARN NativeCodeLoader:62 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable 19 | Setting default log level to "WARN". 20 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 21 | Welcome to 22 | ____ __ 23 | / __/__ ___ _____/ /__ 24 | _\ \/ _ \/ _ `/ __/ '_/ 25 | /__ / .__/\_,_/_/ /_/\_\ version 2.4.0 26 | /_/ 27 | 28 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 29 | SparkSession available as 'spark'. 
30 | >>> spark 31 | 32 | 33 | >>> records = spark.sparkContext.textFile("/Users/mparsian/spark-2.4.0/fox.txt") 34 | >>> records.collect() 35 | [ 36 | 'a fox jumped', 37 | 'a red fox jumped and jumped', 38 | 'a blue and red fox jumped', 39 | 'fox is blue red' 40 | ] 41 | >>> records.count() 42 | 4 43 | >>> 44 | >>> def tokenize(record): 45 | ... tokens = record.split(" ") 46 | ... return tokens 47 | ... 48 | >>> 49 | >>> x = "a fox jumped" 50 | >>> x 51 | 'a fox jumped' 52 | >>> tokens = tokenize(x) 53 | >>> tokens 54 | ['a', 'fox', 'jumped'] 55 | >>> 56 | >>> 57 | >>> words = records.flatMap(tokenize) 58 | >>> words.collect() 59 | ['a', 'fox', 'jumped', 'a', 'red', 'fox', 'jumped', 'and', 'jumped', 'a', 'blue', 'and', 'red', 'fox', 'jumped', 'fox', 'is', 'blue', 'red'] 60 | >>> words.count() 61 | 19 62 | >>> pairs = words.map(lambda x : (x,1)) 63 | >>> pairs.collect() 64 | [('a', 1), ('fox', 1), ('jumped', 1), ('a', 1), ('red', 1), ('fox', 1), ('jumped', 1), ('and', 1), ('jumped', 1), ('a', 1), ('blue', 1), ('and', 1), ('red', 1), ('fox', 1), ('jumped', 1), ('fox', 1), ('is', 1), ('blue', 1), ('red', 1)] 65 | >>> pairs.count() 66 | 19 67 | >>> 68 | >>> frequencies = pairs.reduceByKey(lambda a, b: a+b) 69 | >>> frequencies.collect() 70 | [('is', 1), ('a', 3), ('fox', 4), ('jumped', 4), ('red', 3), ('and', 2), ('blue', 2)] 71 | >>> 72 | >>> 73 | >>> filtered = frequencies.filter(lambda x : x[1] > 2) 74 | >>> filtered.collect() 75 | [('a', 3), ('fox', 4), ('jumped', 4), ('red', 3)] 76 | >>> filtered.count() 77 | 4 78 | >>> a = ("dada", 5) 79 | >>> a[0] 80 | 'dada' 81 | >>> a[1] 82 | 5 83 | >>> 84 | >>> 85 | >>> test = records.map(tokenize) 86 | >>> test.collect() 87 | [['a', 'fox', 'jumped'], ['a', 'red', 'fox', 'jumped', 'and', 'jumped'], ['a', 'blue', 'and', 'red', 'fox', 'jumped'], ['fox', 'is', 'blue', 'red']] 88 | >>> test.count() 89 | 4 90 | >>> 91 | >>> 92 | >>> pairs.collect() 93 | [('a', 1), ('fox', 1), ('jumped', 1), ('a', 1), ('red', 1), ('fox', 1), ('jumped', 1), ('and', 1), ('jumped', 1), ('a', 1), ('blue', 1), ('and', 1), ('red', 1), ('fox', 1), ('jumped', 1), ('fox', 1), ('is', 1), ('blue', 1), ('red', 1)] 94 | >>> 95 | >>> grouped = pairs.groupByKey() 96 | 97 | >>> grouped.collect() 98 | [ 99 | ('is', ), 100 | ('a', ), 101 | ('fox', ), 102 | ('jumped', ), 103 | ('red', ), 104 | ('and', ), 105 | ('blue', ) 106 | ] 107 | >>> 108 | >>> grouped = pairs.groupByKey().mapValues(lambda it: list(it)) 109 | >>> grouped.collect() 110 | [ 111 | ('is', [1]), 112 | ('a', [1, 1, 1]), 113 | ('fox', [1, 1, 1, 1]), 114 | ('jumped', [1, 1, 1, 1]), 115 | ('red', [1, 1, 1]), 116 | ('and', [1, 1]), 117 | ('blue', [1, 1]) 118 | ] 119 | >>> grouped = pairs.groupByKey() 120 | >>> grouped.collect() 121 | [('is', ), ('a', ), ('fox', ), ('jumped', ), ('red', ), ('and', ), ('blue', )] 122 | >>> freq2 = grouped.mapValues(lambda it: sum(it)) 123 | >>> freq2.collect() 124 | [ 125 | ('is', 1), 126 | ('a', 3), 127 | ('fox', 4), 128 | ('jumped', 4), 129 | ('red', 3), 130 | ('and', 2), 131 | ('blue', 2) 132 | ] 133 | >>> freq2.count() 134 | 7 135 | >>> frequencies = records.flatMap(tokenize).map(lambda x: (x,1)).reduceByKey(lambda a, b: a+b) 136 | >>> frequencies.collect() 137 | [ 138 | ('is', 1), 139 | ('a', 3), 140 | ('fox', 4), 141 | ('jumped', 4), 142 | ('red', 3), 143 | ('and', 2), 144 | ('blue', 2) 145 | ] 146 | >>> 147 | -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2019-04-26.txt: 
-------------------------------------------------------------------------------- 1 | Finding Average by Key using reduceByKey() Transformation 2 | 3 | $ ./bin/pyspark 4 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 5 | [Clang 6.0 (clang-600.0.57)] on darwin 6 | Type "help", "copyright", "credits" or "license" for more information. 7 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 8 | Welcome to 9 | ____ __ 10 | / __/__ ___ _____/ /__ 11 | _\ \/ _ \/ _ `/ __/ '_/ 12 | /__ / .__/\_,_/_/ /_/\_\ version 2.4.0 13 | /_/ 14 | 15 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 16 | SparkSession available as 'spark'. 17 | >>> 18 | >>> 19 | >>> 20 | >>> 21 | >>> data = [('k1', 3), ('k1', 4),('k1', 5),('k2', 7),('k2', 7),('k2', 7),('k3', 30),('k3', 30),('k3', 40),('k3', 50)] 22 | >>> data 23 | [('k1', 3), ('k1', 4), ('k1', 5), ('k2', 7), ('k2', 7), ('k2', 7), ('k3', 30), ('k3', 30), ('k3', 40), ('k3', 50)] 24 | >>> 25 | >>> pairs = spark.sparkContext.parallelize(data) 26 | >>> pairs.collect() 27 | [('k1', 3), ('k1', 4), ('k1', 5), ('k2', 7), ('k2', 7), ('k2', 7), ('k3', 30), ('k3', 30), ('k3', 40), ('k3', 50)] 28 | >>> pairs.count() 29 | 10 30 | >>> pairs2 = pairs.distinct() 31 | >>> pairs2.count() 32 | 7 33 | >>> pairs2.collect() 34 | [('k1', 5), ('k3', 40), ('k1', 3), ('k3', 50), ('k2', 7), ('k1', 4), ('k3', 30)] 35 | >>> 36 | >>> tuples = pairs.map(lambda x: (x[0], (x[1], 1) ) ) 37 | >>> tuples.collect() 38 | [('k1', (3, 1)), ('k1', (4, 1)), ('k1', (5, 1)), ('k2', (7, 1)), ('k2', (7, 1)), ('k2', (7, 1)), ('k3', (30, 1)), ('k3', (30, 1)), ('k3', (40, 1)), ('k3', (50, 1))] 39 | 40 | >>> 41 | >>> def adder(x, y): 42 | ... sum2 = x[0] + y[0] 43 | ... count = x[1] + y[1] 44 | ... return (sum2, count) 45 | ... 46 | >>> 47 | >>> x = (10, 2) 48 | >>> y = (20, 4) 49 | >>> r = adder(x, y) 50 | >>> r 51 | (30, 6) 52 | >>> 53 | >>> result = tuples.reduceByKey(adder) 54 | >>> result.collect() 55 | [('k1', (12, 3)), ('k3', (150, 4)), ('k2', (21, 3))] 56 | >>> result = tuples.reduceByKey(lambda x, y: adder(x, y)) 57 | >>> result.collect() 58 | [('k1', (12, 3)), ('k3', (150, 4)), ('k2', (21, 3))] 59 | >>> avg = result.mapValues(lambda pair: float(pair[0])/float(pair[1])) 60 | >>> avg.collect() 61 | [('k1', 4.0), ('k3', 37.5), ('k2', 7.0)] 62 | >>> -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2019-05-09.txt: -------------------------------------------------------------------------------- 1 | Learn Partitioning RDDs and using mapPartitions() Transformation 2 | 3 | $ ./bin/pyspark 4 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 5 | [Clang 6.0 (clang-600.0.57)] on darwin 6 | Type "help", "copyright", "credits" or "license" for more information. 7 | Setting default log level to "WARN". 8 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 9 | Welcome to 10 | ____ __ 11 | / __/__ ___ _____/ /__ 12 | _\ \/ _ \/ _ `/ __/ '_/ 13 | /__ / .__/\_,_/_/ /_/\_\ version 2.4.0 14 | /_/ 15 | 16 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 17 | SparkSession available as 'spark'. 
18 | >>> numbers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] 19 | >>> 20 | >>> 21 | >>> numbers 22 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] 23 | >>> rdd = spark.sparkContext.parallelize(numbers, 3) 24 | >>> rdd.count() 25 | 10 26 | >>> rdd.collect() 27 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] 28 | >>> 29 | 30 | >>> def f(iterator): 31 | ... for x in iterator: 32 | ... print(x) 33 | ... print("===") 34 | ... 35 | >>> 36 | >>> rdd.foreachPartition(f) 37 | 4 38 | 5 39 | 6 40 | === 41 | 7 42 | 8 43 | 9 44 | 10 45 | === 46 | 1 47 | 2 48 | 3 49 | === 50 | >>> 51 | >>> 52 | >>> rdd = spark.sparkContext.parallelize(numbers, 2) 53 | >>> rdd.foreachPartition(f) 54 | 1 55 | 2 56 | 3 57 | 4 58 | 5 59 | === 60 | 6 61 | 7 62 | 8 63 | 9 64 | 10 65 | === 66 | >>> 67 | >>> n = rdd.getNumPartitions() 68 | >>> n 69 | 2 70 | >>> rdd = spark.sparkContext.parallelize(numbers, 4) 71 | >>> n = rdd.getNumPartitions() 72 | >>> n 73 | 4 74 | >>> rdd.foreachPartition(f) 75 | 5 76 | 6 77 | === 78 | 3 79 | 4 80 | === 81 | 7 82 | 8 83 | 9 84 | 10 85 | === 86 | 1 87 | 2 88 | === 89 | >>> rdd = spark.sparkContext.parallelize(numbers, 14) 90 | >>> rdd.foreachPartition(f) 91 | 4 92 | === 93 | === 94 | === 95 | 3 96 | === 97 | 1 98 | === 99 | 5 100 | === 101 | 2 102 | === 103 | === 104 | 6 105 | === 106 | === 107 | 8 108 | === 109 | 7 110 | === 111 | 9 112 | === 113 | 10 114 | === 115 | >>> def min_max_count(iterator): 116 | ... firsttime = 1 117 | ... #minimum 118 | ... #maximum 119 | ... #count 120 | ... for x in iterator: 121 | ... if (firsttime == 1): 122 | ... minimum = x 123 | ... maximum = x 124 | ... count = 1 125 | ... firsttime = 0 126 | ... else: 127 | ... count = count + 1 128 | ... minimum = min(x, minimum) 129 | ... maximum = max(x, maximum) 130 | ... # 131 | ... return (minimum, maximum, count) 132 | ... 133 | >>> 134 | >>> data = [12, 34, 3, 5, 7, 9, 91, 77, 12, 13, 14, 15, 16] 135 | >>> data 136 | [12, 34, 3, 5, 7, 9, 91, 77, 12, 13, 14, 15, 16] 137 | >>> rdd = spark.sparkContext.parallelize(numbers, 3) 138 | >>> n = rdd.getNumPartitions() 139 | >>> n 140 | 3 141 | >>> rdd.collect() 142 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] 143 | >>> result = rdd.mapPartitions(min_max_count) 144 | >>> result.collect() 145 | [1, 3, 3, 4, 6, 3, 7, 10, 4] 146 | >>> def min_max_count(iterator): 147 | ... firsttime = 1 148 | ... #minimum 149 | ... #maximum 150 | ... #count 151 | ... for x in iterator: 152 | ... if (firsttime == 1): 153 | ... minimum = x 154 | ... maximum = x 155 | ... count = 1 156 | ... firsttime = 0 157 | ... else: 158 | ... count = count + 1 159 | ... minimum = min(x, minimum) 160 | ... maximum = max(x, maximum) 161 | ... # 162 | ... return [minimum, maximum, count] 163 | ... 164 | >>> 165 | >>> result = rdd.mapPartitions(min_max_count) 166 | >>> result.collect() 167 | [1, 3, 3, 4, 6, 3, 7, 10, 4] 168 | >>> 169 | >>> 170 | >>> 171 | >>> 172 | >>> def min_max_count(iterator): 173 | ... firsttime = 1 174 | ... #minimum 175 | ... #maximum 176 | ... #count 177 | ... for x in iterator: 178 | ... if (firsttime == 1): 179 | ... minimum = x 180 | ... maximum = x 181 | ... count = 1 182 | ... firsttime = 0 183 | ... else: 184 | ... count = count + 1 185 | ... minimum = min(x, minimum) 186 | ... maximum = max(x, maximum) 187 | ... # 188 | ... return [[minimum, maximum, count]] 189 | ... 
190 | >>> result = rdd.mapPartitions(min_max_count) 191 | >>> result.collect() 192 | [[1, 3, 3], [4, 6, 3], [7, 10, 4]] 193 | >>> 194 | 195 | >>> data 196 | [12, 34, 3, 5, 7, 9, 91, 77, 12, 13, 14, 15, 16] 197 | >>> rdd = spark.sparkContext.parallelize(data, 3) 198 | >>> 199 | >>> 200 | >>> result = rdd.mapPartitions(min_max_count) 201 | >>> result.collect() 202 | [[3, 34, 4], [7, 91, 4], [12, 16, 5]] 203 | >>> rdd.foreachPartition(f) 204 | 12 205 | 13 206 | 14 207 | 15 208 | 16 209 | === 210 | 7 211 | 9 212 | 91 213 | 77 214 | === 215 | 12 216 | 34 217 | 3 218 | 5 219 | === 220 | >>> -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2019-10-09.txt: -------------------------------------------------------------------------------- 1 | /spark-2.4.4 $ ./bin/pyspark 2 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 3 | [Clang 6.0 (clang-600.0.57)] on darwin 4 | Type "help", "copyright", "credits" or "license" for more information. 5 | 19/10/09 18:57:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable 6 | Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties 7 | Setting default log level to "WARN". 8 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 9 | Welcome to 10 | ____ __ 11 | / __/__ ___ _____/ /__ 12 | _\ \/ _ \/ _ `/ __/ '_/ 13 | /__ / .__/\_,_/_/ /_/\_\ version 2.4.4 14 | /_/ 15 | 16 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 17 | SparkSession available as 'spark'. 18 | >>> 19 | >>> 20 | >>> 21 | >>> 22 | >>> numbers = [1, 2, 3, 1, 2, 3, 4, 4, 5, 6] 23 | >>> numbers 24 | [1, 2, 3, 1, 2, 3, 4, 4, 5, 6] 25 | >>> rdd = spark.sparkContext.parallelize(numbers) 26 | >>> rdd.collect() 27 | [1, 2, 3, 1, 2, 3, 4, 4, 5, 6] 28 | >>> rdd.count() 29 | 10 30 | >>> rdd2 = rdd.filter(lambda x : x > 3) 31 | >>> rdd2.collect() 32 | [4, 4, 5, 6] 33 | >>> 34 | >>> 35 | >>> def custom_filter(x): 36 | ... if x > 3: 37 | ... return True 38 | ... else: 39 | ... return False 40 | ... 
^D 41 | >>> 42 | >>> x = custom_filter(10) 43 | >>> x 44 | True 45 | >>> x = custom_filter(2) 46 | >>> x 47 | False 48 | >>> rdd3 = rdd.filter(custom_filter) 49 | >>> rdd3.collect() 50 | [4, 4, 5, 6] 51 | >>> rdd2.collect() 52 | [4, 4, 5, 6] 53 | >>> 54 | >>> 55 | >>> data = [('A', 2), ('A', 3), ('A', 4), ('A', 5), ('B', 2),('B', 7)] 56 | >>> data 57 | [('A', 2), ('A', 3), ('A', 4), ('A', 5), ('B', 2), ('B', 7)] 58 | >>> 59 | >>> rdd = spark.sparkContext.parallelize(data) 60 | >>> rdd.collect() 61 | [('A', 2), ('A', 3), ('A', 4), ('A', 5), ('B', 2), ('B', 7)] 62 | >>> 63 | >>> 64 | >>> 65 | >>> 66 | >>> total = rdd.reduceByKey(lambda x, y: x+y) 67 | >>> total.collect() 68 | [('B', 9), ('A', 14)] 69 | >>> 70 | >>> 71 | >>> rdd.collect() 72 | [('A', 2), ('A', 3), ('A', 4), ('A', 5), ('B', 2), ('B', 7)] 73 | >>> grouped = rdd.groupByKey() 74 | >>> grouped.collect() 75 | [ 76 | ('B', ), 77 | ('A', ) 78 | ] 79 | >>> grouped.map(lambda x: (x[0], list(x[1])).collect() 80 | [('B', [2, 7]), ('A', [2, 3, 4, 5])] 81 | >>> total2 = grouped.map(lambda x: (x[0], sum(x[1]))) 82 | >>> total2.collect() 83 | [('B', 9), ('A', 14)] 84 | >>> 85 | 86 | >>> 87 | >>> spark 88 | 89 | >>> numbers = [-1, 2, 3, -55, 88, 99, -99, 66, 777] 90 | >>> numbers 91 | [-1, 2, 3, -55, 88, 99, -99, 66, 777] 92 | >>> rdd = spark.sparkContext.parallelize(numbers) 93 | >>> rdd.collect() 94 | [-1, 2, 3, -55, 88, 99, -99, 66, 777] 95 | >>> 96 | >>> positives = rdd.filter(lambda x : x > 0) 97 | >>> positives.collect() 98 | [2, 3, 88, 99, 66, 777] 99 | >>> 100 | >>> negatives = rdd.filter(lambda x : x < 0) 101 | >>> negatives.collect() 102 | [-1, -55, -99] 103 | >>> def keep_positives(n): 104 | ... if (n > 0): 105 | ... return True 106 | ... else: 107 | ... return False 108 | ... ^D 109 | >>> 110 | >>> a = keep_positives(100) 111 | >>> a 112 | True 113 | >>> a = keep_positives(-9) 114 | >>> a 115 | False 116 | >>> pos2 = rdd.filter(keep_positives) 117 | >>> pos2.collect() 118 | [2, 3, 88, 99, 66, 777] 119 | >>> pos2222 = pos2.filter(lambda x : True) 120 | >>> pos2222.collect() 121 | [2, 3, 88, 99, 66, 777] 122 | >>> 123 | >>> 124 | >>> pairs = [('A', 2), ('A', 3), ('A', 4),('A', 5), ('A', 6), ('B', 10), ('B', 2)] 125 | >>> pairs 126 | [('A', 2), ('A', 3), ('A', 4), ('A', 5), ('A', 6), ('B', 10), ('B', 2)] 127 | >>> 128 | >>> 129 | >>> rdd = spark.sparkContext.parallelize(pairs) 130 | >>> rdd.collect() 131 | [('A', 2), ('A', 3), ('A', 4), ('A', 5), ('A', 6), ('B', 10), ('B', 2)] 132 | >>> totals = rdd.reduceByKey(lambda a, b : a+b) 133 | >>> result = totals.collect() 134 | >>> result 135 | [('B', 12), ('A', 20)] 136 | >>> 137 | >>> 138 | >>> rdd.collect() 139 | [('A', 2), ('A', 3), ('A', 4), ('A', 5), ('A', 6), ('B', 10), ('B', 2)] 140 | >>> grouped = rdd.groupByKey() 141 | >>> grouped.collect() 142 | [ 143 | ('B', ), 144 | ('A', ) 145 | ] 146 | >>> 147 | >>> grouped.map(lambda x: (x[0], list(x[1]))).collect() 148 | [('B', [10, 2]), ('A', [2, 3, 4, 5, 6])] 149 | >>> 150 | >>> sum2 = grouped.map(lambda x: (x[0], sum(x[1]))) 151 | >>> sum2.collect() 152 | [('B', 12), ('A', 20)] 153 | >>> 154 | -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2019-10-16.txt: -------------------------------------------------------------------------------- 1 | $ ./bin/pyspark 2 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 3 | [Clang 6.0 (clang-600.0.57)] on darwin 4 | Type "help", "copyright", "credits" or "license" for more information. 
5 | Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties 6 | Setting default log level to "WARN". 7 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 8 | Welcome to 9 | ____ __ 10 | / __/__ ___ _____/ /__ 11 | _\ \/ _ \/ _ `/ __/ '_/ 12 | /__ / .__/\_,_/_/ /_/\_\ version 2.4.4 13 | /_/ 14 | 15 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 16 | SparkSession available as 'spark'. 17 | >>> 18 | >>> a =[ 1, 2, 3, 4, 5] 19 | >>> rdd = spark.sparkContext.parallelize(a) 20 | >>> rdd.collect() 21 | [1, 2, 3, 4, 5] 22 | >>> rdd.count() 23 | 5 24 | >>> sumofvalues = rdd.reduce(lambda x, y: x+y) 25 | >>> sumofvalues 26 | 15 27 | >>> 28 | >>> product = rdd.reduce(lambda x, y: x*y) 29 | >>> product 30 | 120 31 | >>> z = [ "1", "2", "3", "4", "5", "6", "7"] 32 | >>> rdd = spark.sparkContext.parallelize(z) 33 | >>> rdd.collect() 34 | ['1', '2', '3', '4', '5', '6', '7'] 35 | >>> concat = rdd.reduce(lambda x, y: x+y) 36 | >>> concat 37 | '1234567' 38 | >>> 39 | >>> [ "1", "2", "3", "4", "5", "6", "7"] 40 | ['1', '2', '3', '4', '5', '6', '7'] 41 | >>> z = [ "1", "2", "3", "4", "5", "6", "7", "8", "9", "a", "b"] 42 | >>> 43 | >>> 44 | >>> z 45 | ['1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b'] 46 | >>> rdd = spark.sparkContext.parallelize(z, 3) 47 | >>> rdd.collect() 48 | ['1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b'] 49 | >>> concat = rdd.reduce(lambda x, y: x+y) 50 | >>> concat 51 | '123456789ab' 52 | >>> rdd = spark.sparkContext.parallelize(z, 10) 53 | >>> concat = rdd.reduce(lambda x, y: x+y) 54 | >>> concat 55 | '123456789ab' 56 | 57 | 58 | >>> 59 | >>> nums = [1, 3, 5, 4, 2, 1, 0, 9, 10] 60 | >>> nums 61 | [1, 3, 5, 4, 2, 1, 0, 9, 10] 62 | >>> rdd = spark.sparkContext.parallelize(nums) 63 | >>> rdd.collect() 64 | [1, 3, 5, 4, 2, 1, 0, 9, 10] 65 | >>> rdd.count() 66 | 9 67 | >>> sumvalues = rdd.reduce(lambda a, b: a+b) 68 | >>> sumvalues 69 | 35 70 | 71 | >>> product = rdd.reduce(lambda a, b: a*b) 72 | >>> product 73 | 0 74 | >>> nums = [1, 3, 5, 4, 2, 1, 30, 9, 10] 75 | >>> rdd = spark.sparkContext.parallelize(nums) 76 | >>> sumvalues = rdd.reduce(lambda a, b: a+b) 77 | >>> sumvalues 78 | 65 79 | >>> product = rdd.reduce(lambda a, b: a*b) 80 | >>> product 81 | 324000 82 | >>> rdd.collect() 83 | [1, 3, 5, 4, 2, 1, 30, 9, 10] 84 | 85 | >>> strs = ["1", "3", "5", "4", "2", "1"] 86 | >>> strs 87 | ['1', '3', '5', '4', '2', '1'] 88 | >>> rdd = spark.sparkContext.parallelize(strs) 89 | >>> concat = rdd.reduce(lambda a, b: a+b) 90 | >>> concat 91 | '135421' 92 | -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2020-01-22.txt: -------------------------------------------------------------------------------- 1 | $ ./bin/pyspark 2 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 3 | [Clang 6.0 (clang-600.0.57)] on darwin 4 | Type "help", "copyright", "credits" or "license" for more information. 5 | Welcome to 6 | ____ __ 7 | / __/__ ___ _____/ /__ 8 | _\ \/ _ \/ _ `/ __/ '_/ 9 | /__ / .__/\_,_/_/ /_/\_\ version 2.4.4 10 | /_/ 11 | 12 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 13 | SparkSession available as 'spark'. 
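A compact recap of the reduce() examples in the preceding (2019-10-16) session: reduce() repeatedly combines elements with a two-argument function, which should be associative and commutative because partial results are combined across partitions. A minimal sketch (assumes the interactive `spark` session; fold() is a closely related method shown here for comparison, not used in the session):

from operator import add, mul

nums = spark.sparkContext.parallelize([1, 3, 5, 4, 2, 1, 30, 9, 10])

total   = nums.reduce(add)     # 65
product = nums.reduce(mul)     # 324000; any 0 in the data would make this 0
# fold() is like reduce() but takes an explicit "zero" value
total2  = nums.fold(0, add)    # 65
print(total, product, total2)
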
14 | >>> 15 | >>> 16 | >>> 17 | >>> spark 18 | 19 | >>> 20 | >>> sc = spark.sparkContext 21 | >>> 22 | >>> sc 23 | 24 | >>> 25 | >>> 26 | >>> numbers = [1, 2, 3, 4, 5, 6, -1, -2] 27 | >>> numbers 28 | [1, 2, 3, 4, 5, 6, -1, -2] 29 | >>> len(numbers) 30 | 8 31 | >>> rdd = sc.parallelize(numbers) 32 | >>> rdd.collect() 33 | [1, 2, 3, 4, 5, 6, -1, -2] 34 | >>> rdd.count() 35 | 8 36 | >>> rdd 37 | ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:195 38 | >>> 39 | >>> rdd_pos = rdd.filter(lambda x: x > 0) 40 | >>> rdd_pos.collect() 41 | [1, 2, 3, 4, 5, 6] 42 | 43 | >>> rdd_pos.count() 44 | 6 45 | >>> 46 | >>> rdd_pos.collect() 47 | [1, 2, 3, 4, 5, 6] 48 | >>> 49 | >>> sum_of_all = rdd_pos.reduce(lambda x, y: x+y) 50 | >>> sum_of_all 51 | 21 52 | >>> rdd_pos.take(2) 53 | [1, 2] 54 | >>> 55 | >>> 56 | >>> rdd.collect() 57 | [1, 2, 3, 4, 5, 6, -1, -2] 58 | >>> rdd.count() 59 | 8 60 | >>> rdd4 = rdd.map(lambda x : x+100) 61 | >>> rdd4.collect() 62 | [101, 102, 103, 104, 105, 106, 99, 98] 63 | >>> 64 | >>> 65 | >>> 66 | >>> kv = [('alex', 2), ('alex', 20), ('alex', 40), ('jane', 100), ('jane', 400)] 67 | >>> kv 68 | [('alex', 2), ('alex', 20), ('alex', 40), ('jane', 100), ('jane', 400)] 69 | >>> len(kv) 70 | 5 71 | >>> key_value_pairs = sc.parallelize(kv) 72 | >>> key_value_pairs.count() 73 | 5 74 | >>> key_value_pairs.collect() 75 | [ 76 | ('alex', 2), 77 | ('alex', 20), 78 | ('alex', 40), 79 | ('jane', 100), 80 | ('jane', 400) 81 | ] 82 | >>> 83 | >>> 84 | >>> grouped = key_value_pairs.groupByKey() 85 | >>> grouped.collect() 86 | [ 87 | ('alex', ), 88 | ('jane', ) 89 | ] 90 | >>> 91 | >>> grouped.map(lambda x: (x[0], list(x[1]))).collect() 92 | [ 93 | ('alex', [2, 20, 40]), 94 | ('jane', [100, 400]) 95 | ] 96 | >>> grouped_sum = grouped.map(lambda x: (x[0], sum(x[1]))) 97 | >>> grouped_sum.collect() 98 | [ 99 | ('alex', 62), 100 | ('jane', 500) 101 | ] 102 | >>> 103 | >>> 104 | >>> grouped.collect() 105 | [ 106 | ('alex', ), 107 | ('jane', ) 108 | ] 109 | >>> grouped_sum_2 = grouped.mapValues(lambda x: sum(x)) 110 | >>> grouped_sum_2.collect() 111 | [ 112 | ('alex', 62), 113 | ('jane', 500) 114 | ] 115 | >>> 116 | 117 | -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2020-01-24.txt: -------------------------------------------------------------------------------- 1 | How to read a text file and convert into an RDD[String] 2 | 3 | $ cat /tmp/books.txt 4 | ISBN-100,sales,biology 5 | IS-01235,sales,econ 6 | ISBN-101,sales,econ 7 | ISBN-102,sales,biology 8 | ISBN-109,econ,sales 9 | ISBN-103,CS,sales 10 | ISBN-104,CS,biology 11 | ISBN-105,CS,econ 12 | ISBN-200,CS 13 | 14 | $ ./bin/pyspark 15 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 16 | [Clang 6.0 (clang-600.0.57)] on darwin 17 | Welcome to 18 | ____ __ 19 | / __/__ ___ _____/ /__ 20 | _\ \/ _ \/ _ `/ __/ '_/ 21 | /__ / .__/\_,_/_/ /_/\_\ version 2.4.4 22 | /_/ 23 | 24 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 25 | SparkSession available as 'spark'. 
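The session that follows reads /tmp/books.txt into an RDD[String], one element per line. As a script-style sketch of the same idea, extended with a split of each comma-separated line into fields (the path comes from the session; the field split is illustrative):

input_path = "/tmp/books.txt"                         # path used in the session below
records = spark.sparkContext.textFile(input_path)     # RDD[String], one element per line
fields  = records.map(lambda line: line.split(","))   # RDD[[String]]
print(records.count())
print(fields.take(3))
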
26 | >>> 27 | >>> 28 | >>> 29 | >>> spark 30 | 31 | >>> 32 | >>> 33 | >>> 34 | >>> input_path = "/tmp/books.txt" 35 | >>> 36 | >>> records = spark.sparkContext.textFile(input_path) 37 | >>> records.collect() 38 | [ 39 | 'ISBN-100,sales,biology', 40 | 'IS-01235,sales,econ', 41 | 'ISBN-101,sales,econ', 42 | 'ISBN-102,sales,biology', 43 | 'ISBN-109,econ,sales', 44 | 'ISBN-103,CS,sales', 45 | 'ISBN-104,CS,biology', 46 | 'ISBN-105,CS,econ', 47 | 'ISBN-200,CS' 48 | ] 49 | >>> records.count() 50 | 9 -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2020-02-03.txt: -------------------------------------------------------------------------------- 1 | mparsian@Mahmouds-MacBook ~/spark-2.4.4 $ ./bin/pyspark 2 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 3 | Welcome to 4 | ____ __ 5 | / __/__ ___ _____/ /__ 6 | _\ \/ _ \/ _ `/ __/ '_/ 7 | /__ / .__/\_,_/_/ /_/\_\ version 2.4.4 8 | /_/ 9 | 10 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 11 | SparkSession available as 'spark'. 12 | >>> 13 | >>> 14 | >>> numbers = [1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -20, 30, 10, 20, -20, 30, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -20, 30, 10, 20, -20, 30] 15 | >>> 16 | >>> 17 | >>> numbers 18 | [1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -20, 30, 10, 20, -20, 30, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -20, 30, 10, 20, -20, 30] 19 | >>> len(numbers) 20 | 56 21 | >>> rdd = spark.sparkContext.parallelize(numbers) 22 | >>> rdd.count() 23 | 56 24 | >>> rdd.collect() 25 | [1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -20, 30, 10, 20, -20, 30, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -2, 3, 1, 2, -20, 30, 10, 20, -20, 30] 26 | >>> 27 | >>> 28 | >>> def min_max_count(partition): 29 | ... first_time = True 30 | ... count = 0 31 | ... for n in partition: 32 | ... count += 1 33 | ... if first_time == True: 34 | ... min2 = n 35 | ... max2 = n 36 | ... first_time = False 37 | ... else: 38 | ... min2 = min(min2, n) 39 | ... max2 = max(max2, n) 40 | ... return (min2, max2, count) 41 | ... 42 | >>> 43 | >>> target = rdd.mapPartitions(min_max_count) 44 | >>> target.count() 45 | 24 46 | >>> target.collect() 47 | [-2, 3, 7, -2, 3, 7, -2, 3, 7, -20, 30, 7, -2, 3, 7, -2, 3, 7, -2, 3, 7, -20, 30, 7] 48 | >>> 49 | >>> 50 | >>> def min_max_count(partition): 51 | ... first_time = True 52 | ... count = 0 53 | ... for n in partition: 54 | ... count += 1 55 | ... if first_time == True: 56 | ... min2 = n 57 | ... max2 = n 58 | ... first_time = False 59 | ... else: 60 | ... min2 = min(min2, n) 61 | ... max2 = max(max2, n) 62 | ... return [(min2, max2, count)] 63 | ... 64 | >>> 65 | >>> target = rdd.mapPartitions(min_max_count) 66 | >>> target.collect() 67 | [(-2, 3, 7), (-2, 3, 7), (-2, 3, 7), (-20, 30, 7), (-2, 3, 7), (-2, 3, 7), (-2, 3, 7), (-20, 30, 7)] 68 | >>> 69 | >>> rdd.getNumPartitions() 70 | 8 71 | >>> rdd = spark.sparkContext.parallelize(numbers, 4) 72 | >>> rdd.getNumPartitions() 73 | 4 74 | >>> target = rdd.mapPartitions(min_max_count) 75 | >>> target.collect() 76 | [(-2, 3, 14), (-20, 30, 14), (-2, 3, 14), (-20, 30, 14)] 77 | >>> 78 | >>> 79 | >>> 80 | >>> def add_t3(x, y): 81 | ... count = x[2] + y[2] 82 | ... min2 = min(x[0], y[0]) 83 | ... max2 = max(x[1], y[1]) 84 | ... return (min2, max2, count) 85 | ... 
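An aside before add_t3 is put to use below: the same global (min, max, count) can also be computed in a single pass with aggregate(), which takes a zero value, a per-partition seqOp and a cross-partition combOp. A minimal sketch (the sentinel zero value is an assumption, not from the session; `numbers` is the list defined earlier in this session):

zero = (float("inf"), float("-inf"), 0)            # (min, max, count) identity -- an assumption
seq_op  = lambda acc, x: (min(acc[0], x), max(acc[1], x), acc[2] + 1)
comb_op = lambda a, b: (min(a[0], b[0]), max(a[1], b[1]), a[2] + b[2])

rdd4 = spark.sparkContext.parallelize(numbers, 4)  # "numbers" as defined earlier in this session
print(rdd4.aggregate(zero, seq_op, comb_op))       # (-20, 30, 56) for this data
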
86 | >>> 87 | >>> add_t3( (2, 5, 40), (7, 50, 60)) 88 | (2, 50, 100) 89 | >>> final_result = target.reduce(add_t3) 90 | >>> final_result 91 | (-20, 30, 56) 92 | >>> 93 | >>> -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2020-04-16.txt: -------------------------------------------------------------------------------- 1 | $ ./bin/pyspark 2 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 3 | [Clang 6.0 (clang-600.0.57)] on darwin 4 | Type "help", "copyright", "credits" or "license" for more information. 5 | 6 | Welcome to 7 | ____ __ 8 | / __/__ ___ _____/ /__ 9 | _\ \/ _ \/ _ `/ __/ '_/ 10 | /__ / .__/\_,_/_/ /_/\_\ version 2.4.4 11 | /_/ 12 | 13 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 14 | SparkSession available as 'spark'. 15 | >>> 16 | >>> spark 17 | 18 | >>> 19 | >>> input_path = '/Users/mparsian/spark-2.4.4/zbin/foxdata.txt' 20 | >>> input_path 21 | '/Users/mparsian/spark-2.4.4/zbin/foxdata.txt' 22 | >>> 23 | >>> recs = spark.sparkContext.textFile(input_path) 24 | >>> 25 | >>> recs.collect() 26 | ['red fox jumped high', 'fox jumped over high fence', 'red fox jumped'] 27 | >>> recs.count() 28 | 3 29 | >>> 30 | >>> 31 | >>> words = recs.map(lambda r: r.split(" ")) 32 | >>> words.collect() 33 | [['red', 'fox', 'jumped', 'high'], ['fox', 'jumped', 'over', 'high', 'fence'], ['red', 'fox', 'jumped']] 34 | >>> 35 | >>> words.take(1) 36 | [['red', 'fox', 'jumped', 'high']] 37 | >>> words.take(2) 38 | [['red', 'fox', 'jumped', 'high'], ['fox', 'jumped', 'over', 'high', 'fence']] 39 | >>> # recs : RDD[String] 40 | ... 41 | >>> # words : RDD[[String]] 42 | ... 43 | >>> x = "fox jumped" 44 | >>> y = x.split(" ") 45 | >>> y 46 | ['fox', 'jumped'] 47 | >>> 48 | >>> 49 | >>> single_words = words.flatMap(lambda x: x) 50 | >>> single_words.collect() 51 | ['red', 'fox', 'jumped', 'high', 'fox', 'jumped', 'over', 'high', 'fence', 'red', 'fox', 'jumped'] 52 | >>> words.count() 53 | 3 54 | >>> single_words.count() 55 | 12 56 | >>> # single_words : RDD[String] 57 | ... 58 | >>> 59 | >>> pairs = single_words.map(lambda x : (x, 1)) 60 | >>> pairs.collect() 61 | [('red', 1), ('fox', 1), ('jumped', 1), ('high', 1), ('fox', 1), ('jumped', 1), ('over', 1), ('high', 1), ('fence', 1), ('red', 1), ('fox', 1), ('jumped', 1)] 62 | >>> 63 | >>> pairs.collect() 64 | [('red', 1), ('fox', 1), ('jumped', 1), ('high', 1), ('fox', 1), ('jumped', 1), ('over', 1), ('high', 1), ('fence', 1), ('red', 1), ('fox', 1), ('jumped', 1)] 65 | >>> freq = pairs.reduceByKey(lambda a, b : a+b) 66 | >>> freq.collect() 67 | [('high', 2), ('fence', 1), ('red', 2), ('fox', 3), ('jumped', 3), ('over', 1)] -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2020-04-23.txt: -------------------------------------------------------------------------------- 1 | ~/spark-2.4.4 $ ./bin/pyspark 2 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 3 | Welcome to 4 | ____ __ 5 | / __/__ ___ _____/ /__ 6 | _\ \/ _ \/ _ `/ __/ '_/ 7 | /__ / .__/\_,_/_/ /_/\_\ version 2.4.4 8 | /_/ 9 | 10 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 11 | SparkSession available as 'spark'. 
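A compact recap of the word-count pipeline built step by step in the 2020-04-16 session above: the whole thing also collapses into one chained expression. A sketch (the foxdata.txt path is the one used in that session):

input_path = '/Users/mparsian/spark-2.4.4/zbin/foxdata.txt'   # path from the session above
word_freq = (spark.sparkContext.textFile(input_path)
             .flatMap(lambda line: line.split(" "))
             .map(lambda word: (word, 1))
             .reduceByKey(lambda a, b: a + b))
print(word_freq.collect())
# [('high', 2), ('fence', 1), ('red', 2), ('fox', 3), ('jumped', 3), ('over', 1)]  (order may vary)
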
12 | >>> data = [ [1, 2, 3], [4, 5], [], [10] ] 13 | >>> data 14 | [[1, 2, 3], [4, 5], [], [10]] 15 | >>> len(data) 16 | 4 17 | >>> rdd = spark.sparkContext.parallelize(data) 18 | >>> rdd.collect() 19 | [[1, 2, 3], [4, 5], [], [10]] 20 | >>> rdd.count() 21 | 4 22 | >>> rdd2 = rdd.map(lambda x: x) 23 | >>> rdd2.count() 24 | 4 25 | >>> rdd2.collect() 26 | [[1, 2, 3], [4, 5], [], [10]] 27 | >>> 28 | >>> rdd3 = rdd.flatMap(lambda x: x) 29 | >>> rdd3.count() 30 | 6 31 | >>> rdd3.collect() 32 | [1, 2, 3, 4, 5, 10] 33 | >>> 34 | >>> data2 = [ [1, 2, 3, [44, 55] ], [4, 5], [], [10] ] 35 | >>> rdd4 = spark.sparkContext.parallelize(data2) 36 | >>> rdd4.collect() 37 | [[1, 2, 3, [44, 55]], [4, 5], [], [10]] 38 | >>> rdd5 = rdd4.flatMap(lambda x: x) 39 | >>> rdd5.collect() 40 | [1, 2, 3, [44, 55], 4, 5, 10] 41 | >>> 42 | >>> 43 | >>> data = [1, 2, 3, 4, 5, 6] 44 | >>> rdd = spark.sparkContext.parallelize(data) 45 | >>> rdd.collect() 46 | [1, 2, 3, 4, 5, 6] 47 | >>> sumofvalues = rdd.reduce(lambda x, y: x+y) 48 | >>> sumofvalues 49 | 21 50 | >>> sumofvalues = rdd.reduce(lambda x, y: x*y) 51 | >>> sumofvalues 52 | 720 53 | >>> tuples2 = [(1,20), (3,40), (5,60)] 54 | >>> rdd = spark.sparkContext.parallelize(tuples2) 55 | >>> rdd.collect() 56 | [(1, 20), (3, 40), (5, 60)] 57 | >>> rdd.count() 58 | 3 59 | >>> sum2 = rdd.rduce(lambda x, y: (x[0]+y[0], x[1]+y[1])) 60 | Traceback (most recent call last): 61 | File "", line 1, in 62 | AttributeError: 'RDD' object has no attribute 'rduce' 63 | >>> sum2 = rdd.reduce(lambda x, y: (x[0]+y[0], x[1]+y[1])) 64 | >>> sum2 65 | (9, 120) 66 | >>> 67 | >>> 68 | >>> kv = [('A', 2), ('A', 3), ('A', 4), ('B', 10), ('B', 20), ('C', 7)] 69 | >>> kv 70 | [('A', 2), ('A', 3), ('A', 4), ('B', 10), ('B', 20), ('C', 7)] 71 | >>> len(kv) 72 | 6 73 | >>> rdd = spark.sparkContext.parallelize(kv) 74 | >>> rdd.collect() 75 | [('A', 2), ('A', 3), ('A', 4), ('B', 10), ('B', 20), ('C', 7)] 76 | >>> rdd.count() 77 | 6 78 | >>> sum_by_key = rdd.reduceByKey(lambda x, y: x+y) 79 | >>> sum_by_key.collect() 80 | [('B', 30), ('C', 7), ('A', 9)] 81 | >>> 82 | >>> 83 | >>> 84 | >>> grouped = rdd.groupByKey() 85 | >>> grouped.collect() 86 | [('B', ), ('C', ), ('A', )] 87 | >>> grouped.mapValues(lambda iter: list(iter)).collect() 88 | [('B', [10, 20]), ('C', [7]), ('A', [2, 3, 4])] 89 | >>> 90 | >>> sum_of_values_2 = grouped.mapValues(lambda iter: sum(iter)) 91 | >>> sum_of_values_2.collect() 92 | [('B', 30), ('C', 7), ('A', 9)] 93 | 94 | ... # find average of values per key for a give rdd by groupByKey() 95 | 96 | ... # find average of values per key for a give rdd by reduceByKey() 97 | ... 98 | >>> -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2020-07-06-word-count.txt: -------------------------------------------------------------------------------- 1 | $ cat /tmp/foxy.txt 2 | a Fox jumped high and high and jumped and jumped 3 | fox of red jumped fox of red jumped fox of red jumped 4 | oh no 5 | fox of blue jumped 6 | oh boy 7 | a Fox is a red fox of hen 8 | a fox is a high fox 9 | orange fox is high and blue and blue 10 | 11 | mparsian@usfc-olw-025011 ~/spark-3.0.0 $ ./bin/pyspark 12 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 13 | [Clang 6.0 (clang-600.0.57)] on darwin 14 | Type "help", "copyright", "credits" or "license" for more information. 
15 | 20/07/06 17:59:22 WARN Utils: Your hostname, Mahmouds-MacBook.local resolves to a loopback address: 127.0.0.1; using 10.0.0.93 instead (on interface en0) 16 | 20/07/06 17:59:22 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address 17 | 20/07/06 17:59:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable 18 | Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties 19 | Setting default log level to "WARN". 20 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 21 | Welcome to 22 | ____ __ 23 | / __/__ ___ _____/ /__ 24 | _\ \/ _ \/ _ `/ __/ '_/ 25 | /__ / .__/\_,_/_/ /_/\_\ version 3.0.0 26 | /_/ 27 | 28 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 29 | SparkSession available as 'spark'. 30 | >>> 31 | >>> 32 | >>> spark 33 | 34 | >>> 35 | >>> input_path = '/tmp/foxy.txt' 36 | >>> input_path 37 | '/tmp/foxy.txt' 38 | >>> 39 | >>> recs = spark.sparkContext.textFile(input_path) 40 | >>> recs.count() 41 | 8 42 | >>> recs.collect() 43 | [ 44 | 'a Fox jumped high and high and jumped and jumped', 45 | 'fox of red jumped fox of red jumped fox of red jumped', 46 | 'oh no', 47 | 'fox of blue jumped', 48 | 'oh boy', 49 | 'a Fox is a red fox of hen', 50 | 'a fox is a high fox', 51 | 'orange fox is high and blue and blue' 52 | ] 53 | >>> 54 | >>> 55 | >>> 56 | >>> splitted = recs.map(lambda x: x.split(" ")) 57 | >>> splitted.count() 58 | 8 59 | >>> splitted.collect() 60 | [ 61 | ['a', 'Fox', 'jumped', 'high', 'and', 'high', 'and', 'jumped', 'and', 'jumped'], 62 | ['fox', 'of', 'red', 'jumped', 'fox', 'of', 'red', 'jumped', 'fox', 'of', 'red', 'jumped'], 63 | ['oh', 'no'], 64 | ['fox', 'of', 'blue', 'jumped'], 65 | ['oh', 'boy'], 66 | ['a', 'Fox', 'is', 'a', 'red', 'fox', 'of', 'hen'], 67 | ['a', 'fox', 'is', 'a', 'high', 'fox'], 68 | ['orange', 'fox', 'is', 'high', 'and', 'blue', 'and', 'blue'] 69 | ] 70 | >>> 71 | >>> 72 | >>> words = splitted.flatMap(lambda x: x) 73 | >>> words.count() 74 | 52 75 | >>> words.collect() 76 | ['a', 'Fox', 'jumped', 'high', 'and', 'high', 'and', 'jumped', 'and', 'jumped', 'fox', 'of', 'red', 'jumped', 'fox', 'of', 'red', 'jumped', 'fox', 'of', 'red', 'jumped', 'oh', 'no', 'fox', 'of', 'blue', 'jumped', 'oh', 'boy', 'a', 'Fox', 'is', 'a', 'red', 'fox', 'of', 'hen', 'a', 'fox', 'is', 'a', 'high', 'fox', 'orange', 'fox', 'is', 'high', 'and', 'blue', 'and', 'blue'] 77 | >>> 78 | >>> 79 | >>> pairs = words.map(lambda x : (x, 1)) 80 | >>> pairs.collect() 81 | [('a', 1), ('Fox', 1), ('jumped', 1), ('high', 1), ('and', 1), ('high', 1), ('and', 1), ('jumped', 1), ('and', 1), ('jumped', 1), ('fox', 1), ('of', 1), ('red', 1), ('jumped', 1), ('fox', 1), ('of', 1), ('red', 1), ('jumped', 1), ('fox', 1), ('of', 1), ('red', 1), ('jumped', 1), ('oh', 1), ('no', 1), ('fox', 1), ('of', 1), ('blue', 1), ('jumped', 1), ('oh', 1), ('boy', 1), ('a', 1), ('Fox', 1), ('is', 1), ('a', 1), ('red', 1), ('fox', 1), ('of', 1), ('hen', 1), ('a', 1), ('fox', 1), ('is', 1), ('a', 1), ('high', 1), ('fox', 1), ('orange', 1), ('fox', 1), ('is', 1), ('high', 1), ('and', 1), ('blue', 1), ('and', 1), ('blue', 1)] 82 | >>> 83 | >>> 84 | >>> freq = pairs.reduceByKey(lambda a, b: a+b) 85 | >>> 86 | >>> freq.collect() 87 | [('Fox', 2), ('high', 4), ('of', 5), ('oh', 2), ('no', 1), ('boy', 1), ('is', 3), ('hen', 1), ('orange', 1), ('a', 5), ('jumped', 7), ('and', 5), ('fox', 8), ('red', 4), ('blue', 3)] 88 | >>> 89 | >>> 90 | >>> 
grouped = pairs.groupByKey() 91 | >>> grouped.collect() 92 | [ 93 | ('Fox', ), 94 | ('high', ), 95 | ('of', ), 96 | ('oh', ), 97 | ('no', ), 98 | ('boy', ), 99 | ('is', ), 100 | ('hen', ), 101 | ('orange', ), 102 | ('a', ), 103 | ('jumped', ), 104 | ('and', ), 105 | ('fox', ), 106 | ('red', ), 107 | ('blue', ) 108 | ] 109 | >>> 110 | >>> grouped.mapValues(lambda iter: list(iter)).collect() 111 | [ 112 | ('Fox', [1, 1]), 113 | ('high', [1, 1, 1, 1]), 114 | ('of', [1, 1, 1, 1, 1]), 115 | ('oh', [1, 1]), 116 | ('no', [1]), 117 | ('boy', [1]), 118 | ('is', [1, 1, 1]), 119 | ('hen', [1]), 120 | ('orange', [1]), 121 | ('a', [1, 1, 1, 1, 1]), 122 | ('jumped', [1, 1, 1, 1, 1, 1, 1]), 123 | ('and', [1, 1, 1, 1, 1]), 124 | ('fox', [1, 1, 1, 1, 1, 1, 1, 1]), 125 | ('red', [1, 1, 1, 1]), 126 | ('blue', [1, 1, 1]) 127 | ] 128 | >>> freq2 = grouped.mapValues(lambda iter: sum(iter)) 129 | >>> freq2.collect() 130 | [('Fox', 2), ('high', 4), ('of', 5), ('oh', 2), ('no', 1), ('boy', 1), ('is', 3), ('hen', 1), ('orange', 1), ('a', 5), ('jumped', 7), ('and', 5), ('fox', 8), ('red', 4), ('blue', 3)] 131 | >>> 132 | -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2020-10-05.txt: -------------------------------------------------------------------------------- 1 | $ cat /tmp/foxy.txt 2 | a fox jumped and jumped 3 | red fox jumped high 4 | a red high fox jumped and jumped 5 | red fox is red 6 | 7 | $ ./bin/pyspark 8 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 9 | [Clang 6.0 (clang-600.0.57)] on darwin 10 | Type "help", "copyright", "credits" or "license" for more information. 11 | 12 | Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties 13 | Setting default log level to "WARN". 14 | To adjust logging level use sc.setLogLevel(newLevel). 15 | For SparkR, use setLogLevel(newLevel). 16 | Welcome to 17 | ____ __ 18 | / __/__ ___ _____/ /__ 19 | _\ \/ _ \/ _ `/ __/ '_/ 20 | /__ / .__/\_,_/_/ /_/\_\ version 3.0.0 21 | /_/ 22 | 23 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 24 | SparkSession available as 'spark'. 25 | >>> 26 | >>> 27 | >>> numbers = [1, 2, 3, 4, 5, 6, 10] 28 | >>> numbers 29 | [1, 2, 3, 4, 5, 6, 10] 30 | >>> 31 | >>> 32 | >>> spark 33 | 34 | 35 | >>># create a new RDD from a Python collection named numbers 36 | >>> rdd_numbers = spark.sparkContext.parallelize(numbers) 37 | >>> rdd_numbers.count() 38 | 7 39 | 40 | >>> rdd_numbers.collect() 41 | [1, 2, 3, 4, 5, 6, 10] 42 | >>> # rdd_numbers : RDD[Integer] 43 | ... 
44 | >>> total = rdd_numbers.reduce(lambda x, y: x+y) 45 | >>> total 46 | 31 47 | 48 | >>># create a new RDD from rdd_numbers 49 | >>> tuples2 = rdd_numbers.map(lambda x: (x, x+1)) 50 | >>> tuples2.count() 51 | 7 52 | >>> tuples2.collect() 53 | [(1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (10, 11)] 54 | >>> 55 | >>> 56 | >>> input_path = '/tmp/foxy.txt' 57 | >>># create a new RDD[String] from a given text file 58 | >>> recs = spark.sparkContext.textFile(input_path) 59 | >>> recs.collect() 60 | [ 61 | 'a fox jumped and jumped', 62 | 'red fox jumped high', 63 | 'a red high fox jumped and jumped', 64 | 'red fox is red' 65 | ] 66 | >>> recs.count() 67 | 4 68 | >>> # recs : RDD[String] 69 | 70 | >>># create a new RDD[(String, Integer)] 71 | >>> recs_length = recs.map(lambda x : (x, len(x))) 72 | >>> recs_length.collect() 73 | [ 74 | ('a fox jumped and jumped', 23), 75 | ('red fox jumped high', 19), 76 | ('a red high fox jumped and jumped', 32), 77 | ('red fox is red', 14) 78 | ] 79 | >>> # recs_length : RDD[(String, Integer)] 80 | 81 | >>># keep only records if their lengt is greater than 20 82 | >>> recs_gt_20 = recs.filter(lambda x: len(x) > 20) 83 | >>> 84 | >>> recs_gt_20.collect() 85 | [ 86 | 'a fox jumped and jumped', 87 | 'a red high fox jumped and jumped' 88 | ] 89 | >>> recs_gt_20.count() 90 | 2 -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2020-10-07.txt: -------------------------------------------------------------------------------- 1 | $ ./bin/pyspark 2 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 3 | [Clang 6.0 (clang-600.0.57)] on darwin 4 | Type "help", "copyright", "credits" or "license" for more information. 5 | Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties 6 | Setting default log level to "WARN". 7 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 8 | Welcome to 9 | ____ __ 10 | / __/__ ___ _____/ /__ 11 | _\ \/ _ \/ _ `/ __/ '_/ 12 | /__ / .__/\_,_/_/ /_/\_\ version 3.0.0 13 | /_/ 14 | 15 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 16 | SparkSession available as 'spark'. 17 | >>> spark 18 | 19 | >>> 20 | >>> 21 | >>> 22 | >>> # create RDD[(String, Integer)] 23 | ... 
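The commands that follow sum values per key in two ways: reduceByKey(), and groupByKey() followed by mapValues(sum). Both give the same totals, but reduceByKey() combines values inside each partition before shuffling, so it generally moves less data. A small side-by-side sketch (the sample pairs echo the ones used below):

kv = spark.sparkContext.parallelize([('alex', 10), ('alex', 20), ('bob', 100)])

totals_1 = kv.reduceByKey(lambda x, y: x + y)   # combines values inside each partition first
totals_2 = kv.groupByKey().mapValues(sum)       # groups all values per key, then sums them
print(sorted(totals_1.collect()))               # [('alex', 30), ('bob', 100)]
print(sorted(totals_2.collect()))               # [('alex', 30), ('bob', 100)]
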
24 | >>> key_value_pairs = 25 | [ 26 | ('alex', 10), 27 | ('alex', 20), 28 | ('alex', 30), 29 | ('bob', 100), 30 | ('bob', 200), 31 | ('zazo', 7) 32 | ] 33 | 34 | >>> # create an RDD[(String, Integer)] from a python collection 35 | >>> key_value = spark.sparkContext.parallelize(key_value_pairs) 36 | >>> key_value.count() 37 | 6 38 | >>> key_value.collect() 39 | [ 40 | ('alex', 10), 41 | ('alex', 20), 42 | ('alex', 30), 43 | ('bob', 100), 44 | ('bob', 200), 45 | ('zazo', 7) 46 | ] 47 | >>> 48 | >>> 49 | >>># use the reduceByKey() transformation 50 | >>> sum_of_values_per_key = key_value.reduceByKey(lambda x, y: x+y) 51 | >>> 52 | >>> sum_of_values_per_key.count() 53 | 3 54 | >>> sum_of_values_per_key.collect() 55 | [ 56 | ('bob', 300), 57 | ('alex', 60), 58 | ('zazo', 7) 59 | ] 60 | >>> 61 | >>> 62 | >>> 63 | >>> filtered = sum_of_values_per_key.filter(lambda x: x[1] > 10) 64 | >>> filtered.collect() 65 | [('bob', 300), ('alex', 60)] 66 | >>> 67 | >>> 68 | >>> key_value.collect() 69 | [ 70 | ('alex', 10), 71 | ('alex', 20), 72 | ('alex', 30), 73 | ('bob', 100), 74 | ('bob', 200), 75 | ('zazo', 7) 76 | ] 77 | >>> 78 | >>> grouped = key_value.groupByKey() 79 | >>> grouped.collect() 80 | [ 81 | ('bob', ), 82 | ('alex', ), 83 | ('zazo', ) 84 | ] 85 | >>> grouped.mapValues(lambda v : list(v)).collect() 86 | [ 87 | ('bob', [100, 200]), 88 | ('alex', [10, 20, 30]), 89 | ('zazo', [7]) 90 | ] 91 | >>> sum_of_values_per_key_2 = grouped.mapValues(lambda values: sum(values)) 92 | >>> sum_of_values_per_key_2.collect() 93 | [ 94 | ('bob', 300), 95 | ('alex', 60), 96 | ('zazo', 7) 97 | ] 98 | >>> 99 | >>> 100 | >>> pairs = [('a', 10), ('a', 100), ('a', 200), ('b', 10)] 101 | >>> rdd = spark.sparkContext.parallelize(pairs) 102 | >>> 103 | >>> rdd.collect() 104 | [('a', 10), ('a', 100), ('a', 200), ('b', 10)] 105 | >>> rdd2 = rdd.mapValues(lambda v: v+1000) 106 | >>> rdd2.collect() 107 | [('a', 1010), ('a', 1100), ('a', 1200), ('b', 1010)] 108 | >>> 109 | >>> rdd3 = rdd.map(lambda x: x[1]+1000) 110 | >>> rdd3.collect() 111 | [1010, 1100, 1200, 1010] 112 | >>> 113 | >>> 114 | >>> rdd3 = rdd.map(lambda x: (x[0], x[1]+1000)) 115 | >>> rdd3.collect() 116 | [('a', 1010), ('a', 1100), ('a', 1200), ('b', 1010)] 117 | >>> 118 | >>> 119 | >>> data = [ ['a', 'b', 'c'], ['z'], [], [], ['alex', 'bob'] ] 120 | >>> rdd = spark.sparkContext.parallelize(data) 121 | >>> rdd.collect() 122 | [['a', 'b', 'c'], ['z'], [], [], ['alex', 'bob']] 123 | >>> rdd.count() 124 | 5 125 | >>> flattened = rdd.flatMap(lambda x: x) 126 | >>> flattened.count() 127 | 6 128 | >>> flattened.collect() 129 | ['a', 'b', 'c', 'z', 'alex', 'bob'] 130 | >>> mapped = rdd.map(lambda x: x) 131 | >>> mapped.count() 132 | 5 133 | >>> mapped.collect() 134 | [['a', 'b', 'c'], ['z'], [], [], ['alex', 'bob']] 135 | >>> 136 | >>> 137 | >>> data = [ ['a', 'b', 'c'], ['z'], [], [], ('alex', 'bob') ] 138 | >>> flattened2 = rdd.flatMap(lambda x: x) 139 | >>> flattened2.collect() 140 | ['a', 'b', 'c', 'z', 'alex', 'bob'] 141 | >>> 142 | >>> 143 | >>> 144 | >>> data2 = [ ['a', 'b', 'c'], ['z'], [], [], ('alex', 'bob') ] 145 | >>> data2 146 | [['a', 'b', 'c'], ['z'], [], [], ('alex', 'bob')] 147 | >>> rdd2 = spark.sparkContext.parallelize(data2) 148 | >>> 149 | >>> 150 | >>> rdd2.collect() 151 | [['a', 'b', 'c'], ['z'], [], [], ('alex', 'bob')] 152 | >>> rdd2.count() 153 | 5 154 | >>> flattened2 = rdd2.flatMap(lambda x: x) 155 | >>> flattened2.collect() 156 | ['a', 'b', 'c', 'z', 'alex', 'bob'] 157 | >>> 158 | >>> 159 | >>> data3 = [ ['a', 'b', 'c'], ['z'], [], [], 
'alex', 'bob' ] 160 | >>> rdd3 = spark.sparkContext.parallelize(data3) 161 | >>> flattened3 = rdd3.flatMap(lambda x: x) 162 | >>> flattened3.collect() 163 | ['a', 'b', 'c', 'z', 'a', 'l', 'e', 'x', 'b', 'o', 'b'] 164 | >>> 165 | -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2020-10-12.txt: -------------------------------------------------------------------------------- 1 | $ ls -l /tmp/data/ 2 | -rw-r--r-- 1 mparsian wheel 72 Oct 12 20:00 file1 3 | -rw-r--r-- 1 mparsian wheel 94 Oct 12 20:01 file2 4 | -rw-r--r-- 1 mparsian wheel 35 Oct 12 20:01 file3 5 | 6 | $ cat /tmp/data/file1 7 | file1: this is record 1 8 | file1: this is record 2 9 | file1: this is record 3 10 | 11 | $ cat /tmp/data/file2 12 | file2: this is record 1 13 | file2: this is record 2 14 | file2: this is fox 3 15 | file2: this is it 4 16 | 17 | $ cat /tmp/data/file3 18 | file3: record 1 19 | file3: ewcord 2222 20 | 21 | 22 | $ ./bin/pyspark 23 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 24 | [Clang 6.0 (clang-600.0.57)] on darwin 25 | Type "help", "copyright", "credits" or "license" for more information. 26 | Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties 27 | Setting default log level to "WARN". 28 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 29 | Welcome to 30 | ____ __ 31 | / __/__ ___ _____/ /__ 32 | _\ \/ _ \/ _ `/ __/ '_/ 33 | /__ / .__/\_,_/_/ /_/\_\ version 3.0.0 34 | /_/ 35 | 36 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 37 | SparkSession available as 'spark'. 38 | 39 | 40 | >>> input_path = '/tmp/data' 41 | >>> 42 | >>> recs = spark.sparkContext.textFile(input_path) 43 | >>> recs.count() 44 | 9 45 | >>> recs.collect() 46 | ['file3: record 1', 'file3: ewcord 2222', 'file2: this is record 1', 'file2: this is record 2', 'file2: this is fox 3', 'file2: this is it 4', 'file1: this is record 1', 'file1: this is record 2', 'file1: this is record 3'] 47 | >>> 48 | >>> union2 = recs.union(recs) 49 | >>> union2.count() 50 | 18 51 | >>> union2.collect() 52 | ['file3: record 1', 'file3: ewcord 2222', 'file2: this is record 1', 'file2: this is record 2', 'file2: this is fox 3', 'file2: this is it 4', 'file1: this is record 1', 'file1: this is record 2', 'file1: this is record 3', 'file3: record 1', 'file3: ewcord 2222', 'file2: this is record 1', 'file2: this is record 2', 'file2: this is fox 3', 'file2: this is it 4', 'file1: this is record 1', 'file1: this is record 2', 'file1: this is record 3'] 53 | 54 | 55 | 56 | >>> records = [('A', 1), ('B', 10), ('A', 2), ('A', 3), ('B', 20), ('B', 60)] 57 | >>> 58 | >>> 59 | >>> records 60 | [('A', 1), ('B', 10), ('A', 2), ('A', 3), ('B', 20), ('B', 60)] 61 | >>> 62 | >>> recs_rdd = spark.sparkContext.parallelize(records) 63 | >>> recs_rdd.count() 64 | 6 65 | >>> recs_rdd.collect() 66 | [('A', 1), ('B', 10), ('A', 2), ('A', 3), ('B', 20), ('B', 60)] 67 | >>> # recs_rdd: RDD[(String, Integer)] 68 | ... 69 | >>> sum_per_key = recs_rdd.reduceByKey(lambda x, y: x+y) 70 | >>> sum_per_key.count() 71 | 2 72 | >>> sum_per_key.collect() 73 | [('B', 90), ('A', 6)] 74 | >>> # avg_by_key: [('B', 30), ('A', 2)] 75 | ... 
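The rest of this session derives avg_by_key step by step; for reference, here is the whole (sum, count) pattern in one place as a minimal sketch (recs_rdd is the RDD created above):

# 1) pair every value with a count of 1:  (k, v) -> (k, (v, 1))
sum_count = recs_rdd.mapValues(lambda v: (v, 1))
# 2) add sums and counts per key
sum_count_per_key = sum_count.reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
# 3) divide the sum by the count
avg_by_key = sum_count_per_key.mapValues(lambda t: t[0] / t[1])
print(avg_by_key.collect())   # [('B', 30.0), ('A', 2.0)]  (order may vary)
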
76 | >>> 77 | 78 | >>> sum_count = recs_rdd.mapValues(lambda v: (v, 1)) 79 | >>> 80 | >>> sum_count.collect() 81 | [('A', (1, 1)), ('B', (10, 1)), ('A', (2, 1)), ('A', (3, 1)), ('B', (20, 1)), ('B', (60, 1))] 82 | >>> 83 | >>> 84 | >>> sum_count1 = (10, 1) 85 | >>> sum_count2 = (20, 2) 86 | >>> # (10+20, 1+2) 87 | ... # (30, 3) 88 | ... 89 | >>> sum_count_per_key = sum_count.reduceByKey(lambda x, y: (x[0]+y[0], x[1]+y[1])) 90 | >>> sum_count_per_key.collect() 91 | [('B', (90, 3)), ('A', (6, 3))] 92 | >>> 93 | ])> avg_by_key = sum_count_per_key.mapValues(lambda sum_and_count_tuple : sum_and_count_tuple[0] / sum_and_count_tuple[1 94 | >>> avg_by_key.count() 95 | 2 96 | >>> avg_by_key.collect() 97 | [('B', 30.0), ('A', 2.0)] 98 | >>> 99 | 100 | 101 | >>> sum_count.collect() 102 | [('A', (1, 1)), ('B', (10, 1)), ('A', (2, 1)), ('A', (3, 1)), ('B', (20, 1)), ('B', (60, 1))] 103 | 104 | >>> def add_sum_count(x, y): 105 | ... sum2 = x[0] + y[0] 106 | ... count = x[1] + y[1] 107 | ... return (sum2, count) 108 | ... 109 | >>> 110 | >>> sum_count_per_key = sum_count.reduceByKey(lambda x, y: add_sum_count(x, y)) 111 | >>> sum_count_per_key.collect() 112 | [('B', (90, 3)), ('A', (6, 3))] 113 | >>> avg_per_key = sum_count_per_key.mapValues(lambda tuple: tuple[0] / tuple[1]) 114 | >>> avg_per_key.collect() 115 | [('B', 30.0), ('A', 2.0)] 116 | >>> 117 | -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2020-10-19.txt: -------------------------------------------------------------------------------- 1 | mapPartitions() Explained. 2 | 3 | 4 | ./bin/pyspark 5 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 6 | [Clang 6.0 (clang-600.0.57)] on darwin 7 | Type "help", "copyright", "credits" or "license" for more information. 8 | 20/10/19 20:19:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable 9 | Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties 10 | Setting default log level to "WARN". 11 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 12 | Welcome to 13 | ____ __ 14 | / __/__ ___ _____/ /__ 15 | _\ \/ _ \/ _ `/ __/ '_/ 16 | /__ / .__/\_,_/_/ /_/\_\ version 3.0.0 17 | /_/ 18 | 19 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 20 | SparkSession available as 'spark'. 21 | >>> input_path = '/Users/mparsian/numbers' 22 | >>> rdd = spark.sparkContext.textFile(input_path) 23 | >>> 24 | >>> rdd.collect() 25 | ['3', '5', '55', '44', '9', '3', '66', '77', '88', '34', '23', '11', '14', '4', '3', '8', '9', '78', '79', '60', '56', '45'] 26 | >>> num_of_partitions = rdd.numPartitions() 27 | Traceback (most recent call last): 28 | File "", line 1, in 29 | AttributeError: 'RDD' object has no attribute 'numPartitions' 30 | >>> num_of_partitions = rdd.getNumPartitions() 31 | >>> 32 | >>> num_of_partitions 33 | 2 34 | >>> rdd = spark.sparkContext.textFile(input_path, 4) 35 | >>> num_of_partitions = rdd.getNumPartitions() 36 | >>> num_of_partitions 37 | 5 38 | >>> rdd = spark.sparkContext.textFile(input_path, 4) 39 | >>> num_of_partitions = rdd.getNumPartitions() 40 | >>> num_of_partitions 41 | 5 42 | >>> def debug(iterator): 43 | ... elements = [] 44 | ... for x in iterator: 45 | ... elements.append(x) 46 | ... print("elements="+ str(elements)) 47 | ... 
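A side note before debug() is used below: foreachPartition() is an action run only for its side effects, so in local mode the print() output shows up in the shell, while on a cluster it ends up in the executor logs. An alternative way to inspect partition contents is glom(), which turns each partition into a list. A small sketch (not part of this session):

rdd_small = spark.sparkContext.parallelize(range(1, 11), 3)
print(rdd_small.glom().collect())
# e.g. [[1, 2, 3], [4, 5, 6], [7, 8, 9, 10]] -- one inner list per partition
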
48 | >>> 49 | >>> rdd.foreachPartition(debug) 50 | elements=['78', '79', '60', '56', '45'] 51 | elements=[] 52 | elements=['11', '14', '4', '3', '8', '9'] 53 | elements=['3', '5', '55', '44', '9', '3', '66'] 54 | elements=['77', '88', '34', '23'] 55 | >>> 56 | >>> 57 | >>> rdd = spark.sparkContext.textFile(input_path) 58 | >>> rdd.colect() 59 | Traceback (most recent call last): 60 | File "", line 1, in 61 | AttributeError: 'RDD' object has no attribute 'colect' 62 | >>> rdd.collect() 63 | ['3', '5', '55', '44', '9', '3', '66', '77', '88', '34', '23', '11', '14', '4', '3', '8', '9', '78', '79', '60', '56', '45'] 64 | >>> num_of_partitions = rdd.getNumPartitions() 65 | >>> num_of_partitions 66 | 2 67 | >>> rdd.foreachPartition(debug) 68 | elements=['14', '4', '3', '8', '9', '78', '79', '60', '56', '45'] 69 | elements=['3', '5', '55', '44', '9', '3', '66', '77', '88', '34', '23', '11'] 70 | >>> 71 | >>> 72 | >>> 73 | >>> def find_min_max(partition): 74 | ... first_time = False 75 | ... for n in partition: 76 | ... if first_time == False: 77 | ... min2 = n 78 | ... max2 = n 79 | ... first_time == True 80 | ... else: 81 | ... min2 = min(n, min2) 82 | ... max2 = max(n, max2) 83 | ... return [(min2, max2)] 84 | ... 85 | >>> 86 | >>> target = rdd.mapPartitions(find_min_max) 87 | >>> target.collect() 88 | [('11', '11'), ('45', '45')] 89 | >>> 90 | >>> rdd_integer = rdd.map(lambda n : int(n)) 91 | >>> rdd_integer.collect() 92 | [3, 5, 55, 44, 9, 3, 66, 77, 88, 34, 23, 11, 14, 4, 3, 8, 9, 78, 79, 60, 56, 45] 93 | >>> target = rdd.mapPartitions(find_min_max) 94 | >>> target.collect() 95 | [('11', '11'), ('45', '45')] 96 | >>> 97 | >>> 98 | >>> target = rdd_integer.mapPartitions(find_min_max) 99 | >>> target.collect() 100 | [(11, 11), (45, 45)] 101 | >>> 102 | >>> 103 | >>> def find_min_max(partition): 104 | ... first_time = False 105 | ... for n in partition: 106 | ... if first_time == False: 107 | ... min2 = n 108 | ... max2 = n 109 | ... first_time = True 110 | ... else: 111 | ... min2 = min(n, min2) 112 | ... max2 = max(n, max2) 113 | ... return [(min2, max2)] 114 | ... 115 | ... 116 | >>> def debug(iterator): 117 | ... elements = [] 118 | ... for x in iterator: 119 | ... elements.append(x) 120 | ... print("elements="+ str(elements)) 121 | ... 122 | >>> 123 | >>> target = rdd_integer.mapPartitions(find_min_max) 124 | 125 | >>> target.collect() 126 | [(3, 88), (3, 79)] 127 | >>> rdd_integer.foreachPartition(debug) 128 | elements=[14, 4, 3, 8, 9, 78, 79, 60, 56, 45] 129 | elements=[3, 5, 55, 44, 9, 3, 66, 77, 88, 34, 23, 11] 130 | >>> target 131 | PythonRDD[14] at collect at :1 132 | >>> final_min_max = target.reduce(lambda x, y: (min(x[0], y[0]), max(x[1], y[1]))) 133 | >>> final_min_max 134 | (3, 88) 135 | >>> 136 | -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2021-01-19.txt: -------------------------------------------------------------------------------- 1 | $ ./bin/pyspark 2 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 3 | [Clang 6.0 (clang-600.0.57)] on darwin 4 | Type "help", "copyright", "credits" or "license" for more information. 5 | 21/01/19 20:03:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable 6 | Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties 7 | Setting default log level to "WARN". 8 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 
9 | Welcome to 10 | ____ __ 11 | / __/__ ___ _____/ /__ 12 | _\ \/ _ \/ _ `/ __/ '_/ 13 | /__ / .__/\_,_/_/ /_/\_\ version 3.0.0 14 | /_/ 15 | 16 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 17 | SparkSession available as 'spark'. 18 | >>> 19 | >>> 20 | >>> tuples2 = [('alex', 4), ('alex', 5), ('bob', 40), ('bob', 50), ('bob', 4)] 21 | >>> tuples2 22 | [('alex', 4), ('alex', 5), ('bob', 40), ('bob', 50), ('bob', 4)] 23 | >>> 24 | >>> 25 | >>> pairs_rdd = spark.sparkContext.parallelize(tuples2) 26 | >>> pairs_rdd 27 | ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:262 28 | 29 | >>> pairs_rdd.collect() 30 | [('alex', 4), ('alex', 5), ('bob', 40), ('bob', 50), ('bob', 4)] 31 | >>> pairs_rdd.count() 32 | 5 33 | >>> tuples33 = [('alex', 4, 44), ('alex', 5, 55), ('bob', 40, 66)] 34 | >>> tuples33 35 | [('alex', 4, 44), ('alex', 5, 55), ('bob', 40, 66)] 36 | >>> rdd = spark.sparkContext.parallelize(tuples33) 37 | >>> 38 | >>> rdd.collect() 39 | [('alex', 4, 44), ('alex', 5, 55), ('bob', 40, 66)] 40 | >>> rdd.count() 41 | 3 42 | >>> 43 | >>> 44 | >>> 45 | >>> pairs_rdd.collect() 46 | [('alex', 4), ('alex', 5), ('bob', 40), ('bob', 50), ('bob', 4)] 47 | 48 | >>> new_rdd = pairs_rdd.map(lambda x: (x[0], x[1], 2*int(x[1]))) 49 | >>> new_rdd.collect() 50 | [('alex', 4, 8), ('alex', 5, 10), ('bob', 40, 80), ('bob', 50, 100), ('bob', 4, 8)] 51 | >>> 52 | >>> columns = ["name", "age", "salary"] 53 | >>> some_tuples = [('alex', 40, 80000), ('alex', 50, 1000000), ('bob', 40, 8000000), ('bob', 50, 10000)] 54 | >>> df = spark.createDataFrame(some_tuples, columns) 55 | >>> df.show() 56 | +----+---+-------+ 57 | |name|age| salary| 58 | +----+---+-------+ 59 | |alex| 40| 80000| 60 | |alex| 50|1000000| 61 | | bob| 40|8000000| 62 | | bob| 50| 10000| 63 | +----+---+-------+ 64 | 65 | >>> df.printSchema() 66 | root 67 | |-- name: string (nullable = true) 68 | |-- age: long (nullable = true) 69 | |-- salary: long (nullable = true) 70 | 71 | >>> rdd = spark.sparkContext.parallelize(some_tuples) 72 | >>> rdd.collect() 73 | [('alex', 40, 80000), ('alex', 50, 1000000), ('bob', 40, 8000000), ('bob', 50, 10000)] 74 | >>> rdd.take(2) 75 | [('alex', 40, 80000), ('alex', 50, 1000000)] 76 | >>> 77 | 78 | >>> 79 | >>> data = ["alex,20", "alex,30", "bob,40", "bob,50", "bob,60"] 80 | >>> data 81 | ['alex,20', 'alex,30', 'bob,40', 'bob,50', 'bob,60'] 82 | >>> 83 | >>> 84 | >>> rdd = spark.sparkContext.parallelize(data) 85 | >>> rdd.collect() 86 | ['alex,20', 'alex,30', 'bob,40', 'bob,50', 'bob,60'] 87 | >>> rdd.count() 88 | 5 89 | 90 | >>> def create_pairs(rec): 91 | ... tokens = rec.split(",") 92 | ... key = tokens[0] 93 | ... value = tokens[1] 94 | ... return (key, value) 95 | ... 96 | >>> 97 | >>> pairs = rdd.map(lambda x: create_pairs(x)) 98 | >>> pairs.collect() 99 | [('alex', '20'), ('alex', '30'), ('bob', '40'), ('bob', '50'), ('bob', '60')] 100 | >>> pairs.count() 101 | 5 102 | >>> pairs = rdd.map(create_pairs) 103 | >>> pairs.collect() 104 | [('alex', '20'), ('alex', '30'), ('bob', '40'), ('bob', '50'), ('bob', '60')] 105 | >>> pairs.count() 106 | 5 107 | >>> 108 | >>> sum_by_key = pairs.reduceByKey(lambda x, y: x+y) 109 | >>> sum_by_key.collect() 110 | [('bob', '405060'), ('alex', '2030')] 111 | >>> 112 | >>> def create_pair(rec): 113 | ... tokens = rec.split(",") 114 | ... key = tokens[0] 115 | ... value = int(tokens[1]) 116 | ... return (key, value) 117 | ... 
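A side note on the DataFrame built earlier in this session: a DataFrame can be converted back to an RDD of Row objects via df.rdd, and Row fields are accessible by column name. A small sketch (not from the session; it reuses some_tuples and columns as defined above):

df = spark.createDataFrame(some_tuples, columns)                      # as built earlier in this session
name_salary = df.rdd.map(lambda row: (row['name'], row['salary']))    # DataFrame -> RDD[Row] -> RDD[tuple]
print(name_salary.collect())
# [('alex', 80000), ('alex', 1000000), ('bob', 8000000), ('bob', 10000)]
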
118 | >>> 119 | 120 | >>> rdd2 = rdd.map(lambda x: create_pair(x)) 121 | >>> rdd2.collect() 122 | [('alex', 20), ('alex', 30), ('bob', 40), ('bob', 50), ('bob', 60)] 123 | >>> sum_by_key = rdd2.reduceByKey(lambda x, y: x+y) 124 | >>> sum_by_key.collect() 125 | [('bob', 150), ('alex', 50)] 126 | >>> -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2021-01-26.txt: -------------------------------------------------------------------------------- 1 | Spark's Mapper Transformations: 2 | 3 | # map: 1 -> 1 4 | 5 | # flatMap: 1 -> Many 6 | 7 | # mapPartitions: partition -> 1 (Many to 1) 8 | 9 | Many = 0, 1, 2, 3, 4, ... 10 | partition = many elements 11 | 12 | $ ./bin/pyspark 13 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 14 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 15 | Welcome to 16 | ____ __ 17 | / __/__ ___ _____/ /__ 18 | _\ \/ _ \/ _ `/ __/ '_/ 19 | /__ / .__/\_,_/_/ /_/\_\ version 3.0.0 20 | /_/ 21 | 22 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 23 | SparkSession available as 'spark'. 24 | >>> 25 | >>> 26 | >>> spark 27 | 28 | >>> sc = spark.sparkContext 29 | >>> sc 30 | 31 | >>> 32 | >>> 33 | >>> data = [ [1, 2, 3], [4, 5, 6, 7] ] 34 | >>> data 35 | [[1, 2, 3], [4, 5, 6, 7]] 36 | >>> data[0] 37 | [1, 2, 3] 38 | >>> data[1] 39 | [4, 5, 6, 7] 40 | >>> 41 | >>> rdd = spark.sparkContext.parallelize(data) 42 | >>> rdd.collect() 43 | [[1, 2, 3], [4, 5, 6, 7]] 44 | >>> rdd.count() 45 | 2 46 | >>> 47 | >>> rdd_mapped = rdd.map(lambda x: x) 48 | >>> rdd_mapped.collect() 49 | [[1, 2, 3], [4, 5, 6, 7]] 50 | >>> rdd_mapped.count() 51 | 2 52 | >>> 53 | >>> rdd_flat_mapped = rdd.flatMap(lambda x: x) 54 | >>> rdd_flat_mapped.collect() 55 | [1, 2, 3, 4, 5, 6, 7] 56 | >>> rdd_flat_mapped.count() 57 | 7 58 | >>> data = [ [1, 2, 3], [], [4, 5, 6, 7], [], [9] ] 59 | >>> data 60 | [[1, 2, 3], [], [4, 5, 6, 7], [], [9]] 61 | >>> data[0] 62 | [1, 2, 3] 63 | >>> data[1] 64 | [] 65 | >>> data[3] 66 | [] 67 | >>> data[2] 68 | [4, 5, 6, 7] 69 | >>> data[3] 70 | [] 71 | >>> data[4] 72 | [9] 73 | >>> rdd = spark.sparkContext.parallelize(data) 74 | >>> rdd.collect() 75 | [[1, 2, 3], [], [4, 5, 6, 7], [], [9]] 76 | >>> rdd.count() 77 | 5 78 | >>> rdd_mapped = rdd.map(lambda x: x) 79 | >>> rdd_mapped.collect() 80 | [[1, 2, 3], [], [4, 5, 6, 7], [], [9]] 81 | >>> rdd_mapped.count() 82 | 5 83 | >>> rdd_flat_mapped = rdd.flatMap(lambda x: x) 84 | >>> rdd_flat_mapped.collect() 85 | [1, 2, 3, 4, 5, 6, 7, 9] 86 | >>> rdd_flat_mapped.count() 87 | 8 88 | -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2021-04-12.txt: -------------------------------------------------------------------------------- 1 | ~/spark-3.1.1 $ ./bin/pyspark 2 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 3 | [Clang 6.0 (clang-600.0.57)] on darwin 4 | Type "help", "copyright", "credits" or "license" for more information. 5 | 21/04/12 20:59:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable 6 | Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties 7 | Setting default log level to "WARN". 8 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 
9 | Welcome to 10 | ____ __ 11 | / __/__ ___ _____/ /__ 12 | _\ \/ _ \/ _ `/ __/ '_/ 13 | /__ / .__/\_,_/_/ /_/\_\ version 3.1.1 14 | /_/ 15 | 16 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 17 | Spark context Web UI available at http://10.0.0.93:4040 18 | Spark context available as 'sc' (master = local[*], app id = local-1618286379380). 19 | SparkSession available as 'spark'. 20 | >>> spark 21 | 22 | >>> 23 | >>> 24 | >>> numbers = [1, 2, 3, 6, 7, 8, 99, 10, -10, -30] 25 | >>> numbers 26 | [1, 2, 3, 6, 7, 8, 99, 10, -10, -30] 27 | 28 | >>># create an RDD[Integer] from a collection 29 | >>># RDD = Resilient Distributed Dataset 30 | >>> rdd = spark.sparkContext.parallelize(numbers) 31 | >>> rdd.collect() 32 | [1, 2, 3, 6, 7, 8, 99, 10, -10, -30] 33 | >>> rdd.count() 34 | 10 35 | 36 | >>># fund sum of all numbers in rdd as (RDD[Integer]) 37 | >>> total = rdd.reduce(lambda x, y: x+y) 38 | >>> total 39 | 96 40 | 41 | >>>#apply a filter: find all positive numbers 42 | >>> positives = rdd.filter(lambda x : x > 0) 43 | >>> positives.collect() 44 | [1, 2, 3, 6, 7, 8, 99, 10] 45 | >>> 46 | >>># increment every element by 1000 47 | >>> rdd2 = rdd.map(lambda x : x+1000) 48 | >>> rdd2.collect() 49 | [1001, 1002, 1003, 1006, 1007, 1008, 1099, 1010, 990, 970] 50 | >>> 51 | >>># create (key, value) pairs 52 | >>> data = [("m1", 4), ("m1", 5), ("m2", 3), ("m2", 4), ("m2", 5), ("m3", 2), ("m3", 4)] 53 | >>> data 54 | [('m1', 4), ('m1', 5), ('m2', 3), ('m2', 4), ('m2', 5), ('m3', 2), ('m3', 4)] 55 | 56 | >>> 57 | >>> pairs = spark.sparkContext.parallelize(data) 58 | >>> pairs.collect() 59 | [('m1', 4), ('m1', 5), ('m2', 3), ('m2', 4), ('m2', 5), ('m3', 2), ('m3', 4)] 60 | 61 | >>># keep elements if their associated value is Greater Than 3 62 | >>># x[0] refers to key 63 | >>># x[1] refers to value 64 | >>> rating45 = pairs.filter(lambda x : x[1] > 3) 65 | >>> rating45.collect() 66 | [('m1', 4), ('m1', 5), ('m2', 4), ('m2', 5), ('m3', 4)] 67 | >>> -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2021-04-14.txt: -------------------------------------------------------------------------------- 1 | $ cat /tmp/foxdata.txt 2 | a red fox jumped of high 3 | fox jumped over a high fence 4 | red of fox jumped 5 | 6 | 7 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 8 | ... 9 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 10 | Welcome to 11 | ____ __ 12 | / __/__ ___ _____/ /__ 13 | _\ \/ _ \/ _ `/ __/ '_/ 14 | /__ / .__/\_,_/_/ /_/\_\ version 3.1.1 15 | /_/ 16 | 17 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 18 | Spark context Web UI available at http://10.0.0.93:4040 19 | Spark context available as 'sc' (master = local[*], app id = local-1618456720582). 20 | SparkSession available as 'spark'. 21 | >>> 22 | >>> 23 | >>> 24 | >>> spark 25 | 26 | 27 | >>> input_path = "/tmp/foxdata.txt" 28 | >>> input_path 29 | '/tmp/foxdata.txt' 30 | >>> # Read input path and create an RDD[String] 31 | ... 32 | >>> records = spark.sparkContext.textFile(input_path) 33 | >>> records 34 | /tmp/foxdata.txt MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0 35 | >>> 36 | >>> records.collect() 37 | [ 38 | 'a red fox jumped of high', 39 | 'fox jumped over a high fence', 40 | 'red of fox jumped' 41 | ] 42 | >>> records.count() 43 | 3 44 | >>> # tokenize records and create RDD[ [String] ] 45 | ... 
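The commands below tokenize in two steps (map() into lists of words, then flatMap() to flatten them); a single flatMap() with split() does both at once. A minimal sketch using the records RDD read just above:

words = records.flatMap(lambda line: line.split(" "))   # RDD[String], one word per element
print(words.count())    # 16 for the /tmp/foxdata.txt records read above
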
46 | >>> tokenizd = records.map(lambda record: record.split(" ")) 47 | >>> tokenizd.collect() 48 | [ 49 | ['a', 'red', 'fox', 'jumped', 'of', 'high'], 50 | ['fox', 'jumped', 'over', 'a', 'high', 'fence'], 51 | ['red', 'of', 'fox', 'jumped'] 52 | ] 53 | >>> tokenizd.count() 54 | 3 55 | >>> pairs = tokenizd.map(lambda word : (word, 1)) 56 | >>> pairs.collect() 57 | [ 58 | (['a', 'red', 'fox', 'jumped', 'of', 'high'], 1), 59 | (['fox', 'jumped', 'over', 'a', 'high', 'fence'], 1), 60 | (['red', 'of', 'fox', 'jumped'], 1) 61 | ] 62 | >>> 63 | >>> words = tokenizd.flatMap(lambda arr: arr) 64 | >>> words.collect() 65 | ['a', 'red', 'fox', 'jumped', 'of', 'high', 'fox', 'jumped', 'over', 'a', 'high', 'fence', 'red', 'of', 'fox', 'jumped'] 66 | >>> words.count() 67 | 16 68 | >>> # words : RDD[String] 69 | ... 70 | >>> key_value_pairs = words.map(lambda word: (word, 1)) 71 | >>> key_value_pairs.collect() 72 | [('a', 1), ('red', 1), ('fox', 1), ('jumped', 1), ('of', 1), ('high', 1), ('fox', 1), ('jumped', 1), ('over', 1), ('a', 1), ('high', 1), ('fence', 1), ('red', 1), ('of', 1), ('fox', 1), ('jumped', 1)] 73 | >>> 74 | >>> # key_value_pairs: RDD[(String, Integer)] 75 | ... 76 | >>> 77 | >>> grouped = key_value_pairs.groupByKey() 78 | >>> grouped.collect() 79 | [ 80 | ('of', ), 81 | ('high', ), 82 | ('fence', ), 83 | ('a', ), 84 | ('red', ), 85 | ('fox', ), 86 | ('jumped', ), 87 | ('over', ) 88 | ] 89 | >>> 90 | >>> debugged = grouped.mapValues(lambda values: list(values)) 91 | >>> debugged.collect() 92 | [ 93 | ('of', [1, 1]), 94 | ('high', [1, 1]), 95 | ('fence', [1]), 96 | ('a', [1, 1]), 97 | ('red', [1, 1]), 98 | ('fox', [1, 1, 1]), 99 | ('jumped', [1, 1, 1]), 100 | ('over', [1]) 101 | ] 102 | >>> 103 | >>> 104 | >>> frequency = grouped.mapValues(lambda values: sum(values)) 105 | >>> frequency.collect() 106 | [('of', 2), ('high', 2), ('fence', 1), ('a', 2), ('red', 2), ('fox', 3), ('jumped', 3), ('over', 1)] 107 | >>> 108 | >>> 109 | >>> 110 | >>> key_value_pairs.collect() 111 | [('a', 1), ('red', 1), ('fox', 1), ('jumped', 1), ('of', 1), ('high', 1), ('fox', 1), ('jumped', 1), ('over', 1), ('a', 1), ('high', 1), ('fence', 1), ('red', 1), ('of', 1), ('fox', 1), ('jumped', 1)] 112 | >>> 113 | >>> 114 | >>> 115 | >>> reduced = key_value_pairs.reduceByKey(lambda x, y: x+y) 116 | >>> reduced.collect() 117 | [('of', 2), ('high', 2), ('fence', 1), ('a', 2), ('red', 2), ('fox', 3), ('jumped', 3), ('over', 1)] 118 | >>> 119 | >>> rdd7 = reduced.mapValues(lambda x: x+100) 120 | >>> rdd7.collect() 121 | [('of', 102), ('high', 102), ('fence', 101), ('a', 102), ('red', 102), ('fox', 103), ('jumped', 103), ('over', 101)] 122 | 123 | >>> rdd77 = reduced.map(lambda x: x[1]+100) 124 | >>> rdd77.collect() 125 | [102, 102, 101, 102, 102, 103, 103, 101] 126 | 127 | >>> rdd77 = reduced.map(lambda x: (x[0], x[1]+100)) 128 | >>> rdd77.collect() 129 | [('of', 102), ('high', 102), ('fence', 101), ('a', 102), ('red', 102), ('fox', 103), ('jumped', 103), ('over', 101)] 130 | >>> 131 | 132 | >>># get number of partitions for rdd77 133 | >>> rdd77.getNumPartitions() 134 | 2 135 | >>> 136 | >>> 137 | >>> KV = [('x', 3), ('x', 5), ('x', 8), ('y', 50), ('y', 60), ('y', 70), ('z', 3)] 138 | >>> KV 139 | [('x', 3), ('x', 5), ('x', 8), ('y', 50), ('y', 60), ('y', 70), ('z', 3)] 140 | >>> rdd = spark.sparkContext.parallelize(KV) 141 | >>> 142 | >>> rdd.collect() 143 | [('x', 3), ('x', 5), ('x', 8), ('y', 50), ('y', 60), ('y', 70), ('z', 3)] 144 | >>> rdd.count() 145 | 7 146 | >>> 147 | >>> filtered1 = rdd.filter(lambda x : x[1] > 
10) 148 | >>> filtered1.collect() 149 | [('y', 50), ('y', 60), ('y', 70)] 150 | >>> filtered2 = rdd.filter(lambda x : x[1] < 10) 151 | >>> filtered2.collect() 152 | [('x', 3), ('x', 5), ('x', 8), ('z', 3)] 153 | >>> 154 | >>> 155 | >>> added = rdd.reduceByKey(lambda a, b: a+b) 156 | >>> added.collect() 157 | [('y', 180), ('z', 3), ('x', 16)] 158 | >>> -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2021-04-19.txt: -------------------------------------------------------------------------------- 1 | $ ./bin/pyspark 2 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 3 | [Clang 6.0 (clang-600.0.57)] on darwin 4 | Type "help", "copyright", "credits" or "license" for more information. 5 | 6 | 21/04/19 20:20:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable 7 | Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties 8 | Setting default log level to "WARN". 9 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 10 | Welcome to 11 | ____ __ 12 | / __/__ ___ _____/ /__ 13 | _\ \/ _ \/ _ `/ __/ '_/ 14 | /__ / .__/\_,_/_/ /_/\_\ version 3.1.1 15 | /_/ 16 | 17 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 18 | Spark context Web UI available at http://10.0.0.93:4040 19 | Spark context available as 'sc' (master = local[*], app id = local-1618888841845). 20 | SparkSession available as 'spark'. 21 | >>> 22 | >>># Create an RDD[(String, Integer)] as rdd 23 | >>> kv =[('a', 3), ('a', 4), ('a', 5), ('b', 30),('b', 40),('b', 50),('z', 3)] 24 | >>> rdd = spark.sparkContext.parallelize(kv) 25 | >>> 26 | >>> 27 | >>> rdd.count() 28 | 7 29 | >>> rdd.collect() 30 | [('a', 3), ('a', 4), ('a', 5), ('b', 30), ('b', 40), ('b', 50), ('z', 3)] 31 | >>> def mapfun1(e): 32 | ... k = e[0] 33 | ... v = e[1] 34 | ... return (k, (v, v+5)) 35 | ... 36 | >>># Create an RDD[(String, (Integer, Integer))] as rdd2 37 | >>># rdd2 has key type of String and value type of (Integer, Integer) 38 | >>> rdd2 = rdd.map(mapfun1) 39 | >>> 40 | >>> rdd2.collect() 41 | [('a', (3, 8)), ('a', (4, 9)), ('a', (5, 10)), ('b', (30, 35)), ('b', (40, 45)), ('b', (50, 55)), ('z', (3, 8))] 42 | >>> rdd2.count() 43 | 7 44 | >>> # rdd: RDD[(String, Integer)] 45 | ... 
46 | >>> # rdd2: RDD[(String, (Integer, Integer)] 47 | >>> 48 | >>># Create an RDD[(String, Integer)] as rdd3 49 | >>> rdd3 = rdd2.map(lambda x: (x[0], x[1][0]+x[1][1])) 50 | >>> rdd3.count() 51 | 7 52 | >>> rdd3.collect() 53 | [('a', 11), ('a', 13), ('a', 15), ('b', 65), ('b', 85), ('b', 105), ('z', 11)] 54 | >>> 55 | >>> rdd31 = rdd2.mapValues(lambda v: v[0]+v[1]) 56 | >>> rdd31.count() 57 | 7 58 | >>> rdd31.collect() 59 | [('a', 11), ('a', 13), ('a', 15), ('b', 65), ('b', 85), ('b', 105), ('z', 11)] 60 | >>> 61 | >>> 62 | >>> 63 | >>> strings = ["abc", "xyzt", "", "123"] 64 | >>> rdd_strings = spark.sparkContext.parallelize(strings) 65 | >>> 66 | >>> rdd_strings.count() 67 | 4 68 | >>> rdd_strings_2 = rdd_strings.flatMap(lambda v: v) 69 | >>> rdd_strings_2.collect() 70 | ['a', 'b', 'c', 'x', 'y', 'z', 't', '1', '2', '3'] 71 | >>> 72 | >>> lists = [ [1, 2, 3], [], [6,7,8,9,10], [] ] 73 | >>> rdd4 = spark.sparkContext.parallelize(lists) 74 | >>> rdd4.collect() 75 | [[1, 2, 3], [], [6, 7, 8, 9, 10], []] 76 | >>> rdd4.count() 77 | 4 78 | 79 | >>> rdd5 = rdd4.flatMap(lambda v: v) 80 | >>> rdd5.collect() 81 | [1, 2, 3, 6, 7, 8, 9, 10] 82 | >>> rdd5.count() 83 | 8 84 | >>> 85 | >>> lists = [ [7, (1,2), (2,4)], ["abc", 99], [6, (7, 7), (8, 8)], [] ] 86 | >>> rdd9 = spark.sparkContext.parallelize(lists) 87 | >>> rdd9.collect() 88 | [[7, (1, 2), (2, 4)], ['abc', 99], [6, (7, 7), (8, 8)], []] 89 | >>> rdd9.count() 90 | 4 91 | >>> rdd10 = rdd9.flatMap(lambda v: v) 92 | >>> rdd10.collect() 93 | [7, (1, 2), (2, 4), 'abc', 99, 6, (7, 7), (8, 8)] 94 | >>> 95 | >>> 96 | >>> rdd11 = rdd10.flatMap(lambda v: v) 97 | >>> rdd11.collect() 98 | 21/04/19 20:43:44 ERROR Executor: Exception in task 5.0 in stage 17.0 (TID 141) 99 | TypeError: 'int' object is not iterable 100 | 101 | >>> 102 | >>> mylist = [(7, 1, 2), (2, 4), ('abc', 99, 6), (7, 7), (8, 8)] 103 | >>> rdd = spark.sparkContext.parallelize(mylist) 104 | >>> rdd.collect() 105 | [(7, 1, 2), (2, 4), ('abc', 99, 6), (7, 7), (8, 8)] 106 | >>> rdd2 = rdd.flatMap(lambda x: x) 107 | >>> rdd2.collect() 108 | [7, 1, 2, 2, 4, 'abc', 99, 6, 7, 7, 8, 8] 109 | >>> 110 | >>> 111 | -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2021-04-21-mapPartitions.txt: -------------------------------------------------------------------------------- 1 | ./bin/pyspark 2 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 3 | ... 4 | Welcome to 5 | ____ __ 6 | / __/__ ___ _____/ /__ 7 | _\ \/ _ \/ _ `/ __/ '_/ 8 | /__ / .__/\_,_/_/ /_/\_\ version 3.1.1 9 | /_/ 10 | 11 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 12 | Spark context Web UI available at http://10.0.0.93:4040 13 | Spark context available as 'sc' (master = local[*], app id = local-1619061713234). 14 | SparkSession available as 'spark'. 15 | >>> 16 | >>> 17 | >>> 18 | >>> nums = [1, 2, 3, 4, -1, 4, 5, 6, 7, -3, -1, 2, 3, 9, -1, -2] 19 | >>> nums 20 | [1, 2, 3, 4, -1, 4, 5, 6, 7, -3, -1, 2, 3, 9, -1, -2] 21 | >>> 22 | >>> 23 | 24 | >>> rdd = sc.parallelize(nums) 25 | >>> rdd.collect() 26 | [1, 2, 3, 4, -1, 4, 5, 6, 7, -3, -1, 2, 3, 9, -1, -2] 27 | >>># find the default number of partitions 28 | >>> rdd.getNumPartitions() 29 | 8 30 | >>> 31 | >>># set number of partitions explicitly to 3 32 | >>> rdd = sc.parallelize(nums, 3) 33 | >>> rdd.getNumPartitions() 34 | 3 35 | >>> def debug(partition): 36 | ... elements = [] 37 | ... for x in partition: 38 | ... elements.append(x) 39 | ... 
print("elements=", elements) 40 | ... 41 | >>> rdd.foreachPartition(debug) 42 | elements= [4, 5, 6, 7, -3] 43 | elements= [1, 2, 3, 4, -1] 44 | elements= [-1, 2, 3, 9, -1, -2] 45 | >>> 46 | >>>#define a function which handles a single partition 47 | >>> def min_max_count(partition): 48 | ... first_time = False 49 | ... local_count = 0 50 | ... for n in partition: 51 | ... local_count += 1 52 | ... if (first_time == False): 53 | ... local_min = n 54 | ... local_max = n 55 | ... first_time = True 56 | ... else: 57 | ... local_min = min(n, local_min) 58 | ... local_max = max(n, local_max) 59 | ... return [(local_min, local_max, local_count)] 60 | ... 61 | >>># Test your custom function without Spark 62 | >>> x = [1, 2, 3, -3, -6, 9, 10, 4, 5, 6] 63 | >>> result = min_max_count(x) 64 | >>> result 65 | [(-6, 10, 10)] 66 | >>> 67 | >>> rdd.foreachPartition(debug) 68 | elements= [1, 2, 3, 4, -1] 69 | elements= [-1, 2, 3, 9, -1, -2] 70 | elements= [4, 5, 6, 7, -3] 71 | >>> 72 | >>> rdd2 = rdd.mapPartitions(min_max_count) 73 | >>> rdd2.collect() 74 | [(-1, 4, 5), (-3, 7, 5), (-2, 9, 6)] 75 | 76 | >>> final_answer = rdd2.reduce(lambda x, y: ( min(x[0], y[0]), max(x[1], y[1]), x[2]+y[2]) ) 77 | >>> final_answer 78 | (-3, 9, 16) 79 | >>> -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2021-04-29-min-max-avg.txt: -------------------------------------------------------------------------------- 1 | Given billions of numbers, find (minimum, maximum, average) 2 | for all numbers. 3 | 4 | I provide 2 solutions: one using tuple of 4: (minimum, maximum, sum, count) 5 | another solution using tuple of 3: (minimum, maximum, sum) 6 | 7 | 8 | $ ./bin/pyspark 9 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 10 | Welcome to 11 | ____ __ 12 | / __/__ ___ _____/ /__ 13 | _\ \/ _ \/ _ `/ __/ '_/ 14 | /__ / .__/\_,_/_/ /_/\_\ version 3.1.1 15 | /_/ 16 | 17 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 18 | Spark context Web UI available at http://10.0.0.93:4040 19 | Spark context available as 'sc' (master = local[*], app id = local-1619727491830). 20 | SparkSession available as 'spark'. 
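Before the interactive session, here is the tuple-of-4 idea packaged as a small standalone sketch (it assumes a SparkSession is already available as 'spark', exactly as in the shell session below, and uses the same numbers; the helper name min_max_avg is just for this sketch):

# sketch: minimum, maximum, average in a single pass over an RDD[Integer]
def min_max_avg(rdd):
    # map each number n to (minimum, maximum, sum, count) = (n, n, n, 1)
    tuple4 = rdd.map(lambda n: (n, n, n, 1))
    # pairwise reduction: keep the smaller minimum, the larger maximum, add sums and counts
    mn, mx, total, count = tuple4.reduce(
        lambda x, y: (min(x[0], y[0]), max(x[1], y[1]), x[2] + y[2], x[3] + y[3]))
    return (mn, mx, total / count)

# usage:
#   rdd = spark.sparkContext.parallelize([1, 2, 3, -1, -2, -3, 4, 5, 6, 7, 8])
#   min_max_avg(rdd)   # returns (-3, 8, 2.727272727272727)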
21 | >>> 22 | >>> 23 | >>> nums = [1, 2, 3, -1, -2, -3, 4, 5, 6, 7, 8] 24 | >>> 25 | >>># Let rdd denote billions of numbers 26 | >>> rdd = spark.sparkContext.parallelize(nums) 27 | >>> rdd.collect() 28 | [1, 2, 3, -1, -2, -3, 4, 5, 6, 7, 8] 29 | >>> 30 | 31 | >>># Create tuple of 4 elements as: (minimum, maximum, sum, count) 32 | >>> tuple4 = rdd.map(lambda n: (n, n, n, 1)) 33 | >>> tuple4.collect() 34 | [(1, 1, 1, 1), (2, 2, 2, 1), (3, 3, 3, 1), (-1, -1, -1, 1), (-2, -2, -2, 1), (-3, -3, -3, 1), (4, 4, 4, 1), (5, 5, 5, 1), (6, 6, 6, 1), (7, 7, 7, 1), (8, 8, 8, 1)] 35 | 36 | >>># Perform a reduction on tuple4 37 | >>> min_max_sum_count = tuple4.reduce(lambda x, y: (min(x[0], y[0]), max(x[1],y[1]), x[2]+y[2], x[3]+y[3]) ) 38 | >>> 39 | >>># Now, min_max_sum_count represents (minimum, maximum, sum, count) 40 | >>> min_max_sum_count 41 | (-3, 8, 30, 11) 42 | >>> final = (min_max_sum_count[0], min_max_sum_count[1], min_max_sum_count[2] / min_max_sum_count[3]) 43 | >>> final 44 | (-3, 8, 2.727272727272727) 45 | >>> 46 | 47 | >>># Solution using tuple of 3 48 | >>> tuple3 = rdd.map(lambda n: (n, n, n)) 49 | >>> min_max_sum = tuple3.reduce(lambda x, y: (min(x[0], y[0]), max(x[1],y[1]), x[2]+y[2]) ) 50 | >>> min_max_sum 51 | (-3, 8, 30) 52 | >>> N = rdd.count() 53 | >>> N 54 | 11 55 | >>> final = (min_max_sum[0], min_max_sum[1], min_max_sum[2] / N) 56 | >>> final 57 | (-3, 8, 2.727272727272727) -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2021-05-05-join.txt: -------------------------------------------------------------------------------- 1 | PySpark Documentation: Join function in PySpark: 2 | http://spark.apache.org/docs/latest/api/python/reference/api/pyspark.RDD.join.html 3 | 4 | $ ./bin/pyspark 5 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 6 | 7 | Welcome to 8 | ____ __ 9 | / __/__ ___ _____/ /__ 10 | _\ \/ _ \/ _ `/ __/ '_/ 11 | /__ / .__/\_,_/_/ /_/\_\ version 3.1.1 12 | /_/ 13 | 14 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 15 | Spark context Web UI available at http://10.0.0.93:4040 16 | Spark context available as 'sc' (master = local[*], app id = local-1620269740798). 17 | SparkSession available as 'spark'.
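A quick reference before the session: rdd.join(other) is an inner join on keys. For every key that appears in both RDDs it emits one (key, (value_from_x, value_from_y)) pair per combination of matching values; keys that appear in only one of the two RDDs are dropped. A minimal sketch (assuming 'sc' is the SparkContext, as in the session below):

x = sc.parallelize([("a", 1), ("b", 4)])
y = sc.parallelize([("a", 2), ("a", 3), ("d", 8)])
# only key 'a' is present in both RDDs, so 'b' and 'd' are dropped
sorted(x.join(y).collect())   # [('a', (1, 2)), ('a', (1, 3))]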
18 | >>> 19 | >>> x = spark.sparkContext.parallelize([("spark", 1), ("hadoop", 4)]) 20 | >>> x.collect() 21 | [ 22 | ('spark', 1), 23 | ('hadoop', 4) 24 | ] 25 | >>> 26 | >>> y = spark.sparkContext.parallelize([("spark", 2), ("hadoop", 5)]) 27 | >>> y.collect() 28 | [ 29 | ('spark', 2), 30 | ('hadoop', 5) 31 | ] 32 | >>> 33 | >>> joined = x.join(y) 34 | >>> joined.collect() 35 | [ 36 | ('spark', (1, 2)), 37 | ('hadoop', (4, 5)) 38 | ] 39 | >>> 40 | >>> 41 | >>> x = spark.sparkContext.parallelize([("a", 1), ("b", 4), ("c", 4)]) 42 | >>> x.collect() 43 | [('a', 1), ('b', 4), ('c', 4)] 44 | >>> y = spark.sparkContext.parallelize([("a", 2), ("a", 3), ("a", 7), ("d", 8)]) 45 | >>> y.collect() 46 | [('a', 2), ('a', 3), ('a', 7), ('d', 8)] 47 | >>> 48 | >>> joined = x.join(y) 49 | >>> joined.collect() 50 | [('a', (1, 2)), ('a', (1, 3)), ('a', (1, 7))] 51 | >>> 52 | >>> 53 | >>> joined.count() 54 | 3 55 | >>> x = spark.sparkContext.parallelize([("a", 1), ("b", 4), ("b", 5), ("c", 4)]); 56 | >>> x.collect() 57 | [('a', 1), ('b', 4), ('b', 5), ('c', 4)] 58 | >>> 59 | >>> y = spark.sparkContext.parallelize([("a", 2), ("a", 3), ("a", 7), ("b", 61), ("b", 71), ("d", 8)]) 60 | >>> y.collect() 61 | [('a', 2), ('a', 3), ('a', 7), ('b', 61), ('b', 71), ('d', 8)] 62 | >>> joined = x.join(y) 63 | >>> joined.collect() 64 | [ 65 | ('b', (4, 61)), 66 | ('b', (4, 71)), 67 | ('b', (5, 61)), 68 | ('b', (5, 71)), 69 | ('a', (1, 2)), 70 | ('a', (1, 3)), 71 | ('a', (1, 7)) 72 | ] 73 | >>> 74 | >>>#pyspark.RDD.cartesian 75 | >>>#RDD.cartesian(other) 76 | >>>#Return the Cartesian product of this RDD and another one, 77 | >>>#that is, the RDD of all pairs of elements (a, b) where a is 78 | >>>#in self and b is in other. 79 | >>># Examples 80 | 81 | >>> 82 | >>> rdd = spark.sparkContext.parallelize([1, 2]) 83 | >>> sorted(rdd.cartesian(rdd).collect()) 84 | [(1, 1), (1, 2), (2, 1), (2, 2)] 85 | 86 | -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2021-10-11-filter-map-flatMap.txt: -------------------------------------------------------------------------------- 1 | Understand filter(), map(), and flatMap() 2 | 3 | $ ./bin/pyspark 4 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 5 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 6 | Welcome to 7 | ____ __ 8 | / __/__ ___ _____/ /__ 9 | _\ \/ _ \/ _ `/ __/ '_/ 10 | /__ / .__/\_,_/_/ /_/\_\ version 3.1.2 11 | /_/ 12 | 13 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 14 | Spark context Web UI available at http://10.0.0.94:4040 15 | Spark context available as 'sc' (master = local[*], app id = local-1634007457887). 16 | SparkSession available as 'spark'. 17 | >>> 18 | >>> 19 | >>> 20 | >>> records = ["this is fox", "fox", "is", "fox is red", "fox is gone"] 21 | >>> records 22 | ['this is fox', 'fox', 'is', 'fox is red', 'fox is gone'] 23 | >>> >>> 24 | >>> 25 | >>> rdd = sc.parallelize(records) 26 | >>> 27 | >>> rdd 28 | ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:274 29 | >>> rdd.count() 30 | 5 31 | >>> rdd.collect() 32 | ['this is fox', 'fox', 'is', 'fox is red', 'fox is gone'] 33 | >>> 34 | >>> 35 | >>> filtered = rdd.filter(lambda x: len(x) > 3) 36 | >>> filtered.collect() 37 | ['this is fox', 'fox is red', 'fox is gone'] 38 | >>> 39 | >>> 40 | >>> def apply_filter(x): 41 | ... if len(x) > 3: return True 42 | ... return False 43 | ... 
44 | >>> 45 | >>> b = apply_filter("this is a long one") 46 | >>> b 47 | True 48 | >>> c = apply_filter("one") 49 | >>> c 50 | False 51 | >>> 52 | >>> filtered_recs = rdd.filter(apply_filter) 53 | >>> 54 | >>> filtered_recs.collect() 55 | ['this is fox', 'fox is red', 'fox is gone'] 56 | >>> 57 | >>> 58 | >>> rdd.collect() 59 | ['this is fox', 'fox', 'is', 'fox is red', 'fox is gone'] 60 | >>> flattened = rdd.flatMap(lambda x: x.split(" ")) 61 | >>> 62 | >>> flattened.collect() 63 | ['this', 'is', 'fox', 'fox', 'is', 'fox', 'is', 'red', 'fox', 'is', 'gone'] 64 | >>> flattened.count() 65 | 11 66 | >>> mapped = rdd.map(lambda x: x.split(" ")) 67 | >>> mapped.collect() 68 | [['this', 'is', 'fox'], ['fox'], ['is'], ['fox', 'is', 'red'], ['fox', 'is', 'gone']] 69 | >>> mapped.count() 70 | 5 71 | >>> 72 | >>> a = [ ["this", "is"], [], [], ["fox", "is", "red", "jumped"] ] 73 | >>> a 74 | [['this', 'is'], [], [], ['fox', 'is', 'red', 'jumped']] 75 | >>> rdd_list = sc.parallelize(a) 76 | >>> rdd_list.collect() 77 | [['this', 'is'], [], [], ['fox', 'is', 'red', 'jumped']] 78 | >>> rdd_list.count() 79 | 4 80 | >>> flattened22 = rdd_list.flatMap(lambda L : L) 81 | >>> flattened22.collect() 82 | ['this', 'is', 'fox', 'is', 'red', 'jumped'] 83 | >>> 84 | >>> 85 | >>> key_value_pairs = [("a", 10), ("a", 20), ("a", 30), ("a", 40), ("b", 300), ("b", 400)] 86 | >>> key_value_pairs 87 | [('a', 10), ('a', 20), ('a', 30), ('a', 40), ('b', 300), ('b', 400)] 88 | >>> key_value_rdd = sc.parallelize(key_value_pairs) 89 | >>> 90 | >>> key_value_rdd.collect() 91 | [('a', 10), ('a', 20), ('a', 30), ('a', 40), ('b', 300), ('b', 400)] 92 | >>> 93 | >>> def custom_func(x): 94 | ... k = x[0] 95 | ... v = x[1] 96 | ... if (v < 30): return [] 97 | ... return [(k, v+1000), ("MYKEY", v+4000)] 98 | ... 99 | >>> 100 | >>> y = custom_func(("x", 25)) 101 | >>> y 102 | [] 103 | >>> y = custom_func(("x", 300)) 104 | >>> y 105 | [('x', 1300), ('MYKEY', 4300)] 106 | >>> flattened = key_value_rdd.flatMap(custom_func) 107 | >>> flattened.collect() 108 | [('a', 1030), ('MYKEY', 4030), ('a', 1040), ('MYKEY', 4040), ('b', 1300), ('MYKEY', 4300), ('b', 1400), ('MYKEY', 4400)] 109 | >>> flattened.count() 110 | 8 111 | >>> 112 | >>> mapped = key_value_rdd.map(custom_func) 113 | >>> mapped.collect() 114 | [[], [], [('a', 1030), ('MYKEY', 4030)], [('a', 1040), ('MYKEY', 4040)], [('b', 1300), ('MYKEY', 4300)], [('b', 1400), ('MYKEY', 4400)]] 115 | >>> mapped.count() 116 | 6 117 | >>> filtered99 = mapped.filter(lambda x: len(x) > 0) 118 | >>> filtered99.collect() 119 | [[('a', 1030), ('MYKEY', 4030)], [('a', 1040), ('MYKEY', 4040)], [('b', 1300), ('MYKEY', 4300)], [('b', 1400), ('MYKEY', 4400)]] 120 | >>> 121 | >>> 122 | >>> 123 | >>> x = set() 124 | >>> x.add(1) 125 | >>> x 126 | {1} 127 | >>> x.add(1) 128 | >>> x 129 | {1} 130 | >>> x.add(3) 131 | >>> x.add(4) 132 | >>> x 133 | {1, 3, 4} 134 | >>> x.add(4) 135 | >>> x.add(4) 136 | >>> x.add(4) 137 | >>> x.add(4) 138 | >>> x 139 | {1, 3, 4} 140 | >>> x = [] 141 | >>> x.append(1) 142 | >>> x 143 | [1] 144 | >>> x.append(1) 145 | >>> x 146 | [1, 1] 147 | >>> -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2021-10-20-understanding-partitions.txt: -------------------------------------------------------------------------------- 1 | Understanding Partitions 2 | 3 | $ ./bin/pyspark 4 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 5 | To adjust logging level use sc.setLogLevel(newLevel). 
For SparkR, use setLogLevel(newLevel). 6 | Welcome to 7 | ____ __ 8 | / __/__ ___ _____/ /__ 9 | _\ \/ _ \/ _ `/ __/ '_/ 10 | /__ / .__/\_,_/_/ /_/\_\ version 3.1.2 11 | /_/ 12 | 13 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 14 | Spark context Web UI available at http://10.0.0.94:4040 15 | Spark context available as 'sc' (master = local[*], app id = local-1634788905125). 16 | SparkSession available as 'spark'. 17 | >>> 18 | >>> nums = [1, 2, 3, 4, 5, 77, 77, 66, 99, 33, 33, 22, 22, 11, 123, 44, 45, 67, 89, 77, 66, 44, 55, 99, 80, 90] 19 | >>> nums 20 | [1, 2, 3, 4, 5, 77, 77, 66, 99, 33, 33, 22, 22, 11, 123, 44, 45, 67, 89, 77, 66, 44, 55, 99, 80, 90] 21 | >>> # rdd : RDD[Integer] 22 | >>> rdd = sc.parallelize(nums) 23 | >>> rdd.count() 24 | 26 25 | >>> rdd.collect() 26 | [1, 2, 3, 4, 5, 77, 77, 66, 99, 33, 33, 22, 22, 11, 123, 44, 45, 67, 89, 77, 66, 44, 55, 99, 80, 90] 27 | >>> 28 | >>> # get number of partitions (default, set by cluster manager) 29 | >>> rdd.getNumPartitions() 30 | 8 31 | >>> # set number of partitions explicitly to 4 32 | >>> rdd2 = sc.parallelize(nums, 4) 33 | >>> rdd2.getNumPartitions() 34 | 4 35 | >>> # define a debugger to output all elements of a partition 36 | >>> def debug_partition(partition): 37 | ... print("partition=", list(partition)) 38 | ... 39 | >>> rdd.foreachPartition(debug_partition) 40 | partition= [1, 2, 3] 41 | partition= [33, 33, 22] 42 | partition= [22, 11, 123] 43 | partition= [44, 45, 67] 44 | partition= [77, 66, 99] 45 | partition= [44, 55, 99, 80, 90] 46 | partition= [89, 77, 66] 47 | partition= [4, 5, 77] 48 | >>> 49 | >>> rdd2.foreachPartition(debug_partition) 50 | partition= [89, 77, 66, 44, 55, 99, 80, 90] 51 | partition= [1, 2, 3, 4, 5, 77] 52 | partition= [22, 11, 123, 44, 45, 67] 53 | partition= [77, 66, 99, 33, 33, 22] 54 | >>> -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2021-10-25-RDD-join.txt: -------------------------------------------------------------------------------- 1 | Inner Join Example 2 | 3 | $ pyspark 4 | Python 3.7.10 (default, Jun 3 2021, 00:02:01) 5 | Welcome to 6 | ____ __ 7 | / __/__ ___ _____/ /__ 8 | _\ \/ _ \/ _ `/ __/ '_/ 9 | /__ / .__/\_,_/_/ /_/\_\ version 2.4.5-amzn-0 10 | /_/ 11 | 12 | Using Python version 3.7.10 (default, Jun 3 2021 00:02:01) 13 | SparkContext available as 'sc'. 14 | SparkSession available as 'spark'. 
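Both RDDs in the session below contain repeated keys, so the inner join emits one output pair per combination of matching values: the result size is the sum, over the common keys, of count_x(key) * count_y(key). A small sketch of that sanity check (assuming 'sc' as in the session; x and y are the same pairs used below):

x = sc.parallelize([("a", 1), ("a", 4), ("b", 7), ("b", 8), ("c", 89)])
y = sc.parallelize([("a", 100), ("a", 400), ("b", 700), ("b", 800), ("b", 900), ("d", 890)])
cx, cy = x.countByKey(), y.countByKey()              # dicts: key -> number of values
expected = sum(cx[k] * cy[k] for k in cx.keys() & cy.keys())
expected             # 2*2 for 'a' + 2*3 for 'b' = 10
x.join(y).count()    # 10 as well; 'c' and 'd' never appear in the result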
15 | >>> 16 | >>> 17 | >>> x = sc.parallelize([("a", 1), ("a", 4), ("b", 7), ("b", 8), ("c", 89)]) 18 | >>> y = sc.parallelize([("a", 100), ("a", 400), ("b", 700), ("b", 800), ("b", 900), ("d", 890)]) 19 | >>> x.collect() 20 | [ 21 | ('a', 1), ('a', 4), 22 | ('b', 7), ('b', 8), 23 | ('c', 89) 24 | ] 25 | >>> y.collect() 26 | [ 27 | ('a', 100), ('a', 400), 28 | ('b', 700), ('b', 800), ('b', 900), 29 | ('d', 890) 30 | ] 31 | 32 | >>> joined = x.join(y) 33 | >>> joined.collect() 34 | [ 35 | ('b', (7, 800)), 36 | ('b', (7, 900)), 37 | ('b', (7, 700)), 38 | ('b', (8, 800)), 39 | ('b', (8, 900)), 40 | ('b', (8, 700)), 41 | ('a', (1, 100)), 42 | ('a', (1, 400)), 43 | ('a', (4, 100)), 44 | ('a', (4, 400)) 45 | ] 46 | >>> joined2 = y.join(x) 47 | >>> joined2.collect() 48 | [ 49 | ('b', (700, 8)), 50 | ('b', (700, 7)), 51 | ('b', (800, 8)), 52 | ('b', (800, 7)), 53 | ('b', (900, 8)), 54 | ('b', (900, 7)), 55 | ('a', (100, 4)), 56 | ('a', (100, 1)), 57 | ('a', (400, 4)), 58 | ('a', (400, 1)) 59 | ] 60 | >>> -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2022-04-12.txt: -------------------------------------------------------------------------------- 1 | ~ % cd spark-3.2.0 2 | spark-3.2.0 % ls -l 3 | total 192 4 | -rwxrwxrwx@ 1 mparsian staff 22878 Oct 6 2021 LICENSE 5 | -rwxrwxrwx@ 1 mparsian staff 57677 Oct 6 2021 NOTICE 6 | drwxrwxrwx@ 3 mparsian staff 96 Oct 6 2021 R 7 | -rwxrwxrwx@ 1 mparsian staff 4512 Oct 6 2021 README.md 8 | -rwxrwxrwx@ 1 mparsian staff 167 Oct 6 2021 RELEASE 9 | drwxrwxrwx@ 29 mparsian staff 928 Nov 17 18:15 bin 10 | drwxrwxrwx@ 9 mparsian staff 288 Nov 17 18:15 conf 11 | drwxrwxrwx@ 5 mparsian staff 160 Nov 17 18:15 data 12 | drwxrwxrwx@ 4 mparsian staff 128 Oct 6 2021 examples 13 | drwxrwxrwx@ 237 mparsian staff 7584 Nov 17 18:15 jars 14 | drwxrwxrwx@ 4 mparsian staff 128 Nov 17 18:15 kubernetes 15 | drwxrwxrwx@ 60 mparsian staff 1920 Nov 17 18:15 licenses 16 | drwxrwxrwx@ 20 mparsian staff 640 Nov 17 18:15 python 17 | drwxrwxrwx@ 29 mparsian staff 928 Nov 17 18:15 sbin 18 | drwxrwxrwx@ 3 mparsian staff 96 Oct 6 2021 yarn 19 | 20 | spark-3.2.0 % ./bin/pyspark 21 | Python 3.8.9 (default, Mar 30 2022, 13:51:17) 22 | [Clang 13.1.6 (clang-1316.0.21.2.3)] on darwin 23 | Type "help", "copyright", "credits" or "license" for more information. 24 | Welcome to 25 | ____ __ 26 | / __/__ ___ _____/ /__ 27 | _\ \/ _ \/ _ `/ __/ '_/ 28 | /__ / .__/\_,_/_/ /_/\_\ version 3.2.0 29 | /_/ 30 | 31 | Using Python version 3.8.9 (default, Mar 30 2022 13:51:17) 32 | Spark context Web UI available at http://10.0.0.234:4040 33 | Spark context available as 'sc' (master = local[*], app id = local-1649822374103). 34 | SparkSession available as 'spark'. 35 | >>> 36 | >>> 37 | >>> spark.version 38 | '3.2.0' 39 | >>> 40 | >>> 41 | >>> numbers = [1, 2, 5, 6, 7, 8, 9, 10, 30, 40, 50] 42 | >>> numbers 43 | [1, 2, 5, 6, 7, 8, 9, 10, 30, 40, 50] 44 | >>> # rdd = Resilient Dist. 
Dataset 45 | >>> rdd = spark.sparkContext.parallelize(numbers) 46 | >>> rdd.collect() 47 | [1, 2, 5, 6, 7, 8, 9, 10, 30, 40, 50] 48 | >>> # rdd is partitioned, read-only, operates in parallel 49 | >>> rdd.count() 50 | 11 51 | >>> total = rdd.reduce(lambda x, y: x+y) 52 | >>> total 53 | 168 54 | >>> rdd_greater_than_20 = rdd.filter(lambda x : x > 20) 55 | >>> rdd_greater_than_20.collect() 56 | [30, 40, 50] 57 | >>> 58 | >>> rdd_greater_than_20.count() 59 | 3 60 | >>> rdd.take(3) 61 | [1, 2, 5] 62 | >>> 63 | >>> ^D -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2022-04-14-mappers-and-filters-and-reduce.txt: -------------------------------------------------------------------------------- 1 | # spark : SparkSession 2 | 3 | # create a Python collection 4 | numbers = [1, 2, 3, 4, 5, -1, -2, -3, 10, 12, 30] 5 | 6 | # create an RDD[Integer] from a Python collection 7 | rdd = spark.sparkContext.parallelize(numbers) 8 | 9 | # get all elements (used for debugging -- do not use this for large RDDs) 10 | rdd.collect() 11 | [1, 2, 3, 4, 5, -1, -2, -3, 10, 12, 30] 12 | 13 | # count the number of elements 14 | rdd.count() 15 | 11 16 | 17 | # apply a map() transformation to rdd and create a new RDD as rdd2 18 | rdd2 = rdd.map(lambda x : 3 *x) 19 | rdd2.collect() 20 | [3, 6, 9, 12, 15, -3, -6, -9, 30, 36, 90] 21 | 22 | # create a new RDD (as rdd3) from rdd2 23 | rdd3 = rdd2.map(lambda x: (x, 2*x)) 24 | rdd3.collect() 25 | [ 26 | (3, 6), 27 | (6, 12), 28 | (9, 18), 29 | (12, 24), 30 | (15, 30), 31 | (-3, -6), 32 | (-6, -12), 33 | (-9, -18), 34 | (30, 60), 35 | (36, 72), 36 | (90, 180) 37 | ] 38 | 39 | # find all positive numbers from a given RDD (as rdd) 40 | # filter() is a transformation 41 | positives = rdd.filter(lambda x : x > 0) 42 | positives.collect() 43 | [1, 2, 3, 4, 5, 10, 12, 30] 44 | 45 | # find all negative numbers from a given RDD (as rdd) 46 | # filter() is a transformation 47 | negatives = rdd.filter(lambda x : x < 0) 48 | negatives.collect() 49 | [-1, -2, -3] 50 | 51 | # find the sum of all numbers for a given RDD[Integer] 52 | # reduce() is an action: it creates a NON-RDD 53 | # reduce() is NOT a Transformation): it does NOT create an RDD 54 | total = rdd.reduce(lambda x, y: x+y) 55 | 56 | 57 | -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session-2022-04-19-read-text-groupbykey-mapvalues-filter.txt: -------------------------------------------------------------------------------- 1 | % cat /tmp/movies.txt 2 | user9,m1,5 3 | user8,m2,4 4 | user1,m1,2 5 | user1,9 6 | user1,m1,2 7 | user2,m2,3 8 | user2,m3,5 9 | user3,m3,4 10 | user6,m3,4 11 | user7,m3,3 12 | user3,king 13 | user4,m1,3 14 | user5,m2,5 15 | user6,m4,5 16 | user7,m5,5 17 | user1 18 | user3,m3,5 19 | user4,m4,1 20 | 21 | % ./bin/pyspark 22 | Python 3.8.9 (default, Mar 30 2022, 13:51:17) 23 | [Clang 13.1.6 (clang-1316.0.21.2.3)] on darwin 24 | Type "help", "copyright", "credits" or "license" for more information. 25 | Welcome to 26 | ____ __ 27 | / __/__ ___ _____/ /__ 28 | _\ \/ _ \/ _ `/ __/ '_/ 29 | /__ / .__/\_,_/_/ /_/\_\ version 3.2.0 30 | /_/ 31 | 32 | Using Python version 3.8.9 (default, Mar 30 2022 13:51:17) 33 | Spark context Web UI available at http://10.0.0.234:4041 34 | Spark context available as 'sc' (master = local[*], app id = local-1650425312842). 35 | SparkSession available as 'spark'. 
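Note that /tmp/movies.txt shown above contains a few malformed rows ('user1,9', 'user3,king' and a bare 'user1'). The session below reads the file and then switches to an in-memory list of (key, value) pairs; as a separate sketch (not part of the recorded session), here is one way the raw records could be cleaned and averaged per movie, reusing the flatMap-as-filter and groupByKey()/mapValues() patterns shown in this tutorial (it assumes 'spark' as above; the parse() helper is only for illustration):

def parse(record):
    # keep only well-formed rows of the shape user,movie,numeric_rating
    tokens = record.split(",")
    if len(tokens) != 3 or not tokens[2].isdigit():
        return []                          # dropped by flatMap
    return [(tokens[1], int(tokens[2]))]   # (movie, rating)

records = spark.sparkContext.textFile("/tmp/movies.txt")
movie_ratings = records.flatMap(parse)     # RDD[(movie, rating)]
avg_per_movie = movie_ratings.groupByKey() \
                             .mapValues(lambda ratings: sum(ratings) / len(ratings))
# avg_per_movie.collect() returns the average rating per movie, malformed rows excluded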
36 | >>> 37 | >>> 38 | >>> 39 | >>> input_path = "/tmp/movies.txt" 40 | >>> input_path 41 | '/tmp/movies.txt' 42 | >>> # read input and create RDD[String] 43 | >>> records = spark.sparkContext.textFile(input_path) 44 | >>> records.collect() 45 | [ 46 | 'user9,m1,5', 47 | 'user8,m2,4', 48 | 'user1,m1,2', 49 | 'user1,9', 50 | 'user1,m1,2', 51 | 'user2,m2,3', 52 | 'user2,m3,5', 53 | 'user3,m3,4', 54 | 'user6,m3,4', 55 | 'user7,m3,3', 56 | 'user3,king', 57 | 'user4,m1,3', 58 | 'user5,m2,5', 59 | 'user6,m4,5', 60 | 'user7,m5,5', 61 | 'user1', 62 | 'user3,m3,5', 63 | 'user4,m4,1' 64 | ] 65 | >>> records.count() 66 | 18 67 | >>> 68 | >>> 69 | >>> records.getNumPartitions() 70 | 2 71 | >>> 72 | >>> 73 | >>> 74 | >>> pairs = [("A", 3), ("A", 4), ("A", 5), ("B", 30), ("B", 40), ("B", 50), ("B", 60), ("C", 100)] 75 | >>> pairs 76 | [('A', 3), ('A', 4), ('A', 5), ('B', 30), ('B', 40), ('B', 50), ('B', 60), ('C', 100)] 77 | >>> rdd = spark.sparkContext.parallelize(pairs) 78 | >>> rdd.collect() 79 | [('A', 3), ('A', 4), ('A', 5), ('B', 30), ('B', 40), ('B', 50), ('B', 60), ('C', 100)] 80 | >>> rdd.count() 81 | 8 82 | >>> rdd.getNumPartitions() 83 | 16 84 | >>> # NOTE: since the number of partitions is more than 85 | >>> # the number of elements: this implies that 86 | >>> # some of the partitions can be empty (partition 87 | >>> # is created, but has no elements at all). 88 | >>> 89 | >>> 90 | >>> # find average of values per key: A, B, C 91 | >>> # use groupByKey() transformation 92 | >>> grouped = rdd.groupByKey() 93 | >>> grouped.collect() 94 | [ 95 | ('B', ), 96 | ('C', ), 97 | ('A', ) 98 | ] 99 | 100 | >>> grouped.mapValues(lambda values: list(values)).collect() 101 | [ 102 | ('B', [30, 40, 50, 60]), 103 | ('C', [100]), 104 | ('A', [3, 4, 5]) 105 | ] 106 | >>> # similar to SQL's GROUP BY 107 | >>> # values : ResultIterable 108 | >>> avg_by_key = grouped.mapValues(lambda values: sum(values) / len(values)) 109 | >>> avg_by_key.collect() 110 | [('B', 45.0), ('C', 100.0), ('A', 4.0)] 111 | >>> 112 | >>> 113 | >>> rdd.collect() 114 | [('A', 3), ('A', 4), ('A', 5), ('B', 30), ('B', 40), ('B', 50), ('B', 60), ('C', 100)] 115 | >>> rdd_44 = rdd.mapValues(lambda v : v * 10) 116 | >>> rdd_44.collect() 117 | [('A', 30), ('A', 40), ('A', 50), ('B', 300), ('B', 400), ('B', 500), ('B', 600), ('C', 1000)] 118 | >>> # v : denotes the value component of (key, value) pair. 119 | >>> 120 | >>> 121 | >>> # apply a filter and keep (key, value) pairs 122 | >>> # if and only if value is greater than 100 123 | >>> 124 | >>> # understand tuple of 2 elements as (key, value) pair: 125 | >>> x = ("K", 2345) 126 | >>> x[0] 127 | 'K' 128 | >>> x[1] 129 | 2345 130 | >>> 131 | >>> 132 | >>> # apply a filter to rdd_44 and keep (key, value) 133 | >>> # pairs if and only if value is greater than 100 134 | >>> # x denotes a single element of source RDD (rdd_44) 135 | >>> rdd5 = rdd_44.filter(lambda x: x[1] > 100) 136 | >>> rdd5.collect() 137 | [('B', 300), ('B', 400), ('B', 500), ('B', 600), ('C', 1000)] 138 | >>> 139 | >>> 140 | >>> some_lists = [ [1, 2, 3], [7, 8, 9, 10], [], [] ] 141 | >>> len(some_lists) 142 | 4 143 | >>> some_lists[0] 144 | [1, 2, 3] 145 | >>> some_lists[1] 146 | [7, 8, 9, 10] 147 | >>> some_lists[2] 148 | [] 149 | >>> some_lists[3] 150 | [] 151 | >>> rdd = spark.sparkContext.parallelize(some_lists) 152 | >>> rdd.collect() 153 | [[1, 2, 3], [7, 8, 9, 10], [], []] 154 | >>> rdd.count() 155 | 4 156 | >>> # each rdd element is a list denoted by [...]
157 | >>> 158 | >>> rdd2 = rdd.flatMap(lambda x: x) 159 | >>> rdd2.collect() 160 | [1, 2, 3, 7, 8, 9, 10] 161 | >>> rdd2.count() 162 | 7 163 | >>> rdd3 = rdd.map(lambda x: x) 164 | >>> rdd3.collect() 165 | [[1, 2, 3], [7, 8, 9, 10], [], []] 166 | >>> rdd3.collect() 167 | [[1, 2, 3], [7, 8, 9, 10], [], []] 168 | >>> rdd3.count() 169 | 4 170 | >>> -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/pyspark-session_2020-07-01.txt: -------------------------------------------------------------------------------- 1 | cat /Users/mparsian/spark-3.0.0/zbin/foxdata.txt 2 | red fox jumped high 3 | fox jumped over high fence 4 | red fox jumped 5 | 6 | mparsian@Mahmouds-MacBook ~/spark-3.0.0 $ ./bin/pyspark 7 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 8 | [Clang 6.0 (clang-600.0.57)] on darwin 9 | Type "help", "copyright", "credits" or "license" for more information. 10 | 20/07/01 17:51:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable 11 | Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties 12 | Setting default log level to "WARN". 13 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 14 | Welcome to 15 | ____ __ 16 | / __/__ ___ _____/ /__ 17 | _\ \/ _ \/ _ `/ __/ '_/ 18 | /__ / .__/\_,_/_/ /_/\_\ version 3.0.0 19 | /_/ 20 | 21 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 22 | SparkSession available as 'spark'. 23 | >>> 24 | >>> 25 | >>> 26 | >>> 27 | >>> input_path = '/Users/mparsian/spark-3.0.0/zbin/foxdata.txt' 28 | >>> input_path 29 | '/Users/mparsian/spark-3.0.0/zbin/foxdata.txt' 30 | >>> recs = spark.sparkContext.textFile(input_path) 31 | >>> 32 | >>> 33 | >>> 34 | >>> recs 35 | /Users/mparsian/spark-3.0.0/zbin/foxdata.txt MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0 36 | >>> 37 | >>> 38 | >>> recs.collect() 39 | ['red fox jumped high', 'fox jumped over high fence', 'red fox jumped'] 40 | >>> recs.count() 41 | 3 42 | >>> rdd_with_len = recs.map(lambda x: (x, len(x))) 43 | >>> rdd_with_len.collect() 44 | [('red fox jumped high', 19), ('fox jumped over high fence', 26), ('red fox jumped', 14)] 45 | >>> 46 | >>> 47 | >>> 48 | >>> upper = recs.map(lambda x: x.upper()) 49 | >>> upper.collect() 50 | ['RED FOX JUMPED HIGH', 'FOX JUMPED OVER HIGH FENCE', 'RED FOX JUMPED'] 51 | >>> spark 52 | 53 | >>> lower = recs.map(lambda x: x.lower()) 54 | >>> lower.collect() 55 | ['red fox jumped high', 'fox jumped over high fence', 'red fox jumped'] 56 | >>> 57 | >>> 58 | >>> 59 | >>> lower_and_upper = lower.union(upper) 60 | >>> lower_and_upper.collect() 61 | ['red fox jumped high', 'fox jumped over high fence', 'red fox jumped', 'RED FOX JUMPED HIGH', 'FOX JUMPED OVER HIGH FENCE', 'RED FOX JUMPED'] 62 | >>> lower_and_upper.count() 63 | 6 64 | >>> 65 | >>> 66 | >>> 67 | >>> counts = recs.map(lambda x : (len(x), 3*len(x))) 68 | >>> counts.collect() 69 | [(19, 57), (26, 78), (14, 42)] 70 | >>> 71 | >>> 72 | >>> 73 | >>> numbers = [1, 2, 3, 5, 6, 7, 8, -1, -4, 77, 99, -87, -100, 100] 74 | >>> numbers 75 | [1, 2, 3, 5, 6, 7, 8, -1, -4, 77, 99, -87, -100, 100] 76 | 77 | >>> numbers = [1, 2, 3, 5, 6, 7, 8, -1, -4, 77, 99, -87, -100, 100] 78 | >>> 79 | >>> 80 | >>> 81 | >>> numbers 82 | [1, 2, 3, 5, 6, 7, 8, -1, -4, 77, 99, -87, -100, 100] 83 | >>> rdd = spark.sparkContext.parallelize(numbers) 84 | >>> rdd.collect() 85 | [1, 2, 3, 5, 6, 7, 8, -1, -4, 
77, 99, -87, -100, 100] 86 | >>> rdd.count() 87 | 14 88 | >>> pos = rdd.filter(lambda x: x > 0) 89 | >>> pos.collect() 90 | [1, 2, 3, 5, 6, 7, 8, 77, 99, 100] 91 | 92 | >>> 93 | >>> squared = rdd.map(lambda x : x*x) 94 | >>> squared.collect() 95 | [1, 4, 9, 25, 36, 49, 64, 1, 16, 5929, 9801, 7569, 10000, 10000] 96 | >>> tuples3 = rdd.map(lambda x : (x, x*x, x*100)) 97 | >>> tuples3.collect() 98 | [(1, 1, 100), (2, 4, 200), (3, 9, 300), (5, 25, 500), (6, 36, 600), (7, 49, 700), (8, 64, 800), (-1, 1, -100), (-4, 16, -400), (77, 5929, 7700), (99, 9801, 9900), (-87, 7569, -8700), (-100, 10000, -10000), (100, 10000, 10000)] 99 | >>> 100 | >>> 101 | >>> 102 | >>> rdd.collect() 103 | [1, 2, 3, 5, 6, 7, 8, -1, -4, 77, 99, -87, -100, 100] 104 | >>> gt4 = rdd.filter(lambda x: x > 4) 105 | >>> gt4.collect() 106 | [5, 6, 7, 8, 77, 99, 100] 107 | >>> 108 | >>> 109 | >>> 110 | >>> rdd.collect() 111 | [1, 2, 3, 5, 6, 7, 8, -1, -4, 77, 99, -87, -100, 100] 112 | >>> total = rdd.reduce(lambda x, y: x+y) 113 | >>> total 114 | 116 115 | 116 | Assume that rdd has 3 partitions: partition-1, partition-2, partition-3 117 | 118 | >>> partition-1: 1, 2, 3, 5, 6, 7, 8 119 | partition-1: will sum up to: 32 120 | 121 | >>> partition-2: -1, -4, 77, 99 122 | partition-2: will sum up to: 171 123 | 124 | >>> partition-3: -87, -100, 100 125 | partition-3: will sum up to: -87 126 | 127 | =============== 128 | partition-1 & partition-2 will result in: 203 129 | 203 & partition-3 will result in: 116 (Final result) 130 | 131 | -------------------------------------------------------------------------------- /tutorial/pyspark-examples/rdds/understanding_partitions.txt: -------------------------------------------------------------------------------- 1 | understanding_partitions.txt 2 | 3 | $ ./bin/pyspark 4 | Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43) 5 | Type "help", "copyright", "credits" or "license" for more information. 6 | Welcome to 7 | ____ __ 8 | / __/__ ___ _____/ /__ 9 | _\ \/ _ \/ _ `/ __/ '_/ 10 | /__ / .__/\_,_/_/ /_/\_\ version 2.4.0 11 | /_/ 12 | 13 | Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43) 14 | SparkSession available as 'spark'. 15 | >>> numbers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] 16 | >>> rdd = sc.parallelize(numbers, 3) 17 | >>> rdd.collect() 18 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] 19 | >>> rdd.count() 20 | 10 21 | >>> rdd.getNumPartitions() 22 | 3 23 | >>> def f(iterator): 24 | ... for x in iterator: 25 | ... print(x) 26 | ... print("=====") 27 | ... 
28 | >>> 29 | >>> rdd.foreachPartition(f) 30 | 4 31 | 5 32 | 6 33 | ===== 34 | 7 35 | 8 36 | 9 37 | 10 38 | ===== 39 | 1 40 | 2 41 | 3 42 | ===== 43 | >>> rdd_default = sc.parallelize(numbers) 44 | >>> rdd_default.collect() 45 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] 46 | >>> rdd.getNumPartitions() 47 | 3 48 | >>> rdd_default.getNumPartitions() 49 | 8 50 | >>> rdd.foreachPartition(f) 51 | 4 52 | 5 53 | 6 54 | ===== 55 | 1 56 | 2 57 | 3 58 | ===== 59 | 7 60 | 8 61 | 9 62 | 10 63 | ===== 64 | >>> rdd_default.foreachPartition(f) 65 | 6 66 | ===== 67 | 7 68 | ===== 69 | 3 70 | ===== 71 | 2 72 | ===== 73 | 8 74 | ===== 75 | 4 76 | 5 77 | ===== 78 | 9 79 | 10 80 | ===== 81 | 1 82 | ===== 83 | >>> numbers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] 84 | >>> rdd_by_4 = sc.parallelize(numbers, 4) 85 | >>> rdd_by_4.collect() 86 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] 87 | >>> rdd_by_4.foreachPartition(f) 88 | 1 89 | 2 90 | 3 91 | ===== 92 | 10 93 | 11 94 | 12 95 | ===== 96 | 4 97 | 5 98 | 6 99 | ===== 100 | 7 101 | 8 102 | 9 103 | ===== 104 | >>> numbers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15] 105 | >>> rdd_by_6 = sc.parallelize(numbers, 6) 106 | >>> rdd_by_6.foreachPartition(f) 107 | 7 108 | 8 109 | ===== 110 | 1 111 | 2 112 | ===== 113 | 11 114 | 12 115 | 13 116 | 15 117 | ===== 118 | 3 119 | 4 120 | ===== 121 | 9 122 | 10 123 | ===== 124 | 5 125 | 6 126 | ===== 127 | >>> numbers = [1, 2, 3, 4, 5, 6] 128 | >>> rdd_empty = sc.parallelize(numbers, 10) 129 | >>> rdd_empty.foreachPartition(f) 130 | 2 131 | ===== 132 | 3 133 | ===== 134 | ===== 135 | ===== 136 | 4 137 | ===== 138 | ===== 139 | 6 140 | ===== 141 | 1 142 | ===== 143 | 5 144 | ===== 145 | ===== 146 | >>> -------------------------------------------------------------------------------- /tutorial/pyspark-udf/pyspark_udf_maptype.txt: -------------------------------------------------------------------------------- 1 | $SPARK_HOME/bin/pyspark 2 | Python 3.8.9 (default, Nov 9 2021, 04:26:29) 3 | Welcome to 4 | ____ __ 5 | / __/__ ___ _____/ /__ 6 | _\ \/ _ \/ _ `/ __/ '_/ 7 | /__ / .__/\_,_/_/ /_/\_\ version 3.2.0 8 | /_/ 9 | 10 | Using Python version 3.8.9 (default, Nov 9 2021 04:26:29) 11 | Spark context Web UI available at http://10.0.0.232:4040 12 | Spark context available as 'sc' (master = local[*], app id = local-1641011178190). 13 | SparkSession available as 'spark'. 14 | 15 | >>> from pyspark.sql import Row 16 | 17 | >>> data = spark.createDataFrame( 18 | ... [Row(zip_code='94087', city='Sunnyvale'), 19 | ... Row(zip_code='94088', city='Cupertino'), 20 | ... Row(zip_code='95055', city='Santa Clara'), 21 | ... Row(zip_code='95054', city='Palo Alto')]) 22 | 23 | >>> 24 | >>> data.show() 25 | +--------+-----------+ 26 | |zip_code| city| 27 | +--------+-----------+ 28 | | 94087| Sunnyvale| 29 | | 94088| Cupertino| 30 | | 95055|Santa Clara| 31 | | 95054| Palo Alto| 32 | +--------+-----------+ 33 | 34 | >>> from pyspark.sql.functions import udf 35 | >>> from pyspark.sql import types as T 36 | >>> 37 | >>> @udf(T.MapType(T.StringType(), T.StringType())) 38 | ... def create_structure(zip_code, city): 39 | ... return {zip_code: city} 40 | ... 
41 | >>> data.withColumn('structure', create_structure(data.zip_code, data.city)).toJSON().collect() 42 | [ 43 | '{"zip_code":"94087","city":"Sunnyvale","structure":{"94087":"Sunnyvale"}}', 44 | '{"zip_code":"94088","city":"Cupertino","structure":{"94088":"Cupertino"}}', 45 | '{"zip_code":"95055","city":"Santa Clara","structure":{"95055":"Santa Clara"}}', 46 | '{"zip_code":"95054","city":"Palo Alto","structure":{"95054":"Palo Alto"}}' 47 | ] 48 | 49 | >>> data.withColumn('structure', create_structure(data.zip_code, data.city)).show(truncate=False) 50 | +--------+-----------+----------------------+ 51 | |zip_code|city |structure | 52 | +--------+-----------+----------------------+ 53 | |94087 |Sunnyvale |{94087 -> Sunnyvale} | 54 | |94088 |Cupertino |{94088 -> Cupertino} | 55 | |95055 |Santa Clara|{95055 -> Santa Clara}| 56 | |95054 |Palo Alto |{95054 -> Palo Alto} | 57 | +--------+-----------+----------------------+ 58 | -------------------------------------------------------------------------------- /tutorial/split-function/README.md: -------------------------------------------------------------------------------- 1 | How To Use Split Function 2 | ========================= 3 | 4 | * Example-1: Split ````RDD```` into Tokens 5 | 6 | ```` 7 | # ./bin/pyspark 8 | Python 2.7.10 (default, Oct 23 2015, 19:19:21) 9 | 10 | Welcome to 11 | ____ __ 12 | / __/__ ___ _____/ /__ 13 | _\ \/ _ \/ _ `/ __/ '_/ 14 | /__ / .__/\_,_/_/ /_/\_\ version 1.6.1 15 | /_/ 16 | 17 | Using Python version 2.7.10 (default, Oct 23 2015 19:19:21) 18 | SparkContext available as sc, HiveContext available as sqlContext. 19 | 20 | >>> data = ["abc,de", "abc,de,ze", "abc,de,ze,pe"] 21 | >>> data 22 | ['abc,de', 'abc,de,ze', 'abc,de,ze,pe'] 23 | 24 | >>> rdd = sc.parallelize(data) 25 | >>> rdd.collect() 26 | ['abc,de', 'abc,de,ze', 'abc,de,ze,pe'] 27 | >>> rdd.count() 28 | 3 29 | 30 | >>> rdd2 = rdd.flatMap(lambda x : x.split(",")) 31 | >>> rdd2.collect() 32 | ['abc', 'de', 'abc', 'de', 'ze', 'abc', 'de', 'ze', 'pe'] 33 | >>> rdd2.count() 34 | 9 35 | ```` 36 | 37 | * Example-2: Create Key-Value Pairs 38 | 39 | ```` 40 | >>> data2 = ["abc,de", "xyz,deeee,ze", "abc,de,ze,pe", "xyz,bababa"] 41 | >>> data2 42 | ['abc,de', 'xyz,deeee,ze', 'abc,de,ze,pe', 'xyz,bababa'] 43 | 44 | >>> rdd4 = sc.parallelize(data2) 45 | >>> rdd4.collect() 46 | ['abc,de', 'xyz,deeee,ze', 'abc,de,ze,pe', 'xyz,bababa'] 47 | 48 | >>> rdd5 = rdd4.map(lambda x : (x.split(",")[0], x.split(",")[1])) 49 | >>> rdd5.collect() 50 | [('abc', 'de'), ('xyz', 'deeee'), ('abc', 'de'), ('xyz', 'bababa')] 51 | ```` 52 | -------------------------------------------------------------------------------- /tutorial/top-N/top-N.txt: -------------------------------------------------------------------------------- 1 | # ./pyspark 2 | Python 2.6.9 (unknown, Sep 9 2014, 15:05:12) 3 | Welcome to 4 | ____ __ 5 | / __/__ ___ _____/ /__ 6 | _\ \/ _ \/ _ `/ __/ '_/ 7 | /__ / .__/\_,_/_/ /_/\_\ version 1.2.0 8 | /_/ 9 | 10 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12) 11 | SparkContext available as sc. 
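The session below was captured on an old Spark 1.2 shell, but takeOrdered() behaves the same way in current PySpark: takeOrdered(N) returns the N smallest elements, and a key function such as key=lambda x: -x reverses the ordering so that the N largest come back instead. A minimal sketch (assuming an existing SparkContext 'sc'):

nums = [10, 1, 2, 9, 3, 4, 5, 6, 7]
rdd = sc.parallelize(nums)
rdd.takeOrdered(3)                     # bottom-3: [1, 2, 3]
rdd.takeOrdered(3, key=lambda x: -x)   # top-3:    [10, 9, 7]
rdd.top(3)                             # also [10, 9, 7]; top() returns the largest elements in descending order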
12 | >>> 13 | >>> nums = [10, 1, 2, 9, 3, 4, 5, 6, 7] 14 | >>> sc.parallelize(nums).takeOrdered(3) 15 | [1, 2, 3] 16 | >>> sc.parallelize(nums).takeOrdered(3, key=lambda x: -x) 17 | [10, 9, 7] 18 | >>> 19 | >>> kv = [(10,"z1"), (1,"z2"), (2,"z3"), (9,"z4"), (3,"z5"), (4,"z6"), (5,"z7"), (6,"z8"), (7,"z9")] 20 | >>> sc.parallelize(kv).takeOrdered(3) 21 | [(1, 'z2'), (2, 'z3'), (3, 'z5')] 22 | >>> 23 | >>> sc.parallelize(kv).takeOrdered(3, key=lambda x: -x[0]) 24 | [(10, 'z1'), (9, 'z4'), (7, 'z9')] 25 | -------------------------------------------------------------------------------- /tutorial/wordcount/README.md: -------------------------------------------------------------------------------- 1 | * word_count.py 2 | 3 | Word Count solution in PySpark. Note that the input file is 4 | hard-coded, which is not a good practice; the purpose is to 5 | show how to read files in Spark. 6 | 7 | * word_count_ver2.py 8 | 9 | I pass the input file as a command-line parameter. 10 | 11 | 12 | ```` 13 | best regards, 14 | Mahmoud Parsian 15 | ```` 16 | -------------------------------------------------------------------------------- /tutorial/wordcount/run_word_count.sh: -------------------------------------------------------------------------------- 1 | # define Spark's installed directory 2 | export SPARK_HOME="/Users/mparsian/spark-2.2.1" 3 | # 4 | # define your input path 5 | #INPUT_PATH="$SPARK_HOME/licenses/LICENSE-heapq.txt" 6 | # 7 | # define your PySpark program 8 | PROG="/Users/mparsian/zmp/pyspark_book_project/programs/word_count.py" 9 | # 10 | # submit your spark application 11 | $SPARK_HOME/bin/spark-submit $PROG 12 | -------------------------------------------------------------------------------- /tutorial/wordcount/run_word_count_ver2.sh: -------------------------------------------------------------------------------- 1 | # define Spark's installed directory 2 | export SPARK_HOME="/Users/mparsian/spark-2.2.1" 3 | # 4 | # define your input path 5 | INPUT_PATH="file:///Users/mparsian/spark-2.2.1/zbin/sample.txt" 6 | # 7 | # define your PySpark program 8 | PROG="/Users/mparsian/zmp/github/pyspark-tutorial/tutorial/wordcount/word_count_ver2.py" 9 | # 10 | # submit your spark application 11 | $SPARK_HOME/bin/spark-submit $PROG $INPUT_PATH 12 | -------------------------------------------------------------------------------- /tutorial/wordcount/word_count.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import sys 4 | 5 | from pyspark.sql import SparkSession 6 | #----------------------------------- 7 | 8 | 9 | if __name__ == "__main__": 10 | 11 | # create an instance of a SparkSession as spark 12 | spark = SparkSession\ 13 | .builder\ 14 | .appName("wordcount")\ 15 | .getOrCreate() 16 | 17 | inputPath = "file:///Users/mparsian/spark-2.2.1/zbin/sample.txt" 18 | 19 | # create SparkContext as sc 20 | sc = spark.sparkContext 21 | 22 | # create RDD from a text file 23 | textfileRDD = sc.textFile(inputPath) 24 | print(textfileRDD.collect()) 25 | 26 | wordsRDD = textfileRDD.flatMap(lambda line: line.split(" ")) 27 | print(wordsRDD.collect()) 28 | 29 | pairsRDD = wordsRDD.map(lambda word: (word, 1)) 30 | print(pairsRDD.collect()) 31 | 32 | frequenciesRDD = pairsRDD.reduceByKey(lambda a, b: a + b) 33 | print(frequenciesRDD.collect()) 34 | 35 | # done!
36 | spark.stop() 37 | -------------------------------------------------------------------------------- /tutorial/wordcount/word_count_ver2.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import sys 4 | 5 | from pyspark.sql import SparkSession 6 | #----------------------------------- 7 | 8 | 9 | if __name__ == "__main__": 10 | 11 | # create an instance of a SparkSession as spark 12 | spark = SparkSession\ 13 | .builder\ 14 | .appName("wordcount")\ 15 | .getOrCreate() 16 | 17 | # inputPath = "file:///Users/mparsian/spark-2.2.1/zbin/sample.txt" 18 | # 19 | # sys.argv[0] is the name of the script. 20 | # sys.argv[1] is the first parameter 21 | inputPath = sys.argv[1] # input file 22 | print("inputPath: {}".format(inputPath)) 23 | 24 | 25 | # create SparkContext as sc 26 | sc = spark.sparkContext 27 | 28 | # create RDD from a text file 29 | textfileRDD = sc.textFile(inputPath) 30 | print(textfileRDD.collect()) 31 | 32 | wordsRDD = textfileRDD.flatMap(lambda line: line.split(" ")) 33 | print(wordsRDD.collect()) 34 | 35 | pairsRDD = wordsRDD.map(lambda word: (word, 1)) 36 | print(pairsRDD.collect()) 37 | 38 | frequenciesRDD = pairsRDD.reduceByKey(lambda a, b: a + b) 39 | print(frequenciesRDD.collect()) 40 | 41 | # done! 42 | spark.stop() 43 | -------------------------------------------------------------------------------- /tutorial/wordcount/wordcount-shorthand.txt: -------------------------------------------------------------------------------- 1 | # cat data.txt 2 | crazy crazy fox jumped 3 | crazy fox jumped 4 | fox is fast 5 | fox is smart 6 | dog is smart 7 | 8 | # ./bin/pyspark 9 | Welcome to 10 | ____ __ 11 | / __/__ ___ _____/ /__ 12 | _\ \/ _ \/ _ `/ __/ '_/ 13 | /__ / .__/\_,_/_/ /_/\_\ version 1.4.0 14 | /_/ 15 | 16 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12) 17 | SparkContext available as sc, SQLContext available as sqlContext. 18 | >>> 19 | >>> lines = sc.textFile('data.txt', 1); 20 | >>> lines.collect() 21 | [ 22 | u'crazy crazy fox jumped', 23 | u'crazy fox jumped', 24 | u'fox is fast', 25 | u'fox is smart', 26 | u'dog is smart' 27 | ] 28 | 29 | >>> frequencies = lines.flatMap(lambda x: x.split(' ')).map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y) 30 | >>> frequencies.collect() 31 | [ 32 | (u'crazy', 3), 33 | (u'jumped', 2), 34 | (u'is', 3), 35 | (u'fox', 4), 36 | (u'dog', 1), 37 | (u'fast', 1), 38 | (u'smart', 2) 39 | ] 40 | 41 | >>> frequencies.count() 42 | 7 -------------------------------------------------------------------------------- /tutorial/wordcount/wordcount.txt: -------------------------------------------------------------------------------- 1 | 1. Prepare Input 2 | 3 | # cat data.txt 4 | crazy crazy fox jumped 5 | crazy fox jumped 6 | fox is fast 7 | fox is smart 8 | dog is smart 9 | 10 | 2. Invoke pyspark 11 | 12 | # export SPARK_HOME=... 13 | # SPARK_HOME/bin/pyspark 14 | Python 2.6.9 (unknown, Sep 9 2014, 15:05:12) 15 | [GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.39)] on darwin 16 | Type "help", "copyright", "credits" or "license" for more information. 17 | 18 | Welcome to 19 | ____ __ 20 | / __/__ ___ _____/ /__ 21 | _\ \/ _ \/ _ `/ __/ '_/ 22 | /__ / .__/\_,_/_/ /_/\_\ version 1.2.0 23 | /_/ 24 | 25 | Using Python version 2.6.9 (unknown, Sep 9 2014 15:05:12) 26 | SparkContext available as sc. 
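The transcript below comes from a Spark 1.2 / Python 2 shell, which is why the collected strings are shown with the u'...' prefix; on Python 3 the same values print without it. A compact version of the same pipeline appears in wordcount-shorthand.txt above; written out for a current PySpark shell it would look roughly like this (a sketch, assuming 'sc' and the same data.txt):

lines = sc.textFile("data.txt", 1)
counts = (lines.flatMap(lambda x: x.split(" "))
               .map(lambda x: (x, 1))
               .reduceByKey(lambda x, y: x + y))
counts.collect()                  # e.g. [('crazy', 3), ('jumped', 2), ('is', 3), ('fox', 4), ...]
counts.saveAsTextFile("output")   # writes one part-* file per partition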
27 | >>> sc 28 | 29 | >>> lines = sc.textFile("data.txt", 1) 30 | >>> debuglines = lines.collect(); 31 | >>> debuglines 32 | [u'crazy crazy fox jumped', 33 | u'crazy fox jumped', 34 | u'fox is fast', 35 | u'fox is smart', 36 | u'dog is smart' 37 | ] 38 | >>> words = lines.flatMap(lambda x: x.split(' ')) 39 | >>> debugwords = words.collect(); 40 | >>> debugwords 41 | [ 42 | u'crazy', 43 | u'crazy', 44 | u'fox', 45 | u'jumped', 46 | u'crazy', 47 | u'fox', 48 | u'jumped', 49 | u'fox', 50 | u'is', 51 | u'fast', 52 | u'fox', 53 | u'is', 54 | u'smart', 55 | u'dog', 56 | u'is', 57 | u'smart' 58 | ] 59 | >>> ones = words.map(lambda x: (x, 1)) 60 | >>> debugones = ones.collect() 61 | >>> debugones 62 | [ 63 | (u'crazy', 1), 64 | (u'crazy', 1), 65 | (u'fox', 1), 66 | (u'jumped', 1), 67 | (u'crazy', 1), 68 | (u'fox', 1), 69 | (u'jumped', 1), 70 | (u'fox', 1), 71 | (u'is', 1), 72 | (u'fast', 1), 73 | (u'fox', 1), 74 | (u'is', 1), 75 | (u'smart', 1), 76 | (u'dog', 1), 77 | (u'is', 1), 78 | (u'smart', 1) 79 | ] 80 | >>> counts = ones.reduceByKey(lambda x, y: x + y) 81 | >>> debugcounts = counts.collect() 82 | >>> debugcounts 83 | [ 84 | (u'crazy', 3), 85 | (u'jumped', 2), 86 | (u'is', 3), 87 | (u'fox', 4), 88 | (u'dog', 1), 89 | (u'fast', 1), 90 | (u'smart', 2) 91 | ] 92 | >>> 93 | >>> counts.saveAsTextFile("output") 94 | 95 | 3. Examine Output 96 | 97 | # cat output/part* 98 | (u'crazy', 3) 99 | (u'jumped', 2) 100 | (u'is', 3) 101 | (u'fox', 4) 102 | (u'dog', 1) 103 | (u'fast', 1) 104 | (u'smart', 2) 105 | --------------------------------------------------------------------------------