├── .gitignore ├── LICENSE ├── README.md ├── average-word-length └── average_word_length.py ├── checkpointing └── looping_test.py ├── joining-datasets └── joins.py ├── kafka-streaming-wordcount ├── kafka_streaming_wordcount.py └── stream_words.sh ├── recommendation-engine └── movielens_recommender.py ├── streaming-wordcount ├── stream_words.sh └── streaming_wordcount.py ├── using-external-programs ├── parsefixedwidth.pl └── pipe_example.py └── wordcount ├── minimize_logging.sh └── wordcounts.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data Analytics with Spark Using Python 2 | ### by Jeffrey Aven 3 | 4 | This repository contains the source code for exercises from the book. 5 | 6 | ![data_analytics_with_spark](https://sparkusingpython.s3.amazonaws.com/images/data-analytics-with-spark-using-python.jpg) -------------------------------------------------------------------------------- /average-word-length/average_word_length.py: -------------------------------------------------------------------------------- 1 | # 2 | # Source code for the 'Using Broadcast Variables and Accumulators' Exercise in 3 | # Data Analytics with Spark Using Python 4 | # by Jeffrey Aven 5 | # 6 | # $ spark-submit --master local average_word_length.py 7 | # 8 | 9 | from pyspark import SparkConf, SparkContext 10 | conf = SparkConf().setAppName('Using Broadcast Variables and Accumulators') 11 | sc = SparkContext(conf=conf) 12 | 13 | # step 2 14 | import urllib.request 15 | stopwordsurl = "https://s3.amazonaws.com/sparkusingpython/stopwords/stop-word-list.csv" 16 | req = urllib.request.Request(stopwordsurl) 17 | with urllib.request.urlopen(req) as response: 18 | stopwordsdata = response.read().decode("utf-8") 19 | stopwordslist = stopwordsdata.split(",") 20 | # step 3 21 | stopwords = sc.broadcast(stopwordslist) 22 | # step 4 23 | word_count = sc.accumulator(0) 24 | total_len = sc.accumulator(0.0) 25 | # step 5 26 | def add_values(word,word_count,total_len): 27 | word_count += 1 28 | total_len += len(word) 29 | # step 6 30 | words = sc.textFile('file:///opt/spark/data/shakespeare.txt') \ 31 | .flatMap(lambda line: line.split()) \ 32 | .map(lambda x: x.lower()) \ 33 | .filter(lambda x: x not in stopwords.value) 34 | # step 7 35 | words.foreach(lambda x: add_values(x, word_count, total_len)) 36 | # step 8 37 | avgwordlen = total_len.value/word_count.value 38 | print("Total Number of Words: " + str(word_count.value)) 39 | print("Average Word Length: " + str(avgwordlen)) -------------------------------------------------------------------------------- /checkpointing/looping_test.py: -------------------------------------------------------------------------------- 1 | # 2 | # Source code for the 'Checkpointing RDDs' Exercise in 3 | # Data Analytics with Spark Using Python 4 | # by Jeffrey Aven 5 | # 6 | # $ spark-submit --master local looping_test.py 7 | # 8 | 9 | import sys 10 | from pyspark import SparkConf, SparkContext 11 | sc = SparkContext() 12 | sc.setCheckpointDir("file:///tmp/checkpointdir") 13 | rddofints = sc.parallelize([1,2,3,4,5,6,7,8,9,10]) 14 | try: 15 | # this will create a very long lineage for rddofints 16 | for i in range(1000): 17 | rddofints = rddofints.map(lambda x: x+1) 18 | if i % 10 == 0: 19 | print("Looped " + str(i) + " times") 20 | #rddofints.checkpoint() 21 | rddofints.count() 22 | except Exception as e: 23 | print("Exception : " + str(e)) 24 | print("RDD Debug String : ") 25 | print(rddofints.toDebugString()) 26 | sys.exit() 27 | print("RDD Debug String : ") 28 | print(rddofints.toDebugString()) 29 | 30 |
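The checkpointing exercise above deliberately leaves rddofints.checkpoint() commented out so the lineage grows until the job fails. As an illustrative sketch only (not part of the book's source), the same loop with the checkpoint enabled every tenth iteration would look like the following, reusing the same /tmp/checkpointdir; the count() action after each checkpoint() call is what actually materializes the RDD and truncates its lineage.

from pyspark import SparkContext

sc = SparkContext()
sc.setCheckpointDir("file:///tmp/checkpointdir")   # same directory as the exercise
rddofints = sc.parallelize([1,2,3,4,5,6,7,8,9,10])
for i in range(1000):
    rddofints = rddofints.map(lambda x: x + 1)
    if i % 10 == 0:
        print("Looped " + str(i) + " times")
        rddofints.checkpoint()   # mark the RDD for checkpointing
        rddofints.count()        # an action triggers the checkpoint and truncates lineage
print(rddofints.toDebugString())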
-------------------------------------------------------------------------------- /joining-datasets/joins.py: -------------------------------------------------------------------------------- 1 | # 2 | # Source code for the 'Joining Datasets in Spark' Exercise in 3 | # Data Analytics with Spark Using Python 4 | # by Jeffrey Aven 5 | # 6 | # Execute this program using spark-submit as follows: 7 | # 8 | # $ spark-submit --master local joins.py \ 9 | # $SPARK_HOME/data/bike-share \ 10 | # $SPARK_HOME/data/avgsbystation 11 | # 12 | 13 | import sys, re 14 | from pyspark import SparkConf, SparkContext 15 | conf = SparkConf().setAppName('Joining Datasets in Spark') 16 | sc = SparkContext(conf=conf) 17 | 18 | # check command line arguments 19 | if (len(sys.argv) != 3): 20 | print("""\ 21 | This program will find the top stations for average bikes available 22 | by station and hour from the Bay Area Bike Share dataset 23 | 24 | Usage: joins.py <input_dir> <output_dir> 25 | """) 26 | sys.exit(0) 27 | else: 28 | inputpath = sys.argv[1] 29 | outputdir = sys.argv[2] 30 | 31 | stations = sc.textFile(inputpath + "/stations") \ 32 | .map(lambda x: x.split(',')) \ 33 | .filter(lambda x: x[5] == 'San Jose') \ 34 | .map(lambda x: (int(x[0]), x[1])) \ 35 | .keyBy(lambda x: x[0]) 36 | 37 | status = sc.textFile(inputpath + "/status") \ 38 | .map(lambda x: x.split(',')) \ 39 | .map(lambda x: (x[0], x[1], x[2], x[3].replace('"',''))) \ 40 | .map(lambda x: (x[0], x[1], x[2], x[3].split(' '))) \ 41 | .map(lambda x: (x[0], x[1], x[2], x[3][0].split('-'), x[3][1].split(':'))) \ 42 | .map(lambda x: (int(x[0]), int(x[1]), int(x[3][0]), int(x[3][1]), int(x[3][2]), int(x[4][0]))) \ 43 | .filter(lambda x: x[2]==2015 and x[3]==2 and x[4]>=22) \ 44 | .map(lambda x: (x[0], x[1], x[5])) \ 45 | .keyBy(lambda x: x[0]) 46 | 47 | joined = status.join(stations) 48 | 49 | cleaned = joined.map(lambda x: (x[0], x[1][0][1], x[1][0][2], x[1][1][1])) 50 | 51 | topavail = cleaned.keyBy(lambda x: (x[3],x[2])) \ 52 | .mapValues(lambda x: (x[1], 1)) \ 53 | .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1])) \ 54 | .mapValues(lambda x: (x[0]/x[1])) \ 55 | .keyBy(lambda x: x[1]) \ 56 | .sortByKey(ascending=False) \ 57 | .map(lambda x: (x[1][0][0], x[1][0][1], x[0])) \ 58 | .persist() 59 | 60 | topavail.saveAsTextFile("file://" + outputdir) 61 | top10stations = topavail.take(10) 62 | print("The top ten stations by hour are : ") 63 | for stationinfo in top10stations: 64 | print(str(stationinfo)) 65 | print("Check the complete output in " + outputdir)
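For reference, the join above follows standard pair-RDD semantics: keyBy wraps each record as (key, record), and join returns (key, (left_record, right_record)), which is why the cleaned step indexes into x[1][0] and x[1][1]. A minimal, self-contained sketch with made-up values (hypothetical toy data, not the bike-share schema):

from pyspark import SparkContext

sc = SparkContext()
# toy records: (station_id, name) and (station_id, bikes_available, hour)
stations = sc.parallelize([(2, 'Station A'), (3, 'Station B')]).keyBy(lambda x: x[0])
status = sc.parallelize([(2, 11, 9), (3, 4, 9), (2, 9, 10)]).keyBy(lambda x: x[0])
joined = status.join(stations)
# each element looks like (2, ((2, 11, 9), (2, 'Station A')))
print(joined.collect())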
-------------------------------------------------------------------------------- /kafka-streaming-wordcount/kafka_streaming_wordcount.py: -------------------------------------------------------------------------------- 1 | # 2 | # Source code for the 'Using Spark with Kafka' Exercise in 3 | # Data Analytics with Spark Using Python 4 | # by Jeffrey Aven 5 | # 6 | # in one terminal, start Kafka 7 | # 8 | # in a second terminal, create a topic named shakespeare 9 | # $ $KAFKA_HOME/bin/kafka-topics.sh \ 10 | # --create \ 11 | # --zookeeper localhost:2181 \ 12 | # --replication-factor 1 \ 13 | # --partitions 1 \ 14 | # --topic shakespeare 15 | # 16 | # in the same terminal execute: 17 | # $ spark-submit --master local[2] \ 18 | # --packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.2.0 \ 19 | # kafka_streaming_wordcount.py 20 | # 21 | # in a third terminal execute: 22 | # $ sh stream_words.sh 23 | # 24 | 25 | from pyspark import SparkConf, SparkContext 26 | conf = SparkConf().setAppName('Using Spark with Kafka') 27 | sc = SparkContext(conf=conf) 28 | from pyspark.streaming import StreamingContext 29 | from pyspark.streaming.kafka import KafkaUtils 30 | ssc = StreamingContext(sc, 30) 31 | brokers = "localhost:9092" 32 | topic = "shakespeare" 33 | stream = KafkaUtils.createDirectStream \ 34 | (ssc, [topic], {"metadata.broker.list": brokers}) 35 | lines = stream.map(lambda x: x[1]) 36 | counts = lines.flatMap(lambda line: line.split(" ")) \ 37 | .map(lambda word: (word, 1)) \ 38 | .reduceByKey(lambda a, b: a+b) 39 | counts.pprint() 40 | ssc.start() 41 | ssc.awaitTermination() -------------------------------------------------------------------------------- /kafka-streaming-wordcount/stream_words.sh: -------------------------------------------------------------------------------- 1 | while read line; do echo -e "$line\n"; sleep 1; done < /opt/spark/data/shakespeare.txt | $KAFKA_HOME/bin/kafka-console-producer.sh --broker-list localhost:9092 --topic shakespeare -------------------------------------------------------------------------------- /recommendation-engine/movielens_recommender.py: -------------------------------------------------------------------------------- 1 | # 2 | # Source code for the 'Implementing a Recommender Using Spark MLlib' Exercise in 3 | # Data Analytics with Spark Using Python 4 | # by Jeffrey Aven 5 | # 6 | # Execute this program using spark-submit as follows: 7 | # 8 | # $ spark-submit movielens_recommender.py \ 9 | # hdfs:///path/to/movielens.dat 10 | # 11 | 12 | import sys 13 | from pyspark import SparkConf, SparkContext 14 | from pyspark.mllib.recommendation \ 15 | import ALS, MatrixFactorizationModel, Rating 16 | conf = SparkConf().setAppName('Movielens Recommender') 17 | sc = SparkContext(conf=conf) 18 | 19 | # check command line arguments 20 | if (len(sys.argv) != 2): 21 | print("""\ 22 | This program will train and test a recommendation engine 23 | using the movielens dataset 24 | 25 | Usage: movielens_recommender.py <movielens_data_file> 26 | """) 27 | sys.exit(0) 28 | else: 29 | inputpath = sys.argv[1] 30 | 31 | data = sc.textFile(inputpath) 32 | ratings = data.map(lambda x: x.split('\t')) \ 33 | .map(lambda x: Rating(int(x[0]), int(x[1]), float(x[2]))) 34 | 35 | rank = 10 36 | numIterations = 10 37 | model = ALS.train(ratings, rank, numIterations) 38 | 39 | testdata = ratings.map(lambda p: (p[0], p[1])) 40 | predictions = model.predictAll(testdata) \ 41 | .map(lambda r: ((r[0], r[1]), r[2])) 42 | ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])) \ 43 | .join(predictions) 44 | MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2) \ 45 | .mean() 46 | model.save(sc, "ratings_model") 47 | print("Mean Squared Error = " + str(MSE))
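The recommender above persists its factorization with model.save(sc, "ratings_model"). As an illustrative sketch only (the user and product ids below are assumptions; use ids that actually appear in your MovieLens data), the saved model can be reloaded in a later session and queried directly:

from pyspark import SparkContext
from pyspark.mllib.recommendation import MatrixFactorizationModel

sc = SparkContext()
model = MatrixFactorizationModel.load(sc, "ratings_model")   # same path used by model.save above
print(model.predict(1, 2))             # predicted rating for user 1, product 2
print(model.recommendProducts(1, 5))   # top five recommended products for user 1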
-------------------------------------------------------------------------------- /streaming-wordcount/stream_words.sh: -------------------------------------------------------------------------------- 1 | while read line; do echo -e "$line\n"; sleep 1; done < /opt/spark/data/shakespeare.txt | nc -lk 9999 -------------------------------------------------------------------------------- /streaming-wordcount/streaming_wordcount.py: -------------------------------------------------------------------------------- 1 | # 2 | # Source code for the 'Getting Started with Spark Streaming' Exercise in 3 | # Data Analytics with Spark Using Python 4 | # by Jeffrey Aven 5 | # 6 | # in one terminal execute: 7 | # $ spark-submit --master local[2] streaming_wordcount.py 8 | # 9 | # in a second terminal execute: 10 | # $ sh stream_words.sh 11 | # 12 | 13 | from pyspark import SparkConf, SparkContext 14 | conf = SparkConf().setAppName('Getting Started with Spark Streaming') 15 | sc = SparkContext(conf=conf) 16 | 17 | import re 18 | from pyspark.streaming import StreamingContext 19 | ssc = StreamingContext(sc, 30) 20 | lines = ssc.socketTextStream('localhost', 9999) 21 | wordcounts = lines.filter(lambda line: len(line) > 0) \ 22 | .flatMap(lambda line: re.split(r'\W+', line)) \ 23 | .filter(lambda word: len(word) > 0) \ 24 | .map(lambda word: (word.lower(), 1)) \ 25 | .reduceByKey(lambda x, y: x + y) 26 | wordcounts.pprint() 27 | ssc.start() 28 | ssc.awaitTermination() -------------------------------------------------------------------------------- /using-external-programs/parsefixedwidth.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | my $format = 'A6 A8 A8 A12 A2 A5'; 3 | while (<>) { 4 | chomp; 5 | my( $custid, $orderid, $date, 6 | $city, $state, $zip) = 7 | unpack( $format, $_ ); 8 | print "$custid\t$orderid\t$date\t$city\t$state\t$zip\n"; 9 | } 10 | -------------------------------------------------------------------------------- /using-external-programs/pipe_example.py: -------------------------------------------------------------------------------- 1 | # 2 | # Source code for the 'Processing RDDs with External Programs' Example in 3 | # Data Analytics with Spark Using Python 4 | # by Jeffrey Aven 5 | # 6 | # Execute this program using spark-submit as follows: 7 | # 8 | # $ spark-submit --master local pipe_example.py 9 | # 10 | 11 | import sys 12 | import os.path 13 | from pyspark import SparkConf, SparkContext 14 | conf = SparkConf().setAppName('Using External Programs') 15 | sc = SparkContext(conf=conf) 16 | 17 | # check for parsefixedwidth.pl script 18 | if os.path.isfile('parsefixedwidth.pl'): 19 | sc.addFile("parsefixedwidth.pl") 20 | fixed_width = sc.parallelize(['3840961028752220160317Hayward     CA94541']) 21 | piped = fixed_width.pipe("parsefixedwidth.pl") \ 22 | .map(lambda x: x.split('\t')) 23 | print(piped.collect()) 24 | else: 25 | print("""\ 26 | The parsefixedwidth.pl script must exist in the current directory 27 | """) 28 | sys.exit(0) -------------------------------------------------------------------------------- /wordcount/minimize_logging.sh: -------------------------------------------------------------------------------- 1 | sed "s/log4j.rootCategory=INFO, console/log4j.rootCategory=ERROR, console/" $SPARK_HOME/conf/log4j.properties.template > $SPARK_HOME/conf/log4j.properties
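The minimize_logging.sh script above rewrites the global log4j template so every Spark application on the machine logs at ERROR. As a lighter-weight alternative (an illustrative sketch, not part of the book's source), the log level can also be set per application from within the driver:

from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName('Quiet Logging Example')
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")   # suppresses INFO/WARN console output for this application only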
-------------------------------------------------------------------------------- /wordcount/wordcounts.py: -------------------------------------------------------------------------------- 1 | # 2 | # Source code for the 'MapReduce and Word Count' Exercise in 3 | # Data Analytics with Spark Using Python 4 | # by Jeffrey Aven 5 | # 6 | # Execute this program using spark-submit as follows: 7 | # 8 | # $ spark-submit --master local wordcounts.py \ 9 | # $SPARK_HOME/data/shakespeare.txt \ 10 | # $SPARK_HOME/data/wordcounts 11 | # 12 | 13 | import sys, re 14 | from pyspark import SparkConf, SparkContext 15 | conf = SparkConf().setAppName('Word Counts') 16 | sc = SparkContext(conf=conf) 17 | 18 | # check command line arguments 19 | if (len(sys.argv) != 3): 20 | print("""\ 21 | This program will count occurrences of each word in a document or documents 22 | and return the counts sorted by the most frequently occurring words 23 | 24 | Usage: wordcounts.py <input_file_or_dir> <output_dir> 25 | """) 26 | sys.exit(0) 27 | else: 28 | inputpath = sys.argv[1] 29 | outputdir = sys.argv[2] 30 | 31 | # count and sort word occurrences 32 | wordcounts = sc.textFile("file://" + inputpath) \ 33 | .filter(lambda line: len(line) > 0) \ 34 | .flatMap(lambda line: re.split(r'\W+', line)) \ 35 | .filter(lambda word: len(word) > 0) \ 36 | .map(lambda word:(word.lower(),1)) \ 37 | .reduceByKey(lambda v1, v2: v1 + v2) \ 38 | .map(lambda x: (x[1],x[0])) \ 39 | .sortByKey(ascending=False) \ 40 | .persist() 41 | wordcounts.saveAsTextFile("file://" + outputdir) 42 | top5words = wordcounts.take(5) 43 | justwords = [] 44 | for wordsandcounts in top5words: 45 | justwords.append(wordsandcounts[1]) 46 | print("The top five words are : " + str(justwords)) 47 | print("Check the complete output in " + outputdir) --------------------------------------------------------------------------------
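Since saveAsTextFile writes each (count, word) tuple as its plain string representation, the saved results can be read back and parsed in a later job. A small sketch, assuming the same output directory that wordcounts.py was given as its second argument:

import ast
import sys
from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName('Read Word Counts')
sc = SparkContext(conf=conf)
outputdir = sys.argv[1]   # e.g. $SPARK_HOME/data/wordcounts
saved = sc.textFile("file://" + outputdir) \
          .map(ast.literal_eval)   # each saved line looks like "(506, 'the')"
print(saved.take(5))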