├── .gitignore ├── .travis.yml ├── DESCRIPTION ├── LICENSE.md ├── README.md ├── bin ├── datagen.cmd ├── fakelogs.cmd └── fakelogs.sh ├── build-project ├── build.sbt ├── files ├── call_signs2.txt ├── callsign_tbl ├── callsign_tbl_sorted ├── callsigns ├── cqlsh_setup ├── fake_logs │ ├── log1.log │ └── log2.log ├── favourite_animals.csv ├── flumeconf.cfg ├── ham.txt ├── happypandas ├── int_string.csv ├── pandainfo.json ├── spam.txt └── testweet.json ├── mini-complete-example ├── README.md ├── build.sbt ├── pom.xml ├── project │ └── plugins.sbt ├── sbt │ └── sbt └── src │ └── main │ ├── java │ └── com │ │ └── oreilly │ │ └── learningsparkexamples │ │ └── mini │ │ └── java │ │ ├── BasicMap.java │ │ └── WordCount.java │ └── scala │ └── com │ └── oreilly │ └── learningsparkexamples │ └── mini │ └── scala │ ├── BasicMap.scala │ └── WordCount.scala ├── pom.xml ├── project └── plugins.sbt ├── run-all-examples ├── sbt └── sbt ├── setup-project └── src ├── R └── finddistance.R ├── main ├── java │ └── com │ │ └── oreilly │ │ └── learningsparkexamples │ │ └── java │ │ ├── BasicAvg.java │ │ ├── BasicAvgMapPartitions.java │ │ ├── BasicAvgWithKryo.java │ │ ├── BasicFlatMap.java │ │ ├── BasicJoinCsv.java │ │ ├── BasicLoadJson.java │ │ ├── BasicLoadSequenceFile.java │ │ ├── BasicLoadWholeCsv.java │ │ ├── BasicMap.java │ │ ├── BasicMapPartitions.java │ │ ├── BasicMapThenFilter.java │ │ ├── BasicMapToDouble.java │ │ ├── BasicQueryCassandra.java │ │ ├── BasicSaveSequenceFile.java │ │ ├── BasicSum.java │ │ ├── CallLog.java │ │ ├── ChapterSixExample.java │ │ ├── HappyPerson.java │ │ ├── IntersectByKey.java │ │ ├── KafkaInput.java │ │ ├── KeyValueMapFilter.java │ │ ├── LoadHive.java │ │ ├── LoadJsonWithSparkSQL.java │ │ ├── MLlib.java │ │ ├── PerKeyAvg.java │ │ ├── RemoveOutliers.java │ │ ├── SparkSQLTwitter.java │ │ ├── StreamingLogInput.java │ │ ├── WordCount.java │ │ └── logs │ │ ├── ApacheAccessLog.java │ │ ├── Flags.java │ │ ├── Functions.java │ │ ├── LogAnalyzerAppMain.java │ │ ├── LogAnalyzerTotal.java │ │ ├── LogAnalyzerWindowed.java │ │ ├── LogStatistics.java │ │ ├── ReadTransferStats.java │ │ └── Renderer.java ├── protobuf │ ├── address_book.proto │ └── places.proto └── scala │ └── com │ └── oreilly │ └── learningsparkexamples │ └── scala │ ├── BasicAvg.scala │ ├── BasicAvgFromFile.scala │ ├── BasicAvgFromFiles.scala │ ├── BasicAvgMapPartitions.scala │ ├── BasicAvgWithKryo.scala │ ├── BasicFilterUnionCombo.scala │ ├── BasicIntersectByKey.scala │ ├── BasicLoadNums.scala │ ├── BasicLoadSequenceFile.scala │ ├── BasicLoadTextFromFTP.scala │ ├── BasicMap.scala │ ├── BasicMapNoCache.scala │ ├── BasicMapPartitions.scala │ ├── BasicMapThenFilter.scala │ ├── BasicParseCsv.scala │ ├── BasicParseJson.scala │ ├── BasicParseJsonWithJackson.scala │ ├── BasicParseWholeFileCsv.scala │ ├── BasicQueryCassandra.scala │ ├── BasicSaveProtoBuf.scala │ ├── BasicSaveSequenceFile.scala │ ├── BasicStreamingExample.scala │ ├── BasicSum.scala │ ├── ChapterSixExample.scala │ ├── FlumeInput.scala │ ├── KafkaInput.scala │ ├── LoadHive.scala │ ├── LoadJsonWithElephantBird.scala │ ├── LoadJsonWithSparkSQL.scala │ ├── LoadKeyValueTextInput.scala │ ├── LoadSimpleJdbc.scala │ ├── MLlib.scala │ ├── MLlibPipeline.disabled_until_111 │ ├── PerKeyAvg.scala │ ├── PipeExample.scala │ ├── RemoveOutliers.scala │ ├── SparkSQLTwitter.scala │ ├── StreamingLogInput.scala │ ├── WordCount.scala │ ├── WriteSimpleDB.scala │ └── logs │ ├── LogAnalyzerAppMain.scala │ ├── LogAnalyzerTotal.scala │ ├── LogAnalyzerWindowed.scala │ └── ReadTransferStats.scala ├── 
perl └── splitwords.pl └── python ├── AvgMapPartitions.py ├── BasicAvg.py ├── BasicFilterMap.py ├── BasicKeyValueMapFilter.py ├── BasicMap.py ├── BasicMapPartitions.py ├── BasicSum.py ├── ChapterSixExample.py ├── IntersectByKey.py ├── LoadCsv.py ├── LoadHive.py ├── LoadJson.py ├── MLlib.py ├── MakeHiveTable.py ├── MakeParquetFile.py ├── PerKeyAvg.py ├── QueryParquetFile.py ├── QueryParuetFile.py ├── RemoveOutliers.py ├── SparkSQLTwitter.py └── WordCount.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | 4 | # sbt specific 5 | dist/* 6 | target/ 7 | lib_managed/ 8 | src_managed/ 9 | project/boot/ 10 | project/plugins/project/ 11 | sbt/*.jar 12 | mini-complete-example/sbt/*.jar 13 | 14 | # Scala-IDE specific 15 | .scala_dependencies 16 | 17 | #Emacs 18 | *~ 19 | 20 | #ignore the metastore 21 | metastore_db/* -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: scala 2 | scala: 3 | - 2.10.4 4 | # Install R 5 | before_install: 6 | - curl -OL http://raw.github.com/craigcitro/r-travis/master/scripts/travis-tool.sh 7 | - chmod 755 ./travis-tool.sh 8 | - ./travis-tool.sh bootstrap 9 | install: 10 | - ./travis-tool.sh install_deps 11 | before_script: 12 | - ./setup-project 13 | script: 14 | - ./build-project -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: learning-spark-examples 2 | Version: 0.1 3 | Depends: Imap 4 | License: MIT License 5 | Description: Examples for the learning spark book. 6 | Title: Examples for the learning spark book. 7 | Author@R: c(person("Holden Karau", role = c("aut", "cre"), email="holden@pigscanfly.ca")) -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright (C) 2014 Holden Karau and respective authors. The learning spark examples are licensed under the [MIT license](http://opensource.org/licenses/MIT). 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![buildstatus](https://travis-ci.org/holdenk/learning-spark-examples.svg?branch=master)](https://travis-ci.org/holdenk/learning-spark-examples) 2 | Examples for Learning Spark 3 | =============== 4 | Examples for the Learning Spark book. These examples require a number of libraries and as such have long build files. We have also added a standalone example with minimal dependencies and a small build file 5 | in the mini-complete-example directory. 6 | 7 | 8 | These examples have been updated to run against Spark 1.3, so they may 9 | be slightly different from the versions in your copy of "Learning Spark". 10 | 11 | Requirements 12 | == 13 | * JDK 1.7 or higher 14 | * Scala 2.10.3 15 | - scala-lang.org 16 | * Spark 1.3 17 | * Protobuf compiler 18 | - On Debian you can install it with sudo apt-get install protobuf-compiler 19 | * R & the CRAN package Imap are required for the ChapterSixExample 20 | * The Python examples require urllib3 21 | 22 | Python examples 23 | === 24 | 25 | From Spark, just run ./bin/pyspark ./src/python/[example] 26 | 27 | Spark Submit 28 | === 29 | 30 | You can also create an assembly jar with all of the dependencies for running either the Java or Scala 31 | versions of the code and run the job with the spark-submit script. 32 | 33 | `./sbt/sbt assembly` OR `mvn package` 34 | 35 | `cd $SPARK_HOME; ./bin/spark-submit --class com.oreilly.learningsparkexamples.[lang].[example] ../learning-spark-examples/target/scala-2.10/learning-spark-examples-assembly-0.0.1.jar` 36 | 37 | [![Learning Spark](http://akamaicovers.oreilly.com/images/0636920028512/cat.gif)](http://www.jdoqocy.com/click-7645222-11260198?url=http%3A%2F%2Fshop.oreilly.com%2Fproduct%2F0636920028512.do%3Fcmp%3Daf-strata-books-videos-product_cj_9781449358600_%2525zp&cjsku=0636920028512) 38 | 
-------------------------------------------------------------------------------- /bin/datagen.cmd: -------------------------------------------------------------------------------- 1 | @echo off 2 | echo 66.249.69.97 - - [24/Sep/2014:22:25:44 +0000] "GET /071300/242153 HTTP/1.1" 404 514 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" 3 | ping -n 5 localhost > null 4 | echo 71.19.157.174 - - [24/Sep/2014:22:26:12 +0000] "GET /error HTTP/1.1" 404 505 "-" "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.94 Safari/537.36" 5 | ping -n 5 localhost > null 6 | echo 71.19.157.174 - - [24/Sep/2014:22:26:37 +0000] "GET / HTTP/1.1" 200 18785 "-" "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.94 Safari/537.36" 
-------------------------------------------------------------------------------- /bin/fakelogs.cmd: -------------------------------------------------------------------------------- 1 | ncat -l 7777 -k -c datagen.cmd 
-------------------------------------------------------------------------------- /bin/fakelogs.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | rm /tmp/logdata 3 | touch /tmp/logdata 4 | tail -f /tmp/logdata | nc -lk 7777 & 5 | TAIL_NC_PID=$!
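# (The remaining lines replay the sample logs into /tmp/logdata, pausing between batches, so the streaming examples listening on port 7777 receive data.)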
6 | cat ./files/fake_logs/log1.log >> /tmp/logdata 7 | sleep 5 8 | cat ./files/fake_logs/log2.log >> /tmp/logdata 9 | sleep 1 10 | cat ./files/fake_logs/log1.log >> /tmp/logdata 11 | sleep 2 12 | cat ./files/fake_logs/log1.log >> /tmp/logdata 13 | sleep 3 14 | sleep 20 15 | kill $TAIL_NC_PID 16 | -------------------------------------------------------------------------------- /build-project: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | set -e 3 | set -x 4 | # Do our mini example first 5 | cd mini-complete-example 6 | ./sbt/sbt clean compile package 7 | ./sbt/sbt clean 8 | echo $PWD && mvn clean && mvn compile 9 | cd .. 10 | # Run the tests 11 | export SPARK_HOME=./spark-1.3.1-bin-hadoop1/ 12 | ./sbt/sbt compile package assembly 13 | echo $? 14 | time ./run-all-examples 15 | echo $? 16 | echo "done" 17 | # Try and build with maven, skip for now 18 | #mvn clean && mvn compile && mvn package 19 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | import AssemblyKeys._ 2 | 3 | assemblySettings 4 | 5 | name := "learning-spark-examples" 6 | 7 | version := "0.0.1" 8 | 9 | scalaVersion := "2.10.4" 10 | 11 | javacOptions ++= Seq("-source", "1.7", "-target", "1.7") 12 | 13 | // protocol buffer support 14 | seq(sbtprotobuf.ProtobufPlugin.protobufSettings: _*) 15 | 16 | // additional libraries 17 | libraryDependencies ++= Seq( 18 | "org.apache.spark" %% "spark-core" % "1.3.1" % "provided", 19 | "org.apache.spark" %% "spark-sql" % "1.3.1", 20 | "org.apache.spark" %% "spark-hive" % "1.3.1", 21 | "org.apache.spark" %% "spark-streaming" % "1.3.1", 22 | "org.apache.spark" %% "spark-streaming-kafka" % "1.3.1", 23 | "org.apache.spark" %% "spark-streaming-flume" % "1.3.1", 24 | "org.apache.spark" %% "spark-mllib" % "1.3.1", 25 | "org.apache.commons" % "commons-lang3" % "3.0", 26 | "org.eclipse.jetty" % "jetty-client" % "8.1.14.v20131031", 27 | "com.typesafe.play" % "play-json_2.10" % "2.2.1", 28 | "com.fasterxml.jackson.core" % "jackson-databind" % "2.3.3", 29 | "com.fasterxml.jackson.module" % "jackson-module-scala_2.10" % "2.3.3", 30 | "org.elasticsearch" % "elasticsearch-hadoop-mr" % "2.0.0.RC1", 31 | "net.sf.opencsv" % "opencsv" % "2.0", 32 | "com.twitter.elephantbird" % "elephant-bird" % "4.5", 33 | "com.twitter.elephantbird" % "elephant-bird-core" % "4.5", 34 | "com.hadoop.gplcompression" % "hadoop-lzo" % "0.4.17", 35 | "mysql" % "mysql-connector-java" % "5.1.31", 36 | "com.datastax.spark" %% "spark-cassandra-connector" % "1.0.0-rc5", 37 | "com.datastax.spark" %% "spark-cassandra-connector-java" % "1.0.0-rc5", 38 | "com.github.scopt" %% "scopt" % "3.2.0", 39 | "org.scalatest" %% "scalatest" % "2.2.1" % "test", 40 | "com.holdenkarau" %% "spark-testing-base" % "0.0.1" % "test" 41 | ) 42 | 43 | resolvers ++= Seq( 44 | "JBoss Repository" at "http://repository.jboss.org/nexus/content/repositories/releases/", 45 | "Spray Repository" at "http://repo.spray.cc/", 46 | "Cloudera Repository" at "https://repository.cloudera.com/artifactory/cloudera-repos/", 47 | "Akka Repository" at "http://repo.akka.io/releases/", 48 | "Twitter4J Repository" at "http://twitter4j.org/maven2/", 49 | "Apache HBase" at "https://repository.apache.org/content/repositories/releases", 50 | "Twitter Maven Repo" at "http://maven.twttr.com/", 51 | "scala-tools" at "https://oss.sonatype.org/content/groups/scala-tools", 52 | "Typesafe repository" at 
"http://repo.typesafe.com/typesafe/releases/", 53 | "Second Typesafe repo" at "http://repo.typesafe.com/typesafe/maven-releases/", 54 | "Mesosphere Public Repository" at "http://downloads.mesosphere.io/maven", 55 | Resolver.sonatypeRepo("public") 56 | ) 57 | 58 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { (old) => 59 | { 60 | case m if m.toLowerCase.endsWith("manifest.mf") => MergeStrategy.discard 61 | case m if m.startsWith("META-INF") => MergeStrategy.discard 62 | case PathList("javax", "servlet", xs @ _*) => MergeStrategy.first 63 | case PathList("org", "apache", xs @ _*) => MergeStrategy.first 64 | case PathList("org", "jboss", xs @ _*) => MergeStrategy.first 65 | case "about.html" => MergeStrategy.rename 66 | case "reference.conf" => MergeStrategy.concat 67 | case _ => MergeStrategy.first 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /files/call_signs2.txt: -------------------------------------------------------------------------------- 1 | KK6JKQ 2 | -------------------------------------------------------------------------------- /files/callsigns: -------------------------------------------------------------------------------- 1 | W8PAL 2 | KK6JKQ 3 | W6BB 4 | VE3UOW 5 | VE2CUA 6 | VE2UN 7 | OH2TI 8 | GB1MIR 9 | K2AMH 10 | UA1LO 11 | N7ICE 12 | -------------------------------------------------------------------------------- /files/cqlsh_setup: -------------------------------------------------------------------------------- 1 | DROP KEYSPACE IF EXISTS test; 2 | CREATE KEYSPACE test WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1 }; 3 | CREATE TABLE test.kv(key text PRIMARY KEY, value int); 4 | INSERT INTO test.kv(key, value) VALUES ('panda', 1); 5 | INSERT INTO test.kv(key, value) VALUES ('notpanda', 0); -------------------------------------------------------------------------------- /files/fake_logs/log1.log: -------------------------------------------------------------------------------- 1 | 66.249.69.97 - - [24/Sep/2014:22:25:44 +0000] "GET /071300/242153 HTTP/1.1" 404 514 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" 2 | 71.19.157.174 - - [24/Sep/2014:22:26:12 +0000] "GET /error HTTP/1.1" 404 505 "-" "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.94 Safari/537.36" 3 | 71.19.157.174 - - [24/Sep/2014:22:26:12 +0000] "GET /favicon.ico HTTP/1.1" 200 1713 "-" "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.94 Safari/537.36" 4 | 71.19.157.174 - - [24/Sep/2014:22:26:37 +0000] "GET / HTTP/1.1" 200 18785 "-" "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.94 Safari/537.36" 5 | 71.19.157.174 - - [24/Sep/2014:22:26:37 +0000] "GET /jobmineimg.php?q=m HTTP/1.1" 200 222 "http://www.holdenkarau.com/" "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.94 Safari/537.36" 6 | -------------------------------------------------------------------------------- /files/fake_logs/log2.log: -------------------------------------------------------------------------------- 1 | 71.19.157.174 - - [24/Sep/2014:22:26:12 +0000] "GET /error78978 HTTP/1.1" 404 505 "-" "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.94 Safari/537.36" 2 | -------------------------------------------------------------------------------- /files/favourite_animals.csv: 
-------------------------------------------------------------------------------- 1 | holden,panda 2 | notholden,notpanda 3 | spark,bear -------------------------------------------------------------------------------- /files/flumeconf.cfg: -------------------------------------------------------------------------------- 1 | # Name the components on this agent 2 | panda.sources = r1 3 | panda.sinks = avroSink 4 | panda.channels = c1 5 | 6 | # avro sink 7 | panda.sinks = avroSink 8 | panda.sinks.avroSink.type = avro 9 | panda.sinks.avroSink.channel = memoryChannel 10 | panda.sinks.avroSink.hostname = localhost 11 | panda.sinks.avroSink.port = 7788 12 | 13 | # input 14 | panda.sources.r1.type = netcat 15 | panda.sources.r1.bind = localhost 16 | panda.sources.r1.port = 44444 17 | 18 | # Use a channel which buffers events in memory 19 | panda.channels.c1.type = memory 20 | panda.channels.c1.capacity = 1000 21 | panda.channels.c1.transactionCapacity = 100 22 | 23 | # Bind the source and sink to the channel 24 | panda.sources.r1.channels = c1 25 | panda.sinks.avroSink.channel = c1 -------------------------------------------------------------------------------- /files/ham.txt: -------------------------------------------------------------------------------- 1 | Dear Spark Learner, Thanks so much for attending the Spark Summit 2014! Check out videos of talks from the summit at ... 2 | Hi Mom, Apologies for being late about emailing and forgetting to send you the package. I hope you and bro have been ... 3 | Wow, hey Fred, just heard about the Spark petabyte sort. I think we need to take time to try it out immediately ... 4 | Hi Spark user list, This is my first question to this list, so thanks in advance for your help! I tried running ... 5 | Thanks Tom for your email. I need to refer you to Alice for this one. I haven't yet figured out that part either ... 6 | Good job yesterday! I was attending your talk, and really enjoyed it. I want to try out GraphX ... 7 | Summit demo got whoops from audience! Had to let you know. --Joe 8 | -------------------------------------------------------------------------------- /files/happypandas: -------------------------------------------------------------------------------- 1 | coffee 1 2 | coffee 2 3 | pandas 3 4 | happy 4 -------------------------------------------------------------------------------- /files/int_string.csv: -------------------------------------------------------------------------------- 1 | 1panda 2 | 2pandas 3 | 3pandas -------------------------------------------------------------------------------- /files/pandainfo.json: -------------------------------------------------------------------------------- 1 | {"name":"Sparky The Bear", "lovesPandas":true} 2 | {"name": "Holden"} 3 | {"name":"Sparky The Bear", "lovesPandas":true, "knows":{"friends": ["holden"]}} 4 | -------------------------------------------------------------------------------- /files/spam.txt: -------------------------------------------------------------------------------- 1 | Dear sir, I am a Prince in a far kingdom you have not heard of. I want to send you money via wire transfer so please ... 2 | Get Viagra real cheap! Send money right away to ... 3 | Oh my gosh you can be really strong too with these drugs found in the rainforest. Get them cheap right now ... 4 | YOUR COMPUTER HAS BEEN INFECTED! YOU MUST RESET YOUR PASSWORD. Reply to this email with your password and SSN ... 5 | THIS IS NOT A SCAM! Send money and get access to awesome stuff really cheap and never have to ... 
6 | -------------------------------------------------------------------------------- /files/testweet.json: -------------------------------------------------------------------------------- 1 | {"createdAt":"Nov 4, 2014 4:56:59 PM","id":529799371026485248,"text":"Adventures With Coffee, Code, and Writing.","source":"\u003ca href\u003d\"http://twitter.com\" rel\u003d\"nofollow\"\u003eTwitter Web Client\u003c/a\u003e","isTruncated":false,"inReplyToStatusId":-1,"inReplyToUserId":-1,"isFavorited":false,"retweetCount":0,"isPossiblySensitive":false,"contributorsIDs":[],"userMentionEntities":[],"urlEntities":[],"hashtagEntities":[],"mediaEntities":[],"currentUserRetweetId":-1,"user":{"id":15594928,"name":"Holden Karau","screenName":"holdenkarau","location":"","description":"","descriptionURLEntities":[],"isContributorsEnabled":false,"profileImageUrl":"http://pbs.twimg.com/profile_images/3005696115/2036374bbadbed85249cdd50aac6e170_normal.jpeg","profileImageUrlHttps":"https://pbs.twimg.com/profile_images/3005696115/2036374bbadbed85249cdd50aac6e170_normal.jpeg","isProtected":false,"followersCount":1231,"profileBackgroundColor":"C0DEED","profileTextColor":"333333","profileLinkColor":"0084B4","profileSidebarFillColor":"DDEEF6","profileSidebarBorderColor":"FFFFFF","profileUseBackgroundImage":true,"showAllInlineMedia":false,"friendsCount":600,"createdAt":"Aug 5, 2011 9:42:44 AM","favouritesCount":1095,"utcOffset":-3,"profileBackgroundImageUrl":"","profileBackgroundImageUrlHttps":"","profileBannerImageUrl":"","profileBackgroundTiled":true,"lang":"en","statusesCount":6234,"isGeoEnabled":true,"isVerified":false,"translator":false,"listedCount":0,"isFollowRequestSent":false}} 2 | -------------------------------------------------------------------------------- /mini-complete-example/README.md: -------------------------------------------------------------------------------- 1 | Mini Examples for Spark 2 | =============== 3 | This directory contains a complete stand alone example with both Maven and SBT build tools. 
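
For reference, a typical way to build this example and submit the Scala word count with `spark-submit` might look like the following. This is only a sketch: it assumes a local Spark 1.x installation at `$SPARK_HOME`, and the exact jar name under `target/` depends on your sbt/Scala versions and may differ.

```sh
# Build with the bundled sbt launcher (or use `mvn clean package` for the Maven build)
./sbt/sbt clean package

# Submit the Scala word count locally; arguments are <input file> <output directory>
$SPARK_HOME/bin/spark-submit \
  --master local \
  --class com.oreilly.learningsparkexamples.mini.scala.WordCount \
  ./target/scala-2.10/learning-spark-mini-example_2.10-0.0.1.jar \
  ./README.md ./wordcounts
```

The output directory passed as the second argument must not already exist, since `saveAsTextFile` will not overwrite it.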
4 | -------------------------------------------------------------------------------- /mini-complete-example/build.sbt: -------------------------------------------------------------------------------- 1 | name := "learning-spark-mini-example" 2 | 3 | version := "0.0.1" 4 | 5 | scalaVersion := "2.10.4" 6 | 7 | // additional libraries 8 | libraryDependencies ++= Seq( 9 | "org.apache.spark" %% "spark-core" % "1.1.0" % "provided" 10 | ) 11 | 
-------------------------------------------------------------------------------- /mini-complete-example/pom.xml: -------------------------------------------------------------------------------- 1 | <project> 2 | <groupId>com.oreilly.learningsparkexamples.mini</groupId> 3 | <artifactId>learning-spark-mini-example</artifactId> 4 | <modelVersion>4.0.0</modelVersion> 5 | <name>example</name> 6 | <packaging>jar</packaging> 7 | <version>0.0.1</version> 8 | <dependencies> 9 | <dependency> 10 | <groupId>org.apache.spark</groupId> 11 | <artifactId>spark-core_2.10</artifactId> 12 | <version>1.1.0</version> 13 | <scope>provided</scope> 14 | </dependency> 15 | </dependencies> 16 | <properties> 17 | <java.version>1.6</java.version> 18 | </properties> 19 | <build> 20 | <pluginManagement> 21 | <plugins> 22 | <plugin> 23 | <groupId>org.apache.maven.plugins</groupId> 24 | <artifactId>maven-compiler-plugin</artifactId> 25 | <version>3.1</version> 26 | <configuration> 27 | <source>${java.version}</source> 28 | <target>${java.version}</target> 29 | </configuration> 30 | </plugin> 31 | </plugins> 32 | </pluginManagement> 33 | </build> 34 | </project> 35 | 
-------------------------------------------------------------------------------- /mini-complete-example/project/plugins.sbt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/holdenk/learning-spark-examples/6862949df6c29c149ffcbedfd5948fe2ab5e2619/mini-complete-example/project/plugins.sbt 
-------------------------------------------------------------------------------- /mini-complete-example/sbt/sbt: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements.  See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | # This script launches sbt for this project. If present it uses the system 21 | # version of sbt. If there is no system version of sbt it attempts to download 22 | # sbt locally. 23 | SBT_VERSION=0.13.7 24 | URL1=http://typesafe.artifactoryonline.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar 25 | URL2=http://repo.typesafe.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar 26 | JAR=sbt/sbt-launch-${SBT_VERSION}.jar 27 | 28 | # Download sbt launch jar if it hasn't been downloaded yet 29 | if [ !
-f ${JAR} ]; then 30 | # Download 31 | printf "Attempting to fetch sbt\n" 32 | JAR_DL=${JAR}.part 33 | if hash wget 2>/dev/null; then 34 | (wget --progress=bar ${URL1} -O ${JAR_DL} || wget --progress=bar ${URL2} -O ${JAR_DL}) && mv ${JAR_DL} ${JAR} 35 | elif hash curl 2>/dev/null; then 36 | (curl -L --progress=bar ${URL1} -O ${JAR_DL} || curl -L --progress=bar ${URL2} -O ${JAR_DL}) && mv ${JAR_DL} ${JAR} 37 | else 38 | printf "You do not have curl or wget installed, please install sbt manually from http://www.scala-sbt.org/\n" 39 | exit -1 40 | fi 41 | fi 42 | if [ ! -f ${JAR} ]; then 43 | # We failed to download 44 | printf "Our attempt to download sbt locally to ${JAR} failed. Please install sbt manually from http://www.scala-sbt.org/\n" 45 | exit -1 46 | fi 47 | printf "Launching sbt from ${JAR}\n" 48 | java \ 49 | -Xmx1400m -XX:MaxPermSize=1024m -XX:ReservedCodeCacheSize=256m \ 50 | -jar ${JAR} \ 51 | "$@" 52 | 
-------------------------------------------------------------------------------- /mini-complete-example/src/main/java/com/oreilly/learningsparkexamples/mini/java/BasicMap.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple map in Java 3 | */ 4 | package com.oreilly.learningsparkexamples.mini.java; 5 | 6 | import java.util.Arrays; 7 | import java.util.List; 8 | 9 | import org.apache.commons.lang.StringUtils; 10 | 11 | import org.apache.spark.api.java.JavaRDD; 12 | import org.apache.spark.api.java.JavaSparkContext; 13 | import org.apache.spark.api.java.function.Function; 14 | 15 | public class BasicMap { 16 | public static void main(String[] args) throws Exception { 17 | String master; 18 | if (args.length > 0) { 19 | master = args[0]; 20 | } else { 21 | master = "local"; 22 | } 23 | JavaSparkContext sc = new JavaSparkContext( 24 | master, "basicmap", System.getenv("SPARK_HOME"), System.getenv("JARS")); 25 | JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4)); 26 | JavaRDD<Integer> result = rdd.map( 27 | new Function<Integer, Integer>() { public Integer call(Integer x) { return x*x;}}); 28 | System.out.println(StringUtils.join(result.collect(), ",")); 29 | } 30 | } 31 | 
-------------------------------------------------------------------------------- /mini-complete-example/src/main/java/com/oreilly/learningsparkexamples/mini/java/WordCount.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a wordcount in Java 3 | */ 4 | package com.oreilly.learningsparkexamples.mini.java; 5 | 6 | import java.util.Arrays; 7 | import java.util.List; 8 | import java.lang.Iterable; 9 | 10 | import scala.Tuple2; 11 | 12 | import org.apache.commons.lang.StringUtils; 13 | 14 | import org.apache.spark.SparkConf; 15 | import org.apache.spark.api.java.JavaRDD; 16 | import org.apache.spark.api.java.JavaPairRDD; 17 | import org.apache.spark.api.java.JavaSparkContext; 18 | import org.apache.spark.api.java.function.FlatMapFunction; 19 | import org.apache.spark.api.java.function.Function2; 20 | import org.apache.spark.api.java.function.PairFunction; 21 | 22 | 23 | public class WordCount { 24 | public static void main(String[] args) throws Exception { 25 | String inputFile = args[0]; 26 | String outputFile = args[1]; 27 | // Create a Java Spark Context. 28 | SparkConf conf = new SparkConf().setAppName("wordCount"); 29 | JavaSparkContext sc = new JavaSparkContext(conf); 30 | // Load our input data. 31 | JavaRDD<String> input = sc.textFile(inputFile); 32 | // Split up into words.
33 | JavaRDD<String> words = input.flatMap( 34 | new FlatMapFunction<String, String>() { 35 | public Iterable<String> call(String x) { 36 | return Arrays.asList(x.split(" ")); 37 | }}); 38 | // Transform into pairs of word and count. 39 | JavaPairRDD<String, Integer> counts = words.mapToPair( 40 | new PairFunction<String, String, Integer>(){ 41 | public Tuple2<String, Integer> call(String x){ 42 | return new Tuple2(x, 1); 43 | }}).reduceByKey(new Function2<Integer, Integer, Integer>(){ 44 | public Integer call(Integer x, Integer y){ return x + y;}}); 45 | // Save the word count back out to a text file, causing evaluation. 46 | counts.saveAsTextFile(outputFile); 47 | } 48 | } 49 | 
-------------------------------------------------------------------------------- /mini-complete-example/src/main/scala/com/oreilly/learningsparkexamples/mini/scala/BasicMap.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple map in Scala 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | 8 | object BasicMap { 9 | def main(args: Array[String]) { 10 | val master = args.length match { 11 | case x: Int if x > 0 => args(0) 12 | case _ => "local" 13 | } 14 | val sc = new SparkContext(master, "BasicMap", System.getenv("SPARK_HOME")) 15 | val input = sc.parallelize(List(1,2,3,4)) 16 | val result = input.map(x => x*x) 17 | println(result.collect().mkString(",")) 18 | } 19 | } 20 | 
-------------------------------------------------------------------------------- /mini-complete-example/src/main/scala/com/oreilly/learningsparkexamples/mini/scala/WordCount.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates flatMap + countByValue for wordcount. 3 | */ 4 | package com.oreilly.learningsparkexamples.mini.scala 5 | 6 | import org.apache.spark._ 7 | import org.apache.spark.SparkContext._ 8 | 9 | object WordCount { 10 | def main(args: Array[String]) { 11 | val inputFile = args(0) 12 | val outputFile = args(1) 13 | val conf = new SparkConf().setAppName("wordCount") 14 | // Create a Scala Spark Context. 15 | val sc = new SparkContext(conf) 16 | // Load our input data. 17 | val input = sc.textFile(inputFile) 18 | // Split up into words. 19 | val words = input.flatMap(line => line.split(" ")) 20 | // Transform into word and count. 21 | val counts = words.map(word => (word, 1)).reduceByKey{case (x, y) => x + y} 22 | // Save the word count back out to a text file, causing evaluation.
23 | counts.saveAsTextFile(outputFile) 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | com.oreilly.learningsparkexamples 3 | java 4 | 4.0.0 5 | examples 6 | jar 7 | 0.0.2 8 | 9 | 10 | Akka repository 11 | http://repo.akka.io/releases 12 | 13 | 14 | scala-tools 15 | https://oss.sonatype.org/content/groups/scala-tools 16 | 17 | 18 | apache 19 | https://repository.apache.org/content/repositories/releases 20 | 21 | 22 | twitter 23 | http://maven.twttr.com/ 24 | 25 | 26 | central2 27 | http://central.maven.org/maven2/ 28 | 29 | 30 | 31 | 32 | org.apache.spark 33 | spark-core_2.10 34 | 1.3.1 35 | provided 36 | 37 | 38 | org.apache.spark 39 | spark-sql_2.10 40 | 1.3.1 41 | provided 42 | 43 | 44 | org.apache.spark 45 | spark-hive_2.10 46 | 1.3.1 47 | provided 48 | 49 | 50 | org.apache.spark 51 | spark-streaming_2.10 52 | 1.3.1 53 | 54 | 55 | org.apache.spark 56 | spark-streaming-kafka_2.10 57 | 1.3.1 58 | 59 | 60 | org.apache.spark 61 | spark-mllib 62 | 1.3.1 63 | 64 | 65 | com.datastax.spark 66 | spark-cassandra-connector 67 | 1.0.0-rc5 68 | 69 | 70 | com.datastax.spark 71 | spark-cassandra-connector-java 72 | 1.0.0-rc5 73 | 74 | 75 | org.elasticsearch 76 | elasticsearch-hadoop-mr 77 | 2.0.0.RC1 78 | 79 | 80 | org.eclipse.jetty 81 | jetty-client 82 | 8.1.14.v20131031 83 | 84 | 85 | com.fasterxml.jackson.core 86 | jackson-databind 87 | 2.3.3 88 | 89 | 90 | org.apache.commons 91 | commons-lang3 92 | 3.0 93 | 94 | 95 | net.sf.opencsv 96 | opencsv 97 | 2.0 98 | 99 | 100 | org.scalatest 101 | scalatest_${scala.binary.version} 102 | 2.2.1 103 | 104 | 105 | 106 | 1.7 107 | 108 | 109 | 110 | 111 | 112 | org.apache.maven.plugins 113 | maven-compiler-plugin 114 | 3.1 115 | 116 | ${java.version} 117 | ${java.version} 118 | 119 | 120 | 121 | org.apache.maven.plugins 122 | maven-assembly-plugin 123 | 2.2.2 124 | 125 | 126 | 127 | 128 | src/main/assembly/assembly.xml 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += Resolver.url("artifactory", url("http://scalasbt.artifactoryonline.com/scalasbt/sbt-plugin-releases"))(Resolver.ivyStylePatterns) 2 | 3 | resolvers += "Typesafe Repository" at "http://repo.typesafe.com/typesafe/releases/" 4 | 5 | resolvers += "Spray Repository" at "http://repo.spray.cc/" 6 | 7 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.10.2") 8 | 9 | addSbtPlugin("com.github.gseitz" % "sbt-protobuf" % "0.3.3") 10 | -------------------------------------------------------------------------------- /run-all-examples: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # This script is used to run all of the examples. 
Mostly to be used by travis for testing 3 | # Output the commands we run 4 | set -x 5 | # If any command fails, fail 6 | set -e 7 | # Build everything 8 | ./sbt/sbt compile package assembly > sbtlog || (echo "sbt failed" && cat ./sbtlog && exit 1) 9 | KAFKA_ROOT=./kafka_2.9.2-0.8.1.1 10 | SPARK_SUBMIT_SCRIPT=$SPARK_HOME/bin/spark-submit 11 | ASSEMBLY_JAR=./target/scala-2.10/learning-spark-examples-assembly-0.0.1.jar 12 | # Mini cleanup 13 | rm -rf /tmp/py; mkdir -p /tmp/py 14 | rm -rf /tmp/java; mkdir -p /tmp/java 15 | rm -rf /tmp/scala; mkdir -p /tmp/scala 16 | # setup cassandra 17 | # cqlsh --file ./files/cqlsh_setup & 18 | # Scala 19 | echo "Running Scala programs" 20 | $SPARK_SUBMIT_SCRIPT --class com.oreilly.learningsparkexamples.scala.LoadJsonWithSparkSQL $ASSEMBLY_JAR local ./files/pandainfo.json 21 | $SPARK_SUBMIT_SCRIPT --class com.oreilly.learningsparkexamples.scala.ChapterSixExample $ASSEMBLY_JAR local ./files/callsigns ./files/callsigns /tmp/scala/ch6out 22 | TWITTER_DATA=./files/testweet.json 23 | $SPARK_SUBMIT_SCRIPT --class com.oreilly.learningsparkexamples.scala.SparkSQLTwitter $ASSEMBLY_JAR "$TWITTER_DATA" /tmp/scala/tweetout 24 | #$SPARK_SUBMIT_SCRIPT --class com.oreilly.learningsparkexamples.scala.BasicQueryCassandra $ASSEMBLY_JAR local localhost 25 | echo "Running Scala streaming program" 26 | ./bin/fakelogs.sh & 27 | sleep 1 28 | $SPARK_SUBMIT_SCRIPT --class com.oreilly.learningsparkexamples.scala.StreamingLogInput $ASSEMBLY_JAR local[4] 29 | echo "Running Scala Kafka streaming example" 30 | $SPARK_SUBMIT_SCRIPT --master local[4] --class com.oreilly.learningsparkexamples.scala.KafkaInput $ASSEMBLY_JAR localhost:2181 spark-readers pandas 1 & 31 | KAFKA_PID=$! 32 | sleep 1 33 | echo "panda\nerror panda" | $KAFKA_ROOT/bin/kafka-console-producer.sh --broker-list localhost:9092 --topic pandas 34 | wait $KAFKA_PID 35 | echo "Running Scala Flume example" 36 | $SPARK_SUBMIT_SCRIPT --master local[4] --class com.oreilly.learningsparkexamples.scala.FlumeInput $ASSEMBLY_JAR localhost 7788 & 37 | FLUME_PID=$! 
38 | sleep 1 39 | echo "panda\nerror panda\n" | nc localhost 44444 40 | sleep 3 41 | echo "panda2\nerror panda2\n" | nc localhost 44444 42 | wait $FLUME_PID 43 | # Python 44 | echo "Running Python programs" 45 | $SPARK_SUBMIT_SCRIPT ./src/python/AvgMapPartitions.py local 46 | $SPARK_SUBMIT_SCRIPT ./src/python/BasicAvg.py local 47 | $SPARK_SUBMIT_SCRIPT ./src/python/BasicFilterMap.py local 48 | $SPARK_SUBMIT_SCRIPT ./src/python/BasicKeyValueMapFilter.py local 49 | $SPARK_SUBMIT_SCRIPT ./src/python/BasicMapPartitions.py local 50 | $SPARK_SUBMIT_SCRIPT ./src/python/BasicMap.py local 51 | $SPARK_SUBMIT_SCRIPT ./src/python/ChapterSixExample.py local ./files/callsigns /tmp/py/pandaout 52 | $SPARK_SUBMIT_SCRIPT ./src/python/SparkSQLTwitter.py ./files/testweet.json /tmp/py/tweetout 53 | $SPARK_SUBMIT_SCRIPT ./src/python/LoadCsv.py local ./files/favourite_animals.csv /tmp/py/panda_lovers.csv 54 | $SPARK_SUBMIT_SCRIPT ./src/python/MakeHiveTable.py local ./files/int_string.csv pandaplural 55 | # Temporarily disabled due to API changes 56 | #$SPARK_SUBMIT_SCRIPT ./src/python/LoadHive.py local pandaplural 57 | $SPARK_SUBMIT_SCRIPT ./src/python/LoadJson.py local ./files/pandainfo.json /tmp/py/loadjsonout 58 | $SPARK_SUBMIT_SCRIPT ./src/python/PerKeyAvg.py local 59 | $SPARK_SUBMIT_SCRIPT ./src/python/RemoveOutliers.py local 60 | $SPARK_SUBMIT_SCRIPT ./src/python/WordCount.py local 61 | $SPARK_SUBMIT_SCRIPT ./src/python/MakeParquetFile.py local ./files/favourite_animals.csv /tmp/py/favouriteanimal_parquet 62 | $SPARK_SUBMIT_SCRIPT ./src/python/QueryParquetFile.py local /tmp/py/favouriteanimal_parquet 63 | 64 | # Java 65 | echo "Running Java programs" 66 | $SPARK_SUBMIT_SCRIPT --class com.oreilly.learningsparkexamples.java.LoadJsonWithSparkSQL $ASSEMBLY_JAR local ./files/pandainfo.json 67 | $SPARK_SUBMIT_SCRIPT --class com.oreilly.learningsparkexamples.java.ChapterSixExample $ASSEMBLY_JAR local ./files/callsigns ./files/callsigns /tmp/java/ch6out 68 | ./sbt/sbt assembly && $SPARK_SUBMIT_SCRIPT --class com.oreilly.learningsparkexamples.java.SparkSQLTwitter $ASSEMBLY_JAR ./files/testweet.json /tmp/java/tweetout 69 | #$SPARK_SUBMIT_SCRIPT --class com.oreilly.learningsparkexamples.java.BasicQueryCassandra $ASSEMBLY_JAR local localhost 70 | echo "Running Java streaming program" 71 | ./bin/fakelogs.sh & 72 | sleep 1 73 | $SPARK_SUBMIT_SCRIPT --class com.oreilly.learningsparkexamples.java.StreamingLogInput $ASSEMBLY_JAR local[4] 74 | sleep 5 75 | echo "Running Java Kafka streaming example" 76 | $SPARK_SUBMIT_SCRIPT --class com.oreilly.learningsparkexamples.java.KafkaInput $ASSEMBLY_JAR localhost:2181 spark-java-readers 77 | echo "panda\nerror panda" | $KAFKA_ROOT/bin/kafka-console-producer.sh --broker-list localhost:9092 --topic pandas 78 | 79 | echo "Done running all programs :)" 80 | -------------------------------------------------------------------------------- /sbt/sbt: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. 
You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | # This script launches sbt for this project. If present it uses the system 21 | # version of sbt. If there is no system version of sbt it attempts to download 22 | # sbt locally. 23 | SBT_VERSION=0.13.7 24 | URL1=http://typesafe.artifactoryonline.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar 25 | URL2=http://repo.typesafe.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar 26 | JAR=sbt/sbt-launch-${SBT_VERSION}.jar 27 | 28 | # Download sbt launch jar if it hasn't been downloaded yet 29 | if [ ! -f ${JAR} ]; then 30 | # Download 31 | printf "Attempting to fetch sbt\n" 32 | JAR_DL=${JAR}.part 33 | if hash wget 2>/dev/null; then 34 | (wget --progress=bar ${URL1} -O ${JAR_DL} || wget --progress=bar ${URL2} -O ${JAR_DL}) && mv ${JAR_DL} ${JAR} 35 | elif hash curl 2>/dev/null; then 36 | (curl -L --progress=bar ${URL1} -O ${JAR_DL} || curl -L --progress=bar ${URL2} -O ${JAR_DL}) && mv ${JAR_DL} ${JAR} 37 | else 38 | printf "You do not have curl or wget installed, please install sbt manually from http://www.scala-sbt.org/\n" 39 | exit -1 40 | fi 41 | fi 42 | if [ ! -f ${JAR} ]; then 43 | # We failed to download 44 | printf "Our attempt to download sbt locally to ${JAR} failed. Please install sbt manually from http://www.scala-sbt.org/\n" 45 | exit -1 46 | fi 47 | printf "Launching sbt from ${JAR}\n" 48 | java \ 49 | -Xmx1400m -XX:MaxPermSize=1024m -XX:ReservedCodeCacheSize=256m \ 50 | -jar ${JAR} \ 51 | "$@" 52 | -------------------------------------------------------------------------------- /setup-project: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | set -e 4 | set -o pipefail 5 | sudo apt-get install -y axel time 6 | echo "Downloading misc tools" 7 | sudo rm -f /etc/apt/sources.list.d/cassandra.sources.list 8 | echo "deb http://debian.datastax.com/community stable main" | sudo tee -a /etc/apt/sources.list.d/cassandra.sources.list 9 | curl -L http://debian.datastax.com/debian/repo_key | sudo apt-key add - 10 | sudo apt-get update > aptlog & 11 | APT_GET_UPDATE_PID=$! 12 | axel http://d3kbcqa49mib13.cloudfront.net/spark-1.3.1-bin-hadoop1.tgz > sparkdl & 13 | SPARK_DL_PID=$! 14 | axel http://mirrors.ibiblio.org/apache/kafka/0.8.1.1/kafka_2.9.2-0.8.1.1.tgz > kafkadl & 15 | KAFKA_DL_PID=$! 16 | axel http://mirror.cogentco.com/pub/apache/flume/1.5.0.1/apache-flume-1.5.0.1-bin.tar.gz > flumedl & 17 | FLUME_DL_PID=$! 18 | wait $SPARK_DL_PID 19 | sudo mkdir -p /etc/apt/sources.list.d/ 20 | echo "install urllib3" 21 | sudo pip install urllib3 22 | wait $SPARK_DL_PID || echo "Spark DL finished early" 23 | tar -xf spark-1.3.1-bin-hadoop1.tgz 24 | wait $APT_GET_UPDATE_PID 25 | echo "Installing protobuf" 26 | sudo apt-get install protobuf-compiler 27 | echo $? 
28 | # Set up cassandra 29 | echo "Waiting for apt-get update to finish" 30 | wait $APT_GET_UPDATE_PID || echo "apt-get update finished early" 31 | echo "Setting up dsc (cassandra)" 32 | sleep 1; 33 | #sudo apt-get -y --force-yes remove cassandra cassandra-tools 34 | #sudo rm -rf /etc/security/limits.d/cassandra.conf || echo "No cassandra security conf" 35 | #yes | sudo apt-get -y --force-yes install dsc21 > dscinstall.log 36 | #yes | sudo apt-get -y --force-yes install cassandra-tools > ctoolsinstall.log 37 | echo "Starting cassandra" 38 | sudo /etc/init.d/cassandra start 39 | echo $? 40 | echo "set up hive directories" 41 | export IAM=`whoami` 42 | sudo mkdir -p /user/hive && sudo chown -R $IAM /user/hive 43 | echo "done with setup" 44 | # Set up kafka 45 | echo "Setting up kafka" 46 | wait $KAFKA_DL_PID || echo "Kafka DL finished early" 47 | tar -xzf kafka_2.9.2-0.8.1.1.tgz 48 | cd kafka_2.9.2-0.8.1.1 49 | echo "Starting zookeeper" 50 | ./bin/zookeeper-server-start.sh config/zookeeper.properties & 51 | echo "Starting kafka" 52 | sleep 5 53 | ./bin/kafka-server-start.sh config/server.properties & 54 | sleep 5 55 | # publish a pandas topic to kafka 56 | ./bin/kafka-topics.sh --zookeeper localhost:2181 --topic pandas --partition 1 --replication-factor 1 --create 57 | ./bin/kafka-topics.sh --zookeeper localhost:2181 --topic logs --partition 1 --replication-factor 1 --create 58 | cd .. 59 | 60 | # set up flume 61 | wait $FLUME_DL_PID || echo "Flume DL finished early" 62 | echo "Setting up flume" 63 | tar -xf apache-flume-1.5.0.1-bin.tar.gz 64 | cd apache-flume-1.5.0.1-bin 65 | ./bin/flume-ng agent -n panda --conf-file ../files/flumeconf.cfg & 66 | disown $! 67 | cd .. 68 | echo $? 69 | -------------------------------------------------------------------------------- /src/R/finddistance.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | library("Imap") 3 | f <- file("stdin") 4 | open(f) 5 | while(length(line <- readLines(f,n=1)) > 0) { 6 | # process line 7 | contents <- Map(as.numeric, strsplit(line, ",")) 8 | mydist <- gdist(contents[[1]][1], contents[[1]][2], contents[[1]][3], contents[[1]][4], 9 | units="m", a=6378137.0, b=6356752.3142, verbose = FALSE) 10 | write(mydist, stdout()) 11 | } 12 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/BasicAvg.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates how to compute an average using aggregate in Java 3 | */ 4 | package com.oreilly.learningsparkexamples.java; 5 | 6 | import java.io.Serializable; 7 | import java.util.Arrays; 8 | import java.util.List; 9 | 10 | import org.apache.commons.lang.StringUtils; 11 | 12 | import org.apache.spark.api.java.JavaRDD; 13 | import org.apache.spark.api.java.JavaSparkContext; 14 | import org.apache.spark.api.java.function.Function2; 15 | 16 | public final class BasicAvg { 17 | public static class AvgCount implements Serializable { 18 | public AvgCount(int total, int num) { 19 | total_ = total; 20 | num_ = num; 21 | } 22 | public int total_; 23 | public int num_; 24 | public float avg() { 25 | return total_ / (float) num_; 26 | } 27 | } 28 | 29 | public static void main(String[] args) throws Exception { 30 | String master; 31 | if (args.length > 0) { 32 | master = args[0]; 33 | } else { 34 | master = "local"; 35 | } 36 | 37 | JavaSparkContext sc = new JavaSparkContext( 38 | master, "basicavg", 
System.getenv("SPARK_HOME"), System.getenv("JARS")); 39 | JavaRDD rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4)); 40 | Function2 addAndCount = new Function2() { 41 | @Override 42 | public AvgCount call(AvgCount a, Integer x) { 43 | a.total_ += x; 44 | a.num_ += 1; 45 | return a; 46 | } 47 | }; 48 | Function2 combine = new Function2() { 49 | @Override 50 | public AvgCount call(AvgCount a, AvgCount b) { 51 | a.total_ += b.total_; 52 | a.num_ += b.num_; 53 | return a; 54 | } 55 | }; 56 | AvgCount initial = new AvgCount(0,0); 57 | AvgCount result = rdd.aggregate(initial, addAndCount, combine); 58 | System.out.println(result.avg()); 59 | sc.stop(); 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/BasicAvgMapPartitions.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple map partitions in Java to compute the average 3 | */ 4 | package com.oreilly.learningsparkexamples.java; 5 | 6 | import java.util.Arrays; 7 | import java.util.ArrayList; 8 | import java.util.List; 9 | import java.util.Iterator; 10 | 11 | import org.apache.commons.lang.StringUtils; 12 | 13 | import org.eclipse.jetty.client.ContentExchange; 14 | import org.eclipse.jetty.client.HttpClient; 15 | 16 | 17 | import org.apache.spark.api.java.JavaRDD; 18 | import org.apache.spark.api.java.JavaSparkContext; 19 | import org.apache.spark.api.java.function.FlatMapFunction; 20 | import org.apache.spark.api.java.function.Function2; 21 | 22 | public final class BasicAvgMapPartitions { 23 | class AvgCount { 24 | public AvgCount() { 25 | total_ = 0; 26 | num_ = 0; 27 | } 28 | public AvgCount(Integer total, Integer num) { 29 | total_ = total; 30 | num_ = num; 31 | } 32 | public AvgCount merge(Iterable input) { 33 | for (Integer elem : input) { 34 | num_ += 1; 35 | total_ += elem; 36 | } 37 | return this; 38 | } 39 | public Integer total_; 40 | public Integer num_; 41 | public float avg() { 42 | return total_ / (float) num_; 43 | } 44 | } 45 | 46 | public static void main(String[] args) throws Exception { 47 | String master; 48 | if (args.length > 0) { 49 | master = args[0]; 50 | } else { 51 | master = "local"; 52 | } 53 | BasicAvgMapPartitions bamp = new BasicAvgMapPartitions(); 54 | bamp.run(master); 55 | } 56 | 57 | public void run(String master) { 58 | JavaSparkContext sc = new JavaSparkContext( 59 | master, "basicavgmappartitions", System.getenv("SPARK_HOME"), System.getenv("JARS")); 60 | JavaRDD rdd = sc.parallelize( 61 | Arrays.asList(1, 2, 3, 4, 5)); 62 | FlatMapFunction, AvgCount> setup = new FlatMapFunction, AvgCount>() { 63 | @Override 64 | public Iterable call(Iterator input) { 65 | AvgCount a = new AvgCount(0, 0); 66 | while (input.hasNext()) { 67 | a.total_ += input.next(); 68 | a.num_ += 1; 69 | } 70 | ArrayList ret = new ArrayList(); 71 | ret.add(a); 72 | return ret; 73 | } 74 | }; 75 | Function2 combine = new Function2() { 76 | @Override 77 | public AvgCount call(AvgCount a, AvgCount b) { 78 | a.total_ += b.total_; 79 | a.num_ += b.num_; 80 | return a; 81 | } 82 | }; 83 | 84 | AvgCount result = rdd.mapPartitions(setup).reduce(combine); 85 | System.out.println(result.avg()); 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/BasicAvgWithKryo.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates Kryo 
serialization in Java 3 | */ 4 | package com.oreilly.learningsparkexamples.java; 5 | 6 | import java.util.Arrays; 7 | import java.util.List; 8 | 9 | import org.apache.commons.lang.StringUtils; 10 | 11 | import org.apache.spark.SparkConf; 12 | import org.apache.spark.serializer.KryoRegistrator; 13 | import org.apache.spark.api.java.JavaRDD; 14 | import org.apache.spark.api.java.JavaSparkContext; 15 | import org.apache.spark.api.java.function.Function2; 16 | 17 | import com.esotericsoftware.kryo.Kryo; 18 | import com.esotericsoftware.kryo.serializers.FieldSerializer; 19 | 20 | public final class BasicAvgWithKryo { 21 | // This is our custom class we will configure Kyro to serialize 22 | static class AvgCount implements java.io.Serializable { 23 | public AvgCount() { 24 | total_ = 0; 25 | num_ = 0; 26 | } 27 | public AvgCount(int total, int num) { 28 | total_ = total; 29 | num_ = num; 30 | } 31 | public float avg() { 32 | return total_ / (float) num_; 33 | } 34 | public int total_; 35 | public int num_; 36 | } 37 | 38 | public static class AvgRegistrator implements KryoRegistrator { 39 | public void registerClasses(Kryo kryo) { 40 | kryo.register(AvgCount.class, new FieldSerializer(kryo, AvgCount.class)); 41 | } 42 | } 43 | 44 | public static void main(String[] args) throws Exception { 45 | String master; 46 | if (args.length > 0) { 47 | master = args[0]; 48 | } else { 49 | master = "local"; 50 | } 51 | 52 | SparkConf conf = new SparkConf().setMaster(master).setAppName("basicavgwithkyro"); 53 | conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); 54 | conf.set("spark.kryo.registrator", AvgRegistrator.class.getName()); 55 | JavaSparkContext sc = new JavaSparkContext(conf); 56 | JavaRDD rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4)); 57 | Function2 addAndCount = new Function2() { 58 | @Override 59 | public AvgCount call(AvgCount a, Integer x) { 60 | a.total_ += x; 61 | a.num_ += 1; 62 | return a; 63 | } 64 | }; 65 | Function2 combine = new Function2() { 66 | @Override 67 | public AvgCount call(AvgCount a, AvgCount b) { 68 | a.total_ += b.total_; 69 | a.num_ += b.num_; 70 | return a; 71 | } 72 | }; 73 | AvgCount initial = new AvgCount(0,0); 74 | AvgCount result = rdd.aggregate(initial, addAndCount, combine); 75 | System.out.println(result.avg()); 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/BasicFlatMap.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple flatMap in Java to extract the words 3 | */ 4 | package com.oreilly.learningsparkexamples.java; 5 | 6 | import java.util.Arrays; 7 | import java.util.List; 8 | import java.util.Map; 9 | import java.util.Map.Entry; 10 | 11 | 12 | import org.apache.commons.lang.StringUtils; 13 | 14 | import org.apache.spark.api.java.JavaRDD; 15 | import org.apache.spark.api.java.JavaPairRDD; 16 | import org.apache.spark.api.java.JavaSparkContext; 17 | import org.apache.spark.api.java.function.FlatMapFunction; 18 | 19 | public class BasicFlatMap { 20 | public static void main(String[] args) throws Exception { 21 | 22 | if (args.length != 2) { 23 | throw new Exception("Usage BasicFlatMap sparkMaster inputFile"); 24 | } 25 | 26 | JavaSparkContext sc = new JavaSparkContext( 27 | args[0], "basicflatmap", System.getenv("SPARK_HOME"), System.getenv("JARS")); 28 | JavaRDD rdd = sc.textFile(args[1]); 29 | JavaRDD words = rdd.flatMap( 30 | new FlatMapFunction() { public 
Iterable call(String x) { 31 | return Arrays.asList(x.split(" ")); 32 | }}); 33 | Map result = words.countByValue(); 34 | for (Entry entry: result.entrySet()) { 35 | System.out.println(entry.getKey() + ":" + entry.getValue()); 36 | } 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/BasicJoinCsv.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates joining two csv files 3 | */ 4 | package com.oreilly.learningsparkexamples.java; 5 | 6 | import java.io.StringReader; 7 | import java.util.Arrays; 8 | import java.util.List; 9 | import scala.Tuple2; 10 | 11 | import au.com.bytecode.opencsv.CSVReader; 12 | 13 | import org.apache.commons.lang.StringUtils; 14 | import org.apache.spark.api.java.JavaRDD; 15 | import org.apache.spark.api.java.JavaPairRDD; 16 | import org.apache.spark.api.java.JavaSparkContext; 17 | import org.apache.spark.api.java.function.PairFunction; 18 | 19 | public class BasicJoinCsv { 20 | 21 | public static class ParseLine implements PairFunction { 22 | public Tuple2 call(String line) throws Exception { 23 | CSVReader reader = new CSVReader(new StringReader(line)); 24 | String[] elements = reader.readNext(); 25 | Integer key = Integer.parseInt(elements[0]); 26 | return new Tuple2(key, elements); 27 | } 28 | } 29 | 30 | public static void main(String[] args) throws Exception { 31 | if (args.length != 3) { 32 | throw new Exception("Usage BasicJoinCsv sparkMaster csv1 csv2"); 33 | } 34 | String master = args[0]; 35 | String csv1 = args[1]; 36 | String csv2 = args[2]; 37 | BasicJoinCsv jsv = new BasicJoinCsv(); 38 | jsv.run(master, csv1, csv2); 39 | } 40 | 41 | public void run(String master, String csv1, String csv2) throws Exception { 42 | JavaSparkContext sc = new JavaSparkContext( 43 | master, "basicjoincsv", System.getenv("SPARK_HOME"), System.getenv("JARS")); 44 | JavaRDD csvFile1 = sc.textFile(csv1); 45 | JavaRDD csvFile2 = sc.textFile(csv2); 46 | JavaPairRDD keyedRDD1 = csvFile1.mapToPair(new ParseLine()); 47 | JavaPairRDD keyedRDD2 = csvFile1.mapToPair(new ParseLine()); 48 | JavaPairRDD> result = keyedRDD1.join(keyedRDD2); 49 | List>> resultCollection = result.collect(); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/BasicLoadJson.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates loading a json file and finding out if people like pandas 3 | */ 4 | package com.oreilly.learningsparkexamples.java; 5 | 6 | import java.io.StringReader; 7 | import java.util.ArrayList; 8 | import java.util.List; 9 | import java.util.Iterator; 10 | import java.lang.Iterable; 11 | import scala.Tuple2; 12 | 13 | import org.apache.commons.lang.StringUtils; 14 | import org.apache.spark.api.java.JavaRDD; 15 | import org.apache.spark.api.java.JavaPairRDD; 16 | import org.apache.spark.api.java.JavaSparkContext; 17 | import org.apache.spark.api.java.function.FlatMapFunction; 18 | import org.apache.spark.api.java.function.Function; 19 | 20 | import com.fasterxml.jackson.databind.ObjectMapper; 21 | import com.fasterxml.jackson.databind.ObjectWriter; 22 | 23 | public class BasicLoadJson { 24 | 25 | public static class Person implements java.io.Serializable { 26 | public String name; 27 | public Boolean lovesPandas; 28 | } 29 | 30 | public static class ParseJson implements 
FlatMapFunction, Person> { 31 | public Iterable call(Iterator lines) throws Exception { 32 | ArrayList people = new ArrayList(); 33 | ObjectMapper mapper = new ObjectMapper(); 34 | while (lines.hasNext()) { 35 | String line = lines.next(); 36 | try { 37 | people.add(mapper.readValue(line, Person.class)); 38 | } catch (Exception e) { 39 | // Skip invalid input 40 | } 41 | } 42 | return people; 43 | } 44 | } 45 | 46 | public static class LikesPandas implements Function { 47 | public Boolean call(Person person) { 48 | return person.lovesPandas; 49 | } 50 | } 51 | 52 | 53 | public static class WriteJson implements FlatMapFunction, String> { 54 | public Iterable call(Iterator people) throws Exception { 55 | ArrayList text = new ArrayList(); 56 | ObjectMapper mapper = new ObjectMapper(); 57 | while (people.hasNext()) { 58 | Person person = people.next(); 59 | text.add(mapper.writeValueAsString(person)); 60 | } 61 | return text; 62 | } 63 | } 64 | 65 | public static void main(String[] args) throws Exception { 66 | if (args.length != 3) { 67 | throw new Exception("Usage BasicLoadJson [sparkMaster] [jsoninput] [jsonoutput]"); 68 | } 69 | String master = args[0]; 70 | String fileName = args[1]; 71 | String outfile = args[2]; 72 | 73 | JavaSparkContext sc = new JavaSparkContext( 74 | master, "basicloadjson", System.getenv("SPARK_HOME"), System.getenv("JARS")); 75 | JavaRDD input = sc.textFile(fileName); 76 | JavaRDD result = input.mapPartitions(new ParseJson()).filter(new LikesPandas()); 77 | JavaRDD formatted = result.mapPartitions(new WriteJson()); 78 | formatted.saveAsTextFile(outfile); 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/BasicLoadSequenceFile.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates loading a sequence file of people and how many pandas they have seen 3 | */ 4 | package com.oreilly.learningsparkexamples.java; 5 | 6 | import java.util.List; 7 | import scala.Tuple2; 8 | 9 | import org.apache.spark.api.java.JavaPairRDD; 10 | import org.apache.spark.api.java.JavaSparkContext; 11 | import org.apache.spark.api.java.function.PairFunction; 12 | import org.apache.hadoop.io.IntWritable; 13 | import org.apache.hadoop.io.Text; 14 | 15 | public class BasicLoadSequenceFile { 16 | 17 | public static class ConvertToNativeTypes implements PairFunction, String, Integer> { 18 | public Tuple2 call(Tuple2 record) { 19 | return new Tuple2(record._1.toString(), record._2.get()); 20 | } 21 | } 22 | 23 | public static void main(String[] args) throws Exception { 24 | if (args.length != 2) { 25 | throw new Exception("Usage BasicLoadSequenceFile [sparkMaster] [input]"); 26 | } 27 | String master = args[0]; 28 | String fileName = args[1]; 29 | 30 | JavaSparkContext sc = new JavaSparkContext( 31 | master, "basicloadsequencefile", System.getenv("SPARK_HOME"), System.getenv("JARS")); 32 | JavaPairRDD input = sc.sequenceFile(fileName, Text.class, IntWritable.class); 33 | JavaPairRDD result = input.mapToPair(new ConvertToNativeTypes()); 34 | List> resultList = result.collect(); 35 | for (Tuple2 record : resultList) { 36 | System.out.println(record); 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/BasicLoadWholeCsv.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates joining 
two csv files 3 | */ 4 | package com.oreilly.learningsparkexamples.java; 5 | 6 | import java.io.StringReader; 7 | import java.util.Arrays; 8 | import java.util.ArrayList; 9 | import java.util.List; 10 | import scala.Tuple2; 11 | 12 | import au.com.bytecode.opencsv.CSVReader; 13 | 14 | import org.apache.commons.lang.StringUtils; 15 | import org.apache.spark.api.java.JavaRDD; 16 | import org.apache.spark.api.java.JavaPairRDD; 17 | import org.apache.spark.api.java.JavaSparkContext; 18 | import org.apache.spark.api.java.function.FlatMapFunction; 19 | import org.apache.spark.api.java.function.Function; 20 | 21 | public class BasicLoadWholeCsv { 22 | 23 | public static class ParseLine implements FlatMapFunction, String[]> { 24 | public Iterable call(Tuple2 file) throws Exception { 25 | CSVReader reader = new CSVReader(new StringReader(file._2())); 26 | return reader.readAll(); 27 | } 28 | } 29 | 30 | public static void main(String[] args) throws Exception { 31 | if (args.length != 3) { 32 | throw new Exception("Usage BasicLoadCsv sparkMaster csvInputFile csvOutputFile key"); 33 | } 34 | String master = args[0]; 35 | String csvInput = args[1]; 36 | String outputFile = args[2]; 37 | final String key = args[3]; 38 | 39 | JavaSparkContext sc = new JavaSparkContext( 40 | master, "loadwholecsv", System.getenv("SPARK_HOME"), System.getenv("JARS")); 41 | JavaPairRDD csvData = sc.wholeTextFiles(csvInput); 42 | JavaRDD keyedRDD = csvData.flatMap(new ParseLine()); 43 | JavaRDD result = 44 | keyedRDD.filter(new Function() { 45 | public Boolean call(String[] input) { return input[0].equals(key); }}); 46 | 47 | result.saveAsTextFile(outputFile); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/BasicMap.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple map in Java 3 | */ 4 | package com.oreilly.learningsparkexamples.java; 5 | 6 | import java.util.Arrays; 7 | import java.util.List; 8 | 9 | import org.apache.commons.lang.StringUtils; 10 | 11 | import org.apache.spark.api.java.JavaRDD; 12 | import org.apache.spark.api.java.JavaSparkContext; 13 | import org.apache.spark.api.java.function.Function; 14 | 15 | public class BasicMap { 16 | public static void main(String[] args) throws Exception { 17 | String master; 18 | if (args.length > 0) { 19 | master = args[0]; 20 | } else { 21 | master = "local"; 22 | } 23 | JavaSparkContext sc = new JavaSparkContext( 24 | master, "basicmap", System.getenv("SPARK_HOME"), System.getenv("JARS")); 25 | JavaRDD rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4)); 26 | JavaRDD result = rdd.map( 27 | new Function() { public Integer call(Integer x) { return x*x;}}); 28 | System.out.println(StringUtils.join(result.collect(), ",")); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/BasicMapPartitions.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple map in Java 3 | */ 4 | package com.oreilly.learningsparkexamples.java; 5 | 6 | import java.util.Arrays; 7 | import java.util.ArrayList; 8 | import java.util.List; 9 | import java.util.Iterator; 10 | 11 | import org.apache.commons.lang.StringUtils; 12 | 13 | import org.eclipse.jetty.client.ContentExchange; 14 | import org.eclipse.jetty.client.HttpClient; 15 | 16 | 17 | import 
org.apache.spark.api.java.JavaRDD; 18 | import org.apache.spark.api.java.JavaSparkContext; 19 | import org.apache.spark.api.java.function.FlatMapFunction; 20 | 21 | public class BasicMapPartitions { 22 | public static void main(String[] args) throws Exception { 23 | String master; 24 | if (args.length > 0) { 25 | master = args[0]; 26 | } else { 27 | master = "local"; 28 | } 29 | JavaSparkContext sc = new JavaSparkContext( 30 | master, "basicmappartitions", System.getenv("SPARK_HOME"), System.getenv("JARS")); 31 | JavaRDD rdd = sc.parallelize( 32 | Arrays.asList("KK6JKQ", "Ve3UoW", "kk6jlk", "W6BB")); 33 | JavaRDD result = rdd.mapPartitions( 34 | new FlatMapFunction, String>() { 35 | public Iterable call(Iterator input) { 36 | ArrayList content = new ArrayList(); 37 | ArrayList cea = new ArrayList(); 38 | HttpClient client = new HttpClient(); 39 | try { 40 | client.start(); 41 | while (input.hasNext()) { 42 | ContentExchange exchange = new ContentExchange(true); 43 | exchange.setURL("http://qrzcq.com/call/" + input.next()); 44 | client.send(exchange); 45 | cea.add(exchange); 46 | } 47 | for (ContentExchange exchange : cea) { 48 | exchange.waitForDone(); 49 | content.add(exchange.getResponseContent()); 50 | } 51 | } catch (Exception e) { 52 | } 53 | return content; 54 | }}); 55 | System.out.println(StringUtils.join(result.collect(), ",")); 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/BasicMapThenFilter.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple map then filter in Java 3 | */ 4 | package com.oreilly.learningsparkexamples.java; 5 | 6 | import java.util.Arrays; 7 | import java.util.List; 8 | 9 | import org.apache.commons.lang.StringUtils; 10 | 11 | import org.apache.spark.api.java.JavaRDD; 12 | import org.apache.spark.api.java.JavaSparkContext; 13 | import org.apache.spark.api.java.function.Function; 14 | 15 | public class BasicMapThenFilter { 16 | public static void main(String[] args) throws Exception { 17 | String master; 18 | if (args.length > 0) { 19 | master = args[0]; 20 | } else { 21 | master = "local"; 22 | } 23 | JavaSparkContext sc = new JavaSparkContext( 24 | master, "basicmapfilter", System.getenv("SPARK_HOME"), System.getenv("JARS")); 25 | JavaRDD rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4)); 26 | JavaRDD squared = rdd.map( 27 | new Function() { public Integer call(Integer x) { return x*x;}}); 28 | JavaRDD result = squared.filter( 29 | new Function() { public Boolean call(Integer x) { return x != 1; }}); 30 | System.out.println(StringUtils.join(result.collect(), ",")); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/BasicMapToDouble.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple map to double in Java 3 | */ 4 | package com.oreilly.learningsparkexamples.java; 5 | 6 | import java.util.Arrays; 7 | import java.util.List; 8 | 9 | import org.apache.commons.lang.StringUtils; 10 | 11 | import org.apache.spark.api.java.JavaRDD; 12 | import org.apache.spark.api.java.JavaDoubleRDD; 13 | import org.apache.spark.api.java.JavaSparkContext; 14 | import org.apache.spark.api.java.function.DoubleFunction; 15 | 16 | public class BasicMapToDouble { 17 | public static void main(String[] args) throws Exception { 18 | String master; 19 | 
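// Use the first command-line argument as the master URL when one is given; otherwise fall back to the local scheduler so the example runs without a cluster.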
if (args.length > 0) { 20 | master = args[0]; 21 | } else { 22 | master = "local"; 23 | } 24 | JavaSparkContext sc = new JavaSparkContext( 25 | master, "basicmaptodouble", System.getenv("SPARK_HOME"), System.getenv("JARS")); 26 | JavaRDD rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4)); 27 | JavaDoubleRDD result = rdd.mapToDouble( 28 | new DoubleFunction() { 29 | public double call(Integer x) { 30 | double y = (double) x; 31 | return y * y; 32 | } 33 | }); 34 | System.out.println(StringUtils.join(result.collect(), ",")); 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/BasicQueryCassandra.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates loading a json file and finding out if people like pandas 3 | */ 4 | package com.oreilly.learningsparkexamples.java; 5 | import java.io.Serializable; 6 | 7 | import java.io.StringReader; 8 | import java.util.ArrayList; 9 | import java.util.List; 10 | import java.util.Iterator; 11 | import java.lang.Iterable; 12 | import scala.Tuple2; 13 | 14 | import org.apache.commons.lang.StringUtils; 15 | import org.apache.spark.SparkConf; 16 | import org.apache.spark.api.java.JavaRDD; 17 | import org.apache.spark.api.java.JavaPairRDD; 18 | import org.apache.spark.api.java.JavaSparkContext; 19 | import org.apache.spark.api.java.function.DoubleFunction; 20 | import org.apache.spark.api.java.function.FlatMapFunction; 21 | import org.apache.spark.api.java.function.Function; 22 | 23 | import com.datastax.spark.connector.CassandraRow; 24 | import static com.datastax.spark.connector.CassandraJavaUtil.javaFunctions; 25 | 26 | public class BasicQueryCassandra { 27 | public static void main(String[] args) throws Exception { 28 | if (args.length != 2) { 29 | throw new Exception("Usage BasicLoadJson [sparkMaster] [cassandraHost]"); 30 | } 31 | String sparkMaster = args[0]; 32 | String cassandraHost = args[1]; 33 | SparkConf conf = new SparkConf(true) 34 | .set("spark.cassandra.connection.host", cassandraHost); 35 | 36 | JavaSparkContext sc = new JavaSparkContext( 37 | sparkMaster, "basicquerycassandra", conf); 38 | // entire table as an RDD 39 | // assumes your table test was created as CREATE TABLE test.kv(key text PRIMARY KEY, value int); 40 | JavaRDD data = javaFunctions(sc).cassandraTable("test" , "kv"); 41 | // print some basic stats 42 | System.out.println(data.mapToDouble(new DoubleFunction() { 43 | public double call(CassandraRow row) { 44 | return row.getInt("value"); 45 | }}).stats()); 46 | // write some basic data to Cassandra 47 | ArrayList input = new ArrayList(); 48 | input.add(KeyValue.newInstance("mostmagic", 3)); 49 | JavaRDD kvRDD = sc.parallelize(input); 50 | javaFunctions(kvRDD, KeyValue.class).saveToCassandra("test", "kv"); 51 | } 52 | public static class KeyValue implements Serializable { 53 | private String key; 54 | private Integer value; 55 | public KeyValue() { 56 | } 57 | public static KeyValue newInstance(String k, Integer v) { 58 | KeyValue kv = new KeyValue(); 59 | kv.setKey(k); 60 | kv.setValue(v); 61 | return kv; 62 | } 63 | public String getKey() { 64 | return key; 65 | } 66 | public Integer getValue() { 67 | return value; 68 | } 69 | void setKey(String k) { 70 | this.key = k; 71 | } 72 | void setValue(Integer v) { 73 | this.value = v; 74 | } 75 | } 76 | } 77 | 78 | -------------------------------------------------------------------------------- 
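The Cassandra example above only prints the whole StatCounter returned by stats(). Below is a minimal, self-contained sketch of reading individual figures off a StatCounter, using the same Spark API that the RemoveOutliers example later in this listing relies on (the class name and sample numbers are ours):

import java.util.Arrays;

import org.apache.spark.api.java.JavaDoubleRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.util.StatCounter;

public class StatCounterSketch {
  public static void main(String[] args) {
    JavaSparkContext sc = new JavaSparkContext("local", "statcountersketch");
    JavaDoubleRDD values = sc.parallelizeDoubles(Arrays.asList(1.0, 2.0, 3.0, 1000.0));
    // stats() computes count, mean, min, max and variance in a single pass over the data.
    StatCounter stats = values.stats();
    System.out.println("count=" + stats.count() + " mean=" + stats.mean()
        + " stdev=" + Math.sqrt(stats.variance()));
    sc.stop();
  }
}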
/src/main/java/com/oreilly/learningsparkexamples/java/BasicSaveSequenceFile.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates saving a sequence file in Java using the old style hadoop APIs. 3 | */ 4 | package com.oreilly.learningsparkexamples.java; 5 | 6 | import java.util.ArrayList; 7 | import java.util.List; 8 | import scala.Tuple2; 9 | 10 | import org.apache.spark.api.java.JavaPairRDD; 11 | import org.apache.spark.api.java.JavaSparkContext; 12 | import org.apache.spark.api.java.function.PairFunction; 13 | import org.apache.hadoop.io.IntWritable; 14 | import org.apache.hadoop.io.Text; 15 | import org.apache.hadoop.mapred.SequenceFileOutputFormat; 16 | 17 | public class BasicSaveSequenceFile { 18 | 19 | public static class ConvertToWritableTypes implements PairFunction, Text, IntWritable> { 20 | public Tuple2 call(Tuple2 record) { 21 | return new Tuple2(new Text(record._1), new IntWritable(record._2)); 22 | } 23 | } 24 | 25 | public static void main(String[] args) throws Exception { 26 | if (args.length != 2) { 27 | throw new Exception("Usage BasicSaveSequenceFile [sparkMaster] [output]"); 28 | } 29 | String master = args[0]; 30 | String fileName = args[1]; 31 | 32 | JavaSparkContext sc = new JavaSparkContext( 33 | master, "basicloadsequencefile", System.getenv("SPARK_HOME"), System.getenv("JARS")); 34 | List> input = new ArrayList(); 35 | input.add(new Tuple2("coffee", 1)); 36 | input.add(new Tuple2("coffee", 2)); 37 | input.add(new Tuple2("pandas", 3)); 38 | JavaPairRDD rdd = sc.parallelizePairs(input); 39 | JavaPairRDD result = rdd.mapToPair(new ConvertToWritableTypes()); 40 | result.saveAsHadoopFile(fileName, Text.class, IntWritable.class, SequenceFileOutputFormat.class); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/BasicSum.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple fold in Java 3 | */ 4 | package com.oreilly.learningsparkexamples.java; 5 | 6 | import java.util.Arrays; 7 | import java.util.List; 8 | 9 | import org.apache.commons.lang.StringUtils; 10 | 11 | import org.apache.spark.api.java.JavaRDD; 12 | import org.apache.spark.api.java.JavaSparkContext; 13 | import org.apache.spark.api.java.function.Function2; 14 | 15 | public class BasicSum { 16 | public static void main(String[] args) throws Exception { 17 | String master; 18 | if (args.length > 0) { 19 | master = args[0]; 20 | } else { 21 | master = "local"; 22 | } 23 | JavaSparkContext sc = new JavaSparkContext( 24 | master, "basicmap", System.getenv("SPARK_HOME"), System.getenv("JARS")); 25 | JavaRDD rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4)); 26 | Integer result = rdd.fold(0, new Function2() { 27 | public Integer call(Integer x, Integer y) { return x + y;}}); 28 | System.out.println(result); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/CallLog.java: -------------------------------------------------------------------------------- 1 | package com.oreilly.learningsparkexamples.java; 2 | 3 | import java.io.Serializable; 4 | 5 | public class CallLog implements Serializable { 6 | public String callsign; 7 | public Double contactlat; 8 | public Double contactlong; 9 | public Double mylat; 10 | public Double mylong; 11 | } 12 | 
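CallLog is a plain serializable bean with public fields. A minimal sketch of filling one in from a line of JSON with Jackson, following the same ObjectMapper pattern as the BasicLoadJson example above (the sample line, its values and the class name are made up for illustration, and the class is assumed to sit in the same package as CallLog):

import com.fasterxml.jackson.databind.ObjectMapper;

public class CallLogParseSketch {
  public static void main(String[] args) throws Exception {
    ObjectMapper mapper = new ObjectMapper();
    // Jackson binds the JSON fields straight onto CallLog's public fields.
    String line = "{\"callsign\":\"KK6JKQ\",\"contactlat\":37.4,\"contactlong\":-122.1,"
        + "\"mylat\":37.8,\"mylong\":-122.3}";
    CallLog call = mapper.readValue(line, CallLog.class);
    System.out.println(call.callsign + " contacted from " + call.mylat + "," + call.mylong);
  }
}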
-------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/HappyPerson.java: -------------------------------------------------------------------------------- 1 | package com.oreilly.learningsparkexamples.java; 2 | import java.io.Serializable; 3 | 4 | 5 | class HappyPerson implements Serializable { 6 | private String name; 7 | private String favouriteBeverage; 8 | public HappyPerson() {} 9 | public HappyPerson(String n, String b) { 10 | name = n; favouriteBeverage = b; 11 | } 12 | public String getName() { return name; } 13 | public void setName(String n) { name = n; } 14 | public String getFavouriteBeverage() { return favouriteBeverage; } 15 | public void setFavouriteBeverage(String b) { favouriteBeverage = b; } 16 | }; 17 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/IntersectByKey.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple map in Java 3 | */ 4 | package com.oreilly.learningsparkexamples.java; 5 | 6 | import java.util.ArrayList; 7 | import java.util.Arrays; 8 | import java.util.List; 9 | import java.util.Map; 10 | import java.util.Map.Entry; 11 | 12 | import com.google.common.collect.Iterables; 13 | 14 | import scala.Tuple2; 15 | 16 | import org.apache.commons.lang.StringUtils; 17 | 18 | import org.apache.spark.api.java.JavaRDD; 19 | import org.apache.spark.api.java.JavaPairRDD; 20 | import org.apache.spark.api.java.JavaSparkContext; 21 | import org.apache.spark.api.java.function.Function; 22 | import org.apache.spark.api.java.function.Function2; 23 | import org.apache.spark.api.java.function.FlatMapFunction; 24 | 25 | public final class IntersectByKey { 26 | public static JavaPairRDD intersectByKey(JavaPairRDD rdd1, JavaPairRDD rdd2) { 27 | JavaPairRDD, Iterable>> grouped = rdd1.cogroup(rdd2); 28 | return grouped.flatMapValues(new Function, Iterable>, Iterable>() { 29 | @Override 30 | public Iterable call(Tuple2, Iterable> input) { 31 | ArrayList al = new ArrayList(); 32 | if (!Iterables.isEmpty(input._1()) && !Iterables.isEmpty(input._2())) { 33 | Iterables.addAll(al, input._1()); 34 | Iterables.addAll(al, input._2()); 35 | } 36 | return al; 37 | } 38 | }); 39 | } 40 | public static void main(String[] args) throws Exception { 41 | String master; 42 | if (args.length > 0) { 43 | master = args[0]; 44 | } else { 45 | master = "local"; 46 | } 47 | 48 | JavaSparkContext sc = new JavaSparkContext( 49 | master, "IntersectByKey", System.getenv("SPARK_HOME"), System.getenv("JARS")); 50 | List> input1 = new ArrayList(); 51 | input1.add(new Tuple2("coffee", 1)); 52 | input1.add(new Tuple2("coffee", 2)); 53 | input1.add(new Tuple2("pandas", 3)); 54 | List> input2 = new ArrayList(); 55 | input2.add(new Tuple2("pandas", 20)); 56 | JavaPairRDD rdd1 = sc.parallelizePairs(input1); 57 | JavaPairRDD rdd2 = sc.parallelizePairs(input2); 58 | JavaPairRDD result = intersectByKey(rdd1, rdd2); 59 | for (Tuple2 entry : result.collect()) { 60 | System.out.println(entry._1() + ":" + entry._2()); 61 | } 62 | System.out.println("Done"); 63 | sc.stop(); 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/KafkaInput.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple map then filter in Java 3 | */ 4 | 
*/ 4 |
package com.oreilly.learningsparkexamples.java; 5 | 6 | import java.util.Arrays; 7 | import java.util.HashMap; 8 | import java.util.List; 9 | import java.util.Map; 10 | 11 | import org.apache.commons.lang.StringUtils; 12 | 13 | import org.apache.spark.SparkConf; 14 | import org.apache.spark.api.java.JavaRDD; 15 | import org.apache.spark.api.java.JavaSparkContext; 16 | import org.apache.spark.api.java.function.Function; 17 | import org.apache.spark.streaming.api.java.JavaPairDStream; 18 | import org.apache.spark.streaming.api.java.JavaStreamingContext; 19 | import org.apache.spark.streaming.Duration; 20 | import org.apache.spark.streaming.kafka.*; 21 | 22 | public final class KafkaInput { 23 | public static void main(String[] args) throws Exception { 24 | String zkQuorum = args[0]; 25 | String group = args[1]; 26 | SparkConf conf = new SparkConf().setAppName("KafkaInput"); 27 | // Create a StreamingContext with a 1 second batch size 28 | JavaStreamingContext jssc = new JavaStreamingContext(conf, new Duration(1000)); 29 | Map topics = new HashMap(); 30 | topics.put("pandas", 1); 31 | JavaPairDStream input = KafkaUtils.createStream(jssc, zkQuorum, group, topics); 32 | input.print(); 33 | // start our streaming context and wait for it to "finish" 34 | jssc.start(); 35 | // Wait for 10 seconds then exit. To run forever call without a timeout 36 | jssc.awaitTermination(10000); 37 | // Stop the streaming context 38 | jssc.stop(); 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/KeyValueMapFilter.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates how to make a PairRDD then do a basic filter 3 | */ 4 | package com.oreilly.learningsparkexamples.java; 5 | 6 | import java.util.ArrayList; 7 | import java.util.Arrays; 8 | import java.util.List; 9 | import java.util.Map; 10 | import java.util.Map.Entry; 11 | 12 | import scala.Tuple2; 13 | 14 | import org.apache.commons.lang.StringUtils; 15 | 16 | import org.apache.spark.api.java.JavaRDD; 17 | import org.apache.spark.api.java.JavaPairRDD; 18 | import org.apache.spark.api.java.JavaSparkContext; 19 | import org.apache.spark.api.java.function.Function; 20 | import org.apache.spark.api.java.function.PairFunction; 21 | 22 | public final class KeyValueMapFilter { 23 | 24 | public static void main(String[] args) throws Exception { 25 | if (args.length != 2) { 26 | throw new Exception("Usage KeyValueMapFilter sparkMaster inputFile"); 27 | } 28 | String master = args[0]; 29 | String inputFile = args[1]; 30 | 31 | JavaSparkContext sc = new JavaSparkContext( 32 | master, "KeyValueMapFilter", System.getenv("SPARK_HOME"), System.getenv("JARS")); 33 | JavaRDD input = sc.textFile(inputFile); 34 | PairFunction keyData = new PairFunction() { 35 | @Override 36 | public Tuple2 call(String x) { 37 | return new Tuple2(x.split(" ")[0], x); 38 | } 39 | }; 40 | Function, Boolean> longWordFilter = new Function, Boolean>() { 41 | @Override 42 | public Boolean call(Tuple2 input) { 43 | return (input._2().length() < 20); 44 | } 45 | }; 46 | JavaPairRDD rdd = input.mapToPair(keyData); 47 | JavaPairRDD result = rdd.filter(longWordFilter); 48 | Map resultMap = result.collectAsMap(); 49 | for (Entry entry : resultMap.entrySet()) { 50 | System.out.println(entry.getKey() + ":" + entry.getValue()); 51 | } 52 | } 53 | } 54 | -------------------------------------------------------------------------------- 
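On Java 8, the anonymous classes in KeyValueMapFilter above collapse into lambdas, since the Spark Java function types used by these examples are single-method interfaces. A minimal sketch of the same keying-then-filtering logic (the class name is ours; the same two command-line arguments, master and input file, are assumed):

import java.util.Map;

import scala.Tuple2;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class KeyValueMapFilterLambdaSketch {
  public static void main(String[] args) {
    JavaSparkContext sc = new JavaSparkContext(args[0], "KeyValueMapFilterLambda");
    JavaRDD<String> input = sc.textFile(args[1]);
    // Key each line by its first word, then keep only the short lines.
    JavaPairRDD<String, String> result = input
        .mapToPair(line -> new Tuple2<String, String>(line.split(" ")[0], line))
        .filter(pair -> pair._2().length() < 20);
    for (Map.Entry<String, String> entry : result.collectAsMap().entrySet()) {
      System.out.println(entry.getKey() + ":" + entry.getValue());
    }
    sc.stop();
  }
}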
/src/main/java/com/oreilly/learningsparkexamples/java/LoadHive.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates loading data from Hive with Spark SQL 3 | */ 4 | package com.oreilly.learningsparkexamples.java; 5 | 6 | import java.io.StringReader; 7 | import java.util.Arrays; 8 | import java.util.ArrayList; 9 | import java.util.List; 10 | import scala.Tuple2; 11 | 12 | import au.com.bytecode.opencsv.CSVReader; 13 | 14 | import org.apache.commons.lang.StringUtils; 15 | import org.apache.spark.api.java.JavaRDD; 16 | import org.apache.spark.api.java.JavaPairRDD; 17 | import org.apache.spark.api.java.JavaSparkContext; 18 | import org.apache.spark.api.java.function.FlatMapFunction; 19 | import org.apache.spark.api.java.function.Function; 20 | import org.apache.spark.sql.SQLContext; 21 | import org.apache.spark.sql.Row; 22 | import org.apache.spark.sql.DataFrame; 23 | 24 | public class LoadHive { 25 | 26 | public static class SquareKey implements Function { 27 | public Integer call(Row row) throws Exception { 28 | return row.getInt(0) * row.getInt(0); 29 | } 30 | } 31 | 32 | public static void main(String[] args) throws Exception { 33 | if (args.length != 3) { 34 | throw new Exception("Usage LoadHive sparkMaster tbl"); 35 | } 36 | String master = args[0]; 37 | String tbl = args[1]; 38 | 39 | JavaSparkContext sc = new JavaSparkContext( 40 | master, "loadhive", System.getenv("SPARK_HOME"), System.getenv("JARS")); 41 | SQLContext sqlCtx = new SQLContext(sc); 42 | DataFrame rdd = sqlCtx.sql("SELECT key, value FROM src"); 43 | JavaRDD squaredKeys = rdd.toJavaRDD().map(new SquareKey()); 44 | List result = squaredKeys.collect(); 45 | for (Integer elem : result) { 46 | System.out.println(elem); 47 | } 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/LoadJsonWithSparkSQL.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates loading data from Hive with Spark SQL 3 | */ 4 | package com.oreilly.learningsparkexamples.java; 5 | 6 | import java.io.StringReader; 7 | import java.util.Arrays; 8 | import java.util.ArrayList; 9 | import java.util.List; 10 | import scala.Tuple2; 11 | 12 | import au.com.bytecode.opencsv.CSVReader; 13 | 14 | import org.apache.commons.lang.StringUtils; 15 | import org.apache.spark.api.java.JavaRDD; 16 | import org.apache.spark.api.java.JavaPairRDD; 17 | import org.apache.spark.api.java.JavaSparkContext; 18 | import org.apache.spark.api.java.function.FlatMapFunction; 19 | import org.apache.spark.api.java.function.Function; 20 | import org.apache.spark.sql.SQLContext; 21 | import org.apache.spark.sql.Row; 22 | import org.apache.spark.sql.DataFrame; 23 | 24 | public class LoadJsonWithSparkSQL { 25 | 26 | 27 | public static void main(String[] args) throws Exception { 28 | if (args.length != 2) { 29 | throw new Exception("Usage LoadJsonWithSparkSQL sparkMaster jsonFile"); 30 | } 31 | String master = args[0]; 32 | String jsonFile = args[1]; 33 | 34 | JavaSparkContext sc = new JavaSparkContext( 35 | master, "loadJsonwithsparksql"); 36 | SQLContext sqlCtx = new SQLContext(sc); 37 | DataFrame input = sqlCtx.jsonFile(jsonFile); 38 | input.printSchema(); 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/MLlib.java: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.oreilly.learningsparkexamples.java; 19 | 20 | import java.util.Arrays; 21 | 22 | import org.apache.spark.SparkConf; 23 | import org.apache.spark.api.java.JavaRDD; 24 | import org.apache.spark.api.java.JavaSparkContext; 25 | import org.apache.spark.api.java.function.Function; 26 | 27 | import org.apache.spark.mllib.classification.LogisticRegressionModel; 28 | import org.apache.spark.mllib.classification.LogisticRegressionWithSGD; 29 | import org.apache.spark.mllib.feature.HashingTF; 30 | import org.apache.spark.mllib.linalg.Vector; 31 | import org.apache.spark.mllib.regression.LabeledPoint; 32 | 33 | public final class MLlib { 34 | 35 | public static void main(String[] args) { 36 | SparkConf sparkConf = new SparkConf().setAppName("JavaBookExample"); 37 | JavaSparkContext sc = new JavaSparkContext(sparkConf); 38 | 39 | // Load 2 types of emails from text files: spam and ham (non-spam). 40 | // Each line has text from one email. 41 | JavaRDD spam = sc.textFile("files/spam.txt"); 42 | JavaRDD ham = sc.textFile("files/ham.txt"); 43 | 44 | // Create a HashingTF instance to map email text to vectors of 100 features. 45 | final HashingTF tf = new HashingTF(100); 46 | 47 | // Each email is split into words, and each word is mapped to one feature. 48 | // Create LabeledPoint datasets for positive (spam) and negative (ham) examples. 49 | JavaRDD positiveExamples = spam.map(new Function() { 50 | @Override public LabeledPoint call(String email) { 51 | return new LabeledPoint(1, tf.transform(Arrays.asList(email.split(" ")))); 52 | } 53 | }); 54 | JavaRDD negativeExamples = ham.map(new Function() { 55 | @Override public LabeledPoint call(String email) { 56 | return new LabeledPoint(0, tf.transform(Arrays.asList(email.split(" ")))); 57 | } 58 | }); 59 | JavaRDD trainingData = positiveExamples.union(negativeExamples); 60 | trainingData.cache(); // Cache data since Logistic Regression is an iterative algorithm. 61 | 62 | // Create a Logistic Regression learner which uses the LBFGS optimizer. 63 | LogisticRegressionWithSGD lrLearner = new LogisticRegressionWithSGD(); 64 | // Run the actual learning algorithm on the training data. 65 | LogisticRegressionModel model = lrLearner.run(trainingData.rdd()); 66 | 67 | // Test on a positive example (spam) and a negative one (ham). 68 | // First apply the same HashingTF feature transformation used on the training data. 
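// HashingTF hashes each word into one of the 100 buckets configured above, so the test emails
// land in exactly the same feature space as the training data without building a shared vocabulary.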
69 | Vector posTestExample = 70 | tf.transform(Arrays.asList("O M G GET cheap stuff by sending money to ...".split(" "))); 71 | Vector negTestExample = 72 | tf.transform(Arrays.asList("Hi Dad, I started studying Spark the other ...".split(" "))); 73 | // Now use the learned model to predict spam/ham for new emails. 74 | System.out.println("Prediction for positive test example: " + model.predict(posTestExample)); 75 | System.out.println("Prediction for negative test example: " + model.predict(negTestExample)); 76 | 77 | sc.stop(); 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/PerKeyAvg.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple map in Java 3 | */ 4 | package com.oreilly.learningsparkexamples.java; 5 | 6 | import java.util.ArrayList; 7 | import java.util.Arrays; 8 | import java.util.List; 9 | import java.util.Map; 10 | import java.util.Map.Entry; 11 | 12 | import scala.Tuple2; 13 | 14 | import org.apache.commons.lang.StringUtils; 15 | 16 | import org.apache.spark.api.java.JavaRDD; 17 | import org.apache.spark.api.java.JavaPairRDD; 18 | import org.apache.spark.api.java.JavaSparkContext; 19 | import org.apache.spark.api.java.function.Function; 20 | import org.apache.spark.api.java.function.Function2; 21 | 22 | public final class PerKeyAvg { 23 | public static class AvgCount implements java.io.Serializable { 24 | public AvgCount(int total, int num) { 25 | total_ = total; 26 | num_ = num; 27 | } 28 | public int total_; 29 | public int num_; 30 | public float avg() { 31 | return total_ / (float) num_; 32 | } 33 | } 34 | public static void main(String[] args) throws Exception { 35 | String master; 36 | if (args.length > 0) { 37 | master = args[0]; 38 | } else { 39 | master = "local"; 40 | } 41 | 42 | JavaSparkContext sc = new JavaSparkContext( 43 | master, "PerKeyAvg", System.getenv("SPARK_HOME"), System.getenv("JARS")); 44 | List> input = new ArrayList(); 45 | input.add(new Tuple2("coffee", 1)); 46 | input.add(new Tuple2("coffee", 2)); 47 | input.add(new Tuple2("pandas", 3)); 48 | JavaPairRDD rdd = sc.parallelizePairs(input); 49 | Function createAcc = new Function() { 50 | @Override 51 | public AvgCount call(Integer x) { 52 | return new AvgCount(x, 1); 53 | } 54 | }; 55 | Function2 addAndCount = new Function2() { 56 | @Override 57 | public AvgCount call(AvgCount a, Integer x) { 58 | a.total_ += x; 59 | a.num_ += 1; 60 | return a; 61 | } 62 | }; 63 | Function2 combine = new Function2() { 64 | @Override 65 | public AvgCount call(AvgCount a, AvgCount b) { 66 | a.total_ += b.total_; 67 | a.num_ += b.num_; 68 | return a; 69 | } 70 | }; 71 | AvgCount initial = new AvgCount(0,0); 72 | JavaPairRDD avgCounts = rdd.combineByKey(createAcc, addAndCount, combine); 73 | Map countMap = avgCounts.collectAsMap(); 74 | for (Entry entry : countMap.entrySet()) { 75 | System.out.println(entry.getKey() + ":" + entry.getValue().avg()); 76 | } 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/RemoveOutliers.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates remove outliers in Java using summary Stats 3 | */ 4 | package com.oreilly.learningsparkexamples.java; 5 | 6 | import java.util.Arrays; 7 | import java.util.List; 8 | 9 | import org.apache.commons.lang.StringUtils; 10 | 
11 | import org.apache.spark.api.java.JavaRDD; 12 | import org.apache.spark.api.java.JavaDoubleRDD; 13 | import org.apache.spark.api.java.JavaSparkContext; 14 | import org.apache.spark.api.java.function.Function; 15 | import org.apache.spark.util.StatCounter; 16 | 17 | public class RemoveOutliers { 18 | public static void main(String[] args) { 19 | String master; 20 | if (args.length > 0) { 21 | master = args[0]; 22 | } else { 23 | master = "local"; 24 | } 25 | JavaSparkContext sc = new JavaSparkContext( 26 | master, "basicmap", System.getenv("SPARK_HOME"), System.getenv("JARS")); 27 | JavaDoubleRDD input = sc.parallelizeDoubles(Arrays.asList(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 1000.0)); 28 | JavaDoubleRDD result = removeOutliers(input); 29 | System.out.println(StringUtils.join(result.collect(), ",")); 30 | } 31 | static JavaDoubleRDD removeOutliers(JavaDoubleRDD rdd) { 32 | final StatCounter summaryStats = rdd.stats(); 33 | final Double stddev = Math.sqrt(summaryStats.variance()); 34 | return rdd.filter(new Function() { public Boolean call(Double x) { 35 | return (Math.abs(x - summaryStats.mean()) < 3 * stddev); 36 | }}); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/SparkSQLTwitter.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Load some tweets stored as JSON data and explore them. 3 | */ 4 | package com.oreilly.learningsparkexamples.java; 5 | 6 | import java.util.Arrays; 7 | import java.util.ArrayList; 8 | import java.util.List; 9 | 10 | import org.apache.commons.lang.StringUtils; 11 | 12 | import org.apache.spark.SparkConf; 13 | import org.apache.spark.api.java.JavaRDD; 14 | import org.apache.spark.api.java.JavaSparkContext; 15 | import org.apache.spark.api.java.function.Function; 16 | import org.apache.spark.sql.SQLContext; 17 | import org.apache.spark.sql.DataFrame; 18 | import org.apache.spark.sql.Row; 19 | import org.apache.spark.sql.api.java.UDF1; 20 | import org.apache.spark.sql.types.DataTypes; 21 | 22 | public class SparkSQLTwitter { 23 | public static void main(String[] args) { 24 | String inputFile = args[0]; 25 | SparkConf conf = new SparkConf(); 26 | JavaSparkContext sc = new JavaSparkContext(conf); 27 | SQLContext sqlCtx = new SQLContext(sc); 28 | DataFrame input = sqlCtx.jsonFile(inputFile); 29 | // Print the schema 30 | input.printSchema(); 31 | // Register the input schema RDD 32 | input.registerTempTable("tweets"); 33 | // Select tweets based on the retweetCount 34 | DataFrame topTweets = sqlCtx.sql("SELECT text, retweetCount FROM tweets ORDER BY retweetCount LIMIT 10"); 35 | Row[] result = topTweets.collect(); 36 | for (Row row : result) { 37 | System.out.println(row.get(0)); 38 | } 39 | JavaRDD topTweetText = topTweets.toJavaRDD().map(new Function() { 40 | public String call(Row row) { 41 | return row.getString(0); 42 | }}); 43 | System.out.println(topTweetText.collect()); 44 | // Create a person and turn it into a Schema RDD 45 | ArrayList peopleList = new ArrayList(); 46 | peopleList.add(new HappyPerson("holden", "coffee")); 47 | JavaRDD happyPeopleRDD = sc.parallelize(peopleList); 48 | DataFrame happyPeopleSchemaRDD = sqlCtx.applySchema(happyPeopleRDD, HappyPerson.class); 49 | happyPeopleSchemaRDD.registerTempTable("happy_people"); 50 | sqlCtx.udf().register("stringLengthJava", new UDF1() { 51 | @Override 52 | public Integer call(String str) throws Exception { 53 | return str.length(); 54 | } 55 
| }, DataTypes.IntegerType); 56 | DataFrame tweetLength = sqlCtx.sql("SELECT stringLengthJava(text) FROM tweets LIMIT 10"); 57 | Row[] lengths = tweetLength.collect(); 58 | for (Row row : lengths) { 59 | System.out.println(row.get(0)); 60 | } 61 | sc.stop(); 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/StreamingLogInput.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple map then filter in Java 3 | */ 4 | package com.oreilly.learningsparkexamples.java; 5 | 6 | import java.util.Arrays; 7 | import java.util.List; 8 | 9 | import org.apache.commons.lang.StringUtils; 10 | 11 | import org.apache.spark.api.java.JavaRDD; 12 | import org.apache.spark.api.java.JavaSparkContext; 13 | import org.apache.spark.api.java.function.Function; 14 | import org.apache.spark.streaming.api.java.JavaDStream; 15 | import org.apache.spark.streaming.api.java.JavaStreamingContext; 16 | import org.apache.spark.streaming.Duration; 17 | 18 | public class StreamingLogInput { 19 | public static void main(String[] args) throws Exception { 20 | String master = args[0]; 21 | JavaSparkContext sc = new JavaSparkContext(master, "StreamingLogInput"); 22 | // Create a StreamingContext with a 1 second batch size 23 | JavaStreamingContext jssc = new JavaStreamingContext(sc, new Duration(1000)); 24 | // Create a DStream from all the input on port 7777 25 | JavaDStream<String> lines = jssc.socketTextStream("localhost", 7777); 26 | // Filter our DStream for lines with "error" 27 | JavaDStream<String> errorLines = lines.filter(new Function<String, Boolean>() { 28 | public Boolean call(String line) { 29 | return line.contains("error"); 30 | }}); 31 | // Print out the lines with errors, which causes this DStream to be evaluated 32 | errorLines.print(); 33 | // start our streaming context and wait for it to "finish" 34 | jssc.start(); 35 | // Wait for 10 seconds then exit. 
To run forever call without a timeout 36 | jssc.awaitTermination(10000); 37 | // Stop the streaming context 38 | jssc.stop(); 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/WordCount.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a wordcount in Java 3 | */ 4 | package com.oreilly.learningsparkexamples.java; 5 | 6 | import java.util.Arrays; 7 | import java.util.List; 8 | import java.lang.Iterable; 9 | 10 | import scala.Tuple2; 11 | 12 | import org.apache.commons.lang.StringUtils; 13 | 14 | import org.apache.spark.api.java.JavaRDD; 15 | import org.apache.spark.api.java.JavaPairRDD; 16 | import org.apache.spark.api.java.JavaSparkContext; 17 | import org.apache.spark.api.java.function.FlatMapFunction; 18 | import org.apache.spark.api.java.function.Function2; 19 | import org.apache.spark.api.java.function.PairFunction; 20 | 21 | 22 | public class WordCount { 23 | public static void main(String[] args) throws Exception { 24 | String master = args[0]; 25 | JavaSparkContext sc = new JavaSparkContext( 26 | master, "wordcount", System.getenv("SPARK_HOME"), System.getenv("JARS")); 27 | JavaRDD rdd = sc.textFile(args[1]); 28 | JavaPairRDD counts = rdd.flatMap( 29 | new FlatMapFunction() { 30 | public Iterable call(String x) { 31 | return Arrays.asList(x.split(" ")); 32 | }}).mapToPair(new PairFunction(){ 33 | public Tuple2 call(String x){ 34 | return new Tuple2(x, 1); 35 | }}).reduceByKey(new Function2(){ 36 | public Integer call(Integer x, Integer y){ return x+y;}}); 37 | counts.saveAsTextFile(args[2]); 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/logs/ApacheAccessLog.java: -------------------------------------------------------------------------------- 1 | package com.oreilly.learningsparkexamples.java.logs; 2 | 3 | import java.io.Serializable; 4 | import java.util.logging.Level; 5 | import java.util.logging.Logger; 6 | import java.util.regex.Matcher; 7 | import java.util.regex.Pattern; 8 | 9 | /** 10 | * This class represents an Apache access log line. 11 | * See http://httpd.apache.org/docs/2.2/logs.html for more details. 
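 * Parsing is done with a single regular expression; see LOG_ENTRY_PATTERN and parseFromLogLine(String) below.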
12 | */ 13 | public class ApacheAccessLog implements Serializable { 14 | private static final Logger logger = Logger.getLogger("Access"); 15 | 16 | private String ipAddress; 17 | private String clientIdentd; 18 | private String userID; 19 | private String dateTimeString; 20 | private String method; 21 | private String endpoint; 22 | private String protocol; 23 | private int responseCode; 24 | private long contentSize; 25 | 26 | private ApacheAccessLog(String ipAddress, String clientIdentd, String userID, 27 | String dateTime, String method, String endpoint, 28 | String protocol, String responseCode, 29 | String contentSize) { 30 | this.ipAddress = ipAddress; 31 | this.clientIdentd = clientIdentd; 32 | this.userID = userID; 33 | this.dateTimeString = dateTime; // TODO: Parse from dateTime String; 34 | this.method = method; 35 | this.endpoint = endpoint; 36 | this.protocol = protocol; 37 | this.responseCode = Integer.parseInt(responseCode); 38 | this.contentSize = Long.parseLong(contentSize); 39 | } 40 | 41 | public String getIpAddress() { 42 | return ipAddress; 43 | } 44 | 45 | public String getClientIdentd() { 46 | return clientIdentd; 47 | } 48 | 49 | public String getUserID() { 50 | return userID; 51 | } 52 | 53 | public String getDateTimeString() { 54 | return dateTimeString; 55 | } 56 | 57 | public String getMethod() { 58 | return method; 59 | } 60 | 61 | public String getEndpoint() { 62 | return endpoint; 63 | } 64 | 65 | public String getProtocol() { 66 | return protocol; 67 | } 68 | 69 | public int getResponseCode() { 70 | return responseCode; 71 | } 72 | 73 | public long getContentSize() { 74 | return contentSize; 75 | } 76 | 77 | public void setIpAddress(String ipAddress) { 78 | this.ipAddress = ipAddress; 79 | } 80 | 81 | public void setClientIdentd(String clientIdentd) { 82 | this.clientIdentd = clientIdentd; 83 | } 84 | 85 | public void setUserID(String userID) { 86 | this.userID = userID; 87 | } 88 | 89 | public void setDateTimeString(String dateTimeString) { 90 | this.dateTimeString = dateTimeString; 91 | } 92 | 93 | public void setMethod(String method) { 94 | this.method = method; 95 | } 96 | 97 | public void setEndpoint(String endpoint) { 98 | this.endpoint = endpoint; 99 | } 100 | 101 | public void setProtocol(String protocol) { 102 | this.protocol = protocol; 103 | } 104 | 105 | public void setResponseCode(int responseCode) { 106 | this.responseCode = responseCode; 107 | } 108 | 109 | public void setContentSize(long contentSize) { 110 | this.contentSize = contentSize; 111 | } 112 | 113 | // Example Apache log line: 114 | // 127.0.0.1 - - [21/Jul/2014:9:55:27 -0800] "GET /home.html HTTP/1.1" 200 2048 115 | private static final String LOG_ENTRY_PATTERN = 116 | // 1:IP 2:client 3:user 4:date time 5:method 6:req 7:proto 8:respcode 9:size 117 | "^(\\S+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] \"(\\S+) (\\S+) (\\S+)\" (\\d{3}) (\\d+)"; 118 | private static final Pattern PATTERN = Pattern.compile(LOG_ENTRY_PATTERN); 119 | 120 | public static ApacheAccessLog parseFromLogLine(String logline) { 121 | Matcher m = PATTERN.matcher(logline); 122 | if (!m.find()) { 123 | logger.log(Level.ALL, "Cannot parse logline" + logline); 124 | throw new RuntimeException("Error parsing logline"); 125 | } 126 | 127 | return new ApacheAccessLog(m.group(1), m.group(2), m.group(3), m.group(4), 128 | m.group(5), m.group(6), m.group(7), m.group(8), m.group(9)); 129 | } 130 | } 131 | -------------------------------------------------------------------------------- 
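A minimal sketch of exercising the parser on the example line quoted in the ApacheAccessLog source above (the class name is ours, and it is assumed to sit in the same logs package):

public class ApacheAccessLogParseSketch {
  public static void main(String[] args) {
    String line =
        "127.0.0.1 - - [21/Jul/2014:9:55:27 -0800] \"GET /home.html HTTP/1.1\" 200 2048";
    ApacheAccessLog log = ApacheAccessLog.parseFromLogLine(line);
    System.out.println(log.getEndpoint() + " -> " + log.getResponseCode()
        + " (" + log.getContentSize() + " bytes)");
  }
}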
/src/main/java/com/oreilly/learningsparkexamples/java/logs/Flags.java: -------------------------------------------------------------------------------- 1 | package com.oreilly.learningsparkexamples.java.logs; 2 | 3 | import org.apache.commons.cli.*; 4 | import org.apache.spark.streaming.Duration; 5 | 6 | public class Flags { 7 | private static Flags THE_INSTANCE = new Flags(); 8 | 9 | private Duration windowLength; 10 | private Duration slideInterval; 11 | private String logsDirectory; 12 | private String outputHtmlFile; 13 | private String checkpointDirectory; 14 | private String indexHtmlTemplate; 15 | private String outputDirectory; 16 | 17 | private boolean initialized = false; 18 | 19 | private Flags() {} 20 | 21 | public Duration getWindowLength() { 22 | return windowLength; 23 | } 24 | 25 | public Duration getSlideInterval() { 26 | return slideInterval; 27 | } 28 | 29 | public String getLogsDirectory() { 30 | return logsDirectory; 31 | } 32 | 33 | public String getOutputHtmlFile() { 34 | return outputHtmlFile; 35 | } 36 | 37 | public String getCheckpointDirectory() { 38 | return checkpointDirectory; 39 | } 40 | 41 | public String getOutputDirectory() { 42 | return outputDirectory; 43 | } 44 | 45 | public String getIndexHtmlTemplate() { 46 | return indexHtmlTemplate; 47 | } 48 | 49 | public static Flags getInstance() { 50 | if (!THE_INSTANCE.initialized) { 51 | throw new RuntimeException("Flags have not been initalized"); 52 | } 53 | return THE_INSTANCE; 54 | } 55 | 56 | public static void setFromCommandLineArgs(Options options, String[] args) { 57 | CommandLineParser parser = new PosixParser(); 58 | try { 59 | CommandLine cl = parser.parse(options, args); 60 | THE_INSTANCE.windowLength = new Duration(Integer.parseInt( 61 | cl.getOptionValue(LogAnalyzerAppMain.WINDOW_LENGTH, "30")) * 1000); 62 | THE_INSTANCE.slideInterval = new Duration(Integer.parseInt( 63 | cl.getOptionValue(LogAnalyzerAppMain.SLIDE_INTERVAL, "5")) * 1000); 64 | THE_INSTANCE.logsDirectory = cl.getOptionValue( 65 | LogAnalyzerAppMain.LOGS_DIRECTORY, "/tmp/logs"); 66 | THE_INSTANCE.outputHtmlFile = cl.getOptionValue( 67 | LogAnalyzerAppMain.OUTPUT_HTML_FILE, "/tmp/log_stats.html"); 68 | THE_INSTANCE.checkpointDirectory = cl.getOptionValue( 69 | LogAnalyzerAppMain.CHECKPOINT_DIRECTORY, "/tmp/log-analyzer-streaming"); 70 | THE_INSTANCE.indexHtmlTemplate = cl.getOptionValue( 71 | LogAnalyzerAppMain.INDEX_HTML_TEMPLATE, 72 | "./src/main/resources/index.html.template"); 73 | THE_INSTANCE.outputDirectory = cl.getOptionValue( 74 | LogAnalyzerAppMain.OUTPUT_DIRECTORY, "/tmp/pandaout"); 75 | THE_INSTANCE.initialized = true; 76 | } catch (ParseException e) { 77 | THE_INSTANCE.initialized = false; 78 | System.err.println("Parsing failed. 
Reason: " + e.getMessage()); 79 | } 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/logs/Functions.java: -------------------------------------------------------------------------------- 1 | package com.oreilly.learningsparkexamples.java.logs; 2 | 3 | import com.google.common.base.Optional; 4 | import com.google.common.collect.Ordering; 5 | import org.apache.spark.api.java.JavaDoubleRDD; 6 | import org.apache.spark.api.java.JavaPairRDD; 7 | import org.apache.spark.api.java.JavaRDD; 8 | import org.apache.spark.api.java.function.Function; 9 | import org.apache.spark.api.java.function.DoubleFunction; 10 | import org.apache.spark.api.java.function.Function2; 11 | import org.apache.spark.api.java.function.PairFunction; 12 | import scala.Tuple2; 13 | import scala.Tuple4; 14 | 15 | import javax.annotation.Nullable; 16 | import java.io.Serializable; 17 | import java.util.Comparator; 18 | import java.util.List; 19 | 20 | public class Functions { 21 | public static final class LongSumReducer implements Function2 { 22 | @Override 23 | public Long call(Long a, Long b) { 24 | return a + b; 25 | } 26 | }; 27 | 28 | public static final class SumReducer implements Function2 { 29 | @Override 30 | public Double call(Double a, Double b) { 31 | return a + b; 32 | } 33 | }; 34 | 35 | 36 | public static final class ValueComparator 37 | implements Comparator>, Serializable { 38 | private Comparator comparator; 39 | 40 | public ValueComparator(Comparator comparator) { 41 | this.comparator = comparator; 42 | } 43 | 44 | @Override 45 | public int compare(Tuple2 o1, Tuple2 o2) { 46 | return comparator.compare(o1._2(), o2._2()); 47 | } 48 | } 49 | 50 | public static final class ComputeRunningSum implements Function2, Optional, Optional> { 51 | @Override 52 | public Optional call(List nums, Optional current) { 53 | long sum = current.or(0L); 54 | for (long i : nums) { 55 | sum += i; 56 | } 57 | return Optional.of(sum); 58 | } 59 | }; 60 | 61 | public static final class GetContentSize implements DoubleFunction { 62 | @Override 63 | public double call(ApacheAccessLog log) { 64 | return new Long(log.getContentSize()).doubleValue(); 65 | } 66 | } 67 | 68 | public static final @Nullable Tuple4 contentSizeStats( 69 | JavaRDD accessLogRDD) { 70 | JavaDoubleRDD contentSizes = 71 | accessLogRDD.mapToDouble(new GetContentSize()).cache(); 72 | long count = contentSizes.count(); 73 | if (count == 0) { 74 | return null; 75 | } 76 | Object ordering = Ordering.natural(); 77 | final Comparator cmp = (Comparator)ordering; 78 | 79 | return new Tuple4<>(count, 80 | contentSizes.reduce(new SumReducer()).longValue(), 81 | contentSizes.min(cmp).longValue(), 82 | contentSizes.max(cmp).longValue()); 83 | } 84 | 85 | public static final class ResponseCodeTuple implements PairFunction { 86 | @Override 87 | public Tuple2 call(ApacheAccessLog log) { 88 | return new Tuple2<>(log.getResponseCode(), 1L); 89 | } 90 | } 91 | 92 | public static final JavaPairRDD responseCodeCount( 93 | JavaRDD accessLogRDD) { 94 | return accessLogRDD 95 | .mapToPair(new ResponseCodeTuple()) 96 | .reduceByKey(new LongSumReducer()); 97 | } 98 | 99 | public static final class IpTuple implements PairFunction { 100 | @Override 101 | public Tuple2 call(ApacheAccessLog log) { 102 | return new Tuple2<>(log.getIpAddress(), 1L); 103 | } 104 | } 105 | 106 | public static final class IpContentTuple implements PairFunction { 107 | @Override 108 | public Tuple2 
call(ApacheAccessLog log) { 109 | return new Tuple2<>(log.getIpAddress(), log.getContentSize()); 110 | } 111 | } 112 | 113 | 114 | public static final class EndPointTuple implements PairFunction { 115 | @Override 116 | public Tuple2 call(ApacheAccessLog log) { 117 | return new Tuple2<>(log.getEndpoint(), 1L); 118 | } 119 | } 120 | 121 | 122 | public static final class IpCountGreaterThan10 implements Function, Boolean> { 123 | @Override 124 | public Boolean call(Tuple2 e) { 125 | return e._2() > 10; 126 | } 127 | } 128 | 129 | public static final class ParseFromLogLine implements Function { 130 | @Override 131 | public ApacheAccessLog call(String line) { 132 | return ApacheAccessLog.parseFromLogLine(line); 133 | } 134 | } 135 | public static final JavaPairRDD ipAddressCount( 136 | JavaRDD accessLogRDD) { 137 | return accessLogRDD 138 | .mapToPair(new IpTuple()) 139 | .reduceByKey(new LongSumReducer()); 140 | } 141 | 142 | public static final JavaRDD filterIPAddress( 143 | JavaPairRDD ipAddressCount) { 144 | return ipAddressCount 145 | .filter(new IpCountGreaterThan10()) 146 | .keys(); 147 | } 148 | 149 | public static final JavaPairRDD endpointCount( 150 | JavaRDD accessLogRDD) { 151 | return accessLogRDD 152 | .mapToPair(new EndPointTuple()) 153 | .reduceByKey(new LongSumReducer()); 154 | } 155 | } 156 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/logs/LogAnalyzerAppMain.java: -------------------------------------------------------------------------------- 1 | package com.oreilly.learningsparkexamples.java.logs; 2 | 3 | import org.apache.commons.cli.Option; 4 | import org.apache.commons.cli.Options; 5 | import org.apache.spark.SparkConf; 6 | import org.apache.spark.api.java.JavaSparkContext; 7 | import org.apache.spark.api.java.JavaRDD; 8 | import org.apache.spark.streaming.api.java.JavaDStream; 9 | import org.apache.spark.streaming.api.java.JavaStreamingContext; 10 | import org.apache.spark.api.java.function.Function; 11 | 12 | import java.io.IOException; 13 | 14 | /** 15 | * The LogAnalyzerAppMain is an sample logs analysis application. For now, 16 | * it is a simple minimal viable product: 17 | * - Read in new log files from a directory and input those new files into streaming. 18 | * - Computes stats for all of time as well as the last time interval based on those logs. 19 | * - Write the calculated stats to an txt file on the local file system 20 | * that gets refreshed every time interval. 21 | * 22 | * Once you get this program up and running, feed apache access log files 23 | * into the local directory of your choosing. 24 | * 25 | * Then open your output text file, perhaps in a web browser, and refresh 26 | * that page to see more stats come in. 27 | * 28 | * Modify the command line flags to the values of your choosing. 29 | * Notice how they come after you specify the jar when using spark-submit. 
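 * Flags that are not supplied on the command line fall back to the defaults defined in the Flags class above (the /tmp paths).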
30 | * 31 | * Example command to run: 32 | * % ${YOUR_SPARK_HOME}/bin/spark-submit 33 | * --class "com.oreilly.learningsparkexamples.java.logs.LogAnalyzerAppMain" 34 | * --master local[4] 35 | * target/uber-log-analyzer-1.0.jar 36 | * --logs_directory /tmp/logs 37 | * --output_html_file /tmp/log_stats.html 38 | * --index_html_template ./src/main/resources/index.html.template 39 | * --output_directory /tmp/pandaout 40 | */ 41 | public class LogAnalyzerAppMain { 42 | public static final String WINDOW_LENGTH = "window_length"; 43 | public static final String SLIDE_INTERVAL = "slide_interval"; 44 | public static final String LOGS_DIRECTORY = "logs_directory"; 45 | public static final String OUTPUT_HTML_FILE = "output_html_file"; 46 | public static final String CHECKPOINT_DIRECTORY = "checkpoint_directory"; 47 | public static final String INDEX_HTML_TEMPLATE = "index_html_template"; 48 | public static final String OUTPUT_DIRECTORY = "output_directory"; 49 | 50 | private static final Options THE_OPTIONS = createOptions(); 51 | private static Options createOptions() { 52 | Options options = new Options(); 53 | 54 | options.addOption( 55 | new Option(WINDOW_LENGTH, false, "The window length in seconds")); 56 | options.addOption( 57 | new Option(SLIDE_INTERVAL, false, "The slide interval in seconds")); 58 | options.addOption( 59 | new Option(LOGS_DIRECTORY, true, "The directory where logs are written")); 60 | options.addOption( 61 | new Option(OUTPUT_HTML_FILE, false, "Where to write output html file")); 62 | options.addOption( 63 | new Option(CHECKPOINT_DIRECTORY, false, "The checkpoint directory.")); 64 | options.addOption(new Option(INDEX_HTML_TEMPLATE, true, 65 | "path to the index.html.template file - accessible from all workers")); 66 | options.addOption(new Option(OUTPUT_DIRECTORY, false, "path to output DSTreams too")); 67 | 68 | return options; 69 | } 70 | 71 | public static void main(String[] args) throws IOException { 72 | Flags.setFromCommandLineArgs(THE_OPTIONS, args); 73 | 74 | // Startup the Spark Conf. 75 | SparkConf conf = new SparkConf() 76 | .setAppName("A Databricks Reference Application: Logs Analysis with Spark"); 77 | JavaStreamingContext jssc = new JavaStreamingContext(conf, 78 | Flags.getInstance().getSlideInterval()); 79 | 80 | // Checkpointing must be enabled to use the updateStateByKey function & windowed operations. 81 | jssc.checkpoint(Flags.getInstance().getCheckpointDirectory()); 82 | 83 | // This methods monitors a directory for new files to read in for streaming. 84 | JavaDStream logData = jssc.textFileStream(Flags.getInstance().getLogsDirectory()); 85 | 86 | JavaDStream accessLogsDStream 87 | = logData.map(new Functions.ParseFromLogLine()).cache(); 88 | 89 | final LogAnalyzerTotal logAnalyzerTotal = new LogAnalyzerTotal(); 90 | final LogAnalyzerWindowed logAnalyzerWindowed = new LogAnalyzerWindowed(); 91 | 92 | // Process the DStream which gathers stats for all of time. 93 | logAnalyzerTotal.processAccessLogs(Flags.getInstance().getOutputDirectory(), accessLogsDStream); 94 | 95 | // Calculate statistics for the last time interval. 96 | logAnalyzerWindowed.processAccessLogs(Flags.getInstance().getOutputDirectory(), accessLogsDStream); 97 | 98 | // Render the output each time there is a new RDD in the accessLogsDStream. 99 | final Renderer renderer = new Renderer(); 100 | accessLogsDStream.foreachRDD(new Function, Void>() { 101 | public Void call(JavaRDD rdd) { 102 | // Call this to output the stats. 
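// Render failures are caught and ignored below, so one bad interval cannot bring down the streaming job.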
103 | try { 104 | renderer.render(logAnalyzerTotal.getLogStatistics(), 105 | logAnalyzerWindowed.getLogStatistics()); 106 | } catch (Exception e) { 107 | } 108 | return null; 109 | } 110 | }); 111 | 112 | // Start the streaming server. 113 | jssc.start(); // Start the computation 114 | jssc.awaitTermination(); // Wait for the computation to terminate 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/logs/LogAnalyzerWindowed.java: -------------------------------------------------------------------------------- 1 | package com.oreilly.learningsparkexamples.java.logs; 2 | 3 | import com.google.common.collect.Ordering; 4 | import org.apache.spark.api.java.JavaPairRDD; 5 | import org.apache.spark.api.java.JavaRDD; 6 | import org.apache.spark.streaming.api.java.JavaDStream; 7 | import org.apache.spark.streaming.api.java.JavaPairDStream; 8 | import org.apache.spark.api.java.function.Function; 9 | import org.apache.spark.api.java.function.Function2; 10 | import org.apache.spark.api.java.function.PairFunction; 11 | import scala.Tuple2; 12 | import scala.Tuple4; 13 | 14 | import java.io.Serializable; 15 | import java.util.Comparator; 16 | import java.util.List; 17 | 18 | public class LogAnalyzerWindowed implements Serializable { 19 | private LogStatistics logStatistics; 20 | 21 | public void processAccessLogs(String outDir, JavaDStream accessLogsDStream) { 22 | JavaDStream windowDStream = accessLogsDStream.window( 23 | Flags.getInstance().getWindowLength(), 24 | Flags.getInstance().getSlideInterval()); 25 | JavaDStream ip = accessLogsDStream.map( 26 | new Function() { 27 | public String call(ApacheAccessLog entry) { 28 | return entry.getIpAddress(); 29 | }}); 30 | // reduceByWindow 31 | JavaDStream requestCountRBW = accessLogsDStream.map(new Function() { 32 | public Long call(ApacheAccessLog entry) { 33 | return 1L; 34 | }}).reduceByWindow(new Function2() { 35 | public Long call(Long v1, Long v2) { 36 | return v1+v2; 37 | }}, new Function2() { 38 | public Long call(Long v1, Long v2) { 39 | return v1-v2; 40 | }}, Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval()); 41 | requestCountRBW.print(); 42 | // reducebykeyandwindow 43 | JavaPairDStream ipAddressPairDStream = accessLogsDStream.mapToPair( 44 | new PairFunction() { 45 | public Tuple2 call(ApacheAccessLog entry) { 46 | return new Tuple2(entry.getIpAddress(), 1L); 47 | }}); 48 | JavaPairDStream ipCountDStream = ipAddressPairDStream.reduceByKeyAndWindow( 49 | // Adding elements in the new slice 50 | new Function2() { 51 | public Long call(Long v1, Long v2) { 52 | return v1+v2; 53 | }}, 54 | // Removing elements from the oldest slice 55 | new Function2() { 56 | public Long call(Long v1, Long v2) { 57 | return v1-v2; 58 | }}, 59 | Flags.getInstance().getWindowLength(), 60 | Flags.getInstance().getSlideInterval()); 61 | ipCountDStream.print(); 62 | // Use countByWindow 63 | JavaDStream requestCount = accessLogsDStream.countByWindow( 64 | Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval()); 65 | JavaPairDStream ipAddressRequestCount = ip.countByValueAndWindow( 66 | Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval()); 67 | requestCount.print(); 68 | ipAddressRequestCount.print(); 69 | 70 | // use a transform for the response code count 71 | JavaPairDStream responseCodeCountTransform = accessLogsDStream.transformToPair( 72 | new Function, JavaPairRDD>() { 73 | public 
JavaPairRDD call(JavaRDD logs) { 74 | return Functions.responseCodeCount(logs); 75 | } 76 | }); 77 | windowDStream.foreachRDD(new Function, Void>() { 78 | public Void call(JavaRDD accessLogs) { 79 | Tuple4 contentSizeStats = 80 | Functions.contentSizeStats(accessLogs); 81 | 82 | List> responseCodeToCount = 83 | Functions.responseCodeCount(accessLogs) 84 | .take(100); 85 | 86 | JavaPairRDD ipAddressCounts = 87 | Functions.ipAddressCount(accessLogs); 88 | List ip = Functions.filterIPAddress(ipAddressCounts) 89 | .take(100); 90 | 91 | Object ordering = Ordering.natural(); 92 | Comparator cmp = (Comparator)ordering; 93 | List> topEndpoints = 94 | Functions.endpointCount(accessLogs) 95 | .top(10, new Functions.ValueComparator(cmp)); 96 | 97 | logStatistics = new LogStatistics(contentSizeStats, responseCodeToCount, 98 | ip, topEndpoints); 99 | return null; 100 | }}); 101 | } 102 | 103 | public LogStatistics getLogStatistics() { 104 | return logStatistics; 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/logs/LogStatistics.java: -------------------------------------------------------------------------------- 1 | package com.oreilly.learningsparkexamples.java.logs; 2 | 3 | import scala.Tuple2; 4 | import scala.Tuple4; 5 | 6 | import java.io.Serializable; 7 | import java.util.ArrayList; 8 | import java.util.HashMap; 9 | import java.util.List; 10 | import java.util.Map; 11 | 12 | public class LogStatistics implements Serializable { 13 | public final static LogStatistics EMPTY_LOG_STATISTICS = 14 | new LogStatistics(new Tuple4<>(0L, 0L, 0L, 0L), new ArrayList>(), 15 | new ArrayList(), new ArrayList>()); 16 | 17 | private Tuple4 contentSizeStats; 18 | private List> responseCodeToCount; 19 | private List ipAddresses; 20 | private List> topEndpoints; 21 | 22 | public LogStatistics(Tuple4 contentSizeStats, 23 | List> responseCodeToCount, 24 | List ipAddresses, 25 | List> topEndpoints) { 26 | this.contentSizeStats = contentSizeStats; 27 | this.responseCodeToCount = responseCodeToCount; 28 | this.ipAddresses = ipAddresses; 29 | this.topEndpoints = topEndpoints; 30 | } 31 | 32 | public Tuple4 getContentSizeStats() { 33 | return contentSizeStats; 34 | } 35 | 36 | public Map getResponseCodeToCount() { 37 | Map responseCodeCount = new HashMap<>(); 38 | for (Tuple2 tuple: responseCodeToCount) { 39 | responseCodeCount.put(tuple._1(), tuple._2()); 40 | } 41 | return responseCodeCount; 42 | } 43 | 44 | public List getIpAddresses() { 45 | return ipAddresses; 46 | } 47 | 48 | public List> getTopEndpoints() { 49 | return topEndpoints; 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/logs/ReadTransferStats.java: -------------------------------------------------------------------------------- 1 | package com.oreilly.learningsparkexamples.java.logs; 2 | 3 | import org.apache.spark.api.java.JavaPairRDD; 4 | import org.apache.spark.api.java.JavaRDD; 5 | import org.apache.spark.api.java.function.Function; 6 | import org.apache.spark.api.java.function.PairFunction; 7 | import org.apache.spark.streaming.api.java.JavaStreamingContext; 8 | import org.apache.spark.streaming.api.java.JavaPairDStream; 9 | import scala.Tuple2; 10 | 11 | import org.apache.hadoop.fs.Path; 12 | import org.apache.hadoop.io.Writable; 13 | import org.apache.hadoop.io.IntWritable; 14 | import org.apache.hadoop.io.Text; 15 | import 
org.apache.hadoop.io.LongWritable; 16 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 17 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; 18 | 19 | 20 | import java.io.Serializable; 21 | 22 | public class ReadTransferStats implements Serializable { 23 | 24 | public JavaPairDStream readStats(JavaStreamingContext jssc, String inputDirectory) { 25 | // Note: This example doesn't work until Spark 1.2 26 | JavaPairDStream input = 27 | jssc.fileStream(inputDirectory, LongWritable.class, Text.class, TextInputFormat.class); 28 | // convert the input from Writables to native types 29 | JavaPairDStream usefulInput = input.mapToPair( 30 | new PairFunction, Long, Integer>() { 31 | public Tuple2 call(Tuple2 input) { 32 | return new Tuple2(input._1().get(), Integer.parseInt(input._2().toString())); 33 | } 34 | }); 35 | return usefulInput; 36 | } 37 | 38 | } 39 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/logs/Renderer.java: -------------------------------------------------------------------------------- 1 | package com.oreilly.learningsparkexamples.java.logs; 2 | 3 | import com.google.common.base.Charsets; 4 | import com.google.common.io.Files; 5 | import scala.Tuple2; 6 | import scala.Tuple4; 7 | 8 | import java.io.*; 9 | import java.util.List; 10 | import java.util.Map; 11 | 12 | public class Renderer implements Serializable { 13 | private String fileTemplate; 14 | 15 | public void render(LogStatistics allOfTime, LogStatistics lastWindow) 16 | throws Exception { 17 | if (fileTemplate == null) { 18 | fileTemplate = Files.toString( 19 | new File(Flags.getInstance().getIndexHtmlTemplate()), 20 | Charsets.UTF_8); 21 | } 22 | 23 | // TODO: Replace this hacky String replace with a proper HTML templating library. 24 | String output = fileTemplate; 25 | output = output.replace("${logLinesTable}", logLinesTable(allOfTime, lastWindow)); 26 | output = output.replace("${contentSizesTable}", contentSizesTable(allOfTime, lastWindow)); 27 | output = output.replace("${responseCodeTable}", responseCodeTable(allOfTime, lastWindow)); 28 | output = output.replace("${topEndpointsTable}", topEndpointsTable(allOfTime, lastWindow)); 29 | output = output.replace("${frequentIpAddressTable}", frequentIpAddressTable(allOfTime, lastWindow)); 30 | 31 | Writer out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream( 32 | Flags.getInstance().getOutputHtmlFile()))); 33 | out.write(output); 34 | out.close(); 35 | } 36 | 37 | public String logLinesTable(LogStatistics allOfTime, LogStatistics lastWindow) { 38 | return "" + 39 | String.format("", 40 | allOfTime.getContentSizeStats()._1()) + 41 | String.format("", 42 | lastWindow.getContentSizeStats()._1()) + 43 | "
All Of Time:%s
Last Time Window:%s
"; 44 | } 45 | 46 | public String contentSizesTable(LogStatistics allOfTime, LogStatistics lastWindow) { 47 | StringBuilder builder = new StringBuilder(); 48 | builder.append(""); 49 | builder.append(""); 50 | Tuple4 totalStats = allOfTime.getContentSizeStats(); 51 | Tuple4 lastStats = lastWindow.getContentSizeStats(); 52 | builder.append(String.format("", 53 | totalStats._1() > 0 ? totalStats._2() / totalStats._1() : "-", 54 | lastStats._1() > 0 ? lastStats._2() / lastStats._1() : "-")); 55 | builder.append(String.format("", 56 | totalStats._1() > 0 ? totalStats._3() : "-", 57 | lastStats._1() > 0 ? lastStats._3() : "-")); 58 | builder.append(String.format("", 59 | totalStats._1() > 0 ? totalStats._4() : "-", 60 | lastStats._1() > 0 ? lastStats._4() : "-")); 61 | builder.append("
All of Time Last Time Window
Avg:%s%s
Min:%s%s
Max:%s%s
"); 62 | return builder.toString(); 63 | } 64 | 65 | public String responseCodeTable( 66 | LogStatistics allOfTime, LogStatistics lastWindow) { 67 | StringBuilder buffer = new StringBuilder(); 68 | buffer.append(""); 69 | buffer.append(""); 70 | Map lastWindowMap = lastWindow.getResponseCodeToCount(); 71 | for(Map.Entry entry: allOfTime.getResponseCodeToCount().entrySet()) { 72 | buffer.append(String.format("", 73 | entry.getKey(), entry.getValue(), lastWindowMap.get(entry.getKey()))); 74 | } 75 | buffer.append("
Response CodeAll of Time Last Time Window
%s%s%s
"); 76 | return buffer.toString(); 77 | } 78 | 79 | public String frequentIpAddressTable( 80 | LogStatistics allOfTime, LogStatistics lastWindow) { 81 | StringBuilder builder = new StringBuilder(); 82 | builder.append(""); 83 | builder.append(""); 84 | List totalIpAddresses = allOfTime.getIpAddresses(); 85 | List windowIpAddresses = lastWindow.getIpAddresses(); 86 | for (int i = 0; i < totalIpAddresses.size(); i++) { 87 | builder.append(String.format("", 88 | totalIpAddresses.get(i), 89 | i < windowIpAddresses.size() ? windowIpAddresses.get(i) : "-")); 90 | } 91 | builder.append("
All of Time Last Time Window
%s%s
"); 92 | return builder.toString(); 93 | } 94 | 95 | public String topEndpointsTable( 96 | LogStatistics allOfTime, LogStatistics lastWindow) { 97 | StringBuilder builder = new StringBuilder(); 98 | builder.append(""); 99 | builder.append(""); 100 | List> totalTopEndpoints = allOfTime.getTopEndpoints(); 101 | List> windowTopEndpoints = lastWindow.getTopEndpoints(); 102 | for (int i = 0; i < totalTopEndpoints.size(); i++) { 103 | builder.append(String.format("", 104 | totalTopEndpoints.get(i)._1(), 105 | i < windowTopEndpoints.size() ? windowTopEndpoints.get(i)._1() : "-")); 106 | } 107 | builder.append("
All of TimeLast Time Window
%s%s
"); 108 | return builder.toString(); 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /src/main/protobuf/address_book.proto: -------------------------------------------------------------------------------- 1 | package com.oreilly.learningsparkexamples.proto; 2 | 3 | // The sample protocol buffer file that Google uses in their examples at 4 | // http://code.google.com/p/protobuf and twitter uses in elephant bird. 5 | // Used in this project for examples. 6 | 7 | option java_outer_classname = "AddressBookProtos"; 8 | 9 | message Person { 10 | required string name = 1; 11 | required int32 id = 2; 12 | optional string email = 3; 13 | 14 | enum PhoneType { 15 | MOBILE = 0; 16 | HOME = 1; 17 | WORK = 2; 18 | } 19 | 20 | message PhoneNumber { 21 | required string number = 1; 22 | optional PhoneType type = 2 [default = HOME]; 23 | } 24 | 25 | repeated PhoneNumber phone = 4; 26 | } 27 | 28 | message AddressBook { 29 | repeated Person person = 1; 30 | optional bytes byteData = 2; 31 | } 32 | 33 | // used testing handling of unknown fields 34 | message PersonWithoutEmail { 35 | required string name = 1; 36 | required int32 id = 2; 37 | repeated Person.PhoneNumber phone = 4; 38 | } 39 | -------------------------------------------------------------------------------- /src/main/protobuf/places.proto: -------------------------------------------------------------------------------- 1 | package com.oreilly.learningsparkexamples.proto; 2 | 3 | message Venue { 4 | required int32 id = 1; 5 | required string name = 2; 6 | required VenueType type = 3; 7 | optional string address = 4; 8 | 9 | enum VenueType { 10 | COFFEESHOP = 0; 11 | WORKPLACE = 1; 12 | CLUB = 2; 13 | OMNOMNOM = 3; 14 | OTHER = 4; 15 | } 16 | } 17 | 18 | message VenueResponse { 19 | repeated Venue results = 1; 20 | } -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/BasicAvg.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple aggregate in scala to compute the average of an RDD 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | import org.apache.spark.rdd.RDD 8 | 9 | object BasicAvg { 10 | def main(args: Array[String]) { 11 | val master = args.length match { 12 | case x: Int if x > 0 => args(0) 13 | case _ => "local" 14 | } 15 | val sc = new SparkContext(master, "BasicAvg", System.getenv("SPARK_HOME")) 16 | val input = sc.parallelize(List(1,2,3,4)) 17 | val result = computeAvg(input) 18 | val avg = result._1 / result._2.toFloat 19 | println(result) 20 | } 21 | def computeAvg(input: RDD[Int]) = { 22 | input.aggregate((0, 0))((x, y) => (x._1 + y, x._2 + 1), 23 | (x,y) => (x._1 + y._1, x._2 + y._2)) 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/BasicAvgFromFile.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates loading a simple text file 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | 8 | object BasicAvgFromFile { 9 | def main(args: Array[String]) { 10 | if (args.length < 2) { 11 | println("Usage: [sparkmaster] [inputfile]") 12 | exit(1) 13 | } 14 | val master = args(0) 15 | val inputFile = args(1) 16 | val sc = new SparkContext(master, "BasicAvg", System.getenv("SPARK_HOME")) 
17 | val input = sc.textFile(inputFile) 18 | val result = input.map(_.toInt).aggregate((0, 0))( 19 | (acc, value) => (acc._1 + value, acc._2 + 1), 20 | (acc1, acc2) => (acc1._1 + acc2._1, acc1._2 + acc2._2)) 21 | val avg = result._1 / result._2.toFloat 22 | println(result) 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/BasicAvgFromFiles.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates loading a directory of files 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | import org.apache.spark.SparkContext._ 8 | 9 | object BasicAvgFromFiles { 10 | def main(args: Array[String]) { 11 | if (args.length < 3) { 12 | println("Usage: [sparkmaster] [inputdirectory] [outputdirectory]") 13 | exit(1) 14 | } 15 | val master = args(0) 16 | val inputFile = args(1) 17 | val outputFile = args(2) 18 | val sc = new SparkContext(master, "BasicAvgFromFiles", System.getenv("SPARK_HOME")) 19 | val input = sc.wholeTextFiles(inputFile) 20 | val result = input.mapValues{y => 21 | val nums = y.split(" ").map(_.toDouble) 22 | nums.sum / nums.size.toDouble 23 | } 24 | result.saveAsTextFile(outputFile) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/BasicAvgMapPartitions.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates mapPartitions in scala 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | 8 | object BasicAvgMapPartitions { 9 | case class AvgCount(var total: Int = 0, var num: Int = 0) { 10 | def merge(other: AvgCount): AvgCount = { 11 | total += other.total 12 | num += other.num 13 | this 14 | } 15 | def merge(input: Iterator[Int]): AvgCount = { 16 | input.foreach{elem => 17 | total += elem 18 | num += 1 19 | } 20 | this 21 | } 22 | def avg(): Float = { 23 | total / num.toFloat; 24 | } 25 | } 26 | 27 | def main(args: Array[String]) { 28 | val master = args.length match { 29 | case x: Int if x > 0 => args(0) 30 | case _ => "local" 31 | } 32 | val sc = new SparkContext(master, "BasicAvgMapPartitions", System.getenv("SPARK_HOME")) 33 | val input = sc.parallelize(List(1, 2, 3, 4)) 34 | val result = input.mapPartitions(partition => 35 | // Here we only want to return a single element for each partition, but mapPartitions requires that we wrap our return in an Iterator 36 | Iterator(AvgCount(0, 0).merge(partition))) 37 | .reduce((x,y) => x.merge(y)) 38 | println(result) 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/BasicAvgWithKryo.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple fold in scala 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | 8 | object BasicAvgWithKryo { 9 | def main(args: Array[String]) { 10 | val master = args.length match { 11 | case x: Int if x > 0 => args(0) 12 | case _ => "local" 13 | } 14 | val conf = new SparkConf().setMaster(master).setAppName("basicAvgWithKryo") 15 | conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") 16 | val sc = new SparkContext(conf) 17 | val input = sc.parallelize(List(1,2,3,4)) 18 | val result = 
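      // (sum, count) accumulator: the first function folds each element into a per-partition
      // pair, the second merges partition results; e.g. List(1, 2, 3, 4) gives (10, 4), i.e. an average of 2.5.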
input.aggregate((0, 0))((x, y) => (x._1 + y, x._2 + 1), 19 | (x,y) => (x._1 + y._1, x._2 + y._2)) 20 | val avg = result._1 / result._2.toFloat 21 | println(result) 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/BasicFilterUnionCombo.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates filtering and union to extract lines with "error" or "warning" 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | import org.apache.spark.SparkContext._ 8 | 9 | object BasicFilterUnionCombo { 10 | def main(args: Array[String]) { 11 | val conf = new SparkConf 12 | conf.setMaster(args(0)) 13 | val sc = new SparkContext(conf) 14 | val inputRDD = sc.textFile(args(1)) 15 | val errorsRDD = inputRDD.filter(_.contains("error")) 16 | val warningsRDD = inputRDD.filter(_.contains("error")) 17 | val badLinesRDD = errorsRDD.union(warningsRDD) 18 | println(badLinesRDD.collect().mkString("\n")) 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/BasicIntersectByKey.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates intersection by key 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | import org.apache.spark.rdd.PairRDDFunctions 8 | import org.apache.spark.rdd.RDD 9 | import org.apache.spark.SparkContext._ 10 | 11 | import scala.reflect.ClassTag 12 | 13 | object BasicIntersectByKey { 14 | 15 | def intersectByKey[K: ClassTag, V: ClassTag](rdd1: RDD[(K, V)], rdd2: RDD[(K, V)]): RDD[(K, V)] = { 16 | rdd1.cogroup(rdd2).flatMapValues{ 17 | case (Nil, _) => None 18 | case (_, Nil) => None 19 | case (x, y) => x++y 20 | } 21 | } 22 | 23 | def main(args: Array[String]) { 24 | val master = args.length match { 25 | case x: Int if x > 0 => args(0) 26 | case _ => "local" 27 | } 28 | val sc = new SparkContext(master, "BasicIntersectByKey", System.getenv("SPARK_HOME")) 29 | val rdd1 = sc.parallelize(List((1, "panda"), (2, "happy"))) 30 | val rdd2 = sc.parallelize(List((2, "pandas"))) 31 | val iRdd = intersectByKey(rdd1, rdd2) 32 | val panda: List[(Int, String)] = iRdd.collect().toList 33 | panda.map(println(_)) 34 | sc.stop() 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/BasicLoadNums.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates loading a text file of integers and counting the number of invalid elements 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | import org.apache.spark.SparkContext._ 8 | 9 | object BasicLoadNums { 10 | def main(args: Array[String]) { 11 | val master = args(0) 12 | val inputFile = args(1) 13 | val sc = new SparkContext(master, "BasicLoadNums", System.getenv("SPARK_HOME")) 14 | val file = sc.textFile(inputFile) 15 | val errorLines = sc.accumulator(0) // Create an Accumulator[Int] initialized to 0 16 | val dataLines = sc.accumulator(0) // Create a second Accumulator[Int] initialized to 0 17 | val counts = file.flatMap(line => { 18 | try { 19 | val input = line.split(" ") 20 | val data = Some((input(0), input(1).toInt)) 21 | dataLines += 1 22 | data 23 | } catch { 24 | case e: 
java.lang.NumberFormatException => { 25 | errorLines += 1 26 | None 27 | } 28 | case e: java.lang.ArrayIndexOutOfBoundsException => { 29 | errorLines += 1 30 | None 31 | } 32 | } 33 | }).reduceByKey(_ + _) 34 | if (errorLines.value < 0.1 * dataLines.value) { 35 | counts.saveAsTextFile("output.txt") 36 | } else { 37 | println(s"Too many errors ${errorLines.value} for ${dataLines.value}") 38 | } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/BasicLoadSequenceFile.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Loads a simple sequence file of people and how many pandas they have seen. 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | import org.apache.spark.SparkContext._ 8 | import org.apache.hadoop.io.{IntWritable, Text} 9 | 10 | 11 | object BasicLoadSequenceFile { 12 | def main(args: Array[String]) { 13 | val master = args(0) 14 | val inFile = args(1) 15 | val sc = new SparkContext(master, "BasicLoadSequenceFile", System.getenv("SPARK_HOME")) 16 | val data = sc.sequenceFile(inFile, classOf[Text], classOf[IntWritable]).map{case (x, y) => 17 | (x.toString, y.get())} 18 | println(data.collect().toList) 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/BasicLoadTextFromFTP.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates loading a text file from FTP 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | import org.apache.spark.SparkContext._ 8 | 9 | object BasicTextFromFTP { 10 | def main(args: Array[String]) { 11 | val conf = new SparkConf 12 | conf.setMaster(args(0)) 13 | val sc = new SparkContext(conf) 14 | val file = sc.textFile("ftp://anonymous:pandamagic@ftp.ubuntu.com/ubuntu/ls-LR.gz") 15 | println(file.collect().mkString("\n")) 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/BasicMap.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple map in Scala 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | 8 | object BasicMap { 9 | def main(args: Array[String]) { 10 | val master = args.length match { 11 | case x: Int if x > 0 => args(0) 12 | case _ => "local" 13 | } 14 | val sc = new SparkContext(master, "BasicMap", System.getenv("SPARK_HOME")) 15 | val input = sc.parallelize(List(1,2,3,4)) 16 | val result = input.map(x => x*x) 17 | println(result.collect().mkString(",")) 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/BasicMapNoCache.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates lack of caching 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | 8 | object BasicMapNoCache { 9 | def main(args: Array[String]) { 10 | val master = args.length match { 11 | case x: Int if x > 0 => args(0) 12 | case _ => "local" 13 | } 14 | val sc = new SparkContext(master, "BasicMapNoCache", System.getenv("SPARK_HOME")) 15 | val input = 
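      // Nothing is cached in this example, so the squared RDD below is recomputed
      // for each of the two actions (count and collect).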
sc.parallelize(List(1,2,3,4)) 16 | val result = input.map(x => x*x) 17 | // will compute result twice 18 | println(result.count()) 19 | println(result.collect().mkString(",")) 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/BasicMapPartitions.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple map partition in Scala 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | 8 | import org.eclipse.jetty.client.ContentExchange 9 | import org.eclipse.jetty.client.HttpClient 10 | 11 | object BasicMapPartitions { 12 | def main(args: Array[String]) { 13 | val master = args.length match { 14 | case x: Int if x > 0 => args(0) 15 | case _ => "local" 16 | } 17 | val sc = new SparkContext(master, "BasicMapPartitions", System.getenv("SPARK_HOME")) 18 | val input = sc.parallelize(List("KK6JKQ", "Ve3UoW", "kk6jlk", "W6BB")) 19 | val result = input.mapPartitions{ 20 | signs => 21 | val client = new HttpClient() 22 | client.start() 23 | signs.map {sign => 24 | val exchange = new ContentExchange(true); 25 | exchange.setURL(s"http://qrzcq.com/call/${sign}") 26 | client.send(exchange) 27 | exchange 28 | }.map{ exchange => 29 | exchange.waitForDone(); 30 | exchange.getResponseContent() 31 | } 32 | } 33 | println(result.collect().mkString(",")) 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/BasicMapThenFilter.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple map the filter in Scala 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | 8 | object BasicMapThenFilter { 9 | def main(args: Array[String]) { 10 | val master = args.length match { 11 | case x: Int if x > 0 => args(0) 12 | case _ => "local" 13 | } 14 | val sc = new SparkContext(master, "BasicMap", System.getenv("SPARK_HOME")) 15 | val input = sc.parallelize(List(1,2,3,4)) 16 | val squared = input.map(x => x*x) 17 | val result = squared.filter(x => x != 1) 18 | println(result.collect().mkString(",")) 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/BasicParseCsv.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple map partition to parse CSV data in Scala 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import java.io.StringReader 7 | import java.io.StringWriter 8 | 9 | import org.apache.spark._ 10 | import play.api.libs.json._ 11 | import play.api.libs.functional.syntax._ 12 | import scala.util.parsing.json.JSON 13 | import scala.collection.JavaConversions._ 14 | 15 | import au.com.bytecode.opencsv.CSVReader 16 | import au.com.bytecode.opencsv.CSVWriter 17 | 18 | object BasicParseCsv { 19 | case class Person(name: String, favouriteAnimal: String) 20 | 21 | def main(args: Array[String]) { 22 | if (args.length < 3) { 23 | println("Usage: [sparkmaster] [inputfile] [outputfile]") 24 | exit(1) 25 | } 26 | val master = args(0) 27 | val inputFile = args(1) 28 | val outputFile = args(2) 29 | val sc = new SparkContext(master, "BasicParseCsv", System.getenv("SPARK_HOME")) 30 | val input = sc.textFile(inputFile) 31 | val result = input.map{ 
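      // One CSVReader per input line: fine for simple records, but rows containing embedded
      // newlines need the whole-file approach shown in BasicParseWholeFileCsv.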
line => 32 | val reader = new CSVReader(new StringReader(line)); 33 | reader.readNext(); 34 | } 35 | val people = result.map(x => Person(x(0), x(1))) 36 | val pandaLovers = people.filter(person => person.favouriteAnimal == "panda") 37 | pandaLovers.map(person => List(person.name, person.favouriteAnimal).toArray).mapPartitions{people => 38 | val stringWriter = new StringWriter(); 39 | val csvWriter = new CSVWriter(stringWriter); 40 | csvWriter.writeAll(people.toList) 41 | Iterator(stringWriter.toString) 42 | }.saveAsTextFile(outputFile) 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/BasicParseJson.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple map partition to parse JSON data in Scala 3 | * Loads the data into a case class with the name and a boolean flag 4 | * if the person loves pandas. 5 | */ 6 | package com.oreilly.learningsparkexamples.scala 7 | 8 | import org.apache.spark._ 9 | import play.api.libs.json._ 10 | import play.api.libs.functional.syntax._ 11 | 12 | object BasicParseJson { 13 | case class Person(name: String, lovesPandas: Boolean) 14 | implicit val personReads = Json.format[Person] 15 | 16 | def main(args: Array[String]) { 17 | if (args.length < 3) { 18 | println("Usage: [sparkmaster] [inputfile] [outputfile]") 19 | exit(1) 20 | } 21 | val master = args(0) 22 | val inputFile = args(1) 23 | val outputFile = args(2) 24 | val sc = new SparkContext(master, "BasicParseJson", System.getenv("SPARK_HOME")) 25 | val input = sc.textFile(inputFile) 26 | val parsed = input.map(Json.parse(_)) 27 | // We use asOpt combined with flatMap so that if it fails to parse we 28 | // get back a None and the flatMap essentially skips the result. 29 | val result = parsed.flatMap(record => personReads.reads(record).asOpt) 30 | result.filter(_.lovesPandas).map(Json.toJson(_)).saveAsTextFile(outputFile) 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/BasicParseJsonWithJackson.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple map partition to parse JSON data in Scala 3 | * Loads the data into a case class with the name and a boolean flag 4 | * if the person loves pandas. 5 | */ 6 | package com.oreilly.learningsparkexamples.scala 7 | 8 | import org.apache.spark._ 9 | import com.fasterxml.jackson.module.scala.DefaultScalaModule 10 | import com.fasterxml.jackson.module.scala.experimental.ScalaObjectMapper 11 | import com.fasterxml.jackson.databind.ObjectMapper 12 | import com.fasterxml.jackson.databind.DeserializationFeature 13 | 14 | 15 | 16 | case class Person(name: String, lovesPandas: Boolean) // Note: must be a top level class 17 | 18 | object BasicParseJsonWithJackson { 19 | 20 | def main(args: Array[String]) { 21 | if (args.length < 3) { 22 | println("Usage: [sparkmaster] [inputfile] [outputfile]") 23 | exit(1) 24 | } 25 | val master = args(0) 26 | val inputFile = args(1) 27 | val outputFile = args(2) 28 | val sc = new SparkContext(master, "BasicParseJsonWithJackson", System.getenv("SPARK_HOME")) 29 | val input = sc.textFile(inputFile) 30 | 31 | // Parse it into a specific case class. 
We use mapPartitions beacuse: 32 | // (a) ObjectMapper is not serializable so we either create a singleton object encapsulating ObjectMapper 33 | // on the driver and have to send data back to the driver to go through the singleton object. 34 | // Alternatively we can let each node create its own ObjectMapper but that's expensive in a map 35 | // (b) To solve for creating an ObjectMapper on each node without being too expensive we create one per 36 | // partition with mapPartitions. Solves serialization and object creation performance hit. 37 | val result = input.mapPartitions(records => { 38 | // mapper object created on each executor node 39 | val mapper = new ObjectMapper with ScalaObjectMapper 40 | mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false) 41 | mapper.registerModule(DefaultScalaModule) 42 | // We use flatMap to handle errors 43 | // by returning an empty list (None) if we encounter an issue and a 44 | // list with one element if everything is ok (Some(_)). 45 | records.flatMap(record => { 46 | try { 47 | Some(mapper.readValue(record, classOf[Person])) 48 | } catch { 49 | case e: Exception => None 50 | } 51 | }) 52 | }, true) 53 | result.filter(_.lovesPandas).mapPartitions(records => { 54 | val mapper = new ObjectMapper with ScalaObjectMapper 55 | mapper.registerModule(DefaultScalaModule) 56 | records.map(mapper.writeValueAsString(_)) 57 | }) 58 | .saveAsTextFile(outputFile) 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/BasicParseWholeFileCsv.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple map partition to parse CSV data in Scala 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import java.io.StringReader 7 | 8 | import org.apache.spark._ 9 | import play.api.libs.json._ 10 | import play.api.libs.functional.syntax._ 11 | import scala.util.parsing.json.JSON 12 | import scala.collection.JavaConversions._ 13 | import au.com.bytecode.opencsv.CSVReader 14 | 15 | object BasicParseWholeFileCsv { 16 | def main(args: Array[String]) { 17 | if (args.length < 2) { 18 | println("Usage: [sparkmaster] [inputfile]") 19 | exit(1) 20 | } 21 | val master = args(0) 22 | val inputFile = args(1) 23 | val sc = new SparkContext(master, "BasicParseWholeFileCsv", System.getenv("SPARK_HOME")) 24 | val input = sc.wholeTextFiles(inputFile) 25 | val result = input.flatMap{ case (_, txt) => 26 | val reader = new CSVReader(new StringReader(txt)); 27 | reader.readAll() 28 | } 29 | println(result.collect().map(_.toList).mkString(",")) 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/BasicQueryCassandra.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * A simple illustration of querying Cassandra 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | import org.apache.spark.SparkContext._ 8 | // Implicits that add functions to the SparkContext & RDDs. 
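// The import below assumes the DataStax Cassandra connector is on the classpath, e.g. an sbt
// dependency along the lines of "com.datastax.spark" %% "spark-cassandra-connector"
// (artifact name and version must match your Spark and Cassandra releases).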
9 | import com.datastax.spark.connector._ 10 | 11 | 12 | object BasicQueryCassandra { 13 | def main(args: Array[String]) { 14 | val sparkMaster = args(0) 15 | val cassandraHost = args(1) 16 | val conf = new SparkConf(true) 17 | .set("spark.cassandra.connection.host", cassandraHost) 18 | val sc = new SparkContext(sparkMaster, "BasicQueryCassandra", conf) 19 | // entire table as an RDD 20 | // assumes your table test was created as CREATE TABLE test.kv(key text PRIMARY KEY, value int); 21 | val data = sc.cassandraTable("test" , "kv") 22 | // print some basic stats 23 | println("stats "+data.map(row => row.getInt("value")).stats()) 24 | val rdd = sc.parallelize(List(("moremagic", 1))) 25 | rdd.saveToCassandra("test" , "kv", SomeColumns("key", "value")) 26 | // save from a case class 27 | val otherRdd = sc.parallelize(List(KeyValue("magic", 0))) 28 | otherRdd.saveToCassandra("test", "kv") 29 | } 30 | } 31 | 32 | case class KeyValue(key: String, value: Integer) 33 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/BasicSaveProtoBuf.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Saves a sequence file of people and how many pandas they have seen. 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import com.oreilly.learningsparkexamples.proto.Places 7 | 8 | import org.apache.spark._ 9 | import org.apache.spark.SparkContext._ 10 | 11 | import org.apache.hadoop.io.Text 12 | import com.twitter.elephantbird.mapreduce.io.ProtobufWritable 13 | import com.twitter.elephantbird.mapreduce.output.LzoProtobufBlockOutputFormat 14 | import org.apache.hadoop.mapreduce.Job 15 | 16 | object BasicSaveProtoBuf { 17 | def main(args: Array[String]) { 18 | val master = args(0) 19 | val outputFile = args(1) 20 | val sc = new SparkContext(master, "BasicSaveProtoBuf", System.getenv("SPARK_HOME")) 21 | val job = new Job() 22 | val conf = job.getConfiguration 23 | LzoProtobufBlockOutputFormat.setClassConf(classOf[Places.Venue], conf); 24 | val dnaLounge = Places.Venue.newBuilder() 25 | dnaLounge.setId(1); 26 | dnaLounge.setName("DNA Lounge") 27 | dnaLounge.setType(Places.Venue.VenueType.CLUB) 28 | val data = sc.parallelize(List(dnaLounge.build())) 29 | val outputData = data.map{ pb => 30 | val protoWritable = ProtobufWritable.newInstance(classOf[Places.Venue]); 31 | protoWritable.set(pb) 32 | (null, protoWritable) 33 | } 34 | outputData.saveAsNewAPIHadoopFile(outputFile, classOf[Text], classOf[ProtobufWritable[Places.Venue]], 35 | classOf[LzoProtobufBlockOutputFormat[ProtobufWritable[Places.Venue]]], conf) 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/BasicSaveSequenceFile.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Saves a sequence file of people and how many pandas they have seen. 
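 * (The (String, Int) pairs are written as Text/IntWritable by saveAsSequenceFile,
 * matching what BasicLoadSequenceFile reads back.)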
3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | import org.apache.spark.SparkContext._ 8 | 9 | object BasicSaveSequenceFile { 10 | def main(args: Array[String]) { 11 | val master = args(0) 12 | val outputFile = args(1) 13 | val sc = new SparkContext(master, "BasicSaveSequenceFile", System.getenv("SPARK_HOME")) 14 | val data = sc.parallelize(List(("Holden", 3), ("Kay", 6), ("Snail", 2))) 15 | data.saveAsSequenceFile(outputFile) 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/BasicStreamingExample.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * A sample streaming application saves the wordcounts of a specific window in time 3 | */ 4 | 5 | package com.oreilly.learningsparkexamples.scala 6 | 7 | import org.apache.spark.streaming.{Seconds, StreamingContext} 8 | import StreamingContext._ 9 | import org.apache.spark._ 10 | import org.apache.spark.SparkContext._ 11 | 12 | 13 | object BasicStreamingExample { 14 | def main(args: Array[String]) { 15 | if (args.length < 2) { 16 | System.err.println("Usage BasicStreamingExample ") 17 | } 18 | val Array(master, output) = args.take(2) 19 | 20 | val conf = new SparkConf().setMaster(master).setAppName("BasicStreamingExample") 21 | val ssc = new StreamingContext(conf, Seconds(30)) 22 | 23 | val lines = ssc.socketTextStream("localhost" , 7777) 24 | val words = lines.flatMap(_.split(" ")) 25 | val wc = words.map(x => (x, 1)).reduceByKey((x, y) => x + y) 26 | 27 | wc.saveAsTextFiles(output) 28 | wc.print 29 | 30 | println("pandas: sscstart") 31 | ssc.start() 32 | println("pandas: awaittermination") 33 | ssc.awaitTermination() 34 | println("pandas: done!") 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/BasicSum.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple fold in scala 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | 8 | object BasicSum { 9 | def main(args: Array[String]) { 10 | val master = args.length match { 11 | case x: Int if x > 0 => args(0) 12 | case _ => "local" 13 | } 14 | val sc = new SparkContext(master, "BasicMap", System.getenv("SPARK_HOME")) 15 | val input = sc.parallelize(List(1,2,3,4)) 16 | val result = input.fold(0)((x, y) => (x + y)) 17 | println(result) 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/FlumeInput.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a basic Flume stream 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | import org.apache.spark.SparkContext._ 8 | import org.apache.spark.streaming._ 9 | import org.apache.spark.streaming.flume._ 10 | 11 | object FlumeInput { 12 | def main(args: Array[String]) { 13 | val receiverHostname = args(0) 14 | val receiverPort = args(1).toInt 15 | val conf = new SparkConf().setAppName("FlumeInput") 16 | // Create a StreamingContext with a 1 second batch size 17 | val ssc = new StreamingContext(conf, Seconds(1)) 18 | println(s"Creating flume stream on $receiverHostname $receiverPort") 19 | val events = FlumeUtils.createStream(ssc, 
receiverHostname, receiverPort) 20 | // Assuming that our flume events are UTF-8 log lines 21 | val lines = events.map{e => new String(e.event.getBody().array(), "UTF-8")} 22 | println("Starting StreamingContext") 23 | lines.print() 24 | // start our streaming context and wait for it to "finish" 25 | ssc.start() 26 | // Wait for 10 seconds then exit. To run forever call without a timeout 27 | ssc.awaitTermination(10000) 28 | ssc.stop() 29 | println("Done") 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/KafkaInput.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a basic Kafka stream 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | import org.apache.spark.SparkContext._ 8 | import org.apache.spark.streaming._ 9 | import org.apache.spark.streaming.kafka._ 10 | 11 | object KafkaInput { 12 | def main(args: Array[String]) { 13 | val Array(zkQuorum, group, topic, numThreads) = args 14 | val conf = new SparkConf().setAppName("KafkaInput") 15 | // Create a StreamingContext with a 1 second batch size 16 | val ssc = new StreamingContext(conf, Seconds(1)) 17 | // Create a map of topics to number of receiver threads to use 18 | val topics = List((topic, 1)).toMap 19 | val topicLines = KafkaUtils.createStream(ssc, zkQuorum, group, topics) 20 | val lines = StreamingLogInput.processLines(topicLines.map(_._2)) 21 | lines.print() 22 | // start our streaming context and wait for it to "finish" 23 | ssc.start() 24 | // Wait for 10 seconds then exit. To run forever call without a timeout 25 | ssc.awaitTermination(10000) 26 | ssc.stop() 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/LoadHive.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates loading Hive data using Spark SQL 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | import org.apache.spark.sql.hive.HiveContext 8 | 9 | 10 | object LoadHive { 11 | def main(args: Array[String]) { 12 | if (args.length < 2) { 13 | println("Usage: [sparkmaster] [tablename]") 14 | exit(1) 15 | } 16 | val master = args(0) 17 | val tableName = args(1) 18 | val sc = new SparkContext(master, "LoadHive", System.getenv("SPARK_HOME")) 19 | val hiveCtx = new HiveContext(sc) 20 | val input = hiveCtx.sql("FROM src SELECT key, value") 21 | val data = input.map(_.getInt(0)) 22 | println(data.collect().toList) 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/LoadJsonWithElephantBird.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple map partition to parse JSON data in Scala 3 | * Loads the data into a case class with the name and a boolean flag 4 | * if the person loves pandas. 
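 * (More precisely, this example reads LZO-compressed JSON through Elephant Bird's
 * LzoJsonInputFormat and prints the decoded key/value maps; no case class is involved.)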
5 | */ 6 | package com.oreilly.learningsparkexamples.scala 7 | 8 | import scala.collection.JavaConversions._ 9 | import org.apache.spark._ 10 | import com.twitter.elephantbird.mapreduce.input.LzoJsonInputFormat 11 | import org.apache.hadoop.io.{LongWritable, MapWritable, Text, BooleanWritable} 12 | import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat, Job => NewHadoopJob} 13 | import java.util.HashMap 14 | 15 | object LoadJsonWithElephantBird { 16 | def main(args: Array[String]) { 17 | if (args.length < 2) { 18 | println("Usage: [sparkmaster] [inputfile]") 19 | exit(1) 20 | } 21 | val master = args(0) 22 | val inputFile = args(1) 23 | val sc = new SparkContext(master, "LoadJsonWithElephantBird", System.getenv("SPARK_HOME")) 24 | val conf = new NewHadoopJob().getConfiguration 25 | conf.set("io.compression.codecs","com.hadoop.compression.lzo.LzopCodec") 26 | conf.set("io.compression.codec.lzo.class", "com.hadoop.compression.lzo.LzoCodec") 27 | val input = sc.newAPIHadoopFile(inputFile, classOf[LzoJsonInputFormat], classOf[LongWritable], classOf[MapWritable], conf).map{case (x, y) => 28 | (x.get, y.entrySet().map{entry => 29 | (entry.getKey().asInstanceOf[Text].toString(), 30 | entry.getValue() match { 31 | case t: Text => t.toString() 32 | case b: BooleanWritable => b.get() 33 | case _ => throw new Exception("unexpected input") 34 | } 35 | )})} 36 | println(input.collect().toList) 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/LoadJsonWithSparkSQL.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates loading JSON data using Spark SQL 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | import org.apache.spark.sql.SQLContext 8 | 9 | 10 | object LoadJsonWithSparkSQL { 11 | def main(args: Array[String]) { 12 | if (args.length != 2) { 13 | println("Usage: [sparkmaster] [inputFile]") 14 | exit(1) 15 | } 16 | val master = args(0) 17 | val inputFile = args(1) 18 | val sc = new SparkContext(master, "LoadJsonWithSparkSQL", System.getenv("SPARK_HOME")) 19 | val sqlCtx = new SQLContext(sc) 20 | val input = sqlCtx.jsonFile(inputFile) 21 | input.printSchema() 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/LoadKeyValueTextInput.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple map partition to parse JSON data in Scala 3 | * Loads the data into a case class with the name and a boolean flag 4 | * if the person loves pandas. 
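 * (More precisely, this example loads key/value text with Hadoop's KeyValueTextInputFormat,
 * which by default splits each line on the first tab; no JSON parsing is involved.)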
5 | */ 6 | package com.oreilly.learningsparkexamples.scala 7 | 8 | import scala.collection.JavaConversions._ 9 | import org.apache.spark._ 10 | import org.apache.hadoop.mapred.KeyValueTextInputFormat 11 | import org.apache.hadoop.io.{MapWritable, Text} 12 | import java.util.HashMap 13 | 14 | object LoadKeyValueTextInput { 15 | def main(args: Array[String]) { 16 | if (args.length < 2) { 17 | println("Usage: [sparkmaster] [inputfile]") 18 | exit(1) 19 | } 20 | val master = args(0) 21 | val inputFile = args(1) 22 | val sc = new SparkContext(master, "LoadKeyValueTextInput", System.getenv("SPARK_HOME")) 23 | val input = sc.hadoopFile[Text, Text, KeyValueTextInputFormat](inputFile).map{ 24 | case (x, y) => (x.toString, y.toString) 25 | } 26 | println(input.collect().toList) 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/LoadSimpleJdbc.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates loading data over JDBC 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | import org.apache.spark.rdd.JdbcRDD 8 | import java.sql.{DriverManager, ResultSet} 9 | 10 | object LoadSimpleJdbc { 11 | def main(args: Array[String]) { 12 | if (args.length < 1) { 13 | println("Usage: [sparkmaster]") 14 | exit(1) 15 | } 16 | val master = args(0) 17 | val sc = new SparkContext(master, "LoadSimpleJdbc", System.getenv("SPARK_HOME")) 18 | val data = new JdbcRDD(sc, 19 | createConnection, "SELECT * FROM panda WHERE ? <= id AND ID <= ?", 20 | lowerBound = 1, upperBound = 3, numPartitions = 2, mapRow = extractValues) 21 | println(data.collect().toList) 22 | } 23 | 24 | def createConnection() = { 25 | Class.forName("com.mysql.jdbc.Driver").newInstance(); 26 | DriverManager.getConnection("jdbc:mysql://localhost/test?user=holden"); 27 | } 28 | 29 | def extractValues(r: ResultSet) = { 30 | (r.getInt(1), r.getString(2)) 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/MLlib.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.oreilly.learningsparkexamples.scala 19 | 20 | import org.apache.spark.{SparkConf, SparkContext} 21 | import org.apache.spark.mllib.classification.LogisticRegressionWithSGD 22 | import org.apache.spark.mllib.feature.HashingTF 23 | import org.apache.spark.mllib.regression.LabeledPoint 24 | 25 | object MLlib { 26 | 27 | def main(args: Array[String]) { 28 | val conf = new SparkConf().setAppName(s"Book example: Scala") 29 | val sc = new SparkContext(conf) 30 | 31 | // Load 2 types of emails from text files: spam and ham (non-spam). 32 | // Each line has text from one email. 33 | val spam = sc.textFile("files/spam.txt") 34 | val ham = sc.textFile("files/ham.txt") 35 | 36 | // Create a HashingTF instance to map email text to vectors of 100 features. 37 | val tf = new HashingTF(numFeatures = 100) 38 | // Each email is split into words, and each word is mapped to one feature. 39 | val spamFeatures = spam.map(email => tf.transform(email.split(" "))) 40 | val hamFeatures = ham.map(email => tf.transform(email.split(" "))) 41 | 42 | // Create LabeledPoint datasets for positive (spam) and negative (ham) examples. 43 | val positiveExamples = spamFeatures.map(features => LabeledPoint(1, features)) 44 | val negativeExamples = hamFeatures.map(features => LabeledPoint(0, features)) 45 | val trainingData = positiveExamples ++ negativeExamples 46 | trainingData.cache() // Cache data since Logistic Regression is an iterative algorithm. 47 | 48 | // Create a Logistic Regression learner which uses the LBFGS optimizer. 49 | val lrLearner = new LogisticRegressionWithSGD() 50 | // Run the actual learning algorithm on the training data. 51 | val model = lrLearner.run(trainingData) 52 | 53 | // Test on a positive example (spam) and a negative one (ham). 54 | // First apply the same HashingTF feature transformation used on the training data. 55 | val posTestExample = tf.transform("O M G GET cheap stuff by sending money to ...".split(" ")) 56 | val negTestExample = tf.transform("Hi Dad, I started studying Spark the other ...".split(" ")) 57 | // Now use the learned model to predict spam/ham for new emails. 58 | println(s"Prediction for positive test example: ${model.predict(posTestExample)}") 59 | println(s"Prediction for negative test example: ${model.predict(negTestExample)}") 60 | 61 | sc.stop() 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/MLlibPipeline.disabled_until_111: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.oreilly.learningsparkexamples.scala 19 | 20 | import org.apache.spark.{SparkConf, SparkContext} 21 | import org.apache.spark.sql.{Row, SQLContext} 22 | import org.apache.spark.ml.Pipeline 23 | import org.apache.spark.ml.classification.LogisticRegression 24 | import org.apache.spark.ml.feature.{HashingTF, Tokenizer} 25 | 26 | object MLlibPipeline { 27 | 28 | case class Document(id: Long, text: String) 29 | 30 | case class LabeledDocument(id: Long, text: String, label: Double) 31 | 32 | def main(args: Array[String]) { 33 | val conf = new SparkConf().setAppName("BookExamplePipeline") 34 | val sc = new SparkContext(conf) 35 | val sqlContext = new SQLContext(sc) 36 | import sqlContext._ 37 | 38 | // Load 2 types of emails from text files: spam and ham (non-spam). 39 | // Each line has text from one email. 40 | val spam = sc.textFile("files/spam.txt") 41 | val ham = sc.textFile("files/ham.txt") 42 | 43 | // Create LabeledPoint datasets for positive (spam) and negative (ham) examples. 44 | val positiveExamples = spam.zipWithIndex().map { case (email, index) => 45 | LabeledDocument(index, email, 1.0) 46 | } 47 | val negativeExamples = ham.zipWithIndex().map { case (email, index) => 48 | LabeledDocument(index, email, 0.0) 49 | } 50 | val trainingData = positiveExamples ++ negativeExamples 51 | 52 | // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. 53 | // Each stage outputs a column in a SchemaRDD and feeds it to the next stage's input column. 54 | val tokenizer = new Tokenizer() // Splits each email into words 55 | .setInputCol("text") 56 | .setOutputCol("words") 57 | val hashingTF = new HashingTF() // Maps email words to vectors of 100 features. 58 | .setNumFeatures(100) 59 | .setInputCol(tokenizer.getOutputCol) 60 | .setOutputCol("features") 61 | val lr = new LogisticRegression() // LogisticRegression uses inputCol "features" by default. 62 | val pipeline = new Pipeline() 63 | .setStages(Array(tokenizer, hashingTF, lr)) 64 | 65 | // Fit the pipeline to training documents. 66 | // RDDs of case classes work well with Pipelines since Spark SQL can infer a schema from 67 | // case classes and convert the data into a SchemaRDD. 68 | val model = pipeline.fit(trainingData) 69 | 70 | // Make predictions on test documents. 71 | // The fitted model automatically transforms features using Tokenizer and HashingTF. 
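    // transform() adds the intermediate "words" and "features" columns plus the final
    // "prediction" column to the test SchemaRDD; only 'id and 'prediction are selected and collected below.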
72 | val testData = sc.parallelize(Seq( 73 | Document(0, "O M G GET cheap stuff by sending money to ..."), // positive example (spam) 74 | Document(1, "Hi Dad, I started studying Spark the other ...") // negative example (ham) 75 | )) 76 | val predictions = model.transform(testData) 77 | .select('id, 'prediction).collect() 78 | .map { case Row(id, prediction) => (id, prediction) }.toMap 79 | println(s"Prediction for positive test example: ${predictions(0)}") 80 | println(s"Prediction for negative test example: ${predictions(1)}") 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/PerKeyAvg.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple fold in scala 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | import org.apache.spark.SparkContext._ 8 | 9 | object PerKeyAvg { 10 | def main(args: Array[String]) { 11 | val master = args.length match { 12 | case x: Int if x > 0 => args(0) 13 | case _ => "local" 14 | } 15 | 16 | val sc = new SparkContext(master, "PerKeyAvg", System.getenv("SPARK_HOME")) 17 | val input = sc.parallelize(List(("coffee", 1) , ("coffee", 2) , ("panda", 4))) 18 | val result = input.combineByKey( 19 | (v) => (v, 1), 20 | (acc: (Int, Int), v) => (acc._1 + v, acc._2 + 1), 21 | (acc1: (Int, Int), acc2: (Int, Int)) => (acc1._1 + acc2._1, acc1._2 + acc2._2) 22 | // Note: we could us mapValues here, but we didn't because it was in the next section 23 | ).map{ case (key, value) => (key, value._1 / value._2.toFloat) } 24 | result.collectAsMap().map(println(_)) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/PipeExample.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple use of pipe to call a perl program from Spark 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | 8 | object PipeExample { 9 | def main(args: Array[String]) { 10 | val master = args.length match { 11 | case x: Int if x > 0 => args(0) 12 | case _ => "local" 13 | } 14 | val sc = new SparkContext(master, "PipeExample", System.getenv("SPARK_HOME")) 15 | val rdd = sc.parallelize(Array( 16 | "37.75889318222431,-122.42683635321838,37.7614213,-122.4240097", 17 | "37.7519528,-122.4208689,37.8709087,-122.2688365")) 18 | 19 | // adds our script to a list of files for each node to download with this job 20 | val distScript = "/home/holden/repos/learning-spark-examples/src/R/finddistance.R" 21 | sc.addFile(distScript) 22 | 23 | val piped = rdd.pipe(Seq(SparkFiles.get(distScript)), 24 | Map("SEPARATOR" -> ",")) 25 | val result = piped.collect 26 | 27 | println(result.mkString(" ")) 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/RemoveOutliers.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates remove outliers in Scala using summary Stats 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | import org.apache.spark.SparkContext._ 8 | import org.apache.spark.rdd.RDD 9 | 10 | object RemoveOutliers { 11 | def main(args: Array[String]) { 12 | val master = args.length match { 13 | case x: 
Int if x > 0 => args(0) 14 | case _ => "local" 15 | } 16 | val sc = new SparkContext(master, "RemoveOutliers", System.getenv("SPARK_HOME")) 17 | val input = sc.parallelize(List(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1000)).map(_.toDouble) 18 | val result = removeOutliers(input) 19 | println(result.collect().mkString(",")) 20 | } 21 | def removeOutliers(rdd: RDD[Double]): RDD[Double] = { 22 | val summaryStats = rdd.stats() 23 | val stddev = math.sqrt(summaryStats.variance) 24 | rdd.filter(x => math.abs(x-summaryStats.mean) < 3 * stddev) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/SparkSQLTwitter.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Load some tweets stored as JSON data and explore them. 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | import org.apache.spark.SparkContext._ 8 | import org.apache.spark.sql.hive.HiveContext 9 | 10 | 11 | case class HappyPerson(handle: String, favouriteBeverage: String) 12 | 13 | object SparkSQLTwitter { 14 | def main(args: Array[String]) { 15 | if (args.length < 2) { 16 | println("Usage inputFile outputFile [spark.sql.inMemoryColumnarStorage.batchSize]") 17 | } 18 | val inputFile = args(0) 19 | val outputFile = args(1) 20 | val batchSize = if (args.length == 3) { 21 | args(2) 22 | } else { 23 | "200" 24 | } 25 | val conf = new SparkConf() 26 | conf.set("spark.sql.codegen", "false") 27 | conf.set("spark.sql.inMemoryColumnarStorage.batchSize", batchSize) 28 | val sc = new SparkContext(conf) 29 | val hiveCtx = new HiveContext(sc) 30 | import hiveCtx.implicits._ 31 | // Load some tweets 32 | val input = hiveCtx.jsonFile(inputFile) 33 | // Print the schema 34 | input.printSchema() 35 | // Register the input schema RDD 36 | input.registerTempTable("tweets") 37 | hiveCtx.cacheTable("tweets") 38 | // Select tweets based on the retweetCount 39 | val topTweets = hiveCtx.sql("SELECT text, retweetCount FROM tweets ORDER BY retweetCount LIMIT 10") 40 | topTweets.collect().map(println(_)) 41 | val topTweetText = topTweets.map(row => row.getString(0)) 42 | // Create a person and turn it into a Schema RDD 43 | val happyPeopleRDD = sc.parallelize(List(HappyPerson("holden", "coffee"))).toDF() 44 | happyPeopleRDD.registerTempTable("happy_people") 45 | // UDF 46 | hiveCtx.udf.register("strLenScala", (_: String).length) 47 | val tweetLength = hiveCtx.sql("SELECT strLenScala('tweet') FROM tweets LIMIT 10") 48 | tweetLength.collect().map(println(_)) 49 | // Two sums at once (crazy town!) 
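    // A rough DataFrame-API equivalent of the SQL below (a sketch only, assuming
    // Spark 1.3+ where jsonFile returns a DataFrame; not part of the original example):
    //   import org.apache.spark.sql.functions.sum
    //   input.groupBy("user.id")
    //     .agg(sum("user.favouritesCount"), sum("retweetCount"))
    //     .show(10)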
50 | val twoSums = hiveCtx.sql("SELECT SUM(user.favouritesCount), SUM(retweetCount), user.id FROM tweets GROUP BY user.id LIMIT 10") 51 | twoSums.collect().map(println(_)) 52 | sc.stop() 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/StreamingLogInput.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple streaming application 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | import org.apache.spark.SparkContext._ 8 | import org.apache.spark.streaming._ 9 | import org.apache.spark.streaming.dstream._ 10 | 11 | object StreamingLogInput { 12 | def main(args: Array[String]) { 13 | val master = args(0) 14 | val conf = new SparkConf().setMaster(master).setAppName("StreamingLogInput") 15 | // Create a StreamingContext with a 1 second batch size 16 | val ssc = new StreamingContext(conf, Seconds(1)) 17 | // Create a DStream from all the input on port 7777 18 | val lines = ssc.socketTextStream("localhost", 7777) 19 | val errorLines = processLines(lines) 20 | // Print out the lines with errors, which causes this DStream to be evaluated 21 | errorLines.print() 22 | // start our streaming context and wait for it to "finish" 23 | ssc.start() 24 | // Wait for 10 seconds then exit. To run forever call without a timeout 25 | ssc.awaitTermination(10000) 26 | ssc.stop() 27 | } 28 | def processLines(lines: DStream[String]) = { 29 | // Filter our DStream for lines with "error" 30 | lines.filter(_.contains("error")) 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/WordCount.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates flatMap + countByValue for wordcount. 
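 * An optional first argument sets the master (default "local") and an optional
 * second argument gives an input file; if a third argument is supplied the counts
 * are computed with reduceByKey and saved as text files there, otherwise the
 * countByValue results are printed.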
3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | import org.apache.spark.SparkContext._ 8 | 9 | object WordCount { 10 | def main(args: Array[String]) { 11 | val master = args.length match { 12 | case x: Int if x > 0 => args(0) 13 | case _ => "local" 14 | } 15 | val sc = new SparkContext(master, "WordCount", System.getenv("SPARK_HOME")) 16 | val input = args.length match { 17 | case x: Int if x > 1 => sc.textFile(args(1)) 18 | case _ => sc.parallelize(List("pandas", "i like pandas")) 19 | } 20 | val words = input.flatMap(line => line.split(" ")) 21 | args.length match { 22 | case x: Int if x > 2 => { 23 | val counts = words.map(word => (word, 1)).reduceByKey{case (x,y) => x + y} 24 | counts.saveAsTextFile(args(2)) 25 | } 26 | case _ => { 27 | val wc = words.countByValue() 28 | println(wc.mkString(",")) 29 | } 30 | } 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/WriteSimpleDB.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates writing data over JDBC 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | import org.apache.spark.SparkContext._ 8 | import org.apache.spark.rdd.JdbcRDD 9 | import java.sql.{PreparedStatement, DriverManager, ResultSet} 10 | import org.apache.hadoop.mapred.lib.db._ 11 | import org.apache.hadoop.mapred.JobConf 12 | 13 | object WriteSimpleDB { 14 | def main(args: Array[String]) { 15 | if (args.length < 1) { 16 | println("Usage: [sparkmaster]") 17 | exit(1) 18 | } 19 | val master = args(0) 20 | val sc = new SparkContext(master, "WriteSimpleJdbc", System.getenv("SPARK_HOME")) 21 | val data = sc.parallelize(List(("cat1", 1))) 22 | // foreach partition method 23 | data.foreachPartition{records => 24 | records.foreach(record => println("fake db write")) 25 | } 26 | // DBOutputFormat approach 27 | val records = data.map(e => (catRecord(e._1, e._2), null)) 28 | val tableName = "table" 29 | val fields = Array("name", "age") 30 | val jobConf = new JobConf() 31 | DBConfiguration.configureDB(jobConf, "com.mysql.jdbc.Driver", "jdbc:mysql://localhost/test?user=holden") 32 | DBOutputFormat.setOutput(jobConf, tableName, fields:_*) 33 | records.saveAsHadoopDataset(jobConf) 34 | } 35 | case class catRecord(name: String, age: Int) extends DBWritable { 36 | override def write(s: PreparedStatement) { 37 | s.setString(1, name) 38 | s.setInt(2, age) 39 | } 40 | override def readFields(r: ResultSet) = { 41 | // blank since only used for writing 42 | } 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/logs/LogAnalyzerAppMain.scala: -------------------------------------------------------------------------------- 1 | package com.oreilly.learningsparkexamples.scala.logs; 2 | 3 | import org.apache.spark._ 4 | import org.apache.spark.streaming._ 5 | import org.apache.spark.streaming.StreamingContext._ 6 | import org.apache.spark.streaming.dstream._ 7 | import com.oreilly.learningsparkexamples.java.logs.ApacheAccessLog 8 | 9 | /** 10 | * The LogAnalyzerAppMain is an sample logs analysis application. For now, 11 | * it is a simple minimal viable product: 12 | * - Read in new log files from a directory and input those new files into streaming. 
13 |  * - Computes stats for all time as well as for the last time interval, based on those logs.
14 |  * - Writes the calculated stats to a text file on the local file system
15 |  *   that gets refreshed every time interval.
16 |  *
17 |  * Once you get this program up and running, feed apache access log files
18 |  * into the local directory of your choosing.
19 |  *
20 |  * Then open your output text file, perhaps in a web browser, and refresh
21 |  * that page to see more stats come in.
22 |  *
23 |  * Modify the command line flags to the values of your choosing.
24 |  * Notice how they come after you specify the jar when using spark-submit.
25 |  *
26 |  * Example command to run:
27 |  * % ${YOUR_SPARK_HOME}/bin/spark-submit
28 |  *   --class "com.oreilly.learningsparkexamples.scala.logs.LogAnalyzerAppMain"
29 |  *   --master local[4]
30 |  *   target/uber-log-analyzer-1.0.jar
31 |  *   --logs_directory /tmp/logs
32 |  *   --output_html_file /tmp/log_stats.html
33 |  *   --index_html_template ./src/main/resources/index.html.template
34 |  */
35 | case class Config(WindowLength: Int = 3000, SlideInterval: Int = 1000, LogsDirectory: String = "/tmp/logs",
36 |                   CheckpointDirectory: String = "/tmp/checkpoint",
37 |                   OutputHTMLFile: String = "/tmp/log_stats.html",
38 |                   OutputDirectory: String = "/tmp/outpandas",
39 |                   IndexHTMLTemplate: String = "./src/main/resources/index.html.template") {
40 |   def getWindowDuration() = {
41 |     new Duration(WindowLength)
42 |   }
43 |   def getSlideDuration() = {
44 |     new Duration(SlideInterval)
45 |   }
46 | }
47 | 
48 | object LogAnalyzerAppMain {
49 | 
50 |   def main(args: Array[String]) {
51 |     val parser = new scopt.OptionParser[Config]("LogAnalyzerAppMain") {
52 |       head("LogAnalyzer", "0.1")
53 |       opt[Int]('w', "window_length") text("size of the window as an integer in milliseconds")
54 |       opt[Int]('s', "slide_interval") text("size of the slide interval as an integer in milliseconds")
55 |       opt[String]('l', "logs_directory") text("location of the logs directory. If you don't have any logs, use the fakelogs_dir script.")
56 |       opt[String]('c', "checkpoint_directory") text("location of the checkpoint directory.")
57 |       opt[String]('o', "output_directory") text("location of the output directory.")
58 |     }
59 |     val opts = parser.parse(args, new Config()).get
60 |     // Set up the Spark conf.
61 |     val conf = new SparkConf()
62 |       .setAppName("A Databricks Reference Application: Logs Analysis with Spark")
63 |     val ssc = new StreamingContext(conf, opts.getWindowDuration())
64 |     // Checkpointing must be enabled to use the updateStateByKey function & windowed operations.
65 |     ssc.checkpoint(opts.CheckpointDirectory)
66 |     // This method monitors a directory for new files to read in for streaming.
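   |     // (Only files newly written to, or atomically moved into, this directory after
   |     // the stream starts are picked up; the fakelogs_dir script mentioned above can
   |     // generate such files for testing.)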
67 |     val logDirectory = opts.LogsDirectory
68 |     val logData = ssc.textFileStream(logDirectory)
69 |     val accessLogDStream = logData.map(line => ApacheAccessLog.parseFromLogLine(line)).cache()
70 |     LogAnalyzerTotal.processAccessLogs(accessLogDStream)
71 |     LogAnalyzerWindowed.processAccessLogs(accessLogDStream, opts)
   |     // Start the streaming computation and block until it terminates;
   |     // without start() none of the DStream operations above ever run.
   |     ssc.start()
   |     ssc.awaitTermination()
72 |   }
73 | }
74 | 
--------------------------------------------------------------------------------
/src/main/scala/com/oreilly/learningsparkexamples/scala/logs/LogAnalyzerTotal.scala:
--------------------------------------------------------------------------------
 1 | package com.oreilly.learningsparkexamples.scala.logs;
 2 | 
 3 | import org.apache.spark._
 4 | import org.apache.spark.SparkContext._
 5 | import org.apache.spark.streaming._
 6 | import org.apache.spark.streaming.StreamingContext._
 7 | import org.apache.spark.streaming.dstream._
 8 | import com.oreilly.learningsparkexamples.java.logs.ApacheAccessLog
 9 | 
10 | /**
11 |  * Compute totals on the log input
12 |  */
13 | object LogAnalyzerTotal {
14 |   def computeRunningSum(values: Seq[Long], state: Option[Long]) = {
15 |     Some(values.reduce((x, y) => x + y) + state.getOrElse(0L))
16 |   }
17 |   def processAccessLogs(accessLogsDStream: DStream[ApacheAccessLog]) {
18 |     val ipDStream = accessLogsDStream.map(entry => (entry.getIpAddress(), 1))
19 |     val ipCountsDStream = ipDStream.reduceByKey((x, y) => x + y)
20 |     ipCountsDStream.print()
21 |     // The same per-IP count, computed with transform()
22 |     val ipRawDStream = accessLogsDStream.transform{
23 |       rdd => rdd.map(accessLog => (accessLog.getIpAddress(), 1)).reduceByKey(
24 |         (x, y) => x + y)
25 |     }
26 |     ipRawDStream.print()
27 |     // Bytes transferred per IP address
28 |     val ipBytesDStream = accessLogsDStream.map(entry => (entry.getIpAddress(), entry.getContentSize()))
29 |     val ipBytesSumDStream = ipBytesDStream.reduceByKey((x, y) => x + y)
30 |     val ipBytesRequestCountDStream = ipRawDStream.join(ipBytesSumDStream)
31 |     ipBytesRequestCountDStream.print()
32 |     val responseCodeDStream = accessLogsDStream.map(log => (log.getResponseCode(), 1L))
33 |     val responseCodeCountDStream = responseCodeDStream.updateStateByKey(computeRunningSum _)
34 |   }
35 | }
36 | 
--------------------------------------------------------------------------------
/src/main/scala/com/oreilly/learningsparkexamples/scala/logs/LogAnalyzerWindowed.scala:
--------------------------------------------------------------------------------
 1 | package com.oreilly.learningsparkexamples.scala.logs;
 2 | 
 3 | import org.apache.spark._
 4 | import org.apache.spark.rdd._
 5 | import org.apache.spark.SparkContext._
 6 | import org.apache.spark.streaming._
 7 | import org.apache.spark.streaming.StreamingContext._
 8 | import org.apache.spark.streaming.dstream._
 9 | import com.oreilly.learningsparkexamples.java.logs.ApacheAccessLog
10 | import org.apache.hadoop.mapred.SequenceFileOutputFormat;
11 | import org.apache.hadoop.io.{ArrayWritable, BooleanWritable, BytesWritable, DoubleWritable, FloatWritable, IntWritable, LongWritable, NullWritable, Text, Writable}
12 | 
13 | /**
14 |  * Computes various pieces of information over a sliding window of the log input
15 |  */
16 | object LogAnalyzerWindowed {
17 |   def responseCodeCount(accessLogRDD: RDD[ApacheAccessLog]) = {
18 |     accessLogRDD.map(log => (log.getResponseCode(), 1)).reduceByKey((x, y) => x + y)
19 |   }
20 | 
21 |   def processAccessLogs(accessLogsDStream: DStream[ApacheAccessLog], opts: Config) {
22 |     val ipDStream = accessLogsDStream.map{entry => entry.getIpAddress()}
23 |     val ipAddressRequestCount = ipDStream.countByValueAndWindow(
24 | opts.getWindowDuration(), opts.getSlideDuration()) 25 | ipAddressRequestCount.saveAsTextFiles(opts.OutputDirectory + "/ipAddressRequestCountsTXT") 26 | val writableIpAddressRequestCount = ipAddressRequestCount.map{case (ip, count) => 27 | (new Text(ip), new LongWritable(count))} 28 | writableIpAddressRequestCount.saveAsHadoopFiles[SequenceFileOutputFormat[Text, LongWritable]]( 29 | opts.OutputDirectory + "/ipAddressRequestCounts", "pandas") 30 | val requestCount = accessLogsDStream.countByWindow(opts.getWindowDuration(), opts.getSlideDuration()) 31 | requestCount.print() 32 | ipAddressRequestCount.print() 33 | val accessLogsWindow = accessLogsDStream.window( 34 | opts.getWindowDuration(), opts.getSlideDuration()) 35 | accessLogsWindow.transform(rdd => responseCodeCount(rdd)).print() 36 | // compute the visit counts for IP address in a window 37 | val ipPairDStream = accessLogsDStream.map(logEntry => (logEntry.getIpAddress(), 1)) 38 | val ipCountDStream = ipPairDStream.reduceByKeyAndWindow( 39 | {(x, y) => x + y}, // Adding elements in the new slice 40 | {(x, y) => x - y}, // Removing elements from the oldest slice 41 | opts.getWindowDuration(), // Window duration 42 | opts.getSlideDuration() // slide duration 43 | ) 44 | ipCountDStream.print() 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/logs/ReadTransferStats.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates reading in transfer statistics. 3 | */ 4 | package com.oreilly.learningsparkexamples.scala.logs 5 | 6 | import org.apache.spark._ 7 | import org.apache.spark.SparkContext._ 8 | import org.apache.spark.streaming._ 9 | import org.apache.spark.streaming.dstream._ 10 | 11 | import org.apache.hadoop.io.Writable 12 | import org.apache.hadoop.io.IntWritable 13 | import org.apache.hadoop.io.LongWritable 14 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat 15 | 16 | 17 | object ReadTransferStats { 18 | def readStats(ssc: StreamingContext, inputDirectory: String): DStream[(Long, Int)] = { 19 | // convert the input from Writables to native types 20 | ssc.fileStream[LongWritable, IntWritable, 21 | SequenceFileInputFormat[LongWritable, IntWritable]](inputDirectory).map{ 22 | case (x, y) => (x.get(), y.get()) 23 | } 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/perl/splitwords.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | # This is a simple perl example of how to work with pipe interface 6 | # Here we read in each line and ouput the corresponding words 7 | # This is equivelent to rdd.flatMap(_.split($SEPARATOR)) 8 | while (my $line = <>) { 9 | chomp ($line); 10 | my @words = split($ENV{'SEPARATOR'}, $line); 11 | foreach my $word (@words) { 12 | print $word."\n"; 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /src/python/AvgMapPartitions.py: -------------------------------------------------------------------------------- 1 | """ 2 | >>> from pyspark.context import SparkContext 3 | >>> sc = SparkContext('local', 'test') 4 | >>> b = sc.parallelize([1, 2, 3, 4]) 5 | >>> avg(b) 6 | 2.5 7 | """ 8 | 9 | import sys 10 | 11 | from pyspark import SparkContext 12 | 13 | 14 | def partitionCtr(nums): 15 | """Compute sumCounter for partition""" 16 | 
sumCount = [0, 0] 17 | for num in nums: 18 | sumCount[0] += num 19 | sumCount[1] += 1 20 | return [sumCount] 21 | 22 | 23 | def combineCtrs(c1, c2): 24 | return (c1[0] + c2[0], c1[1] + c2[1]) 25 | 26 | 27 | def basicAvg(nums): 28 | """Compute the avg""" 29 | sumCount = nums.mapPartitions(partitionCtr).reduce(combineCtrs) 30 | return sumCount[0] / float(sumCount[1]) 31 | 32 | if __name__ == "__main__": 33 | cluster = "local" 34 | if len(sys.argv) == 2: 35 | cluster = sys.argv[1] 36 | sc = SparkContext(cluster, "Sum") 37 | nums = sc.parallelize([1, 2, 3, 4]) 38 | avg = basicAvg(nums) 39 | print avg 40 | -------------------------------------------------------------------------------- /src/python/BasicAvg.py: -------------------------------------------------------------------------------- 1 | """ 2 | >>> from pyspark.context import SparkContext 3 | >>> sc = SparkContext('local', 'test') 4 | >>> b = sc.parallelize([1, 2, 3, 4]) 5 | >>> basicAvg(b) 6 | 2.5 7 | """ 8 | 9 | import sys 10 | 11 | from pyspark import SparkContext 12 | 13 | 14 | def basicAvg(nums): 15 | """Compute the avg""" 16 | sumCount = nums.map(lambda x: (x, 1)).fold( 17 | (0, 0), (lambda x, y: (x[0] + y[0], x[1] + y[1]))) 18 | return sumCount[0] / float(sumCount[1]) 19 | 20 | if __name__ == "__main__": 21 | master = "local" 22 | if len(sys.argv) == 2: 23 | master = sys.argv[1] 24 | sc = SparkContext(master, "Sum") 25 | nums = sc.parallelize([1, 2, 3, 4]) 26 | avg = basicAvg(nums) 27 | print avg 28 | -------------------------------------------------------------------------------- /src/python/BasicFilterMap.py: -------------------------------------------------------------------------------- 1 | """ 2 | >>> from pyspark.context import SparkContext 3 | >>> sc = SparkContext('local', 'test') 4 | >>> b = sc.parallelize([1, 2, 3, 4]) 5 | >>> sorted(basicSquareNoOnes(b).collect()) 6 | [4, 9, 16] 7 | """ 8 | 9 | import sys 10 | 11 | from pyspark import SparkContext 12 | 13 | 14 | def basicSquareNoOnes(nums): 15 | """Square the numbers""" 16 | return nums.map(lambda x: x * x).filter(lambda x: x != 1) 17 | 18 | if __name__ == "__main__": 19 | master = "local" 20 | if len(sys.argv) == 2: 21 | master = sys.argv[1] 22 | sc = SparkContext(master, "BasicFilterMap") 23 | nums = sc.parallelize([1, 2, 3, 4]) 24 | output = sorted(basicSquareNoOnes(nums).collect()) 25 | for num in output: 26 | print "%i " % (num) 27 | -------------------------------------------------------------------------------- /src/python/BasicKeyValueMapFilter.py: -------------------------------------------------------------------------------- 1 | """ 2 | >>> from pyspark.context import SparkContext 3 | >>> sc = SparkContext('local', 'test') 4 | >>> input = ["coffee", "i really like coffee", "coffee > magic"] 5 | >>> b = sc.parallelize(input) 6 | >>> sorted(basicKeyValueMapFilter(b).collect()) 7 | [4, 9] 8 | """ 9 | 10 | import sys 11 | 12 | from pyspark import SparkContext 13 | 14 | 15 | def basicKeyValueMapFilter(input): 16 | """Construct a key/value RDD and then filter on the value""" 17 | return input.map(lambda x: (x.split(" ")[0], x)).filter( 18 | lambda x: len(x[1]) < 20) 19 | 20 | if __name__ == "__main__": 21 | master = "local" 22 | if len(sys.argv) == 2: 23 | master = sys.argv[1] 24 | sc = SparkContext(master, "BasicFilterMap") 25 | input = sc.parallelize( 26 | ["coffee", "i really like coffee", "coffee > magic", "panda < coffee"]) 27 | output = sorted(basicKeyValueMapFilter(input).collect()) 28 | for elem in output: 29 | print elem 30 | 
-------------------------------------------------------------------------------- /src/python/BasicMap.py: -------------------------------------------------------------------------------- 1 | """ 2 | >>> from pyspark.context import SparkContext 3 | >>> sc = SparkContext('local', 'test') 4 | >>> b = sc.parallelize([1, 2, 3, 4]) 5 | >>> sorted(basicSquare(b).collect()) 6 | [1, 4, 9, 16] 7 | """ 8 | 9 | import sys 10 | 11 | from pyspark import SparkContext 12 | 13 | 14 | def basicSquare(nums): 15 | """Square the numbers""" 16 | return nums.map(lambda x: x * x) 17 | 18 | if __name__ == "__main__": 19 | master = "local" 20 | if len(sys.argv) == 2: 21 | master = sys.argv[1] 22 | sc = SparkContext(master, "BasicMap") 23 | nums = sc.parallelize([1, 2, 3, 4]) 24 | output = sorted(basicSquare(nums).collect()) 25 | for num in output: 26 | print "%i " % (num) 27 | -------------------------------------------------------------------------------- /src/python/BasicMapPartitions.py: -------------------------------------------------------------------------------- 1 | """ 2 | >>> from pyspark.context import SparkContext 3 | >>> sc = SparkContext('local', 'test') 4 | >>> b = sc.parallelize(["KK6JKQ", "Ve3UoW", "kk6jlk", "W6BB"]) 5 | >>> fetchCallSigns(b).size() 6 | 4 7 | """ 8 | 9 | import sys 10 | import urllib3 11 | 12 | from pyspark import SparkContext 13 | 14 | 15 | def processCallSigns(signs): 16 | """Process call signs""" 17 | http = urllib3.PoolManager() 18 | requests = map( 19 | lambda x: http.request('GET', "http://qrzcq.com/call/" + x), signs) 20 | return map(lambda x: x.data, requests) 21 | 22 | 23 | def fetchCallSigns(input): 24 | """Fetch call signs""" 25 | return input.mapPartitions(lambda callSigns: processCallSigns(callSigns)) 26 | 27 | if __name__ == "__main__": 28 | master = "local" 29 | if len(sys.argv) == 2: 30 | master = sys.argv[1] 31 | sc = SparkContext(master, "BasicMapPartitions") 32 | input = sc.parallelize(["KK6JKQ", "Ve3UoW", "kk6jlk", "W6BB"]) 33 | output = sorted(fetchCallSigns(input).collect()) 34 | for str in output: 35 | print "%s " % (str) 36 | -------------------------------------------------------------------------------- /src/python/BasicSum.py: -------------------------------------------------------------------------------- 1 | """ 2 | >>> from pyspark.context import SparkContext 3 | >>> sc = SparkContext('local', 'test') 4 | >>> b = sc.parallelize([1, 2, 3, 4]) 5 | >>> basicSum(b) 6 | 10 7 | """ 8 | 9 | import sys 10 | 11 | from pyspark import SparkContext 12 | 13 | 14 | def basicSum(nums): 15 | """Sum the numbers""" 16 | return nums.fold(0, (lambda x, y: x + y)) 17 | 18 | if __name__ == "__main__": 19 | master = "local" 20 | if len(sys.argv) == 2: 21 | master = sys.argv[1] 22 | sc = SparkContext(master, "Sum") 23 | nums = sc.parallelize([1, 2, 3, 4]) 24 | output = basicSum(nums) 25 | print output 26 | -------------------------------------------------------------------------------- /src/python/ChapterSixExample.py: -------------------------------------------------------------------------------- 1 | """Contains the Chapter 6 Example illustrating accumulators, broadcast 2 | variables, numeric operations, and pipe.""" 3 | import bisect 4 | import re 5 | import sys 6 | import urllib3 7 | import json 8 | import math 9 | import os 10 | 11 | from pyspark import SparkContext 12 | from pyspark import SparkFiles 13 | 14 | sparkMaster = sys.argv[1] 15 | inputFile = sys.argv[2] 16 | outputDir = sys.argv[3] 17 | 18 | sc = SparkContext(sparkMaster, appName="ChapterSixExample") 19 | 
file = sc.textFile(inputFile) 20 | 21 | # Count lines with KK6JKQ using accumulators 22 | count = sc.accumulator(0) 23 | 24 | 25 | def incrementCounter(line): 26 | global count # Access the counter 27 | if "KK6JKQ" in line: 28 | count += 1 29 | 30 | file.foreach(incrementCounter) 31 | print "Lines with KK6JKQ %d" % count.value 32 | 33 | 34 | # Create Accumulator[Int] initialized to 0 35 | blankLines = sc.accumulator(0) 36 | dataLines = sc.accumulator(0) 37 | 38 | 39 | def extractCallSigns(line): 40 | global blankLines, dataLines # Access the counters 41 | if (line == ""): 42 | blankLines += 1 43 | return line.split(" ") 44 | 45 | callSigns = file.flatMap(extractCallSigns) 46 | callSigns.saveAsTextFile(outputDir + "/callsigns") 47 | print "Blank lines %d" % blankLines.value 48 | 49 | # Create Accumulators for validating call signs 50 | validSignCount = sc.accumulator(0) 51 | invalidSignCount = sc.accumulator(0) 52 | 53 | 54 | def validateSign(sign): 55 | global validSignCount, invalidSignCount 56 | if re.match(r"\A\d?[a-zA-Z]{1,2}\d{1,4}[a-zA-Z]{1,3}\Z", sign): 57 | validSignCount += 1 58 | return True 59 | else: 60 | invalidSignCount += 1 61 | return False 62 | 63 | validSigns = callSigns.filter(validateSign) 64 | contactCounts = validSigns.map( 65 | lambda sign: (sign, 1)).reduceByKey((lambda x, y: x + y)) 66 | # Force evaluation so the counters are populated 67 | contactCounts.count() 68 | if invalidSignCount.value < 0.1 * validSignCount.value: 69 | contactCounts.saveAsTextFile(outputDir + "/contactCount") 70 | else: 71 | print ("Too many errors %d in %d" % 72 | (invalidSignCount.value, validSignCount.value)) 73 | 74 | # Helper functions for looking up the call signs 75 | 76 | 77 | def lookupCountry(sign, prefixes): 78 | pos = bisect.bisect_left(prefixes, sign) 79 | return prefixes[pos].split(",")[1] 80 | 81 | 82 | def loadCallSignTable(): 83 | f = open("./files/callsign_tbl_sorted", "r") 84 | return f.readlines() 85 | 86 | # Lookup the locations of the call signs on the 87 | # RDD contactCounts. We load a list of call sign 88 | # prefixes to country code to support this lookup. 
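# A broadcast variable ships the table to each worker node once and keeps it
# read-only there; tasks then read it through signPrefixes.value rather than
# capturing the whole table in every task's closure.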
89 | signPrefixes = sc.broadcast(loadCallSignTable()) 90 | 91 | 92 | def processSignCount(sign_count, signPrefixes): 93 | country = lookupCountry(sign_count[0], signPrefixes.value) 94 | count = sign_count[1] 95 | return (country, count) 96 | 97 | countryContactCounts = (contactCounts 98 | .map(lambda signCount: processSignCount(signCount, signPrefixes)) 99 | .reduceByKey((lambda x, y: x + y))) 100 | 101 | countryContactCounts.saveAsTextFile(outputDir + "/countries.txt") 102 | 103 | # Query 73s for the call signs CallLogs and parse the personse 104 | 105 | 106 | def processCallSigns(signs): 107 | """Lookup call signs using a connection pool""" 108 | # Create a connection pool 109 | http = urllib3.PoolManager() 110 | # the URL associated with each call sign record 111 | urls = map(lambda x: "http://73s.com/qsos/%s.json" % x, signs) 112 | # create the requests (non-blocking) 113 | requests = map(lambda x: (x, http.request('GET', x)), urls) 114 | # fetch the results 115 | result = map(lambda x: (x[0], json.loads(x[1].data)), requests) 116 | # remove any empty results and return 117 | return filter(lambda x: x[1] is not None, result) 118 | 119 | 120 | def fetchCallSigns(input): 121 | """Fetch call signs""" 122 | return input.mapPartitions(lambda callSigns: processCallSigns(callSigns)) 123 | 124 | contactsContactList = fetchCallSigns(validSigns) 125 | 126 | # Compute the distance of each call using an external R program 127 | distScript = os.getcwd()+"/src/R/finddistance.R" 128 | distScriptName = "finddistance.R" 129 | sc.addFile(distScript) 130 | 131 | 132 | def hasDistInfo(call): 133 | """Verify that a call has the fields required to compute the distance""" 134 | requiredFields = ["mylat", "mylong", "contactlat", "contactlong"] 135 | return all(map(lambda f: call[f], requiredFields)) 136 | 137 | 138 | def formatCall(call): 139 | """Format a call so that it can be parsed by our R program""" 140 | return "{0},{1},{2},{3}".format( 141 | call["mylat"], call["mylong"], 142 | call["contactlat"], call["contactlong"]) 143 | 144 | pipeInputs = contactsContactList.values().flatMap( 145 | lambda calls: map(formatCall, filter(hasDistInfo, calls))) 146 | distances = pipeInputs.pipe(SparkFiles.get(distScriptName)) 147 | print distances.collect() 148 | # Convert our RDD of strings to numeric data so we can compute stats and 149 | # remove the outliers. 
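# stats() computes count, mean, and variance of the RDD in a single pass; values
# more than three standard deviations from the mean are dropped as outliers.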
150 | distanceNumerics = distances.map(lambda string: float(string)) 151 | stats = distanceNumerics.stats() 152 | stddev = stats.stdev() 153 | mean = stats.mean() 154 | reasonableDistances = distanceNumerics.filter( 155 | lambda x: math.fabs(x - mean) < 3 * stddev) 156 | print reasonableDistances.collect() 157 | -------------------------------------------------------------------------------- /src/python/IntersectByKey.py: -------------------------------------------------------------------------------- 1 | """ 2 | >>> from pyspark.context import SparkContext 3 | >>> sc = SparkContext('local', 'test') 4 | >>> input = [("coffee", 1), ("pandas", 2), ("coffee", 3), ("very", 4)] 5 | >>> rdd1 = sc.parallelize(input) 6 | >>> rdd2 = sc.parallelize([("pandas", 20)]) 7 | >>> intserectByKey(rdd1, rdd2).collect() 8 | [('pandas', 2), ('pandas', 20)] 9 | """ 10 | 11 | import sys 12 | import itertools 13 | 14 | from pyspark import SparkContext 15 | 16 | 17 | def combineIfBothPresent(itrs): 18 | """Return an iterable of the elements from 19 | both itr1 and itr2 if there are elements in both itr1 and itr2 otherwise 20 | return an empty itrable""" 21 | iter1 = itrs[0].__iter__() 22 | iter2 = itrs[1].__iter__() 23 | try: 24 | e1 = iter1.next() 25 | e2 = iter2.next() 26 | return itertools.chain([e1], [e2], iter1, iter2) 27 | except StopIteration: 28 | return [] 29 | 30 | 31 | def intersectByKey(rdd1, rdd2): 32 | """Intersect two RDDs by key""" 33 | return rdd1.cogroup(rdd2).flatMapValues(combineIfBothPresent) 34 | 35 | if __name__ == "__main__": 36 | master = "local" 37 | if len(sys.argv) == 2: 38 | master = sys.argv[1] 39 | sc = SparkContext(master, "IntersectByKey") 40 | rdd1 = sc.parallelize( 41 | [("coffee", 1), ("pandas", 2), ("coffee", 3), ("very", 4)]) 42 | rdd2 = sc.parallelize([("pandas", 20), ("pandas", 21)]) 43 | print intersectByKey(rdd1, rdd2).collect() 44 | -------------------------------------------------------------------------------- /src/python/LoadCsv.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkContext 2 | import csv 3 | import sys 4 | import StringIO 5 | 6 | 7 | def loadRecord(line): 8 | """Parse a CSV line""" 9 | input = StringIO.StringIO(line) 10 | reader = csv.DictReader(input, fieldnames=["name", "favouriteAnimal"]) 11 | return reader.next() 12 | 13 | 14 | def loadRecords(fileNameContents): 15 | """Load all the records in a given file""" 16 | input = StringIO.StringIO(fileNameContents[1]) 17 | reader = csv.DictReader(input, fieldnames=["name", "favouriteAnimal"]) 18 | return reader 19 | 20 | 21 | def writeRecords(records): 22 | """Write out CSV lines""" 23 | output = StringIO.StringIO() 24 | writer = csv.DictWriter(output, fieldnames=["name", "favouriteAnimal"]) 25 | for record in records: 26 | writer.writerow(record) 27 | return [output.getvalue()] 28 | 29 | if __name__ == "__main__": 30 | if len(sys.argv) != 4: 31 | print "Error usage: LoadCsv [sparkmaster] [inputfile] [outputfile]" 32 | sys.exit(-1) 33 | master = sys.argv[1] 34 | inputFile = sys.argv[2] 35 | outputFile = sys.argv[3] 36 | sc = SparkContext(master, "LoadCsv") 37 | # Try the record-per-line-input 38 | input = sc.textFile(inputFile) 39 | data = input.map(loadRecord) 40 | pandaLovers = data.filter(lambda x: x['favouriteAnimal'] == "panda") 41 | pandaLovers.mapPartitions(writeRecords).saveAsTextFile(outputFile) 42 | # Try the more whole file input 43 | fullFileData = sc.wholeTextFiles(inputFile).flatMap(loadRecords) 44 | fullFilePandaLovers = 
fullFileData.filter( 45 | lambda x: x['favouriteAnimal'] == "panda") 46 | fullFilePandaLovers.mapPartitions( 47 | writeRecords).saveAsTextFile(outputFile + "fullfile") 48 | sc.stop() 49 | print "Done!" 50 | -------------------------------------------------------------------------------- /src/python/LoadHive.py: -------------------------------------------------------------------------------- 1 | # A simple hive demo. If you do not have a table to load from look run 2 | # MakeHiveTable.py 3 | from pyspark import SparkContext 4 | from pyspark.sql import HiveContext 5 | import json 6 | import sys 7 | 8 | if __name__ == "__main__": 9 | if len(sys.argv) != 3: 10 | print "Error usage: LoadHive [sparkmaster] [inputtable]" 11 | sys.exit(-1) 12 | master = sys.argv[1] 13 | inputTable = sys.argv[2] 14 | sc = SparkContext(master, "LoadHive") 15 | hiveCtx = HiveContext(sc) 16 | # Query hive 17 | input = hiveCtx.sql("FROM " + inputTable + " SELECT key, value") 18 | print "result of query" 19 | print input.collect() 20 | data = input.map(lambda x: x[0] * x[0]) 21 | result = data.collect() 22 | for element in result: 23 | print "Got data " + str(element) 24 | sc.stop() 25 | print "Done!" 26 | -------------------------------------------------------------------------------- /src/python/LoadJson.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkContext 2 | import json 3 | import sys 4 | 5 | if __name__ == "__main__": 6 | if len(sys.argv) != 4: 7 | print "Error usage: LoadJson [sparkmaster] [inputfile] [outputfile]" 8 | sys.exit(-1) 9 | master = sys.argv[1] 10 | inputFile = sys.argv[2] 11 | outputFile = sys.argv[3] 12 | sc = SparkContext(master, "LoadJson") 13 | input = sc.textFile(inputFile) 14 | data = input.map(lambda x: json.loads(x)) 15 | data.filter(lambda x: 'lovesPandas' in x and x['lovesPandas']).map( 16 | lambda x: json.dumps(x)).saveAsTextFile(outputFile) 17 | sc.stop() 18 | print "Done!" 19 | -------------------------------------------------------------------------------- /src/python/MLlib.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | from pyspark import SparkContext 19 | from pyspark.mllib.regression import LabeledPoint 20 | from pyspark.mllib.classification import LogisticRegressionWithSGD 21 | from pyspark.mllib.feature import HashingTF 22 | 23 | 24 | if __name__ == "__main__": 25 | sc = SparkContext(appName="PythonBookExample") 26 | 27 | # Load 2 types of emails from text files: spam and ham (non-spam). 28 | # Each line has text from one email. 
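    # (The HashingTF used below hashes each word into one of 100 buckets, so no
    # vocabulary needs to be built first, at the cost of occasional hash collisions
    # between different words.)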
29 | spam = sc.textFile("files/spam.txt") 30 | ham = sc.textFile("files/ham.txt") 31 | 32 | # Create a HashingTF instance to map email text to vectors of 100 features. 33 | tf = HashingTF(numFeatures = 100) 34 | # Each email is split into words, and each word is mapped to one feature. 35 | spamFeatures = spam.map(lambda email: tf.transform(email.split(" "))) 36 | hamFeatures = ham.map(lambda email: tf.transform(email.split(" "))) 37 | 38 | # Create LabeledPoint datasets for positive (spam) and negative (ham) examples. 39 | positiveExamples = spamFeatures.map(lambda features: LabeledPoint(1, features)) 40 | negativeExamples = hamFeatures.map(lambda features: LabeledPoint(0, features)) 41 | training_data = positiveExamples.union(negativeExamples) 42 | training_data.cache() # Cache data since Logistic Regression is an iterative algorithm. 43 | 44 | # Run Logistic Regression using the SGD optimizer. 45 | # regParam is model regularization, which can make models more robust. 46 | model = LogisticRegressionWithSGD.train(training_data) 47 | 48 | # Test on a positive example (spam) and a negative one (ham). 49 | # First apply the same HashingTF feature transformation used on the training data. 50 | posTestExample = tf.transform("O M G GET cheap stuff by sending money to ...".split(" ")) 51 | negTestExample = tf.transform("Hi Dad, I started studying Spark the other ...".split(" ")) 52 | 53 | # Now use the learned model to predict spam/ham for new emails. 54 | print "Prediction for positive test example: %g" % model.predict(posTestExample) 55 | print "Prediction for negative test example: %g" % model.predict(negTestExample) 56 | 57 | sc.stop() 58 | -------------------------------------------------------------------------------- /src/python/MakeHiveTable.py: -------------------------------------------------------------------------------- 1 | # Createas a hive table and loads an input file into it 2 | # For input you can use examples/src/main/resources/kv1.txt from the spark 3 | # distribution 4 | from pyspark import SparkContext 5 | from pyspark.sql import HiveContext 6 | import json 7 | import sys 8 | 9 | if __name__ == "__main__": 10 | if len(sys.argv) != 4: 11 | print "Error usage: LoadHive [sparkmaster] [inputFile] [inputtable]" 12 | sys.exit(-1) 13 | master = sys.argv[1] 14 | inputFile = sys.argv[2] 15 | inputTable = sys.argv[3] 16 | sc = SparkContext(master, "LoadHive") 17 | hiveCtx = HiveContext(sc) 18 | # Load some data into hive 19 | hiveCtx.sql( 20 | "CREATE TABLE IF NOT EXISTS " + 21 | inputTable + 22 | " (key INT, value STRING)") 23 | hiveCtx.sql( 24 | "LOAD DATA LOCAL INPATH '" + inputFile + "' INTO TABLE " + inputTable) 25 | -------------------------------------------------------------------------------- /src/python/MakeParquetFile.py: -------------------------------------------------------------------------------- 1 | # Createas a parquet file and loads an input file into it 2 | # For input you can use files/favourite_animal.csv as the iput 3 | from pyspark import SparkContext 4 | from pyspark.sql import SQLContext 5 | import json 6 | import sys 7 | 8 | if __name__ == "__main__": 9 | if len(sys.argv) != 4: 10 | print "Error usage: LoadHive [sparkmaster] [inputFile] [parquetfile]" 11 | sys.exit(-1) 12 | master = sys.argv[1] 13 | inputFile = sys.argv[2] 14 | parquetFile = sys.argv[3] 15 | sc = SparkContext(master, "MakeParquetFile") 16 | sqlCtx = SQLContext(sc) 17 | # Load some data into an RDD 18 | rdd = sc.textFile(inputFile).map(lambda l: l.split(",")) 19 | namedRdd = rdd.map(lambda 
r: {"name": r[0], "favouriteAnimal": r[1]}) 20 | schemaNamedRdd = sqlCtx.inferSchema(namedRdd) 21 | # Save it 22 | schemaNamedRdd.saveAsParquetFile(parquetFile) 23 | -------------------------------------------------------------------------------- /src/python/PerKeyAvg.py: -------------------------------------------------------------------------------- 1 | """ 2 | >>> from pyspark.context import SparkContext 3 | >>> sc = SparkContext('local', 'test') 4 | >>> input = [("coffee", 1), ("pandas", 2), ("coffee", 3), ("very", 4)] 5 | >>> b = sc.parallelize(input) 6 | >>> perKeyAvg(b) 7 | 8 | """ 9 | 10 | import sys 11 | 12 | from pyspark import SparkContext 13 | 14 | 15 | def perKeyAvg(nums): 16 | """Compute the avg""" 17 | sumCount = nums.combineByKey((lambda x: (x, 1)), 18 | (lambda x, y: (x[0] + y, x[1] + 1)), 19 | (lambda x, y: (x[0] + y[0], x[1] + y[1]))) 20 | return sumCount.collectAsMap() 21 | 22 | if __name__ == "__main__": 23 | master = "local" 24 | if len(sys.argv) == 2: 25 | master = sys.argv[1] 26 | sc = SparkContext(master, "Sum") 27 | nums = sc.parallelize( 28 | [("coffee", 1), ("pandas", 2), ("coffee", 3), ("very", 4)]) 29 | avg = perKeyAvg(nums) 30 | print avg 31 | -------------------------------------------------------------------------------- /src/python/QueryParquetFile.py: -------------------------------------------------------------------------------- 1 | # Finds the names of people who like pandas from a parquet file 2 | # consisting of name & favouriteAnimal. 3 | # For input you can use the result of MakeParquetFile 4 | from pyspark import SparkContext 5 | from pyspark.sql import SQLContext 6 | import json 7 | import sys 8 | 9 | if __name__ == "__main__": 10 | if len(sys.argv) != 3: 11 | print "Error usage: QueryParquetFile [sparkmaster] [parquetfile]" 12 | sys.exit(-1) 13 | master = sys.argv[1] 14 | parquetFile = sys.argv[2] 15 | sc = SparkContext(master, "QueryParquetFile") 16 | sqlCtx = SQLContext(sc) 17 | # Load some data in from a Parquet file of name & favouriteAnimal 18 | rows = sqlCtx.parquetFile(parquetFile) 19 | names = rows.map(lambda row: row.name) 20 | print "Everyone" 21 | print names.collect() 22 | # Find the panda lovers 23 | tbl = rows.registerAsTable("people") 24 | pandaFriends = sqlCtx.sql("SELECT name FROM people WHERE "+ 25 | "favouriteAnimal = \"panda\"") 26 | print "Panda Friends" 27 | print pandaFriends.map(lambda row: row.name).collect() 28 | -------------------------------------------------------------------------------- /src/python/QueryParuetFile.py: -------------------------------------------------------------------------------- 1 | # Finds the names of people who like pandas from a parquet file 2 | # consisting of name & favouriteAnimal. 
3 | # For input you can use the result of MakeParquetFile 4 | from pyspark import SparkContext 5 | from pyspark.sql import SQLContext 6 | import json 7 | import sys 8 | 9 | if __name__ == "__main__": 10 | if len(sys.argv) != 4: 11 | print "Error usage: QueryParquetFile [sparkmaster] [parquetfile]" 12 | sys.exit(-1) 13 | master = sys.argv[1] 14 | inputFile = sys.argv[2] 15 | parquetFile = sys.argv[3] 16 | sc = SparkContext(master, "MakeParquetFile") 17 | sqlCtx = SQLContext(sc) 18 | # Load some data into an RDD 19 | rdd = sc.textFile(inputFile).map(lambda l: l.split(",")) 20 | namedRdd = rdd.map(lambda r: {"name": r[0], "favouriteAnimal": r[1]}) 21 | schemaNamedRdd = sqlCtx.inferSchema(namedRdd) 22 | # Save it 23 | schemaNamedRdd.saveAsParquetFile(parquetFile) 24 | -------------------------------------------------------------------------------- /src/python/RemoveOutliers.py: -------------------------------------------------------------------------------- 1 | """ 2 | >>> from pyspark.context import SparkContext 3 | >>> sc = SparkContext('local', 'test') 4 | >>> b = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1000]) 5 | >>> sorted(removeOutliers(b).collect() 6 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] 7 | """ 8 | 9 | import sys 10 | import math 11 | 12 | from pyspark import SparkContext 13 | 14 | 15 | def removeOutliers(nums): 16 | """Remove the outliers""" 17 | stats = nums.stats() 18 | stddev = math.sqrt(stats.variance()) 19 | return nums.filter(lambda x: math.fabs(x - stats.mean()) < 3 * stddev) 20 | 21 | if __name__ == "__main__": 22 | master = "local" 23 | if len(sys.argv) == 2: 24 | master = sys.argv[1] 25 | sc = SparkContext(master, "Sum") 26 | nums = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1000]) 27 | output = sorted(removeOutliers(nums).collect()) 28 | for num in output: 29 | print "%i " % (num) 30 | -------------------------------------------------------------------------------- /src/python/SparkSQLTwitter.py: -------------------------------------------------------------------------------- 1 | # A simple demo for working with SparkSQL and Tweets 2 | from pyspark import SparkContext, SparkConf 3 | from pyspark.sql import HiveContext, Row 4 | from pyspark.sql.types import IntegerType 5 | import json 6 | import sys 7 | 8 | if __name__ == "__main__": 9 | inputFile = sys.argv[1] 10 | conf = SparkConf().setAppName("SparkSQLTwitter") 11 | sc = SparkContext() 12 | hiveCtx = HiveContext(sc) 13 | print "Loading tweets from " + inputFile 14 | input = hiveCtx.jsonFile(inputFile) 15 | input.registerTempTable("tweets") 16 | topTweets = hiveCtx.sql("SELECT text, retweetCount FROM tweets ORDER BY retweetCount LIMIT 10") 17 | print topTweets.collect() 18 | topTweetText = topTweets.map(lambda row : row.text) 19 | print topTweetText.collect() 20 | # Make a happy person row 21 | happyPeopleRDD = sc.parallelize([Row(name="holden", favouriteBeverage="coffee")]) 22 | happyPeopleSchemaRDD = hiveCtx.inferSchema(happyPeopleRDD) 23 | happyPeopleSchemaRDD.registerTempTable("happy_people") 24 | # Make a UDF to tell us how long some text is 25 | hiveCtx.registerFunction("strLenPython", lambda x: len(x), IntegerType()) 26 | lengthSchemaRDD = hiveCtx.sql("SELECT strLenPython('text') FROM tweets LIMIT 10") 27 | print lengthSchemaRDD.collect() 28 | sc.stop() 29 | -------------------------------------------------------------------------------- /src/python/WordCount.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from pyspark import SparkContext 4 | 5 | if 
__name__ == "__main__": 6 | master = "local" 7 | if len(sys.argv) == 2: 8 | master = sys.argv[1] 9 | sc = SparkContext(master, "WordCount") 10 | lines = sc.parallelize(["pandas", "i like pandas"]) 11 | result = lines.flatMap(lambda x: x.split(" ")).countByValue() 12 | for key, value in result.iteritems(): 13 | print "%s %i" % (key, value) 14 | --------------------------------------------------------------------------------