├── .gitignore ├── .travis.yml ├── DESCRIPTION ├── LICENSE.md ├── README.md ├── bin ├── datagen.cmd ├── fakelogs.cmd └── fakelogs.sh ├── build-project ├── build.sbt ├── files ├── call_signs2.txt ├── callsign_tbl ├── callsign_tbl_sorted ├── callsigns ├── cqlsh_setup ├── fake_logs │ ├── log1.log │ └── log2.log ├── favourite_animals.csv ├── flumeconf.cfg ├── ham.txt ├── happypandas ├── int_string.csv ├── pandainfo.json ├── spam.txt └── testweet.json ├── mini-complete-example ├── README.md ├── build.sbt ├── pom.xml ├── project │ └── plugins.sbt ├── sbt │ └── sbt └── src │ └── main │ ├── java │ └── com │ │ └── oreilly │ │ └── learningsparkexamples │ │ └── mini │ │ └── java │ │ ├── BasicMap.java │ │ └── WordCount.java │ └── scala │ └── com │ └── oreilly │ └── learningsparkexamples │ └── mini │ └── scala │ ├── BasicMap.scala │ └── WordCount.scala ├── pom.xml ├── project └── plugins.sbt ├── run-all-examples ├── sbt └── sbt ├── setup-project └── src ├── R └── finddistance.R ├── main ├── java │ └── com │ │ └── oreilly │ │ └── learningsparkexamples │ │ └── java │ │ ├── BasicAvg.java │ │ ├── BasicAvgMapPartitions.java │ │ ├── BasicAvgWithKryo.java │ │ ├── BasicFlatMap.java │ │ ├── BasicJoinCsv.java │ │ ├── BasicLoadJson.java │ │ ├── BasicLoadSequenceFile.java │ │ ├── BasicLoadWholeCsv.java │ │ ├── BasicMap.java │ │ ├── BasicMapPartitions.java │ │ ├── BasicMapThenFilter.java │ │ ├── BasicMapToDouble.java │ │ ├── BasicQueryCassandra.java │ │ ├── BasicSaveSequenceFile.java │ │ ├── BasicSum.java │ │ ├── CallLog.java │ │ ├── ChapterSixExample.java │ │ ├── HappyPerson.java │ │ ├── IntersectByKey.java │ │ ├── KafkaInput.java │ │ ├── KeyValueMapFilter.java │ │ ├── LoadHive.java │ │ ├── LoadJsonWithSparkSQL.java │ │ ├── MLlib.java │ │ ├── PerKeyAvg.java │ │ ├── RemoveOutliers.java │ │ ├── SparkSQLTwitter.java │ │ ├── StreamingLogInput.java │ │ ├── WordCount.java │ │ └── logs │ │ ├── ApacheAccessLog.java │ │ ├── Flags.java │ │ ├── Functions.java │ │ ├── LogAnalyzerAppMain.java │ │ ├── LogAnalyzerTotal.java │ │ ├── LogAnalyzerWindowed.java │ │ ├── LogStatistics.java │ │ ├── ReadTransferStats.java │ │ └── Renderer.java ├── protobuf │ ├── address_book.proto │ └── places.proto └── scala │ └── com │ └── oreilly │ └── learningsparkexamples │ └── scala │ ├── BasicAvg.scala │ ├── BasicAvgFromFile.scala │ ├── BasicAvgFromFiles.scala │ ├── BasicAvgMapPartitions.scala │ ├── BasicAvgWithKryo.scala │ ├── BasicFilterUnionCombo.scala │ ├── BasicIntersectByKey.scala │ ├── BasicLoadNums.scala │ ├── BasicLoadSequenceFile.scala │ ├── BasicLoadTextFromFTP.scala │ ├── BasicMap.scala │ ├── BasicMapNoCache.scala │ ├── BasicMapPartitions.scala │ ├── BasicMapThenFilter.scala │ ├── BasicParseCsv.scala │ ├── BasicParseJson.scala │ ├── BasicParseJsonWithJackson.scala │ ├── BasicParseWholeFileCsv.scala │ ├── BasicQueryCassandra.scala │ ├── BasicSaveProtoBuf.scala │ ├── BasicSaveSequenceFile.scala │ ├── BasicStreamingExample.scala │ ├── BasicSum.scala │ ├── ChapterSixExample.scala │ ├── FlumeInput.scala │ ├── KafkaInput.scala │ ├── LoadHive.scala │ ├── LoadJsonWithElephantBird.scala │ ├── LoadJsonWithSparkSQL.scala │ ├── LoadKeyValueTextInput.scala │ ├── LoadSimpleJdbc.scala │ ├── MLlib.scala │ ├── MLlibPipeline.disabled_until_111 │ ├── PerKeyAvg.scala │ ├── PipeExample.scala │ ├── RemoveOutliers.scala │ ├── SparkSQLTwitter.scala │ ├── StreamingLogInput.scala │ ├── WordCount.scala │ ├── WriteSimpleDB.scala │ └── logs │ ├── LogAnalyzerAppMain.scala │ ├── LogAnalyzerTotal.scala │ ├── LogAnalyzerWindowed.scala │ └── ReadTransferStats.scala ├── 
perl └── splitwords.pl └── python ├── AvgMapPartitions.py ├── BasicAvg.py ├── BasicFilterMap.py ├── BasicKeyValueMapFilter.py ├── BasicMap.py ├── BasicMapPartitions.py ├── BasicSum.py ├── ChapterSixExample.py ├── IntersectByKey.py ├── LoadCsv.py ├── LoadHive.py ├── LoadJson.py ├── MLlib.py ├── MakeHiveTable.py ├── MakeParquetFile.py ├── PerKeyAvg.py ├── QueryParquetFile.py ├── QueryParuetFile.py ├── RemoveOutliers.py ├── SparkSQLTwitter.py └── WordCount.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | 4 | # sbt specific 5 | dist/* 6 | target/ 7 | lib_managed/ 8 | src_managed/ 9 | project/boot/ 10 | project/plugins/project/ 11 | sbt/*.jar 12 | mini-complete-example/sbt/*.jar 13 | 14 | # Scala-IDE specific 15 | .scala_dependencies 16 | 17 | #Emacs 18 | *~ 19 | 20 | #ignore the metastore 21 | metastore_db/* -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: scala 2 | scala: 3 | - 2.10.4 4 | # Install R 5 | before_install: 6 | - curl -OL http://raw.github.com/craigcitro/r-travis/master/scripts/travis-tool.sh 7 | - chmod 755 ./travis-tool.sh 8 | - ./travis-tool.sh bootstrap 9 | install: 10 | - ./travis-tool.sh install_deps 11 | before_script: 12 | - ./setup-project 13 | script: 14 | - ./build-project -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: learning-spark-examples 2 | Version: 0.1 3 | Depends: Imap 4 | License: MIT License 5 | Description: Examples for the learning spark book. 6 | Title: Examples for the learning spark book. 7 | Author@R: c(person("Holden Karau", role = c("aut", "cre"), email="holden@pigscanfly.ca")) -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright (C) 2014 Holden Karau and respective authors. The learning spark examples are licensed under the [MIT license](http://opensource.org/licenses/MIT). 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![buildstatus](https://travis-ci.org/holdenk/learning-spark-examples.svg?branch=master)](https://travis-ci.org/holdenk/learning-spark-examples) 2 | Examples for Learning Spark 3 | =============== 4 | Examples for the Learning Spark book. These examples require a number of libraries and as such have long build files. We have also added a standalone example with minimal dependencies and a small build file 5 | in the mini-complete-example directory. 6 | 7 | 8 | These examples have been updated to run against Spark 1.3, so they may 9 | be slightly different from the versions in your copy of "Learning Spark". 10 | 11 | Requirements 12 | == 13 | * JDK 1.7 or higher 14 | * Scala 2.10.3 15 | - scala-lang.org 16 | * Spark 1.3 17 | * Protobuf compiler 18 | - On Debian you can install it with sudo apt-get install protobuf-compiler 19 | * R & the CRAN package Imap are required for the ChapterSixExample 20 | * The Python examples require urllib3 21 | 22 | Python examples 23 | === 24 | 25 | From Spark, just run ./bin/pyspark ./src/python/[example] 26 | 27 | Spark Submit 28 | === 29 | 30 | You can also create an assembly jar with all of the dependencies for running either the Java or Scala 31 | versions of the code and run the job with the spark-submit script. 32 | 33 | `./sbt/sbt assembly` OR `mvn package` 34 | 35 | `cd $SPARK_HOME; ./bin/spark-submit --class com.oreilly.learningsparkexamples.[lang].[example] ../learning-spark-examples/target/scala-2.10/learning-spark-examples-assembly-0.0.1.jar` 36 | 37 | [![Learning Spark](http://akamaicovers.oreilly.com/images/0636920028512/cat.gif)](http://www.jdoqocy.com/click-7645222-11260198?url=http%3A%2F%2Fshop.oreilly.com%2Fproduct%2F0636920028512.do%3Fcmp%3Daf-strata-books-videos-product_cj_9781449358600_%2525zp&cjsku=0636920028512) 38 | 
-------------------------------------------------------------------------------- /bin/datagen.cmd: -------------------------------------------------------------------------------- 1 | @echo off 2 | echo 66.249.69.97 - - [24/Sep/2014:22:25:44 +0000] "GET /071300/242153 HTTP/1.1" 404 514 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" 3 | ping -n 5 localhost > null 4 | echo 71.19.157.174 - - [24/Sep/2014:22:26:12 +0000] "GET /error HTTP/1.1" 404 505 "-" "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.94 Safari/537.36" 5 | ping -n 5 localhost > null 6 | echo 71.19.157.174 - - [24/Sep/2014:22:26:37 +0000] "GET / HTTP/1.1" 200 18785 "-" "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.94 Safari/537.36" 
-------------------------------------------------------------------------------- /bin/fakelogs.cmd: -------------------------------------------------------------------------------- 1 | ncat -l 7777 -k -c datagen.cmd 
-------------------------------------------------------------------------------- /bin/fakelogs.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | rm /tmp/logdata 3 | touch /tmp/logdata 4 | tail -f /tmp/logdata | nc -lk 7777 & 5 | TAIL_NC_PID=$!
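# (The remaining lines replay the sample logs into /tmp/logdata, pausing between batches, so the streaming examples listening on port 7777 receive data.)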
6 | cat ./files/fake_logs/log1.log >> /tmp/logdata 7 | sleep 5 8 | cat ./files/fake_logs/log2.log >> /tmp/logdata 9 | sleep 1 10 | cat ./files/fake_logs/log1.log >> /tmp/logdata 11 | sleep 2 12 | cat ./files/fake_logs/log1.log >> /tmp/logdata 13 | sleep 3 14 | sleep 20 15 | kill $TAIL_NC_PID 16 | -------------------------------------------------------------------------------- /build-project: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | set -e 3 | set -x 4 | # Do our mini example first 5 | cd mini-complete-example 6 | ./sbt/sbt clean compile package 7 | ./sbt/sbt clean 8 | echo $PWD && mvn clean && mvn compile 9 | cd .. 10 | # Run the tests 11 | export SPARK_HOME=./spark-1.3.1-bin-hadoop1/ 12 | ./sbt/sbt compile package assembly 13 | echo $? 14 | time ./run-all-examples 15 | echo $? 16 | echo "done" 17 | # Try and build with maven, skip for now 18 | #mvn clean && mvn compile && mvn package 19 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | import AssemblyKeys._ 2 | 3 | assemblySettings 4 | 5 | name := "learning-spark-examples" 6 | 7 | version := "0.0.1" 8 | 9 | scalaVersion := "2.10.4" 10 | 11 | javacOptions ++= Seq("-source", "1.7", "-target", "1.7") 12 | 13 | // protocol buffer support 14 | seq(sbtprotobuf.ProtobufPlugin.protobufSettings: _*) 15 | 16 | // additional libraries 17 | libraryDependencies ++= Seq( 18 | "org.apache.spark" %% "spark-core" % "1.3.1" % "provided", 19 | "org.apache.spark" %% "spark-sql" % "1.3.1", 20 | "org.apache.spark" %% "spark-hive" % "1.3.1", 21 | "org.apache.spark" %% "spark-streaming" % "1.3.1", 22 | "org.apache.spark" %% "spark-streaming-kafka" % "1.3.1", 23 | "org.apache.spark" %% "spark-streaming-flume" % "1.3.1", 24 | "org.apache.spark" %% "spark-mllib" % "1.3.1", 25 | "org.apache.commons" % "commons-lang3" % "3.0", 26 | "org.eclipse.jetty" % "jetty-client" % "8.1.14.v20131031", 27 | "com.typesafe.play" % "play-json_2.10" % "2.2.1", 28 | "com.fasterxml.jackson.core" % "jackson-databind" % "2.3.3", 29 | "com.fasterxml.jackson.module" % "jackson-module-scala_2.10" % "2.3.3", 30 | "org.elasticsearch" % "elasticsearch-hadoop-mr" % "2.0.0.RC1", 31 | "net.sf.opencsv" % "opencsv" % "2.0", 32 | "com.twitter.elephantbird" % "elephant-bird" % "4.5", 33 | "com.twitter.elephantbird" % "elephant-bird-core" % "4.5", 34 | "com.hadoop.gplcompression" % "hadoop-lzo" % "0.4.17", 35 | "mysql" % "mysql-connector-java" % "5.1.31", 36 | "com.datastax.spark" %% "spark-cassandra-connector" % "1.0.0-rc5", 37 | "com.datastax.spark" %% "spark-cassandra-connector-java" % "1.0.0-rc5", 38 | "com.github.scopt" %% "scopt" % "3.2.0", 39 | "org.scalatest" %% "scalatest" % "2.2.1" % "test", 40 | "com.holdenkarau" %% "spark-testing-base" % "0.0.1" % "test" 41 | ) 42 | 43 | resolvers ++= Seq( 44 | "JBoss Repository" at "http://repository.jboss.org/nexus/content/repositories/releases/", 45 | "Spray Repository" at "http://repo.spray.cc/", 46 | "Cloudera Repository" at "https://repository.cloudera.com/artifactory/cloudera-repos/", 47 | "Akka Repository" at "http://repo.akka.io/releases/", 48 | "Twitter4J Repository" at "http://twitter4j.org/maven2/", 49 | "Apache HBase" at "https://repository.apache.org/content/repositories/releases", 50 | "Twitter Maven Repo" at "http://maven.twttr.com/", 51 | "scala-tools" at "https://oss.sonatype.org/content/groups/scala-tools", 52 | "Typesafe repository" at 
"http://repo.typesafe.com/typesafe/releases/", 53 | "Second Typesafe repo" at "http://repo.typesafe.com/typesafe/maven-releases/", 54 | "Mesosphere Public Repository" at "http://downloads.mesosphere.io/maven", 55 | Resolver.sonatypeRepo("public") 56 | ) 57 | 58 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { (old) => 59 | { 60 | case m if m.toLowerCase.endsWith("manifest.mf") => MergeStrategy.discard 61 | case m if m.startsWith("META-INF") => MergeStrategy.discard 62 | case PathList("javax", "servlet", xs @ _*) => MergeStrategy.first 63 | case PathList("org", "apache", xs @ _*) => MergeStrategy.first 64 | case PathList("org", "jboss", xs @ _*) => MergeStrategy.first 65 | case "about.html" => MergeStrategy.rename 66 | case "reference.conf" => MergeStrategy.concat 67 | case _ => MergeStrategy.first 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /files/call_signs2.txt: -------------------------------------------------------------------------------- 1 | KK6JKQ 2 | -------------------------------------------------------------------------------- /files/callsigns: -------------------------------------------------------------------------------- 1 | W8PAL 2 | KK6JKQ 3 | W6BB 4 | VE3UOW 5 | VE2CUA 6 | VE2UN 7 | OH2TI 8 | GB1MIR 9 | K2AMH 10 | UA1LO 11 | N7ICE 12 | -------------------------------------------------------------------------------- /files/cqlsh_setup: -------------------------------------------------------------------------------- 1 | DROP KEYSPACE IF EXISTS test; 2 | CREATE KEYSPACE test WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1 }; 3 | CREATE TABLE test.kv(key text PRIMARY KEY, value int); 4 | INSERT INTO test.kv(key, value) VALUES ('panda', 1); 5 | INSERT INTO test.kv(key, value) VALUES ('notpanda', 0); -------------------------------------------------------------------------------- /files/fake_logs/log1.log: -------------------------------------------------------------------------------- 1 | 66.249.69.97 - - [24/Sep/2014:22:25:44 +0000] "GET /071300/242153 HTTP/1.1" 404 514 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" 2 | 71.19.157.174 - - [24/Sep/2014:22:26:12 +0000] "GET /error HTTP/1.1" 404 505 "-" "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.94 Safari/537.36" 3 | 71.19.157.174 - - [24/Sep/2014:22:26:12 +0000] "GET /favicon.ico HTTP/1.1" 200 1713 "-" "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.94 Safari/537.36" 4 | 71.19.157.174 - - [24/Sep/2014:22:26:37 +0000] "GET / HTTP/1.1" 200 18785 "-" "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.94 Safari/537.36" 5 | 71.19.157.174 - - [24/Sep/2014:22:26:37 +0000] "GET /jobmineimg.php?q=m HTTP/1.1" 200 222 "http://www.holdenkarau.com/" "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.94 Safari/537.36" 6 | -------------------------------------------------------------------------------- /files/fake_logs/log2.log: -------------------------------------------------------------------------------- 1 | 71.19.157.174 - - [24/Sep/2014:22:26:12 +0000] "GET /error78978 HTTP/1.1" 404 505 "-" "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.94 Safari/537.36" 2 | -------------------------------------------------------------------------------- /files/favourite_animals.csv: 
-------------------------------------------------------------------------------- 1 | holden,panda 2 | notholden,notpanda 3 | spark,bear -------------------------------------------------------------------------------- /files/flumeconf.cfg: -------------------------------------------------------------------------------- 1 | # Name the components on this agent 2 | panda.sources = r1 3 | panda.sinks = avroSink 4 | panda.channels = c1 5 | 6 | # avro sink 7 | panda.sinks = avroSink 8 | panda.sinks.avroSink.type = avro 9 | panda.sinks.avroSink.channel = memoryChannel 10 | panda.sinks.avroSink.hostname = localhost 11 | panda.sinks.avroSink.port = 7788 12 | 13 | # input 14 | panda.sources.r1.type = netcat 15 | panda.sources.r1.bind = localhost 16 | panda.sources.r1.port = 44444 17 | 18 | # Use a channel which buffers events in memory 19 | panda.channels.c1.type = memory 20 | panda.channels.c1.capacity = 1000 21 | panda.channels.c1.transactionCapacity = 100 22 | 23 | # Bind the source and sink to the channel 24 | panda.sources.r1.channels = c1 25 | panda.sinks.avroSink.channel = c1 -------------------------------------------------------------------------------- /files/ham.txt: -------------------------------------------------------------------------------- 1 | Dear Spark Learner, Thanks so much for attending the Spark Summit 2014! Check out videos of talks from the summit at ... 2 | Hi Mom, Apologies for being late about emailing and forgetting to send you the package. I hope you and bro have been ... 3 | Wow, hey Fred, just heard about the Spark petabyte sort. I think we need to take time to try it out immediately ... 4 | Hi Spark user list, This is my first question to this list, so thanks in advance for your help! I tried running ... 5 | Thanks Tom for your email. I need to refer you to Alice for this one. I haven't yet figured out that part either ... 6 | Good job yesterday! I was attending your talk, and really enjoyed it. I want to try out GraphX ... 7 | Summit demo got whoops from audience! Had to let you know. --Joe 8 | -------------------------------------------------------------------------------- /files/happypandas: -------------------------------------------------------------------------------- 1 | coffee 1 2 | coffee 2 3 | pandas 3 4 | happy 4 -------------------------------------------------------------------------------- /files/int_string.csv: -------------------------------------------------------------------------------- 1 | 1panda 2 | 2pandas 3 | 3pandas -------------------------------------------------------------------------------- /files/pandainfo.json: -------------------------------------------------------------------------------- 1 | {"name":"Sparky The Bear", "lovesPandas":true} 2 | {"name": "Holden"} 3 | {"name":"Sparky The Bear", "lovesPandas":true, "knows":{"friends": ["holden"]}} 4 | -------------------------------------------------------------------------------- /files/spam.txt: -------------------------------------------------------------------------------- 1 | Dear sir, I am a Prince in a far kingdom you have not heard of. I want to send you money via wire transfer so please ... 2 | Get Viagra real cheap! Send money right away to ... 3 | Oh my gosh you can be really strong too with these drugs found in the rainforest. Get them cheap right now ... 4 | YOUR COMPUTER HAS BEEN INFECTED! YOU MUST RESET YOUR PASSWORD. Reply to this email with your password and SSN ... 5 | THIS IS NOT A SCAM! Send money and get access to awesome stuff really cheap and never have to ... 
6 | -------------------------------------------------------------------------------- /files/testweet.json: -------------------------------------------------------------------------------- 1 | {"createdAt":"Nov 4, 2014 4:56:59 PM","id":529799371026485248,"text":"Adventures With Coffee, Code, and Writing.","source":"\u003ca href\u003d\"http://twitter.com\" rel\u003d\"nofollow\"\u003eTwitter Web Client\u003c/a\u003e","isTruncated":false,"inReplyToStatusId":-1,"inReplyToUserId":-1,"isFavorited":false,"retweetCount":0,"isPossiblySensitive":false,"contributorsIDs":[],"userMentionEntities":[],"urlEntities":[],"hashtagEntities":[],"mediaEntities":[],"currentUserRetweetId":-1,"user":{"id":15594928,"name":"Holden Karau","screenName":"holdenkarau","location":"","description":"","descriptionURLEntities":[],"isContributorsEnabled":false,"profileImageUrl":"http://pbs.twimg.com/profile_images/3005696115/2036374bbadbed85249cdd50aac6e170_normal.jpeg","profileImageUrlHttps":"https://pbs.twimg.com/profile_images/3005696115/2036374bbadbed85249cdd50aac6e170_normal.jpeg","isProtected":false,"followersCount":1231,"profileBackgroundColor":"C0DEED","profileTextColor":"333333","profileLinkColor":"0084B4","profileSidebarFillColor":"DDEEF6","profileSidebarBorderColor":"FFFFFF","profileUseBackgroundImage":true,"showAllInlineMedia":false,"friendsCount":600,"createdAt":"Aug 5, 2011 9:42:44 AM","favouritesCount":1095,"utcOffset":-3,"profileBackgroundImageUrl":"","profileBackgroundImageUrlHttps":"","profileBannerImageUrl":"","profileBackgroundTiled":true,"lang":"en","statusesCount":6234,"isGeoEnabled":true,"isVerified":false,"translator":false,"listedCount":0,"isFollowRequestSent":false}} 2 | -------------------------------------------------------------------------------- /mini-complete-example/README.md: -------------------------------------------------------------------------------- 1 | Mini Examples for Spark 2 | =============== 3 | This directory contains a complete stand alone example with both Maven and SBT build tools. 
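
For reference, a typical way to build this example and submit the Scala word count with `spark-submit` might look like the following. This is only a sketch: it assumes a local Spark 1.x installation at `$SPARK_HOME`, and the exact jar name under `target/` depends on your sbt/Scala versions and may differ.

```sh
# Build with the bundled sbt launcher (or use `mvn clean package` for the Maven build)
./sbt/sbt clean package

# Submit the Scala word count locally; arguments are <input file> <output directory>
$SPARK_HOME/bin/spark-submit \
  --master local \
  --class com.oreilly.learningsparkexamples.mini.scala.WordCount \
  ./target/scala-2.10/learning-spark-mini-example_2.10-0.0.1.jar \
  ./README.md ./wordcounts
```

The output directory passed as the second argument must not already exist, since `saveAsTextFile` will not overwrite it.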
4 | -------------------------------------------------------------------------------- /mini-complete-example/build.sbt: -------------------------------------------------------------------------------- 1 | name := "learning-spark-mini-example" 2 | 3 | version := "0.0.1" 4 | 5 | scalaVersion := "2.10.4" 6 | 7 | // additional libraries 8 | libraryDependencies ++= Seq( 9 | "org.apache.spark" %% "spark-core" % "1.1.0" % "provided" 10 | ) 11 | 
-------------------------------------------------------------------------------- /mini-complete-example/pom.xml: -------------------------------------------------------------------------------- 1 | <project> 2 | <groupId>com.oreilly.learningsparkexamples.mini</groupId> 3 | <artifactId>learning-spark-mini-example</artifactId> 4 | <modelVersion>4.0.0</modelVersion> 5 | <name>example</name> 6 | <packaging>jar</packaging> 7 | <version>0.0.1</version> 8 | <dependencies> 9 | <dependency> 10 | <groupId>org.apache.spark</groupId> 11 | <artifactId>spark-core_2.10</artifactId> 12 | <version>1.1.0</version> 13 | <scope>provided</scope> 14 | </dependency> 15 | </dependencies> 16 | <properties> 17 | <java.version>1.6</java.version> 18 | </properties> 19 | <build> 20 | <pluginManagement> 21 | <plugins> 22 | <plugin> 23 | <groupId>org.apache.maven.plugins</groupId> 24 | <artifactId>maven-compiler-plugin</artifactId> 25 | <version>3.1</version> 26 | <configuration> 27 | <source>${java.version}</source> 28 | <target>${java.version}</target> 29 | </configuration> 30 | </plugin> 31 | </plugins> 32 | </pluginManagement> 33 | </build> 34 | </project> 35 | 
-------------------------------------------------------------------------------- /mini-complete-example/project/plugins.sbt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/holdenk/learning-spark-examples/6862949df6c29c149ffcbedfd5948fe2ab5e2619/mini-complete-example/project/plugins.sbt 
-------------------------------------------------------------------------------- /mini-complete-example/sbt/sbt: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements.  See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | # This script launches sbt for this project. If present it uses the system 21 | # version of sbt. If there is no system version of sbt it attempts to download 22 | # sbt locally. 23 | SBT_VERSION=0.13.7 24 | URL1=http://typesafe.artifactoryonline.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar 25 | URL2=http://repo.typesafe.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar 26 | JAR=sbt/sbt-launch-${SBT_VERSION}.jar 27 | 28 | # Download sbt launch jar if it hasn't been downloaded yet 29 | if [ !
-f ${JAR} ]; then 30 | # Download 31 | printf "Attempting to fetch sbt\n" 32 | JAR_DL=${JAR}.part 33 | if hash wget 2>/dev/null; then 34 | (wget --progress=bar ${URL1} -O ${JAR_DL} || wget --progress=bar ${URL2} -O ${JAR_DL}) && mv ${JAR_DL} ${JAR} 35 | elif hash curl 2>/dev/null; then 36 | (curl -L --progress=bar ${URL1} -O ${JAR_DL} || curl -L --progress=bar ${URL2} -O ${JAR_DL}) && mv ${JAR_DL} ${JAR} 37 | else 38 | printf "You do not have curl or wget installed, please install sbt manually from http://www.scala-sbt.org/\n" 39 | exit -1 40 | fi 41 | fi 42 | if [ ! -f ${JAR} ]; then 43 | # We failed to download 44 | printf "Our attempt to download sbt locally to ${JAR} failed. Please install sbt manually from http://www.scala-sbt.org/\n" 45 | exit -1 46 | fi 47 | printf "Launching sbt from ${JAR}\n" 48 | java \ 49 | -Xmx1400m -XX:MaxPermSize=1024m -XX:ReservedCodeCacheSize=256m \ 50 | -jar ${JAR} \ 51 | "$@" 52 | 
-------------------------------------------------------------------------------- /mini-complete-example/src/main/java/com/oreilly/learningsparkexamples/mini/java/BasicMap.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple map in Java 3 | */ 4 | package com.oreilly.learningsparkexamples.mini.java; 5 | 6 | import java.util.Arrays; 7 | import java.util.List; 8 | 9 | import org.apache.commons.lang.StringUtils; 10 | 11 | import org.apache.spark.api.java.JavaRDD; 12 | import org.apache.spark.api.java.JavaSparkContext; 13 | import org.apache.spark.api.java.function.Function; 14 | 15 | public class BasicMap { 16 | public static void main(String[] args) throws Exception { 17 | String master; 18 | if (args.length > 0) { 19 | master = args[0]; 20 | } else { 21 | master = "local"; 22 | } 23 | JavaSparkContext sc = new JavaSparkContext( 24 | master, "basicmap", System.getenv("SPARK_HOME"), System.getenv("JARS")); 25 | JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4)); 26 | JavaRDD<Integer> result = rdd.map( 27 | new Function<Integer, Integer>() { public Integer call(Integer x) { return x*x;}}); 28 | System.out.println(StringUtils.join(result.collect(), ",")); 29 | } 30 | } 31 | 
-------------------------------------------------------------------------------- /mini-complete-example/src/main/java/com/oreilly/learningsparkexamples/mini/java/WordCount.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a wordcount in Java 3 | */ 4 | package com.oreilly.learningsparkexamples.mini.java; 5 | 6 | import java.util.Arrays; 7 | import java.util.List; 8 | import java.lang.Iterable; 9 | 10 | import scala.Tuple2; 11 | 12 | import org.apache.commons.lang.StringUtils; 13 | 14 | import org.apache.spark.SparkConf; 15 | import org.apache.spark.api.java.JavaRDD; 16 | import org.apache.spark.api.java.JavaPairRDD; 17 | import org.apache.spark.api.java.JavaSparkContext; 18 | import org.apache.spark.api.java.function.FlatMapFunction; 19 | import org.apache.spark.api.java.function.Function2; 20 | import org.apache.spark.api.java.function.PairFunction; 21 | 22 | 23 | public class WordCount { 24 | public static void main(String[] args) throws Exception { 25 | String inputFile = args[0]; 26 | String outputFile = args[1]; 27 | // Create a Java Spark Context. 28 | SparkConf conf = new SparkConf().setAppName("wordCount"); 29 | JavaSparkContext sc = new JavaSparkContext(conf); 30 | // Load our input data. 31 | JavaRDD<String> input = sc.textFile(inputFile); 32 | // Split up into words.
33 | JavaRDD<String> words = input.flatMap( 34 | new FlatMapFunction<String, String>() { 35 | public Iterable<String> call(String x) { 36 | return Arrays.asList(x.split(" ")); 37 | }}); 38 | // Transform into pairs of word and count. 39 | JavaPairRDD<String, Integer> counts = words.mapToPair( 40 | new PairFunction<String, String, Integer>(){ 41 | public Tuple2<String, Integer> call(String x){ 42 | return new Tuple2(x, 1); 43 | }}).reduceByKey(new Function2<Integer, Integer, Integer>(){ 44 | public Integer call(Integer x, Integer y){ return x + y;}}); 45 | // Save the word count back out to a text file, causing evaluation. 46 | counts.saveAsTextFile(outputFile); 47 | } 48 | } 49 | 
-------------------------------------------------------------------------------- /mini-complete-example/src/main/scala/com/oreilly/learningsparkexamples/mini/scala/BasicMap.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple map in Scala 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | 8 | object BasicMap { 9 | def main(args: Array[String]) { 10 | val master = args.length match { 11 | case x: Int if x > 0 => args(0) 12 | case _ => "local" 13 | } 14 | val sc = new SparkContext(master, "BasicMap", System.getenv("SPARK_HOME")) 15 | val input = sc.parallelize(List(1,2,3,4)) 16 | val result = input.map(x => x*x) 17 | println(result.collect().mkString(",")) 18 | } 19 | } 20 | 
-------------------------------------------------------------------------------- /mini-complete-example/src/main/scala/com/oreilly/learningsparkexamples/mini/scala/WordCount.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates flatMap + countByValue for wordcount. 3 | */ 4 | package com.oreilly.learningsparkexamples.mini.scala 5 | 6 | import org.apache.spark._ 7 | import org.apache.spark.SparkContext._ 8 | 9 | object WordCount { 10 | def main(args: Array[String]) { 11 | val inputFile = args(0) 12 | val outputFile = args(1) 13 | val conf = new SparkConf().setAppName("wordCount") 14 | // Create a Scala Spark Context. 15 | val sc = new SparkContext(conf) 16 | // Load our input data. 17 | val input = sc.textFile(inputFile) 18 | // Split up into words. 19 | val words = input.flatMap(line => line.split(" ")) 20 | // Transform into word and count. 21 | val counts = words.map(word => (word, 1)).reduceByKey{case (x, y) => x + y} 22 | // Save the word count back out to a text file, causing evaluation.
23 | counts.saveAsTextFile(outputFile) 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | com.oreilly.learningsparkexamples 3 | java 4 | 4.0.0 5 | examples 6 | jar 7 | 0.0.2 8 | 9 | 10 | Akka repository 11 | http://repo.akka.io/releases 12 | 13 | 14 | scala-tools 15 | https://oss.sonatype.org/content/groups/scala-tools 16 | 17 | 18 | apache 19 | https://repository.apache.org/content/repositories/releases 20 | 21 | 22 | twitter 23 | http://maven.twttr.com/ 24 | 25 | 26 | central2 27 | http://central.maven.org/maven2/ 28 | 29 | 30 | 31 | 32 | org.apache.spark 33 | spark-core_2.10 34 | 1.3.1 35 | provided 36 | 37 | 38 | org.apache.spark 39 | spark-sql_2.10 40 | 1.3.1 41 | provided 42 | 43 | 44 | org.apache.spark 45 | spark-hive_2.10 46 | 1.3.1 47 | provided 48 | 49 | 50 | org.apache.spark 51 | spark-streaming_2.10 52 | 1.3.1 53 | 54 | 55 | org.apache.spark 56 | spark-streaming-kafka_2.10 57 | 1.3.1 58 | 59 | 60 | org.apache.spark 61 | spark-mllib 62 | 1.3.1 63 | 64 | 65 | com.datastax.spark 66 | spark-cassandra-connector 67 | 1.0.0-rc5 68 | 69 | 70 | com.datastax.spark 71 | spark-cassandra-connector-java 72 | 1.0.0-rc5 73 | 74 | 75 | org.elasticsearch 76 | elasticsearch-hadoop-mr 77 | 2.0.0.RC1 78 | 79 | 80 | org.eclipse.jetty 81 | jetty-client 82 | 8.1.14.v20131031 83 | 84 | 85 | com.fasterxml.jackson.core 86 | jackson-databind 87 | 2.3.3 88 | 89 | 90 | org.apache.commons 91 | commons-lang3 92 | 3.0 93 | 94 | 95 | net.sf.opencsv 96 | opencsv 97 | 2.0 98 | 99 | 100 | org.scalatest 101 | scalatest_${scala.binary.version} 102 | 2.2.1 103 | 104 | 105 | 106 | 1.7 107 | 108 | 109 | 110 | 111 | 112 | org.apache.maven.plugins 113 | maven-compiler-plugin 114 | 3.1 115 | 116 | ${java.version} 117 | ${java.version} 118 | 119 | 120 | 121 | org.apache.maven.plugins 122 | maven-assembly-plugin 123 | 2.2.2 124 | 125 | 126 | 127 | 128 | src/main/assembly/assembly.xml 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += Resolver.url("artifactory", url("http://scalasbt.artifactoryonline.com/scalasbt/sbt-plugin-releases"))(Resolver.ivyStylePatterns) 2 | 3 | resolvers += "Typesafe Repository" at "http://repo.typesafe.com/typesafe/releases/" 4 | 5 | resolvers += "Spray Repository" at "http://repo.spray.cc/" 6 | 7 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.10.2") 8 | 9 | addSbtPlugin("com.github.gseitz" % "sbt-protobuf" % "0.3.3") 10 | -------------------------------------------------------------------------------- /run-all-examples: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # This script is used to run all of the examples. 
Mostly to be used by travis for testing 3 | # Output the commands we run 4 | set -x 5 | # If any command fails, fail 6 | set -e 7 | # Build everything 8 | ./sbt/sbt compile package assembly > sbtlog || (echo "sbt failed" && cat ./sbtlog && exit 1) 9 | KAFKA_ROOT=./kafka_2.9.2-0.8.1.1 10 | SPARK_SUBMIT_SCRIPT=$SPARK_HOME/bin/spark-submit 11 | ASSEMBLY_JAR=./target/scala-2.10/learning-spark-examples-assembly-0.0.1.jar 12 | # Mini cleanup 13 | rm -rf /tmp/py; mkdir -p /tmp/py 14 | rm -rf /tmp/java; mkdir -p /tmp/java 15 | rm -rf /tmp/scala; mkdir -p /tmp/scala 16 | # setup cassandra 17 | # cqlsh --file ./files/cqlsh_setup & 18 | # Scala 19 | echo "Running Scala programs" 20 | $SPARK_SUBMIT_SCRIPT --class com.oreilly.learningsparkexamples.scala.LoadJsonWithSparkSQL $ASSEMBLY_JAR local ./files/pandainfo.json 21 | $SPARK_SUBMIT_SCRIPT --class com.oreilly.learningsparkexamples.scala.ChapterSixExample $ASSEMBLY_JAR local ./files/callsigns ./files/callsigns /tmp/scala/ch6out 22 | TWITTER_DATA=./files/testweet.json 23 | $SPARK_SUBMIT_SCRIPT --class com.oreilly.learningsparkexamples.scala.SparkSQLTwitter $ASSEMBLY_JAR "$TWITTER_DATA" /tmp/scala/tweetout 24 | #$SPARK_SUBMIT_SCRIPT --class com.oreilly.learningsparkexamples.scala.BasicQueryCassandra $ASSEMBLY_JAR local localhost 25 | echo "Running Scala streaming program" 26 | ./bin/fakelogs.sh & 27 | sleep 1 28 | $SPARK_SUBMIT_SCRIPT --class com.oreilly.learningsparkexamples.scala.StreamingLogInput $ASSEMBLY_JAR local[4] 29 | echo "Running Scala Kafka streaming example" 30 | $SPARK_SUBMIT_SCRIPT --master local[4] --class com.oreilly.learningsparkexamples.scala.KafkaInput $ASSEMBLY_JAR localhost:2181 spark-readers pandas 1 & 31 | KAFKA_PID=$! 32 | sleep 1 33 | echo "panda\nerror panda" | $KAFKA_ROOT/bin/kafka-console-producer.sh --broker-list localhost:9092 --topic pandas 34 | wait $KAFKA_PID 35 | echo "Running Scala Flume example" 36 | $SPARK_SUBMIT_SCRIPT --master local[4] --class com.oreilly.learningsparkexamples.scala.FlumeInput $ASSEMBLY_JAR localhost 7788 & 37 | FLUME_PID=$! 
38 | sleep 1 39 | echo "panda\nerror panda\n" | nc localhost 44444 40 | sleep 3 41 | echo "panda2\nerror panda2\n" | nc localhost 44444 42 | wait $FLUME_PID 43 | # Python 44 | echo "Running Python programs" 45 | $SPARK_SUBMIT_SCRIPT ./src/python/AvgMapPartitions.py local 46 | $SPARK_SUBMIT_SCRIPT ./src/python/BasicAvg.py local 47 | $SPARK_SUBMIT_SCRIPT ./src/python/BasicFilterMap.py local 48 | $SPARK_SUBMIT_SCRIPT ./src/python/BasicKeyValueMapFilter.py local 49 | $SPARK_SUBMIT_SCRIPT ./src/python/BasicMapPartitions.py local 50 | $SPARK_SUBMIT_SCRIPT ./src/python/BasicMap.py local 51 | $SPARK_SUBMIT_SCRIPT ./src/python/ChapterSixExample.py local ./files/callsigns /tmp/py/pandaout 52 | $SPARK_SUBMIT_SCRIPT ./src/python/SparkSQLTwitter.py ./files/testweet.json /tmp/py/tweetout 53 | $SPARK_SUBMIT_SCRIPT ./src/python/LoadCsv.py local ./files/favourite_animals.csv /tmp/py/panda_lovers.csv 54 | $SPARK_SUBMIT_SCRIPT ./src/python/MakeHiveTable.py local ./files/int_string.csv pandaplural 55 | # Temporarily disabled due to API changes 56 | #$SPARK_SUBMIT_SCRIPT ./src/python/LoadHive.py local pandaplural 57 | $SPARK_SUBMIT_SCRIPT ./src/python/LoadJson.py local ./files/pandainfo.json /tmp/py/loadjsonout 58 | $SPARK_SUBMIT_SCRIPT ./src/python/PerKeyAvg.py local 59 | $SPARK_SUBMIT_SCRIPT ./src/python/RemoveOutliers.py local 60 | $SPARK_SUBMIT_SCRIPT ./src/python/WordCount.py local 61 | $SPARK_SUBMIT_SCRIPT ./src/python/MakeParquetFile.py local ./files/favourite_animals.csv /tmp/py/favouriteanimal_parquet 62 | $SPARK_SUBMIT_SCRIPT ./src/python/QueryParquetFile.py local /tmp/py/favouriteanimal_parquet 63 | 64 | # Java 65 | echo "Running Java programs" 66 | $SPARK_SUBMIT_SCRIPT --class com.oreilly.learningsparkexamples.java.LoadJsonWithSparkSQL $ASSEMBLY_JAR local ./files/pandainfo.json 67 | $SPARK_SUBMIT_SCRIPT --class com.oreilly.learningsparkexamples.java.ChapterSixExample $ASSEMBLY_JAR local ./files/callsigns ./files/callsigns /tmp/java/ch6out 68 | ./sbt/sbt assembly && $SPARK_SUBMIT_SCRIPT --class com.oreilly.learningsparkexamples.java.SparkSQLTwitter $ASSEMBLY_JAR ./files/testweet.json /tmp/java/tweetout 69 | #$SPARK_SUBMIT_SCRIPT --class com.oreilly.learningsparkexamples.java.BasicQueryCassandra $ASSEMBLY_JAR local localhost 70 | echo "Running Java streaming program" 71 | ./bin/fakelogs.sh & 72 | sleep 1 73 | $SPARK_SUBMIT_SCRIPT --class com.oreilly.learningsparkexamples.java.StreamingLogInput $ASSEMBLY_JAR local[4] 74 | sleep 5 75 | echo "Running Java Kafka streaming example" 76 | $SPARK_SUBMIT_SCRIPT --class com.oreilly.learningsparkexamples.java.KafkaInput $ASSEMBLY_JAR localhost:2181 spark-java-readers 77 | echo "panda\nerror panda" | $KAFKA_ROOT/bin/kafka-console-producer.sh --broker-list localhost:9092 --topic pandas 78 | 79 | echo "Done running all programs :)" 80 | -------------------------------------------------------------------------------- /sbt/sbt: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. 
You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | # This script launches sbt for this project. If present it uses the system 21 | # version of sbt. If there is no system version of sbt it attempts to download 22 | # sbt locally. 23 | SBT_VERSION=0.13.7 24 | URL1=http://typesafe.artifactoryonline.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar 25 | URL2=http://repo.typesafe.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar 26 | JAR=sbt/sbt-launch-${SBT_VERSION}.jar 27 | 28 | # Download sbt launch jar if it hasn't been downloaded yet 29 | if [ ! -f ${JAR} ]; then 30 | # Download 31 | printf "Attempting to fetch sbt\n" 32 | JAR_DL=${JAR}.part 33 | if hash wget 2>/dev/null; then 34 | (wget --progress=bar ${URL1} -O ${JAR_DL} || wget --progress=bar ${URL2} -O ${JAR_DL}) && mv ${JAR_DL} ${JAR} 35 | elif hash curl 2>/dev/null; then 36 | (curl -L --progress=bar ${URL1} -O ${JAR_DL} || curl -L --progress=bar ${URL2} -O ${JAR_DL}) && mv ${JAR_DL} ${JAR} 37 | else 38 | printf "You do not have curl or wget installed, please install sbt manually from http://www.scala-sbt.org/\n" 39 | exit -1 40 | fi 41 | fi 42 | if [ ! -f ${JAR} ]; then 43 | # We failed to download 44 | printf "Our attempt to download sbt locally to ${JAR} failed. Please install sbt manually from http://www.scala-sbt.org/\n" 45 | exit -1 46 | fi 47 | printf "Launching sbt from ${JAR}\n" 48 | java \ 49 | -Xmx1400m -XX:MaxPermSize=1024m -XX:ReservedCodeCacheSize=256m \ 50 | -jar ${JAR} \ 51 | "$@" 52 | -------------------------------------------------------------------------------- /setup-project: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | set -e 4 | set -o pipefail 5 | sudo apt-get install -y axel time 6 | echo "Downloading misc tools" 7 | sudo rm -f /etc/apt/sources.list.d/cassandra.sources.list 8 | echo "deb http://debian.datastax.com/community stable main" | sudo tee -a /etc/apt/sources.list.d/cassandra.sources.list 9 | curl -L http://debian.datastax.com/debian/repo_key | sudo apt-key add - 10 | sudo apt-get update > aptlog & 11 | APT_GET_UPDATE_PID=$! 12 | axel http://d3kbcqa49mib13.cloudfront.net/spark-1.3.1-bin-hadoop1.tgz > sparkdl & 13 | SPARK_DL_PID=$! 14 | axel http://mirrors.ibiblio.org/apache/kafka/0.8.1.1/kafka_2.9.2-0.8.1.1.tgz > kafkadl & 15 | KAFKA_DL_PID=$! 16 | axel http://mirror.cogentco.com/pub/apache/flume/1.5.0.1/apache-flume-1.5.0.1-bin.tar.gz > flumedl & 17 | FLUME_DL_PID=$! 18 | wait $SPARK_DL_PID 19 | sudo mkdir -p /etc/apt/sources.list.d/ 20 | echo "install urllib3" 21 | sudo pip install urllib3 22 | wait $SPARK_DL_PID || echo "Spark DL finished early" 23 | tar -xf spark-1.3.1-bin-hadoop1.tgz 24 | wait $APT_GET_UPDATE_PID 25 | echo "Installing protobuf" 26 | sudo apt-get install protobuf-compiler 27 | echo $? 
28 | # Set up cassandra 29 | echo "Waiting for apt-get update to finish" 30 | wait $APT_GET_UPDATE_PID || echo "apt-get update finished early" 31 | echo "Setting up dsc (cassandra)" 32 | sleep 1; 33 | #sudo apt-get -y --force-yes remove cassandra cassandra-tools 34 | #sudo rm -rf /etc/security/limits.d/cassandra.conf || echo "No cassandra security conf" 35 | #yes | sudo apt-get -y --force-yes install dsc21 > dscinstall.log 36 | #yes | sudo apt-get -y --force-yes install cassandra-tools > ctoolsinstall.log 37 | echo "Starting cassandra" 38 | sudo /etc/init.d/cassandra start 39 | echo $? 40 | echo "set up hive directories" 41 | export IAM=`whoami` 42 | sudo mkdir -p /user/hive && sudo chown -R $IAM /user/hive 43 | echo "done with setup" 44 | # Set up kafka 45 | echo "Setting up kafka" 46 | wait $KAFKA_DL_PID || echo "Kafka DL finished early" 47 | tar -xzf kafka_2.9.2-0.8.1.1.tgz 48 | cd kafka_2.9.2-0.8.1.1 49 | echo "Starting zookeeper" 50 | ./bin/zookeeper-server-start.sh config/zookeeper.properties & 51 | echo "Starting kafka" 52 | sleep 5 53 | ./bin/kafka-server-start.sh config/server.properties & 54 | sleep 5 55 | # publish a pandas topic to kafka 56 | ./bin/kafka-topics.sh --zookeeper localhost:2181 --topic pandas --partition 1 --replication-factor 1 --create 57 | ./bin/kafka-topics.sh --zookeeper localhost:2181 --topic logs --partition 1 --replication-factor 1 --create 58 | cd .. 59 | 60 | # set up flume 61 | wait $FLUME_DL_PID || echo "Flume DL finished early" 62 | echo "Setting up flume" 63 | tar -xf apache-flume-1.5.0.1-bin.tar.gz 64 | cd apache-flume-1.5.0.1-bin 65 | ./bin/flume-ng agent -n panda --conf-file ../files/flumeconf.cfg & 66 | disown $! 67 | cd .. 68 | echo $? 69 | -------------------------------------------------------------------------------- /src/R/finddistance.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | library("Imap") 3 | f <- file("stdin") 4 | open(f) 5 | while(length(line <- readLines(f,n=1)) > 0) { 6 | # process line 7 | contents <- Map(as.numeric, strsplit(line, ",")) 8 | mydist <- gdist(contents[[1]][1], contents[[1]][2], contents[[1]][3], contents[[1]][4], 9 | units="m", a=6378137.0, b=6356752.3142, verbose = FALSE) 10 | write(mydist, stdout()) 11 | } 12 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/BasicAvg.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates how to compute an average using aggregate in Java 3 | */ 4 | package com.oreilly.learningsparkexamples.java; 5 | 6 | import java.io.Serializable; 7 | import java.util.Arrays; 8 | import java.util.List; 9 | 10 | import org.apache.commons.lang.StringUtils; 11 | 12 | import org.apache.spark.api.java.JavaRDD; 13 | import org.apache.spark.api.java.JavaSparkContext; 14 | import org.apache.spark.api.java.function.Function2; 15 | 16 | public final class BasicAvg { 17 | public static class AvgCount implements Serializable { 18 | public AvgCount(int total, int num) { 19 | total_ = total; 20 | num_ = num; 21 | } 22 | public int total_; 23 | public int num_; 24 | public float avg() { 25 | return total_ / (float) num_; 26 | } 27 | } 28 | 29 | public static void main(String[] args) throws Exception { 30 | String master; 31 | if (args.length > 0) { 32 | master = args[0]; 33 | } else { 34 | master = "local"; 35 | } 36 | 37 | JavaSparkContext sc = new JavaSparkContext( 38 | master, "basicavg", 
System.getenv("SPARK_HOME"), System.getenv("JARS")); 39 | JavaRDD rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4)); 40 | Function2 addAndCount = new Function2() { 41 | @Override 42 | public AvgCount call(AvgCount a, Integer x) { 43 | a.total_ += x; 44 | a.num_ += 1; 45 | return a; 46 | } 47 | }; 48 | Function2 combine = new Function2() { 49 | @Override 50 | public AvgCount call(AvgCount a, AvgCount b) { 51 | a.total_ += b.total_; 52 | a.num_ += b.num_; 53 | return a; 54 | } 55 | }; 56 | AvgCount initial = new AvgCount(0,0); 57 | AvgCount result = rdd.aggregate(initial, addAndCount, combine); 58 | System.out.println(result.avg()); 59 | sc.stop(); 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/BasicAvgMapPartitions.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple map partitions in Java to compute the average 3 | */ 4 | package com.oreilly.learningsparkexamples.java; 5 | 6 | import java.util.Arrays; 7 | import java.util.ArrayList; 8 | import java.util.List; 9 | import java.util.Iterator; 10 | 11 | import org.apache.commons.lang.StringUtils; 12 | 13 | import org.eclipse.jetty.client.ContentExchange; 14 | import org.eclipse.jetty.client.HttpClient; 15 | 16 | 17 | import org.apache.spark.api.java.JavaRDD; 18 | import org.apache.spark.api.java.JavaSparkContext; 19 | import org.apache.spark.api.java.function.FlatMapFunction; 20 | import org.apache.spark.api.java.function.Function2; 21 | 22 | public final class BasicAvgMapPartitions { 23 | class AvgCount { 24 | public AvgCount() { 25 | total_ = 0; 26 | num_ = 0; 27 | } 28 | public AvgCount(Integer total, Integer num) { 29 | total_ = total; 30 | num_ = num; 31 | } 32 | public AvgCount merge(Iterable input) { 33 | for (Integer elem : input) { 34 | num_ += 1; 35 | total_ += elem; 36 | } 37 | return this; 38 | } 39 | public Integer total_; 40 | public Integer num_; 41 | public float avg() { 42 | return total_ / (float) num_; 43 | } 44 | } 45 | 46 | public static void main(String[] args) throws Exception { 47 | String master; 48 | if (args.length > 0) { 49 | master = args[0]; 50 | } else { 51 | master = "local"; 52 | } 53 | BasicAvgMapPartitions bamp = new BasicAvgMapPartitions(); 54 | bamp.run(master); 55 | } 56 | 57 | public void run(String master) { 58 | JavaSparkContext sc = new JavaSparkContext( 59 | master, "basicavgmappartitions", System.getenv("SPARK_HOME"), System.getenv("JARS")); 60 | JavaRDD rdd = sc.parallelize( 61 | Arrays.asList(1, 2, 3, 4, 5)); 62 | FlatMapFunction, AvgCount> setup = new FlatMapFunction, AvgCount>() { 63 | @Override 64 | public Iterable call(Iterator input) { 65 | AvgCount a = new AvgCount(0, 0); 66 | while (input.hasNext()) { 67 | a.total_ += input.next(); 68 | a.num_ += 1; 69 | } 70 | ArrayList ret = new ArrayList(); 71 | ret.add(a); 72 | return ret; 73 | } 74 | }; 75 | Function2 combine = new Function2() { 76 | @Override 77 | public AvgCount call(AvgCount a, AvgCount b) { 78 | a.total_ += b.total_; 79 | a.num_ += b.num_; 80 | return a; 81 | } 82 | }; 83 | 84 | AvgCount result = rdd.mapPartitions(setup).reduce(combine); 85 | System.out.println(result.avg()); 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/BasicAvgWithKryo.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates Kryo 
serialization in Java 3 | */ 4 | package com.oreilly.learningsparkexamples.java; 5 | 6 | import java.util.Arrays; 7 | import java.util.List; 8 | 9 | import org.apache.commons.lang.StringUtils; 10 | 11 | import org.apache.spark.SparkConf; 12 | import org.apache.spark.serializer.KryoRegistrator; 13 | import org.apache.spark.api.java.JavaRDD; 14 | import org.apache.spark.api.java.JavaSparkContext; 15 | import org.apache.spark.api.java.function.Function2; 16 | 17 | import com.esotericsoftware.kryo.Kryo; 18 | import com.esotericsoftware.kryo.serializers.FieldSerializer; 19 | 20 | public final class BasicAvgWithKryo { 21 | // This is our custom class we will configure Kyro to serialize 22 | static class AvgCount implements java.io.Serializable { 23 | public AvgCount() { 24 | total_ = 0; 25 | num_ = 0; 26 | } 27 | public AvgCount(int total, int num) { 28 | total_ = total; 29 | num_ = num; 30 | } 31 | public float avg() { 32 | return total_ / (float) num_; 33 | } 34 | public int total_; 35 | public int num_; 36 | } 37 | 38 | public static class AvgRegistrator implements KryoRegistrator { 39 | public void registerClasses(Kryo kryo) { 40 | kryo.register(AvgCount.class, new FieldSerializer(kryo, AvgCount.class)); 41 | } 42 | } 43 | 44 | public static void main(String[] args) throws Exception { 45 | String master; 46 | if (args.length > 0) { 47 | master = args[0]; 48 | } else { 49 | master = "local"; 50 | } 51 | 52 | SparkConf conf = new SparkConf().setMaster(master).setAppName("basicavgwithkyro"); 53 | conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); 54 | conf.set("spark.kryo.registrator", AvgRegistrator.class.getName()); 55 | JavaSparkContext sc = new JavaSparkContext(conf); 56 | JavaRDD rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4)); 57 | Function2 addAndCount = new Function2() { 58 | @Override 59 | public AvgCount call(AvgCount a, Integer x) { 60 | a.total_ += x; 61 | a.num_ += 1; 62 | return a; 63 | } 64 | }; 65 | Function2 combine = new Function2() { 66 | @Override 67 | public AvgCount call(AvgCount a, AvgCount b) { 68 | a.total_ += b.total_; 69 | a.num_ += b.num_; 70 | return a; 71 | } 72 | }; 73 | AvgCount initial = new AvgCount(0,0); 74 | AvgCount result = rdd.aggregate(initial, addAndCount, combine); 75 | System.out.println(result.avg()); 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/BasicFlatMap.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple flatMap in Java to extract the words 3 | */ 4 | package com.oreilly.learningsparkexamples.java; 5 | 6 | import java.util.Arrays; 7 | import java.util.List; 8 | import java.util.Map; 9 | import java.util.Map.Entry; 10 | 11 | 12 | import org.apache.commons.lang.StringUtils; 13 | 14 | import org.apache.spark.api.java.JavaRDD; 15 | import org.apache.spark.api.java.JavaPairRDD; 16 | import org.apache.spark.api.java.JavaSparkContext; 17 | import org.apache.spark.api.java.function.FlatMapFunction; 18 | 19 | public class BasicFlatMap { 20 | public static void main(String[] args) throws Exception { 21 | 22 | if (args.length != 2) { 23 | throw new Exception("Usage BasicFlatMap sparkMaster inputFile"); 24 | } 25 | 26 | JavaSparkContext sc = new JavaSparkContext( 27 | args[0], "basicflatmap", System.getenv("SPARK_HOME"), System.getenv("JARS")); 28 | JavaRDD rdd = sc.textFile(args[1]); 29 | JavaRDD words = rdd.flatMap( 30 | new FlatMapFunction() { public 
Iterable call(String x) { 31 | return Arrays.asList(x.split(" ")); 32 | }}); 33 | Map result = words.countByValue(); 34 | for (Entry entry: result.entrySet()) { 35 | System.out.println(entry.getKey() + ":" + entry.getValue()); 36 | } 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/BasicJoinCsv.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates joining two csv files 3 | */ 4 | package com.oreilly.learningsparkexamples.java; 5 | 6 | import java.io.StringReader; 7 | import java.util.Arrays; 8 | import java.util.List; 9 | import scala.Tuple2; 10 | 11 | import au.com.bytecode.opencsv.CSVReader; 12 | 13 | import org.apache.commons.lang.StringUtils; 14 | import org.apache.spark.api.java.JavaRDD; 15 | import org.apache.spark.api.java.JavaPairRDD; 16 | import org.apache.spark.api.java.JavaSparkContext; 17 | import org.apache.spark.api.java.function.PairFunction; 18 | 19 | public class BasicJoinCsv { 20 | 21 | public static class ParseLine implements PairFunction { 22 | public Tuple2 call(String line) throws Exception { 23 | CSVReader reader = new CSVReader(new StringReader(line)); 24 | String[] elements = reader.readNext(); 25 | Integer key = Integer.parseInt(elements[0]); 26 | return new Tuple2(key, elements); 27 | } 28 | } 29 | 30 | public static void main(String[] args) throws Exception { 31 | if (args.length != 3) { 32 | throw new Exception("Usage BasicJoinCsv sparkMaster csv1 csv2"); 33 | } 34 | String master = args[0]; 35 | String csv1 = args[1]; 36 | String csv2 = args[2]; 37 | BasicJoinCsv jsv = new BasicJoinCsv(); 38 | jsv.run(master, csv1, csv2); 39 | } 40 | 41 | public void run(String master, String csv1, String csv2) throws Exception { 42 | JavaSparkContext sc = new JavaSparkContext( 43 | master, "basicjoincsv", System.getenv("SPARK_HOME"), System.getenv("JARS")); 44 | JavaRDD csvFile1 = sc.textFile(csv1); 45 | JavaRDD csvFile2 = sc.textFile(csv2); 46 | JavaPairRDD keyedRDD1 = csvFile1.mapToPair(new ParseLine()); 47 | JavaPairRDD keyedRDD2 = csvFile1.mapToPair(new ParseLine()); 48 | JavaPairRDD> result = keyedRDD1.join(keyedRDD2); 49 | List>> resultCollection = result.collect(); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/BasicLoadJson.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates loading a json file and finding out if people like pandas 3 | */ 4 | package com.oreilly.learningsparkexamples.java; 5 | 6 | import java.io.StringReader; 7 | import java.util.ArrayList; 8 | import java.util.List; 9 | import java.util.Iterator; 10 | import java.lang.Iterable; 11 | import scala.Tuple2; 12 | 13 | import org.apache.commons.lang.StringUtils; 14 | import org.apache.spark.api.java.JavaRDD; 15 | import org.apache.spark.api.java.JavaPairRDD; 16 | import org.apache.spark.api.java.JavaSparkContext; 17 | import org.apache.spark.api.java.function.FlatMapFunction; 18 | import org.apache.spark.api.java.function.Function; 19 | 20 | import com.fasterxml.jackson.databind.ObjectMapper; 21 | import com.fasterxml.jackson.databind.ObjectWriter; 22 | 23 | public class BasicLoadJson { 24 | 25 | public static class Person implements java.io.Serializable { 26 | public String name; 27 | public Boolean lovesPandas; 28 | } 29 | 30 | public static class ParseJson implements 
FlatMapFunction, Person> { 31 | public Iterable call(Iterator lines) throws Exception { 32 | ArrayList people = new ArrayList(); 33 | ObjectMapper mapper = new ObjectMapper(); 34 | while (lines.hasNext()) { 35 | String line = lines.next(); 36 | try { 37 | people.add(mapper.readValue(line, Person.class)); 38 | } catch (Exception e) { 39 | // Skip invalid input 40 | } 41 | } 42 | return people; 43 | } 44 | } 45 | 46 | public static class LikesPandas implements Function { 47 | public Boolean call(Person person) { 48 | return person.lovesPandas; 49 | } 50 | } 51 | 52 | 53 | public static class WriteJson implements FlatMapFunction, String> { 54 | public Iterable call(Iterator people) throws Exception { 55 | ArrayList text = new ArrayList(); 56 | ObjectMapper mapper = new ObjectMapper(); 57 | while (people.hasNext()) { 58 | Person person = people.next(); 59 | text.add(mapper.writeValueAsString(person)); 60 | } 61 | return text; 62 | } 63 | } 64 | 65 | public static void main(String[] args) throws Exception { 66 | if (args.length != 3) { 67 | throw new Exception("Usage BasicLoadJson [sparkMaster] [jsoninput] [jsonoutput]"); 68 | } 69 | String master = args[0]; 70 | String fileName = args[1]; 71 | String outfile = args[2]; 72 | 73 | JavaSparkContext sc = new JavaSparkContext( 74 | master, "basicloadjson", System.getenv("SPARK_HOME"), System.getenv("JARS")); 75 | JavaRDD input = sc.textFile(fileName); 76 | JavaRDD result = input.mapPartitions(new ParseJson()).filter(new LikesPandas()); 77 | JavaRDD formatted = result.mapPartitions(new WriteJson()); 78 | formatted.saveAsTextFile(outfile); 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/BasicLoadSequenceFile.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates loading a sequence file of people and how many pandas they have seen 3 | */ 4 | package com.oreilly.learningsparkexamples.java; 5 | 6 | import java.util.List; 7 | import scala.Tuple2; 8 | 9 | import org.apache.spark.api.java.JavaPairRDD; 10 | import org.apache.spark.api.java.JavaSparkContext; 11 | import org.apache.spark.api.java.function.PairFunction; 12 | import org.apache.hadoop.io.IntWritable; 13 | import org.apache.hadoop.io.Text; 14 | 15 | public class BasicLoadSequenceFile { 16 | 17 | public static class ConvertToNativeTypes implements PairFunction, String, Integer> { 18 | public Tuple2 call(Tuple2 record) { 19 | return new Tuple2(record._1.toString(), record._2.get()); 20 | } 21 | } 22 | 23 | public static void main(String[] args) throws Exception { 24 | if (args.length != 2) { 25 | throw new Exception("Usage BasicLoadSequenceFile [sparkMaster] [input]"); 26 | } 27 | String master = args[0]; 28 | String fileName = args[1]; 29 | 30 | JavaSparkContext sc = new JavaSparkContext( 31 | master, "basicloadsequencefile", System.getenv("SPARK_HOME"), System.getenv("JARS")); 32 | JavaPairRDD input = sc.sequenceFile(fileName, Text.class, IntWritable.class); 33 | JavaPairRDD result = input.mapToPair(new ConvertToNativeTypes()); 34 | List> resultList = result.collect(); 35 | for (Tuple2 record : resultList) { 36 | System.out.println(record); 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/BasicLoadWholeCsv.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates joining 
two csv files 3 | */ 4 | package com.oreilly.learningsparkexamples.java; 5 | 6 | import java.io.StringReader; 7 | import java.util.Arrays; 8 | import java.util.ArrayList; 9 | import java.util.List; 10 | import scala.Tuple2; 11 | 12 | import au.com.bytecode.opencsv.CSVReader; 13 | 14 | import org.apache.commons.lang.StringUtils; 15 | import org.apache.spark.api.java.JavaRDD; 16 | import org.apache.spark.api.java.JavaPairRDD; 17 | import org.apache.spark.api.java.JavaSparkContext; 18 | import org.apache.spark.api.java.function.FlatMapFunction; 19 | import org.apache.spark.api.java.function.Function; 20 | 21 | public class BasicLoadWholeCsv { 22 | 23 | public static class ParseLine implements FlatMapFunction, String[]> { 24 | public Iterable call(Tuple2 file) throws Exception { 25 | CSVReader reader = new CSVReader(new StringReader(file._2())); 26 | return reader.readAll(); 27 | } 28 | } 29 | 30 | public static void main(String[] args) throws Exception { 31 | if (args.length != 3) { 32 | throw new Exception("Usage BasicLoadCsv sparkMaster csvInputFile csvOutputFile key"); 33 | } 34 | String master = args[0]; 35 | String csvInput = args[1]; 36 | String outputFile = args[2]; 37 | final String key = args[3]; 38 | 39 | JavaSparkContext sc = new JavaSparkContext( 40 | master, "loadwholecsv", System.getenv("SPARK_HOME"), System.getenv("JARS")); 41 | JavaPairRDD csvData = sc.wholeTextFiles(csvInput); 42 | JavaRDD keyedRDD = csvData.flatMap(new ParseLine()); 43 | JavaRDD result = 44 | keyedRDD.filter(new Function() { 45 | public Boolean call(String[] input) { return input[0].equals(key); }}); 46 | 47 | result.saveAsTextFile(outputFile); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/BasicMap.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple map in Java 3 | */ 4 | package com.oreilly.learningsparkexamples.java; 5 | 6 | import java.util.Arrays; 7 | import java.util.List; 8 | 9 | import org.apache.commons.lang.StringUtils; 10 | 11 | import org.apache.spark.api.java.JavaRDD; 12 | import org.apache.spark.api.java.JavaSparkContext; 13 | import org.apache.spark.api.java.function.Function; 14 | 15 | public class BasicMap { 16 | public static void main(String[] args) throws Exception { 17 | String master; 18 | if (args.length > 0) { 19 | master = args[0]; 20 | } else { 21 | master = "local"; 22 | } 23 | JavaSparkContext sc = new JavaSparkContext( 24 | master, "basicmap", System.getenv("SPARK_HOME"), System.getenv("JARS")); 25 | JavaRDD rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4)); 26 | JavaRDD result = rdd.map( 27 | new Function() { public Integer call(Integer x) { return x*x;}}); 28 | System.out.println(StringUtils.join(result.collect(), ",")); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/BasicMapPartitions.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple map in Java 3 | */ 4 | package com.oreilly.learningsparkexamples.java; 5 | 6 | import java.util.Arrays; 7 | import java.util.ArrayList; 8 | import java.util.List; 9 | import java.util.Iterator; 10 | 11 | import org.apache.commons.lang.StringUtils; 12 | 13 | import org.eclipse.jetty.client.ContentExchange; 14 | import org.eclipse.jetty.client.HttpClient; 15 | 16 | 17 | import 
org.apache.spark.api.java.JavaRDD; 18 | import org.apache.spark.api.java.JavaSparkContext; 19 | import org.apache.spark.api.java.function.FlatMapFunction; 20 | 21 | public class BasicMapPartitions { 22 | public static void main(String[] args) throws Exception { 23 | String master; 24 | if (args.length > 0) { 25 | master = args[0]; 26 | } else { 27 | master = "local"; 28 | } 29 | JavaSparkContext sc = new JavaSparkContext( 30 | master, "basicmappartitions", System.getenv("SPARK_HOME"), System.getenv("JARS")); 31 | JavaRDD rdd = sc.parallelize( 32 | Arrays.asList("KK6JKQ", "Ve3UoW", "kk6jlk", "W6BB")); 33 | JavaRDD result = rdd.mapPartitions( 34 | new FlatMapFunction, String>() { 35 | public Iterable call(Iterator input) { 36 | ArrayList content = new ArrayList(); 37 | ArrayList cea = new ArrayList(); 38 | HttpClient client = new HttpClient(); 39 | try { 40 | client.start(); 41 | while (input.hasNext()) { 42 | ContentExchange exchange = new ContentExchange(true); 43 | exchange.setURL("http://qrzcq.com/call/" + input.next()); 44 | client.send(exchange); 45 | cea.add(exchange); 46 | } 47 | for (ContentExchange exchange : cea) { 48 | exchange.waitForDone(); 49 | content.add(exchange.getResponseContent()); 50 | } 51 | } catch (Exception e) { 52 | } 53 | return content; 54 | }}); 55 | System.out.println(StringUtils.join(result.collect(), ",")); 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/BasicMapThenFilter.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple map then filter in Java 3 | */ 4 | package com.oreilly.learningsparkexamples.java; 5 | 6 | import java.util.Arrays; 7 | import java.util.List; 8 | 9 | import org.apache.commons.lang.StringUtils; 10 | 11 | import org.apache.spark.api.java.JavaRDD; 12 | import org.apache.spark.api.java.JavaSparkContext; 13 | import org.apache.spark.api.java.function.Function; 14 | 15 | public class BasicMapThenFilter { 16 | public static void main(String[] args) throws Exception { 17 | String master; 18 | if (args.length > 0) { 19 | master = args[0]; 20 | } else { 21 | master = "local"; 22 | } 23 | JavaSparkContext sc = new JavaSparkContext( 24 | master, "basicmapfilter", System.getenv("SPARK_HOME"), System.getenv("JARS")); 25 | JavaRDD rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4)); 26 | JavaRDD squared = rdd.map( 27 | new Function() { public Integer call(Integer x) { return x*x;}}); 28 | JavaRDD result = squared.filter( 29 | new Function() { public Boolean call(Integer x) { return x != 1; }}); 30 | System.out.println(StringUtils.join(result.collect(), ",")); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/BasicMapToDouble.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple map to double in Java 3 | */ 4 | package com.oreilly.learningsparkexamples.java; 5 | 6 | import java.util.Arrays; 7 | import java.util.List; 8 | 9 | import org.apache.commons.lang.StringUtils; 10 | 11 | import org.apache.spark.api.java.JavaRDD; 12 | import org.apache.spark.api.java.JavaDoubleRDD; 13 | import org.apache.spark.api.java.JavaSparkContext; 14 | import org.apache.spark.api.java.function.DoubleFunction; 15 | 16 | public class BasicMapToDouble { 17 | public static void main(String[] args) throws Exception { 18 | String master; 19 | 
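// Use the first command-line argument as the master URL when one is given; otherwise fall back to the local scheduler so the example runs without a cluster.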
if (args.length > 0) { 20 | master = args[0]; 21 | } else { 22 | master = "local"; 23 | } 24 | JavaSparkContext sc = new JavaSparkContext( 25 | master, "basicmaptodouble", System.getenv("SPARK_HOME"), System.getenv("JARS")); 26 | JavaRDD rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4)); 27 | JavaDoubleRDD result = rdd.mapToDouble( 28 | new DoubleFunction() { 29 | public double call(Integer x) { 30 | double y = (double) x; 31 | return y * y; 32 | } 33 | }); 34 | System.out.println(StringUtils.join(result.collect(), ",")); 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/BasicQueryCassandra.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates loading a json file and finding out if people like pandas 3 | */ 4 | package com.oreilly.learningsparkexamples.java; 5 | import java.io.Serializable; 6 | 7 | import java.io.StringReader; 8 | import java.util.ArrayList; 9 | import java.util.List; 10 | import java.util.Iterator; 11 | import java.lang.Iterable; 12 | import scala.Tuple2; 13 | 14 | import org.apache.commons.lang.StringUtils; 15 | import org.apache.spark.SparkConf; 16 | import org.apache.spark.api.java.JavaRDD; 17 | import org.apache.spark.api.java.JavaPairRDD; 18 | import org.apache.spark.api.java.JavaSparkContext; 19 | import org.apache.spark.api.java.function.DoubleFunction; 20 | import org.apache.spark.api.java.function.FlatMapFunction; 21 | import org.apache.spark.api.java.function.Function; 22 | 23 | import com.datastax.spark.connector.CassandraRow; 24 | import static com.datastax.spark.connector.CassandraJavaUtil.javaFunctions; 25 | 26 | public class BasicQueryCassandra { 27 | public static void main(String[] args) throws Exception { 28 | if (args.length != 2) { 29 | throw new Exception("Usage BasicLoadJson [sparkMaster] [cassandraHost]"); 30 | } 31 | String sparkMaster = args[0]; 32 | String cassandraHost = args[1]; 33 | SparkConf conf = new SparkConf(true) 34 | .set("spark.cassandra.connection.host", cassandraHost); 35 | 36 | JavaSparkContext sc = new JavaSparkContext( 37 | sparkMaster, "basicquerycassandra", conf); 38 | // entire table as an RDD 39 | // assumes your table test was created as CREATE TABLE test.kv(key text PRIMARY KEY, value int); 40 | JavaRDD data = javaFunctions(sc).cassandraTable("test" , "kv"); 41 | // print some basic stats 42 | System.out.println(data.mapToDouble(new DoubleFunction() { 43 | public double call(CassandraRow row) { 44 | return row.getInt("value"); 45 | }}).stats()); 46 | // write some basic data to Cassandra 47 | ArrayList input = new ArrayList(); 48 | input.add(KeyValue.newInstance("mostmagic", 3)); 49 | JavaRDD kvRDD = sc.parallelize(input); 50 | javaFunctions(kvRDD, KeyValue.class).saveToCassandra("test", "kv"); 51 | } 52 | public static class KeyValue implements Serializable { 53 | private String key; 54 | private Integer value; 55 | public KeyValue() { 56 | } 57 | public static KeyValue newInstance(String k, Integer v) { 58 | KeyValue kv = new KeyValue(); 59 | kv.setKey(k); 60 | kv.setValue(v); 61 | return kv; 62 | } 63 | public String getKey() { 64 | return key; 65 | } 66 | public Integer getValue() { 67 | return value; 68 | } 69 | void setKey(String k) { 70 | this.key = k; 71 | } 72 | void setValue(Integer v) { 73 | this.value = v; 74 | } 75 | } 76 | } 77 | 78 | -------------------------------------------------------------------------------- 
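The Cassandra example above only prints the whole StatCounter returned by stats(). Below is a minimal, self-contained sketch of reading individual figures off a StatCounter, using the same Spark API that the RemoveOutliers example later in this listing relies on (the class name and sample numbers are ours):

import java.util.Arrays;

import org.apache.spark.api.java.JavaDoubleRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.util.StatCounter;

public class StatCounterSketch {
  public static void main(String[] args) {
    JavaSparkContext sc = new JavaSparkContext("local", "statcountersketch");
    JavaDoubleRDD values = sc.parallelizeDoubles(Arrays.asList(1.0, 2.0, 3.0, 1000.0));
    // stats() computes count, mean, min, max and variance in a single pass over the data.
    StatCounter stats = values.stats();
    System.out.println("count=" + stats.count() + " mean=" + stats.mean()
        + " stdev=" + Math.sqrt(stats.variance()));
    sc.stop();
  }
}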
/src/main/java/com/oreilly/learningsparkexamples/java/BasicSaveSequenceFile.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates saving a sequence file in Java using the old style hadoop APIs. 3 | */ 4 | package com.oreilly.learningsparkexamples.java; 5 | 6 | import java.util.ArrayList; 7 | import java.util.List; 8 | import scala.Tuple2; 9 | 10 | import org.apache.spark.api.java.JavaPairRDD; 11 | import org.apache.spark.api.java.JavaSparkContext; 12 | import org.apache.spark.api.java.function.PairFunction; 13 | import org.apache.hadoop.io.IntWritable; 14 | import org.apache.hadoop.io.Text; 15 | import org.apache.hadoop.mapred.SequenceFileOutputFormat; 16 | 17 | public class BasicSaveSequenceFile { 18 | 19 | public static class ConvertToWritableTypes implements PairFunction, Text, IntWritable> { 20 | public Tuple2 call(Tuple2 record) { 21 | return new Tuple2(new Text(record._1), new IntWritable(record._2)); 22 | } 23 | } 24 | 25 | public static void main(String[] args) throws Exception { 26 | if (args.length != 2) { 27 | throw new Exception("Usage BasicSaveSequenceFile [sparkMaster] [output]"); 28 | } 29 | String master = args[0]; 30 | String fileName = args[1]; 31 | 32 | JavaSparkContext sc = new JavaSparkContext( 33 | master, "basicloadsequencefile", System.getenv("SPARK_HOME"), System.getenv("JARS")); 34 | List> input = new ArrayList(); 35 | input.add(new Tuple2("coffee", 1)); 36 | input.add(new Tuple2("coffee", 2)); 37 | input.add(new Tuple2("pandas", 3)); 38 | JavaPairRDD rdd = sc.parallelizePairs(input); 39 | JavaPairRDD result = rdd.mapToPair(new ConvertToWritableTypes()); 40 | result.saveAsHadoopFile(fileName, Text.class, IntWritable.class, SequenceFileOutputFormat.class); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/BasicSum.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple fold in Java 3 | */ 4 | package com.oreilly.learningsparkexamples.java; 5 | 6 | import java.util.Arrays; 7 | import java.util.List; 8 | 9 | import org.apache.commons.lang.StringUtils; 10 | 11 | import org.apache.spark.api.java.JavaRDD; 12 | import org.apache.spark.api.java.JavaSparkContext; 13 | import org.apache.spark.api.java.function.Function2; 14 | 15 | public class BasicSum { 16 | public static void main(String[] args) throws Exception { 17 | String master; 18 | if (args.length > 0) { 19 | master = args[0]; 20 | } else { 21 | master = "local"; 22 | } 23 | JavaSparkContext sc = new JavaSparkContext( 24 | master, "basicmap", System.getenv("SPARK_HOME"), System.getenv("JARS")); 25 | JavaRDD rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4)); 26 | Integer result = rdd.fold(0, new Function2() { 27 | public Integer call(Integer x, Integer y) { return x + y;}}); 28 | System.out.println(result); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/CallLog.java: -------------------------------------------------------------------------------- 1 | package com.oreilly.learningsparkexamples.java; 2 | 3 | import java.io.Serializable; 4 | 5 | public class CallLog implements Serializable { 6 | public String callsign; 7 | public Double contactlat; 8 | public Double contactlong; 9 | public Double mylat; 10 | public Double mylong; 11 | } 12 | 
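CallLog is a plain serializable bean with public fields. A minimal sketch of filling one in from a line of JSON with Jackson, following the same ObjectMapper pattern as the BasicLoadJson example above (the sample line, its values and the class name are made up for illustration, and the class is assumed to sit in the same package as CallLog):

import com.fasterxml.jackson.databind.ObjectMapper;

public class CallLogParseSketch {
  public static void main(String[] args) throws Exception {
    ObjectMapper mapper = new ObjectMapper();
    // Jackson binds the JSON fields straight onto CallLog's public fields.
    String line = "{\"callsign\":\"KK6JKQ\",\"contactlat\":37.4,\"contactlong\":-122.1,"
        + "\"mylat\":37.8,\"mylong\":-122.3}";
    CallLog call = mapper.readValue(line, CallLog.class);
    System.out.println(call.callsign + " contacted from " + call.mylat + "," + call.mylong);
  }
}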
-------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/HappyPerson.java: -------------------------------------------------------------------------------- 1 | package com.oreilly.learningsparkexamples.java; 2 | import java.io.Serializable; 3 | 4 | 5 | class HappyPerson implements Serializable { 6 | private String name; 7 | private String favouriteBeverage; 8 | public HappyPerson() {} 9 | public HappyPerson(String n, String b) { 10 | name = n; favouriteBeverage = b; 11 | } 12 | public String getName() { return name; } 13 | public void setName(String n) { name = n; } 14 | public String getFavouriteBeverage() { return favouriteBeverage; } 15 | public void setFavouriteBeverage(String b) { favouriteBeverage = b; } 16 | }; 17 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/IntersectByKey.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple map in Java 3 | */ 4 | package com.oreilly.learningsparkexamples.java; 5 | 6 | import java.util.ArrayList; 7 | import java.util.Arrays; 8 | import java.util.List; 9 | import java.util.Map; 10 | import java.util.Map.Entry; 11 | 12 | import com.google.common.collect.Iterables; 13 | 14 | import scala.Tuple2; 15 | 16 | import org.apache.commons.lang.StringUtils; 17 | 18 | import org.apache.spark.api.java.JavaRDD; 19 | import org.apache.spark.api.java.JavaPairRDD; 20 | import org.apache.spark.api.java.JavaSparkContext; 21 | import org.apache.spark.api.java.function.Function; 22 | import org.apache.spark.api.java.function.Function2; 23 | import org.apache.spark.api.java.function.FlatMapFunction; 24 | 25 | public final class IntersectByKey { 26 | public static JavaPairRDD intersectByKey(JavaPairRDD rdd1, JavaPairRDD rdd2) { 27 | JavaPairRDD, Iterable>> grouped = rdd1.cogroup(rdd2); 28 | return grouped.flatMapValues(new Function, Iterable>, Iterable>() { 29 | @Override 30 | public Iterable call(Tuple2, Iterable> input) { 31 | ArrayList al = new ArrayList(); 32 | if (!Iterables.isEmpty(input._1()) && !Iterables.isEmpty(input._2())) { 33 | Iterables.addAll(al, input._1()); 34 | Iterables.addAll(al, input._2()); 35 | } 36 | return al; 37 | } 38 | }); 39 | } 40 | public static void main(String[] args) throws Exception { 41 | String master; 42 | if (args.length > 0) { 43 | master = args[0]; 44 | } else { 45 | master = "local"; 46 | } 47 | 48 | JavaSparkContext sc = new JavaSparkContext( 49 | master, "IntersectByKey", System.getenv("SPARK_HOME"), System.getenv("JARS")); 50 | List> input1 = new ArrayList(); 51 | input1.add(new Tuple2("coffee", 1)); 52 | input1.add(new Tuple2("coffee", 2)); 53 | input1.add(new Tuple2("pandas", 3)); 54 | List> input2 = new ArrayList(); 55 | input2.add(new Tuple2("pandas", 20)); 56 | JavaPairRDD rdd1 = sc.parallelizePairs(input1); 57 | JavaPairRDD rdd2 = sc.parallelizePairs(input2); 58 | JavaPairRDD result = intersectByKey(rdd1, rdd2); 59 | for (Tuple2 entry : result.collect()) { 60 | System.out.println(entry._1() + ":" + entry._2()); 61 | } 62 | System.out.println("Done"); 63 | sc.stop(); 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/KafkaInput.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple map then filter in Java 3 | */ 4 | 
*/ 4 |
package com.oreilly.learningsparkexamples.java; 5 | 6 | import java.util.Arrays; 7 | import java.util.HashMap; 8 | import java.util.List; 9 | import java.util.Map; 10 | 11 | import org.apache.commons.lang.StringUtils; 12 | 13 | import org.apache.spark.SparkConf; 14 | import org.apache.spark.api.java.JavaRDD; 15 | import org.apache.spark.api.java.JavaSparkContext; 16 | import org.apache.spark.api.java.function.Function; 17 | import org.apache.spark.streaming.api.java.JavaPairDStream; 18 | import org.apache.spark.streaming.api.java.JavaStreamingContext; 19 | import org.apache.spark.streaming.Duration; 20 | import org.apache.spark.streaming.kafka.*; 21 | 22 | public final class KafkaInput { 23 | public static void main(String[] args) throws Exception { 24 | String zkQuorum = args[0]; 25 | String group = args[1]; 26 | SparkConf conf = new SparkConf().setAppName("KafkaInput"); 27 | // Create a StreamingContext with a 1 second batch size 28 | JavaStreamingContext jssc = new JavaStreamingContext(conf, new Duration(1000)); 29 | Map topics = new HashMap(); 30 | topics.put("pandas", 1); 31 | JavaPairDStream input = KafkaUtils.createStream(jssc, zkQuorum, group, topics); 32 | input.print(); 33 | // start our streaming context and wait for it to "finish" 34 | jssc.start(); 35 | // Wait for 10 seconds then exit. To run forever call without a timeout 36 | jssc.awaitTermination(10000); 37 | // Stop the streaming context 38 | jssc.stop(); 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/KeyValueMapFilter.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates how to make a PairRDD then do a basic filter 3 | */ 4 | package com.oreilly.learningsparkexamples.java; 5 | 6 | import java.util.ArrayList; 7 | import java.util.Arrays; 8 | import java.util.List; 9 | import java.util.Map; 10 | import java.util.Map.Entry; 11 | 12 | import scala.Tuple2; 13 | 14 | import org.apache.commons.lang.StringUtils; 15 | 16 | import org.apache.spark.api.java.JavaRDD; 17 | import org.apache.spark.api.java.JavaPairRDD; 18 | import org.apache.spark.api.java.JavaSparkContext; 19 | import org.apache.spark.api.java.function.Function; 20 | import org.apache.spark.api.java.function.PairFunction; 21 | 22 | public final class KeyValueMapFilter { 23 | 24 | public static void main(String[] args) throws Exception { 25 | if (args.length != 2) { 26 | throw new Exception("Usage KeyValueMapFilter sparkMaster inputFile"); 27 | } 28 | String master = args[0]; 29 | String inputFile = args[1]; 30 | 31 | JavaSparkContext sc = new JavaSparkContext( 32 | master, "KeyValueMapFilter", System.getenv("SPARK_HOME"), System.getenv("JARS")); 33 | JavaRDD input = sc.textFile(inputFile); 34 | PairFunction keyData = new PairFunction() { 35 | @Override 36 | public Tuple2 call(String x) { 37 | return new Tuple2(x.split(" ")[0], x); 38 | } 39 | }; 40 | Function, Boolean> longWordFilter = new Function, Boolean>() { 41 | @Override 42 | public Boolean call(Tuple2 input) { 43 | return (input._2().length() < 20); 44 | } 45 | }; 46 | JavaPairRDD rdd = input.mapToPair(keyData); 47 | JavaPairRDD result = rdd.filter(longWordFilter); 48 | Map resultMap = result.collectAsMap(); 49 | for (Entry entry : resultMap.entrySet()) { 50 | System.out.println(entry.getKey() + ":" + entry.getValue()); 51 | } 52 | } 53 | } 54 | -------------------------------------------------------------------------------- 
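On Java 8, the anonymous classes in KeyValueMapFilter above collapse into lambdas, since the Spark Java function types used by these examples are single-method interfaces. A minimal sketch of the same keying-then-filtering logic (the class name is ours; the same two command-line arguments, master and input file, are assumed):

import java.util.Map;

import scala.Tuple2;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class KeyValueMapFilterLambdaSketch {
  public static void main(String[] args) {
    JavaSparkContext sc = new JavaSparkContext(args[0], "KeyValueMapFilterLambda");
    JavaRDD<String> input = sc.textFile(args[1]);
    // Key each line by its first word, then keep only the short lines.
    JavaPairRDD<String, String> result = input
        .mapToPair(line -> new Tuple2<String, String>(line.split(" ")[0], line))
        .filter(pair -> pair._2().length() < 20);
    for (Map.Entry<String, String> entry : result.collectAsMap().entrySet()) {
      System.out.println(entry.getKey() + ":" + entry.getValue());
    }
    sc.stop();
  }
}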
/src/main/java/com/oreilly/learningsparkexamples/java/LoadHive.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates loading data from Hive with Spark SQL 3 | */ 4 | package com.oreilly.learningsparkexamples.java; 5 | 6 | import java.io.StringReader; 7 | import java.util.Arrays; 8 | import java.util.ArrayList; 9 | import java.util.List; 10 | import scala.Tuple2; 11 | 12 | import au.com.bytecode.opencsv.CSVReader; 13 | 14 | import org.apache.commons.lang.StringUtils; 15 | import org.apache.spark.api.java.JavaRDD; 16 | import org.apache.spark.api.java.JavaPairRDD; 17 | import org.apache.spark.api.java.JavaSparkContext; 18 | import org.apache.spark.api.java.function.FlatMapFunction; 19 | import org.apache.spark.api.java.function.Function; 20 | import org.apache.spark.sql.SQLContext; 21 | import org.apache.spark.sql.Row; 22 | import org.apache.spark.sql.DataFrame; 23 | 24 | public class LoadHive { 25 | 26 | public static class SquareKey implements Function { 27 | public Integer call(Row row) throws Exception { 28 | return row.getInt(0) * row.getInt(0); 29 | } 30 | } 31 | 32 | public static void main(String[] args) throws Exception { 33 | if (args.length != 3) { 34 | throw new Exception("Usage LoadHive sparkMaster tbl"); 35 | } 36 | String master = args[0]; 37 | String tbl = args[1]; 38 | 39 | JavaSparkContext sc = new JavaSparkContext( 40 | master, "loadhive", System.getenv("SPARK_HOME"), System.getenv("JARS")); 41 | SQLContext sqlCtx = new SQLContext(sc); 42 | DataFrame rdd = sqlCtx.sql("SELECT key, value FROM src"); 43 | JavaRDD squaredKeys = rdd.toJavaRDD().map(new SquareKey()); 44 | List result = squaredKeys.collect(); 45 | for (Integer elem : result) { 46 | System.out.println(elem); 47 | } 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/LoadJsonWithSparkSQL.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates loading data from Hive with Spark SQL 3 | */ 4 | package com.oreilly.learningsparkexamples.java; 5 | 6 | import java.io.StringReader; 7 | import java.util.Arrays; 8 | import java.util.ArrayList; 9 | import java.util.List; 10 | import scala.Tuple2; 11 | 12 | import au.com.bytecode.opencsv.CSVReader; 13 | 14 | import org.apache.commons.lang.StringUtils; 15 | import org.apache.spark.api.java.JavaRDD; 16 | import org.apache.spark.api.java.JavaPairRDD; 17 | import org.apache.spark.api.java.JavaSparkContext; 18 | import org.apache.spark.api.java.function.FlatMapFunction; 19 | import org.apache.spark.api.java.function.Function; 20 | import org.apache.spark.sql.SQLContext; 21 | import org.apache.spark.sql.Row; 22 | import org.apache.spark.sql.DataFrame; 23 | 24 | public class LoadJsonWithSparkSQL { 25 | 26 | 27 | public static void main(String[] args) throws Exception { 28 | if (args.length != 2) { 29 | throw new Exception("Usage LoadJsonWithSparkSQL sparkMaster jsonFile"); 30 | } 31 | String master = args[0]; 32 | String jsonFile = args[1]; 33 | 34 | JavaSparkContext sc = new JavaSparkContext( 35 | master, "loadJsonwithsparksql"); 36 | SQLContext sqlCtx = new SQLContext(sc); 37 | DataFrame input = sqlCtx.jsonFile(jsonFile); 38 | input.printSchema(); 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/MLlib.java: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.oreilly.learningsparkexamples.java; 19 | 20 | import java.util.Arrays; 21 | 22 | import org.apache.spark.SparkConf; 23 | import org.apache.spark.api.java.JavaRDD; 24 | import org.apache.spark.api.java.JavaSparkContext; 25 | import org.apache.spark.api.java.function.Function; 26 | 27 | import org.apache.spark.mllib.classification.LogisticRegressionModel; 28 | import org.apache.spark.mllib.classification.LogisticRegressionWithSGD; 29 | import org.apache.spark.mllib.feature.HashingTF; 30 | import org.apache.spark.mllib.linalg.Vector; 31 | import org.apache.spark.mllib.regression.LabeledPoint; 32 | 33 | public final class MLlib { 34 | 35 | public static void main(String[] args) { 36 | SparkConf sparkConf = new SparkConf().setAppName("JavaBookExample"); 37 | JavaSparkContext sc = new JavaSparkContext(sparkConf); 38 | 39 | // Load 2 types of emails from text files: spam and ham (non-spam). 40 | // Each line has text from one email. 41 | JavaRDD spam = sc.textFile("files/spam.txt"); 42 | JavaRDD ham = sc.textFile("files/ham.txt"); 43 | 44 | // Create a HashingTF instance to map email text to vectors of 100 features. 45 | final HashingTF tf = new HashingTF(100); 46 | 47 | // Each email is split into words, and each word is mapped to one feature. 48 | // Create LabeledPoint datasets for positive (spam) and negative (ham) examples. 49 | JavaRDD positiveExamples = spam.map(new Function() { 50 | @Override public LabeledPoint call(String email) { 51 | return new LabeledPoint(1, tf.transform(Arrays.asList(email.split(" ")))); 52 | } 53 | }); 54 | JavaRDD negativeExamples = ham.map(new Function() { 55 | @Override public LabeledPoint call(String email) { 56 | return new LabeledPoint(0, tf.transform(Arrays.asList(email.split(" ")))); 57 | } 58 | }); 59 | JavaRDD trainingData = positiveExamples.union(negativeExamples); 60 | trainingData.cache(); // Cache data since Logistic Regression is an iterative algorithm. 61 | 62 | // Create a Logistic Regression learner which uses the LBFGS optimizer. 63 | LogisticRegressionWithSGD lrLearner = new LogisticRegressionWithSGD(); 64 | // Run the actual learning algorithm on the training data. 65 | LogisticRegressionModel model = lrLearner.run(trainingData.rdd()); 66 | 67 | // Test on a positive example (spam) and a negative one (ham). 68 | // First apply the same HashingTF feature transformation used on the training data. 
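// HashingTF hashes each word into one of the 100 buckets configured above, so the test emails
// land in exactly the same feature space as the training data without building a shared vocabulary.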
69 | Vector posTestExample = 70 | tf.transform(Arrays.asList("O M G GET cheap stuff by sending money to ...".split(" "))); 71 | Vector negTestExample = 72 | tf.transform(Arrays.asList("Hi Dad, I started studying Spark the other ...".split(" "))); 73 | // Now use the learned model to predict spam/ham for new emails. 74 | System.out.println("Prediction for positive test example: " + model.predict(posTestExample)); 75 | System.out.println("Prediction for negative test example: " + model.predict(negTestExample)); 76 | 77 | sc.stop(); 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/PerKeyAvg.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple map in Java 3 | */ 4 | package com.oreilly.learningsparkexamples.java; 5 | 6 | import java.util.ArrayList; 7 | import java.util.Arrays; 8 | import java.util.List; 9 | import java.util.Map; 10 | import java.util.Map.Entry; 11 | 12 | import scala.Tuple2; 13 | 14 | import org.apache.commons.lang.StringUtils; 15 | 16 | import org.apache.spark.api.java.JavaRDD; 17 | import org.apache.spark.api.java.JavaPairRDD; 18 | import org.apache.spark.api.java.JavaSparkContext; 19 | import org.apache.spark.api.java.function.Function; 20 | import org.apache.spark.api.java.function.Function2; 21 | 22 | public final class PerKeyAvg { 23 | public static class AvgCount implements java.io.Serializable { 24 | public AvgCount(int total, int num) { 25 | total_ = total; 26 | num_ = num; 27 | } 28 | public int total_; 29 | public int num_; 30 | public float avg() { 31 | return total_ / (float) num_; 32 | } 33 | } 34 | public static void main(String[] args) throws Exception { 35 | String master; 36 | if (args.length > 0) { 37 | master = args[0]; 38 | } else { 39 | master = "local"; 40 | } 41 | 42 | JavaSparkContext sc = new JavaSparkContext( 43 | master, "PerKeyAvg", System.getenv("SPARK_HOME"), System.getenv("JARS")); 44 | List> input = new ArrayList(); 45 | input.add(new Tuple2("coffee", 1)); 46 | input.add(new Tuple2("coffee", 2)); 47 | input.add(new Tuple2("pandas", 3)); 48 | JavaPairRDD rdd = sc.parallelizePairs(input); 49 | Function createAcc = new Function() { 50 | @Override 51 | public AvgCount call(Integer x) { 52 | return new AvgCount(x, 1); 53 | } 54 | }; 55 | Function2 addAndCount = new Function2() { 56 | @Override 57 | public AvgCount call(AvgCount a, Integer x) { 58 | a.total_ += x; 59 | a.num_ += 1; 60 | return a; 61 | } 62 | }; 63 | Function2 combine = new Function2() { 64 | @Override 65 | public AvgCount call(AvgCount a, AvgCount b) { 66 | a.total_ += b.total_; 67 | a.num_ += b.num_; 68 | return a; 69 | } 70 | }; 71 | AvgCount initial = new AvgCount(0,0); 72 | JavaPairRDD avgCounts = rdd.combineByKey(createAcc, addAndCount, combine); 73 | Map countMap = avgCounts.collectAsMap(); 74 | for (Entry entry : countMap.entrySet()) { 75 | System.out.println(entry.getKey() + ":" + entry.getValue().avg()); 76 | } 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/RemoveOutliers.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates remove outliers in Java using summary Stats 3 | */ 4 | package com.oreilly.learningsparkexamples.java; 5 | 6 | import java.util.Arrays; 7 | import java.util.List; 8 | 9 | import org.apache.commons.lang.StringUtils; 10 | 
11 | import org.apache.spark.api.java.JavaRDD; 12 | import org.apache.spark.api.java.JavaDoubleRDD; 13 | import org.apache.spark.api.java.JavaSparkContext; 14 | import org.apache.spark.api.java.function.Function; 15 | import org.apache.spark.util.StatCounter; 16 | 17 | public class RemoveOutliers { 18 | public static void main(String[] args) { 19 | String master; 20 | if (args.length > 0) { 21 | master = args[0]; 22 | } else { 23 | master = "local"; 24 | } 25 | JavaSparkContext sc = new JavaSparkContext( 26 | master, "basicmap", System.getenv("SPARK_HOME"), System.getenv("JARS")); 27 | JavaDoubleRDD input = sc.parallelizeDoubles(Arrays.asList(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 1000.0)); 28 | JavaDoubleRDD result = removeOutliers(input); 29 | System.out.println(StringUtils.join(result.collect(), ",")); 30 | } 31 | static JavaDoubleRDD removeOutliers(JavaDoubleRDD rdd) { 32 | final StatCounter summaryStats = rdd.stats(); 33 | final Double stddev = Math.sqrt(summaryStats.variance()); 34 | return rdd.filter(new Function() { public Boolean call(Double x) { 35 | return (Math.abs(x - summaryStats.mean()) < 3 * stddev); 36 | }}); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/SparkSQLTwitter.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Load some tweets stored as JSON data and explore them. 3 | */ 4 | package com.oreilly.learningsparkexamples.java; 5 | 6 | import java.util.Arrays; 7 | import java.util.ArrayList; 8 | import java.util.List; 9 | 10 | import org.apache.commons.lang.StringUtils; 11 | 12 | import org.apache.spark.SparkConf; 13 | import org.apache.spark.api.java.JavaRDD; 14 | import org.apache.spark.api.java.JavaSparkContext; 15 | import org.apache.spark.api.java.function.Function; 16 | import org.apache.spark.sql.SQLContext; 17 | import org.apache.spark.sql.DataFrame; 18 | import org.apache.spark.sql.Row; 19 | import org.apache.spark.sql.api.java.UDF1; 20 | import org.apache.spark.sql.types.DataTypes; 21 | 22 | public class SparkSQLTwitter { 23 | public static void main(String[] args) { 24 | String inputFile = args[0]; 25 | SparkConf conf = new SparkConf(); 26 | JavaSparkContext sc = new JavaSparkContext(conf); 27 | SQLContext sqlCtx = new SQLContext(sc); 28 | DataFrame input = sqlCtx.jsonFile(inputFile); 29 | // Print the schema 30 | input.printSchema(); 31 | // Register the input schema RDD 32 | input.registerTempTable("tweets"); 33 | // Select tweets based on the retweetCount 34 | DataFrame topTweets = sqlCtx.sql("SELECT text, retweetCount FROM tweets ORDER BY retweetCount LIMIT 10"); 35 | Row[] result = topTweets.collect(); 36 | for (Row row : result) { 37 | System.out.println(row.get(0)); 38 | } 39 | JavaRDD topTweetText = topTweets.toJavaRDD().map(new Function() { 40 | public String call(Row row) { 41 | return row.getString(0); 42 | }}); 43 | System.out.println(topTweetText.collect()); 44 | // Create a person and turn it into a Schema RDD 45 | ArrayList peopleList = new ArrayList(); 46 | peopleList.add(new HappyPerson("holden", "coffee")); 47 | JavaRDD happyPeopleRDD = sc.parallelize(peopleList); 48 | DataFrame happyPeopleSchemaRDD = sqlCtx.applySchema(happyPeopleRDD, HappyPerson.class); 49 | happyPeopleSchemaRDD.registerTempTable("happy_people"); 50 | sqlCtx.udf().register("stringLengthJava", new UDF1() { 51 | @Override 52 | public Integer call(String str) throws Exception { 53 | return str.length(); 54 | } 55 
| }, DataTypes.IntegerType); 56 | DataFrame tweetLength = sqlCtx.sql("SELECT stringLengthJava(text) FROM tweets LIMIT 10"); 57 | Row[] lengths = tweetLength.collect(); 58 | for (Row row : lengths) { 59 | System.out.println(row.get(0)); 60 | } 61 | sc.stop(); 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/StreamingLogInput.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple map then filter in Java 3 | */ 4 | package com.oreilly.learningsparkexamples.java; 5 | 6 | import java.util.Arrays; 7 | import java.util.List; 8 | 9 | import org.apache.commons.lang.StringUtils; 10 | 11 | import org.apache.spark.api.java.JavaRDD; 12 | import org.apache.spark.api.java.JavaSparkContext; 13 | import org.apache.spark.api.java.function.Function; 14 | import org.apache.spark.streaming.api.java.JavaDStream; 15 | import org.apache.spark.streaming.api.java.JavaStreamingContext; 16 | import org.apache.spark.streaming.Duration; 17 | 18 | public class StreamingLogInput { 19 | public static void main(String[] args) throws Exception { 20 | String master = args[0]; 21 | JavaSparkContext sc = new JavaSparkContext(master, "StreamingLogInput"); 22 | // Create a StreamingContext with a 1 second batch size 23 | JavaStreamingContext jssc = new JavaStreamingContext(sc, new Duration(1000)); 24 | // Create a DStream from all the input on port 7777 25 | JavaDStream<String> lines = jssc.socketTextStream("localhost", 7777); 26 | // Filter our DStream for lines with "error" 27 | JavaDStream<String> errorLines = lines.filter(new Function<String, Boolean>() { 28 | public Boolean call(String line) { 29 | return line.contains("error"); 30 | }}); 31 | // Print out the lines with errors, which causes this DStream to be evaluated 32 | errorLines.print(); 33 | // start our streaming context and wait for it to "finish" 34 | jssc.start(); 35 | // Wait for 10 seconds then exit. 
To run forever call without a timeout 36 | jssc.awaitTermination(10000); 37 | // Stop the streaming context 38 | jssc.stop(); 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/WordCount.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a wordcount in Java 3 | */ 4 | package com.oreilly.learningsparkexamples.java; 5 | 6 | import java.util.Arrays; 7 | import java.util.List; 8 | import java.lang.Iterable; 9 | 10 | import scala.Tuple2; 11 | 12 | import org.apache.commons.lang.StringUtils; 13 | 14 | import org.apache.spark.api.java.JavaRDD; 15 | import org.apache.spark.api.java.JavaPairRDD; 16 | import org.apache.spark.api.java.JavaSparkContext; 17 | import org.apache.spark.api.java.function.FlatMapFunction; 18 | import org.apache.spark.api.java.function.Function2; 19 | import org.apache.spark.api.java.function.PairFunction; 20 | 21 | 22 | public class WordCount { 23 | public static void main(String[] args) throws Exception { 24 | String master = args[0]; 25 | JavaSparkContext sc = new JavaSparkContext( 26 | master, "wordcount", System.getenv("SPARK_HOME"), System.getenv("JARS")); 27 | JavaRDD rdd = sc.textFile(args[1]); 28 | JavaPairRDD counts = rdd.flatMap( 29 | new FlatMapFunction() { 30 | public Iterable call(String x) { 31 | return Arrays.asList(x.split(" ")); 32 | }}).mapToPair(new PairFunction(){ 33 | public Tuple2 call(String x){ 34 | return new Tuple2(x, 1); 35 | }}).reduceByKey(new Function2(){ 36 | public Integer call(Integer x, Integer y){ return x+y;}}); 37 | counts.saveAsTextFile(args[2]); 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/logs/ApacheAccessLog.java: -------------------------------------------------------------------------------- 1 | package com.oreilly.learningsparkexamples.java.logs; 2 | 3 | import java.io.Serializable; 4 | import java.util.logging.Level; 5 | import java.util.logging.Logger; 6 | import java.util.regex.Matcher; 7 | import java.util.regex.Pattern; 8 | 9 | /** 10 | * This class represents an Apache access log line. 11 | * See http://httpd.apache.org/docs/2.2/logs.html for more details. 
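 * Parsing is done with a single regular expression; see LOG_ENTRY_PATTERN and parseFromLogLine(String) below.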
12 | */ 13 | public class ApacheAccessLog implements Serializable { 14 | private static final Logger logger = Logger.getLogger("Access"); 15 | 16 | private String ipAddress; 17 | private String clientIdentd; 18 | private String userID; 19 | private String dateTimeString; 20 | private String method; 21 | private String endpoint; 22 | private String protocol; 23 | private int responseCode; 24 | private long contentSize; 25 | 26 | private ApacheAccessLog(String ipAddress, String clientIdentd, String userID, 27 | String dateTime, String method, String endpoint, 28 | String protocol, String responseCode, 29 | String contentSize) { 30 | this.ipAddress = ipAddress; 31 | this.clientIdentd = clientIdentd; 32 | this.userID = userID; 33 | this.dateTimeString = dateTime; // TODO: Parse from dateTime String; 34 | this.method = method; 35 | this.endpoint = endpoint; 36 | this.protocol = protocol; 37 | this.responseCode = Integer.parseInt(responseCode); 38 | this.contentSize = Long.parseLong(contentSize); 39 | } 40 | 41 | public String getIpAddress() { 42 | return ipAddress; 43 | } 44 | 45 | public String getClientIdentd() { 46 | return clientIdentd; 47 | } 48 | 49 | public String getUserID() { 50 | return userID; 51 | } 52 | 53 | public String getDateTimeString() { 54 | return dateTimeString; 55 | } 56 | 57 | public String getMethod() { 58 | return method; 59 | } 60 | 61 | public String getEndpoint() { 62 | return endpoint; 63 | } 64 | 65 | public String getProtocol() { 66 | return protocol; 67 | } 68 | 69 | public int getResponseCode() { 70 | return responseCode; 71 | } 72 | 73 | public long getContentSize() { 74 | return contentSize; 75 | } 76 | 77 | public void setIpAddress(String ipAddress) { 78 | this.ipAddress = ipAddress; 79 | } 80 | 81 | public void setClientIdentd(String clientIdentd) { 82 | this.clientIdentd = clientIdentd; 83 | } 84 | 85 | public void setUserID(String userID) { 86 | this.userID = userID; 87 | } 88 | 89 | public void setDateTimeString(String dateTimeString) { 90 | this.dateTimeString = dateTimeString; 91 | } 92 | 93 | public void setMethod(String method) { 94 | this.method = method; 95 | } 96 | 97 | public void setEndpoint(String endpoint) { 98 | this.endpoint = endpoint; 99 | } 100 | 101 | public void setProtocol(String protocol) { 102 | this.protocol = protocol; 103 | } 104 | 105 | public void setResponseCode(int responseCode) { 106 | this.responseCode = responseCode; 107 | } 108 | 109 | public void setContentSize(long contentSize) { 110 | this.contentSize = contentSize; 111 | } 112 | 113 | // Example Apache log line: 114 | // 127.0.0.1 - - [21/Jul/2014:9:55:27 -0800] "GET /home.html HTTP/1.1" 200 2048 115 | private static final String LOG_ENTRY_PATTERN = 116 | // 1:IP 2:client 3:user 4:date time 5:method 6:req 7:proto 8:respcode 9:size 117 | "^(\\S+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] \"(\\S+) (\\S+) (\\S+)\" (\\d{3}) (\\d+)"; 118 | private static final Pattern PATTERN = Pattern.compile(LOG_ENTRY_PATTERN); 119 | 120 | public static ApacheAccessLog parseFromLogLine(String logline) { 121 | Matcher m = PATTERN.matcher(logline); 122 | if (!m.find()) { 123 | logger.log(Level.ALL, "Cannot parse logline" + logline); 124 | throw new RuntimeException("Error parsing logline"); 125 | } 126 | 127 | return new ApacheAccessLog(m.group(1), m.group(2), m.group(3), m.group(4), 128 | m.group(5), m.group(6), m.group(7), m.group(8), m.group(9)); 129 | } 130 | } 131 | -------------------------------------------------------------------------------- 
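A minimal sketch of exercising the parser on the example line quoted in the ApacheAccessLog source above (the class name is ours, and it is assumed to sit in the same logs package):

public class ApacheAccessLogParseSketch {
  public static void main(String[] args) {
    String line =
        "127.0.0.1 - - [21/Jul/2014:9:55:27 -0800] \"GET /home.html HTTP/1.1\" 200 2048";
    ApacheAccessLog log = ApacheAccessLog.parseFromLogLine(line);
    System.out.println(log.getEndpoint() + " -> " + log.getResponseCode()
        + " (" + log.getContentSize() + " bytes)");
  }
}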
/src/main/java/com/oreilly/learningsparkexamples/java/logs/Flags.java: -------------------------------------------------------------------------------- 1 | package com.oreilly.learningsparkexamples.java.logs; 2 | 3 | import org.apache.commons.cli.*; 4 | import org.apache.spark.streaming.Duration; 5 | 6 | public class Flags { 7 | private static Flags THE_INSTANCE = new Flags(); 8 | 9 | private Duration windowLength; 10 | private Duration slideInterval; 11 | private String logsDirectory; 12 | private String outputHtmlFile; 13 | private String checkpointDirectory; 14 | private String indexHtmlTemplate; 15 | private String outputDirectory; 16 | 17 | private boolean initialized = false; 18 | 19 | private Flags() {} 20 | 21 | public Duration getWindowLength() { 22 | return windowLength; 23 | } 24 | 25 | public Duration getSlideInterval() { 26 | return slideInterval; 27 | } 28 | 29 | public String getLogsDirectory() { 30 | return logsDirectory; 31 | } 32 | 33 | public String getOutputHtmlFile() { 34 | return outputHtmlFile; 35 | } 36 | 37 | public String getCheckpointDirectory() { 38 | return checkpointDirectory; 39 | } 40 | 41 | public String getOutputDirectory() { 42 | return outputDirectory; 43 | } 44 | 45 | public String getIndexHtmlTemplate() { 46 | return indexHtmlTemplate; 47 | } 48 | 49 | public static Flags getInstance() { 50 | if (!THE_INSTANCE.initialized) { 51 | throw new RuntimeException("Flags have not been initalized"); 52 | } 53 | return THE_INSTANCE; 54 | } 55 | 56 | public static void setFromCommandLineArgs(Options options, String[] args) { 57 | CommandLineParser parser = new PosixParser(); 58 | try { 59 | CommandLine cl = parser.parse(options, args); 60 | THE_INSTANCE.windowLength = new Duration(Integer.parseInt( 61 | cl.getOptionValue(LogAnalyzerAppMain.WINDOW_LENGTH, "30")) * 1000); 62 | THE_INSTANCE.slideInterval = new Duration(Integer.parseInt( 63 | cl.getOptionValue(LogAnalyzerAppMain.SLIDE_INTERVAL, "5")) * 1000); 64 | THE_INSTANCE.logsDirectory = cl.getOptionValue( 65 | LogAnalyzerAppMain.LOGS_DIRECTORY, "/tmp/logs"); 66 | THE_INSTANCE.outputHtmlFile = cl.getOptionValue( 67 | LogAnalyzerAppMain.OUTPUT_HTML_FILE, "/tmp/log_stats.html"); 68 | THE_INSTANCE.checkpointDirectory = cl.getOptionValue( 69 | LogAnalyzerAppMain.CHECKPOINT_DIRECTORY, "/tmp/log-analyzer-streaming"); 70 | THE_INSTANCE.indexHtmlTemplate = cl.getOptionValue( 71 | LogAnalyzerAppMain.INDEX_HTML_TEMPLATE, 72 | "./src/main/resources/index.html.template"); 73 | THE_INSTANCE.outputDirectory = cl.getOptionValue( 74 | LogAnalyzerAppMain.OUTPUT_DIRECTORY, "/tmp/pandaout"); 75 | THE_INSTANCE.initialized = true; 76 | } catch (ParseException e) { 77 | THE_INSTANCE.initialized = false; 78 | System.err.println("Parsing failed. 
Reason: " + e.getMessage()); 79 | } 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/logs/Functions.java: -------------------------------------------------------------------------------- 1 | package com.oreilly.learningsparkexamples.java.logs; 2 | 3 | import com.google.common.base.Optional; 4 | import com.google.common.collect.Ordering; 5 | import org.apache.spark.api.java.JavaDoubleRDD; 6 | import org.apache.spark.api.java.JavaPairRDD; 7 | import org.apache.spark.api.java.JavaRDD; 8 | import org.apache.spark.api.java.function.Function; 9 | import org.apache.spark.api.java.function.DoubleFunction; 10 | import org.apache.spark.api.java.function.Function2; 11 | import org.apache.spark.api.java.function.PairFunction; 12 | import scala.Tuple2; 13 | import scala.Tuple4; 14 | 15 | import javax.annotation.Nullable; 16 | import java.io.Serializable; 17 | import java.util.Comparator; 18 | import java.util.List; 19 | 20 | public class Functions { 21 | public static final class LongSumReducer implements Function2 { 22 | @Override 23 | public Long call(Long a, Long b) { 24 | return a + b; 25 | } 26 | }; 27 | 28 | public static final class SumReducer implements Function2 { 29 | @Override 30 | public Double call(Double a, Double b) { 31 | return a + b; 32 | } 33 | }; 34 | 35 | 36 | public static final class ValueComparator 37 | implements Comparator>, Serializable { 38 | private Comparator comparator; 39 | 40 | public ValueComparator(Comparator comparator) { 41 | this.comparator = comparator; 42 | } 43 | 44 | @Override 45 | public int compare(Tuple2 o1, Tuple2 o2) { 46 | return comparator.compare(o1._2(), o2._2()); 47 | } 48 | } 49 | 50 | public static final class ComputeRunningSum implements Function2, Optional, Optional> { 51 | @Override 52 | public Optional call(List nums, Optional current) { 53 | long sum = current.or(0L); 54 | for (long i : nums) { 55 | sum += i; 56 | } 57 | return Optional.of(sum); 58 | } 59 | }; 60 | 61 | public static final class GetContentSize implements DoubleFunction { 62 | @Override 63 | public double call(ApacheAccessLog log) { 64 | return new Long(log.getContentSize()).doubleValue(); 65 | } 66 | } 67 | 68 | public static final @Nullable Tuple4 contentSizeStats( 69 | JavaRDD accessLogRDD) { 70 | JavaDoubleRDD contentSizes = 71 | accessLogRDD.mapToDouble(new GetContentSize()).cache(); 72 | long count = contentSizes.count(); 73 | if (count == 0) { 74 | return null; 75 | } 76 | Object ordering = Ordering.natural(); 77 | final Comparator cmp = (Comparator)ordering; 78 | 79 | return new Tuple4<>(count, 80 | contentSizes.reduce(new SumReducer()).longValue(), 81 | contentSizes.min(cmp).longValue(), 82 | contentSizes.max(cmp).longValue()); 83 | } 84 | 85 | public static final class ResponseCodeTuple implements PairFunction { 86 | @Override 87 | public Tuple2 call(ApacheAccessLog log) { 88 | return new Tuple2<>(log.getResponseCode(), 1L); 89 | } 90 | } 91 | 92 | public static final JavaPairRDD responseCodeCount( 93 | JavaRDD accessLogRDD) { 94 | return accessLogRDD 95 | .mapToPair(new ResponseCodeTuple()) 96 | .reduceByKey(new LongSumReducer()); 97 | } 98 | 99 | public static final class IpTuple implements PairFunction { 100 | @Override 101 | public Tuple2 call(ApacheAccessLog log) { 102 | return new Tuple2<>(log.getIpAddress(), 1L); 103 | } 104 | } 105 | 106 | public static final class IpContentTuple implements PairFunction { 107 | @Override 108 | public Tuple2 
call(ApacheAccessLog log) { 109 | return new Tuple2<>(log.getIpAddress(), log.getContentSize()); 110 | } 111 | } 112 | 113 | 114 | public static final class EndPointTuple implements PairFunction { 115 | @Override 116 | public Tuple2 call(ApacheAccessLog log) { 117 | return new Tuple2<>(log.getEndpoint(), 1L); 118 | } 119 | } 120 | 121 | 122 | public static final class IpCountGreaterThan10 implements Function, Boolean> { 123 | @Override 124 | public Boolean call(Tuple2 e) { 125 | return e._2() > 10; 126 | } 127 | } 128 | 129 | public static final class ParseFromLogLine implements Function { 130 | @Override 131 | public ApacheAccessLog call(String line) { 132 | return ApacheAccessLog.parseFromLogLine(line); 133 | } 134 | } 135 | public static final JavaPairRDD ipAddressCount( 136 | JavaRDD accessLogRDD) { 137 | return accessLogRDD 138 | .mapToPair(new IpTuple()) 139 | .reduceByKey(new LongSumReducer()); 140 | } 141 | 142 | public static final JavaRDD filterIPAddress( 143 | JavaPairRDD ipAddressCount) { 144 | return ipAddressCount 145 | .filter(new IpCountGreaterThan10()) 146 | .keys(); 147 | } 148 | 149 | public static final JavaPairRDD endpointCount( 150 | JavaRDD accessLogRDD) { 151 | return accessLogRDD 152 | .mapToPair(new EndPointTuple()) 153 | .reduceByKey(new LongSumReducer()); 154 | } 155 | } 156 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/logs/LogAnalyzerAppMain.java: -------------------------------------------------------------------------------- 1 | package com.oreilly.learningsparkexamples.java.logs; 2 | 3 | import org.apache.commons.cli.Option; 4 | import org.apache.commons.cli.Options; 5 | import org.apache.spark.SparkConf; 6 | import org.apache.spark.api.java.JavaSparkContext; 7 | import org.apache.spark.api.java.JavaRDD; 8 | import org.apache.spark.streaming.api.java.JavaDStream; 9 | import org.apache.spark.streaming.api.java.JavaStreamingContext; 10 | import org.apache.spark.api.java.function.Function; 11 | 12 | import java.io.IOException; 13 | 14 | /** 15 | * The LogAnalyzerAppMain is an sample logs analysis application. For now, 16 | * it is a simple minimal viable product: 17 | * - Read in new log files from a directory and input those new files into streaming. 18 | * - Computes stats for all of time as well as the last time interval based on those logs. 19 | * - Write the calculated stats to an txt file on the local file system 20 | * that gets refreshed every time interval. 21 | * 22 | * Once you get this program up and running, feed apache access log files 23 | * into the local directory of your choosing. 24 | * 25 | * Then open your output text file, perhaps in a web browser, and refresh 26 | * that page to see more stats come in. 27 | * 28 | * Modify the command line flags to the values of your choosing. 29 | * Notice how they come after you specify the jar when using spark-submit. 
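 * Flags that are not supplied on the command line fall back to the defaults defined in the Flags class above (the /tmp paths).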
30 | * 31 | * Example command to run: 32 | * % ${YOUR_SPARK_HOME}/bin/spark-submit 33 | * --class "com.oreilly.learningsparkexamples.java.logs.LogAnalyzerAppMain" 34 | * --master local[4] 35 | * target/uber-log-analyzer-1.0.jar 36 | * --logs_directory /tmp/logs 37 | * --output_html_file /tmp/log_stats.html 38 | * --index_html_template ./src/main/resources/index.html.template 39 | * --output_directory /tmp/pandaout 40 | */ 41 | public class LogAnalyzerAppMain { 42 | public static final String WINDOW_LENGTH = "window_length"; 43 | public static final String SLIDE_INTERVAL = "slide_interval"; 44 | public static final String LOGS_DIRECTORY = "logs_directory"; 45 | public static final String OUTPUT_HTML_FILE = "output_html_file"; 46 | public static final String CHECKPOINT_DIRECTORY = "checkpoint_directory"; 47 | public static final String INDEX_HTML_TEMPLATE = "index_html_template"; 48 | public static final String OUTPUT_DIRECTORY = "output_directory"; 49 | 50 | private static final Options THE_OPTIONS = createOptions(); 51 | private static Options createOptions() { 52 | Options options = new Options(); 53 | 54 | options.addOption( 55 | new Option(WINDOW_LENGTH, false, "The window length in seconds")); 56 | options.addOption( 57 | new Option(SLIDE_INTERVAL, false, "The slide interval in seconds")); 58 | options.addOption( 59 | new Option(LOGS_DIRECTORY, true, "The directory where logs are written")); 60 | options.addOption( 61 | new Option(OUTPUT_HTML_FILE, false, "Where to write output html file")); 62 | options.addOption( 63 | new Option(CHECKPOINT_DIRECTORY, false, "The checkpoint directory.")); 64 | options.addOption(new Option(INDEX_HTML_TEMPLATE, true, 65 | "path to the index.html.template file - accessible from all workers")); 66 | options.addOption(new Option(OUTPUT_DIRECTORY, false, "path to output DSTreams too")); 67 | 68 | return options; 69 | } 70 | 71 | public static void main(String[] args) throws IOException { 72 | Flags.setFromCommandLineArgs(THE_OPTIONS, args); 73 | 74 | // Startup the Spark Conf. 75 | SparkConf conf = new SparkConf() 76 | .setAppName("A Databricks Reference Application: Logs Analysis with Spark"); 77 | JavaStreamingContext jssc = new JavaStreamingContext(conf, 78 | Flags.getInstance().getSlideInterval()); 79 | 80 | // Checkpointing must be enabled to use the updateStateByKey function & windowed operations. 81 | jssc.checkpoint(Flags.getInstance().getCheckpointDirectory()); 82 | 83 | // This methods monitors a directory for new files to read in for streaming. 84 | JavaDStream logData = jssc.textFileStream(Flags.getInstance().getLogsDirectory()); 85 | 86 | JavaDStream accessLogsDStream 87 | = logData.map(new Functions.ParseFromLogLine()).cache(); 88 | 89 | final LogAnalyzerTotal logAnalyzerTotal = new LogAnalyzerTotal(); 90 | final LogAnalyzerWindowed logAnalyzerWindowed = new LogAnalyzerWindowed(); 91 | 92 | // Process the DStream which gathers stats for all of time. 93 | logAnalyzerTotal.processAccessLogs(Flags.getInstance().getOutputDirectory(), accessLogsDStream); 94 | 95 | // Calculate statistics for the last time interval. 96 | logAnalyzerWindowed.processAccessLogs(Flags.getInstance().getOutputDirectory(), accessLogsDStream); 97 | 98 | // Render the output each time there is a new RDD in the accessLogsDStream. 99 | final Renderer renderer = new Renderer(); 100 | accessLogsDStream.foreachRDD(new Function, Void>() { 101 | public Void call(JavaRDD rdd) { 102 | // Call this to output the stats. 
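// Render failures are caught and ignored below, so one bad interval cannot bring down the streaming job.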
103 | try { 104 | renderer.render(logAnalyzerTotal.getLogStatistics(), 105 | logAnalyzerWindowed.getLogStatistics()); 106 | } catch (Exception e) { 107 | } 108 | return null; 109 | } 110 | }); 111 | 112 | // Start the streaming server. 113 | jssc.start(); // Start the computation 114 | jssc.awaitTermination(); // Wait for the computation to terminate 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/logs/LogAnalyzerWindowed.java: -------------------------------------------------------------------------------- 1 | package com.oreilly.learningsparkexamples.java.logs; 2 | 3 | import com.google.common.collect.Ordering; 4 | import org.apache.spark.api.java.JavaPairRDD; 5 | import org.apache.spark.api.java.JavaRDD; 6 | import org.apache.spark.streaming.api.java.JavaDStream; 7 | import org.apache.spark.streaming.api.java.JavaPairDStream; 8 | import org.apache.spark.api.java.function.Function; 9 | import org.apache.spark.api.java.function.Function2; 10 | import org.apache.spark.api.java.function.PairFunction; 11 | import scala.Tuple2; 12 | import scala.Tuple4; 13 | 14 | import java.io.Serializable; 15 | import java.util.Comparator; 16 | import java.util.List; 17 | 18 | public class LogAnalyzerWindowed implements Serializable { 19 | private LogStatistics logStatistics; 20 | 21 | public void processAccessLogs(String outDir, JavaDStream accessLogsDStream) { 22 | JavaDStream windowDStream = accessLogsDStream.window( 23 | Flags.getInstance().getWindowLength(), 24 | Flags.getInstance().getSlideInterval()); 25 | JavaDStream ip = accessLogsDStream.map( 26 | new Function() { 27 | public String call(ApacheAccessLog entry) { 28 | return entry.getIpAddress(); 29 | }}); 30 | // reduceByWindow 31 | JavaDStream requestCountRBW = accessLogsDStream.map(new Function() { 32 | public Long call(ApacheAccessLog entry) { 33 | return 1L; 34 | }}).reduceByWindow(new Function2() { 35 | public Long call(Long v1, Long v2) { 36 | return v1+v2; 37 | }}, new Function2() { 38 | public Long call(Long v1, Long v2) { 39 | return v1-v2; 40 | }}, Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval()); 41 | requestCountRBW.print(); 42 | // reducebykeyandwindow 43 | JavaPairDStream ipAddressPairDStream = accessLogsDStream.mapToPair( 44 | new PairFunction() { 45 | public Tuple2 call(ApacheAccessLog entry) { 46 | return new Tuple2(entry.getIpAddress(), 1L); 47 | }}); 48 | JavaPairDStream ipCountDStream = ipAddressPairDStream.reduceByKeyAndWindow( 49 | // Adding elements in the new slice 50 | new Function2() { 51 | public Long call(Long v1, Long v2) { 52 | return v1+v2; 53 | }}, 54 | // Removing elements from the oldest slice 55 | new Function2() { 56 | public Long call(Long v1, Long v2) { 57 | return v1-v2; 58 | }}, 59 | Flags.getInstance().getWindowLength(), 60 | Flags.getInstance().getSlideInterval()); 61 | ipCountDStream.print(); 62 | // Use countByWindow 63 | JavaDStream requestCount = accessLogsDStream.countByWindow( 64 | Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval()); 65 | JavaPairDStream ipAddressRequestCount = ip.countByValueAndWindow( 66 | Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval()); 67 | requestCount.print(); 68 | ipAddressRequestCount.print(); 69 | 70 | // use a transform for the response code count 71 | JavaPairDStream responseCodeCountTransform = accessLogsDStream.transformToPair( 72 | new Function, JavaPairRDD>() { 73 | public 
JavaPairRDD call(JavaRDD logs) { 74 | return Functions.responseCodeCount(logs); 75 | } 76 | }); 77 | windowDStream.foreachRDD(new Function, Void>() { 78 | public Void call(JavaRDD accessLogs) { 79 | Tuple4 contentSizeStats = 80 | Functions.contentSizeStats(accessLogs); 81 | 82 | List> responseCodeToCount = 83 | Functions.responseCodeCount(accessLogs) 84 | .take(100); 85 | 86 | JavaPairRDD ipAddressCounts = 87 | Functions.ipAddressCount(accessLogs); 88 | List ip = Functions.filterIPAddress(ipAddressCounts) 89 | .take(100); 90 | 91 | Object ordering = Ordering.natural(); 92 | Comparator cmp = (Comparator)ordering; 93 | List> topEndpoints = 94 | Functions.endpointCount(accessLogs) 95 | .top(10, new Functions.ValueComparator(cmp)); 96 | 97 | logStatistics = new LogStatistics(contentSizeStats, responseCodeToCount, 98 | ip, topEndpoints); 99 | return null; 100 | }}); 101 | } 102 | 103 | public LogStatistics getLogStatistics() { 104 | return logStatistics; 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/logs/LogStatistics.java: -------------------------------------------------------------------------------- 1 | package com.oreilly.learningsparkexamples.java.logs; 2 | 3 | import scala.Tuple2; 4 | import scala.Tuple4; 5 | 6 | import java.io.Serializable; 7 | import java.util.ArrayList; 8 | import java.util.HashMap; 9 | import java.util.List; 10 | import java.util.Map; 11 | 12 | public class LogStatistics implements Serializable { 13 | public final static LogStatistics EMPTY_LOG_STATISTICS = 14 | new LogStatistics(new Tuple4<>(0L, 0L, 0L, 0L), new ArrayList>(), 15 | new ArrayList(), new ArrayList>()); 16 | 17 | private Tuple4 contentSizeStats; 18 | private List> responseCodeToCount; 19 | private List ipAddresses; 20 | private List> topEndpoints; 21 | 22 | public LogStatistics(Tuple4 contentSizeStats, 23 | List> responseCodeToCount, 24 | List ipAddresses, 25 | List> topEndpoints) { 26 | this.contentSizeStats = contentSizeStats; 27 | this.responseCodeToCount = responseCodeToCount; 28 | this.ipAddresses = ipAddresses; 29 | this.topEndpoints = topEndpoints; 30 | } 31 | 32 | public Tuple4 getContentSizeStats() { 33 | return contentSizeStats; 34 | } 35 | 36 | public Map getResponseCodeToCount() { 37 | Map responseCodeCount = new HashMap<>(); 38 | for (Tuple2 tuple: responseCodeToCount) { 39 | responseCodeCount.put(tuple._1(), tuple._2()); 40 | } 41 | return responseCodeCount; 42 | } 43 | 44 | public List getIpAddresses() { 45 | return ipAddresses; 46 | } 47 | 48 | public List> getTopEndpoints() { 49 | return topEndpoints; 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/logs/ReadTransferStats.java: -------------------------------------------------------------------------------- 1 | package com.oreilly.learningsparkexamples.java.logs; 2 | 3 | import org.apache.spark.api.java.JavaPairRDD; 4 | import org.apache.spark.api.java.JavaRDD; 5 | import org.apache.spark.api.java.function.Function; 6 | import org.apache.spark.api.java.function.PairFunction; 7 | import org.apache.spark.streaming.api.java.JavaStreamingContext; 8 | import org.apache.spark.streaming.api.java.JavaPairDStream; 9 | import scala.Tuple2; 10 | 11 | import org.apache.hadoop.fs.Path; 12 | import org.apache.hadoop.io.Writable; 13 | import org.apache.hadoop.io.IntWritable; 14 | import org.apache.hadoop.io.Text; 15 | import 
org.apache.hadoop.io.LongWritable; 16 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 17 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; 18 | 19 | 20 | import java.io.Serializable; 21 | 22 | public class ReadTransferStats implements Serializable { 23 | 24 | public JavaPairDStream readStats(JavaStreamingContext jssc, String inputDirectory) { 25 | // Note: This example doesn't work until Spark 1.2 26 | JavaPairDStream input = 27 | jssc.fileStream(inputDirectory, LongWritable.class, Text.class, TextInputFormat.class); 28 | // convert the input from Writables to native types 29 | JavaPairDStream usefulInput = input.mapToPair( 30 | new PairFunction, Long, Integer>() { 31 | public Tuple2 call(Tuple2 input) { 32 | return new Tuple2(input._1().get(), Integer.parseInt(input._2().toString())); 33 | } 34 | }); 35 | return usefulInput; 36 | } 37 | 38 | } 39 | -------------------------------------------------------------------------------- /src/main/java/com/oreilly/learningsparkexamples/java/logs/Renderer.java: -------------------------------------------------------------------------------- 1 | package com.oreilly.learningsparkexamples.java.logs; 2 | 3 | import com.google.common.base.Charsets; 4 | import com.google.common.io.Files; 5 | import scala.Tuple2; 6 | import scala.Tuple4; 7 | 8 | import java.io.*; 9 | import java.util.List; 10 | import java.util.Map; 11 | 12 | public class Renderer implements Serializable { 13 | private String fileTemplate; 14 | 15 | public void render(LogStatistics allOfTime, LogStatistics lastWindow) 16 | throws Exception { 17 | if (fileTemplate == null) { 18 | fileTemplate = Files.toString( 19 | new File(Flags.getInstance().getIndexHtmlTemplate()), 20 | Charsets.UTF_8); 21 | } 22 | 23 | // TODO: Replace this hacky String replace with a proper HTML templating library. 24 | String output = fileTemplate; 25 | output = output.replace("${logLinesTable}", logLinesTable(allOfTime, lastWindow)); 26 | output = output.replace("${contentSizesTable}", contentSizesTable(allOfTime, lastWindow)); 27 | output = output.replace("${responseCodeTable}", responseCodeTable(allOfTime, lastWindow)); 28 | output = output.replace("${topEndpointsTable}", topEndpointsTable(allOfTime, lastWindow)); 29 | output = output.replace("${frequentIpAddressTable}", frequentIpAddressTable(allOfTime, lastWindow)); 30 | 31 | Writer out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream( 32 | Flags.getInstance().getOutputHtmlFile()))); 33 | out.write(output); 34 | out.close(); 35 | } 36 | 37 | public String logLinesTable(LogStatistics allOfTime, LogStatistics lastWindow) { 38 | return "" + 39 | String.format("", 40 | allOfTime.getContentSizeStats()._1()) + 41 | String.format("", 42 | lastWindow.getContentSizeStats()._1()) + 43 | "
All Of Time:%s
Last Time Window:%s
"; 44 | } 45 | 46 | public String contentSizesTable(LogStatistics allOfTime, LogStatistics lastWindow) { 47 | StringBuilder builder = new StringBuilder(); 48 | builder.append(""); 49 | builder.append(""); 50 | Tuple4 totalStats = allOfTime.getContentSizeStats(); 51 | Tuple4 lastStats = lastWindow.getContentSizeStats(); 52 | builder.append(String.format("", 53 | totalStats._1() > 0 ? totalStats._2() / totalStats._1() : "-", 54 | lastStats._1() > 0 ? lastStats._2() / lastStats._1() : "-")); 55 | builder.append(String.format("", 56 | totalStats._1() > 0 ? totalStats._3() : "-", 57 | lastStats._1() > 0 ? lastStats._3() : "-")); 58 | builder.append(String.format("", 59 | totalStats._1() > 0 ? totalStats._4() : "-", 60 | lastStats._1() > 0 ? lastStats._4() : "-")); 61 | builder.append("
All of Time Last Time Window
Avg:%s%s
Min:%s%s
Max:%s%s
"); 62 | return builder.toString(); 63 | } 64 | 65 | public String responseCodeTable( 66 | LogStatistics allOfTime, LogStatistics lastWindow) { 67 | StringBuilder buffer = new StringBuilder(); 68 | buffer.append(""); 69 | buffer.append(""); 70 | Map lastWindowMap = lastWindow.getResponseCodeToCount(); 71 | for(Map.Entry entry: allOfTime.getResponseCodeToCount().entrySet()) { 72 | buffer.append(String.format("", 73 | entry.getKey(), entry.getValue(), lastWindowMap.get(entry.getKey()))); 74 | } 75 | buffer.append("
Response CodeAll of Time Last Time Window
%s%s%s
"); 76 | return buffer.toString(); 77 | } 78 | 79 | public String frequentIpAddressTable( 80 | LogStatistics allOfTime, LogStatistics lastWindow) { 81 | StringBuilder builder = new StringBuilder(); 82 | builder.append(""); 83 | builder.append(""); 84 | List totalIpAddresses = allOfTime.getIpAddresses(); 85 | List windowIpAddresses = lastWindow.getIpAddresses(); 86 | for (int i = 0; i < totalIpAddresses.size(); i++) { 87 | builder.append(String.format("", 88 | totalIpAddresses.get(i), 89 | i < windowIpAddresses.size() ? windowIpAddresses.get(i) : "-")); 90 | } 91 | builder.append("
All of Time Last Time Window
%s%s
"); 92 | return builder.toString(); 93 | } 94 | 95 | public String topEndpointsTable( 96 | LogStatistics allOfTime, LogStatistics lastWindow) { 97 | StringBuilder builder = new StringBuilder(); 98 | builder.append(""); 99 | builder.append(""); 100 | List> totalTopEndpoints = allOfTime.getTopEndpoints(); 101 | List> windowTopEndpoints = lastWindow.getTopEndpoints(); 102 | for (int i = 0; i < totalTopEndpoints.size(); i++) { 103 | builder.append(String.format("", 104 | totalTopEndpoints.get(i)._1(), 105 | i < windowTopEndpoints.size() ? windowTopEndpoints.get(i)._1() : "-")); 106 | } 107 | builder.append("
All of TimeLast Time Window
%s%s
"); 108 | return builder.toString(); 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /src/main/protobuf/address_book.proto: -------------------------------------------------------------------------------- 1 | package com.oreilly.learningsparkexamples.proto; 2 | 3 | // The sample protocol buffer file that Google uses in their examples at 4 | // http://code.google.com/p/protobuf and twitter uses in elephant bird. 5 | // Used in this project for examples. 6 | 7 | option java_outer_classname = "AddressBookProtos"; 8 | 9 | message Person { 10 | required string name = 1; 11 | required int32 id = 2; 12 | optional string email = 3; 13 | 14 | enum PhoneType { 15 | MOBILE = 0; 16 | HOME = 1; 17 | WORK = 2; 18 | } 19 | 20 | message PhoneNumber { 21 | required string number = 1; 22 | optional PhoneType type = 2 [default = HOME]; 23 | } 24 | 25 | repeated PhoneNumber phone = 4; 26 | } 27 | 28 | message AddressBook { 29 | repeated Person person = 1; 30 | optional bytes byteData = 2; 31 | } 32 | 33 | // used testing handling of unknown fields 34 | message PersonWithoutEmail { 35 | required string name = 1; 36 | required int32 id = 2; 37 | repeated Person.PhoneNumber phone = 4; 38 | } 39 | -------------------------------------------------------------------------------- /src/main/protobuf/places.proto: -------------------------------------------------------------------------------- 1 | package com.oreilly.learningsparkexamples.proto; 2 | 3 | message Venue { 4 | required int32 id = 1; 5 | required string name = 2; 6 | required VenueType type = 3; 7 | optional string address = 4; 8 | 9 | enum VenueType { 10 | COFFEESHOP = 0; 11 | WORKPLACE = 1; 12 | CLUB = 2; 13 | OMNOMNOM = 3; 14 | OTHER = 4; 15 | } 16 | } 17 | 18 | message VenueResponse { 19 | repeated Venue results = 1; 20 | } -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/BasicAvg.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple aggregate in scala to compute the average of an RDD 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | import org.apache.spark.rdd.RDD 8 | 9 | object BasicAvg { 10 | def main(args: Array[String]) { 11 | val master = args.length match { 12 | case x: Int if x > 0 => args(0) 13 | case _ => "local" 14 | } 15 | val sc = new SparkContext(master, "BasicAvg", System.getenv("SPARK_HOME")) 16 | val input = sc.parallelize(List(1,2,3,4)) 17 | val result = computeAvg(input) 18 | val avg = result._1 / result._2.toFloat 19 | println(result) 20 | } 21 | def computeAvg(input: RDD[Int]) = { 22 | input.aggregate((0, 0))((x, y) => (x._1 + y, x._2 + 1), 23 | (x,y) => (x._1 + y._1, x._2 + y._2)) 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/BasicAvgFromFile.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates loading a simple text file 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | 8 | object BasicAvgFromFile { 9 | def main(args: Array[String]) { 10 | if (args.length < 2) { 11 | println("Usage: [sparkmaster] [inputfile]") 12 | exit(1) 13 | } 14 | val master = args(0) 15 | val inputFile = args(1) 16 | val sc = new SparkContext(master, "BasicAvg", System.getenv("SPARK_HOME")) 
17 | val input = sc.textFile(inputFile) 18 | val result = input.map(_.toInt).aggregate((0, 0))( 19 | (acc, value) => (acc._1 + value, acc._2 + 1), 20 | (acc1, acc2) => (acc1._1 + acc2._1, acc1._2 + acc2._2)) 21 | val avg = result._1 / result._2.toFloat 22 | println(result) 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/BasicAvgFromFiles.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates loading a directory of files 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | import org.apache.spark.SparkContext._ 8 | 9 | object BasicAvgFromFiles { 10 | def main(args: Array[String]) { 11 | if (args.length < 3) { 12 | println("Usage: [sparkmaster] [inputdirectory] [outputdirectory]") 13 | exit(1) 14 | } 15 | val master = args(0) 16 | val inputFile = args(1) 17 | val outputFile = args(2) 18 | val sc = new SparkContext(master, "BasicAvgFromFiles", System.getenv("SPARK_HOME")) 19 | val input = sc.wholeTextFiles(inputFile) 20 | val result = input.mapValues{y => 21 | val nums = y.split(" ").map(_.toDouble) 22 | nums.sum / nums.size.toDouble 23 | } 24 | result.saveAsTextFile(outputFile) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/BasicAvgMapPartitions.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates mapPartitions in scala 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | 8 | object BasicAvgMapPartitions { 9 | case class AvgCount(var total: Int = 0, var num: Int = 0) { 10 | def merge(other: AvgCount): AvgCount = { 11 | total += other.total 12 | num += other.num 13 | this 14 | } 15 | def merge(input: Iterator[Int]): AvgCount = { 16 | input.foreach{elem => 17 | total += elem 18 | num += 1 19 | } 20 | this 21 | } 22 | def avg(): Float = { 23 | total / num.toFloat; 24 | } 25 | } 26 | 27 | def main(args: Array[String]) { 28 | val master = args.length match { 29 | case x: Int if x > 0 => args(0) 30 | case _ => "local" 31 | } 32 | val sc = new SparkContext(master, "BasicAvgMapPartitions", System.getenv("SPARK_HOME")) 33 | val input = sc.parallelize(List(1, 2, 3, 4)) 34 | val result = input.mapPartitions(partition => 35 | // Here we only want to return a single element for each partition, but mapPartitions requires that we wrap our return in an Iterator 36 | Iterator(AvgCount(0, 0).merge(partition))) 37 | .reduce((x,y) => x.merge(y)) 38 | println(result) 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/BasicAvgWithKryo.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple fold in scala 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | 8 | object BasicAvgWithKryo { 9 | def main(args: Array[String]) { 10 | val master = args.length match { 11 | case x: Int if x > 0 => args(0) 12 | case _ => "local" 13 | } 14 | val conf = new SparkConf().setMaster(master).setAppName("basicAvgWithKryo") 15 | conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") 16 | val sc = new SparkContext(conf) 17 | val input = sc.parallelize(List(1,2,3,4)) 18 | val result = 
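      // (sum, count) accumulator: the first function folds each element into a per-partition
      // pair, the second merges partition results; e.g. List(1, 2, 3, 4) gives (10, 4), i.e. an average of 2.5.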
input.aggregate((0, 0))((x, y) => (x._1 + y, x._2 + 1), 19 | (x,y) => (x._1 + y._1, x._2 + y._2)) 20 | val avg = result._1 / result._2.toFloat 21 | println(result) 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/BasicFilterUnionCombo.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates filtering and union to extract lines with "error" or "warning" 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | import org.apache.spark.SparkContext._ 8 | 9 | object BasicFilterUnionCombo { 10 | def main(args: Array[String]) { 11 | val conf = new SparkConf 12 | conf.setMaster(args(0)) 13 | val sc = new SparkContext(conf) 14 | val inputRDD = sc.textFile(args(1)) 15 | val errorsRDD = inputRDD.filter(_.contains("error")) 16 | val warningsRDD = inputRDD.filter(_.contains("error")) 17 | val badLinesRDD = errorsRDD.union(warningsRDD) 18 | println(badLinesRDD.collect().mkString("\n")) 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/BasicIntersectByKey.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates intersection by key 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | import org.apache.spark.rdd.PairRDDFunctions 8 | import org.apache.spark.rdd.RDD 9 | import org.apache.spark.SparkContext._ 10 | 11 | import scala.reflect.ClassTag 12 | 13 | object BasicIntersectByKey { 14 | 15 | def intersectByKey[K: ClassTag, V: ClassTag](rdd1: RDD[(K, V)], rdd2: RDD[(K, V)]): RDD[(K, V)] = { 16 | rdd1.cogroup(rdd2).flatMapValues{ 17 | case (Nil, _) => None 18 | case (_, Nil) => None 19 | case (x, y) => x++y 20 | } 21 | } 22 | 23 | def main(args: Array[String]) { 24 | val master = args.length match { 25 | case x: Int if x > 0 => args(0) 26 | case _ => "local" 27 | } 28 | val sc = new SparkContext(master, "BasicIntersectByKey", System.getenv("SPARK_HOME")) 29 | val rdd1 = sc.parallelize(List((1, "panda"), (2, "happy"))) 30 | val rdd2 = sc.parallelize(List((2, "pandas"))) 31 | val iRdd = intersectByKey(rdd1, rdd2) 32 | val panda: List[(Int, String)] = iRdd.collect().toList 33 | panda.map(println(_)) 34 | sc.stop() 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/BasicLoadNums.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates loading a text file of integers and counting the number of invalid elements 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | import org.apache.spark.SparkContext._ 8 | 9 | object BasicLoadNums { 10 | def main(args: Array[String]) { 11 | val master = args(0) 12 | val inputFile = args(1) 13 | val sc = new SparkContext(master, "BasicLoadNums", System.getenv("SPARK_HOME")) 14 | val file = sc.textFile(inputFile) 15 | val errorLines = sc.accumulator(0) // Create an Accumulator[Int] initialized to 0 16 | val dataLines = sc.accumulator(0) // Create a second Accumulator[Int] initialized to 0 17 | val counts = file.flatMap(line => { 18 | try { 19 | val input = line.split(" ") 20 | val data = Some((input(0), input(1).toInt)) 21 | dataLines += 1 22 | data 23 | } catch { 24 | case e: 
java.lang.NumberFormatException => { 25 | errorLines += 1 26 | None 27 | } 28 | case e: java.lang.ArrayIndexOutOfBoundsException => { 29 | errorLines += 1 30 | None 31 | } 32 | } 33 | }).reduceByKey(_ + _) 34 | if (errorLines.value < 0.1 * dataLines.value) { 35 | counts.saveAsTextFile("output.txt") 36 | } else { 37 | println(s"Too many errors ${errorLines.value} for ${dataLines.value}") 38 | } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/BasicLoadSequenceFile.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Loads a simple sequence file of people and how many pandas they have seen. 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | import org.apache.spark.SparkContext._ 8 | import org.apache.hadoop.io.{IntWritable, Text} 9 | 10 | 11 | object BasicLoadSequenceFile { 12 | def main(args: Array[String]) { 13 | val master = args(0) 14 | val inFile = args(1) 15 | val sc = new SparkContext(master, "BasicLoadSequenceFile", System.getenv("SPARK_HOME")) 16 | val data = sc.sequenceFile(inFile, classOf[Text], classOf[IntWritable]).map{case (x, y) => 17 | (x.toString, y.get())} 18 | println(data.collect().toList) 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/BasicLoadTextFromFTP.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates loading a text file from FTP 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | import org.apache.spark.SparkContext._ 8 | 9 | object BasicTextFromFTP { 10 | def main(args: Array[String]) { 11 | val conf = new SparkConf 12 | conf.setMaster(args(0)) 13 | val sc = new SparkContext(conf) 14 | val file = sc.textFile("ftp://anonymous:pandamagic@ftp.ubuntu.com/ubuntu/ls-LR.gz") 15 | println(file.collect().mkString("\n")) 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/BasicMap.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple map in Scala 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | 8 | object BasicMap { 9 | def main(args: Array[String]) { 10 | val master = args.length match { 11 | case x: Int if x > 0 => args(0) 12 | case _ => "local" 13 | } 14 | val sc = new SparkContext(master, "BasicMap", System.getenv("SPARK_HOME")) 15 | val input = sc.parallelize(List(1,2,3,4)) 16 | val result = input.map(x => x*x) 17 | println(result.collect().mkString(",")) 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/BasicMapNoCache.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates lack of caching 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | 8 | object BasicMapNoCache { 9 | def main(args: Array[String]) { 10 | val master = args.length match { 11 | case x: Int if x > 0 => args(0) 12 | case _ => "local" 13 | } 14 | val sc = new SparkContext(master, "BasicMapNoCache", System.getenv("SPARK_HOME")) 15 | val input = 
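      // Nothing is cached in this example, so the squared RDD below is recomputed
      // for each of the two actions (count and collect).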
sc.parallelize(List(1,2,3,4)) 16 | val result = input.map(x => x*x) 17 | // will compute result twice 18 | println(result.count()) 19 | println(result.collect().mkString(",")) 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/BasicMapPartitions.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple map partition in Scala 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | 8 | import org.eclipse.jetty.client.ContentExchange 9 | import org.eclipse.jetty.client.HttpClient 10 | 11 | object BasicMapPartitions { 12 | def main(args: Array[String]) { 13 | val master = args.length match { 14 | case x: Int if x > 0 => args(0) 15 | case _ => "local" 16 | } 17 | val sc = new SparkContext(master, "BasicMapPartitions", System.getenv("SPARK_HOME")) 18 | val input = sc.parallelize(List("KK6JKQ", "Ve3UoW", "kk6jlk", "W6BB")) 19 | val result = input.mapPartitions{ 20 | signs => 21 | val client = new HttpClient() 22 | client.start() 23 | signs.map {sign => 24 | val exchange = new ContentExchange(true); 25 | exchange.setURL(s"http://qrzcq.com/call/${sign}") 26 | client.send(exchange) 27 | exchange 28 | }.map{ exchange => 29 | exchange.waitForDone(); 30 | exchange.getResponseContent() 31 | } 32 | } 33 | println(result.collect().mkString(",")) 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/BasicMapThenFilter.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple map the filter in Scala 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | 8 | object BasicMapThenFilter { 9 | def main(args: Array[String]) { 10 | val master = args.length match { 11 | case x: Int if x > 0 => args(0) 12 | case _ => "local" 13 | } 14 | val sc = new SparkContext(master, "BasicMap", System.getenv("SPARK_HOME")) 15 | val input = sc.parallelize(List(1,2,3,4)) 16 | val squared = input.map(x => x*x) 17 | val result = squared.filter(x => x != 1) 18 | println(result.collect().mkString(",")) 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/BasicParseCsv.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple map partition to parse CSV data in Scala 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import java.io.StringReader 7 | import java.io.StringWriter 8 | 9 | import org.apache.spark._ 10 | import play.api.libs.json._ 11 | import play.api.libs.functional.syntax._ 12 | import scala.util.parsing.json.JSON 13 | import scala.collection.JavaConversions._ 14 | 15 | import au.com.bytecode.opencsv.CSVReader 16 | import au.com.bytecode.opencsv.CSVWriter 17 | 18 | object BasicParseCsv { 19 | case class Person(name: String, favouriteAnimal: String) 20 | 21 | def main(args: Array[String]) { 22 | if (args.length < 3) { 23 | println("Usage: [sparkmaster] [inputfile] [outputfile]") 24 | exit(1) 25 | } 26 | val master = args(0) 27 | val inputFile = args(1) 28 | val outputFile = args(2) 29 | val sc = new SparkContext(master, "BasicParseCsv", System.getenv("SPARK_HOME")) 30 | val input = sc.textFile(inputFile) 31 | val result = input.map{ 
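      // One CSVReader per input line: fine for simple records, but rows containing embedded
      // newlines need the whole-file approach shown in BasicParseWholeFileCsv.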
line => 32 | val reader = new CSVReader(new StringReader(line)); 33 | reader.readNext(); 34 | } 35 | val people = result.map(x => Person(x(0), x(1))) 36 | val pandaLovers = people.filter(person => person.favouriteAnimal == "panda") 37 | pandaLovers.map(person => List(person.name, person.favouriteAnimal).toArray).mapPartitions{people => 38 | val stringWriter = new StringWriter(); 39 | val csvWriter = new CSVWriter(stringWriter); 40 | csvWriter.writeAll(people.toList) 41 | Iterator(stringWriter.toString) 42 | }.saveAsTextFile(outputFile) 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/BasicParseJson.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple map partition to parse JSON data in Scala 3 | * Loads the data into a case class with the name and a boolean flag 4 | * if the person loves pandas. 5 | */ 6 | package com.oreilly.learningsparkexamples.scala 7 | 8 | import org.apache.spark._ 9 | import play.api.libs.json._ 10 | import play.api.libs.functional.syntax._ 11 | 12 | object BasicParseJson { 13 | case class Person(name: String, lovesPandas: Boolean) 14 | implicit val personReads = Json.format[Person] 15 | 16 | def main(args: Array[String]) { 17 | if (args.length < 3) { 18 | println("Usage: [sparkmaster] [inputfile] [outputfile]") 19 | exit(1) 20 | } 21 | val master = args(0) 22 | val inputFile = args(1) 23 | val outputFile = args(2) 24 | val sc = new SparkContext(master, "BasicParseJson", System.getenv("SPARK_HOME")) 25 | val input = sc.textFile(inputFile) 26 | val parsed = input.map(Json.parse(_)) 27 | // We use asOpt combined with flatMap so that if it fails to parse we 28 | // get back a None and the flatMap essentially skips the result. 29 | val result = parsed.flatMap(record => personReads.reads(record).asOpt) 30 | result.filter(_.lovesPandas).map(Json.toJson(_)).saveAsTextFile(outputFile) 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/BasicParseJsonWithJackson.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple map partition to parse JSON data in Scala 3 | * Loads the data into a case class with the name and a boolean flag 4 | * if the person loves pandas. 5 | */ 6 | package com.oreilly.learningsparkexamples.scala 7 | 8 | import org.apache.spark._ 9 | import com.fasterxml.jackson.module.scala.DefaultScalaModule 10 | import com.fasterxml.jackson.module.scala.experimental.ScalaObjectMapper 11 | import com.fasterxml.jackson.databind.ObjectMapper 12 | import com.fasterxml.jackson.databind.DeserializationFeature 13 | 14 | 15 | 16 | case class Person(name: String, lovesPandas: Boolean) // Note: must be a top level class 17 | 18 | object BasicParseJsonWithJackson { 19 | 20 | def main(args: Array[String]) { 21 | if (args.length < 3) { 22 | println("Usage: [sparkmaster] [inputfile] [outputfile]") 23 | exit(1) 24 | } 25 | val master = args(0) 26 | val inputFile = args(1) 27 | val outputFile = args(2) 28 | val sc = new SparkContext(master, "BasicParseJsonWithJackson", System.getenv("SPARK_HOME")) 29 | val input = sc.textFile(inputFile) 30 | 31 | // Parse it into a specific case class. 
We use mapPartitions beacuse: 32 | // (a) ObjectMapper is not serializable so we either create a singleton object encapsulating ObjectMapper 33 | // on the driver and have to send data back to the driver to go through the singleton object. 34 | // Alternatively we can let each node create its own ObjectMapper but that's expensive in a map 35 | // (b) To solve for creating an ObjectMapper on each node without being too expensive we create one per 36 | // partition with mapPartitions. Solves serialization and object creation performance hit. 37 | val result = input.mapPartitions(records => { 38 | // mapper object created on each executor node 39 | val mapper = new ObjectMapper with ScalaObjectMapper 40 | mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false) 41 | mapper.registerModule(DefaultScalaModule) 42 | // We use flatMap to handle errors 43 | // by returning an empty list (None) if we encounter an issue and a 44 | // list with one element if everything is ok (Some(_)). 45 | records.flatMap(record => { 46 | try { 47 | Some(mapper.readValue(record, classOf[Person])) 48 | } catch { 49 | case e: Exception => None 50 | } 51 | }) 52 | }, true) 53 | result.filter(_.lovesPandas).mapPartitions(records => { 54 | val mapper = new ObjectMapper with ScalaObjectMapper 55 | mapper.registerModule(DefaultScalaModule) 56 | records.map(mapper.writeValueAsString(_)) 57 | }) 58 | .saveAsTextFile(outputFile) 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/BasicParseWholeFileCsv.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple map partition to parse CSV data in Scala 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import java.io.StringReader 7 | 8 | import org.apache.spark._ 9 | import play.api.libs.json._ 10 | import play.api.libs.functional.syntax._ 11 | import scala.util.parsing.json.JSON 12 | import scala.collection.JavaConversions._ 13 | import au.com.bytecode.opencsv.CSVReader 14 | 15 | object BasicParseWholeFileCsv { 16 | def main(args: Array[String]) { 17 | if (args.length < 2) { 18 | println("Usage: [sparkmaster] [inputfile]") 19 | exit(1) 20 | } 21 | val master = args(0) 22 | val inputFile = args(1) 23 | val sc = new SparkContext(master, "BasicParseWholeFileCsv", System.getenv("SPARK_HOME")) 24 | val input = sc.wholeTextFiles(inputFile) 25 | val result = input.flatMap{ case (_, txt) => 26 | val reader = new CSVReader(new StringReader(txt)); 27 | reader.readAll() 28 | } 29 | println(result.collect().map(_.toList).mkString(",")) 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/BasicQueryCassandra.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * A simple illustration of querying Cassandra 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | import org.apache.spark.SparkContext._ 8 | // Implicits that add functions to the SparkContext & RDDs. 
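// The import below assumes the DataStax Cassandra connector is on the classpath, e.g. an sbt
// dependency along the lines of "com.datastax.spark" %% "spark-cassandra-connector"
// (artifact name and version must match your Spark and Cassandra releases).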
9 | import com.datastax.spark.connector._ 10 | 11 | 12 | object BasicQueryCassandra { 13 | def main(args: Array[String]) { 14 | val sparkMaster = args(0) 15 | val cassandraHost = args(1) 16 | val conf = new SparkConf(true) 17 | .set("spark.cassandra.connection.host", cassandraHost) 18 | val sc = new SparkContext(sparkMaster, "BasicQueryCassandra", conf) 19 | // entire table as an RDD 20 | // assumes your table test was created as CREATE TABLE test.kv(key text PRIMARY KEY, value int); 21 | val data = sc.cassandraTable("test" , "kv") 22 | // print some basic stats 23 | println("stats "+data.map(row => row.getInt("value")).stats()) 24 | val rdd = sc.parallelize(List(("moremagic", 1))) 25 | rdd.saveToCassandra("test" , "kv", SomeColumns("key", "value")) 26 | // save from a case class 27 | val otherRdd = sc.parallelize(List(KeyValue("magic", 0))) 28 | otherRdd.saveToCassandra("test", "kv") 29 | } 30 | } 31 | 32 | case class KeyValue(key: String, value: Integer) 33 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/BasicSaveProtoBuf.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Saves a sequence file of people and how many pandas they have seen. 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import com.oreilly.learningsparkexamples.proto.Places 7 | 8 | import org.apache.spark._ 9 | import org.apache.spark.SparkContext._ 10 | 11 | import org.apache.hadoop.io.Text 12 | import com.twitter.elephantbird.mapreduce.io.ProtobufWritable 13 | import com.twitter.elephantbird.mapreduce.output.LzoProtobufBlockOutputFormat 14 | import org.apache.hadoop.mapreduce.Job 15 | 16 | object BasicSaveProtoBuf { 17 | def main(args: Array[String]) { 18 | val master = args(0) 19 | val outputFile = args(1) 20 | val sc = new SparkContext(master, "BasicSaveProtoBuf", System.getenv("SPARK_HOME")) 21 | val job = new Job() 22 | val conf = job.getConfiguration 23 | LzoProtobufBlockOutputFormat.setClassConf(classOf[Places.Venue], conf); 24 | val dnaLounge = Places.Venue.newBuilder() 25 | dnaLounge.setId(1); 26 | dnaLounge.setName("DNA Lounge") 27 | dnaLounge.setType(Places.Venue.VenueType.CLUB) 28 | val data = sc.parallelize(List(dnaLounge.build())) 29 | val outputData = data.map{ pb => 30 | val protoWritable = ProtobufWritable.newInstance(classOf[Places.Venue]); 31 | protoWritable.set(pb) 32 | (null, protoWritable) 33 | } 34 | outputData.saveAsNewAPIHadoopFile(outputFile, classOf[Text], classOf[ProtobufWritable[Places.Venue]], 35 | classOf[LzoProtobufBlockOutputFormat[ProtobufWritable[Places.Venue]]], conf) 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/BasicSaveSequenceFile.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Saves a sequence file of people and how many pandas they have seen. 
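 * (The (String, Int) pairs are written as Text/IntWritable by saveAsSequenceFile,
 * matching what BasicLoadSequenceFile reads back.)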
3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | import org.apache.spark.SparkContext._ 8 | 9 | object BasicSaveSequenceFile { 10 | def main(args: Array[String]) { 11 | val master = args(0) 12 | val outputFile = args(1) 13 | val sc = new SparkContext(master, "BasicSaveSequenceFile", System.getenv("SPARK_HOME")) 14 | val data = sc.parallelize(List(("Holden", 3), ("Kay", 6), ("Snail", 2))) 15 | data.saveAsSequenceFile(outputFile) 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/BasicStreamingExample.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * A sample streaming application saves the wordcounts of a specific window in time 3 | */ 4 | 5 | package com.oreilly.learningsparkexamples.scala 6 | 7 | import org.apache.spark.streaming.{Seconds, StreamingContext} 8 | import StreamingContext._ 9 | import org.apache.spark._ 10 | import org.apache.spark.SparkContext._ 11 | 12 | 13 | object BasicStreamingExample { 14 | def main(args: Array[String]) { 15 | if (args.length < 2) { 16 | System.err.println("Usage BasicStreamingExample ") 17 | } 18 | val Array(master, output) = args.take(2) 19 | 20 | val conf = new SparkConf().setMaster(master).setAppName("BasicStreamingExample") 21 | val ssc = new StreamingContext(conf, Seconds(30)) 22 | 23 | val lines = ssc.socketTextStream("localhost" , 7777) 24 | val words = lines.flatMap(_.split(" ")) 25 | val wc = words.map(x => (x, 1)).reduceByKey((x, y) => x + y) 26 | 27 | wc.saveAsTextFiles(output) 28 | wc.print 29 | 30 | println("pandas: sscstart") 31 | ssc.start() 32 | println("pandas: awaittermination") 33 | ssc.awaitTermination() 34 | println("pandas: done!") 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/BasicSum.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple fold in scala 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | 8 | object BasicSum { 9 | def main(args: Array[String]) { 10 | val master = args.length match { 11 | case x: Int if x > 0 => args(0) 12 | case _ => "local" 13 | } 14 | val sc = new SparkContext(master, "BasicMap", System.getenv("SPARK_HOME")) 15 | val input = sc.parallelize(List(1,2,3,4)) 16 | val result = input.fold(0)((x, y) => (x + y)) 17 | println(result) 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/FlumeInput.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a basic Flume stream 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | import org.apache.spark.SparkContext._ 8 | import org.apache.spark.streaming._ 9 | import org.apache.spark.streaming.flume._ 10 | 11 | object FlumeInput { 12 | def main(args: Array[String]) { 13 | val receiverHostname = args(0) 14 | val receiverPort = args(1).toInt 15 | val conf = new SparkConf().setAppName("FlumeInput") 16 | // Create a StreamingContext with a 1 second batch size 17 | val ssc = new StreamingContext(conf, Seconds(1)) 18 | println(s"Creating flume stream on $receiverHostname $receiverPort") 19 | val events = FlumeUtils.createStream(ssc, 
receiverHostname, receiverPort) 20 | // Assuming that our flume events are UTF-8 log lines 21 | val lines = events.map{e => new String(e.event.getBody().array(), "UTF-8")} 22 | println("Starting StreamingContext") 23 | lines.print() 24 | // start our streaming context and wait for it to "finish" 25 | ssc.start() 26 | // Wait for 10 seconds then exit. To run forever call without a timeout 27 | ssc.awaitTermination(10000) 28 | ssc.stop() 29 | println("Done") 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/KafkaInput.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a basic Kafka stream 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | import org.apache.spark.SparkContext._ 8 | import org.apache.spark.streaming._ 9 | import org.apache.spark.streaming.kafka._ 10 | 11 | object KafkaInput { 12 | def main(args: Array[String]) { 13 | val Array(zkQuorum, group, topic, numThreads) = args 14 | val conf = new SparkConf().setAppName("KafkaInput") 15 | // Create a StreamingContext with a 1 second batch size 16 | val ssc = new StreamingContext(conf, Seconds(1)) 17 | // Create a map of topics to number of receiver threads to use 18 | val topics = List((topic, 1)).toMap 19 | val topicLines = KafkaUtils.createStream(ssc, zkQuorum, group, topics) 20 | val lines = StreamingLogInput.processLines(topicLines.map(_._2)) 21 | lines.print() 22 | // start our streaming context and wait for it to "finish" 23 | ssc.start() 24 | // Wait for 10 seconds then exit. To run forever call without a timeout 25 | ssc.awaitTermination(10000) 26 | ssc.stop() 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/LoadHive.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates loading Hive data using Spark SQL 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | import org.apache.spark.sql.hive.HiveContext 8 | 9 | 10 | object LoadHive { 11 | def main(args: Array[String]) { 12 | if (args.length < 2) { 13 | println("Usage: [sparkmaster] [tablename]") 14 | exit(1) 15 | } 16 | val master = args(0) 17 | val tableName = args(1) 18 | val sc = new SparkContext(master, "LoadHive", System.getenv("SPARK_HOME")) 19 | val hiveCtx = new HiveContext(sc) 20 | val input = hiveCtx.sql("FROM src SELECT key, value") 21 | val data = input.map(_.getInt(0)) 22 | println(data.collect().toList) 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/LoadJsonWithElephantBird.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple map partition to parse JSON data in Scala 3 | * Loads the data into a case class with the name and a boolean flag 4 | * if the person loves pandas. 
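 * (More precisely, this example reads LZO-compressed JSON through Elephant Bird's
 * LzoJsonInputFormat and prints the decoded key/value maps; no case class is involved.)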
5 | */ 6 | package com.oreilly.learningsparkexamples.scala 7 | 8 | import scala.collection.JavaConversions._ 9 | import org.apache.spark._ 10 | import com.twitter.elephantbird.mapreduce.input.LzoJsonInputFormat 11 | import org.apache.hadoop.io.{LongWritable, MapWritable, Text, BooleanWritable} 12 | import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat, Job => NewHadoopJob} 13 | import java.util.HashMap 14 | 15 | object LoadJsonWithElephantBird { 16 | def main(args: Array[String]) { 17 | if (args.length < 2) { 18 | println("Usage: [sparkmaster] [inputfile]") 19 | exit(1) 20 | } 21 | val master = args(0) 22 | val inputFile = args(1) 23 | val sc = new SparkContext(master, "LoadJsonWithElephantBird", System.getenv("SPARK_HOME")) 24 | val conf = new NewHadoopJob().getConfiguration 25 | conf.set("io.compression.codecs","com.hadoop.compression.lzo.LzopCodec") 26 | conf.set("io.compression.codec.lzo.class", "com.hadoop.compression.lzo.LzoCodec") 27 | val input = sc.newAPIHadoopFile(inputFile, classOf[LzoJsonInputFormat], classOf[LongWritable], classOf[MapWritable], conf).map{case (x, y) => 28 | (x.get, y.entrySet().map{entry => 29 | (entry.getKey().asInstanceOf[Text].toString(), 30 | entry.getValue() match { 31 | case t: Text => t.toString() 32 | case b: BooleanWritable => b.get() 33 | case _ => throw new Exception("unexpected input") 34 | } 35 | )})} 36 | println(input.collect().toList) 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/LoadJsonWithSparkSQL.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates loading JSON data using Spark SQL 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | import org.apache.spark.sql.SQLContext 8 | 9 | 10 | object LoadJsonWithSparkSQL { 11 | def main(args: Array[String]) { 12 | if (args.length != 2) { 13 | println("Usage: [sparkmaster] [inputFile]") 14 | exit(1) 15 | } 16 | val master = args(0) 17 | val inputFile = args(1) 18 | val sc = new SparkContext(master, "LoadJsonWithSparkSQL", System.getenv("SPARK_HOME")) 19 | val sqlCtx = new SQLContext(sc) 20 | val input = sqlCtx.jsonFile(inputFile) 21 | input.printSchema() 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/LoadKeyValueTextInput.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple map partition to parse JSON data in Scala 3 | * Loads the data into a case class with the name and a boolean flag 4 | * if the person loves pandas. 
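 * (More precisely, this example loads key/value text with Hadoop's KeyValueTextInputFormat,
 * which by default splits each line on the first tab; no JSON parsing is involved.)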
5 | */ 6 | package com.oreilly.learningsparkexamples.scala 7 | 8 | import scala.collection.JavaConversions._ 9 | import org.apache.spark._ 10 | import org.apache.hadoop.mapred.KeyValueTextInputFormat 11 | import org.apache.hadoop.io.{MapWritable, Text} 12 | import java.util.HashMap 13 | 14 | object LoadKeyValueTextInput { 15 | def main(args: Array[String]) { 16 | if (args.length < 2) { 17 | println("Usage: [sparkmaster] [inputfile]") 18 | exit(1) 19 | } 20 | val master = args(0) 21 | val inputFile = args(1) 22 | val sc = new SparkContext(master, "LoadKeyValueTextInput", System.getenv("SPARK_HOME")) 23 | val input = sc.hadoopFile[Text, Text, KeyValueTextInputFormat](inputFile).map{ 24 | case (x, y) => (x.toString, y.toString) 25 | } 26 | println(input.collect().toList) 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/LoadSimpleJdbc.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates loading data over JDBC 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | import org.apache.spark.rdd.JdbcRDD 8 | import java.sql.{DriverManager, ResultSet} 9 | 10 | object LoadSimpleJdbc { 11 | def main(args: Array[String]) { 12 | if (args.length < 1) { 13 | println("Usage: [sparkmaster]") 14 | exit(1) 15 | } 16 | val master = args(0) 17 | val sc = new SparkContext(master, "LoadSimpleJdbc", System.getenv("SPARK_HOME")) 18 | val data = new JdbcRDD(sc, 19 | createConnection, "SELECT * FROM panda WHERE ? <= id AND ID <= ?", 20 | lowerBound = 1, upperBound = 3, numPartitions = 2, mapRow = extractValues) 21 | println(data.collect().toList) 22 | } 23 | 24 | def createConnection() = { 25 | Class.forName("com.mysql.jdbc.Driver").newInstance(); 26 | DriverManager.getConnection("jdbc:mysql://localhost/test?user=holden"); 27 | } 28 | 29 | def extractValues(r: ResultSet) = { 30 | (r.getInt(1), r.getString(2)) 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/MLlib.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.oreilly.learningsparkexamples.scala 19 | 20 | import org.apache.spark.{SparkConf, SparkContext} 21 | import org.apache.spark.mllib.classification.LogisticRegressionWithSGD 22 | import org.apache.spark.mllib.feature.HashingTF 23 | import org.apache.spark.mllib.regression.LabeledPoint 24 | 25 | object MLlib { 26 | 27 | def main(args: Array[String]) { 28 | val conf = new SparkConf().setAppName(s"Book example: Scala") 29 | val sc = new SparkContext(conf) 30 | 31 | // Load 2 types of emails from text files: spam and ham (non-spam). 32 | // Each line has text from one email. 33 | val spam = sc.textFile("files/spam.txt") 34 | val ham = sc.textFile("files/ham.txt") 35 | 36 | // Create a HashingTF instance to map email text to vectors of 100 features. 37 | val tf = new HashingTF(numFeatures = 100) 38 | // Each email is split into words, and each word is mapped to one feature. 39 | val spamFeatures = spam.map(email => tf.transform(email.split(" "))) 40 | val hamFeatures = ham.map(email => tf.transform(email.split(" "))) 41 | 42 | // Create LabeledPoint datasets for positive (spam) and negative (ham) examples. 43 | val positiveExamples = spamFeatures.map(features => LabeledPoint(1, features)) 44 | val negativeExamples = hamFeatures.map(features => LabeledPoint(0, features)) 45 | val trainingData = positiveExamples ++ negativeExamples 46 | trainingData.cache() // Cache data since Logistic Regression is an iterative algorithm. 47 | 48 | // Create a Logistic Regression learner which uses the LBFGS optimizer. 49 | val lrLearner = new LogisticRegressionWithSGD() 50 | // Run the actual learning algorithm on the training data. 51 | val model = lrLearner.run(trainingData) 52 | 53 | // Test on a positive example (spam) and a negative one (ham). 54 | // First apply the same HashingTF feature transformation used on the training data. 55 | val posTestExample = tf.transform("O M G GET cheap stuff by sending money to ...".split(" ")) 56 | val negTestExample = tf.transform("Hi Dad, I started studying Spark the other ...".split(" ")) 57 | // Now use the learned model to predict spam/ham for new emails. 58 | println(s"Prediction for positive test example: ${model.predict(posTestExample)}") 59 | println(s"Prediction for negative test example: ${model.predict(negTestExample)}") 60 | 61 | sc.stop() 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/MLlibPipeline.disabled_until_111: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.oreilly.learningsparkexamples.scala 19 | 20 | import org.apache.spark.{SparkConf, SparkContext} 21 | import org.apache.spark.sql.{Row, SQLContext} 22 | import org.apache.spark.ml.Pipeline 23 | import org.apache.spark.ml.classification.LogisticRegression 24 | import org.apache.spark.ml.feature.{HashingTF, Tokenizer} 25 | 26 | object MLlibPipeline { 27 | 28 | case class Document(id: Long, text: String) 29 | 30 | case class LabeledDocument(id: Long, text: String, label: Double) 31 | 32 | def main(args: Array[String]) { 33 | val conf = new SparkConf().setAppName("BookExamplePipeline") 34 | val sc = new SparkContext(conf) 35 | val sqlContext = new SQLContext(sc) 36 | import sqlContext._ 37 | 38 | // Load 2 types of emails from text files: spam and ham (non-spam). 39 | // Each line has text from one email. 40 | val spam = sc.textFile("files/spam.txt") 41 | val ham = sc.textFile("files/ham.txt") 42 | 43 | // Create LabeledPoint datasets for positive (spam) and negative (ham) examples. 44 | val positiveExamples = spam.zipWithIndex().map { case (email, index) => 45 | LabeledDocument(index, email, 1.0) 46 | } 47 | val negativeExamples = ham.zipWithIndex().map { case (email, index) => 48 | LabeledDocument(index, email, 0.0) 49 | } 50 | val trainingData = positiveExamples ++ negativeExamples 51 | 52 | // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. 53 | // Each stage outputs a column in a SchemaRDD and feeds it to the next stage's input column. 54 | val tokenizer = new Tokenizer() // Splits each email into words 55 | .setInputCol("text") 56 | .setOutputCol("words") 57 | val hashingTF = new HashingTF() // Maps email words to vectors of 100 features. 58 | .setNumFeatures(100) 59 | .setInputCol(tokenizer.getOutputCol) 60 | .setOutputCol("features") 61 | val lr = new LogisticRegression() // LogisticRegression uses inputCol "features" by default. 62 | val pipeline = new Pipeline() 63 | .setStages(Array(tokenizer, hashingTF, lr)) 64 | 65 | // Fit the pipeline to training documents. 66 | // RDDs of case classes work well with Pipelines since Spark SQL can infer a schema from 67 | // case classes and convert the data into a SchemaRDD. 68 | val model = pipeline.fit(trainingData) 69 | 70 | // Make predictions on test documents. 71 | // The fitted model automatically transforms features using Tokenizer and HashingTF. 
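    // transform() adds the intermediate "words" and "features" columns plus the final
    // "prediction" column to the test SchemaRDD; only 'id and 'prediction are selected and collected below.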
72 | val testData = sc.parallelize(Seq( 73 | Document(0, "O M G GET cheap stuff by sending money to ..."), // positive example (spam) 74 | Document(1, "Hi Dad, I started studying Spark the other ...") // negative example (ham) 75 | )) 76 | val predictions = model.transform(testData) 77 | .select('id, 'prediction).collect() 78 | .map { case Row(id, prediction) => (id, prediction) }.toMap 79 | println(s"Prediction for positive test example: ${predictions(0)}") 80 | println(s"Prediction for negative test example: ${predictions(1)}") 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/PerKeyAvg.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple fold in scala 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | import org.apache.spark.SparkContext._ 8 | 9 | object PerKeyAvg { 10 | def main(args: Array[String]) { 11 | val master = args.length match { 12 | case x: Int if x > 0 => args(0) 13 | case _ => "local" 14 | } 15 | 16 | val sc = new SparkContext(master, "PerKeyAvg", System.getenv("SPARK_HOME")) 17 | val input = sc.parallelize(List(("coffee", 1) , ("coffee", 2) , ("panda", 4))) 18 | val result = input.combineByKey( 19 | (v) => (v, 1), 20 | (acc: (Int, Int), v) => (acc._1 + v, acc._2 + 1), 21 | (acc1: (Int, Int), acc2: (Int, Int)) => (acc1._1 + acc2._1, acc1._2 + acc2._2) 22 | // Note: we could us mapValues here, but we didn't because it was in the next section 23 | ).map{ case (key, value) => (key, value._1 / value._2.toFloat) } 24 | result.collectAsMap().map(println(_)) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/PipeExample.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple use of pipe to call a perl program from Spark 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | 8 | object PipeExample { 9 | def main(args: Array[String]) { 10 | val master = args.length match { 11 | case x: Int if x > 0 => args(0) 12 | case _ => "local" 13 | } 14 | val sc = new SparkContext(master, "PipeExample", System.getenv("SPARK_HOME")) 15 | val rdd = sc.parallelize(Array( 16 | "37.75889318222431,-122.42683635321838,37.7614213,-122.4240097", 17 | "37.7519528,-122.4208689,37.8709087,-122.2688365")) 18 | 19 | // adds our script to a list of files for each node to download with this job 20 | val distScript = "/home/holden/repos/learning-spark-examples/src/R/finddistance.R" 21 | sc.addFile(distScript) 22 | 23 | val piped = rdd.pipe(Seq(SparkFiles.get(distScript)), 24 | Map("SEPARATOR" -> ",")) 25 | val result = piped.collect 26 | 27 | println(result.mkString(" ")) 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/RemoveOutliers.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates remove outliers in Scala using summary Stats 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | import org.apache.spark.SparkContext._ 8 | import org.apache.spark.rdd.RDD 9 | 10 | object RemoveOutliers { 11 | def main(args: Array[String]) { 12 | val master = args.length match { 13 | case x: 
Int if x > 0 => args(0) 14 | case _ => "local" 15 | } 16 | val sc = new SparkContext(master, "RemoveOutliers", System.getenv("SPARK_HOME")) 17 | val input = sc.parallelize(List(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1000)).map(_.toDouble) 18 | val result = removeOutliers(input) 19 | println(result.collect().mkString(",")) 20 | } 21 | def removeOutliers(rdd: RDD[Double]): RDD[Double] = { 22 | val summaryStats = rdd.stats() 23 | val stddev = math.sqrt(summaryStats.variance) 24 | rdd.filter(x => math.abs(x-summaryStats.mean) < 3 * stddev) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/SparkSQLTwitter.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Load some tweets stored as JSON data and explore them. 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | import org.apache.spark.SparkContext._ 8 | import org.apache.spark.sql.hive.HiveContext 9 | 10 | 11 | case class HappyPerson(handle: String, favouriteBeverage: String) 12 | 13 | object SparkSQLTwitter { 14 | def main(args: Array[String]) { 15 | if (args.length < 2) { 16 | println("Usage inputFile outputFile [spark.sql.inMemoryColumnarStorage.batchSize]") 17 | } 18 | val inputFile = args(0) 19 | val outputFile = args(1) 20 | val batchSize = if (args.length == 3) { 21 | args(2) 22 | } else { 23 | "200" 24 | } 25 | val conf = new SparkConf() 26 | conf.set("spark.sql.codegen", "false") 27 | conf.set("spark.sql.inMemoryColumnarStorage.batchSize", batchSize) 28 | val sc = new SparkContext(conf) 29 | val hiveCtx = new HiveContext(sc) 30 | import hiveCtx.implicits._ 31 | // Load some tweets 32 | val input = hiveCtx.jsonFile(inputFile) 33 | // Print the schema 34 | input.printSchema() 35 | // Register the input schema RDD 36 | input.registerTempTable("tweets") 37 | hiveCtx.cacheTable("tweets") 38 | // Select tweets based on the retweetCount 39 | val topTweets = hiveCtx.sql("SELECT text, retweetCount FROM tweets ORDER BY retweetCount LIMIT 10") 40 | topTweets.collect().map(println(_)) 41 | val topTweetText = topTweets.map(row => row.getString(0)) 42 | // Create a person and turn it into a Schema RDD 43 | val happyPeopleRDD = sc.parallelize(List(HappyPerson("holden", "coffee"))).toDF() 44 | happyPeopleRDD.registerTempTable("happy_people") 45 | // UDF 46 | hiveCtx.udf.register("strLenScala", (_: String).length) 47 | val tweetLength = hiveCtx.sql("SELECT strLenScala('tweet') FROM tweets LIMIT 10") 48 | tweetLength.collect().map(println(_)) 49 | // Two sums at once (crazy town!) 
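    // A rough DataFrame-API equivalent of the SQL below (a sketch only, assuming
    // Spark 1.3+ where jsonFile returns a DataFrame; not part of the original example):
    //   import org.apache.spark.sql.functions.sum
    //   input.groupBy("user.id")
    //     .agg(sum("user.favouritesCount"), sum("retweetCount"))
    //     .show(10)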
50 | val twoSums = hiveCtx.sql("SELECT SUM(user.favouritesCount), SUM(retweetCount), user.id FROM tweets GROUP BY user.id LIMIT 10") 51 | twoSums.collect().map(println(_)) 52 | sc.stop() 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/StreamingLogInput.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates a simple streaming application 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | import org.apache.spark.SparkContext._ 8 | import org.apache.spark.streaming._ 9 | import org.apache.spark.streaming.dstream._ 10 | 11 | object StreamingLogInput { 12 | def main(args: Array[String]) { 13 | val master = args(0) 14 | val conf = new SparkConf().setMaster(master).setAppName("StreamingLogInput") 15 | // Create a StreamingContext with a 1 second batch size 16 | val ssc = new StreamingContext(conf, Seconds(1)) 17 | // Create a DStream from all the input on port 7777 18 | val lines = ssc.socketTextStream("localhost", 7777) 19 | val errorLines = processLines(lines) 20 | // Print out the lines with errors, which causes this DStream to be evaluated 21 | errorLines.print() 22 | // start our streaming context and wait for it to "finish" 23 | ssc.start() 24 | // Wait for 10 seconds then exit. To run forever call without a timeout 25 | ssc.awaitTermination(10000) 26 | ssc.stop() 27 | } 28 | def processLines(lines: DStream[String]) = { 29 | // Filter our DStream for lines with "error" 30 | lines.filter(_.contains("error")) 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/WordCount.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates flatMap + countByValue for wordcount. 
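 * An optional first argument sets the master (default "local") and an optional
 * second argument gives an input file; if a third argument is supplied the counts
 * are computed with reduceByKey and saved as text files there, otherwise the
 * countByValue results are printed.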
3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | import org.apache.spark.SparkContext._ 8 | 9 | object WordCount { 10 | def main(args: Array[String]) { 11 | val master = args.length match { 12 | case x: Int if x > 0 => args(0) 13 | case _ => "local" 14 | } 15 | val sc = new SparkContext(master, "WordCount", System.getenv("SPARK_HOME")) 16 | val input = args.length match { 17 | case x: Int if x > 1 => sc.textFile(args(1)) 18 | case _ => sc.parallelize(List("pandas", "i like pandas")) 19 | } 20 | val words = input.flatMap(line => line.split(" ")) 21 | args.length match { 22 | case x: Int if x > 2 => { 23 | val counts = words.map(word => (word, 1)).reduceByKey{case (x,y) => x + y} 24 | counts.saveAsTextFile(args(2)) 25 | } 26 | case _ => { 27 | val wc = words.countByValue() 28 | println(wc.mkString(",")) 29 | } 30 | } 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/WriteSimpleDB.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates writing data over JDBC 3 | */ 4 | package com.oreilly.learningsparkexamples.scala 5 | 6 | import org.apache.spark._ 7 | import org.apache.spark.SparkContext._ 8 | import org.apache.spark.rdd.JdbcRDD 9 | import java.sql.{PreparedStatement, DriverManager, ResultSet} 10 | import org.apache.hadoop.mapred.lib.db._ 11 | import org.apache.hadoop.mapred.JobConf 12 | 13 | object WriteSimpleDB { 14 | def main(args: Array[String]) { 15 | if (args.length < 1) { 16 | println("Usage: [sparkmaster]") 17 | exit(1) 18 | } 19 | val master = args(0) 20 | val sc = new SparkContext(master, "WriteSimpleJdbc", System.getenv("SPARK_HOME")) 21 | val data = sc.parallelize(List(("cat1", 1))) 22 | // foreach partition method 23 | data.foreachPartition{records => 24 | records.foreach(record => println("fake db write")) 25 | } 26 | // DBOutputFormat approach 27 | val records = data.map(e => (catRecord(e._1, e._2), null)) 28 | val tableName = "table" 29 | val fields = Array("name", "age") 30 | val jobConf = new JobConf() 31 | DBConfiguration.configureDB(jobConf, "com.mysql.jdbc.Driver", "jdbc:mysql://localhost/test?user=holden") 32 | DBOutputFormat.setOutput(jobConf, tableName, fields:_*) 33 | records.saveAsHadoopDataset(jobConf) 34 | } 35 | case class catRecord(name: String, age: Int) extends DBWritable { 36 | override def write(s: PreparedStatement) { 37 | s.setString(1, name) 38 | s.setInt(2, age) 39 | } 40 | override def readFields(r: ResultSet) = { 41 | // blank since only used for writing 42 | } 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/logs/LogAnalyzerAppMain.scala: -------------------------------------------------------------------------------- 1 | package com.oreilly.learningsparkexamples.scala.logs; 2 | 3 | import org.apache.spark._ 4 | import org.apache.spark.streaming._ 5 | import org.apache.spark.streaming.StreamingContext._ 6 | import org.apache.spark.streaming.dstream._ 7 | import com.oreilly.learningsparkexamples.java.logs.ApacheAccessLog 8 | 9 | /** 10 | * The LogAnalyzerAppMain is an sample logs analysis application. For now, 11 | * it is a simple minimal viable product: 12 | * - Read in new log files from a directory and input those new files into streaming. 
13 |  * - Computes stats for all time as well as for the last time interval, based on those logs.
14 |  * - Writes the calculated stats to a text file on the local file system
15 |  *   that gets refreshed every time interval.
16 |  *
17 |  * Once you get this program up and running, feed apache access log files
18 |  * into the local directory of your choosing.
19 |  *
20 |  * Then open your output text file, perhaps in a web browser, and refresh
21 |  * that page to see more stats come in.
22 |  *
23 |  * Modify the command line flags to the values of your choosing.
24 |  * Notice how they come after you specify the jar when using spark-submit.
25 |  *
26 |  * Example command to run:
27 |  * % ${YOUR_SPARK_HOME}/bin/spark-submit
28 |  *   --class "com.oreilly.learningsparkexamples.scala.logs.LogAnalyzerAppMain"
29 |  *   --master local[4]
30 |  *   target/uber-log-analyzer-1.0.jar
31 |  *   --logs_directory /tmp/logs
32 |  *   --output_html_file /tmp/log_stats.html
33 |  *   --index_html_template ./src/main/resources/index.html.template
34 |  */
35 | case class Config(WindowLength: Int = 3000, SlideInterval: Int = 1000, LogsDirectory: String = "/tmp/logs",
36 |                   CheckpointDirectory: String = "/tmp/checkpoint",
37 |                   OutputHTMLFile: String = "/tmp/log_stats.html",
38 |                   OutputDirectory: String = "/tmp/outpandas",
39 |                   IndexHTMLTemplate: String = "./src/main/resources/index.html.template") {
40 |   def getWindowDuration() = {
41 |     new Duration(WindowLength)
42 |   }
43 |   def getSlideDuration() = {
44 |     new Duration(SlideInterval)
45 |   }
46 | }
47 | 
48 | object LogAnalyzerAppMain {
49 | 
50 |   def main(args: Array[String]) {
51 |     val parser = new scopt.OptionParser[Config]("LogAnalyzerAppMain") {
52 |       head("LogAnalyzer", "0.1")
53 |       opt[Int]('w', "window_length") text("size of the window as an integer in milliseconds")
54 |       opt[Int]('s', "slide_interval") text("size of the slide interval as an integer in milliseconds")
55 |       opt[String]('l', "logs_directory") text("location of the logs directory. If you don't have any logs, use the fakelogs_dir script.")
56 |       opt[String]('c', "checkpoint_directory") text("location of the checkpoint directory.")
57 |       opt[String]('o', "output_directory") text("location of the output directory.")
58 |     }
59 |     val opts = parser.parse(args, new Config()).get
60 |     // Set up the Spark conf.
61 |     val conf = new SparkConf()
62 |       .setAppName("A Databricks Reference Application: Logs Analysis with Spark")
63 |     val ssc = new StreamingContext(conf, opts.getWindowDuration())
64 |     // Checkpointing must be enabled to use the updateStateByKey function & windowed operations.
65 |     ssc.checkpoint(opts.CheckpointDirectory)
66 |     // This method monitors a directory for new files to read in for streaming.
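   |     // (Only files newly written to, or atomically moved into, this directory after
   |     // the stream starts are picked up; the fakelogs_dir script mentioned above can
   |     // generate such files for testing.)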
67 |     val logDirectory = opts.LogsDirectory
68 |     val logData = ssc.textFileStream(logDirectory)
69 |     val accessLogDStream = logData.map(line => ApacheAccessLog.parseFromLogLine(line)).cache()
70 |     LogAnalyzerTotal.processAccessLogs(accessLogDStream)
71 |     LogAnalyzerWindowed.processAccessLogs(accessLogDStream, opts)
   |     // Start the streaming computation and block until it terminates;
   |     // without start() none of the DStream operations above ever run.
   |     ssc.start()
   |     ssc.awaitTermination()
72 |   }
73 | }
74 | 
--------------------------------------------------------------------------------
/src/main/scala/com/oreilly/learningsparkexamples/scala/logs/LogAnalyzerTotal.scala:
--------------------------------------------------------------------------------
 1 | package com.oreilly.learningsparkexamples.scala.logs;
 2 | 
 3 | import org.apache.spark._
 4 | import org.apache.spark.SparkContext._
 5 | import org.apache.spark.streaming._
 6 | import org.apache.spark.streaming.StreamingContext._
 7 | import org.apache.spark.streaming.dstream._
 8 | import com.oreilly.learningsparkexamples.java.logs.ApacheAccessLog
 9 | 
10 | /**
11 |  * Compute totals on the log input
12 |  */
13 | object LogAnalyzerTotal {
14 |   def computeRunningSum(values: Seq[Long], state: Option[Long]) = {
15 |     Some(values.reduce((x, y) => x + y) + state.getOrElse(0L))
16 |   }
17 |   def processAccessLogs(accessLogsDStream: DStream[ApacheAccessLog]) {
18 |     val ipDStream = accessLogsDStream.map(entry => (entry.getIpAddress(), 1))
19 |     val ipCountsDStream = ipDStream.reduceByKey((x, y) => x + y)
20 |     ipCountsDStream.print()
21 |     // The same per-IP count, computed with transform()
22 |     val ipRawDStream = accessLogsDStream.transform{
23 |       rdd => rdd.map(accessLog => (accessLog.getIpAddress(), 1)).reduceByKey(
24 |         (x, y) => x + y)
25 |     }
26 |     ipRawDStream.print()
27 |     // Bytes transferred per IP address
28 |     val ipBytesDStream = accessLogsDStream.map(entry => (entry.getIpAddress(), entry.getContentSize()))
29 |     val ipBytesSumDStream = ipBytesDStream.reduceByKey((x, y) => x + y)
30 |     val ipBytesRequestCountDStream = ipRawDStream.join(ipBytesSumDStream)
31 |     ipBytesRequestCountDStream.print()
32 |     val responseCodeDStream = accessLogsDStream.map(log => (log.getResponseCode(), 1L))
33 |     val responseCodeCountDStream = responseCodeDStream.updateStateByKey(computeRunningSum _)
34 |   }
35 | }
36 | 
--------------------------------------------------------------------------------
/src/main/scala/com/oreilly/learningsparkexamples/scala/logs/LogAnalyzerWindowed.scala:
--------------------------------------------------------------------------------
 1 | package com.oreilly.learningsparkexamples.scala.logs;
 2 | 
 3 | import org.apache.spark._
 4 | import org.apache.spark.rdd._
 5 | import org.apache.spark.SparkContext._
 6 | import org.apache.spark.streaming._
 7 | import org.apache.spark.streaming.StreamingContext._
 8 | import org.apache.spark.streaming.dstream._
 9 | import com.oreilly.learningsparkexamples.java.logs.ApacheAccessLog
10 | import org.apache.hadoop.mapred.SequenceFileOutputFormat;
11 | import org.apache.hadoop.io.{ArrayWritable, BooleanWritable, BytesWritable, DoubleWritable, FloatWritable, IntWritable, LongWritable, NullWritable, Text, Writable}
12 | 
13 | /**
14 |  * Computes various pieces of information over a sliding window of the log input
15 |  */
16 | object LogAnalyzerWindowed {
17 |   def responseCodeCount(accessLogRDD: RDD[ApacheAccessLog]) = {
18 |     accessLogRDD.map(log => (log.getResponseCode(), 1)).reduceByKey((x, y) => x + y)
19 |   }
20 | 
21 |   def processAccessLogs(accessLogsDStream: DStream[ApacheAccessLog], opts: Config) {
22 |     val ipDStream = accessLogsDStream.map{entry => entry.getIpAddress()}
23 |     val ipAddressRequestCount = ipDStream.countByValueAndWindow(
24 | opts.getWindowDuration(), opts.getSlideDuration()) 25 | ipAddressRequestCount.saveAsTextFiles(opts.OutputDirectory + "/ipAddressRequestCountsTXT") 26 | val writableIpAddressRequestCount = ipAddressRequestCount.map{case (ip, count) => 27 | (new Text(ip), new LongWritable(count))} 28 | writableIpAddressRequestCount.saveAsHadoopFiles[SequenceFileOutputFormat[Text, LongWritable]]( 29 | opts.OutputDirectory + "/ipAddressRequestCounts", "pandas") 30 | val requestCount = accessLogsDStream.countByWindow(opts.getWindowDuration(), opts.getSlideDuration()) 31 | requestCount.print() 32 | ipAddressRequestCount.print() 33 | val accessLogsWindow = accessLogsDStream.window( 34 | opts.getWindowDuration(), opts.getSlideDuration()) 35 | accessLogsWindow.transform(rdd => responseCodeCount(rdd)).print() 36 | // compute the visit counts for IP address in a window 37 | val ipPairDStream = accessLogsDStream.map(logEntry => (logEntry.getIpAddress(), 1)) 38 | val ipCountDStream = ipPairDStream.reduceByKeyAndWindow( 39 | {(x, y) => x + y}, // Adding elements in the new slice 40 | {(x, y) => x - y}, // Removing elements from the oldest slice 41 | opts.getWindowDuration(), // Window duration 42 | opts.getSlideDuration() // slide duration 43 | ) 44 | ipCountDStream.print() 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/main/scala/com/oreilly/learningsparkexamples/scala/logs/ReadTransferStats.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates reading in transfer statistics. 3 | */ 4 | package com.oreilly.learningsparkexamples.scala.logs 5 | 6 | import org.apache.spark._ 7 | import org.apache.spark.SparkContext._ 8 | import org.apache.spark.streaming._ 9 | import org.apache.spark.streaming.dstream._ 10 | 11 | import org.apache.hadoop.io.Writable 12 | import org.apache.hadoop.io.IntWritable 13 | import org.apache.hadoop.io.LongWritable 14 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat 15 | 16 | 17 | object ReadTransferStats { 18 | def readStats(ssc: StreamingContext, inputDirectory: String): DStream[(Long, Int)] = { 19 | // convert the input from Writables to native types 20 | ssc.fileStream[LongWritable, IntWritable, 21 | SequenceFileInputFormat[LongWritable, IntWritable]](inputDirectory).map{ 22 | case (x, y) => (x.get(), y.get()) 23 | } 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/perl/splitwords.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | # This is a simple perl example of how to work with pipe interface 6 | # Here we read in each line and ouput the corresponding words 7 | # This is equivelent to rdd.flatMap(_.split($SEPARATOR)) 8 | while (my $line = <>) { 9 | chomp ($line); 10 | my @words = split($ENV{'SEPARATOR'}, $line); 11 | foreach my $word (@words) { 12 | print $word."\n"; 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /src/python/AvgMapPartitions.py: -------------------------------------------------------------------------------- 1 | """ 2 | >>> from pyspark.context import SparkContext 3 | >>> sc = SparkContext('local', 'test') 4 | >>> b = sc.parallelize([1, 2, 3, 4]) 5 | >>> avg(b) 6 | 2.5 7 | """ 8 | 9 | import sys 10 | 11 | from pyspark import SparkContext 12 | 13 | 14 | def partitionCtr(nums): 15 | """Compute sumCounter for partition""" 16 | 
sumCount = [0, 0] 17 | for num in nums: 18 | sumCount[0] += num 19 | sumCount[1] += 1 20 | return [sumCount] 21 | 22 | 23 | def combineCtrs(c1, c2): 24 | return (c1[0] + c2[0], c1[1] + c2[1]) 25 | 26 | 27 | def basicAvg(nums): 28 | """Compute the avg""" 29 | sumCount = nums.mapPartitions(partitionCtr).reduce(combineCtrs) 30 | return sumCount[0] / float(sumCount[1]) 31 | 32 | if __name__ == "__main__": 33 | cluster = "local" 34 | if len(sys.argv) == 2: 35 | cluster = sys.argv[1] 36 | sc = SparkContext(cluster, "Sum") 37 | nums = sc.parallelize([1, 2, 3, 4]) 38 | avg = basicAvg(nums) 39 | print avg 40 | -------------------------------------------------------------------------------- /src/python/BasicAvg.py: -------------------------------------------------------------------------------- 1 | """ 2 | >>> from pyspark.context import SparkContext 3 | >>> sc = SparkContext('local', 'test') 4 | >>> b = sc.parallelize([1, 2, 3, 4]) 5 | >>> basicAvg(b) 6 | 2.5 7 | """ 8 | 9 | import sys 10 | 11 | from pyspark import SparkContext 12 | 13 | 14 | def basicAvg(nums): 15 | """Compute the avg""" 16 | sumCount = nums.map(lambda x: (x, 1)).fold( 17 | (0, 0), (lambda x, y: (x[0] + y[0], x[1] + y[1]))) 18 | return sumCount[0] / float(sumCount[1]) 19 | 20 | if __name__ == "__main__": 21 | master = "local" 22 | if len(sys.argv) == 2: 23 | master = sys.argv[1] 24 | sc = SparkContext(master, "Sum") 25 | nums = sc.parallelize([1, 2, 3, 4]) 26 | avg = basicAvg(nums) 27 | print avg 28 | -------------------------------------------------------------------------------- /src/python/BasicFilterMap.py: -------------------------------------------------------------------------------- 1 | """ 2 | >>> from pyspark.context import SparkContext 3 | >>> sc = SparkContext('local', 'test') 4 | >>> b = sc.parallelize([1, 2, 3, 4]) 5 | >>> sorted(basicSquareNoOnes(b).collect()) 6 | [4, 9, 16] 7 | """ 8 | 9 | import sys 10 | 11 | from pyspark import SparkContext 12 | 13 | 14 | def basicSquareNoOnes(nums): 15 | """Square the numbers""" 16 | return nums.map(lambda x: x * x).filter(lambda x: x != 1) 17 | 18 | if __name__ == "__main__": 19 | master = "local" 20 | if len(sys.argv) == 2: 21 | master = sys.argv[1] 22 | sc = SparkContext(master, "BasicFilterMap") 23 | nums = sc.parallelize([1, 2, 3, 4]) 24 | output = sorted(basicSquareNoOnes(nums).collect()) 25 | for num in output: 26 | print "%i " % (num) 27 | -------------------------------------------------------------------------------- /src/python/BasicKeyValueMapFilter.py: -------------------------------------------------------------------------------- 1 | """ 2 | >>> from pyspark.context import SparkContext 3 | >>> sc = SparkContext('local', 'test') 4 | >>> input = ["coffee", "i really like coffee", "coffee > magic"] 5 | >>> b = sc.parallelize(input) 6 | >>> sorted(basicKeyValueMapFilter(b).collect()) 7 | [4, 9] 8 | """ 9 | 10 | import sys 11 | 12 | from pyspark import SparkContext 13 | 14 | 15 | def basicKeyValueMapFilter(input): 16 | """Construct a key/value RDD and then filter on the value""" 17 | return input.map(lambda x: (x.split(" ")[0], x)).filter( 18 | lambda x: len(x[1]) < 20) 19 | 20 | if __name__ == "__main__": 21 | master = "local" 22 | if len(sys.argv) == 2: 23 | master = sys.argv[1] 24 | sc = SparkContext(master, "BasicFilterMap") 25 | input = sc.parallelize( 26 | ["coffee", "i really like coffee", "coffee > magic", "panda < coffee"]) 27 | output = sorted(basicKeyValueMapFilter(input).collect()) 28 | for elem in output: 29 | print elem 30 | 
-------------------------------------------------------------------------------- /src/python/BasicMap.py: -------------------------------------------------------------------------------- 1 | """ 2 | >>> from pyspark.context import SparkContext 3 | >>> sc = SparkContext('local', 'test') 4 | >>> b = sc.parallelize([1, 2, 3, 4]) 5 | >>> sorted(basicSquare(b).collect()) 6 | [1, 4, 9, 16] 7 | """ 8 | 9 | import sys 10 | 11 | from pyspark import SparkContext 12 | 13 | 14 | def basicSquare(nums): 15 | """Square the numbers""" 16 | return nums.map(lambda x: x * x) 17 | 18 | if __name__ == "__main__": 19 | master = "local" 20 | if len(sys.argv) == 2: 21 | master = sys.argv[1] 22 | sc = SparkContext(master, "BasicMap") 23 | nums = sc.parallelize([1, 2, 3, 4]) 24 | output = sorted(basicSquare(nums).collect()) 25 | for num in output: 26 | print "%i " % (num) 27 | -------------------------------------------------------------------------------- /src/python/BasicMapPartitions.py: -------------------------------------------------------------------------------- 1 | """ 2 | >>> from pyspark.context import SparkContext 3 | >>> sc = SparkContext('local', 'test') 4 | >>> b = sc.parallelize(["KK6JKQ", "Ve3UoW", "kk6jlk", "W6BB"]) 5 | >>> fetchCallSigns(b).size() 6 | 4 7 | """ 8 | 9 | import sys 10 | import urllib3 11 | 12 | from pyspark import SparkContext 13 | 14 | 15 | def processCallSigns(signs): 16 | """Process call signs""" 17 | http = urllib3.PoolManager() 18 | requests = map( 19 | lambda x: http.request('GET', "http://qrzcq.com/call/" + x), signs) 20 | return map(lambda x: x.data, requests) 21 | 22 | 23 | def fetchCallSigns(input): 24 | """Fetch call signs""" 25 | return input.mapPartitions(lambda callSigns: processCallSigns(callSigns)) 26 | 27 | if __name__ == "__main__": 28 | master = "local" 29 | if len(sys.argv) == 2: 30 | master = sys.argv[1] 31 | sc = SparkContext(master, "BasicMapPartitions") 32 | input = sc.parallelize(["KK6JKQ", "Ve3UoW", "kk6jlk", "W6BB"]) 33 | output = sorted(fetchCallSigns(input).collect()) 34 | for str in output: 35 | print "%s " % (str) 36 | -------------------------------------------------------------------------------- /src/python/BasicSum.py: -------------------------------------------------------------------------------- 1 | """ 2 | >>> from pyspark.context import SparkContext 3 | >>> sc = SparkContext('local', 'test') 4 | >>> b = sc.parallelize([1, 2, 3, 4]) 5 | >>> basicSum(b) 6 | 10 7 | """ 8 | 9 | import sys 10 | 11 | from pyspark import SparkContext 12 | 13 | 14 | def basicSum(nums): 15 | """Sum the numbers""" 16 | return nums.fold(0, (lambda x, y: x + y)) 17 | 18 | if __name__ == "__main__": 19 | master = "local" 20 | if len(sys.argv) == 2: 21 | master = sys.argv[1] 22 | sc = SparkContext(master, "Sum") 23 | nums = sc.parallelize([1, 2, 3, 4]) 24 | output = basicSum(nums) 25 | print output 26 | -------------------------------------------------------------------------------- /src/python/ChapterSixExample.py: -------------------------------------------------------------------------------- 1 | """Contains the Chapter 6 Example illustrating accumulators, broadcast 2 | variables, numeric operations, and pipe.""" 3 | import bisect 4 | import re 5 | import sys 6 | import urllib3 7 | import json 8 | import math 9 | import os 10 | 11 | from pyspark import SparkContext 12 | from pyspark import SparkFiles 13 | 14 | sparkMaster = sys.argv[1] 15 | inputFile = sys.argv[2] 16 | outputDir = sys.argv[3] 17 | 18 | sc = SparkContext(sparkMaster, appName="ChapterSixExample") 19 | 
file = sc.textFile(inputFile) 20 | 21 | # Count lines with KK6JKQ using accumulators 22 | count = sc.accumulator(0) 23 | 24 | 25 | def incrementCounter(line): 26 | global count # Access the counter 27 | if "KK6JKQ" in line: 28 | count += 1 29 | 30 | file.foreach(incrementCounter) 31 | print "Lines with KK6JKQ %d" % count.value 32 | 33 | 34 | # Create Accumulator[Int] initialized to 0 35 | blankLines = sc.accumulator(0) 36 | dataLines = sc.accumulator(0) 37 | 38 | 39 | def extractCallSigns(line): 40 | global blankLines, dataLines # Access the counters 41 | if (line == ""): 42 | blankLines += 1 43 | return line.split(" ") 44 | 45 | callSigns = file.flatMap(extractCallSigns) 46 | callSigns.saveAsTextFile(outputDir + "/callsigns") 47 | print "Blank lines %d" % blankLines.value 48 | 49 | # Create Accumulators for validating call signs 50 | validSignCount = sc.accumulator(0) 51 | invalidSignCount = sc.accumulator(0) 52 | 53 | 54 | def validateSign(sign): 55 | global validSignCount, invalidSignCount 56 | if re.match(r"\A\d?[a-zA-Z]{1,2}\d{1,4}[a-zA-Z]{1,3}\Z", sign): 57 | validSignCount += 1 58 | return True 59 | else: 60 | invalidSignCount += 1 61 | return False 62 | 63 | validSigns = callSigns.filter(validateSign) 64 | contactCounts = validSigns.map( 65 | lambda sign: (sign, 1)).reduceByKey((lambda x, y: x + y)) 66 | # Force evaluation so the counters are populated 67 | contactCounts.count() 68 | if invalidSignCount.value < 0.1 * validSignCount.value: 69 | contactCounts.saveAsTextFile(outputDir + "/contactCount") 70 | else: 71 | print ("Too many errors %d in %d" % 72 | (invalidSignCount.value, validSignCount.value)) 73 | 74 | # Helper functions for looking up the call signs 75 | 76 | 77 | def lookupCountry(sign, prefixes): 78 | pos = bisect.bisect_left(prefixes, sign) 79 | return prefixes[pos].split(",")[1] 80 | 81 | 82 | def loadCallSignTable(): 83 | f = open("./files/callsign_tbl_sorted", "r") 84 | return f.readlines() 85 | 86 | # Lookup the locations of the call signs on the 87 | # RDD contactCounts. We load a list of call sign 88 | # prefixes to country code to support this lookup. 
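# A broadcast variable ships the table to each worker node once and keeps it
# read-only there; tasks then read it through signPrefixes.value rather than
# capturing the whole table in every task's closure.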
89 | signPrefixes = sc.broadcast(loadCallSignTable()) 90 | 91 | 92 | def processSignCount(sign_count, signPrefixes): 93 | country = lookupCountry(sign_count[0], signPrefixes.value) 94 | count = sign_count[1] 95 | return (country, count) 96 | 97 | countryContactCounts = (contactCounts 98 | .map(lambda signCount: processSignCount(signCount, signPrefixes)) 99 | .reduceByKey((lambda x, y: x + y))) 100 | 101 | countryContactCounts.saveAsTextFile(outputDir + "/countries.txt") 102 | 103 | # Query 73s for the call signs CallLogs and parse the personse 104 | 105 | 106 | def processCallSigns(signs): 107 | """Lookup call signs using a connection pool""" 108 | # Create a connection pool 109 | http = urllib3.PoolManager() 110 | # the URL associated with each call sign record 111 | urls = map(lambda x: "http://73s.com/qsos/%s.json" % x, signs) 112 | # create the requests (non-blocking) 113 | requests = map(lambda x: (x, http.request('GET', x)), urls) 114 | # fetch the results 115 | result = map(lambda x: (x[0], json.loads(x[1].data)), requests) 116 | # remove any empty results and return 117 | return filter(lambda x: x[1] is not None, result) 118 | 119 | 120 | def fetchCallSigns(input): 121 | """Fetch call signs""" 122 | return input.mapPartitions(lambda callSigns: processCallSigns(callSigns)) 123 | 124 | contactsContactList = fetchCallSigns(validSigns) 125 | 126 | # Compute the distance of each call using an external R program 127 | distScript = os.getcwd()+"/src/R/finddistance.R" 128 | distScriptName = "finddistance.R" 129 | sc.addFile(distScript) 130 | 131 | 132 | def hasDistInfo(call): 133 | """Verify that a call has the fields required to compute the distance""" 134 | requiredFields = ["mylat", "mylong", "contactlat", "contactlong"] 135 | return all(map(lambda f: call[f], requiredFields)) 136 | 137 | 138 | def formatCall(call): 139 | """Format a call so that it can be parsed by our R program""" 140 | return "{0},{1},{2},{3}".format( 141 | call["mylat"], call["mylong"], 142 | call["contactlat"], call["contactlong"]) 143 | 144 | pipeInputs = contactsContactList.values().flatMap( 145 | lambda calls: map(formatCall, filter(hasDistInfo, calls))) 146 | distances = pipeInputs.pipe(SparkFiles.get(distScriptName)) 147 | print distances.collect() 148 | # Convert our RDD of strings to numeric data so we can compute stats and 149 | # remove the outliers. 
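# stats() computes count, mean, and variance of the RDD in a single pass; values
# more than three standard deviations from the mean are dropped as outliers.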
150 | distanceNumerics = distances.map(lambda string: float(string)) 151 | stats = distanceNumerics.stats() 152 | stddev = stats.stdev() 153 | mean = stats.mean() 154 | reasonableDistances = distanceNumerics.filter( 155 | lambda x: math.fabs(x - mean) < 3 * stddev) 156 | print reasonableDistances.collect() 157 | -------------------------------------------------------------------------------- /src/python/IntersectByKey.py: -------------------------------------------------------------------------------- 1 | """ 2 | >>> from pyspark.context import SparkContext 3 | >>> sc = SparkContext('local', 'test') 4 | >>> input = [("coffee", 1), ("pandas", 2), ("coffee", 3), ("very", 4)] 5 | >>> rdd1 = sc.parallelize(input) 6 | >>> rdd2 = sc.parallelize([("pandas", 20)]) 7 | >>> intserectByKey(rdd1, rdd2).collect() 8 | [('pandas', 2), ('pandas', 20)] 9 | """ 10 | 11 | import sys 12 | import itertools 13 | 14 | from pyspark import SparkContext 15 | 16 | 17 | def combineIfBothPresent(itrs): 18 | """Return an iterable of the elements from 19 | both itr1 and itr2 if there are elements in both itr1 and itr2 otherwise 20 | return an empty itrable""" 21 | iter1 = itrs[0].__iter__() 22 | iter2 = itrs[1].__iter__() 23 | try: 24 | e1 = iter1.next() 25 | e2 = iter2.next() 26 | return itertools.chain([e1], [e2], iter1, iter2) 27 | except StopIteration: 28 | return [] 29 | 30 | 31 | def intersectByKey(rdd1, rdd2): 32 | """Intersect two RDDs by key""" 33 | return rdd1.cogroup(rdd2).flatMapValues(combineIfBothPresent) 34 | 35 | if __name__ == "__main__": 36 | master = "local" 37 | if len(sys.argv) == 2: 38 | master = sys.argv[1] 39 | sc = SparkContext(master, "IntersectByKey") 40 | rdd1 = sc.parallelize( 41 | [("coffee", 1), ("pandas", 2), ("coffee", 3), ("very", 4)]) 42 | rdd2 = sc.parallelize([("pandas", 20), ("pandas", 21)]) 43 | print intersectByKey(rdd1, rdd2).collect() 44 | -------------------------------------------------------------------------------- /src/python/LoadCsv.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkContext 2 | import csv 3 | import sys 4 | import StringIO 5 | 6 | 7 | def loadRecord(line): 8 | """Parse a CSV line""" 9 | input = StringIO.StringIO(line) 10 | reader = csv.DictReader(input, fieldnames=["name", "favouriteAnimal"]) 11 | return reader.next() 12 | 13 | 14 | def loadRecords(fileNameContents): 15 | """Load all the records in a given file""" 16 | input = StringIO.StringIO(fileNameContents[1]) 17 | reader = csv.DictReader(input, fieldnames=["name", "favouriteAnimal"]) 18 | return reader 19 | 20 | 21 | def writeRecords(records): 22 | """Write out CSV lines""" 23 | output = StringIO.StringIO() 24 | writer = csv.DictWriter(output, fieldnames=["name", "favouriteAnimal"]) 25 | for record in records: 26 | writer.writerow(record) 27 | return [output.getvalue()] 28 | 29 | if __name__ == "__main__": 30 | if len(sys.argv) != 4: 31 | print "Error usage: LoadCsv [sparkmaster] [inputfile] [outputfile]" 32 | sys.exit(-1) 33 | master = sys.argv[1] 34 | inputFile = sys.argv[2] 35 | outputFile = sys.argv[3] 36 | sc = SparkContext(master, "LoadCsv") 37 | # Try the record-per-line-input 38 | input = sc.textFile(inputFile) 39 | data = input.map(loadRecord) 40 | pandaLovers = data.filter(lambda x: x['favouriteAnimal'] == "panda") 41 | pandaLovers.mapPartitions(writeRecords).saveAsTextFile(outputFile) 42 | # Try the more whole file input 43 | fullFileData = sc.wholeTextFiles(inputFile).flatMap(loadRecords) 44 | fullFilePandaLovers = 
fullFileData.filter( 45 | lambda x: x['favouriteAnimal'] == "panda") 46 | fullFilePandaLovers.mapPartitions( 47 | writeRecords).saveAsTextFile(outputFile + "fullfile") 48 | sc.stop() 49 | print "Done!" 50 | -------------------------------------------------------------------------------- /src/python/LoadHive.py: -------------------------------------------------------------------------------- 1 | # A simple hive demo. If you do not have a table to load from look run 2 | # MakeHiveTable.py 3 | from pyspark import SparkContext 4 | from pyspark.sql import HiveContext 5 | import json 6 | import sys 7 | 8 | if __name__ == "__main__": 9 | if len(sys.argv) != 3: 10 | print "Error usage: LoadHive [sparkmaster] [inputtable]" 11 | sys.exit(-1) 12 | master = sys.argv[1] 13 | inputTable = sys.argv[2] 14 | sc = SparkContext(master, "LoadHive") 15 | hiveCtx = HiveContext(sc) 16 | # Query hive 17 | input = hiveCtx.sql("FROM " + inputTable + " SELECT key, value") 18 | print "result of query" 19 | print input.collect() 20 | data = input.map(lambda x: x[0] * x[0]) 21 | result = data.collect() 22 | for element in result: 23 | print "Got data " + str(element) 24 | sc.stop() 25 | print "Done!" 26 | -------------------------------------------------------------------------------- /src/python/LoadJson.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkContext 2 | import json 3 | import sys 4 | 5 | if __name__ == "__main__": 6 | if len(sys.argv) != 4: 7 | print "Error usage: LoadJson [sparkmaster] [inputfile] [outputfile]" 8 | sys.exit(-1) 9 | master = sys.argv[1] 10 | inputFile = sys.argv[2] 11 | outputFile = sys.argv[3] 12 | sc = SparkContext(master, "LoadJson") 13 | input = sc.textFile(inputFile) 14 | data = input.map(lambda x: json.loads(x)) 15 | data.filter(lambda x: 'lovesPandas' in x and x['lovesPandas']).map( 16 | lambda x: json.dumps(x)).saveAsTextFile(outputFile) 17 | sc.stop() 18 | print "Done!" 19 | -------------------------------------------------------------------------------- /src/python/MLlib.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | from pyspark import SparkContext 19 | from pyspark.mllib.regression import LabeledPoint 20 | from pyspark.mllib.classification import LogisticRegressionWithSGD 21 | from pyspark.mllib.feature import HashingTF 22 | 23 | 24 | if __name__ == "__main__": 25 | sc = SparkContext(appName="PythonBookExample") 26 | 27 | # Load 2 types of emails from text files: spam and ham (non-spam). 28 | # Each line has text from one email. 
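    # (The HashingTF used below hashes each word into one of 100 buckets, so no
    # vocabulary needs to be built first, at the cost of occasional hash collisions
    # between different words.)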
29 | spam = sc.textFile("files/spam.txt") 30 | ham = sc.textFile("files/ham.txt") 31 | 32 | # Create a HashingTF instance to map email text to vectors of 100 features. 33 | tf = HashingTF(numFeatures = 100) 34 | # Each email is split into words, and each word is mapped to one feature. 35 | spamFeatures = spam.map(lambda email: tf.transform(email.split(" "))) 36 | hamFeatures = ham.map(lambda email: tf.transform(email.split(" "))) 37 | 38 | # Create LabeledPoint datasets for positive (spam) and negative (ham) examples. 39 | positiveExamples = spamFeatures.map(lambda features: LabeledPoint(1, features)) 40 | negativeExamples = hamFeatures.map(lambda features: LabeledPoint(0, features)) 41 | training_data = positiveExamples.union(negativeExamples) 42 | training_data.cache() # Cache data since Logistic Regression is an iterative algorithm. 43 | 44 | # Run Logistic Regression using the SGD optimizer. 45 | # regParam is model regularization, which can make models more robust. 46 | model = LogisticRegressionWithSGD.train(training_data) 47 | 48 | # Test on a positive example (spam) and a negative one (ham). 49 | # First apply the same HashingTF feature transformation used on the training data. 50 | posTestExample = tf.transform("O M G GET cheap stuff by sending money to ...".split(" ")) 51 | negTestExample = tf.transform("Hi Dad, I started studying Spark the other ...".split(" ")) 52 | 53 | # Now use the learned model to predict spam/ham for new emails. 54 | print "Prediction for positive test example: %g" % model.predict(posTestExample) 55 | print "Prediction for negative test example: %g" % model.predict(negTestExample) 56 | 57 | sc.stop() 58 | -------------------------------------------------------------------------------- /src/python/MakeHiveTable.py: -------------------------------------------------------------------------------- 1 | # Createas a hive table and loads an input file into it 2 | # For input you can use examples/src/main/resources/kv1.txt from the spark 3 | # distribution 4 | from pyspark import SparkContext 5 | from pyspark.sql import HiveContext 6 | import json 7 | import sys 8 | 9 | if __name__ == "__main__": 10 | if len(sys.argv) != 4: 11 | print "Error usage: LoadHive [sparkmaster] [inputFile] [inputtable]" 12 | sys.exit(-1) 13 | master = sys.argv[1] 14 | inputFile = sys.argv[2] 15 | inputTable = sys.argv[3] 16 | sc = SparkContext(master, "LoadHive") 17 | hiveCtx = HiveContext(sc) 18 | # Load some data into hive 19 | hiveCtx.sql( 20 | "CREATE TABLE IF NOT EXISTS " + 21 | inputTable + 22 | " (key INT, value STRING)") 23 | hiveCtx.sql( 24 | "LOAD DATA LOCAL INPATH '" + inputFile + "' INTO TABLE " + inputTable) 25 | -------------------------------------------------------------------------------- /src/python/MakeParquetFile.py: -------------------------------------------------------------------------------- 1 | # Createas a parquet file and loads an input file into it 2 | # For input you can use files/favourite_animal.csv as the iput 3 | from pyspark import SparkContext 4 | from pyspark.sql import SQLContext 5 | import json 6 | import sys 7 | 8 | if __name__ == "__main__": 9 | if len(sys.argv) != 4: 10 | print "Error usage: LoadHive [sparkmaster] [inputFile] [parquetfile]" 11 | sys.exit(-1) 12 | master = sys.argv[1] 13 | inputFile = sys.argv[2] 14 | parquetFile = sys.argv[3] 15 | sc = SparkContext(master, "MakeParquetFile") 16 | sqlCtx = SQLContext(sc) 17 | # Load some data into an RDD 18 | rdd = sc.textFile(inputFile).map(lambda l: l.split(",")) 19 | namedRdd = rdd.map(lambda 
r: {"name": r[0], "favouriteAnimal": r[1]}) 20 | schemaNamedRdd = sqlCtx.inferSchema(namedRdd) 21 | # Save it 22 | schemaNamedRdd.saveAsParquetFile(parquetFile) 23 | -------------------------------------------------------------------------------- /src/python/PerKeyAvg.py: -------------------------------------------------------------------------------- 1 | """ 2 | >>> from pyspark.context import SparkContext 3 | >>> sc = SparkContext('local', 'test') 4 | >>> input = [("coffee", 1), ("pandas", 2), ("coffee", 3), ("very", 4)] 5 | >>> b = sc.parallelize(input) 6 | >>> perKeyAvg(b) 7 | 8 | """ 9 | 10 | import sys 11 | 12 | from pyspark import SparkContext 13 | 14 | 15 | def perKeyAvg(nums): 16 | """Compute the avg""" 17 | sumCount = nums.combineByKey((lambda x: (x, 1)), 18 | (lambda x, y: (x[0] + y, x[1] + 1)), 19 | (lambda x, y: (x[0] + y[0], x[1] + y[1]))) 20 | return sumCount.collectAsMap() 21 | 22 | if __name__ == "__main__": 23 | master = "local" 24 | if len(sys.argv) == 2: 25 | master = sys.argv[1] 26 | sc = SparkContext(master, "Sum") 27 | nums = sc.parallelize( 28 | [("coffee", 1), ("pandas", 2), ("coffee", 3), ("very", 4)]) 29 | avg = perKeyAvg(nums) 30 | print avg 31 | -------------------------------------------------------------------------------- /src/python/QueryParquetFile.py: -------------------------------------------------------------------------------- 1 | # Finds the names of people who like pandas from a parquet file 2 | # consisting of name & favouriteAnimal. 3 | # For input you can use the result of MakeParquetFile 4 | from pyspark import SparkContext 5 | from pyspark.sql import SQLContext 6 | import json 7 | import sys 8 | 9 | if __name__ == "__main__": 10 | if len(sys.argv) != 3: 11 | print "Error usage: QueryParquetFile [sparkmaster] [parquetfile]" 12 | sys.exit(-1) 13 | master = sys.argv[1] 14 | parquetFile = sys.argv[2] 15 | sc = SparkContext(master, "QueryParquetFile") 16 | sqlCtx = SQLContext(sc) 17 | # Load some data in from a Parquet file of name & favouriteAnimal 18 | rows = sqlCtx.parquetFile(parquetFile) 19 | names = rows.map(lambda row: row.name) 20 | print "Everyone" 21 | print names.collect() 22 | # Find the panda lovers 23 | tbl = rows.registerAsTable("people") 24 | pandaFriends = sqlCtx.sql("SELECT name FROM people WHERE "+ 25 | "favouriteAnimal = \"panda\"") 26 | print "Panda Friends" 27 | print pandaFriends.map(lambda row: row.name).collect() 28 | -------------------------------------------------------------------------------- /src/python/QueryParuetFile.py: -------------------------------------------------------------------------------- 1 | # Finds the names of people who like pandas from a parquet file 2 | # consisting of name & favouriteAnimal. 
3 | # For input you can use the result of MakeParquetFile 4 | from pyspark import SparkContext 5 | from pyspark.sql import SQLContext 6 | import json 7 | import sys 8 | 9 | if __name__ == "__main__": 10 | if len(sys.argv) != 4: 11 | print "Error usage: QueryParquetFile [sparkmaster] [parquetfile]" 12 | sys.exit(-1) 13 | master = sys.argv[1] 14 | inputFile = sys.argv[2] 15 | parquetFile = sys.argv[3] 16 | sc = SparkContext(master, "MakeParquetFile") 17 | sqlCtx = SQLContext(sc) 18 | # Load some data into an RDD 19 | rdd = sc.textFile(inputFile).map(lambda l: l.split(",")) 20 | namedRdd = rdd.map(lambda r: {"name": r[0], "favouriteAnimal": r[1]}) 21 | schemaNamedRdd = sqlCtx.inferSchema(namedRdd) 22 | # Save it 23 | schemaNamedRdd.saveAsParquetFile(parquetFile) 24 | -------------------------------------------------------------------------------- /src/python/RemoveOutliers.py: -------------------------------------------------------------------------------- 1 | """ 2 | >>> from pyspark.context import SparkContext 3 | >>> sc = SparkContext('local', 'test') 4 | >>> b = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1000]) 5 | >>> sorted(removeOutliers(b).collect() 6 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] 7 | """ 8 | 9 | import sys 10 | import math 11 | 12 | from pyspark import SparkContext 13 | 14 | 15 | def removeOutliers(nums): 16 | """Remove the outliers""" 17 | stats = nums.stats() 18 | stddev = math.sqrt(stats.variance()) 19 | return nums.filter(lambda x: math.fabs(x - stats.mean()) < 3 * stddev) 20 | 21 | if __name__ == "__main__": 22 | master = "local" 23 | if len(sys.argv) == 2: 24 | master = sys.argv[1] 25 | sc = SparkContext(master, "Sum") 26 | nums = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1000]) 27 | output = sorted(removeOutliers(nums).collect()) 28 | for num in output: 29 | print "%i " % (num) 30 | -------------------------------------------------------------------------------- /src/python/SparkSQLTwitter.py: -------------------------------------------------------------------------------- 1 | # A simple demo for working with SparkSQL and Tweets 2 | from pyspark import SparkContext, SparkConf 3 | from pyspark.sql import HiveContext, Row 4 | from pyspark.sql.types import IntegerType 5 | import json 6 | import sys 7 | 8 | if __name__ == "__main__": 9 | inputFile = sys.argv[1] 10 | conf = SparkConf().setAppName("SparkSQLTwitter") 11 | sc = SparkContext() 12 | hiveCtx = HiveContext(sc) 13 | print "Loading tweets from " + inputFile 14 | input = hiveCtx.jsonFile(inputFile) 15 | input.registerTempTable("tweets") 16 | topTweets = hiveCtx.sql("SELECT text, retweetCount FROM tweets ORDER BY retweetCount LIMIT 10") 17 | print topTweets.collect() 18 | topTweetText = topTweets.map(lambda row : row.text) 19 | print topTweetText.collect() 20 | # Make a happy person row 21 | happyPeopleRDD = sc.parallelize([Row(name="holden", favouriteBeverage="coffee")]) 22 | happyPeopleSchemaRDD = hiveCtx.inferSchema(happyPeopleRDD) 23 | happyPeopleSchemaRDD.registerTempTable("happy_people") 24 | # Make a UDF to tell us how long some text is 25 | hiveCtx.registerFunction("strLenPython", lambda x: len(x), IntegerType()) 26 | lengthSchemaRDD = hiveCtx.sql("SELECT strLenPython('text') FROM tweets LIMIT 10") 27 | print lengthSchemaRDD.collect() 28 | sc.stop() 29 | -------------------------------------------------------------------------------- /src/python/WordCount.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from pyspark import SparkContext 4 | 5 | if 
__name__ == "__main__": 6 | master = "local" 7 | if len(sys.argv) == 2: 8 | master = sys.argv[1] 9 | sc = SparkContext(master, "WordCount") 10 | lines = sc.parallelize(["pandas", "i like pandas"]) 11 | result = lines.flatMap(lambda x: x.split(" ")).countByValue() 12 | for key, value in result.iteritems(): 13 | print "%s %i" % (key, value) 14 | --------------------------------------------------------------------------------