├── .gitignore
├── .travis.yml
├── DESCRIPTION
├── LICENSE.md
├── README.md
├── bin
│   ├── datagen.cmd
│   ├── fakelogs.cmd
│   └── fakelogs.sh
├── build-project
├── build.sbt
├── files
│   ├── call_signs2.txt
│   ├── callsign_tbl
│   ├── callsign_tbl_sorted
│   ├── callsigns
│   ├── cqlsh_setup
│   ├── fake_logs
│   │   ├── log1.log
│   │   └── log2.log
│   ├── favourite_animals.csv
│   ├── flumeconf.cfg
│   ├── ham.txt
│   ├── happypandas
│   ├── int_string.csv
│   ├── pandainfo.json
│   ├── spam.txt
│   └── testweet.json
├── mini-complete-example
│   ├── README.md
│   ├── build.sbt
│   ├── pom.xml
│   ├── project
│   │   └── plugins.sbt
│   ├── sbt
│   │   └── sbt
│   └── src
│       └── main
│           ├── java
│           │   └── com
│           │       └── oreilly
│           │           └── learningsparkexamples
│           │               └── mini
│           │                   └── java
│           │                       ├── BasicMap.java
│           │                       └── WordCount.java
│           └── scala
│               └── com
│                   └── oreilly
│                       └── learningsparkexamples
│                           └── mini
│                               └── scala
│                                   ├── BasicMap.scala
│                                   └── WordCount.scala
├── pom.xml
├── project
│   └── plugins.sbt
├── run-all-examples
├── sbt
│   └── sbt
├── setup-project
└── src
    ├── R
    │   └── finddistance.R
    ├── main
    │   ├── java
    │   │   └── com
    │   │       └── oreilly
    │   │           └── learningsparkexamples
    │   │               └── java
    │   │                   ├── BasicAvg.java
    │   │                   ├── BasicAvgMapPartitions.java
    │   │                   ├── BasicAvgWithKryo.java
    │   │                   ├── BasicFlatMap.java
    │   │                   ├── BasicJoinCsv.java
    │   │                   ├── BasicLoadJson.java
    │   │                   ├── BasicLoadSequenceFile.java
    │   │                   ├── BasicLoadWholeCsv.java
    │   │                   ├── BasicMap.java
    │   │                   ├── BasicMapPartitions.java
    │   │                   ├── BasicMapThenFilter.java
    │   │                   ├── BasicMapToDouble.java
    │   │                   ├── BasicQueryCassandra.java
    │   │                   ├── BasicSaveSequenceFile.java
    │   │                   ├── BasicSum.java
    │   │                   ├── CallLog.java
    │   │                   ├── ChapterSixExample.java
    │   │                   ├── HappyPerson.java
    │   │                   ├── IntersectByKey.java
    │   │                   ├── KafkaInput.java
    │   │                   ├── KeyValueMapFilter.java
    │   │                   ├── LoadHive.java
    │   │                   ├── LoadJsonWithSparkSQL.java
    │   │                   ├── MLlib.java
    │   │                   ├── PerKeyAvg.java
    │   │                   ├── RemoveOutliers.java
    │   │                   ├── SparkSQLTwitter.java
    │   │                   ├── StreamingLogInput.java
    │   │                   ├── WordCount.java
    │   │                   └── logs
    │   │                       ├── ApacheAccessLog.java
    │   │                       ├── Flags.java
    │   │                       ├── Functions.java
    │   │                       ├── LogAnalyzerAppMain.java
    │   │                       ├── LogAnalyzerTotal.java
    │   │                       ├── LogAnalyzerWindowed.java
    │   │                       ├── LogStatistics.java
    │   │                       ├── ReadTransferStats.java
    │   │                       └── Renderer.java
    │   ├── protobuf
    │   │   ├── address_book.proto
    │   │   └── places.proto
    │   └── scala
    │       └── com
    │           └── oreilly
    │               └── learningsparkexamples
    │                   └── scala
    │                       ├── BasicAvg.scala
    │                       ├── BasicAvgFromFile.scala
    │                       ├── BasicAvgFromFiles.scala
    │                       ├── BasicAvgMapPartitions.scala
    │                       ├── BasicAvgWithKryo.scala
    │                       ├── BasicFilterUnionCombo.scala
    │                       ├── BasicIntersectByKey.scala
    │                       ├── BasicLoadNums.scala
    │                       ├── BasicLoadSequenceFile.scala
    │                       ├── BasicLoadTextFromFTP.scala
    │                       ├── BasicMap.scala
    │                       ├── BasicMapNoCache.scala
    │                       ├── BasicMapPartitions.scala
    │                       ├── BasicMapThenFilter.scala
    │                       ├── BasicParseCsv.scala
    │                       ├── BasicParseJson.scala
    │                       ├── BasicParseJsonWithJackson.scala
    │                       ├── BasicParseWholeFileCsv.scala
    │                       ├── BasicQueryCassandra.scala
    │                       ├── BasicSaveProtoBuf.scala
    │                       ├── BasicSaveSequenceFile.scala
    │                       ├── BasicStreamingExample.scala
    │                       ├── BasicSum.scala
    │                       ├── ChapterSixExample.scala
    │                       ├── FlumeInput.scala
    │                       ├── KafkaInput.scala
    │                       ├── LoadHive.scala
    │                       ├── LoadJsonWithElephantBird.scala
    │                       ├── LoadJsonWithSparkSQL.scala
    │                       ├── LoadKeyValueTextInput.scala
    │                       ├── LoadSimpleJdbc.scala
    │                       ├── MLlib.scala
    │                       ├── MLlibPipeline.disabled_until_111
    │                       ├── PerKeyAvg.scala
    │                       ├── PipeExample.scala
    │                       ├── RemoveOutliers.scala
    │                       ├── SparkSQLTwitter.scala
    │                       ├── StreamingLogInput.scala
    │                       ├── WordCount.scala
    │                       ├── WriteSimpleDB.scala
    │                       └── logs
    │                           ├── LogAnalyzerAppMain.scala
    │                           ├── LogAnalyzerTotal.scala
    │                           ├── LogAnalyzerWindowed.scala
    │                           └── ReadTransferStats.scala
    ├── perl
    │   └── splitwords.pl
    └── python
        ├── AvgMapPartitions.py
        ├── BasicAvg.py
        ├── BasicFilterMap.py
        ├── BasicKeyValueMapFilter.py
        ├── BasicMap.py
        ├── BasicMapPartitions.py
        ├── BasicSum.py
        ├── ChapterSixExample.py
        ├── IntersectByKey.py
        ├── LoadCsv.py
        ├── LoadHive.py
        ├── LoadJson.py
        ├── MLlib.py
        ├── MakeHiveTable.py
        ├── MakeParquetFile.py
        ├── PerKeyAvg.py
        ├── QueryParquetFile.py
        ├── QueryParuetFile.py
        ├── RemoveOutliers.py
        ├── SparkSQLTwitter.py
        └── WordCount.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *.class
2 | *.log
3 |
4 | # sbt specific
5 | dist/*
6 | target/
7 | lib_managed/
8 | src_managed/
9 | project/boot/
10 | project/plugins/project/
11 | sbt/*.jar
12 | mini-complete-example/sbt/*.jar
13 |
14 | # Scala-IDE specific
15 | .scala_dependencies
16 |
17 | #Emacs
18 | *~
19 |
20 | #ignore the metastore
21 | metastore_db/*
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: scala
2 | scala:
3 | - 2.10.4
4 | # Install R
5 | before_install:
6 | - curl -OL http://raw.github.com/craigcitro/r-travis/master/scripts/travis-tool.sh
7 | - chmod 755 ./travis-tool.sh
8 | - ./travis-tool.sh bootstrap
9 | install:
10 | - ./travis-tool.sh install_deps
11 | before_script:
12 | - ./setup-project
13 | script:
14 | - ./build-project
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: learning-spark-examples
2 | Version: 0.1
3 | Depends: Imap
4 | License: MIT License
5 | Description: Examples for the Learning Spark book.
6 | Title: Examples for the Learning Spark book.
7 | Authors@R: c(person("Holden Karau", role = c("aut", "cre"), email="holden@pigscanfly.ca"))
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | Copyright (C) 2014 Holden Karau and respective authors. The learning spark examples are licensed under the [MIT license](http://opensource.org/licenses/MIT).
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4 |
5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6 |
7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [](https://travis-ci.org/holdenk/learning-spark-examples)
2 | Examples for Learning Spark
3 | ===============
4 | Examples for the Learning Spark book. These examples require a number of libraries and as such have long build files. We have also added a stand-alone example with minimal dependencies and a small build file
5 | in the mini-complete-example directory.
6 |
7 |
8 | These examples have been updated to run against Spark 1.3, so they may
9 | be slightly different from the versions in your copy of "Learning Spark".
10 |
11 | Requirements
12 | ==
13 | * JDK 1.7 or higher
14 | * Scala 2.10.3
15 | - scala-lang.org
16 | * Spark 1.3
17 | * Protobuf compiler
18 | - On Debian you can install it with `sudo apt-get install protobuf-compiler`
19 | * R & the CRAN package Imap are required for the ChapterSixExample
20 | * The Python examples require urllib3
21 |
22 | Python examples
23 | ===
24 |
25 | From Spark, just run `./bin/pyspark ./src/python/[example]`
26 |
27 | Spark Submit
28 | ===
29 |
30 | You can also create an assembly JAR with all of the dependencies for running either the Java or Scala
31 | versions of the code, and then run the job with the `spark-submit` script:
32 |
33 | `./sbt/sbt assembly` OR `mvn package`
34 |
35 | `cd $SPARK_HOME; ./bin/spark-submit --class com.oreilly.learningsparkexamples.[lang].[example] ../learning-spark-examples/target/scala-2.10/learning-spark-examples-assembly-0.0.1.jar`
36 |
37 | [](http://www.jdoqocy.com/click-7645222-11260198?url=http%3A%2F%2Fshop.oreilly.com%2Fproduct%2F0636920028512.do%3Fcmp%3Daf-strata-books-videos-product_cj_9781449358600_%2525zp&cjsku=0636920028512)
38 |
--------------------------------------------------------------------------------
/bin/datagen.cmd:
--------------------------------------------------------------------------------
1 | @echo off
2 | echo 66.249.69.97 - - [24/Sep/2014:22:25:44 +0000] "GET /071300/242153 HTTP/1.1" 404 514 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
3 | ping -n 5 localhost > null
4 | echo 71.19.157.174 - - [24/Sep/2014:22:26:12 +0000] "GET /error HTTP/1.1" 404 505 "-" "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.94 Safari/537.36"
5 | ping -n 5 localhost > null
6 | echo 71.19.157.174 - - [24/Sep/2014:22:26:37 +0000] "GET / HTTP/1.1" 200 18785 "-" "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.94 Safari/537.36"
--------------------------------------------------------------------------------
/bin/fakelogs.cmd:
--------------------------------------------------------------------------------
1 | ncat -l 7777 -k -c datagen.cmd
--------------------------------------------------------------------------------
/bin/fakelogs.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env sh
2 | rm /tmp/logdata
3 | touch /tmp/logdata
4 | tail -f /tmp/logdata | nc -lk 7777 &
5 | TAIL_NC_PID=$!
6 | cat ./files/fake_logs/log1.log >> /tmp/logdata
7 | sleep 5
8 | cat ./files/fake_logs/log2.log >> /tmp/logdata
9 | sleep 1
10 | cat ./files/fake_logs/log1.log >> /tmp/logdata
11 | sleep 2
12 | cat ./files/fake_logs/log1.log >> /tmp/logdata
13 | sleep 3
14 | sleep 20
15 | kill $TAIL_NC_PID
16 |
--------------------------------------------------------------------------------
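The script above replays the fake Apache logs onto local port 7777 so the streaming examples have something to consume. A minimal sketch of the consuming side, essentially what StreamingLogInput.scala does (assumes a SparkConf `conf` for a local master):

```scala
import org.apache.spark.streaming.{Seconds, StreamingContext}

val ssc = new StreamingContext(conf, Seconds(1))
// Read lines from the port fakelogs.sh is feeding and keep only error lines.
val lines = ssc.socketTextStream("localhost", 7777)
val errorLines = lines.filter(_.contains("error"))
errorLines.print()
ssc.start()
ssc.awaitTermination()
```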
/build-project:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env sh
2 | set -e
3 | set -x
4 | # Do our mini example first
5 | cd mini-complete-example
6 | ./sbt/sbt clean compile package
7 | ./sbt/sbt clean
8 | echo $PWD && mvn clean && mvn compile
9 | cd ..
10 | # Run the tests
11 | export SPARK_HOME=./spark-1.3.1-bin-hadoop1/
12 | ./sbt/sbt compile package assembly
13 | echo $?
14 | time ./run-all-examples
15 | echo $?
16 | echo "done"
17 | # Try and build with maven, skip for now
18 | #mvn clean && mvn compile && mvn package
19 |
--------------------------------------------------------------------------------
/build.sbt:
--------------------------------------------------------------------------------
1 | import AssemblyKeys._
2 |
3 | assemblySettings
4 |
5 | name := "learning-spark-examples"
6 |
7 | version := "0.0.1"
8 |
9 | scalaVersion := "2.10.4"
10 |
11 | javacOptions ++= Seq("-source", "1.7", "-target", "1.7")
12 |
13 | // protocol buffer support
14 | seq(sbtprotobuf.ProtobufPlugin.protobufSettings: _*)
15 |
16 | // additional libraries
17 | libraryDependencies ++= Seq(
18 | "org.apache.spark" %% "spark-core" % "1.3.1" % "provided",
19 | "org.apache.spark" %% "spark-sql" % "1.3.1",
20 | "org.apache.spark" %% "spark-hive" % "1.3.1",
21 | "org.apache.spark" %% "spark-streaming" % "1.3.1",
22 | "org.apache.spark" %% "spark-streaming-kafka" % "1.3.1",
23 | "org.apache.spark" %% "spark-streaming-flume" % "1.3.1",
24 | "org.apache.spark" %% "spark-mllib" % "1.3.1",
25 | "org.apache.commons" % "commons-lang3" % "3.0",
26 | "org.eclipse.jetty" % "jetty-client" % "8.1.14.v20131031",
27 | "com.typesafe.play" % "play-json_2.10" % "2.2.1",
28 | "com.fasterxml.jackson.core" % "jackson-databind" % "2.3.3",
29 | "com.fasterxml.jackson.module" % "jackson-module-scala_2.10" % "2.3.3",
30 | "org.elasticsearch" % "elasticsearch-hadoop-mr" % "2.0.0.RC1",
31 | "net.sf.opencsv" % "opencsv" % "2.0",
32 | "com.twitter.elephantbird" % "elephant-bird" % "4.5",
33 | "com.twitter.elephantbird" % "elephant-bird-core" % "4.5",
34 | "com.hadoop.gplcompression" % "hadoop-lzo" % "0.4.17",
35 | "mysql" % "mysql-connector-java" % "5.1.31",
36 | "com.datastax.spark" %% "spark-cassandra-connector" % "1.0.0-rc5",
37 | "com.datastax.spark" %% "spark-cassandra-connector-java" % "1.0.0-rc5",
38 | "com.github.scopt" %% "scopt" % "3.2.0",
39 | "org.scalatest" %% "scalatest" % "2.2.1" % "test",
40 | "com.holdenkarau" %% "spark-testing-base" % "0.0.1" % "test"
41 | )
42 |
43 | resolvers ++= Seq(
44 | "JBoss Repository" at "http://repository.jboss.org/nexus/content/repositories/releases/",
45 | "Spray Repository" at "http://repo.spray.cc/",
46 | "Cloudera Repository" at "https://repository.cloudera.com/artifactory/cloudera-repos/",
47 | "Akka Repository" at "http://repo.akka.io/releases/",
48 | "Twitter4J Repository" at "http://twitter4j.org/maven2/",
49 | "Apache HBase" at "https://repository.apache.org/content/repositories/releases",
50 | "Twitter Maven Repo" at "http://maven.twttr.com/",
51 | "scala-tools" at "https://oss.sonatype.org/content/groups/scala-tools",
52 | "Typesafe repository" at "http://repo.typesafe.com/typesafe/releases/",
53 | "Second Typesafe repo" at "http://repo.typesafe.com/typesafe/maven-releases/",
54 | "Mesosphere Public Repository" at "http://downloads.mesosphere.io/maven",
55 | Resolver.sonatypeRepo("public")
56 | )
57 |
58 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { (old) =>
59 | {
60 | case m if m.toLowerCase.endsWith("manifest.mf") => MergeStrategy.discard
61 | case m if m.startsWith("META-INF") => MergeStrategy.discard
62 | case PathList("javax", "servlet", xs @ _*) => MergeStrategy.first
63 | case PathList("org", "apache", xs @ _*) => MergeStrategy.first
64 | case PathList("org", "jboss", xs @ _*) => MergeStrategy.first
65 | case "about.html" => MergeStrategy.rename
66 | case "reference.conf" => MergeStrategy.concat
67 | case _ => MergeStrategy.first
68 | }
69 | }
70 |
--------------------------------------------------------------------------------
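A note on the `%%` operator used throughout the dependency list: it appends the project's Scala binary version to the artifact name, so with `scalaVersion := "2.10.4"` the two declarations below (shown only as an illustration, not part of the build file) resolve to the same artifact:

```scala
libraryDependencies += "org.apache.spark" %% "spark-core" % "1.3.1" % "provided"
libraryDependencies += "org.apache.spark" % "spark-core_2.10" % "1.3.1" % "provided"
```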
/files/call_signs2.txt:
--------------------------------------------------------------------------------
1 | KK6JKQ
2 |
--------------------------------------------------------------------------------
/files/callsigns:
--------------------------------------------------------------------------------
1 | W8PAL
2 | KK6JKQ
3 | W6BB
4 | VE3UOW
5 | VE2CUA
6 | VE2UN
7 | OH2TI
8 | GB1MIR
9 | K2AMH
10 | UA1LO
11 | N7ICE
12 |
--------------------------------------------------------------------------------
/files/cqlsh_setup:
--------------------------------------------------------------------------------
1 | DROP KEYSPACE IF EXISTS test;
2 | CREATE KEYSPACE test WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1 };
3 | CREATE TABLE test.kv(key text PRIMARY KEY, value int);
4 | INSERT INTO test.kv(key, value) VALUES ('panda', 1);
5 | INSERT INTO test.kv(key, value) VALUES ('notpanda', 0);
--------------------------------------------------------------------------------
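These CQL statements create the small test.kv table that the BasicQueryCassandra examples query. A minimal sketch of reading it back with the spark-cassandra-connector pulled in by build.sbt (assumes a SparkContext `sc` whose conf sets `spark.cassandra.connection.host` to the Cassandra node):

```scala
import com.datastax.spark.connector._ // adds cassandraTable() to SparkContext

// Read the whole table as CassandraRow objects and print it on the driver.
val data = sc.cassandraTable("test", "kv")
data.collect().foreach(println)
```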
/files/fake_logs/log1.log:
--------------------------------------------------------------------------------
1 | 66.249.69.97 - - [24/Sep/2014:22:25:44 +0000] "GET /071300/242153 HTTP/1.1" 404 514 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
2 | 71.19.157.174 - - [24/Sep/2014:22:26:12 +0000] "GET /error HTTP/1.1" 404 505 "-" "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.94 Safari/537.36"
3 | 71.19.157.174 - - [24/Sep/2014:22:26:12 +0000] "GET /favicon.ico HTTP/1.1" 200 1713 "-" "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.94 Safari/537.36"
4 | 71.19.157.174 - - [24/Sep/2014:22:26:37 +0000] "GET / HTTP/1.1" 200 18785 "-" "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.94 Safari/537.36"
5 | 71.19.157.174 - - [24/Sep/2014:22:26:37 +0000] "GET /jobmineimg.php?q=m HTTP/1.1" 200 222 "http://www.holdenkarau.com/" "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.94 Safari/537.36"
6 |
--------------------------------------------------------------------------------
/files/fake_logs/log2.log:
--------------------------------------------------------------------------------
1 | 71.19.157.174 - - [24/Sep/2014:22:26:12 +0000] "GET /error78978 HTTP/1.1" 404 505 "-" "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.94 Safari/537.36"
2 |
--------------------------------------------------------------------------------
/files/favourite_animals.csv:
--------------------------------------------------------------------------------
1 | holden,panda
2 | notholden,notpanda
3 | spark,bear
--------------------------------------------------------------------------------
/files/flumeconf.cfg:
--------------------------------------------------------------------------------
1 | # Name the components on this agent
2 | panda.sources = r1
3 | panda.sinks = avroSink
4 | panda.channels = c1
5 |
6 | # avro sink
7 | panda.sinks = avroSink
8 | panda.sinks.avroSink.type = avro
9 | panda.sinks.avroSink.channel = memoryChannel
10 | panda.sinks.avroSink.hostname = localhost
11 | panda.sinks.avroSink.port = 7788
12 |
13 | # input
14 | panda.sources.r1.type = netcat
15 | panda.sources.r1.bind = localhost
16 | panda.sources.r1.port = 44444
17 |
18 | # Use a channel which buffers events in memory
19 | panda.channels.c1.type = memory
20 | panda.channels.c1.capacity = 1000
21 | panda.channels.c1.transactionCapacity = 100
22 |
23 | # Bind the source and sink to the channel
24 | panda.sources.r1.channels = c1
25 | panda.sinks.avroSink.channel = c1
--------------------------------------------------------------------------------
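This agent takes netcat input on port 44444 and forwards events to an Avro sink on localhost:7788, which is where FlumeInput.scala attaches its receiver. A minimal sketch of that receiving side (assumes a StreamingContext `ssc`):

```scala
import org.apache.spark.streaming.flume.FlumeUtils

// Receive Avro events from the sink defined in flumeconf.cfg and print the event bodies.
val events = FlumeUtils.createStream(ssc, "localhost", 7788)
events.map(e => new String(e.event.getBody().array())).print()
ssc.start()
ssc.awaitTermination()
```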
/files/ham.txt:
--------------------------------------------------------------------------------
1 | Dear Spark Learner, Thanks so much for attending the Spark Summit 2014! Check out videos of talks from the summit at ...
2 | Hi Mom, Apologies for being late about emailing and forgetting to send you the package. I hope you and bro have been ...
3 | Wow, hey Fred, just heard about the Spark petabyte sort. I think we need to take time to try it out immediately ...
4 | Hi Spark user list, This is my first question to this list, so thanks in advance for your help! I tried running ...
5 | Thanks Tom for your email. I need to refer you to Alice for this one. I haven't yet figured out that part either ...
6 | Good job yesterday! I was attending your talk, and really enjoyed it. I want to try out GraphX ...
7 | Summit demo got whoops from audience! Had to let you know. --Joe
8 |
--------------------------------------------------------------------------------
/files/happypandas:
--------------------------------------------------------------------------------
1 | coffee 1
2 | coffee 2
3 | pandas 3
4 | happy 4
--------------------------------------------------------------------------------
/files/int_string.csv:
--------------------------------------------------------------------------------
1 | 1panda
2 | 2pandas
3 | 3pandas
--------------------------------------------------------------------------------
/files/pandainfo.json:
--------------------------------------------------------------------------------
1 | {"name":"Sparky The Bear", "lovesPandas":true}
2 | {"name": "Holden"}
3 | {"name":"Sparky The Bear", "lovesPandas":true, "knows":{"friends": ["holden"]}}
4 |
--------------------------------------------------------------------------------
/files/spam.txt:
--------------------------------------------------------------------------------
1 | Dear sir, I am a Prince in a far kingdom you have not heard of. I want to send you money via wire transfer so please ...
2 | Get Viagra real cheap! Send money right away to ...
3 | Oh my gosh you can be really strong too with these drugs found in the rainforest. Get them cheap right now ...
4 | YOUR COMPUTER HAS BEEN INFECTED! YOU MUST RESET YOUR PASSWORD. Reply to this email with your password and SSN ...
5 | THIS IS NOT A SCAM! Send money and get access to awesome stuff really cheap and never have to ...
6 |
--------------------------------------------------------------------------------
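ham.txt and spam.txt above are the training data for the MLlib examples. A condensed sketch of how they are typically used, mirroring the spam classifier in MLlib.scala (assumes a SparkContext `sc` run from the repository root):

```scala
import org.apache.spark.mllib.classification.LogisticRegressionWithSGD
import org.apache.spark.mllib.feature.HashingTF
import org.apache.spark.mllib.regression.LabeledPoint

val spam = sc.textFile("files/spam.txt")
val ham = sc.textFile("files/ham.txt")
// Map each message to a sparse term-frequency vector.
val tf = new HashingTF(numFeatures = 10000)
val spamFeatures = spam.map(email => tf.transform(email.split(" ")))
val hamFeatures = ham.map(email => tf.transform(email.split(" ")))
// Label spam 1 and ham 0, then train a logistic regression model.
val trainingData = spamFeatures.map(features => LabeledPoint(1, features))
  .union(hamFeatures.map(features => LabeledPoint(0, features)))
trainingData.cache()
val model = new LogisticRegressionWithSGD().run(trainingData)
```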
/files/testweet.json:
--------------------------------------------------------------------------------
1 | {"createdAt":"Nov 4, 2014 4:56:59 PM","id":529799371026485248,"text":"Adventures With Coffee, Code, and Writing.","source":"\u003ca href\u003d\"http://twitter.com\" rel\u003d\"nofollow\"\u003eTwitter Web Client\u003c/a\u003e","isTruncated":false,"inReplyToStatusId":-1,"inReplyToUserId":-1,"isFavorited":false,"retweetCount":0,"isPossiblySensitive":false,"contributorsIDs":[],"userMentionEntities":[],"urlEntities":[],"hashtagEntities":[],"mediaEntities":[],"currentUserRetweetId":-1,"user":{"id":15594928,"name":"Holden Karau","screenName":"holdenkarau","location":"","description":"","descriptionURLEntities":[],"isContributorsEnabled":false,"profileImageUrl":"http://pbs.twimg.com/profile_images/3005696115/2036374bbadbed85249cdd50aac6e170_normal.jpeg","profileImageUrlHttps":"https://pbs.twimg.com/profile_images/3005696115/2036374bbadbed85249cdd50aac6e170_normal.jpeg","isProtected":false,"followersCount":1231,"profileBackgroundColor":"C0DEED","profileTextColor":"333333","profileLinkColor":"0084B4","profileSidebarFillColor":"DDEEF6","profileSidebarBorderColor":"FFFFFF","profileUseBackgroundImage":true,"showAllInlineMedia":false,"friendsCount":600,"createdAt":"Aug 5, 2011 9:42:44 AM","favouritesCount":1095,"utcOffset":-3,"profileBackgroundImageUrl":"","profileBackgroundImageUrlHttps":"","profileBannerImageUrl":"","profileBackgroundTiled":true,"lang":"en","statusesCount":6234,"isGeoEnabled":true,"isVerified":false,"translator":false,"listedCount":0,"isFollowRequestSent":false}}
2 |
--------------------------------------------------------------------------------
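testweet.json holds a single tweet used by the SparkSQLTwitter examples. A minimal sketch of loading and querying it with the Spark 1.3 SQL API (assumes a SparkContext `sc`; the field names come from the JSON above):

```scala
import org.apache.spark.sql.hive.HiveContext

val hiveCtx = new HiveContext(sc)
// Load the JSON, infer a schema, and expose it as a temporary table.
val tweets = hiveCtx.jsonFile("files/testweet.json")
tweets.registerTempTable("tweets")
val results = hiveCtx.sql("SELECT user.name, text, retweetCount FROM tweets")
results.collect().foreach(println)
```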
/mini-complete-example/README.md:
--------------------------------------------------------------------------------
1 | Mini Examples for Spark
2 | ===============
3 | This directory contains a complete stand-alone example with both Maven and SBT build tools.
4 |
--------------------------------------------------------------------------------
/mini-complete-example/build.sbt:
--------------------------------------------------------------------------------
1 | name := "learning-spark-mini-example"
2 |
3 | version := "0.0.1"
4 |
5 | scalaVersion := "2.10.4"
6 |
7 | // additional libraries
8 | libraryDependencies ++= Seq(
9 | "org.apache.spark" %% "spark-core" % "1.1.0" % "provided"
10 | )
11 |
--------------------------------------------------------------------------------
/mini-complete-example/pom.xml:
--------------------------------------------------------------------------------
1 | <project>
2 |   <groupId>com.oreilly.learningsparkexamples.mini</groupId>
3 |   <artifactId>learning-spark-mini-example</artifactId>
4 |   <modelVersion>4.0.0</modelVersion>
5 |   <name>example</name>
6 |   <packaging>jar</packaging>
7 |   <version>0.0.1</version>
8 |   <dependencies>
9 |     <dependency>
10 |       <groupId>org.apache.spark</groupId>
11 |       <artifactId>spark-core_2.10</artifactId>
12 |       <version>1.1.0</version>
13 |       <scope>provided</scope>
14 |     </dependency>
15 |   </dependencies>
16 |   <properties>
17 |     <java.version>1.6</java.version>
18 |   </properties>
19 |   <build>
20 |     <pluginManagement>
21 |       <plugins>
22 |         <plugin>
23 |           <groupId>org.apache.maven.plugins</groupId>
24 |           <artifactId>maven-compiler-plugin</artifactId>
25 |           <version>3.1</version>
26 |           <configuration>
27 |             <source>${java.version}</source>
28 |             <target>${java.version}</target>
29 |           </configuration>
30 |         </plugin>
31 |       </plugins>
32 |     </pluginManagement>
33 |   </build>
34 | </project>
--------------------------------------------------------------------------------
/mini-complete-example/project/plugins.sbt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/holdenk/learning-spark-examples/6862949df6c29c149ffcbedfd5948fe2ab5e2619/mini-complete-example/project/plugins.sbt
--------------------------------------------------------------------------------
/mini-complete-example/sbt/sbt:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | #
4 | # Licensed to the Apache Software Foundation (ASF) under one or more
5 | # contributor license agreements. See the NOTICE file distributed with
6 | # this work for additional information regarding copyright ownership.
7 | # The ASF licenses this file to You under the Apache License, Version 2.0
8 | # (the "License"); you may not use this file except in compliance with
9 | # the License. You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 |
20 | # This script launches sbt for this project. If present it uses the system
21 | # version of sbt. If there is no system version of sbt it attempts to download
22 | # sbt locally.
23 | SBT_VERSION=0.13.7
24 | URL1=http://typesafe.artifactoryonline.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar
25 | URL2=http://repo.typesafe.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar
26 | JAR=sbt/sbt-launch-${SBT_VERSION}.jar
27 |
28 | # Download sbt launch jar if it hasn't been downloaded yet
29 | if [ ! -f ${JAR} ]; then
30 | # Download
31 | printf "Attempting to fetch sbt\n"
32 | JAR_DL=${JAR}.part
33 | if hash wget 2>/dev/null; then
34 | (wget --progress=bar ${URL1} -O ${JAR_DL} || wget --progress=bar ${URL2} -O ${JAR_DL}) && mv ${JAR_DL} ${JAR}
35 | elif hash curl 2>/dev/null; then
36 | (curl -L --progress=bar ${URL1} -O ${JAR_DL} || curl -L --progress=bar ${URL2} -O ${JAR_DL}) && mv ${JAR_DL} ${JAR}
37 | else
38 | printf "You do not have curl or wget installed, please install sbt manually from http://www.scala-sbt.org/\n"
39 | exit -1
40 | fi
41 | fi
42 | if [ ! -f ${JAR} ]; then
43 | # We failed to download
44 | printf "Our attempt to download sbt locally to ${JAR} failed. Please install sbt manually from http://www.scala-sbt.org/\n"
45 | exit -1
46 | fi
47 | printf "Launching sbt from ${JAR}\n"
48 | java \
49 | -Xmx1400m -XX:MaxPermSize=1024m -XX:ReservedCodeCacheSize=256m \
50 | -jar ${JAR} \
51 | "$@"
52 |
--------------------------------------------------------------------------------
/mini-complete-example/src/main/java/com/oreilly/learningsparkexamples/mini/java/BasicMap.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Illustrates a simple map in Java
3 | */
4 | package com.oreilly.learningsparkexamples.mini.java;
5 |
6 | import java.util.Arrays;
7 | import java.util.List;
8 |
9 | import org.apache.commons.lang.StringUtils;
10 |
11 | import org.apache.spark.api.java.JavaRDD;
12 | import org.apache.spark.api.java.JavaSparkContext;
13 | import org.apache.spark.api.java.function.Function;
14 |
15 | public class BasicMap {
16 | public static void main(String[] args) throws Exception {
17 | String master;
18 | if (args.length > 0) {
19 | master = args[0];
20 | } else {
21 | master = "local";
22 | }
23 | JavaSparkContext sc = new JavaSparkContext(
24 | master, "basicmap", System.getenv("SPARK_HOME"), System.getenv("JARS"));
25 | JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4));
26 | JavaRDD<Integer> result = rdd.map(
27 | new Function<Integer, Integer>() { public Integer call(Integer x) { return x*x;}});
28 | System.out.println(StringUtils.join(result.collect(), ","));
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/mini-complete-example/src/main/java/com/oreilly/learningsparkexamples/mini/java/WordCount.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Illustrates a wordcount in Java
3 | */
4 | package com.oreilly.learningsparkexamples.mini.java;
5 |
6 | import java.util.Arrays;
7 | import java.util.List;
8 | import java.lang.Iterable;
9 |
10 | import scala.Tuple2;
11 |
12 | import org.apache.commons.lang.StringUtils;
13 |
14 | import org.apache.spark.SparkConf;
15 | import org.apache.spark.api.java.JavaRDD;
16 | import org.apache.spark.api.java.JavaPairRDD;
17 | import org.apache.spark.api.java.JavaSparkContext;
18 | import org.apache.spark.api.java.function.FlatMapFunction;
19 | import org.apache.spark.api.java.function.Function2;
20 | import org.apache.spark.api.java.function.PairFunction;
21 |
22 |
23 | public class WordCount {
24 | public static void main(String[] args) throws Exception {
25 | String inputFile = args[0];
26 | String outputFile = args[1];
27 | // Create a Java Spark Context.
28 | SparkConf conf = new SparkConf().setAppName("wordCount");
29 | JavaSparkContext sc = new JavaSparkContext(conf);
30 | // Load our input data.
31 | JavaRDD<String> input = sc.textFile(inputFile);
32 | // Split up into words.
33 | JavaRDD<String> words = input.flatMap(
34 | new FlatMapFunction<String, String>() {
35 | public Iterable<String> call(String x) {
36 | return Arrays.asList(x.split(" "));
37 | }});
38 | // Transform into word and count.
39 | JavaPairRDD<String, Integer> counts = words.mapToPair(
40 | new PairFunction<String, String, Integer>(){
41 | public Tuple2<String, Integer> call(String x){
42 | return new Tuple2(x, 1);
43 | }}).reduceByKey(new Function2<Integer, Integer, Integer>(){
44 | public Integer call(Integer x, Integer y){ return x + y;}});
45 | // Save the word count back out to a text file, causing evaluation.
46 | counts.saveAsTextFile(outputFile);
47 | }
48 | }
49 |
--------------------------------------------------------------------------------
/mini-complete-example/src/main/scala/com/oreilly/learningsparkexamples/mini/scala/BasicMap.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Illustrates a simple map in Scala
3 | */
4 | package com.oreilly.learningsparkexamples.mini.scala
5 |
6 | import org.apache.spark._
7 |
8 | object BasicMap {
9 | def main(args: Array[String]) {
10 | val master = args.length match {
11 | case x: Int if x > 0 => args(0)
12 | case _ => "local"
13 | }
14 | val sc = new SparkContext(master, "BasicMap", System.getenv("SPARK_HOME"))
15 | val input = sc.parallelize(List(1,2,3,4))
16 | val result = input.map(x => x*x)
17 | println(result.collect().mkString(","))
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
/mini-complete-example/src/main/scala/com/oreilly/learningsparkexamples/mini/scala/WordCount.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Illustrates flatMap + countByValue for wordcount.
3 | */
4 | package com.oreilly.learningsparkexamples.mini.scala
5 |
6 | import org.apache.spark._
7 | import org.apache.spark.SparkContext._
8 |
9 | object WordCount {
10 | def main(args: Array[String]) {
11 | val inputFile = args(0)
12 | val outputFile = args(1)
13 | val conf = new SparkConf().setAppName("wordCount")
14 | // Create a Scala Spark Context.
15 | val sc = new SparkContext(conf)
16 | // Load our input data.
17 | val input = sc.textFile(inputFile)
18 | // Split up into words.
19 | val words = input.flatMap(line => line.split(" "))
20 | // Transform into word and count.
21 | val counts = words.map(word => (word, 1)).reduceByKey{case (x, y) => x + y}
22 | // Save the word count back out to a text file, causing evaluation.
23 | counts.saveAsTextFile(outputFile)
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
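The header comment mentions countByValue, while the code above uses the reduceByKey formulation. For contrast, a sketch of the countByValue variant, which returns the counts to the driver as a local Map instead of saving an RDD (assumes the same `sc` and `inputFile`):

```scala
// Count words on the driver with countByValue().
val wordCounts = sc.textFile(inputFile)
  .flatMap(line => line.split(" "))
  .countByValue()
wordCounts.foreach { case (word, count) => println(s"$word: $count") }
```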
/pom.xml:
--------------------------------------------------------------------------------
1 | <project>
2 |   <groupId>com.oreilly.learningsparkexamples</groupId>
3 |   <artifactId>java</artifactId>
4 |   <modelVersion>4.0.0</modelVersion>
5 |   <name>examples</name>
6 |   <packaging>jar</packaging>
7 |   <version>0.0.2</version>
8 |   <repositories>
9 |     <repository>
10 |       <id>Akka repository</id>
11 |       <url>http://repo.akka.io/releases</url>
12 |     </repository>
13 |     <repository>
14 |       <id>scala-tools</id>
15 |       <url>https://oss.sonatype.org/content/groups/scala-tools</url>
16 |     </repository>
17 |     <repository>
18 |       <id>apache</id>
19 |       <url>https://repository.apache.org/content/repositories/releases</url>
20 |     </repository>
21 |     <repository>
22 |       <id>twitter</id>
23 |       <url>http://maven.twttr.com/</url>
24 |     </repository>
25 |     <repository>
26 |       <id>central2</id>
27 |       <url>http://central.maven.org/maven2/</url>
28 |     </repository>
29 |   </repositories>
30 |   <dependencies>
31 |     <dependency>
32 |       <groupId>org.apache.spark</groupId>
33 |       <artifactId>spark-core_2.10</artifactId>
34 |       <version>1.3.1</version>
35 |       <scope>provided</scope>
36 |     </dependency>
37 |     <dependency>
38 |       <groupId>org.apache.spark</groupId>
39 |       <artifactId>spark-sql_2.10</artifactId>
40 |       <version>1.3.1</version>
41 |       <scope>provided</scope>
42 |     </dependency>
43 |     <dependency>
44 |       <groupId>org.apache.spark</groupId>
45 |       <artifactId>spark-hive_2.10</artifactId>
46 |       <version>1.3.1</version>
47 |       <scope>provided</scope>
48 |     </dependency>
49 |     <dependency>
50 |       <groupId>org.apache.spark</groupId>
51 |       <artifactId>spark-streaming_2.10</artifactId>
52 |       <version>1.3.1</version>
53 |     </dependency>
54 |     <dependency>
55 |       <groupId>org.apache.spark</groupId>
56 |       <artifactId>spark-streaming-kafka_2.10</artifactId>
57 |       <version>1.3.1</version>
58 |     </dependency>
59 |     <dependency>
60 |       <groupId>org.apache.spark</groupId>
61 |       <artifactId>spark-mllib</artifactId>
62 |       <version>1.3.1</version>
63 |     </dependency>
64 |     <dependency>
65 |       <groupId>com.datastax.spark</groupId>
66 |       <artifactId>spark-cassandra-connector</artifactId>
67 |       <version>1.0.0-rc5</version>
68 |     </dependency>
69 |     <dependency>
70 |       <groupId>com.datastax.spark</groupId>
71 |       <artifactId>spark-cassandra-connector-java</artifactId>
72 |       <version>1.0.0-rc5</version>
73 |     </dependency>
74 |     <dependency>
75 |       <groupId>org.elasticsearch</groupId>
76 |       <artifactId>elasticsearch-hadoop-mr</artifactId>
77 |       <version>2.0.0.RC1</version>
78 |     </dependency>
79 |     <dependency>
80 |       <groupId>org.eclipse.jetty</groupId>
81 |       <artifactId>jetty-client</artifactId>
82 |       <version>8.1.14.v20131031</version>
83 |     </dependency>
84 |     <dependency>
85 |       <groupId>com.fasterxml.jackson.core</groupId>
86 |       <artifactId>jackson-databind</artifactId>
87 |       <version>2.3.3</version>
88 |     </dependency>
89 |     <dependency>
90 |       <groupId>org.apache.commons</groupId>
91 |       <artifactId>commons-lang3</artifactId>
92 |       <version>3.0</version>
93 |     </dependency>
94 |     <dependency>
95 |       <groupId>net.sf.opencsv</groupId>
96 |       <artifactId>opencsv</artifactId>
97 |       <version>2.0</version>
98 |     </dependency>
99 |     <dependency>
100 |       <groupId>org.scalatest</groupId>
101 |       <artifactId>scalatest_${scala.binary.version}</artifactId>
102 |       <version>2.2.1</version>
103 |     </dependency>
104 |   </dependencies>
105 |   <properties>
106 |     <java.version>1.7</java.version>
107 |   </properties>
108 |   <build>
109 |     <pluginManagement>
110 |       <plugins>
111 |         <plugin>
112 |           <groupId>org.apache.maven.plugins</groupId>
113 |           <artifactId>maven-compiler-plugin</artifactId>
114 |           <version>3.1</version>
115 |           <configuration>
116 |             <source>${java.version}</source>
117 |             <target>${java.version}</target>
118 |           </configuration>
119 |         </plugin>
120 |         <plugin>
121 |           <groupId>org.apache.maven.plugins</groupId>
122 |           <artifactId>maven-assembly-plugin</artifactId>
123 |           <version>2.2.2</version>
124 |           <configuration>
125 |             <descriptors>
126 |               <descriptor>src/main/assembly/assembly.xml</descriptor>
127 |             </descriptors>
128 |           </configuration>
129 |         </plugin>
130 |       </plugins>
131 |     </pluginManagement>
132 |   </build>
133 | </project>
--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
1 | resolvers += Resolver.url("artifactory", url("http://scalasbt.artifactoryonline.com/scalasbt/sbt-plugin-releases"))(Resolver.ivyStylePatterns)
2 |
3 | resolvers += "Typesafe Repository" at "http://repo.typesafe.com/typesafe/releases/"
4 |
5 | resolvers += "Spray Repository" at "http://repo.spray.cc/"
6 |
7 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.10.2")
8 |
9 | addSbtPlugin("com.github.gseitz" % "sbt-protobuf" % "0.3.3")
10 |
--------------------------------------------------------------------------------
/run-all-examples:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # This script is used to run all of the examples. Mostly to be used by travis for testing
3 | # Output the commands we run
4 | set -x
5 | # If any command fails, fail
6 | set -e
7 | # Build everything
8 | ./sbt/sbt compile package assembly > sbtlog || (echo "sbt failed" && cat ./sbtlog && exit 1)
9 | KAFKA_ROOT=./kafka_2.9.2-0.8.1.1
10 | SPARK_SUBMIT_SCRIPT=$SPARK_HOME/bin/spark-submit
11 | ASSEMBLY_JAR=./target/scala-2.10/learning-spark-examples-assembly-0.0.1.jar
12 | # Mini cleanup
13 | rm -rf /tmp/py; mkdir -p /tmp/py
14 | rm -rf /tmp/java; mkdir -p /tmp/java
15 | rm -rf /tmp/scala; mkdir -p /tmp/scala
16 | # setup cassandra
17 | # cqlsh --file ./files/cqlsh_setup &
18 | # Scala
19 | echo "Running Scala programs"
20 | $SPARK_SUBMIT_SCRIPT --class com.oreilly.learningsparkexamples.scala.LoadJsonWithSparkSQL $ASSEMBLY_JAR local ./files/pandainfo.json
21 | $SPARK_SUBMIT_SCRIPT --class com.oreilly.learningsparkexamples.scala.ChapterSixExample $ASSEMBLY_JAR local ./files/callsigns ./files/callsigns /tmp/scala/ch6out
22 | TWITTER_DATA=./files/testweet.json
23 | $SPARK_SUBMIT_SCRIPT --class com.oreilly.learningsparkexamples.scala.SparkSQLTwitter $ASSEMBLY_JAR "$TWITTER_DATA" /tmp/scala/tweetout
24 | #$SPARK_SUBMIT_SCRIPT --class com.oreilly.learningsparkexamples.scala.BasicQueryCassandra $ASSEMBLY_JAR local localhost
25 | echo "Running Scala streaming program"
26 | ./bin/fakelogs.sh &
27 | sleep 1
28 | $SPARK_SUBMIT_SCRIPT --class com.oreilly.learningsparkexamples.scala.StreamingLogInput $ASSEMBLY_JAR local[4]
29 | echo "Running Scala Kafka streaming example"
30 | $SPARK_SUBMIT_SCRIPT --master local[4] --class com.oreilly.learningsparkexamples.scala.KafkaInput $ASSEMBLY_JAR localhost:2181 spark-readers pandas 1 &
31 | KAFKA_PID=$!
32 | sleep 1
33 | echo "panda\nerror panda" | $KAFKA_ROOT/bin/kafka-console-producer.sh --broker-list localhost:9092 --topic pandas
34 | wait $KAFKA_PID
35 | echo "Running Scala Flume example"
36 | $SPARK_SUBMIT_SCRIPT --master local[4] --class com.oreilly.learningsparkexamples.scala.FlumeInput $ASSEMBLY_JAR localhost 7788 &
37 | FLUME_PID=$!
38 | sleep 1
39 | echo "panda\nerror panda\n" | nc localhost 44444
40 | sleep 3
41 | echo "panda2\nerror panda2\n" | nc localhost 44444
42 | wait $FLUME_PID
43 | # Python
44 | echo "Running Python programs"
45 | $SPARK_SUBMIT_SCRIPT ./src/python/AvgMapPartitions.py local
46 | $SPARK_SUBMIT_SCRIPT ./src/python/BasicAvg.py local
47 | $SPARK_SUBMIT_SCRIPT ./src/python/BasicFilterMap.py local
48 | $SPARK_SUBMIT_SCRIPT ./src/python/BasicKeyValueMapFilter.py local
49 | $SPARK_SUBMIT_SCRIPT ./src/python/BasicMapPartitions.py local
50 | $SPARK_SUBMIT_SCRIPT ./src/python/BasicMap.py local
51 | $SPARK_SUBMIT_SCRIPT ./src/python/ChapterSixExample.py local ./files/callsigns /tmp/py/pandaout
52 | $SPARK_SUBMIT_SCRIPT ./src/python/SparkSQLTwitter.py ./files/testweet.json /tmp/py/tweetout
53 | $SPARK_SUBMIT_SCRIPT ./src/python/LoadCsv.py local ./files/favourite_animals.csv /tmp/py/panda_lovers.csv
54 | $SPARK_SUBMIT_SCRIPT ./src/python/MakeHiveTable.py local ./files/int_string.csv pandaplural
55 | # Temporarily disabled due to API changes
56 | #$SPARK_SUBMIT_SCRIPT ./src/python/LoadHive.py local pandaplural
57 | $SPARK_SUBMIT_SCRIPT ./src/python/LoadJson.py local ./files/pandainfo.json /tmp/py/loadjsonout
58 | $SPARK_SUBMIT_SCRIPT ./src/python/PerKeyAvg.py local
59 | $SPARK_SUBMIT_SCRIPT ./src/python/RemoveOutliers.py local
60 | $SPARK_SUBMIT_SCRIPT ./src/python/WordCount.py local
61 | $SPARK_SUBMIT_SCRIPT ./src/python/MakeParquetFile.py local ./files/favourite_animals.csv /tmp/py/favouriteanimal_parquet
62 | $SPARK_SUBMIT_SCRIPT ./src/python/QueryParquetFile.py local /tmp/py/favouriteanimal_parquet
63 |
64 | # Java
65 | echo "Running Java programs"
66 | $SPARK_SUBMIT_SCRIPT --class com.oreilly.learningsparkexamples.java.LoadJsonWithSparkSQL $ASSEMBLY_JAR local ./files/pandainfo.json
67 | $SPARK_SUBMIT_SCRIPT --class com.oreilly.learningsparkexamples.java.ChapterSixExample $ASSEMBLY_JAR local ./files/callsigns ./files/callsigns /tmp/java/ch6out
68 | ./sbt/sbt assembly && $SPARK_SUBMIT_SCRIPT --class com.oreilly.learningsparkexamples.java.SparkSQLTwitter $ASSEMBLY_JAR ./files/testweet.json /tmp/java/tweetout
69 | #$SPARK_SUBMIT_SCRIPT --class com.oreilly.learningsparkexamples.java.BasicQueryCassandra $ASSEMBLY_JAR local localhost
70 | echo "Running Java streaming program"
71 | ./bin/fakelogs.sh &
72 | sleep 1
73 | $SPARK_SUBMIT_SCRIPT --class com.oreilly.learningsparkexamples.java.StreamingLogInput $ASSEMBLY_JAR local[4]
74 | sleep 5
75 | echo "Running Java Kafka streaming example"
76 | $SPARK_SUBMIT_SCRIPT --class com.oreilly.learningsparkexamples.java.KafkaInput $ASSEMBLY_JAR localhost:2181 spark-java-readers
77 | echo "panda\nerror panda" | $KAFKA_ROOT/bin/kafka-console-producer.sh --broker-list localhost:9092 --topic pandas
78 |
79 | echo "Done running all programs :)"
80 |
--------------------------------------------------------------------------------
/sbt/sbt:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | #
4 | # Licensed to the Apache Software Foundation (ASF) under one or more
5 | # contributor license agreements. See the NOTICE file distributed with
6 | # this work for additional information regarding copyright ownership.
7 | # The ASF licenses this file to You under the Apache License, Version 2.0
8 | # (the "License"); you may not use this file except in compliance with
9 | # the License. You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 |
20 | # This script launches sbt for this project. If present it uses the system
21 | # version of sbt. If there is no system version of sbt it attempts to download
22 | # sbt locally.
23 | SBT_VERSION=0.13.7
24 | URL1=http://typesafe.artifactoryonline.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar
25 | URL2=http://repo.typesafe.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar
26 | JAR=sbt/sbt-launch-${SBT_VERSION}.jar
27 |
28 | # Download sbt launch jar if it hasn't been downloaded yet
29 | if [ ! -f ${JAR} ]; then
30 | # Download
31 | printf "Attempting to fetch sbt\n"
32 | JAR_DL=${JAR}.part
33 | if hash wget 2>/dev/null; then
34 | (wget --progress=bar ${URL1} -O ${JAR_DL} || wget --progress=bar ${URL2} -O ${JAR_DL}) && mv ${JAR_DL} ${JAR}
35 | elif hash curl 2>/dev/null; then
36 | (curl -L --progress=bar ${URL1} -O ${JAR_DL} || curl -L --progress=bar ${URL2} -O ${JAR_DL}) && mv ${JAR_DL} ${JAR}
37 | else
38 | printf "You do not have curl or wget installed, please install sbt manually from http://www.scala-sbt.org/\n"
39 | exit -1
40 | fi
41 | fi
42 | if [ ! -f ${JAR} ]; then
43 | # We failed to download
44 | printf "Our attempt to download sbt locally to ${JAR} failed. Please install sbt manually from http://www.scala-sbt.org/\n"
45 | exit -1
46 | fi
47 | printf "Launching sbt from ${JAR}\n"
48 | java \
49 | -Xmx1400m -XX:MaxPermSize=1024m -XX:ReservedCodeCacheSize=256m \
50 | -jar ${JAR} \
51 | "$@"
52 |
--------------------------------------------------------------------------------
/setup-project:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -x
3 | set -e
4 | set -o pipefail
5 | sudo apt-get install -y axel time
6 | echo "Downloading misc tools"
7 | sudo rm -f /etc/apt/sources.list.d/cassandra.sources.list
8 | echo "deb http://debian.datastax.com/community stable main" | sudo tee -a /etc/apt/sources.list.d/cassandra.sources.list
9 | curl -L http://debian.datastax.com/debian/repo_key | sudo apt-key add -
10 | sudo apt-get update > aptlog &
11 | APT_GET_UPDATE_PID=$!
12 | axel http://d3kbcqa49mib13.cloudfront.net/spark-1.3.1-bin-hadoop1.tgz > sparkdl &
13 | SPARK_DL_PID=$!
14 | axel http://mirrors.ibiblio.org/apache/kafka/0.8.1.1/kafka_2.9.2-0.8.1.1.tgz > kafkadl &
15 | KAFKA_DL_PID=$!
16 | axel http://mirror.cogentco.com/pub/apache/flume/1.5.0.1/apache-flume-1.5.0.1-bin.tar.gz > flumedl &
17 | FLUME_DL_PID=$!
18 | wait $SPARK_DL_PID
19 | sudo mkdir -p /etc/apt/sources.list.d/
20 | echo "install urllib3"
21 | sudo pip install urllib3
22 | wait $SPARK_DL_PID || echo "Spark DL finished early"
23 | tar -xf spark-1.3.1-bin-hadoop1.tgz
24 | wait $APT_GET_UPDATE_PID
25 | echo "Installing protobuf"
26 | sudo apt-get install protobuf-compiler
27 | echo $?
28 | # Set up cassandra
29 | echo "Waiting for apt-get update to finish"
30 | wait $APT_GET_UPDATE_PID || echo "apt-get update finished early"
31 | echo "Setting up dsc (cassandra)"
32 | sleep 1;
33 | #sudo apt-get -y --force-yes remove cassandra cassandra-tools
34 | #sudo rm -rf /etc/security/limits.d/cassandra.conf || echo "No cassandra security conf"
35 | #yes | sudo apt-get -y --force-yes install dsc21 > dscinstall.log
36 | #yes | sudo apt-get -y --force-yes install cassandra-tools > ctoolsinstall.log
37 | echo "Starting cassandra"
38 | sudo /etc/init.d/cassandra start
39 | echo $?
40 | echo "set up hive directories"
41 | export IAM=`whoami`
42 | sudo mkdir -p /user/hive && sudo chown -R $IAM /user/hive
43 | echo "done with setup"
44 | # Set up kafka
45 | echo "Setting up kafka"
46 | wait $KAFKA_DL_PID || echo "Kafka DL finished early"
47 | tar -xzf kafka_2.9.2-0.8.1.1.tgz
48 | cd kafka_2.9.2-0.8.1.1
49 | echo "Starting zookeeper"
50 | ./bin/zookeeper-server-start.sh config/zookeeper.properties &
51 | echo "Starting kafka"
52 | sleep 5
53 | ./bin/kafka-server-start.sh config/server.properties &
54 | sleep 5
55 | # publish a pandas topic to kafka
56 | ./bin/kafka-topics.sh --zookeeper localhost:2181 --topic pandas --partition 1 --replication-factor 1 --create
57 | ./bin/kafka-topics.sh --zookeeper localhost:2181 --topic logs --partition 1 --replication-factor 1 --create
58 | cd ..
59 |
60 | # set up flume
61 | wait $FLUME_DL_PID || echo "Flume DL finished early"
62 | echo "Setting up flume"
63 | tar -xf apache-flume-1.5.0.1-bin.tar.gz
64 | cd apache-flume-1.5.0.1-bin
65 | ./bin/flume-ng agent -n panda --conf-file ../files/flumeconf.cfg &
66 | disown $!
67 | cd ..
68 | echo $?
69 |
--------------------------------------------------------------------------------
/src/R/finddistance.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 | library("Imap")
3 | f <- file("stdin")
4 | open(f)
5 | while(length(line <- readLines(f,n=1)) > 0) {
6 | # process line
7 | contents <- Map(as.numeric, strsplit(line, ","))
8 | mydist <- gdist(contents[[1]][1], contents[[1]][2], contents[[1]][3], contents[[1]][4],
9 | units="m", a=6378137.0, b=6356752.3142, verbose = FALSE)
10 | write(mydist, stdout())
11 | }
12 |
--------------------------------------------------------------------------------
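finddistance.R reads comma-separated coordinates from stdin and writes one distance per line, which is how Spark's pipe() drives it in the Chapter 6 / PipeExample code. A minimal sketch of that driving side (assumes a SparkContext `sc` and that the script path is valid on every worker; the coordinate string is just illustrative data):

```scala
// Each RDD element becomes one stdin line for the script; each line the
// script prints becomes one element of the resulting RDD.
val distScript = "./src/R/finddistance.R"
val coordinatePairs = sc.parallelize(List("37.75889,-122.42683,37.7931,-122.4055"))
val distances = coordinatePairs.pipe(distScript)
distances.collect().foreach(println)
```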
/src/main/java/com/oreilly/learningsparkexamples/java/BasicAvg.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Illustrates how to compute an average using aggregate in Java
3 | */
4 | package com.oreilly.learningsparkexamples.java;
5 |
6 | import java.io.Serializable;
7 | import java.util.Arrays;
8 | import java.util.List;
9 |
10 | import org.apache.commons.lang.StringUtils;
11 |
12 | import org.apache.spark.api.java.JavaRDD;
13 | import org.apache.spark.api.java.JavaSparkContext;
14 | import org.apache.spark.api.java.function.Function2;
15 |
16 | public final class BasicAvg {
17 | public static class AvgCount implements Serializable {
18 | public AvgCount(int total, int num) {
19 | total_ = total;
20 | num_ = num;
21 | }
22 | public int total_;
23 | public int num_;
24 | public float avg() {
25 | return total_ / (float) num_;
26 | }
27 | }
28 |
29 | public static void main(String[] args) throws Exception {
30 | String master;
31 | if (args.length > 0) {
32 | master = args[0];
33 | } else {
34 | master = "local";
35 | }
36 |
37 | JavaSparkContext sc = new JavaSparkContext(
38 | master, "basicavg", System.getenv("SPARK_HOME"), System.getenv("JARS"));
39 | JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4));
40 | Function2<AvgCount, Integer, AvgCount> addAndCount = new Function2<AvgCount, Integer, AvgCount>() {
41 | @Override
42 | public AvgCount call(AvgCount a, Integer x) {
43 | a.total_ += x;
44 | a.num_ += 1;
45 | return a;
46 | }
47 | };
48 | Function2<AvgCount, AvgCount, AvgCount> combine = new Function2<AvgCount, AvgCount, AvgCount>() {
49 | @Override
50 | public AvgCount call(AvgCount a, AvgCount b) {
51 | a.total_ += b.total_;
52 | a.num_ += b.num_;
53 | return a;
54 | }
55 | };
56 | AvgCount initial = new AvgCount(0,0);
57 | AvgCount result = rdd.aggregate(initial, addAndCount, combine);
58 | System.out.println(result.avg());
59 | sc.stop();
60 | }
61 | }
62 |
--------------------------------------------------------------------------------
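For comparison, the same aggregate()-based average is much shorter in Scala; a sketch mirroring the Scala BasicAvg example in this repo (assumes a SparkContext `sc`):

```scala
val input = sc.parallelize(List(1, 2, 3, 4))
val (sum, count) = input.aggregate((0, 0))(
  (acc, value) => (acc._1 + value, acc._2 + 1), // fold one value into a partition's accumulator
  (a, b) => (a._1 + b._1, a._2 + b._2))         // merge accumulators across partitions
println(sum / count.toFloat)
```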
/src/main/java/com/oreilly/learningsparkexamples/java/BasicAvgMapPartitions.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Illustrates a simple map partitions in Java to compute the average
3 | */
4 | package com.oreilly.learningsparkexamples.java;
5 |
6 | import java.util.Arrays;
7 | import java.util.ArrayList;
8 | import java.util.List;
9 | import java.util.Iterator;
10 |
11 | import org.apache.commons.lang.StringUtils;
12 |
13 | import org.eclipse.jetty.client.ContentExchange;
14 | import org.eclipse.jetty.client.HttpClient;
15 |
16 |
17 | import org.apache.spark.api.java.JavaRDD;
18 | import org.apache.spark.api.java.JavaSparkContext;
19 | import org.apache.spark.api.java.function.FlatMapFunction;
20 | import org.apache.spark.api.java.function.Function2;
21 |
22 | public final class BasicAvgMapPartitions {
23 | class AvgCount {
24 | public AvgCount() {
25 | total_ = 0;
26 | num_ = 0;
27 | }
28 | public AvgCount(Integer total, Integer num) {
29 | total_ = total;
30 | num_ = num;
31 | }
32 | public AvgCount merge(Iterable<Integer> input) {
33 | for (Integer elem : input) {
34 | num_ += 1;
35 | total_ += elem;
36 | }
37 | return this;
38 | }
39 | public Integer total_;
40 | public Integer num_;
41 | public float avg() {
42 | return total_ / (float) num_;
43 | }
44 | }
45 |
46 | public static void main(String[] args) throws Exception {
47 | String master;
48 | if (args.length > 0) {
49 | master = args[0];
50 | } else {
51 | master = "local";
52 | }
53 | BasicAvgMapPartitions bamp = new BasicAvgMapPartitions();
54 | bamp.run(master);
55 | }
56 |
57 | public void run(String master) {
58 | JavaSparkContext sc = new JavaSparkContext(
59 | master, "basicavgmappartitions", System.getenv("SPARK_HOME"), System.getenv("JARS"));
60 | JavaRDD<Integer> rdd = sc.parallelize(
61 | Arrays.asList(1, 2, 3, 4, 5));
62 | FlatMapFunction<Iterator<Integer>, AvgCount> setup = new FlatMapFunction<Iterator<Integer>, AvgCount>() {
63 | @Override
64 | public Iterable<AvgCount> call(Iterator<Integer> input) {
65 | AvgCount a = new AvgCount(0, 0);
66 | while (input.hasNext()) {
67 | a.total_ += input.next();
68 | a.num_ += 1;
69 | }
70 | ArrayList<AvgCount> ret = new ArrayList<AvgCount>();
71 | ret.add(a);
72 | return ret;
73 | }
74 | };
75 | Function2<AvgCount, AvgCount, AvgCount> combine = new Function2<AvgCount, AvgCount, AvgCount>() {
76 | @Override
77 | public AvgCount call(AvgCount a, AvgCount b) {
78 | a.total_ += b.total_;
79 | a.num_ += b.num_;
80 | return a;
81 | }
82 | };
83 |
84 | AvgCount result = rdd.mapPartitions(setup).reduce(combine);
85 | System.out.println(result.avg());
86 | }
87 | }
88 |
--------------------------------------------------------------------------------
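The same per-partition accumulation expressed in Scala (a sketch, assuming a SparkContext `sc`):

```scala
val (sum, count) = sc.parallelize(List(1, 2, 3, 4, 5))
  .mapPartitions(iter =>
    // Emit a single (sum, count) pair per partition.
    Iterator(iter.foldLeft((0, 0)) { case ((s, c), x) => (s + x, c + 1) }))
  .reduce((a, b) => (a._1 + b._1, a._2 + b._2))
println(sum / count.toFloat)
```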
/src/main/java/com/oreilly/learningsparkexamples/java/BasicAvgWithKryo.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Illustrates Kryo serialization in Java
3 | */
4 | package com.oreilly.learningsparkexamples.java;
5 |
6 | import java.util.Arrays;
7 | import java.util.List;
8 |
9 | import org.apache.commons.lang.StringUtils;
10 |
11 | import org.apache.spark.SparkConf;
12 | import org.apache.spark.serializer.KryoRegistrator;
13 | import org.apache.spark.api.java.JavaRDD;
14 | import org.apache.spark.api.java.JavaSparkContext;
15 | import org.apache.spark.api.java.function.Function2;
16 |
17 | import com.esotericsoftware.kryo.Kryo;
18 | import com.esotericsoftware.kryo.serializers.FieldSerializer;
19 |
20 | public final class BasicAvgWithKryo {
21 | // This is our custom class we will configure Kryo to serialize
22 | static class AvgCount implements java.io.Serializable {
23 | public AvgCount() {
24 | total_ = 0;
25 | num_ = 0;
26 | }
27 | public AvgCount(int total, int num) {
28 | total_ = total;
29 | num_ = num;
30 | }
31 | public float avg() {
32 | return total_ / (float) num_;
33 | }
34 | public int total_;
35 | public int num_;
36 | }
37 |
38 | public static class AvgRegistrator implements KryoRegistrator {
39 | public void registerClasses(Kryo kryo) {
40 | kryo.register(AvgCount.class, new FieldSerializer(kryo, AvgCount.class));
41 | }
42 | }
43 |
44 | public static void main(String[] args) throws Exception {
45 | String master;
46 | if (args.length > 0) {
47 | master = args[0];
48 | } else {
49 | master = "local";
50 | }
51 |
52 | SparkConf conf = new SparkConf().setMaster(master).setAppName("basicavgwithkyro");
53 | conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
54 | conf.set("spark.kryo.registrator", AvgRegistrator.class.getName());
55 | JavaSparkContext sc = new JavaSparkContext(conf);
56 | JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4));
57 | Function2<AvgCount, Integer, AvgCount> addAndCount = new Function2<AvgCount, Integer, AvgCount>() {
58 | @Override
59 | public AvgCount call(AvgCount a, Integer x) {
60 | a.total_ += x;
61 | a.num_ += 1;
62 | return a;
63 | }
64 | };
65 | Function2<AvgCount, AvgCount, AvgCount> combine = new Function2<AvgCount, AvgCount, AvgCount>() {
66 | @Override
67 | public AvgCount call(AvgCount a, AvgCount b) {
68 | a.total_ += b.total_;
69 | a.num_ += b.num_;
70 | return a;
71 | }
72 | };
73 | AvgCount initial = new AvgCount(0,0);
74 | AvgCount result = rdd.aggregate(initial, addAndCount, combine);
75 | System.out.println(result.avg());
76 | }
77 | }
78 |
--------------------------------------------------------------------------------
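Since Spark 1.2 the registrator boilerplate above can also be replaced by SparkConf.registerKryoClasses; a sketch of the equivalent setup in Scala (AvgCount stands in for the class defined in this example):

```scala
import org.apache.spark.{SparkConf, SparkContext}

val conf = new SparkConf().setAppName("basicavgwithkyro")
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
// Register the class with Kryo without writing a custom KryoRegistrator.
conf.registerKryoClasses(Array(classOf[AvgCount]))
val sc = new SparkContext(conf)
```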
/src/main/java/com/oreilly/learningsparkexamples/java/BasicFlatMap.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Illustrates a simple flatMap in Java to extract the words
3 | */
4 | package com.oreilly.learningsparkexamples.java;
5 |
6 | import java.util.Arrays;
7 | import java.util.List;
8 | import java.util.Map;
9 | import java.util.Map.Entry;
10 |
11 |
12 | import org.apache.commons.lang.StringUtils;
13 |
14 | import org.apache.spark.api.java.JavaRDD;
15 | import org.apache.spark.api.java.JavaPairRDD;
16 | import org.apache.spark.api.java.JavaSparkContext;
17 | import org.apache.spark.api.java.function.FlatMapFunction;
18 |
19 | public class BasicFlatMap {
20 | public static void main(String[] args) throws Exception {
21 |
22 | if (args.length != 2) {
23 | throw new Exception("Usage BasicFlatMap sparkMaster inputFile");
24 | }
25 |
26 | JavaSparkContext sc = new JavaSparkContext(
27 | args[0], "basicflatmap", System.getenv("SPARK_HOME"), System.getenv("JARS"));
28 | JavaRDD<String> rdd = sc.textFile(args[1]);
29 | JavaRDD<String> words = rdd.flatMap(
30 | new FlatMapFunction<String, String>() { public Iterable<String> call(String x) {
31 | return Arrays.asList(x.split(" "));
32 | }});
33 | Map<String, Long> result = words.countByValue();
34 | for (Entry<String, Long> entry: result.entrySet()) {
35 | System.out.println(entry.getKey() + ":" + entry.getValue());
36 | }
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/src/main/java/com/oreilly/learningsparkexamples/java/BasicJoinCsv.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Illustrates joining two csv files
3 | */
4 | package com.oreilly.learningsparkexamples.java;
5 |
6 | import java.io.StringReader;
7 | import java.util.Arrays;
8 | import java.util.List;
9 | import scala.Tuple2;
10 |
11 | import au.com.bytecode.opencsv.CSVReader;
12 |
13 | import org.apache.commons.lang.StringUtils;
14 | import org.apache.spark.api.java.JavaRDD;
15 | import org.apache.spark.api.java.JavaPairRDD;
16 | import org.apache.spark.api.java.JavaSparkContext;
17 | import org.apache.spark.api.java.function.PairFunction;
18 |
19 | public class BasicJoinCsv {
20 |
21 | public static class ParseLine implements PairFunction<String, Integer, String[]> {
22 | public Tuple2<Integer, String[]> call(String line) throws Exception {
23 | CSVReader reader = new CSVReader(new StringReader(line));
24 | String[] elements = reader.readNext();
25 | Integer key = Integer.parseInt(elements[0]);
26 | return new Tuple2(key, elements);
27 | }
28 | }
29 |
30 | public static void main(String[] args) throws Exception {
31 | if (args.length != 3) {
32 | throw new Exception("Usage BasicJoinCsv sparkMaster csv1 csv2");
33 | }
34 | String master = args[0];
35 | String csv1 = args[1];
36 | String csv2 = args[2];
37 | BasicJoinCsv jsv = new BasicJoinCsv();
38 | jsv.run(master, csv1, csv2);
39 | }
40 |
41 | public void run(String master, String csv1, String csv2) throws Exception {
42 | JavaSparkContext sc = new JavaSparkContext(
43 | master, "basicjoincsv", System.getenv("SPARK_HOME"), System.getenv("JARS"));
44 | JavaRDD<String> csvFile1 = sc.textFile(csv1);
45 | JavaRDD<String> csvFile2 = sc.textFile(csv2);
46 | JavaPairRDD<Integer, String[]> keyedRDD1 = csvFile1.mapToPair(new ParseLine());
47 | JavaPairRDD<Integer, String[]> keyedRDD2 = csvFile2.mapToPair(new ParseLine());
48 | JavaPairRDD<Integer, Tuple2<String[], String[]>> result = keyedRDD1.join(keyedRDD2);
49 | List<Tuple2<Integer, Tuple2<String[], String[]>>> resultCollection = result.collect();
50 | }
51 | }
52 |
--------------------------------------------------------------------------------
/src/main/java/com/oreilly/learningsparkexamples/java/BasicLoadJson.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Illustrates loading a json file and finding out if people like pandas
3 | */
4 | package com.oreilly.learningsparkexamples.java;
5 |
6 | import java.io.StringReader;
7 | import java.util.ArrayList;
8 | import java.util.List;
9 | import java.util.Iterator;
10 | import java.lang.Iterable;
11 | import scala.Tuple2;
12 |
13 | import org.apache.commons.lang.StringUtils;
14 | import org.apache.spark.api.java.JavaRDD;
15 | import org.apache.spark.api.java.JavaPairRDD;
16 | import org.apache.spark.api.java.JavaSparkContext;
17 | import org.apache.spark.api.java.function.FlatMapFunction;
18 | import org.apache.spark.api.java.function.Function;
19 |
20 | import com.fasterxml.jackson.databind.ObjectMapper;
21 | import com.fasterxml.jackson.databind.ObjectWriter;
22 |
23 | public class BasicLoadJson {
24 |
25 | public static class Person implements java.io.Serializable {
26 | public String name;
27 | public Boolean lovesPandas;
28 | }
29 |
30 | public static class ParseJson implements FlatMapFunction<Iterator<String>, Person> {
31 | public Iterable<Person> call(Iterator<String> lines) throws Exception {
32 | ArrayList<Person> people = new ArrayList<Person>();
33 | ObjectMapper mapper = new ObjectMapper();
34 | while (lines.hasNext()) {
35 | String line = lines.next();
36 | try {
37 | people.add(mapper.readValue(line, Person.class));
38 | } catch (Exception e) {
39 | // Skip invalid input
40 | }
41 | }
42 | return people;
43 | }
44 | }
45 |
46 | public static class LikesPandas implements Function<Person, Boolean> {
47 | public Boolean call(Person person) {
48 | return person.lovesPandas;
49 | }
50 | }
51 |
52 |
53 | public static class WriteJson implements FlatMapFunction<Iterator<Person>, String> {
54 | public Iterable<String> call(Iterator<Person> people) throws Exception {
55 | ArrayList<String> text = new ArrayList<String>();
56 | ObjectMapper mapper = new ObjectMapper();
57 | while (people.hasNext()) {
58 | Person person = people.next();
59 | text.add(mapper.writeValueAsString(person));
60 | }
61 | return text;
62 | }
63 | }
64 |
65 | public static void main(String[] args) throws Exception {
66 | if (args.length != 3) {
67 | throw new Exception("Usage BasicLoadJson [sparkMaster] [jsoninput] [jsonoutput]");
68 | }
69 | String master = args[0];
70 | String fileName = args[1];
71 | String outfile = args[2];
72 |
73 | JavaSparkContext sc = new JavaSparkContext(
74 | master, "basicloadjson", System.getenv("SPARK_HOME"), System.getenv("JARS"));
75 | JavaRDD<String> input = sc.textFile(fileName);
76 | JavaRDD<Person> result = input.mapPartitions(new ParseJson()).filter(new LikesPandas());
77 | JavaRDD<String> formatted = result.mapPartitions(new WriteJson());
78 | formatted.saveAsTextFile(outfile);
79 | }
80 | }
81 |
--------------------------------------------------------------------------------
/src/main/java/com/oreilly/learningsparkexamples/java/BasicLoadSequenceFile.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Illustrates loading a sequence file of people and how many pandas they have seen
3 | */
4 | package com.oreilly.learningsparkexamples.java;
5 |
6 | import java.util.List;
7 | import scala.Tuple2;
8 |
9 | import org.apache.spark.api.java.JavaPairRDD;
10 | import org.apache.spark.api.java.JavaSparkContext;
11 | import org.apache.spark.api.java.function.PairFunction;
12 | import org.apache.hadoop.io.IntWritable;
13 | import org.apache.hadoop.io.Text;
14 |
15 | public class BasicLoadSequenceFile {
16 |
17 | public static class ConvertToNativeTypes implements PairFunction<Tuple2<Text, IntWritable>, String, Integer> {
18 | public Tuple2<String, Integer> call(Tuple2<Text, IntWritable> record) {
19 | return new Tuple2(record._1.toString(), record._2.get());
20 | }
21 | }
22 |
23 | public static void main(String[] args) throws Exception {
24 | if (args.length != 2) {
25 | throw new Exception("Usage BasicLoadSequenceFile [sparkMaster] [input]");
26 | }
27 | String master = args[0];
28 | String fileName = args[1];
29 |
30 | JavaSparkContext sc = new JavaSparkContext(
31 | master, "basicloadsequencefile", System.getenv("SPARK_HOME"), System.getenv("JARS"));
32 | JavaPairRDD<Text, IntWritable> input = sc.sequenceFile(fileName, Text.class, IntWritable.class);
33 | JavaPairRDD<String, Integer> result = input.mapToPair(new ConvertToNativeTypes());
34 | List<Tuple2<String, Integer>> resultList = result.collect();
35 | for (Tuple2<String, Integer> record : resultList) {
36 | System.out.println(record);
37 | }
38 | }
39 | }
40 |
--------------------------------------------------------------------------------
/src/main/java/com/oreilly/learningsparkexamples/java/BasicLoadWholeCsv.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Illustrates loading whole CSV files with wholeTextFiles and filtering the rows by key
3 | */
4 | package com.oreilly.learningsparkexamples.java;
5 |
6 | import java.io.StringReader;
7 | import java.util.Arrays;
8 | import java.util.ArrayList;
9 | import java.util.List;
10 | import scala.Tuple2;
11 |
12 | import au.com.bytecode.opencsv.CSVReader;
13 |
14 | import org.apache.commons.lang.StringUtils;
15 | import org.apache.spark.api.java.JavaRDD;
16 | import org.apache.spark.api.java.JavaPairRDD;
17 | import org.apache.spark.api.java.JavaSparkContext;
18 | import org.apache.spark.api.java.function.FlatMapFunction;
19 | import org.apache.spark.api.java.function.Function;
20 |
21 | public class BasicLoadWholeCsv {
22 |
23 | public static class ParseLine implements FlatMapFunction<Tuple2<String, String>, String[]> {
24 | public Iterable<String[]> call(Tuple2<String, String> file) throws Exception {
25 | CSVReader reader = new CSVReader(new StringReader(file._2()));
26 | return reader.readAll();
27 | }
28 | }
29 |
30 | public static void main(String[] args) throws Exception {
31 | if (args.length != 4) {
32 | throw new Exception("Usage BasicLoadWholeCsv sparkMaster csvInputFile csvOutputFile key");
33 | }
34 | String master = args[0];
35 | String csvInput = args[1];
36 | String outputFile = args[2];
37 | final String key = args[3];
38 |
39 | JavaSparkContext sc = new JavaSparkContext(
40 | master, "loadwholecsv", System.getenv("SPARK_HOME"), System.getenv("JARS"));
41 | JavaPairRDD<String, String> csvData = sc.wholeTextFiles(csvInput);
42 | JavaRDD<String[]> keyedRDD = csvData.flatMap(new ParseLine());
43 | JavaRDD<String[]> result =
44 | keyedRDD.filter(new Function<String[], Boolean>() {
45 | public Boolean call(String[] input) { return input[0].equals(key); }});
46 |
47 | result.saveAsTextFile(outputFile);
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
/src/main/java/com/oreilly/learningsparkexamples/java/BasicMap.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Illustrates a simple map in Java
3 | */
4 | package com.oreilly.learningsparkexamples.java;
5 |
6 | import java.util.Arrays;
7 | import java.util.List;
8 |
9 | import org.apache.commons.lang.StringUtils;
10 |
11 | import org.apache.spark.api.java.JavaRDD;
12 | import org.apache.spark.api.java.JavaSparkContext;
13 | import org.apache.spark.api.java.function.Function;
14 |
15 | public class BasicMap {
16 | public static void main(String[] args) throws Exception {
17 | String master;
18 | if (args.length > 0) {
19 | master = args[0];
20 | } else {
21 | master = "local";
22 | }
23 | JavaSparkContext sc = new JavaSparkContext(
24 | master, "basicmap", System.getenv("SPARK_HOME"), System.getenv("JARS"));
25 | JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4));
26 | JavaRDD<Integer> result = rdd.map(
27 | new Function<Integer, Integer>() { public Integer call(Integer x) { return x*x;}});
28 | System.out.println(StringUtils.join(result.collect(), ","));
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/src/main/java/com/oreilly/learningsparkexamples/java/BasicMapPartitions.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Illustrates mapPartitions in Java, creating one HTTP client per partition
3 | */
4 | package com.oreilly.learningsparkexamples.java;
5 |
6 | import java.util.Arrays;
7 | import java.util.ArrayList;
8 | import java.util.List;
9 | import java.util.Iterator;
10 |
11 | import org.apache.commons.lang.StringUtils;
12 |
13 | import org.eclipse.jetty.client.ContentExchange;
14 | import org.eclipse.jetty.client.HttpClient;
15 |
16 |
17 | import org.apache.spark.api.java.JavaRDD;
18 | import org.apache.spark.api.java.JavaSparkContext;
19 | import org.apache.spark.api.java.function.FlatMapFunction;
20 |
21 | public class BasicMapPartitions {
22 | public static void main(String[] args) throws Exception {
23 | String master;
24 | if (args.length > 0) {
25 | master = args[0];
26 | } else {
27 | master = "local";
28 | }
29 | JavaSparkContext sc = new JavaSparkContext(
30 | master, "basicmappartitions", System.getenv("SPARK_HOME"), System.getenv("JARS"));
31 | JavaRDD<String> rdd = sc.parallelize(
32 | Arrays.asList("KK6JKQ", "Ve3UoW", "kk6jlk", "W6BB"));
33 | JavaRDD<String> result = rdd.mapPartitions(
34 | new FlatMapFunction<Iterator<String>, String>() {
35 | public Iterable<String> call(Iterator<String> input) {
36 | ArrayList<String> content = new ArrayList<String>();
37 | ArrayList<ContentExchange> cea = new ArrayList<ContentExchange>();
38 | HttpClient client = new HttpClient();
39 | try {
40 | client.start();
41 | while (input.hasNext()) {
42 | ContentExchange exchange = new ContentExchange(true);
43 | exchange.setURL("http://qrzcq.com/call/" + input.next());
44 | client.send(exchange);
45 | cea.add(exchange);
46 | }
47 | for (ContentExchange exchange : cea) {
48 | exchange.waitForDone();
49 | content.add(exchange.getResponseContent());
50 | }
51 | } catch (Exception e) {
52 | }
53 | return content;
54 | }});
55 | System.out.println(StringUtils.join(result.collect(), ","));
56 | }
57 | }
58 |
--------------------------------------------------------------------------------
/src/main/java/com/oreilly/learningsparkexamples/java/BasicMapThenFilter.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Illustrates a simple map then filter in Java
3 | */
4 | package com.oreilly.learningsparkexamples.java;
5 |
6 | import java.util.Arrays;
7 | import java.util.List;
8 |
9 | import org.apache.commons.lang.StringUtils;
10 |
11 | import org.apache.spark.api.java.JavaRDD;
12 | import org.apache.spark.api.java.JavaSparkContext;
13 | import org.apache.spark.api.java.function.Function;
14 |
15 | public class BasicMapThenFilter {
16 | public static void main(String[] args) throws Exception {
17 | String master;
18 | if (args.length > 0) {
19 | master = args[0];
20 | } else {
21 | master = "local";
22 | }
23 | JavaSparkContext sc = new JavaSparkContext(
24 | master, "basicmapfilter", System.getenv("SPARK_HOME"), System.getenv("JARS"));
25 | JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4));
26 | JavaRDD<Integer> squared = rdd.map(
27 | new Function<Integer, Integer>() { public Integer call(Integer x) { return x*x;}});
28 | JavaRDD<Integer> result = squared.filter(
29 | new Function<Integer, Boolean>() { public Boolean call(Integer x) { return x != 1; }});
30 | System.out.println(StringUtils.join(result.collect(), ","));
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/src/main/java/com/oreilly/learningsparkexamples/java/BasicMapToDouble.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Illustrates a simple map to double in Java
3 | */
4 | package com.oreilly.learningsparkexamples.java;
5 |
6 | import java.util.Arrays;
7 | import java.util.List;
8 |
9 | import org.apache.commons.lang.StringUtils;
10 |
11 | import org.apache.spark.api.java.JavaRDD;
12 | import org.apache.spark.api.java.JavaDoubleRDD;
13 | import org.apache.spark.api.java.JavaSparkContext;
14 | import org.apache.spark.api.java.function.DoubleFunction;
15 |
16 | public class BasicMapToDouble {
17 | public static void main(String[] args) throws Exception {
18 | String master;
19 | if (args.length > 0) {
20 | master = args[0];
21 | } else {
22 | master = "local";
23 | }
24 | JavaSparkContext sc = new JavaSparkContext(
25 | master, "basicmaptodouble", System.getenv("SPARK_HOME"), System.getenv("JARS"));
26 | JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4));
27 | JavaDoubleRDD result = rdd.mapToDouble(
28 | new DoubleFunction<Integer>() {
29 | public double call(Integer x) {
30 | double y = (double) x;
31 | return y * y;
32 | }
33 | });
34 | System.out.println(StringUtils.join(result.collect(), ","));
35 | }
36 | }
37 |
--------------------------------------------------------------------------------
/src/main/java/com/oreilly/learningsparkexamples/java/BasicQueryCassandra.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Illustrates reading from and writing to a Cassandra table with the Spark Cassandra connector
3 | */
4 | package com.oreilly.learningsparkexamples.java;
5 | import java.io.Serializable;
6 |
7 | import java.io.StringReader;
8 | import java.util.ArrayList;
9 | import java.util.List;
10 | import java.util.Iterator;
11 | import java.lang.Iterable;
12 | import scala.Tuple2;
13 |
14 | import org.apache.commons.lang.StringUtils;
15 | import org.apache.spark.SparkConf;
16 | import org.apache.spark.api.java.JavaRDD;
17 | import org.apache.spark.api.java.JavaPairRDD;
18 | import org.apache.spark.api.java.JavaSparkContext;
19 | import org.apache.spark.api.java.function.DoubleFunction;
20 | import org.apache.spark.api.java.function.FlatMapFunction;
21 | import org.apache.spark.api.java.function.Function;
22 |
23 | import com.datastax.spark.connector.CassandraRow;
24 | import static com.datastax.spark.connector.CassandraJavaUtil.javaFunctions;
25 |
26 | public class BasicQueryCassandra {
27 | public static void main(String[] args) throws Exception {
28 | if (args.length != 2) {
29 | throw new Exception("Usage BasicLoadJson [sparkMaster] [cassandraHost]");
30 | }
31 | String sparkMaster = args[0];
32 | String cassandraHost = args[1];
33 | SparkConf conf = new SparkConf(true)
34 | .set("spark.cassandra.connection.host", cassandraHost);
35 |
36 | JavaSparkContext sc = new JavaSparkContext(
37 | sparkMaster, "basicquerycassandra", conf);
38 | // entire table as an RDD
39 | // assumes your table test was created as CREATE TABLE test.kv(key text PRIMARY KEY, value int);
40 | JavaRDD<CassandraRow> data = javaFunctions(sc).cassandraTable("test", "kv");
41 | // print some basic stats
42 | System.out.println(data.mapToDouble(new DoubleFunction<CassandraRow>() {
43 | public double call(CassandraRow row) {
44 | return row.getInt("value");
45 | }}).stats());
46 | // write some basic data to Cassandra
47 | ArrayList<KeyValue> input = new ArrayList<KeyValue>();
48 | input.add(KeyValue.newInstance("mostmagic", 3));
49 | JavaRDD<KeyValue> kvRDD = sc.parallelize(input);
50 | javaFunctions(kvRDD, KeyValue.class).saveToCassandra("test", "kv");
51 | }
52 | public static class KeyValue implements Serializable {
53 | private String key;
54 | private Integer value;
55 | public KeyValue() {
56 | }
57 | public static KeyValue newInstance(String k, Integer v) {
58 | KeyValue kv = new KeyValue();
59 | kv.setKey(k);
60 | kv.setValue(v);
61 | return kv;
62 | }
63 | public String getKey() {
64 | return key;
65 | }
66 | public Integer getValue() {
67 | return value;
68 | }
69 | void setKey(String k) {
70 | this.key = k;
71 | }
72 | void setValue(Integer v) {
73 | this.value = v;
74 | }
75 | }
76 | }
77 |
78 |
--------------------------------------------------------------------------------
/src/main/java/com/oreilly/learningsparkexamples/java/BasicSaveSequenceFile.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Illustrates saving a sequence file in Java using the old style hadoop APIs.
3 | */
4 | package com.oreilly.learningsparkexamples.java;
5 |
6 | import java.util.ArrayList;
7 | import java.util.List;
8 | import scala.Tuple2;
9 |
10 | import org.apache.spark.api.java.JavaPairRDD;
11 | import org.apache.spark.api.java.JavaSparkContext;
12 | import org.apache.spark.api.java.function.PairFunction;
13 | import org.apache.hadoop.io.IntWritable;
14 | import org.apache.hadoop.io.Text;
15 | import org.apache.hadoop.mapred.SequenceFileOutputFormat;
16 |
17 | public class BasicSaveSequenceFile {
18 |
19 | public static class ConvertToWritableTypes implements PairFunction<Tuple2<String, Integer>, Text, IntWritable> {
20 | public Tuple2<Text, IntWritable> call(Tuple2<String, Integer> record) {
21 | return new Tuple2(new Text(record._1), new IntWritable(record._2));
22 | }
23 | }
24 |
25 | public static void main(String[] args) throws Exception {
26 | if (args.length != 2) {
27 | throw new Exception("Usage BasicSaveSequenceFile [sparkMaster] [output]");
28 | }
29 | String master = args[0];
30 | String fileName = args[1];
31 |
32 | JavaSparkContext sc = new JavaSparkContext(
33 | master, "basicloadsequencefile", System.getenv("SPARK_HOME"), System.getenv("JARS"));
34 | List<Tuple2<String, Integer>> input = new ArrayList<Tuple2<String, Integer>>();
35 | input.add(new Tuple2("coffee", 1));
36 | input.add(new Tuple2("coffee", 2));
37 | input.add(new Tuple2("pandas", 3));
38 | JavaPairRDD<String, Integer> rdd = sc.parallelizePairs(input);
39 | JavaPairRDD<Text, IntWritable> result = rdd.mapToPair(new ConvertToWritableTypes());
40 | result.saveAsHadoopFile(fileName, Text.class, IntWritable.class, SequenceFileOutputFormat.class);
41 | }
42 | }
43 |
--------------------------------------------------------------------------------
/src/main/java/com/oreilly/learningsparkexamples/java/BasicSum.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Illustrates a simple fold in Java
3 | */
4 | package com.oreilly.learningsparkexamples.java;
5 |
6 | import java.util.Arrays;
7 | import java.util.List;
8 |
9 | import org.apache.commons.lang.StringUtils;
10 |
11 | import org.apache.spark.api.java.JavaRDD;
12 | import org.apache.spark.api.java.JavaSparkContext;
13 | import org.apache.spark.api.java.function.Function2;
14 |
15 | public class BasicSum {
16 | public static void main(String[] args) throws Exception {
17 | String master;
18 | if (args.length > 0) {
19 | master = args[0];
20 | } else {
21 | master = "local";
22 | }
23 | JavaSparkContext sc = new JavaSparkContext(
24 | master, "basicmap", System.getenv("SPARK_HOME"), System.getenv("JARS"));
25 | JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4));
26 | Integer result = rdd.fold(0, new Function2<Integer, Integer, Integer>() {
27 | public Integer call(Integer x, Integer y) { return x + y;}});
28 | System.out.println(result);
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/src/main/java/com/oreilly/learningsparkexamples/java/CallLog.java:
--------------------------------------------------------------------------------
1 | package com.oreilly.learningsparkexamples.java;
2 |
3 | import java.io.Serializable;
4 |
5 | public class CallLog implements Serializable {
6 | public String callsign;
7 | public Double contactlat;
8 | public Double contactlong;
9 | public Double mylat;
10 | public Double mylong;
11 | }
12 |
--------------------------------------------------------------------------------
/src/main/java/com/oreilly/learningsparkexamples/java/HappyPerson.java:
--------------------------------------------------------------------------------
1 | package com.oreilly.learningsparkexamples.java;
2 | import java.io.Serializable;
3 |
4 |
5 | class HappyPerson implements Serializable {
6 | private String name;
7 | private String favouriteBeverage;
8 | public HappyPerson() {}
9 | public HappyPerson(String n, String b) {
10 | name = n; favouriteBeverage = b;
11 | }
12 | public String getName() { return name; }
13 | public void setName(String n) { name = n; }
14 | public String getFavouriteBeverage() { return favouriteBeverage; }
15 | public void setFavouriteBeverage(String b) { favouriteBeverage = b; }
16 | };
17 |
--------------------------------------------------------------------------------
/src/main/java/com/oreilly/learningsparkexamples/java/IntersectByKey.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Illustrates intersecting two pair RDDs by key using cogroup in Java
3 | */
4 | package com.oreilly.learningsparkexamples.java;
5 |
6 | import java.util.ArrayList;
7 | import java.util.Arrays;
8 | import java.util.List;
9 | import java.util.Map;
10 | import java.util.Map.Entry;
11 |
12 | import com.google.common.collect.Iterables;
13 |
14 | import scala.Tuple2;
15 |
16 | import org.apache.commons.lang.StringUtils;
17 |
18 | import org.apache.spark.api.java.JavaRDD;
19 | import org.apache.spark.api.java.JavaPairRDD;
20 | import org.apache.spark.api.java.JavaSparkContext;
21 | import org.apache.spark.api.java.function.Function;
22 | import org.apache.spark.api.java.function.Function2;
23 | import org.apache.spark.api.java.function.FlatMapFunction;
24 |
25 | public final class IntersectByKey {
26 | public static <K, V> JavaPairRDD<K, V> intersectByKey(JavaPairRDD<K, V> rdd1, JavaPairRDD<K, V> rdd2) {
27 | JavaPairRDD<K, Tuple2<Iterable<V>, Iterable<V>>> grouped = rdd1.cogroup(rdd2);
28 | return grouped.flatMapValues(new Function<Tuple2<Iterable<V>, Iterable<V>>, Iterable<V>>() {
29 | @Override
30 | public Iterable<V> call(Tuple2<Iterable<V>, Iterable<V>> input) {
31 | ArrayList<V> al = new ArrayList<V>();
32 | if (!Iterables.isEmpty(input._1()) && !Iterables.isEmpty(input._2())) {
33 | Iterables.addAll(al, input._1());
34 | Iterables.addAll(al, input._2());
35 | }
36 | return al;
37 | }
38 | });
39 | }
40 | public static void main(String[] args) throws Exception {
41 | String master;
42 | if (args.length > 0) {
43 | master = args[0];
44 | } else {
45 | master = "local";
46 | }
47 |
48 | JavaSparkContext sc = new JavaSparkContext(
49 | master, "IntersectByKey", System.getenv("SPARK_HOME"), System.getenv("JARS"));
50 | List<Tuple2<String, Integer>> input1 = new ArrayList<Tuple2<String, Integer>>();
51 | input1.add(new Tuple2("coffee", 1));
52 | input1.add(new Tuple2("coffee", 2));
53 | input1.add(new Tuple2("pandas", 3));
54 | List<Tuple2<String, Integer>> input2 = new ArrayList<Tuple2<String, Integer>>();
55 | input2.add(new Tuple2("pandas", 20));
56 | JavaPairRDD<String, Integer> rdd1 = sc.parallelizePairs(input1);
57 | JavaPairRDD<String, Integer> rdd2 = sc.parallelizePairs(input2);
58 | JavaPairRDD<String, Integer> result = intersectByKey(rdd1, rdd2);
59 | for (Tuple2<String, Integer> entry : result.collect()) {
60 | System.out.println(entry._1() + ":" + entry._2());
61 | }
62 | System.out.println("Done");
63 | sc.stop();
64 | }
65 | }
66 |
--------------------------------------------------------------------------------
/src/main/java/com/oreilly/learningsparkexamples/java/KafkaInput.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Illustrates reading from Kafka with Spark Streaming in Java
3 | */
4 | package com.oreilly.learningsparkexamples.java;
5 |
6 | import java.util.Arrays;
7 | import java.util.HashMap;
8 | import java.util.List;
9 | import java.util.Map;
10 |
11 | import org.apache.commons.lang.StringUtils;
12 |
13 | import org.apache.spark.SparkConf;
14 | import org.apache.spark.api.java.JavaRDD;
15 | import org.apache.spark.api.java.JavaSparkContext;
16 | import org.apache.spark.api.java.function.Function;
17 | import org.apache.spark.streaming.api.java.JavaPairDStream;
18 | import org.apache.spark.streaming.api.java.JavaStreamingContext;
19 | import org.apache.spark.streaming.Duration;
20 | import org.apache.spark.streaming.kafka.*;
21 |
22 | public final class KafkaInput {
23 | public static void main(String[] args) throws Exception {
24 | String zkQuorum = args[0];
25 | String group = args[1];
26 | SparkConf conf = new SparkConf().setAppName("KafkaInput");
27 | // Create a StreamingContext with a 1 second batch size
28 | JavaStreamingContext jssc = new JavaStreamingContext(conf, new Duration(1000));
29 | Map<String, Integer> topics = new HashMap<String, Integer>();
30 | topics.put("pandas", 1);
31 | JavaPairDStream<String, String> input = KafkaUtils.createStream(jssc, zkQuorum, group, topics);
32 | input.print();
33 | // start our streaming context and wait for it to "finish"
34 | jssc.start();
35 | // Wait for 10 seconds then exit. To run forever call without a timeout
36 | jssc.awaitTermination(10000);
37 | // Stop the streaming context
38 | jssc.stop();
39 | }
40 | }
41 |
--------------------------------------------------------------------------------
/src/main/java/com/oreilly/learningsparkexamples/java/KeyValueMapFilter.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Illustrates how to make a PairRDD then do a basic filter
3 | */
4 | package com.oreilly.learningsparkexamples.java;
5 |
6 | import java.util.ArrayList;
7 | import java.util.Arrays;
8 | import java.util.List;
9 | import java.util.Map;
10 | import java.util.Map.Entry;
11 |
12 | import scala.Tuple2;
13 |
14 | import org.apache.commons.lang.StringUtils;
15 |
16 | import org.apache.spark.api.java.JavaRDD;
17 | import org.apache.spark.api.java.JavaPairRDD;
18 | import org.apache.spark.api.java.JavaSparkContext;
19 | import org.apache.spark.api.java.function.Function;
20 | import org.apache.spark.api.java.function.PairFunction;
21 |
22 | public final class KeyValueMapFilter {
23 |
24 | public static void main(String[] args) throws Exception {
25 | if (args.length != 2) {
26 | throw new Exception("Usage KeyValueMapFilter sparkMaster inputFile");
27 | }
28 | String master = args[0];
29 | String inputFile = args[1];
30 |
31 | JavaSparkContext sc = new JavaSparkContext(
32 | master, "KeyValueMapFilter", System.getenv("SPARK_HOME"), System.getenv("JARS"));
33 | JavaRDD<String> input = sc.textFile(inputFile);
34 | PairFunction<String, String, String> keyData = new PairFunction<String, String, String>() {
35 | @Override
36 | public Tuple2<String, String> call(String x) {
37 | return new Tuple2(x.split(" ")[0], x);
38 | }
39 | };
40 | Function<Tuple2<String, String>, Boolean> longWordFilter = new Function<Tuple2<String, String>, Boolean>() {
41 | @Override
42 | public Boolean call(Tuple2<String, String> input) {
43 | return (input._2().length() < 20);
44 | }
45 | };
46 | JavaPairRDD<String, String> rdd = input.mapToPair(keyData);
47 | JavaPairRDD<String, String> result = rdd.filter(longWordFilter);
48 | Map<String, String> resultMap = result.collectAsMap();
49 | for (Entry<String, String> entry : resultMap.entrySet()) {
50 | System.out.println(entry.getKey() + ":" + entry.getValue());
51 | }
52 | }
53 | }
54 |
--------------------------------------------------------------------------------
/src/main/java/com/oreilly/learningsparkexamples/java/LoadHive.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Illustrates loading data from Hive with Spark SQL
3 | */
4 | package com.oreilly.learningsparkexamples.java;
5 |
6 | import java.io.StringReader;
7 | import java.util.Arrays;
8 | import java.util.ArrayList;
9 | import java.util.List;
10 | import scala.Tuple2;
11 |
12 | import au.com.bytecode.opencsv.CSVReader;
13 |
14 | import org.apache.commons.lang.StringUtils;
15 | import org.apache.spark.api.java.JavaRDD;
16 | import org.apache.spark.api.java.JavaPairRDD;
17 | import org.apache.spark.api.java.JavaSparkContext;
18 | import org.apache.spark.api.java.function.FlatMapFunction;
19 | import org.apache.spark.api.java.function.Function;
20 | import org.apache.spark.sql.SQLContext;
21 | import org.apache.spark.sql.Row;
22 | import org.apache.spark.sql.DataFrame;
23 |
24 | public class LoadHive {
25 |
26 | public static class SquareKey implements Function<Row, Integer> {
27 | public Integer call(Row row) throws Exception {
28 | return row.getInt(0) * row.getInt(0);
29 | }
30 | }
31 |
32 | public static void main(String[] args) throws Exception {
33 | if (args.length != 2) {
34 | throw new Exception("Usage LoadHive sparkMaster tbl");
35 | }
36 | String master = args[0];
37 | String tbl = args[1];
38 |
39 | JavaSparkContext sc = new JavaSparkContext(
40 | master, "loadhive", System.getenv("SPARK_HOME"), System.getenv("JARS"));
41 | SQLContext sqlCtx = new SQLContext(sc);
42 | DataFrame rdd = sqlCtx.sql("SELECT key, value FROM src");
43 | JavaRDD<Integer> squaredKeys = rdd.toJavaRDD().map(new SquareKey());
44 | List<Integer> result = squaredKeys.collect();
45 | for (Integer elem : result) {
46 | System.out.println(elem);
47 | }
48 | }
49 | }
50 |
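Note: querying a table that lives in the Hive metastore generally needs a HiveContext rather than a plain SQLContext. A hedged sketch of that variant, assuming Spark was built with Hive support and reusing the tbl argument parsed above:

    // HiveContext wraps the underlying SparkContext (sc.sc())
    org.apache.spark.sql.hive.HiveContext hiveCtx =
        new org.apache.spark.sql.hive.HiveContext(sc.sc());
    DataFrame rows = hiveCtx.sql("SELECT key, value FROM " + tbl);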
--------------------------------------------------------------------------------
/src/main/java/com/oreilly/learningsparkexamples/java/LoadJsonWithSparkSQL.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Illustrates loading JSON data with Spark SQL
3 | */
4 | package com.oreilly.learningsparkexamples.java;
5 |
6 | import java.io.StringReader;
7 | import java.util.Arrays;
8 | import java.util.ArrayList;
9 | import java.util.List;
10 | import scala.Tuple2;
11 |
12 | import au.com.bytecode.opencsv.CSVReader;
13 |
14 | import org.apache.commons.lang.StringUtils;
15 | import org.apache.spark.api.java.JavaRDD;
16 | import org.apache.spark.api.java.JavaPairRDD;
17 | import org.apache.spark.api.java.JavaSparkContext;
18 | import org.apache.spark.api.java.function.FlatMapFunction;
19 | import org.apache.spark.api.java.function.Function;
20 | import org.apache.spark.sql.SQLContext;
21 | import org.apache.spark.sql.Row;
22 | import org.apache.spark.sql.DataFrame;
23 |
24 | public class LoadJsonWithSparkSQL {
25 |
26 |
27 | public static void main(String[] args) throws Exception {
28 | if (args.length != 2) {
29 | throw new Exception("Usage LoadJsonWithSparkSQL sparkMaster jsonFile");
30 | }
31 | String master = args[0];
32 | String jsonFile = args[1];
33 |
34 | JavaSparkContext sc = new JavaSparkContext(
35 | master, "loadJsonwithsparksql");
36 | SQLContext sqlCtx = new SQLContext(sc);
37 | DataFrame input = sqlCtx.jsonFile(jsonFile);
38 | input.printSchema();
39 | }
40 | }
41 |
--------------------------------------------------------------------------------
/src/main/java/com/oreilly/learningsparkexamples/java/MLlib.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.oreilly.learningsparkexamples.java;
19 |
20 | import java.util.Arrays;
21 |
22 | import org.apache.spark.SparkConf;
23 | import org.apache.spark.api.java.JavaRDD;
24 | import org.apache.spark.api.java.JavaSparkContext;
25 | import org.apache.spark.api.java.function.Function;
26 |
27 | import org.apache.spark.mllib.classification.LogisticRegressionModel;
28 | import org.apache.spark.mllib.classification.LogisticRegressionWithSGD;
29 | import org.apache.spark.mllib.feature.HashingTF;
30 | import org.apache.spark.mllib.linalg.Vector;
31 | import org.apache.spark.mllib.regression.LabeledPoint;
32 |
33 | public final class MLlib {
34 |
35 | public static void main(String[] args) {
36 | SparkConf sparkConf = new SparkConf().setAppName("JavaBookExample");
37 | JavaSparkContext sc = new JavaSparkContext(sparkConf);
38 |
39 | // Load 2 types of emails from text files: spam and ham (non-spam).
40 | // Each line has text from one email.
41 | JavaRDD<String> spam = sc.textFile("files/spam.txt");
42 | JavaRDD<String> ham = sc.textFile("files/ham.txt");
43 |
44 | // Create a HashingTF instance to map email text to vectors of 100 features.
45 | final HashingTF tf = new HashingTF(100);
46 |
47 | // Each email is split into words, and each word is mapped to one feature.
48 | // Create LabeledPoint datasets for positive (spam) and negative (ham) examples.
49 | JavaRDD<LabeledPoint> positiveExamples = spam.map(new Function<String, LabeledPoint>() {
50 | @Override public LabeledPoint call(String email) {
51 | return new LabeledPoint(1, tf.transform(Arrays.asList(email.split(" "))));
52 | }
53 | });
54 | JavaRDD<LabeledPoint> negativeExamples = ham.map(new Function<String, LabeledPoint>() {
55 | @Override public LabeledPoint call(String email) {
56 | return new LabeledPoint(0, tf.transform(Arrays.asList(email.split(" "))));
57 | }
58 | });
59 | JavaRDD<LabeledPoint> trainingData = positiveExamples.union(negativeExamples);
60 | trainingData.cache(); // Cache data since Logistic Regression is an iterative algorithm.
61 |
62 | // Create a Logistic Regression learner which uses the SGD optimizer.
63 | LogisticRegressionWithSGD lrLearner = new LogisticRegressionWithSGD();
64 | // Run the actual learning algorithm on the training data.
65 | LogisticRegressionModel model = lrLearner.run(trainingData.rdd());
66 |
67 | // Test on a positive example (spam) and a negative one (ham).
68 | // First apply the same HashingTF feature transformation used on the training data.
69 | Vector posTestExample =
70 | tf.transform(Arrays.asList("O M G GET cheap stuff by sending money to ...".split(" ")));
71 | Vector negTestExample =
72 | tf.transform(Arrays.asList("Hi Dad, I started studying Spark the other ...".split(" ")));
73 | // Now use the learned model to predict spam/ham for new emails.
74 | System.out.println("Prediction for positive test example: " + model.predict(posTestExample));
75 | System.out.println("Prediction for negative test example: " + model.predict(negTestExample));
76 |
77 | sc.stop();
78 | }
79 | }
80 |
--------------------------------------------------------------------------------
/src/main/java/com/oreilly/learningsparkexamples/java/PerKeyAvg.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Illustrates computing a per-key average with combineByKey in Java
3 | */
4 | package com.oreilly.learningsparkexamples.java;
5 |
6 | import java.util.ArrayList;
7 | import java.util.Arrays;
8 | import java.util.List;
9 | import java.util.Map;
10 | import java.util.Map.Entry;
11 |
12 | import scala.Tuple2;
13 |
14 | import org.apache.commons.lang.StringUtils;
15 |
16 | import org.apache.spark.api.java.JavaRDD;
17 | import org.apache.spark.api.java.JavaPairRDD;
18 | import org.apache.spark.api.java.JavaSparkContext;
19 | import org.apache.spark.api.java.function.Function;
20 | import org.apache.spark.api.java.function.Function2;
21 |
22 | public final class PerKeyAvg {
23 | public static class AvgCount implements java.io.Serializable {
24 | public AvgCount(int total, int num) {
25 | total_ = total;
26 | num_ = num;
27 | }
28 | public int total_;
29 | public int num_;
30 | public float avg() {
31 | return total_ / (float) num_;
32 | }
33 | }
34 | public static void main(String[] args) throws Exception {
35 | String master;
36 | if (args.length > 0) {
37 | master = args[0];
38 | } else {
39 | master = "local";
40 | }
41 |
42 | JavaSparkContext sc = new JavaSparkContext(
43 | master, "PerKeyAvg", System.getenv("SPARK_HOME"), System.getenv("JARS"));
44 | List<Tuple2<String, Integer>> input = new ArrayList<Tuple2<String, Integer>>();
45 | input.add(new Tuple2("coffee", 1));
46 | input.add(new Tuple2("coffee", 2));
47 | input.add(new Tuple2("pandas", 3));
48 | JavaPairRDD<String, Integer> rdd = sc.parallelizePairs(input);
49 | Function<Integer, AvgCount> createAcc = new Function<Integer, AvgCount>() {
50 | @Override
51 | public AvgCount call(Integer x) {
52 | return new AvgCount(x, 1);
53 | }
54 | };
55 | Function2<AvgCount, Integer, AvgCount> addAndCount = new Function2<AvgCount, Integer, AvgCount>() {
56 | @Override
57 | public AvgCount call(AvgCount a, Integer x) {
58 | a.total_ += x;
59 | a.num_ += 1;
60 | return a;
61 | }
62 | };
63 | Function2<AvgCount, AvgCount, AvgCount> combine = new Function2<AvgCount, AvgCount, AvgCount>() {
64 | @Override
65 | public AvgCount call(AvgCount a, AvgCount b) {
66 | a.total_ += b.total_;
67 | a.num_ += b.num_;
68 | return a;
69 | }
70 | };
71 | AvgCount initial = new AvgCount(0,0);
72 | JavaPairRDD<String, AvgCount> avgCounts = rdd.combineByKey(createAcc, addAndCount, combine);
73 | Map<String, AvgCount> countMap = avgCounts.collectAsMap();
74 | for (Entry<String, AvgCount> entry : countMap.entrySet()) {
75 | System.out.println(entry.getKey() + ":" + entry.getValue().avg());
76 | }
77 | }
78 | }
79 |
--------------------------------------------------------------------------------
/src/main/java/com/oreilly/learningsparkexamples/java/RemoveOutliers.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Illustrates removing outliers in Java using summary stats
3 | */
4 | package com.oreilly.learningsparkexamples.java;
5 |
6 | import java.util.Arrays;
7 | import java.util.List;
8 |
9 | import org.apache.commons.lang.StringUtils;
10 |
11 | import org.apache.spark.api.java.JavaRDD;
12 | import org.apache.spark.api.java.JavaDoubleRDD;
13 | import org.apache.spark.api.java.JavaSparkContext;
14 | import org.apache.spark.api.java.function.Function;
15 | import org.apache.spark.util.StatCounter;
16 |
17 | public class RemoveOutliers {
18 | public static void main(String[] args) {
19 | String master;
20 | if (args.length > 0) {
21 | master = args[0];
22 | } else {
23 | master = "local";
24 | }
25 | JavaSparkContext sc = new JavaSparkContext(
26 | master, "basicmap", System.getenv("SPARK_HOME"), System.getenv("JARS"));
27 | JavaDoubleRDD input = sc.parallelizeDoubles(Arrays.asList(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 1000.0));
28 | JavaDoubleRDD result = removeOutliers(input);
29 | System.out.println(StringUtils.join(result.collect(), ","));
30 | }
31 | static JavaDoubleRDD removeOutliers(JavaDoubleRDD rdd) {
32 | final StatCounter summaryStats = rdd.stats();
33 | final Double stddev = Math.sqrt(summaryStats.variance());
34 | return rdd.filter(new Function<Double, Boolean>() { public Boolean call(Double x) {
35 | return (Math.abs(x - summaryStats.mean()) < 3 * stddev);
36 | }});
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/src/main/java/com/oreilly/learningsparkexamples/java/SparkSQLTwitter.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Load some tweets stored as JSON data and explore them.
3 | */
4 | package com.oreilly.learningsparkexamples.java;
5 |
6 | import java.util.Arrays;
7 | import java.util.ArrayList;
8 | import java.util.List;
9 |
10 | import org.apache.commons.lang.StringUtils;
11 |
12 | import org.apache.spark.SparkConf;
13 | import org.apache.spark.api.java.JavaRDD;
14 | import org.apache.spark.api.java.JavaSparkContext;
15 | import org.apache.spark.api.java.function.Function;
16 | import org.apache.spark.sql.SQLContext;
17 | import org.apache.spark.sql.DataFrame;
18 | import org.apache.spark.sql.Row;
19 | import org.apache.spark.sql.api.java.UDF1;
20 | import org.apache.spark.sql.types.DataTypes;
21 |
22 | public class SparkSQLTwitter {
23 | public static void main(String[] args) {
24 | String inputFile = args[0];
25 | SparkConf conf = new SparkConf();
26 | JavaSparkContext sc = new JavaSparkContext(conf);
27 | SQLContext sqlCtx = new SQLContext(sc);
28 | DataFrame input = sqlCtx.jsonFile(inputFile);
29 | // Print the schema
30 | input.printSchema();
31 | // Register the input schema RDD
32 | input.registerTempTable("tweets");
33 | // Select tweets based on the retweetCount
34 | DataFrame topTweets = sqlCtx.sql("SELECT text, retweetCount FROM tweets ORDER BY retweetCount LIMIT 10");
35 | Row[] result = topTweets.collect();
36 | for (Row row : result) {
37 | System.out.println(row.get(0));
38 | }
39 | JavaRDD<String> topTweetText = topTweets.toJavaRDD().map(new Function<Row, String>() {
40 | public String call(Row row) {
41 | return row.getString(0);
42 | }});
43 | System.out.println(topTweetText.collect());
44 | // Create a person and turn it into a Schema RDD
45 | ArrayList<HappyPerson> peopleList = new ArrayList<HappyPerson>();
46 | peopleList.add(new HappyPerson("holden", "coffee"));
47 | JavaRDD<HappyPerson> happyPeopleRDD = sc.parallelize(peopleList);
48 | DataFrame happyPeopleSchemaRDD = sqlCtx.applySchema(happyPeopleRDD, HappyPerson.class);
49 | happyPeopleSchemaRDD.registerTempTable("happy_people");
50 | sqlCtx.udf().register("stringLengthJava", new UDF1<String, Integer>() {
51 | @Override
52 | public Integer call(String str) throws Exception {
53 | return str.length();
54 | }
55 | }, DataTypes.IntegerType);
56 | DataFrame tweetLength = sqlCtx.sql("SELECT stringLengthJava('text') FROM tweets LIMIT 10");
57 | Row[] lengths = tweetLength.collect();
58 | for (Row row : lengths) {
59 | System.out.println(row.get(0));
60 | }
61 | sc.stop();
62 | }
63 | }
64 |
--------------------------------------------------------------------------------
/src/main/java/com/oreilly/learningsparkexamples/java/StreamingLogInput.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Illustrates filtering a DStream of log lines with Spark Streaming in Java
3 | */
4 | package com.oreilly.learningsparkexamples.java;
5 |
6 | import java.util.Arrays;
7 | import java.util.List;
8 |
9 | import org.apache.commons.lang.StringUtils;
10 |
11 | import org.apache.spark.api.java.JavaRDD;
12 | import org.apache.spark.api.java.JavaSparkContext;
13 | import org.apache.spark.api.java.function.Function;
14 | import org.apache.spark.streaming.api.java.JavaDStream;
15 | import org.apache.spark.streaming.api.java.JavaStreamingContext;
16 | import org.apache.spark.streaming.Duration;
17 |
18 | public class StreamingLogInput {
19 | public static void main(String[] args) throws Exception {
20 | String master = args[0];
21 | JavaSparkContext sc = new JavaSparkContext(master, "StreamingLogInput");
22 | // Create a StreamingContext with a 1 second batch size
23 | JavaStreamingContext jssc = new JavaStreamingContext(sc, new Duration(1000));
24 | // Create a DStream from all the input on port 7777
25 | JavaDStream<String> lines = jssc.socketTextStream("localhost", 7777);
26 | // Filter our DStream for lines with "error"
27 | JavaDStream<String> errorLines = lines.filter(new Function<String, Boolean>() {
28 | public Boolean call(String line) {
29 | return line.contains("error");
30 | }});
31 | // Print out the lines with errors, which causes this DStream to be evaluated
32 | errorLines.print();
33 | // start our streaming context and wait for it to "finish"
34 | jssc.start();
35 | // Wait for 10 seconds then exit. To run forever call without a timeout
36 | jssc.awaitTermination(10000);
37 | // Stop the streaming context
38 | jssc.stop();
39 | }
40 | }
41 |
--------------------------------------------------------------------------------
/src/main/java/com/oreilly/learningsparkexamples/java/WordCount.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Illustrates a wordcount in Java
3 | */
4 | package com.oreilly.learningsparkexamples.java;
5 |
6 | import java.util.Arrays;
7 | import java.util.List;
8 | import java.lang.Iterable;
9 |
10 | import scala.Tuple2;
11 |
12 | import org.apache.commons.lang.StringUtils;
13 |
14 | import org.apache.spark.api.java.JavaRDD;
15 | import org.apache.spark.api.java.JavaPairRDD;
16 | import org.apache.spark.api.java.JavaSparkContext;
17 | import org.apache.spark.api.java.function.FlatMapFunction;
18 | import org.apache.spark.api.java.function.Function2;
19 | import org.apache.spark.api.java.function.PairFunction;
20 |
21 |
22 | public class WordCount {
23 | public static void main(String[] args) throws Exception {
24 | String master = args[0];
25 | JavaSparkContext sc = new JavaSparkContext(
26 | master, "wordcount", System.getenv("SPARK_HOME"), System.getenv("JARS"));
27 | JavaRDD<String> rdd = sc.textFile(args[1]);
28 | JavaPairRDD<String, Integer> counts = rdd.flatMap(
29 | new FlatMapFunction<String, String>() {
30 | public Iterable<String> call(String x) {
31 | return Arrays.asList(x.split(" "));
32 | }}).mapToPair(new PairFunction<String, String, Integer>(){
33 | public Tuple2<String, Integer> call(String x){
34 | return new Tuple2(x, 1);
35 | }}).reduceByKey(new Function2<Integer, Integer, Integer>(){
36 | public Integer call(Integer x, Integer y){ return x+y;}});
37 | counts.saveAsTextFile(args[2]);
38 | }
39 | }
40 |
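For comparison, the same pipeline written with Java 8 lambdas; this is only a sketch assuming Java 8 and the same Spark 1.x Java API, and it behaves identically to the anonymous classes above:

    // flatMap splits lines into words, mapToPair emits (word, 1), reduceByKey sums the ones
    JavaPairRDD<String, Integer> counts = rdd
        .flatMap(x -> Arrays.asList(x.split(" ")))
        .mapToPair(x -> new Tuple2<>(x, 1))
        .reduceByKey((x, y) -> x + y);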
--------------------------------------------------------------------------------
/src/main/java/com/oreilly/learningsparkexamples/java/logs/ApacheAccessLog.java:
--------------------------------------------------------------------------------
1 | package com.oreilly.learningsparkexamples.java.logs;
2 |
3 | import java.io.Serializable;
4 | import java.util.logging.Level;
5 | import java.util.logging.Logger;
6 | import java.util.regex.Matcher;
7 | import java.util.regex.Pattern;
8 |
9 | /**
10 | * This class represents an Apache access log line.
11 | * See http://httpd.apache.org/docs/2.2/logs.html for more details.
12 | */
13 | public class ApacheAccessLog implements Serializable {
14 | private static final Logger logger = Logger.getLogger("Access");
15 |
16 | private String ipAddress;
17 | private String clientIdentd;
18 | private String userID;
19 | private String dateTimeString;
20 | private String method;
21 | private String endpoint;
22 | private String protocol;
23 | private int responseCode;
24 | private long contentSize;
25 |
26 | private ApacheAccessLog(String ipAddress, String clientIdentd, String userID,
27 | String dateTime, String method, String endpoint,
28 | String protocol, String responseCode,
29 | String contentSize) {
30 | this.ipAddress = ipAddress;
31 | this.clientIdentd = clientIdentd;
32 | this.userID = userID;
33 | this.dateTimeString = dateTime; // TODO: Parse from dateTime String;
34 | this.method = method;
35 | this.endpoint = endpoint;
36 | this.protocol = protocol;
37 | this.responseCode = Integer.parseInt(responseCode);
38 | this.contentSize = Long.parseLong(contentSize);
39 | }
40 |
41 | public String getIpAddress() {
42 | return ipAddress;
43 | }
44 |
45 | public String getClientIdentd() {
46 | return clientIdentd;
47 | }
48 |
49 | public String getUserID() {
50 | return userID;
51 | }
52 |
53 | public String getDateTimeString() {
54 | return dateTimeString;
55 | }
56 |
57 | public String getMethod() {
58 | return method;
59 | }
60 |
61 | public String getEndpoint() {
62 | return endpoint;
63 | }
64 |
65 | public String getProtocol() {
66 | return protocol;
67 | }
68 |
69 | public int getResponseCode() {
70 | return responseCode;
71 | }
72 |
73 | public long getContentSize() {
74 | return contentSize;
75 | }
76 |
77 | public void setIpAddress(String ipAddress) {
78 | this.ipAddress = ipAddress;
79 | }
80 |
81 | public void setClientIdentd(String clientIdentd) {
82 | this.clientIdentd = clientIdentd;
83 | }
84 |
85 | public void setUserID(String userID) {
86 | this.userID = userID;
87 | }
88 |
89 | public void setDateTimeString(String dateTimeString) {
90 | this.dateTimeString = dateTimeString;
91 | }
92 |
93 | public void setMethod(String method) {
94 | this.method = method;
95 | }
96 |
97 | public void setEndpoint(String endpoint) {
98 | this.endpoint = endpoint;
99 | }
100 |
101 | public void setProtocol(String protocol) {
102 | this.protocol = protocol;
103 | }
104 |
105 | public void setResponseCode(int responseCode) {
106 | this.responseCode = responseCode;
107 | }
108 |
109 | public void setContentSize(long contentSize) {
110 | this.contentSize = contentSize;
111 | }
112 |
113 | // Example Apache log line:
114 | // 127.0.0.1 - - [21/Jul/2014:9:55:27 -0800] "GET /home.html HTTP/1.1" 200 2048
115 | private static final String LOG_ENTRY_PATTERN =
116 | // 1:IP 2:client 3:user 4:date time 5:method 6:req 7:proto 8:respcode 9:size
117 | "^(\\S+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] \"(\\S+) (\\S+) (\\S+)\" (\\d{3}) (\\d+)";
118 | private static final Pattern PATTERN = Pattern.compile(LOG_ENTRY_PATTERN);
119 |
120 | public static ApacheAccessLog parseFromLogLine(String logline) {
121 | Matcher m = PATTERN.matcher(logline);
122 | if (!m.find()) {
123 | logger.log(Level.ALL, "Cannot parse logline " + logline);
124 | throw new RuntimeException("Error parsing logline");
125 | }
126 |
127 | return new ApacheAccessLog(m.group(1), m.group(2), m.group(3), m.group(4),
128 | m.group(5), m.group(6), m.group(7), m.group(8), m.group(9));
129 | }
130 | }
131 |
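A quick sketch exercising parseFromLogLine on the example line documented in the class:

    ApacheAccessLog entry = ApacheAccessLog.parseFromLogLine(
        "127.0.0.1 - - [21/Jul/2014:9:55:27 -0800] \"GET /home.html HTTP/1.1\" 200 2048");
    // prints: /home.html 200 2048
    System.out.println(entry.getEndpoint() + " " + entry.getResponseCode()
        + " " + entry.getContentSize());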
--------------------------------------------------------------------------------
/src/main/java/com/oreilly/learningsparkexamples/java/logs/Flags.java:
--------------------------------------------------------------------------------
1 | package com.oreilly.learningsparkexamples.java.logs;
2 |
3 | import org.apache.commons.cli.*;
4 | import org.apache.spark.streaming.Duration;
5 |
6 | public class Flags {
7 | private static Flags THE_INSTANCE = new Flags();
8 |
9 | private Duration windowLength;
10 | private Duration slideInterval;
11 | private String logsDirectory;
12 | private String outputHtmlFile;
13 | private String checkpointDirectory;
14 | private String indexHtmlTemplate;
15 | private String outputDirectory;
16 |
17 | private boolean initialized = false;
18 |
19 | private Flags() {}
20 |
21 | public Duration getWindowLength() {
22 | return windowLength;
23 | }
24 |
25 | public Duration getSlideInterval() {
26 | return slideInterval;
27 | }
28 |
29 | public String getLogsDirectory() {
30 | return logsDirectory;
31 | }
32 |
33 | public String getOutputHtmlFile() {
34 | return outputHtmlFile;
35 | }
36 |
37 | public String getCheckpointDirectory() {
38 | return checkpointDirectory;
39 | }
40 |
41 | public String getOutputDirectory() {
42 | return outputDirectory;
43 | }
44 |
45 | public String getIndexHtmlTemplate() {
46 | return indexHtmlTemplate;
47 | }
48 |
49 | public static Flags getInstance() {
50 | if (!THE_INSTANCE.initialized) {
51 | throw new RuntimeException("Flags have not been initialized");
52 | }
53 | return THE_INSTANCE;
54 | }
55 |
56 | public static void setFromCommandLineArgs(Options options, String[] args) {
57 | CommandLineParser parser = new PosixParser();
58 | try {
59 | CommandLine cl = parser.parse(options, args);
60 | THE_INSTANCE.windowLength = new Duration(Integer.parseInt(
61 | cl.getOptionValue(LogAnalyzerAppMain.WINDOW_LENGTH, "30")) * 1000);
62 | THE_INSTANCE.slideInterval = new Duration(Integer.parseInt(
63 | cl.getOptionValue(LogAnalyzerAppMain.SLIDE_INTERVAL, "5")) * 1000);
64 | THE_INSTANCE.logsDirectory = cl.getOptionValue(
65 | LogAnalyzerAppMain.LOGS_DIRECTORY, "/tmp/logs");
66 | THE_INSTANCE.outputHtmlFile = cl.getOptionValue(
67 | LogAnalyzerAppMain.OUTPUT_HTML_FILE, "/tmp/log_stats.html");
68 | THE_INSTANCE.checkpointDirectory = cl.getOptionValue(
69 | LogAnalyzerAppMain.CHECKPOINT_DIRECTORY, "/tmp/log-analyzer-streaming");
70 | THE_INSTANCE.indexHtmlTemplate = cl.getOptionValue(
71 | LogAnalyzerAppMain.INDEX_HTML_TEMPLATE,
72 | "./src/main/resources/index.html.template");
73 | THE_INSTANCE.outputDirectory = cl.getOptionValue(
74 | LogAnalyzerAppMain.OUTPUT_DIRECTORY, "/tmp/pandaout");
75 | THE_INSTANCE.initialized = true;
76 | } catch (ParseException e) {
77 | THE_INSTANCE.initialized = false;
78 | System.err.println("Parsing failed. Reason: " + e.getMessage());
79 | }
80 | }
81 | }
82 |
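The intended call sequence is to parse the flags once at startup and then read them anywhere through the singleton; a minimal sketch (the options and args names mirror LogAnalyzerAppMain below):

    // parse once, then read via the singleton
    Flags.setFromCommandLineArgs(options, args);
    Duration window = Flags.getInstance().getWindowLength();
    String logsDir = Flags.getInstance().getLogsDirectory();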
--------------------------------------------------------------------------------
/src/main/java/com/oreilly/learningsparkexamples/java/logs/Functions.java:
--------------------------------------------------------------------------------
1 | package com.oreilly.learningsparkexamples.java.logs;
2 |
3 | import com.google.common.base.Optional;
4 | import com.google.common.collect.Ordering;
5 | import org.apache.spark.api.java.JavaDoubleRDD;
6 | import org.apache.spark.api.java.JavaPairRDD;
7 | import org.apache.spark.api.java.JavaRDD;
8 | import org.apache.spark.api.java.function.Function;
9 | import org.apache.spark.api.java.function.DoubleFunction;
10 | import org.apache.spark.api.java.function.Function2;
11 | import org.apache.spark.api.java.function.PairFunction;
12 | import scala.Tuple2;
13 | import scala.Tuple4;
14 |
15 | import javax.annotation.Nullable;
16 | import java.io.Serializable;
17 | import java.util.Comparator;
18 | import java.util.List;
19 |
20 | public class Functions {
21 | public static final class LongSumReducer implements Function2<Long, Long, Long> {
22 | @Override
23 | public Long call(Long a, Long b) {
24 | return a + b;
25 | }
26 | };
27 |
28 | public static final class SumReducer implements Function2<Double, Double, Double> {
29 | @Override
30 | public Double call(Double a, Double b) {
31 | return a + b;
32 | }
33 | };
34 |
35 |
36 | public static final class ValueComparator<K, V>
37 | implements Comparator<Tuple2<K, V>>, Serializable {
38 | private Comparator<V> comparator;
39 |
40 | public ValueComparator(Comparator<V> comparator) {
41 | this.comparator = comparator;
42 | }
43 |
44 | @Override
45 | public int compare(Tuple2<K, V> o1, Tuple2<K, V> o2) {
46 | return comparator.compare(o1._2(), o2._2());
47 | }
48 | }
49 |
50 | public static final class ComputeRunningSum implements Function2<List<Long>, Optional<Long>, Optional<Long>> {
51 | @Override
52 | public Optional<Long> call(List<Long> nums, Optional<Long> current) {
53 | long sum = current.or(0L);
54 | for (long i : nums) {
55 | sum += i;
56 | }
57 | return Optional.of(sum);
58 | }
59 | };
60 |
61 | public static final class GetContentSize implements DoubleFunction<ApacheAccessLog> {
62 | @Override
63 | public double call(ApacheAccessLog log) {
64 | return new Long(log.getContentSize()).doubleValue();
65 | }
66 | }
67 |
68 | public static final @Nullable Tuple4<Long, Long, Long, Long> contentSizeStats(
69 | JavaRDD<ApacheAccessLog> accessLogRDD) {
70 | JavaDoubleRDD contentSizes =
71 | accessLogRDD.mapToDouble(new GetContentSize()).cache();
72 | long count = contentSizes.count();
73 | if (count == 0) {
74 | return null;
75 | }
76 | Object ordering = Ordering.natural();
77 | final Comparator<Double> cmp = (Comparator<Double>)ordering;
78 |
79 | return new Tuple4<>(count,
80 | contentSizes.reduce(new SumReducer()).longValue(),
81 | contentSizes.min(cmp).longValue(),
82 | contentSizes.max(cmp).longValue());
83 | }
84 |
85 | public static final class ResponseCodeTuple implements PairFunction<ApacheAccessLog, Integer, Long> {
86 | @Override
87 | public Tuple2<Integer, Long> call(ApacheAccessLog log) {
88 | return new Tuple2<>(log.getResponseCode(), 1L);
89 | }
90 | }
91 |
92 | public static final JavaPairRDD<Integer, Long> responseCodeCount(
93 | JavaRDD<ApacheAccessLog> accessLogRDD) {
94 | return accessLogRDD
95 | .mapToPair(new ResponseCodeTuple())
96 | .reduceByKey(new LongSumReducer());
97 | }
98 |
99 | public static final class IpTuple implements PairFunction<ApacheAccessLog, String, Long> {
100 | @Override
101 | public Tuple2<String, Long> call(ApacheAccessLog log) {
102 | return new Tuple2<>(log.getIpAddress(), 1L);
103 | }
104 | }
105 |
106 | public static final class IpContentTuple implements PairFunction<ApacheAccessLog, String, Long> {
107 | @Override
108 | public Tuple2<String, Long> call(ApacheAccessLog log) {
109 | return new Tuple2<>(log.getIpAddress(), log.getContentSize());
110 | }
111 | }
112 |
113 |
114 | public static final class EndPointTuple implements PairFunction<ApacheAccessLog, String, Long> {
115 | @Override
116 | public Tuple2<String, Long> call(ApacheAccessLog log) {
117 | return new Tuple2<>(log.getEndpoint(), 1L);
118 | }
119 | }
120 |
121 |
122 | public static final class IpCountGreaterThan10 implements Function<Tuple2<String, Long>, Boolean> {
123 | @Override
124 | public Boolean call(Tuple2<String, Long> e) {
125 | return e._2() > 10;
126 | }
127 | }
128 |
129 | public static final class ParseFromLogLine implements Function<String, ApacheAccessLog> {
130 | @Override
131 | public ApacheAccessLog call(String line) {
132 | return ApacheAccessLog.parseFromLogLine(line);
133 | }
134 | }
135 | public static final JavaPairRDD<String, Long> ipAddressCount(
136 | JavaRDD<ApacheAccessLog> accessLogRDD) {
137 | return accessLogRDD
138 | .mapToPair(new IpTuple())
139 | .reduceByKey(new LongSumReducer());
140 | }
141 |
142 | public static final JavaRDD<String> filterIPAddress(
143 | JavaPairRDD<String, Long> ipAddressCount) {
144 | return ipAddressCount
145 | .filter(new IpCountGreaterThan10())
146 | .keys();
147 | }
148 |
149 | public static final JavaPairRDD<String, Long> endpointCount(
150 | JavaRDD<ApacheAccessLog> accessLogRDD) {
151 | return accessLogRDD
152 | .mapToPair(new EndPointTuple())
153 | .reduceByKey(new LongSumReducer());
154 | }
155 | }
156 |
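A hedged sketch of how ComputeRunningSum is meant to be wired up with updateStateByKey to keep a running response-code count across batches; it assumes a JavaDStream<ApacheAccessLog> such as accessLogsDStream in LogAnalyzerAppMain, and that checkpointing is enabled as that class does:

    // (response code, 1L) pairs per batch, folded into a running total per key
    JavaPairDStream<Integer, Long> runningCounts = accessLogsDStream
        .mapToPair(new Functions.ResponseCodeTuple())
        .updateStateByKey(new Functions.ComputeRunningSum());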
--------------------------------------------------------------------------------
/src/main/java/com/oreilly/learningsparkexamples/java/logs/LogAnalyzerAppMain.java:
--------------------------------------------------------------------------------
1 | package com.oreilly.learningsparkexamples.java.logs;
2 |
3 | import org.apache.commons.cli.Option;
4 | import org.apache.commons.cli.Options;
5 | import org.apache.spark.SparkConf;
6 | import org.apache.spark.api.java.JavaSparkContext;
7 | import org.apache.spark.api.java.JavaRDD;
8 | import org.apache.spark.streaming.api.java.JavaDStream;
9 | import org.apache.spark.streaming.api.java.JavaStreamingContext;
10 | import org.apache.spark.api.java.function.Function;
11 |
12 | import java.io.IOException;
13 |
14 | /**
15 | * The LogAnalyzerAppMain is a sample log-analysis application. For now,
16 | * it is a simple minimum viable product:
17 | * - Reads in new log files from a directory and feeds those new files into streaming.
18 | * - Computes stats for all of time as well as the last time interval based on those logs.
19 | * - Writes the calculated stats to a text file on the local file system
20 | * that gets refreshed every time interval.
21 | *
22 | * Once you get this program up and running, feed apache access log files
23 | * into the local directory of your choosing.
24 | *
25 | * Then open your output text file, perhaps in a web browser, and refresh
26 | * that page to see more stats come in.
27 | *
28 | * Modify the command line flags to the values of your choosing.
29 | * Notice how they come after you specify the jar when using spark-submit.
30 | *
31 | * Example command to run:
32 | * % ${YOUR_SPARK_HOME}/bin/spark-submit
33 | * --class "com.oreilly.learningsparkexamples.java.logs.LogAnalyzerAppMain"
34 | * --master local[4]
35 | * target/uber-log-analyzer-1.0.jar
36 | * --logs_directory /tmp/logs
37 | * --output_html_file /tmp/log_stats.html
38 | * --index_html_template ./src/main/resources/index.html.template
39 | * --output_directory /tmp/pandaout
40 | */
41 | public class LogAnalyzerAppMain {
42 | public static final String WINDOW_LENGTH = "window_length";
43 | public static final String SLIDE_INTERVAL = "slide_interval";
44 | public static final String LOGS_DIRECTORY = "logs_directory";
45 | public static final String OUTPUT_HTML_FILE = "output_html_file";
46 | public static final String CHECKPOINT_DIRECTORY = "checkpoint_directory";
47 | public static final String INDEX_HTML_TEMPLATE = "index_html_template";
48 | public static final String OUTPUT_DIRECTORY = "output_directory";
49 |
50 | private static final Options THE_OPTIONS = createOptions();
51 | private static Options createOptions() {
52 | Options options = new Options();
53 |
54 | options.addOption(
55 | new Option(WINDOW_LENGTH, false, "The window length in seconds"));
56 | options.addOption(
57 | new Option(SLIDE_INTERVAL, false, "The slide interval in seconds"));
58 | options.addOption(
59 | new Option(LOGS_DIRECTORY, true, "The directory where logs are written"));
60 | options.addOption(
61 | new Option(OUTPUT_HTML_FILE, false, "Where to write output html file"));
62 | options.addOption(
63 | new Option(CHECKPOINT_DIRECTORY, false, "The checkpoint directory."));
64 | options.addOption(new Option(INDEX_HTML_TEMPLATE, true,
65 | "path to the index.html.template file - accessible from all workers"));
66 | options.addOption(new Option(OUTPUT_DIRECTORY, false, "path to output DStreams to"));
67 |
68 | return options;
69 | }
70 |
71 | public static void main(String[] args) throws IOException {
72 | Flags.setFromCommandLineArgs(THE_OPTIONS, args);
73 |
74 | // Set up the Spark configuration.
75 | SparkConf conf = new SparkConf()
76 | .setAppName("A Databricks Reference Application: Logs Analysis with Spark");
77 | JavaStreamingContext jssc = new JavaStreamingContext(conf,
78 | Flags.getInstance().getSlideInterval());
79 |
80 | // Checkpointing must be enabled to use the updateStateByKey function & windowed operations.
81 | jssc.checkpoint(Flags.getInstance().getCheckpointDirectory());
82 |
83 | // This method monitors a directory for new files to read in for streaming.
84 | JavaDStream<String> logData = jssc.textFileStream(Flags.getInstance().getLogsDirectory());
85 |
86 | JavaDStream<ApacheAccessLog> accessLogsDStream
87 | = logData.map(new Functions.ParseFromLogLine()).cache();
88 |
89 | final LogAnalyzerTotal logAnalyzerTotal = new LogAnalyzerTotal();
90 | final LogAnalyzerWindowed logAnalyzerWindowed = new LogAnalyzerWindowed();
91 |
92 | // Process the DStream which gathers stats for all of time.
93 | logAnalyzerTotal.processAccessLogs(Flags.getInstance().getOutputDirectory(), accessLogsDStream);
94 |
95 | // Calculate statistics for the last time interval.
96 | logAnalyzerWindowed.processAccessLogs(Flags.getInstance().getOutputDirectory(), accessLogsDStream);
97 |
98 | // Render the output each time there is a new RDD in the accessLogsDStream.
99 | final Renderer renderer = new Renderer();
100 | accessLogsDStream.foreachRDD(new Function<JavaRDD<ApacheAccessLog>, Void>() {
101 | public Void call(JavaRDD<ApacheAccessLog> rdd) {
102 | // Call this to output the stats.
103 | try {
104 | renderer.render(logAnalyzerTotal.getLogStatistics(),
105 | logAnalyzerWindowed.getLogStatistics());
106 | } catch (Exception e) { // ignore render failures; stats are rendered again on the next batch
107 | }
108 | return null;
109 | }
110 | });
111 |
112 | // Start the streaming server.
113 | jssc.start(); // Start the computation
114 | jssc.awaitTermination(); // Wait for the computation to terminate
115 | }
116 | }
117 |
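The flags above (logs_directory, window_length, and so on) are parsed from the spark-submit arguments by Flags.java in this package. As a rough standalone sketch of how Apache Commons CLI (the library imported at the top of this file) turns those arguments into values, assuming a GnuParser and made-up defaults rather than whatever Flags.java actually uses:

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;

public class FlagParsingSketch {
  public static void main(String[] args) throws ParseException {
    Options options = new Options();
    options.addOption(new Option("logs_directory", true, "The directory where logs are written"));
    options.addOption(new Option("window_length", true, "The window length in seconds"));

    // e.g. args = {"--logs_directory", "/tmp/logs", "--window_length", "30"}
    CommandLine cl = new GnuParser().parse(options, args);
    String logsDirectory = cl.getOptionValue("logs_directory", "/tmp/logs");
    long windowLengthSeconds = Long.parseLong(cl.getOptionValue("window_length", "30"));

    System.out.println("reading logs from " + logsDirectory
        + ", window = " + windowLengthSeconds + "s");
  }
}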
--------------------------------------------------------------------------------
/src/main/java/com/oreilly/learningsparkexamples/java/logs/LogAnalyzerWindowed.java:
--------------------------------------------------------------------------------
1 | package com.oreilly.learningsparkexamples.java.logs;
2 |
3 | import com.google.common.collect.Ordering;
4 | import org.apache.spark.api.java.JavaPairRDD;
5 | import org.apache.spark.api.java.JavaRDD;
6 | import org.apache.spark.streaming.api.java.JavaDStream;
7 | import org.apache.spark.streaming.api.java.JavaPairDStream;
8 | import org.apache.spark.api.java.function.Function;
9 | import org.apache.spark.api.java.function.Function2;
10 | import org.apache.spark.api.java.function.PairFunction;
11 | import scala.Tuple2;
12 | import scala.Tuple4;
13 |
14 | import java.io.Serializable;
15 | import java.util.Comparator;
16 | import java.util.List;
17 |
18 | public class LogAnalyzerWindowed implements Serializable {
19 | private LogStatistics logStatistics;
20 |
21 | public void processAccessLogs(String outDir, JavaDStream<ApacheAccessLog> accessLogsDStream) {
22 | JavaDStream<ApacheAccessLog> windowDStream = accessLogsDStream.window(
23 | Flags.getInstance().getWindowLength(),
24 | Flags.getInstance().getSlideInterval());
25 | JavaDStream<String> ip = accessLogsDStream.map(
26 | new Function<ApacheAccessLog, String>() {
27 | public String call(ApacheAccessLog entry) {
28 | return entry.getIpAddress();
29 | }});
30 | // reduceByWindow
31 | JavaDStream<Long> requestCountRBW = accessLogsDStream.map(new Function<ApacheAccessLog, Long>() {
32 | public Long call(ApacheAccessLog entry) {
33 | return 1L;
34 | }}).reduceByWindow(new Function2<Long, Long, Long>() {
35 | public Long call(Long v1, Long v2) {
36 | return v1+v2;
37 | }}, new Function2<Long, Long, Long>() {
38 | public Long call(Long v1, Long v2) {
39 | return v1-v2;
40 | }}, Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval());
41 | requestCountRBW.print();
42 | // reducebykeyandwindow
43 | JavaPairDStream<String, Long> ipAddressPairDStream = accessLogsDStream.mapToPair(
44 | new PairFunction<ApacheAccessLog, String, Long>() {
45 | public Tuple2<String, Long> call(ApacheAccessLog entry) {
46 | return new Tuple2(entry.getIpAddress(), 1L);
47 | }});
48 | JavaPairDStream<String, Long> ipCountDStream = ipAddressPairDStream.reduceByKeyAndWindow(
49 | // Adding elements in the new slice
50 | new Function2<Long, Long, Long>() {
51 | public Long call(Long v1, Long v2) {
52 | return v1+v2;
53 | }},
54 | // Removing elements from the oldest slice
55 | new Function2<Long, Long, Long>() {
56 | public Long call(Long v1, Long v2) {
57 | return v1-v2;
58 | }},
59 | Flags.getInstance().getWindowLength(),
60 | Flags.getInstance().getSlideInterval());
61 | ipCountDStream.print();
62 | // Use countByWindow
63 | JavaDStream<Long> requestCount = accessLogsDStream.countByWindow(
64 | Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval());
65 | JavaPairDStream<String, Long> ipAddressRequestCount = ip.countByValueAndWindow(
66 | Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval());
67 | requestCount.print();
68 | ipAddressRequestCount.print();
69 |
70 | // use a transform for the response code count
71 | JavaPairDStream<Integer, Long> responseCodeCountTransform = accessLogsDStream.transformToPair(
72 | new Function<JavaRDD<ApacheAccessLog>, JavaPairRDD<Integer, Long>>() {
73 | public JavaPairRDD<Integer, Long> call(JavaRDD<ApacheAccessLog> logs) {
74 | return Functions.responseCodeCount(logs);
75 | }
76 | });
77 | windowDStream.foreachRDD(new Function<JavaRDD<ApacheAccessLog>, Void>() {
78 | public Void call(JavaRDD<ApacheAccessLog> accessLogs) {
79 | Tuple4<Long, Long, Long, Long> contentSizeStats =
80 | Functions.contentSizeStats(accessLogs);
81 |
82 | List<Tuple2<Integer, Long>> responseCodeToCount =
83 | Functions.responseCodeCount(accessLogs)
84 | .take(100);
85 |
86 | JavaPairRDD<String, Long> ipAddressCounts =
87 | Functions.ipAddressCount(accessLogs);
88 | List<String> ip = Functions.filterIPAddress(ipAddressCounts)
89 | .take(100);
90 |
91 | Object ordering = Ordering.natural();
92 | Comparator<Long> cmp = (Comparator<Long>)ordering;
93 | List<Tuple2<String, Long>> topEndpoints =
94 | Functions.endpointCount(accessLogs)
95 | .top(10, new Functions.ValueComparator<String, Long>(cmp));
96 |
97 | logStatistics = new LogStatistics(contentSizeStats, responseCodeToCount,
98 | ip, topEndpoints);
99 | return null;
100 | }});
101 | }
102 |
103 | public LogStatistics getLogStatistics() {
104 | return logStatistics;
105 | }
106 | }
107 |
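The pairs of Function2s passed to reduceByWindow and reduceByKeyAndWindow above are an add function and its inverse: when the window slides, Spark adds the counts from the newest batch and subtracts the counts from the batch that just dropped out of the window, instead of re-reducing every batch still inside it. Below is a Spark-free sketch of that arithmetic, with invented per-batch counts for a single key:

public class WindowedCountSketch {
  public static void main(String[] args) {
    // Request counts per batch for a single key (say, one IP address), oldest first.
    long[] batchCounts = {4L, 2L, 7L, 1L, 3L};
    int windowLengthInBatches = 3;  // window length divided by slide interval

    long windowTotal = 0L;
    for (int i = 0; i < batchCounts.length; i++) {
      windowTotal += batchCounts[i];  // the "add" function: fold in the newest slice
      if (i >= windowLengthInBatches) {
        // the "subtract" (inverse) function: remove the slice that left the window
        windowTotal -= batchCounts[i - windowLengthInBatches];
      }
      System.out.println("after batch " + i + ": window total = " + windowTotal);
    }
  }
}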
--------------------------------------------------------------------------------
/src/main/java/com/oreilly/learningsparkexamples/java/logs/LogStatistics.java:
--------------------------------------------------------------------------------
1 | package com.oreilly.learningsparkexamples.java.logs;
2 |
3 | import scala.Tuple2;
4 | import scala.Tuple4;
5 |
6 | import java.io.Serializable;
7 | import java.util.ArrayList;
8 | import java.util.HashMap;
9 | import java.util.List;
10 | import java.util.Map;
11 |
12 | public class LogStatistics implements Serializable {
13 | public final static LogStatistics EMPTY_LOG_STATISTICS =
14 | new LogStatistics(new Tuple4<>(0L, 0L, 0L, 0L), new ArrayList<Tuple2<Integer, Long>>(),
15 | new ArrayList<String>(), new ArrayList<Tuple2<String, Long>>());
16 |
17 | private Tuple4<Long, Long, Long, Long> contentSizeStats;
18 | private List