├── LICENSE.txt ├── 9781484214800.jpg ├── prosparkstreaming-master ├── Chap10 │ ├── project │ │ ├── assembly.sbt │ │ └── plugins.sbt │ ├── src │ │ └── main │ │ │ ├── resources │ │ │ └── log4j.properties │ │ │ ├── scala │ │ │ └── org │ │ │ │ └── apress │ │ │ │ └── prospark │ │ │ │ ├── L10-2DataProc.scala │ │ │ │ └── L10-9Graph.scala │ │ │ └── java │ │ │ └── org │ │ │ └── apress │ │ │ └── prospark │ │ │ ├── AbstractDriver.java │ │ │ └── SocketDriver.java │ ├── yelp_pyspark.py │ └── spark.sbt ├── Chap2 │ ├── project │ │ ├── assembly.sbt │ │ └── plugins.sbt │ ├── spark.sbt │ └── src │ │ └── main │ │ └── scala │ │ └── org │ │ └── apress │ │ └── prospark │ │ ├── T2-6Accumulator.scala │ │ └── L2-1FirstApp.scala ├── Chap3 │ ├── project │ │ ├── assembly.sbt │ │ └── plugins.sbt │ ├── touch_files_window.sh │ ├── spark.sbt │ └── src │ │ └── main │ │ └── scala │ │ └── org │ │ └── apress │ │ └── prospark │ │ ├── L3-DStreamVariation.scala │ │ ├── L3-1DStreams.scala │ │ ├── L3-DStreamAggregation.scala │ │ ├── L3-DStreamMapping.scala │ │ └── L3-DStreamWindowAndAction.scala ├── Chap4 │ ├── project │ │ ├── assembly.sbt │ │ └── plugins.sbt │ ├── spark.sbt │ └── src │ │ └── main │ │ └── scala │ │ └── org │ │ └── apress │ │ └── prospark │ │ ├── L4-1Voyager.scala │ │ ├── L4-4Kryo.scala │ │ └── L4-3ProtonFlux.scala ├── Chap5 │ ├── project │ │ ├── assembly.sbt │ │ └── plugins.sbt │ ├── flumeConf │ │ ├── log4j.properties │ │ ├── flumeTest.conf │ │ ├── flumePush.conf │ │ └── flumePull.conf │ ├── src │ │ └── main │ │ │ ├── resources │ │ │ └── log4j.properties │ │ │ ├── scala │ │ │ └── org │ │ │ │ └── apress │ │ │ │ └── prospark │ │ │ │ ├── L5-6SocketStream.scala │ │ │ │ ├── L5-7MultipleSocketStreams.scala │ │ │ │ ├── L5-9Mqtt.scala │ │ │ │ ├── L5-16Twitter.scala │ │ │ │ ├── L5-11FlumePush.scala │ │ │ │ ├── L5-11FlumePull.scala │ │ │ │ ├── L5-13Kafka.scala │ │ │ │ ├── L5-15KafkaDirect.scala │ │ │ │ ├── L5-18Http.scala │ │ │ │ ├── L5-14KafkaCustomConf.scala │ │ │ │ ├── HttpInputDStreamAsync.scala │ │ │ │ └── HttpInputDStream.scala │ │ │ └── java │ │ │ └── org │ │ │ └── apress │ │ │ └── prospark │ │ │ ├── KafkaDriver.java │ │ │ ├── AbstractDriver.java │ │ │ └── MqttDriver.java │ └── spark.sbt ├── Chap6 │ ├── project │ │ ├── assembly.sbt │ │ └── plugins.sbt │ ├── src │ │ └── main │ │ │ ├── resources │ │ │ └── log4j.properties │ │ │ ├── java │ │ │ └── org │ │ │ │ └── apress │ │ │ │ └── prospark │ │ │ │ ├── AbstractDriver.java │ │ │ │ └── MqttDriver.java │ │ │ └── scala │ │ │ └── org │ │ │ └── apress │ │ │ └── prospark │ │ │ ├── L6-5Exception.scala │ │ │ ├── L6-7PerPartition.scala │ │ │ ├── L6-6PerRecord.scala │ │ │ ├── L6-8Static.scala │ │ │ ├── L6-23UpdateState.scala │ │ │ ├── L6-16SparkHBase.scala │ │ │ ├── L6-22Counters.scala │ │ │ ├── HttpInputDStream.scala │ │ │ ├── L6-12StaticPool.scala │ │ │ ├── L6-20CassandraConnector.scala │ │ │ ├── L6-14HBase.scala │ │ │ ├── L6-26Redis.scala │ │ │ ├── L6-18Cassandra.scala │ │ │ ├── L6-24Accumulators.scala │ │ │ └── L6-10LazyStatic.scala │ └── spark.sbt ├── Chap7 │ ├── project │ │ ├── assembly.sbt │ │ └── plugins.sbt │ ├── src │ │ └── main │ │ │ ├── resources │ │ │ └── log4j.properties │ │ │ ├── scala │ │ │ └── org │ │ │ │ └── apress │ │ │ │ └── prospark │ │ │ │ ├── L7-2-3Tachyon.scala │ │ │ │ └── L7-4UI.scala │ │ │ └── java │ │ │ └── org │ │ │ └── apress │ │ │ └── prospark │ │ │ └── AbstractDriver.java │ └── spark.sbt ├── Chap8 │ ├── project │ │ ├── assembly.sbt │ │ └── plugins.sbt │ ├── src │ │ └── main │ │ │ ├── resources │ │ │ └── log4j.properties │ │ │ ├── scala │ │ │ └── 
org │ │ │ │ └── apress │ │ │ │ └── prospark │ │ │ │ ├── L8-4DataFrameCreationSchema.scala │ │ │ │ ├── L8-1DataFrameAPI.scala │ │ │ │ ├── L8-8Sql.scala │ │ │ │ ├── L8-13HiveQL.scala │ │ │ │ ├── L8-28DataFrameExamplesOps.scala │ │ │ │ ├── L8-3-6-7DataFrameCreation.scala │ │ │ │ ├── L8-38SparkR.scala │ │ │ │ ├── L8-35DataFrameExamplesRDD.scala │ │ │ │ ├── T8-3DataFrameExamplesNA.scala │ │ │ │ ├── L8-29DataFrameExamplesJoin.scala │ │ │ │ ├── L8-10-11UDF.scala │ │ │ │ ├── T8-5-L8-30-34DataFrameExamplesActions.scala │ │ │ │ └── L8-14-27DataFrameExamples.scala │ │ │ └── java │ │ │ └── org │ │ │ └── apress │ │ │ └── prospark │ │ │ └── AbstractDriver.java │ ├── spark.sbt │ ├── L8-36CdrSparkRApp.R │ ├── L8-39CdrStreamingSparkRApp.R │ ├── cdrschema.json │ └── cdrschema2.json ├── Chap9 │ ├── project │ │ ├── assembly.sbt │ │ └── plugins.sbt │ ├── src │ │ └── main │ │ │ ├── resources │ │ │ └── log4j.properties │ │ │ ├── scala │ │ │ └── org │ │ │ │ └── apress │ │ │ │ └── prospark │ │ │ │ ├── L9-14FPMining.scala │ │ │ │ ├── L9-6Preprocessing.scala │ │ │ │ ├── L9-5ChiSq.scala │ │ │ │ ├── L9-13FPMiningPreprocessing.scala │ │ │ │ ├── L9-11CollabFilteringPreprocessing.scala │ │ │ │ ├── L9-7FeatureExtraction.scala │ │ │ │ ├── L9-12CollabFiltering.scala │ │ │ │ ├── L9-4Correlation.scala │ │ │ │ ├── L9-3Statistics.scala │ │ │ │ ├── L9-8PCA.scala │ │ │ │ ├── L9-1LinearRegression.scala │ │ │ │ ├── L9-10KMeans.scala │ │ │ │ ├── L9-9LogisticRegression.scala │ │ │ │ ├── L9-15MLPipeline.scala │ │ │ │ ├── T9-4DataTypes.scala │ │ │ │ └── L9-17MLCrossValidation.scala │ │ │ └── java │ │ │ └── org │ │ │ └── apress │ │ │ └── prospark │ │ │ └── AbstractDriver.java │ └── spark.sbt └── README.md ├── README.md └── contributing.md /LICENSE.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/pro-spark-streaming/HEAD/LICENSE.txt -------------------------------------------------------------------------------- /9781484214800.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/pro-spark-streaming/HEAD/9781484214800.jpg -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap10/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 2 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap2/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 2 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap3/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 2 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap4/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 2 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap5/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % 
"0.11.2") 2 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap6/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 2 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap7/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 2 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap8/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 2 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap9/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 2 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap2/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += Classpaths.typesafeResolver 2 | 3 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.4.0") -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap3/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += Classpaths.typesafeResolver 2 | 3 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.4.0") 4 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap4/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += Classpaths.typesafeResolver 2 | 3 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.4.0") 4 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap6/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += Classpaths.typesafeResolver 2 | 3 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.4.0") 4 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap7/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += Classpaths.typesafeResolver 2 | 3 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.4.0") 4 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap8/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += Classpaths.typesafeResolver 2 | 3 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.4.0") 4 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap9/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += Classpaths.typesafeResolver 2 | 3 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.4.0") 4 | 
-------------------------------------------------------------------------------- /prosparkstreaming-master/Chap5/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += Classpaths.typesafeResolver 2 | 3 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.4.0") 4 | 5 | addSbtPlugin("org.scala-sbt.plugins" % "sbt-onejar" % "0.8") 6 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap3/touch_files_window.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | for i in `seq 1 10`; 3 | do 4 | p=/Users/zubairnabi/Downloads/dummy/${i}.gz 5 | echo ${p} 6 | touch -c ${p} 7 | sleep 1 8 | done 9 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap10/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += Classpaths.typesafeResolver 2 | 3 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.4.0") 4 | 5 | addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.8.2") 6 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap10/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO, stdout 2 | log4j.rootCategory=INFO, stdout 3 | 4 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 5 | log4j.appender.stdout.Target=System.out 6 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 7 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 8 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap7/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO, stdout 2 | log4j.rootCategory=INFO, stdout 3 | 4 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 5 | log4j.appender.stdout.Target=System.out 6 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 7 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 8 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap8/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO, stdout 2 | log4j.rootCategory=INFO, stdout 3 | 4 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 5 | log4j.appender.stdout.Target=System.out 6 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 7 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 8 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap9/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO, stdout 2 | log4j.rootCategory=INFO, stdout 3 | 4 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 5 | log4j.appender.stdout.Target=System.out 6 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 7 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 8 | 
-------------------------------------------------------------------------------- /prosparkstreaming-master/Chap5/flumeConf/log4j.properties: -------------------------------------------------------------------------------- 1 | # Root logger option 2 | log4j.rootLogger=INFO, stdout 3 | 4 | # Direct log messages to stdout 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.Target=System.out 7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 9 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap2/spark.sbt: -------------------------------------------------------------------------------- 1 | import AssemblyKeys._ 2 | 3 | assemblySettings 4 | 5 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { mergeStrategy => { 6 | case entry => { 7 | val strategy = mergeStrategy(entry) 8 | if (strategy == MergeStrategy.deduplicate) MergeStrategy.first 9 | else strategy 10 | } 11 | }} 12 | 13 | name := "Chap2" 14 | 15 | version := "1.0" 16 | 17 | scalaVersion := "2.10.5" 18 | 19 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.4.0" 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Apress Source Code 2 | 3 | This repository accompanies [*Pro Spark Streaming*](http://www.apress.com/9781484214800) by Zubair Nabi (Apress, 2016). 4 | 5 | ![Cover image](9781484214800.jpg) 6 | 7 | Download the files as a zip using the green button, or clone the repository to your machine using Git. 8 | 9 | ## Releases 10 | 11 | Release v1.0 corresponds to the code in the published book, without corrections or updates. 12 | 13 | ## Contributions 14 | 15 | See the file Contributing.md for more information on how you can contribute to this repository. 
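For command-line users, the clone step mentioned above can be done as follows; the repository URL matches the raw file links earlier in this repository, but confirm it against the repository page before cloning:

```bash
# Clone the book's source code and inspect the per-chapter folders
git clone https://github.com/Apress/pro-spark-streaming.git
cd pro-spark-streaming
ls prosparkstreaming-master
```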
16 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap7/spark.sbt: -------------------------------------------------------------------------------- 1 | import AssemblyKeys._ 2 | 3 | assemblySettings 4 | 5 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { mergeStrategy => { 6 | case entry => { 7 | val strategy = mergeStrategy(entry) 8 | if (strategy == MergeStrategy.deduplicate) MergeStrategy.first 9 | else strategy 10 | } 11 | }} 12 | 13 | name := "Chap7" 14 | 15 | version := "1.0" 16 | 17 | scalaVersion := "2.10.5" 18 | 19 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.4.0" 20 | 21 | libraryDependencies += "org.apache.spark" %% "spark-streaming" % "1.4.0" 22 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap5/flumeConf/flumeTest.conf: -------------------------------------------------------------------------------- 1 | # Name the components on this agent 2 | a1.sources = r1 3 | a1.sinks = k1 4 | a1.channels = c1 5 | 6 | # Describe/configure the source 7 | a1.sources.r1.type = netcat 8 | a1.sources.r1.bind = localhost 9 | a1.sources.r1.port = 44444 10 | 11 | # Describe the sink 12 | a1.sinks.k1.type = logger 13 | 14 | # Use a channel which buffers events in memory 15 | a1.channels.c1.type = memory 16 | a1.channels.c1.capacity = 1000 17 | a1.channels.c1.transactionCapacity = 100 18 | 19 | # Bind the source and sink to the channel 20 | a1.sources.r1.channels = c1 21 | a1.sinks.k1.channel = c1 22 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap3/spark.sbt: -------------------------------------------------------------------------------- 1 | import AssemblyKeys._ 2 | 3 | assemblySettings 4 | 5 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { mergeStrategy => { 6 | case entry => { 7 | val strategy = mergeStrategy(entry) 8 | if (strategy == MergeStrategy.deduplicate) MergeStrategy.first 9 | else strategy 10 | } 11 | }} 12 | 13 | name := "Chap3" 14 | 15 | version := "1.0" 16 | 17 | scalaVersion := "2.10.5" 18 | 19 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.4.0" 20 | 21 | libraryDependencies += "org.apache.spark" %% "spark-streaming" % "1.4.0" 22 | 23 | libraryDependencies += "org.json4s" %% "json4s-native" % "3.2.10" 24 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap4/spark.sbt: -------------------------------------------------------------------------------- 1 | import AssemblyKeys._ 2 | 3 | assemblySettings 4 | 5 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { mergeStrategy => { 6 | case entry => { 7 | val strategy = mergeStrategy(entry) 8 | if (strategy == MergeStrategy.deduplicate) MergeStrategy.first 9 | else strategy 10 | } 11 | }} 12 | 13 | name := "Chap4" 14 | 15 | version := "1.0" 16 | 17 | scalaVersion := "2.10.5" 18 | 19 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.4.0" 20 | 21 | libraryDependencies += "org.apache.spark" %% "spark-streaming" % "1.4.0" 22 | 23 | libraryDependencies += "org.json4s" %% "json4s-native" % "3.2.10" 24 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap5/flumeConf/flumePush.conf: -------------------------------------------------------------------------------- 1 | # components on this agent 2 | a1.sources = src-1 3 | a1.sinks = snk-1 4 | a1.channels = 
ch-1 5 | 6 | # source 7 | a1.sources.src-1.type = spooldir 8 | a1.sources.src-1.channels = ch-1 9 | a1.sources.src-1.spoolDir = /Users/zubairnabi/Downloads/nyc_bikes 10 | 11 | # sink 12 | a1.sinks.snk-1.type = avro 13 | a1.sinks.snk-1.hostname = localhost 14 | a1.sinks.snk-1.port = 44444 15 | 16 | # channel 17 | a1.channels.ch-1.type = memory 18 | a1.channels.ch-1.capacity = 10000 19 | a1.channels.ch-1.transactionCapacity = 1000 20 | 21 | # bind source, sink, and channel 22 | a1.sources.src-1.channels = ch-1 23 | a1.sinks.snk-1.channel = ch-1 24 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap5/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO, FILE, stdout 2 | log4j.rootCategory=INFO, FILE, stdout 3 | 4 | log4j.logger.org.eclipse.jetty=WARN 5 | 6 | log4j.appender.FILE=org.apache.log4j.FileAppender 7 | 8 | log4j.appender.FILE.File=/tmp/spark.log 9 | 10 | log4j.appender.FILE.layout=org.apache.log4j.PatternLayout 11 | log4j.appender.FILE.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 12 | 13 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 14 | log4j.appender.stdout.Target=System.out 15 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 16 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 17 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap6/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO, FILE, stdout 2 | log4j.rootCategory=INFO, FILE, stdout 3 | 4 | log4j.logger.org.eclipse.jetty=WARN 5 | 6 | log4j.appender.FILE=org.apache.log4j.FileAppender 7 | 8 | log4j.appender.FILE.File=/tmp/spark.log 9 | 10 | log4j.appender.FILE.layout=org.apache.log4j.PatternLayout 11 | log4j.appender.FILE.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 12 | 13 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 14 | log4j.appender.stdout.Target=System.out 15 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 16 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 17 | -------------------------------------------------------------------------------- /contributing.md: -------------------------------------------------------------------------------- 1 | # Contributing to Apress Source Code 2 | 3 | Copyright for Apress source code belongs to the author(s). However, under fair use you are encouraged to fork and contribute minor corrections and updates for the benefit of the author(s) and other readers. 4 | 5 | ## How to Contribute 6 | 7 | 1. Make sure you have a GitHub account. 8 | 2. Fork the repository for the relevant book. 9 | 3. Create a new branch on which to make your change, e.g. 10 | `git checkout -b my_code_contribution` 11 | 4. Commit your change. Include a commit message describing the correction. Please note that if your commit message is not clear, the correction will not be accepted. 12 | 5. Submit a pull request. 13 | 14 | Thank you for your contribution! 
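As a concrete sketch of the workflow above (the fork URL, file path, and commit message are illustrative placeholders, not part of this repository's history):

```bash
# 1-2. Fork the book repository on GitHub, then clone your fork
git clone https://github.com/<your-username>/pro-spark-streaming.git
cd pro-spark-streaming

# 3. Create a branch for your change
git checkout -b my_code_contribution

# 4. Commit the correction with a clear, descriptive message
git add prosparkstreaming-master/Chap5/spark.sbt
git commit -m "Correct a dependency version in the Chap5 build"

# 5. Push the branch to your fork and open a pull request on GitHub
git push origin my_code_contribution
```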
-------------------------------------------------------------------------------- /prosparkstreaming-master/Chap5/flumeConf/flumePull.conf: -------------------------------------------------------------------------------- 1 | # components on this agent 2 | a1.sources = src-1 3 | a1.sinks = snk-1 4 | a1.channels = ch-1 5 | 6 | # source 7 | a1.sources.src-1.type = spooldir 8 | a1.sources.src-1.channels = ch-1 9 | a1.sources.src-1.spoolDir = /Users/zubairnabi/Downloads/nyc_bikes 10 | 11 | # sink 12 | a1.sinks.snk-1.type = org.apache.spark.streaming.flume.sink.SparkSink 13 | a1.sinks.snk-1.hostname = localhost 14 | a1.sinks.snk-1.port = 44444 15 | 16 | # channel 17 | a1.channels.ch-1.type = memory 18 | a1.channels.ch-1.capacity = 10000 19 | a1.channels.ch-1.transactionCapacity = 1000 20 | 21 | # bind source, sink, and channel 22 | a1.sources.src-1.channels = ch-1 23 | a1.sinks.snk-1.channel = ch-1 24 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap9/spark.sbt: -------------------------------------------------------------------------------- 1 | import AssemblyKeys._ 2 | 3 | assemblySettings 4 | 5 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { mergeStrategy => { 6 | case entry => { 7 | val strategy = mergeStrategy(entry) 8 | if (strategy == MergeStrategy.deduplicate) MergeStrategy.first 9 | else strategy 10 | } 11 | }} 12 | 13 | name := "Chap9" 14 | 15 | version := "1.0" 16 | 17 | scalaVersion := "2.10.5" 18 | 19 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.4.0" 20 | 21 | libraryDependencies += "org.apache.spark" %% "spark-streaming" % "1.4.0" 22 | 23 | libraryDependencies += "org.apache.spark" %% "spark-mllib" % "1.4.0" 24 | 25 | libraryDependencies += "org.json4s" %% "json4s-native" % "3.2.10" 26 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap8/spark.sbt: -------------------------------------------------------------------------------- 1 | import AssemblyKeys._ 2 | 3 | assemblySettings 4 | 5 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { mergeStrategy => { 6 | case entry => { 7 | val strategy = mergeStrategy(entry) 8 | if (strategy == MergeStrategy.deduplicate) MergeStrategy.first 9 | else strategy 10 | } 11 | }} 12 | 13 | name := "Chap8" 14 | 15 | version := "1.0" 16 | 17 | scalaVersion := "2.10.5" 18 | 19 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.4.0" 20 | 21 | libraryDependencies += "org.apache.spark" %% "spark-streaming" % "1.4.0" 22 | 23 | //libraryDependencies += "org.apache.spark" %% "spark-sql" % "1.4.0" 24 | 25 | libraryDependencies += "org.apache.spark" %% "spark-hive" % "1.4.0" 26 | 27 | libraryDependencies += "org.json4s" %% "json4s-native" % "3.2.10" 28 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap8/L8-36CdrSparkRApp.R: -------------------------------------------------------------------------------- 1 | args <- commandArgs(trailingOnly = TRUE) 2 | if(length(args) != 2) { 3 | stop("Usage: CdrSparkRApp ") 4 | } 5 | library(SparkR) 6 | Sys.setenv('SPARKR_SUBMIT_ARGS'='"--packages" "com.databricks:spark-csv_2.10:1.3.0" "sparkr-shell"') 7 | sc <- sparkR.init(master = args[1]) 8 | sqlContext <- sparkRSQL.init(sc) 9 | df <- read.df(sqlContext, args[2], source = "com.databricks.spark.csv", inferSchema = "true", delimiter = "\t") 10 | cnames <- c("squareId", "timeInterval", "countryCode", "smsInActivity", "smsOutActivity", 
"callInActivity", "callOutActivity", "internetTrafficActivity") 11 | for (i in 1:NROW(cnames)) { 12 | df <- withColumnRenamed(df, paste0("C", i - 1), cnames[i]) 13 | } 14 | counts <- count(groupBy(df, "countryCode")) 15 | showDF(orderBy(counts, desc(counts$count)), numRows = 5) 16 | sparkR.stop() -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap8/L8-39CdrStreamingSparkRApp.R: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/Rscript 2 | args <- commandArgs(trailingOnly = TRUE) 3 | if(length(args) != 1) { 4 | stop("Usage: CdrStreamingSparkRApp ") 5 | } 6 | library(SparkR) 7 | sc <- sparkR.init(master = args[1]) 8 | hiveContext <- sparkRHive.init(sc) 9 | f <- file("stdin") 10 | open(f) 11 | while(length(tableName <- readLines(f, n = 1)) > 0) { 12 | tryCatch({ 13 | tableName <- trimws(tableName) 14 | write(paste0("Processing table: ", tableName), stderr()) 15 | df <- table(hiveContext, tableName) 16 | counts <- count(groupBy(df, "countryCode")) 17 | outputTable <- paste0(tableName, "processed") 18 | write(paste0("Output written to: ", outputTable), stderr()) 19 | saveAsTable(limit(orderBy(counts, desc(counts$count)), 5), outputTable, "parquet", "error") 20 | }, error = function(e) {stop(e)}) 21 | } 22 | close(f) 23 | sparkR.stop() -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap2/src/main/scala/org/apress/prospark/T2-6Accumulator.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.collection.mutable 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | 8 | object AccumulatorApp { 9 | def main(args: Array[String]) { 10 | if (args.length != 1) { 11 | System.err.println( 12 | "Usage: AccumulatorApp ") 13 | System.exit(1) 14 | } 15 | val Seq(appName) = args.toSeq 16 | 17 | val conf = new SparkConf() 18 | .setAppName(appName) 19 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 20 | .set("spark.eventLog.enabled", true.toString) 21 | .set("spark.eventLog.dir", "/tmp") 22 | val sc = new SparkContext(conf) 23 | val setAcc = sc.accumulableCollection(mutable.HashSet[Int]()) 24 | val d = sc.parallelize(1 to 100) 25 | d.foreach(x => setAcc += x) 26 | println(setAcc.value.size) 27 | } 28 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap8/cdrschema.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "struct", 3 | "fields": [ 4 | { 5 | "name": "squareId", 6 | "nullable": false, 7 | "type": "integer" 8 | }, 9 | { 10 | "name": "timeInterval", 11 | "nullable": false, 12 | "type": "long" 13 | }, 14 | { 15 | "name": "countryCode", 16 | "nullable": true, 17 | "type": "string" 18 | }, 19 | { 20 | "name": "smsInActivity", 21 | "nullable": true, 22 | "type": "float" 23 | }, 24 | { 25 | "name": "smsOutActivity", 26 | "nullable": true, 27 | "type": "float" 28 | }, 29 | { 30 | "name": "callInActivity", 31 | "nullable": true, 32 | "type": "float" 33 | }, 34 | { 35 | "name": "callOutActivity", 36 | "nullable": true, 37 | "type": "float" 38 | }, 39 | { 40 | "name": "internetTrafficActivity", 41 | "nullable": true, 42 | "type": "float" 43 | } 44 | ] 45 | } 46 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap8/cdrschema2.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "type": "struct", 3 | "fields": [ 4 | { 5 | "name": "squareId", 6 | "nullable": false, 7 | "type": "integer" 8 | }, 9 | { 10 | "name": "timeInterval", 11 | "nullable": false, 12 | "type": "long" 13 | }, 14 | { 15 | "name": "countryCode", 16 | "nullable": true, 17 | "type": "integer" 18 | }, 19 | { 20 | "name": "smsInActivity", 21 | "nullable": true, 22 | "type": "float" 23 | }, 24 | { 25 | "name": "smsOutActivity", 26 | "nullable": true, 27 | "type": "float" 28 | }, 29 | { 30 | "name": "callInActivity", 31 | "nullable": true, 32 | "type": "float" 33 | }, 34 | { 35 | "name": "callOutActivity", 36 | "nullable": true, 37 | "type": "float" 38 | }, 39 | { 40 | "name": "internetTrafficActivity", 41 | "nullable": true, 42 | "type": "float" 43 | } 44 | ] 45 | } 46 | -------------------------------------------------------------------------------- /prosparkstreaming-master/README.md: -------------------------------------------------------------------------------- 1 | # Pro Spark Streaming 2 | 3 | Code used in "Pro Spark Streaming: The Zen of Real-time Analytics using Apache Spark" published by Apress Publishing. 4 | 5 | ISBN-13: 978-1484214800 6 | 7 | ISBN-10: 1484214803 8 | 9 | # Layout 10 | 11 | Each folder contains code for a particular chapter. The repetition of code is deliberate. While this goes against most software engineering principles (held very dearly by the author as well), it is necessary to expound a topic and keep its implementation self-contained. 12 | 13 | ## Chapters 14 | 15 | - 2: Introduction to Spark 16 | - 3: DStreams: Real-time RDDs 17 | - 4: High Velocity Streams: Parallelism and Other Stories 18 | - 5: Real-time Route 66: Linking External Data Sources 19 | - 6: The Art of Side Effects 20 | - 7: Getting Ready for Prime Time 21 | - 8: Real-time ETL and Analytics Magic 22 | - 9: Machine Learning at Scale 23 | - 10: Of Clouds, Lambdas, and Pythons 24 | 25 | # Build 26 | 27 | Jump to a particular folder and simply execute `sbt assembly`. This will generate an uber JAR that can directly be submitted to a Spark cluster. 
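As an end-to-end illustration, building and submitting one of the Chapter 5 applications might look like the sketch below. The JAR path assumes sbt-assembly's default output location for this build (name `Chap5`, version `1.0`, Scala 2.10), and the master URL, hostname, and port are placeholders to adapt to your own cluster:

```bash
# Build the uber JAR for Chapter 5
cd prosparkstreaming-master/Chap5
sbt assembly

# Submit the socket-stream example; TripByYearApp expects <appname> <hostname> <port>
spark-submit \
  --class org.apress.prospark.TripByYearApp \
  --master spark://localhost:7077 \
  target/scala-2.10/Chap5-assembly-1.0.jar \
  TripByYear localhost 9999
```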
-------------------------------------------------------------------------------- /prosparkstreaming-master/Chap10/yelp_pyspark.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkContext 2 | from pyspark.streaming import StreamingContext 3 | from sys import argv, exit 4 | try: import simplejson as json 5 | except ImportError: import json 6 | 7 | if len(argv) != 5: 8 | print 'Usage: yelp_pyspark.py ' 9 | exit(-1) 10 | 11 | appname = argv[1] 12 | batch_interval = int(argv[2]) 13 | hostname = argv[3] 14 | port = int(argv[4]) 15 | 16 | sc = SparkContext(appName=appname) 17 | ssc = StreamingContext(sc, batch_interval) 18 | 19 | records = ssc.socketTextStream(hostname, port) 20 | json_records = records.map(lambda rec: json.loads(rec)) 21 | restaurant_records = json_records.filter(lambda rec: 'attributes' in rec and 'Wi-Fi' in rec['attributes']) 22 | wifi_pairs = restaurant_records.map(lambda rec: (rec['attributes']['Wi-Fi'], rec['stars'])) 23 | wifi_counts = wifi_pairs.combineByKey(lambda v: (v, 1), 24 | lambda x, value: (x[0] + value, x[1] + 1), 25 | lambda x, y: (x[0] + y[0], x[1] + y[1])) 26 | avg_stars = wifi_counts.map(lambda (key, (sum_, count)): (key, sum_ / count)) 27 | avg_stars.pprint() 28 | 29 | ssc.start() 30 | ssc.awaitTermination() 31 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap10/spark.sbt: -------------------------------------------------------------------------------- 1 | import AssemblyKeys._ 2 | 3 | assemblySettings 4 | 5 | net.virtualvoid.sbt.graph.DependencyGraphSettings.graphSettings 6 | 7 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { mergeStrategy => { 8 | case entry => { 9 | val strategy = mergeStrategy(entry) 10 | if (strategy == MergeStrategy.deduplicate) MergeStrategy.first 11 | else strategy 12 | } 13 | }} 14 | 15 | name := "Chap10" 16 | 17 | version := "1.0" 18 | 19 | scalaVersion := "2.10.5" 20 | 21 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.4.0" 22 | 23 | libraryDependencies += "org.apache.spark" %% "spark-streaming" % "1.4.0" 24 | 25 | libraryDependencies += "org.json4s" %% "json4s-native" % "3.2.10" 26 | 27 | libraryDependencies += "com.google.cloud.bigtable" % "bigtable-hbase-1.1" % "0.2.3" exclude("com.google.guava", "guava") 28 | 29 | libraryDependencies += "org.apache.hbase" % "hbase-server" % "1.1.2" 30 | 31 | libraryDependencies += "org.apache.hbase" % "hbase-common" % "1.1.2" 32 | 33 | libraryDependencies += "com.google.guava" % "guava" % "16.0" 34 | 35 | libraryDependencies += "org.mortbay.jetty.alpn" % "alpn-boot" % "8.1.6.v20151105" 36 | 37 | libraryDependencies += "com.google.cloud.bigdataoss" % "bigquery-connector" % "0.7.4-hadoop2" 38 | 39 | libraryDependencies += "org.apache.spark" %% "spark-graphx" % "1.4.0" 40 | 41 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap5/spark.sbt: -------------------------------------------------------------------------------- 1 | import AssemblyKeys._ 2 | 3 | assemblySettings 4 | 5 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { mergeStrategy => { 6 | case entry => { 7 | val strategy = mergeStrategy(entry) 8 | if (strategy == MergeStrategy.deduplicate) MergeStrategy.first 9 | else strategy 10 | } 11 | }} 12 | 13 | name := "Chap5" 14 | 15 | version := "1.0" 16 | 17 | scalaVersion := "2.10.5" 18 | 19 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.4.0" 20 | 21 
| libraryDependencies += "org.apache.spark" %% "spark-streaming" % "1.4.0" 22 | 23 | libraryDependencies += "org.json4s" %% "json4s-native" % "3.2.10" 24 | 25 | libraryDependencies += "org.apache.spark" %% "spark-streaming-mqtt" % "1.4.0" 26 | 27 | libraryDependencies += "org.eclipse.paho" % "org.eclipse.paho.client.mqttv3" % "1.0.1" 28 | 29 | libraryDependencies += "org.apache.spark" %% "spark-streaming-flume" % "1.4.0" 30 | 31 | libraryDependencies += "org.apache.spark" %% "spark-streaming-kafka" % "1.4.0" 32 | 33 | libraryDependencies += "org.apache.spark" %% "spark-streaming-twitter" % "1.4.0" 34 | 35 | libraryDependencies += "com.ning" % "async-http-client" % "1.9.31" 36 | 37 | libraryDependencies += "org.apache.httpcomponents" % "httpclient" % "4.5.1" 38 | 39 | resolvers += "MQTT Repository" at "https://repo.eclipse.org/content/repositories/paho-releases/" 40 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap9/src/main/scala/org/apress/prospark/L9-14FPMining.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.fpm.FPGrowth 6 | import org.apache.spark.streaming.Seconds 7 | import org.apache.spark.streaming.StreamingContext 8 | 9 | object FPMiningApp { 10 | 11 | def main(args: Array[String]) { 12 | if (args.length != 3) { 13 | System.err.println( 14 | "Usage: FPMiningApp ") 15 | System.exit(1) 16 | } 17 | val Seq(appName, batchInterval, iPath) = args.toSeq 18 | 19 | val conf = new SparkConf() 20 | .setAppName(appName) 21 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 22 | 23 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 24 | 25 | val minSupport = 0.4 26 | 27 | ssc.textFileStream(iPath) 28 | .map(r => r.split(" ")) 29 | .foreachRDD(transactionRDD => { 30 | val fpg = new FPGrowth() 31 | .setMinSupport(minSupport) 32 | val model = fpg.run(transactionRDD) 33 | 34 | model.freqItemsets 35 | .collect() 36 | .foreach(itemset => println("Items: %s, Frequency: %s".format(itemset.items.mkString(" "), itemset.freq))) 37 | }) 38 | 39 | ssc.start() 40 | ssc.awaitTermination() 41 | } 42 | 43 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap5/src/main/scala/org/apress/prospark/L5-6SocketStream.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | 6 | import org.apache.spark.streaming.{ Seconds, StreamingContext } 7 | import org.apache.spark.streaming.dstream.PairDStreamFunctions 8 | 9 | import java.util.Calendar 10 | 11 | object TripByYearApp { 12 | def main(args: Array[String]) { 13 | if (args.length != 3) { 14 | System.err.println( 15 | "Usage: TripByYearApp ") 16 | System.exit(1) 17 | } 18 | val Seq(appName, hostname, port) = args.toSeq 19 | 20 | val conf = new SparkConf() 21 | .setAppName(appName) 22 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 23 | 24 | val ssc = new StreamingContext(conf, Seconds(10)) 25 | 26 | ssc.socketTextStream(hostname, port.toInt) 27 | .map(rec => rec.split(",")) 28 | .map(rec => (rec(13), rec(0).toInt)) 29 | .reduceByKey(_ + _) 30 | .map(pair => (pair._2, normalizeYear(pair._1))) 31 | .transform(rec => rec.sortByKey(ascending = false)) 32 | .saveAsTextFiles("TripByYear") 
33 | 34 | ssc.start() 35 | ssc.awaitTermination() 36 | } 37 | 38 | def normalizeYear(s: String): String = { 39 | try { 40 | (Calendar.getInstance().get(Calendar.YEAR) - s.toInt).toString 41 | } catch { 42 | case e: Exception => s 43 | } 44 | } 45 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap9/src/main/scala/org/apress/prospark/L9-6Preprocessing.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.feature.StandardScaler 6 | import org.apache.spark.mllib.linalg.Vectors 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | 10 | object PreprocessingApp { 11 | 12 | def main(args: Array[String]) { 13 | if (args.length != 4) { 14 | System.err.println( 15 | "Usage: PreprocessingAppApp ") 16 | System.exit(1) 17 | } 18 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 19 | 20 | val conf = new SparkConf() 21 | .setAppName(appName) 22 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 23 | 24 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 25 | 26 | val substream = ssc.socketTextStream(hostname, port.toInt) 27 | .filter(!_.contains("NaN")) 28 | .map(_.split(" ")) 29 | .filter(f => f(1) != "0") 30 | 31 | substream.map(f => Array(f(2), f(4), f(5), f(6))) 32 | .map(f => f.map(v => v.toDouble)) 33 | .map(f => Vectors.dense(f)) 34 | .foreachRDD(rdd => { 35 | val scalerModel = new StandardScaler().fit(rdd) 36 | val scaledRDD = scalerModel.transform(rdd) 37 | }) 38 | 39 | ssc.start() 40 | ssc.awaitTermination() 41 | } 42 | 43 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap2/src/main/scala/org/apress/prospark/L2-1FirstApp.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.io.Source 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | 8 | object TranslateApp { 9 | def main(args: Array[String]) { 10 | if (args.length != 4) { 11 | System.err.println( 12 | "Usage: TranslateApp ") 13 | System.exit(1) 14 | } 15 | val Seq(appName, bookPath, outputPath, lang) = args.toSeq 16 | 17 | val dict = getDictionary(lang) 18 | 19 | val conf = new SparkConf() 20 | .setAppName(appName) 21 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 22 | val sc = new SparkContext(conf) 23 | val book = sc.textFile(bookPath) 24 | val translated = book.map(line => line.split("\\s+").map(word => dict.getOrElse(word, word)).mkString(" ")) 25 | translated.saveAsTextFile(outputPath) 26 | } 27 | 28 | def getDictionary(lang: String): Map[String, String] = { 29 | if (!Set("German", "French", "Italian", "Spanish").contains(lang)) { 30 | System.err.println( 31 | "Unsupported language: %s".format(lang)) 32 | System.exit(1) 33 | } 34 | val url = "http://www.june29.com/IDP/files/%s.txt".format(lang) 35 | println("Grabbing dictionary from: %s".format(url)) 36 | Source.fromURL(url, "ISO-8859-1").mkString 37 | .split("\\r?\\n") 38 | .filter(line => !line.startsWith("#")) 39 | .map(line => line.split("\\t")) 40 | .map(tkns => (tkns(0).trim, tkns(1).trim)).toMap 41 | } 42 | 43 | } -------------------------------------------------------------------------------- 
/prosparkstreaming-master/Chap5/src/main/scala/org/apress/prospark/L5-7MultipleSocketStreams.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | 6 | import org.apache.spark.streaming.{ Seconds, StreamingContext } 7 | import org.apache.spark.streaming.dstream.PairDStreamFunctions 8 | 9 | import java.util.Calendar 10 | 11 | object TripByYearMultiApp { 12 | def main(args: Array[String]) { 13 | if (args.length != 4) { 14 | System.err.println( 15 | "Usage: TripByYearMultiApp ") 16 | System.exit(1) 17 | } 18 | val Seq(appName, hostname, basePort, nSockets) = args.toSeq 19 | 20 | val conf = new SparkConf() 21 | .setAppName(appName) 22 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 23 | 24 | val ssc = new StreamingContext(conf, Seconds(10)) 25 | 26 | val streams = (0 to nSockets.toInt - 1).map(i => ssc.socketTextStream(hostname, basePort.toInt + i)) 27 | val uniStream = ssc.union(streams) 28 | 29 | uniStream 30 | .map(rec => rec.split(",")) 31 | .map(rec => (rec(13), rec(0).toInt)) 32 | .reduceByKey(_ + _) 33 | .map(pair => (pair._2, normalizeYear(pair._1))) 34 | .transform(rec => rec.sortByKey(ascending = false)) 35 | .saveAsTextFiles("TripByYear") 36 | 37 | ssc.start() 38 | ssc.awaitTermination() 39 | } 40 | 41 | def normalizeYear(s: String): String = { 42 | try { 43 | (Calendar.getInstance().get(Calendar.YEAR) - s.toInt).toString 44 | } catch { 45 | case e: Exception => s 46 | } 47 | } 48 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap5/src/main/scala/org/apress/prospark/L5-9Mqtt.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions 6 | import org.apache.spark.storage.StorageLevel 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 10 | import org.apache.spark.streaming.mqtt.MQTTUtils 11 | 12 | object YearlyDistributionApp { 13 | def main(args: Array[String]) { 14 | if (args.length != 4) { 15 | System.err.println( 16 | "Usage: YearlyDistributionApp ") 17 | System.exit(1) 18 | } 19 | val Seq(appName, brokerUrl, topic, checkpointDir) = args.toSeq 20 | 21 | val conf = new SparkConf() 22 | .setAppName(appName) 23 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 24 | 25 | val ssc = new StreamingContext(conf, Seconds(10)) 26 | ssc.checkpoint(checkpointDir) 27 | 28 | MQTTUtils.createStream(ssc, brokerUrl, topic, StorageLevel.MEMORY_ONLY_SER_2) 29 | .map(rec => rec.split(",")) 30 | .map(rec => (rec(1).split(" ")(0), 1)) 31 | .updateStateByKey(statefulCount) 32 | .map(pair => (pair._2, pair._1)) 33 | .transform(rec => rec.sortByKey(ascending = false)) 34 | .saveAsTextFiles("YearlyDistribution") 35 | 36 | ssc.start() 37 | ssc.awaitTermination() 38 | } 39 | 40 | val statefulCount = (values: Seq[Int], state: Option[Int]) => Some(values.sum + state.getOrElse(0)) 41 | 42 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap9/src/main/scala/org/apress/prospark/L9-5ChiSq.scala: -------------------------------------------------------------------------------- 1 | package 
org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.linalg.Vectors 6 | import org.apache.spark.mllib.regression.LabeledPoint 7 | import org.apache.spark.mllib.stat.Statistics 8 | import org.apache.spark.streaming.Seconds 9 | import org.apache.spark.streaming.StreamingContext 10 | 11 | object ChiSqApp { 12 | 13 | def main(args: Array[String]) { 14 | if (args.length != 4) { 15 | System.err.println( 16 | "Usage: ChiSqApp ") 17 | System.exit(1) 18 | } 19 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 20 | 21 | val conf = new SparkConf() 22 | .setAppName(appName) 23 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 24 | 25 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 26 | 27 | val substream = ssc.socketTextStream(hostname, port.toInt) 28 | .filter(!_.contains("NaN")) 29 | .map(_.split(" ")) 30 | .filter(f => f(1) != "0") 31 | .map(f => f.map(f => f.toDouble)) 32 | 33 | substream.map(f => Array(f(1).toDouble, f(2).toDouble, f(4).toDouble, f(5).toDouble, f(6).toDouble)) 34 | .filter(f => f(0) == 4.0 || f(0) == 5.0) 35 | .map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5)))) 36 | .foreachRDD(rdd => { 37 | Statistics.chiSqTest(rdd).zipWithIndex.foreach(v => println("%s, column no. %d".format(v._1, v._2))) 38 | }) 39 | 40 | ssc.start() 41 | ssc.awaitTermination() 42 | } 43 | 44 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap5/src/main/scala/org/apress/prospark/L5-16Twitter.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions 6 | import org.apache.spark.streaming.Seconds 7 | import org.apache.spark.streaming.StreamingContext 8 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 9 | import org.apache.spark.streaming.twitter.TwitterUtils 10 | import org.apache.spark.storage.StorageLevel 11 | import twitter4j.conf.ConfigurationBuilder 12 | import twitter4j.TwitterFactory 13 | 14 | object TwitterApp { 15 | 16 | def main(args: Array[String]) { 17 | if (args.length != 2) { 18 | System.err.println( 19 | "Usage: TwitterApp ") 20 | System.exit(1) 21 | } 22 | 23 | val Seq(appName, outputPath) = args.toSeq 24 | 25 | val conf = new SparkConf() 26 | .setAppName(appName) 27 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 28 | 29 | val ssc = new StreamingContext(conf, Seconds(10)) 30 | 31 | val cb = new ConfigurationBuilder() 32 | cb.setOAuthConsumerKey("") 33 | cb.setOAuthConsumerSecret("") 34 | cb.setOAuthAccessToken("") 35 | cb.setOAuthAccessTokenSecret("") 36 | 37 | val twitterAuth = new TwitterFactory(cb.build()).getInstance().getAuthorization() 38 | 39 | val tweetStream = TwitterUtils.createStream(ssc, Some(twitterAuth), Array("nyc citi bike", "nyc bike share")) 40 | tweetStream.count().print() 41 | tweetStream.saveAsTextFiles(outputPath) 42 | 43 | ssc.start() 44 | ssc.awaitTermination() 45 | } 46 | 47 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap9/src/main/scala/org/apress/prospark/L9-13FPMiningPreprocessing.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.hadoop.io.LongWritable 4 | import 
org.apache.hadoop.io.Text 5 | import org.apache.hadoop.mapred.FileSplit 6 | import org.apache.hadoop.mapred.TextInputFormat 7 | import org.apache.spark.SparkConf 8 | import org.apache.spark.SparkContext 9 | import org.apache.spark.rdd.HadoopRDD 10 | import org.apache.spark.rdd.RDD.rddToPairRDDFunctions 11 | 12 | import com.google.common.io.Files 13 | 14 | object FPMiningPreprocessingApp { 15 | 16 | def main(args: Array[String]) { 17 | if (args.length != 3) { 18 | System.err.println( 19 | "Usage: FPMiningPreprocessingApp ") 20 | System.exit(1) 21 | } 22 | val Seq(appName, iPath, oPath) = args.toSeq 23 | 24 | val conf = new SparkConf() 25 | .setAppName(appName) 26 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 27 | 28 | val delim = " " 29 | 30 | val sc = new SparkContext(conf) 31 | sc.hadoopFile(iPath, classOf[TextInputFormat], classOf[LongWritable], classOf[Text], sc.defaultMinPartitions) 32 | .asInstanceOf[HadoopRDD[LongWritable, Text]] 33 | .mapPartitionsWithInputSplit((iSplit, iter) => 34 | iter.map(splitAndLine => (Files.getNameWithoutExtension(iSplit.asInstanceOf[FileSplit].getPath.toString), splitAndLine._2.toString.split(" ")(1)))) 35 | .filter(r => r._2 != "0") 36 | .map(r => (r._1, r._2)) 37 | .distinct() 38 | .groupByKey() 39 | .map(r => r._2.mkString(" ")) 40 | .sample(false, 0.7) 41 | .coalesce(1) 42 | .saveAsTextFile(oPath) 43 | } 44 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap8/src/main/scala/org/apress/prospark/L8-4DataFrameCreationSchema.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.sql.Row 6 | import org.apache.spark.sql.SQLContext 7 | import org.apache.spark.sql.functions.desc 8 | import org.apache.spark.sql.types.DataType 9 | import org.apache.spark.sql.types.StructType 10 | import org.apache.spark.streaming.Seconds 11 | import org.apache.spark.streaming.StreamingContext 12 | 13 | object DataframeCreationApp2 { 14 | 15 | def main(args: Array[String]) { 16 | if (args.length != 5) { 17 | System.err.println( 18 | "Usage: CdrDataframeApp2 ") 19 | System.exit(1) 20 | } 21 | val Seq(appName, batchInterval, hostname, port, schemaFile) = args.toSeq 22 | 23 | val conf = new SparkConf() 24 | .setAppName(appName) 25 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 26 | 27 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 28 | 29 | val sqlC = new SQLContext(ssc.sparkContext) 30 | 31 | val schemaJson = scala.io.Source.fromFile(schemaFile).mkString 32 | val schema = DataType.fromJson(schemaJson).asInstanceOf[StructType] 33 | 34 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 35 | .map(_.split("\\t", -1)) 36 | .foreachRDD(rdd => { 37 | val cdrs = sqlC.createDataFrame(rdd.map(c => Row(c: _*)), schema) 38 | 39 | cdrs.groupBy("countryCode").count().orderBy(desc("count")).show(5) 40 | }) 41 | 42 | ssc.start() 43 | ssc.awaitTermination() 44 | 45 | } 46 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap5/src/main/java/org/apress/prospark/KafkaDriver.java: -------------------------------------------------------------------------------- 1 | package org.apress.prospark; 2 | 3 | import java.util.Properties; 4 | 5 | import kafka.javaapi.producer.Producer; 6 | import kafka.producer.KeyedMessage; 7 | import 
kafka.producer.ProducerConfig; 8 | 9 | public class KafkaDriver extends AbstractDriver { 10 | 11 | private final String topic; 12 | private Producer producer; 13 | 14 | public KafkaDriver(String path, String topic, Properties props) { 15 | super(path); 16 | this.topic = topic; 17 | ProducerConfig config = new ProducerConfig(props); 18 | producer = new Producer(config); 19 | } 20 | 21 | @Override 22 | public void init() throws Exception { 23 | } 24 | 25 | @Override 26 | public void close() throws Exception { 27 | producer.close(); 28 | } 29 | 30 | @Override 31 | public void sendRecord(String record) throws Exception { 32 | producer.send(new KeyedMessage(topic, record)); 33 | } 34 | 35 | public static void main(String[] args) throws Exception { 36 | 37 | if (args.length != 3) { 38 | System.err.println("Usage: KafkaDriver "); 39 | System.exit(-1); 40 | } 41 | 42 | String path = args[0]; 43 | String brokerUrl = args[1]; 44 | String topic = args[2]; 45 | 46 | Properties props = new Properties(); 47 | props.put("metadata.broker.list", brokerUrl); 48 | props.put("serializer.class", "kafka.serializer.StringEncoder"); 49 | // props.put("request.required.acks", "1"); 50 | 51 | KafkaDriver driver = new KafkaDriver(path, topic, props); 52 | try { 53 | driver.execute(); 54 | } finally { 55 | driver.close(); 56 | } 57 | } 58 | 59 | } 60 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap5/src/main/scala/org/apress/prospark/L5-11FlumePush.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions 6 | import org.apache.spark.storage.StorageLevel 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 10 | import org.apache.spark.streaming.flume.FlumeUtils 11 | 12 | object DailyUserTypeDistributionApp { 13 | def main(args: Array[String]) { 14 | if (args.length != 5) { 15 | System.err.println( 16 | "Usage: DailyUserTypeDistributionApp ") 17 | System.exit(1) 18 | } 19 | val Seq(appName, hostname, port, checkpointDir, outputPath) = args.toSeq 20 | 21 | val conf = new SparkConf() 22 | .setAppName(appName) 23 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 24 | 25 | val ssc = new StreamingContext(conf, Seconds(10)) 26 | ssc.checkpoint(checkpointDir) 27 | 28 | FlumeUtils.createStream(ssc, hostname, port.toInt, StorageLevel.MEMORY_ONLY_SER_2) 29 | .map(rec => new String(rec.event.getBody().array()).split(",")) 30 | .map(rec => ((rec(1).split(" ")(0), rec(12)), 1)) 31 | .updateStateByKey(statefulCount) 32 | .repartition(1) 33 | .transform(rdd => rdd.sortByKey(ascending = false)) 34 | .saveAsTextFiles(outputPath) 35 | 36 | ssc.start() 37 | ssc.awaitTermination() 38 | } 39 | 40 | val statefulCount = (values: Seq[Int], state: Option[Int]) => Some(values.sum + state.getOrElse(0)) 41 | 42 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap5/src/main/scala/org/apress/prospark/L5-11FlumePull.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions 6 | import 
org.apache.spark.storage.StorageLevel 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 10 | import org.apache.spark.streaming.flume.FlumeUtils 11 | 12 | object DailyUserTypeDistributionApp2 { 13 | def main(args: Array[String]) { 14 | if (args.length != 5) { 15 | System.err.println( 16 | "Usage: DailyUserTypeDistributionApp ") 17 | System.exit(1) 18 | } 19 | val Seq(appName, hostname, port, checkpointDir, outputPath) = args.toSeq 20 | 21 | val conf = new SparkConf() 22 | .setAppName(appName) 23 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 24 | 25 | val ssc = new StreamingContext(conf, Seconds(10)) 26 | ssc.checkpoint(checkpointDir) 27 | 28 | FlumeUtils.createPollingStream(ssc, hostname, port.toInt, StorageLevel.MEMORY_ONLY_SER_2) 29 | .map(rec => new String(rec.event.getBody().array()).split(",")) 30 | .map(rec => ((rec(1).split(" ")(0), rec(12)), 1)) 31 | .updateStateByKey(statefulCount) 32 | .repartition(1) 33 | .transform(rdd => rdd.sortByKey(ascending = false)) 34 | .saveAsTextFiles(outputPath) 35 | 36 | ssc.start() 37 | ssc.awaitTermination() 38 | } 39 | 40 | val statefulCount = (values: Seq[Int], state: Option[Int]) => Some(values.sum + state.getOrElse(0)) 41 | 42 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap9/src/main/scala/org/apress/prospark/L9-11CollabFilteringPreprocessing.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.hadoop.io.LongWritable 4 | import org.apache.hadoop.io.Text 5 | import org.apache.hadoop.mapred.FileSplit 6 | import org.apache.hadoop.mapred.TextInputFormat 7 | import org.apache.spark.SparkConf 8 | import org.apache.spark.SparkContext 9 | import org.apache.spark.rdd.HadoopRDD 10 | import org.apache.spark.rdd.RDD.rddToPairRDDFunctions 11 | 12 | import com.google.common.io.Files 13 | 14 | object CollabFilteringPreprocessingApp { 15 | 16 | def main(args: Array[String]) { 17 | if (args.length != 3) { 18 | System.err.println( 19 | "Usage: CollabFilteringPreprocessingApp ") 20 | System.exit(1) 21 | } 22 | val Seq(appName, iPath, oPath) = args.toSeq 23 | 24 | val conf = new SparkConf() 25 | .setAppName(appName) 26 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 27 | 28 | val delim = " " 29 | 30 | val sc = new SparkContext(conf) 31 | sc.hadoopFile(iPath, classOf[TextInputFormat], classOf[LongWritable], classOf[Text], sc.defaultMinPartitions) 32 | .asInstanceOf[HadoopRDD[LongWritable, Text]] 33 | .mapPartitionsWithInputSplit((iSplit, iter) => 34 | iter.map(splitAndLine => (Files.getNameWithoutExtension(iSplit.asInstanceOf[FileSplit].getPath.toString), splitAndLine._2.toString.split(" ")(1)))) 35 | .filter(r => r._2 != "0") 36 | .map(r => ((r._1, r._2), 1)) 37 | .reduceByKey(_ + _) 38 | .map(r => r._1._1.replace("subject", "") + delim + r._1._2 + delim + r._2) 39 | .sample(false, 0.7) 40 | .coalesce(1) 41 | .saveAsTextFile(oPath) 42 | } 43 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap4/src/main/scala/org/apress/prospark/L4-1Voyager.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.hadoop.fs.Path 4 | import org.apache.hadoop.io.LongWritable 5 | import org.apache.hadoop.io.Text 6 | import 
org.apache.hadoop.mapreduce.lib.input.TextInputFormat 7 | import org.apache.spark.SparkConf 8 | import org.apache.spark.SparkContext 9 | import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions 10 | import org.apache.spark.streaming.Seconds 11 | import org.apache.spark.streaming.StreamingContext 12 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 13 | 14 | object VoyagerApp { 15 | def main(args: Array[String]) { 16 | if (args.length != 3) { 17 | System.err.println( 18 | "Usage: VoyagerApp ") 19 | System.exit(1) 20 | } 21 | val Seq(appName, inputPath, outputPath) = args.toSeq 22 | 23 | val conf = new SparkConf() 24 | .setAppName(appName) 25 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 26 | .set("spark.executor.extraJavaOptions", "-XX:+UseConcMarkSweepGC") 27 | 28 | val ssc = new StreamingContext(conf, Seconds(10)) 29 | 30 | val voyager1 = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) 31 | voyager1.map(rec => { 32 | val attrs = rec.split("\\s+") 33 | ((attrs(0).toInt), attrs.slice(18, 28).map(_.toDouble)) 34 | }).filter(pflux => pflux._2.exists(_ > 1.0)).map(rec => (rec._1, 1)) 35 | .reduceByKey(_ + _) 36 | .transform(rec => rec.sortByKey(ascending = false, numPartitions = 1)).saveAsTextFiles(outputPath) 37 | 38 | ssc.start() 39 | ssc.awaitTermination() 40 | } 41 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap5/src/main/scala/org/apress/prospark/L5-13Kafka.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions 6 | import org.apache.spark.storage.StorageLevel 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 10 | import org.apache.spark.streaming.kafka.KafkaUtils 11 | 12 | object StationJourneyCountApp { 13 | 14 | def main(args: Array[String]) { 15 | if (args.length != 7) { 16 | System.err.println( 17 | "Usage: StationJourneyCountApp ") 18 | System.exit(1) 19 | } 20 | 21 | val Seq(appName, brokerUrl, topic, consumerGroupId, zkQuorum, checkpointDir, outputPath) = args.toSeq 22 | 23 | val conf = new SparkConf() 24 | .setAppName(appName) 25 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 26 | //.set("spark.streaming.receiver.writeAheadLog.enable", "true") 27 | 28 | val ssc = new StreamingContext(conf, Seconds(10)) 29 | ssc.checkpoint(checkpointDir) 30 | 31 | val topics = Map[String, Int]( 32 | topic -> 1) 33 | KafkaUtils.createStream(ssc, zkQuorum, consumerGroupId, topics, StorageLevel.MEMORY_ONLY_SER).map(_._2) 34 | .map(rec => rec.split(",")) 35 | .map(rec => ((rec(3), rec(7)), 1)) 36 | .reduceByKey(_ + _) 37 | .repartition(1) 38 | .map(rec => (rec._2, rec._1)) 39 | .transform(rdd => rdd.sortByKey(ascending = false)) 40 | .saveAsTextFiles(outputPath) 41 | 42 | ssc.start() 43 | ssc.awaitTermination() 44 | } 45 | 46 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap9/src/main/scala/org/apress/prospark/L9-7FeatureExtraction.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import 
org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.feature.ChiSqSelector 6 | import org.apache.spark.mllib.linalg.Vectors 7 | import org.apache.spark.mllib.regression.LabeledPoint 8 | import org.apache.spark.streaming.Seconds 9 | import org.apache.spark.streaming.StreamingContext 10 | 11 | object FeatureExtractionApp { 12 | 13 | def main(args: Array[String]) { 14 | if (args.length != 4) { 15 | System.err.println( 16 | "Usage: FeatureExtractionApp ") 17 | System.exit(1) 18 | } 19 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 20 | 21 | val conf = new SparkConf() 22 | .setAppName(appName) 23 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 24 | 25 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 26 | 27 | val substream = ssc.socketTextStream(hostname, port.toInt) 28 | .filter(!_.contains("NaN")) 29 | .map(_.split(" ")) 30 | .filter(f => f(1) != "0") 31 | 32 | val datastream = substream.map(f => Array(f(1), f(4), f(5), f(6), f(20), f(21), f(22), f(36), f(37), f(38))) 33 | .map(f => f.map(v => v.toDouble)) 34 | .map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, f.length).map(f => f / 2048)))) 35 | 36 | datastream.foreachRDD(rdd => { 37 | val selector = new ChiSqSelector(5) 38 | val model = selector.fit(rdd) 39 | val filtered = rdd.map(p => LabeledPoint(p.label, model.transform(p.features))) 40 | filtered.take(20).foreach(println) 41 | }) 42 | 43 | ssc.start() 44 | ssc.awaitTermination() 45 | } 46 | 47 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap9/src/main/scala/org/apress/prospark/L9-12CollabFiltering.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.recommendation.ALS 6 | import org.apache.spark.mllib.recommendation.Rating 7 | import org.apache.spark.rdd.RDD.doubleRDDToDoubleRDDFunctions 8 | import org.apache.spark.rdd.RDD.rddToPairRDDFunctions 9 | import org.apache.spark.streaming.Seconds 10 | import org.apache.spark.streaming.StreamingContext 11 | 12 | object CollabFilteringApp { 13 | 14 | def main(args: Array[String]) { 15 | if (args.length != 3) { 16 | System.err.println( 17 | "Usage: CollabFilteringApp ") 18 | System.exit(1) 19 | } 20 | val Seq(appName, batchInterval, iPath) = args.toSeq 21 | 22 | val conf = new SparkConf() 23 | .setAppName(appName) 24 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 25 | 26 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 27 | 28 | val ratingStream = ssc.textFileStream(iPath).map(_.split(" ") match { 29 | case Array(subject, activity, freq) => 30 | Rating(subject.toInt, activity.toInt, freq.toDouble) 31 | }) 32 | 33 | val rank = 10 34 | val numIterations = 10 35 | val lambda = 0.01 36 | ratingStream.foreachRDD(ratingRDD => { 37 | val testTrain = ratingRDD.randomSplit(Array(0.3, 0.7)) 38 | val model = ALS.train(testTrain(1), rank, numIterations, lambda) 39 | val test = testTrain(0).map { 40 | case Rating(subject, activity, freq) => 41 | (subject, activity) 42 | } 43 | val prediction = model.predict(test) 44 | prediction.take(5).map(println) 45 | }) 46 | 47 | ssc.start() 48 | ssc.awaitTermination() 49 | } 50 | 51 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap5/src/main/scala/org/apress/prospark/L5-15KafkaDirect.scala: 
-------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions 6 | import org.apache.spark.streaming.Seconds 7 | import org.apache.spark.streaming.StreamingContext 8 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 9 | import kafka.serializer.StringDecoder 10 | import org.apache.spark.streaming.kafka.KafkaUtils 11 | 12 | object StationJourneyCountDirectApp { 13 | 14 | def main(args: Array[String]) { 15 | if (args.length != 7) { 16 | System.err.println( 17 | "Usage: StationJourneyCountApp ") 18 | System.exit(1) 19 | } 20 | 21 | val Seq(appName, brokerUrl, topic, consumerGroupId, zkQuorum, checkpointDir, outputPath) = args.toSeq 22 | 23 | val conf = new SparkConf() 24 | .setAppName(appName) 25 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 26 | 27 | val ssc = new StreamingContext(conf, Seconds(10)) 28 | ssc.checkpoint(checkpointDir) 29 | 30 | val topics = Set(topic) 31 | val params = Map[String, String]( 32 | "zookeeper.connect" -> zkQuorum, 33 | "group.id" -> consumerGroupId, 34 | "bootstrap.servers" -> brokerUrl) 35 | KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, params, topics).map(_._2) 36 | .map(rec => rec.split(",")) 37 | .map(rec => ((rec(3), rec(7)), 1)) 38 | .reduceByKey(_ + _) 39 | .repartition(1) 40 | .map(rec => (rec._2, rec._1)) 41 | .transform(rdd => rdd.sortByKey(ascending = false)) 42 | .saveAsTextFiles(outputPath) 43 | 44 | ssc.start() 45 | ssc.awaitTermination() 46 | } 47 | 48 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap9/src/main/scala/org/apress/prospark/L9-4Correlation.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.linalg.Vectors 6 | import org.apache.spark.mllib.regression.LabeledPoint 7 | import org.apache.spark.mllib.stat.Statistics 8 | import org.apache.spark.streaming.Seconds 9 | import org.apache.spark.streaming.StreamingContext 10 | 11 | object CorrelationApp { 12 | 13 | def main(args: Array[String]) { 14 | if (args.length != 4) { 15 | System.err.println( 16 | "Usage: CorrelationApp ") 17 | System.exit(1) 18 | } 19 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 20 | 21 | val conf = new SparkConf() 22 | .setAppName(appName) 23 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 24 | 25 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 26 | 27 | val substream = ssc.socketTextStream(hostname, port.toInt) 28 | .filter(!_.contains("NaN")) 29 | .map(_.split(" ")) 30 | .filter(f => f(1) != "0") 31 | .map(f => f.map(f => f.toDouble)) 32 | 33 | val datastream = substream.map(f => Array(f(1).toDouble, f(2).toDouble, f(4).toDouble, f(5).toDouble, f(6).toDouble)) 34 | 35 | val walkingOrRunning = datastream.filter(f => f(0) == 4.0 || f(0) == 5.0).map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5)))) 36 | walkingOrRunning.map(f => f.features).foreachRDD(rdd => { 37 | val corrSpearman = Statistics.corr(rdd, "spearman") 38 | val corrPearson = Statistics.corr(rdd, "pearson") 39 | println("Correlation Spearman: \n" + corrSpearman) 40 | println("Correlation Pearson: \n" + corrPearson) 41 | }) 42 | 43 | ssc.start() 44 | 
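// the call below blocks the driver until the streaming context is stopped, so the Spearman and
// Pearson correlation matrices computed in foreachRDD keep printing once per batch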
ssc.awaitTermination() 45 | } 46 | 47 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap9/src/main/scala/org/apress/prospark/L9-3Statistics.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.linalg.Vectors 6 | import org.apache.spark.mllib.stat.Statistics 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | 10 | object StatisticsApp { 11 | 12 | def main(args: Array[String]) { 13 | if (args.length != 4) { 14 | System.err.println( 15 | "Usage: StatisticsApp ") 16 | System.exit(1) 17 | } 18 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 19 | 20 | val conf = new SparkConf() 21 | .setAppName(appName) 22 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 23 | 24 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 25 | 26 | val substream = ssc.socketTextStream(hostname, port.toInt) 27 | .filter(!_.contains("NaN")) 28 | .map(_.split(" ")) 29 | .filter(f => f(1) != "0") 30 | .map(f => f.map(f => f.toDouble)) 31 | 32 | substream.map(f => Vectors.dense(f.slice(1, 5))).foreachRDD(rdd => { 33 | val stats = Statistics.colStats(rdd) 34 | println("Count: " + stats.count) 35 | println("Max: " + stats.max.toArray.mkString(" ")) 36 | println("Min: " + stats.min.toArray.mkString(" ")) 37 | println("Mean: " + stats.mean.toArray.mkString(" ")) 38 | println("L1-Norm: " + stats.normL1.toArray.mkString(" ")) 39 | println("L2-Norm: " + stats.normL2.toArray.mkString(" ")) 40 | println("Number of non-zeros: " + stats.numNonzeros.toArray.mkString(" ")) 41 | println("Varience: " + stats.variance.toArray.mkString(" ")) 42 | }) 43 | 44 | ssc.start() 45 | ssc.awaitTermination() 46 | } 47 | 48 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap5/src/main/scala/org/apress/prospark/L5-18Http.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.streaming.Seconds 6 | import org.apache.spark.streaming.StreamingContext 7 | import org.json4s.DefaultFormats 8 | import org.json4s.JField 9 | import org.json4s.jvalue2extractable 10 | import org.json4s.jvalue2monadic 11 | import org.json4s.native.JsonMethods.parse 12 | import org.json4s.string2JsonInput 13 | 14 | object HttpApp { 15 | 16 | def main(args: Array[String]) { 17 | if (args.length != 2) { 18 | System.err.println( 19 | "Usage: HttpApp ") 20 | System.exit(1) 21 | } 22 | 23 | val Seq(appName, outputPath) = args.toSeq 24 | 25 | val conf = new SparkConf() 26 | .setAppName(appName) 27 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 28 | 29 | val batchInterval = 10 30 | 31 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 32 | 33 | HttpUtils.createStream(ssc, url = "https://www.citibikenyc.com/stations/json", interval = batchInterval) 34 | .flatMap(rec => (parse(rec) \ "stationBeanList").children) 35 | .filter(rec => { 36 | implicit val formats = DefaultFormats 37 | (rec \ "statusKey").extract[Integer] != 1 38 | }) 39 | .map(rec => rec.filterField { 40 | case JField("id", _) => true 41 | case JField("stationName", _) => true 42 | case JField("statusValue", _) => true 43 | case _ => 
false 44 | }) 45 | .map(rec => { 46 | implicit val formats = DefaultFormats 47 | (rec(0)._2.extract[Integer], rec(1)._2.extract[String], rec(2)._2.extract[String]) 48 | }) 49 | .saveAsTextFiles(outputPath) 50 | 51 | ssc.start() 52 | ssc.awaitTermination() 53 | } 54 | 55 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap9/src/main/scala/org/apress/prospark/L9-8PCA.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.feature.PCA 6 | import org.apache.spark.mllib.linalg.Vectors 7 | import org.apache.spark.mllib.regression.LabeledPoint 8 | import org.apache.spark.streaming.Seconds 9 | import org.apache.spark.streaming.StreamingContext 10 | 11 | object PCAApp { 12 | 13 | def main(args: Array[String]) { 14 | if (args.length != 4) { 15 | System.err.println( 16 | "Usage: PCAApp ") 17 | System.exit(1) 18 | } 19 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 20 | 21 | val conf = new SparkConf() 22 | .setAppName(appName) 23 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 24 | 25 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 26 | 27 | val substream = ssc.socketTextStream(hostname, port.toInt) 28 | .filter(!_.contains("NaN")) 29 | .map(_.split(" ")) 30 | .filter(f => f(1) != "0") 31 | 32 | val datastream = substream.map(f => Array(f(1), f(4), f(5), f(6), f(20), f(21), f(22), f(36), f(37), f(38))) 33 | .map(f => f.map(v => v.toDouble)) 34 | .map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, f.length)))) 35 | 36 | datastream.foreachRDD(rdd => { 37 | val pca = new PCA(rdd.first().features.size / 2) 38 | .fit(rdd.map(_.features)) 39 | val testTrain = rdd.randomSplit(Array(0.3, 0.7)) 40 | val test = testTrain(0).map(lp => lp.copy(features = pca.transform(lp.features))) 41 | val train = testTrain(1).map(lp => lp.copy(features = pca.transform(lp.features))) 42 | train.take(20).foreach(println) 43 | }) 44 | 45 | ssc.start() 46 | ssc.awaitTermination() 47 | } 48 | 49 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap3/src/main/scala/org/apress/prospark/L3-DStreamVariation.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext } 6 | import org.apache.hadoop.io.{ Text, LongWritable, IntWritable } 7 | import org.apache.hadoop.fs.Path 8 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat 9 | import org.apache.spark.streaming.dstream.DStream 10 | import org.apache.hadoop.mapred.TextOutputFormat 11 | import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat } 12 | import org.apache.spark.streaming.dstream.PairDStreamFunctions 13 | import org.apache.log4j.LogManager 14 | import org.json4s._ 15 | import org.json4s.native.JsonMethods._ 16 | import java.text.SimpleDateFormat 17 | import java.util.Date 18 | 19 | object RedditVariationApp { 20 | def main(args: Array[String]) { 21 | if (args.length != 2) { 22 | System.err.println( 23 | "Usage: RedditVariationApp ") 24 | System.exit(1) 25 | } 26 | val Seq(appName, inputPath) = args.toSeq 27 | val LOG = LogManager.getLogger(this.getClass) 28 | 29 | val 
conf = new SparkConf() 30 | .setAppName(appName) 31 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 32 | 33 | val ssc = new StreamingContext(conf, Seconds(1)) 34 | LOG.info("Started at %d".format(ssc.sparkContext.startTime)) 35 | 36 | val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) 37 | 38 | val merged = comments.union(comments) 39 | 40 | val repartitionedComments = comments.repartition(4) 41 | 42 | val rddMin = comments.glom().map(arr => 43 | arr.minBy(rec => ((parse(rec) \ "created_utc").values.toString.toInt))) 44 | 45 | ssc.start() 46 | ssc.awaitTermination() 47 | 48 | } 49 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap8/src/main/scala/org/apress/prospark/L8-1DataFrameAPI.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.sql.SQLContext 9 | import org.apache.spark.sql.functions.desc 10 | import org.apache.spark.streaming.Seconds 11 | import org.apache.spark.streaming.StreamingContext 12 | 13 | object CdrDataframeApp { 14 | 15 | case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, 16 | smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, 17 | callOutActivity: Float, internetTrafficActivity: Float) 18 | 19 | def main(args: Array[String]) { 20 | if (args.length != 4) { 21 | System.err.println( 22 | "Usage: CdrDataframeApp ") 23 | System.exit(1) 24 | } 25 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 26 | 27 | val conf = new SparkConf() 28 | .setAppName(appName) 29 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 30 | 31 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 32 | 33 | val sqlC = new SQLContext(ssc.sparkContext) 34 | import sqlC.implicits._ 35 | 36 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 37 | .map(_.split("\\t", -1)) 38 | .foreachRDD(rdd => { 39 | val cdrs = seqToCdr(rdd).toDF() 40 | 41 | cdrs.groupBy("countryCode").count().orderBy(desc("count")).show(5) 42 | }) 43 | 44 | ssc.start() 45 | ssc.awaitTermination() 46 | } 47 | 48 | def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { 49 | rdd.map(c => c.map(f => f match { 50 | case x if x.isEmpty() => "0" 51 | case x => x 52 | })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, 53 | c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) 54 | } 55 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap3/src/main/scala/org/apress/prospark/L3-1DStreams.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.io.Source 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.streaming.Seconds 7 | import org.apache.spark.streaming.StreamingContext 8 | import org.apache.hadoop.io.LongWritable 9 | import org.apache.hadoop.fs.Path 10 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat 11 | import org.apache.hadoop.io.Text 12 | 13 | object StreamingTranslateApp { 14 | def main(args: Array[String]) { 15 | if (args.length != 4) { 16 | System.err.println( 17 | "Usage: StreamingTranslateApp ") 18 | 
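// four arguments are required here; they are destructured a few lines below as
// appName, bookPath, outputPath, and lang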
System.exit(1) 19 | } 20 | val Seq(appName, bookPath, outputPath, lang) = args.toSeq 21 | 22 | val dict = getDictionary(lang) 23 | 24 | val conf = new SparkConf() 25 | .setAppName(appName) 26 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 27 | val ssc = new StreamingContext(conf, Seconds(1)) 28 | 29 | val book = ssc.textFileStream(bookPath) 30 | val translated = book.map(line => line.split("\\s+").map(word => dict.getOrElse(word, word)).mkString(" ")) 31 | translated.saveAsTextFiles(outputPath) 32 | 33 | ssc.start() 34 | ssc.awaitTermination() 35 | } 36 | 37 | def getDictionary(lang: String): Map[String, String] = { 38 | if (!Set("German", "French", "Italian", "Spanish").contains(lang)) { 39 | System.err.println( 40 | "Unsupported language: %s".format(lang)) 41 | System.exit(1) 42 | } 43 | val url = "http://www.june29.com/IDP/files/%s.txt".format(lang) 44 | println("Grabbing dictionary from: %s".format(url)) 45 | Source.fromURL(url, "ISO-8859-1").mkString 46 | .split("\\r?\\n") 47 | .filter(line => !line.startsWith("#")) 48 | .map(line => line.split("\\t")) 49 | .map(tkns => (tkns(0).trim, tkns(1).trim)).toMap 50 | } 51 | 52 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap3/src/main/scala/org/apress/prospark/L3-DStreamAggregation.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext } 6 | import org.apache.hadoop.io.{ Text, LongWritable, IntWritable } 7 | import org.apache.hadoop.fs.Path 8 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat 9 | import org.apache.spark.streaming.dstream.DStream 10 | import org.apache.hadoop.mapred.TextOutputFormat 11 | import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat } 12 | import org.apache.spark.streaming.dstream.PairDStreamFunctions 13 | import org.apache.log4j.LogManager 14 | import org.json4s._ 15 | import org.json4s.native.JsonMethods._ 16 | import java.text.SimpleDateFormat 17 | import java.util.Date 18 | 19 | object RedditAggregationApp { 20 | def main(args: Array[String]) { 21 | if (args.length != 2) { 22 | System.err.println( 23 | "Usage: RedditAggregationApp ") 24 | System.exit(1) 25 | } 26 | val Seq(appName, inputPath) = args.toSeq 27 | val LOG = LogManager.getLogger(this.getClass) 28 | 29 | val conf = new SparkConf() 30 | .setAppName(appName) 31 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 32 | 33 | val ssc = new StreamingContext(conf, Seconds(1)) 34 | LOG.info("Started at %d".format(ssc.sparkContext.startTime)) 35 | 36 | val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) 37 | 38 | val recCount = comments.count() 39 | 40 | val recCountValue = comments.countByValue() 41 | 42 | val totalWords = comments.map(rec => ((parse(rec) \ "body").values.toString)) 43 | .flatMap(body => body.split(" ")) 44 | .map(word => 1) 45 | .reduce(_ + _) 46 | 47 | ssc.start() 48 | ssc.awaitTermination() 49 | 50 | } 51 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap4/src/main/scala/org/apress/prospark/L4-4Kryo.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import 
org.apache.hadoop.fs.Path 4 | import org.apache.hadoop.io.LongWritable 5 | import org.apache.hadoop.io.Text 6 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat 7 | import org.apache.spark.SparkConf 8 | import org.apache.spark.SparkContext 9 | import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions 10 | import org.apache.spark.streaming.Seconds 11 | import org.apache.spark.streaming.StreamingContext 12 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 13 | 14 | object VoyagerAppKryo { 15 | def main(args: Array[String]) { 16 | if (args.length != 3) { 17 | System.err.println( 18 | "Usage: VoyagerAppKryo ") 19 | System.exit(1) 20 | } 21 | val Seq(appName, inputPath, outputPath) = args.toSeq 22 | 23 | val conf = new SparkConf() 24 | .setAppName(appName) 25 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 26 | .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") 27 | .registerKryoClasses(Array(classOf[ProtonFlux])) 28 | 29 | val ssc = new StreamingContext(conf, Seconds(10)) 30 | 31 | val voyager1 = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) 32 | val projected = voyager1.map(rec => { 33 | val attrs = rec.split("\\s+") 34 | new ProtonFlux(attrs(0), attrs(18), attrs(19), attrs(20), attrs(21), 35 | attrs(22), attrs(23), attrs(24), attrs(25), attrs(26), attrs(27), 36 | attrs(28)) 37 | }) 38 | val filtered = projected.filter(pflux => pflux.isSolarStorm) 39 | val yearlyBreakdown = filtered.map(rec => (rec.year, 1)) 40 | .reduceByKey(_ + _) 41 | .transform(rec => rec.sortByKey(ascending = false)) 42 | yearlyBreakdown.saveAsTextFiles(outputPath) 43 | 44 | ssc.start() 45 | ssc.awaitTermination() 46 | } 47 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap5/src/main/scala/org/apress/prospark/L5-14KafkaCustomConf.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions 6 | import org.apache.spark.streaming.Seconds 7 | import org.apache.spark.streaming.StreamingContext 8 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 9 | import org.apache.spark.streaming.kafka.KafkaUtils 10 | import kafka.serializer.StringDecoder 11 | import org.apache.spark.storage.StorageLevel 12 | 13 | object StationJourneyCountCustomApp { 14 | 15 | def main(args: Array[String]) { 16 | if (args.length != 7) { 17 | System.err.println( 18 | "Usage: StationJourneyCountApp ") 19 | System.exit(1) 20 | } 21 | 22 | val Seq(appName, brokerUrl, topic, consumerGroupId, zkQuorum, checkpointDir, outputPath) = args.toSeq 23 | 24 | val conf = new SparkConf() 25 | .setAppName(appName) 26 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 27 | //.set("spark.streaming.receiver.writeAheadLog.enable", "true") 28 | 29 | val ssc = new StreamingContext(conf, Seconds(10)) 30 | ssc.checkpoint(checkpointDir) 31 | 32 | val topics = Map[String, Int]( 33 | topic -> 1) 34 | val params = Map[String, String]( 35 | "zookeeper.connect" -> zkQuorum, 36 | "group.id" -> consumerGroupId, 37 | "bootstrap.servers" -> brokerUrl) 38 | KafkaUtils.createStream[String, String, StringDecoder, StringDecoder](ssc, params, topics, StorageLevel.MEMORY_ONLY_SER).map(_._2) 39 | .map(rec => rec.split(",")) 40 | .map(rec => ((rec(3), 
rec(7)), 1)) 41 | .reduceByKey(_ + _) 42 | .repartition(1) 43 | .map(rec => (rec._2, rec._1)) 44 | .transform(rdd => rdd.sortByKey(ascending = false)) 45 | .saveAsTextFiles(outputPath) 46 | 47 | ssc.start() 48 | ssc.awaitTermination() 49 | } 50 | 51 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap8/src/main/scala/org/apress/prospark/L8-8Sql.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.sql.SQLContext 9 | import org.apache.spark.streaming.Seconds 10 | import org.apache.spark.streaming.StreamingContext 11 | 12 | object CdrSqlApp { 13 | 14 | case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, 15 | smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, 16 | callOutActivity: Float, internetTrafficActivity: Float) 17 | 18 | def main(args: Array[String]) { 19 | if (args.length != 4) { 20 | System.err.println( 21 | "Usage: CdrSqlApp ") 22 | System.exit(1) 23 | } 24 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 25 | 26 | val conf = new SparkConf() 27 | .setAppName(appName) 28 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 29 | 30 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 31 | 32 | val sqlC = new SQLContext(ssc.sparkContext) 33 | import sqlC.implicits._ 34 | 35 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 36 | .map(_.split("\\t", -1)) 37 | .foreachRDD(rdd => { 38 | val cdrs = seqToCdr(rdd).toDF() 39 | cdrs.registerTempTable("cdrs") 40 | 41 | sqlC.sql("SELECT countryCode, COUNT(countryCode) AS cCount FROM cdrs GROUP BY countryCode ORDER BY cCount DESC LIMIT 5").show() 42 | sqlC.dropTempTable("cdrs") 43 | }) 44 | 45 | ssc.start() 46 | ssc.awaitTermination() 47 | } 48 | 49 | def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { 50 | rdd.map(c => c.map(f => f match { 51 | case x if x.isEmpty() => "0" 52 | case x => x 53 | })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, 54 | c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) 55 | } 56 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap10/src/main/scala/org/apress/prospark/L10-2DataProc.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.HashPartitioner 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.streaming.Seconds 7 | import org.apache.spark.streaming.StreamingContext 8 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 9 | import org.json4s.DefaultFormats 10 | import org.json4s.JsonAST.JNothing 11 | import org.json4s.jvalue2extractable 12 | import org.json4s.jvalue2monadic 13 | import org.json4s.native.JsonMethods.parse 14 | import org.json4s.string2JsonInput 15 | 16 | object DataProcApp { 17 | 18 | def main(args: Array[String]) { 19 | if (args.length != 4) { 20 | System.err.println( 21 | "Usage: DataProcApp ") 22 | System.exit(1) 23 | } 24 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 25 | 26 | val conf = new SparkConf() 27 | .setAppName(appName) 28 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 29 | 30 | val ssc = new 
StreamingContext(conf, Seconds(batchInterval.toInt)) 31 | 32 | ssc.socketTextStream(hostname, port.toInt) 33 | .map(r => { 34 | implicit val formats = DefaultFormats 35 | parse(r) 36 | }) 37 | .filter(jvalue => { 38 | jvalue \ "attributes" \ "Wi-Fi" != JNothing 39 | }) 40 | .map(jvalue => { 41 | implicit val formats = DefaultFormats 42 | ((jvalue \ "attributes" \ "Wi-Fi").extract[String], (jvalue \ "stars").extract[Int]) 43 | }) 44 | .combineByKey( 45 | (v) => (v, 1), 46 | (accValue: (Int, Int), v) => (accValue._1 + v, accValue._2 + 1), 47 | (accCombine1: (Int, Int), accCombine2: (Int, Int)) => (accCombine1._1 + accCombine2._1, accCombine1._2 + accCombine2._2), 48 | new HashPartitioner(ssc.sparkContext.defaultParallelism)) 49 | .map({ case (k, v) => (k, v._1 / v._2.toFloat) }) 50 | .print() 51 | 52 | ssc.start() 53 | ssc.awaitTermination() 54 | } 55 | 56 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap6/spark.sbt: -------------------------------------------------------------------------------- 1 | import AssemblyKeys._ 2 | 3 | assemblySettings 4 | 5 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { mergeStrategy => { 6 | case entry => { 7 | val strategy = mergeStrategy(entry) 8 | if (strategy == MergeStrategy.deduplicate) MergeStrategy.first 9 | else strategy 10 | } 11 | }} 12 | 13 | name := "Chap6" 14 | 15 | version := "1.0" 16 | 17 | scalaVersion := "2.10.5" 18 | 19 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.4.0" 20 | 21 | libraryDependencies += "org.apache.spark" %% "spark-streaming" % "1.4.0" 22 | 23 | libraryDependencies += "org.json4s" %% "json4s-native" % "3.2.10" 24 | 25 | libraryDependencies += "org.apache.spark" %% "spark-streaming-mqtt" % "1.4.0" 26 | 27 | libraryDependencies += "org.eclipse.paho" % "org.eclipse.paho.client.mqttv3" % "1.0.1" 28 | 29 | libraryDependencies += "org.apache.httpcomponents" % "httpclient" % "4.5.1" 30 | 31 | libraryDependencies += "org.apache.commons" % "commons-pool2" % "2.4.2" 32 | 33 | libraryDependencies += "org.apache.hbase" % "hbase" % "0.98.15-hadoop2" 34 | 35 | //libraryDependencies += "org.apache.hbase" % "hbase-client" % "1.1.2" 36 | 37 | //libraryDependencies += "org.apache.hbase" % "hbase-server" % "1.1.2" 38 | 39 | //libraryDependencies += "org.apache.hbase" % "hbase-common" % "1.1.2" 40 | 41 | libraryDependencies += "org.apache.hbase" % "hbase-client" % "2.0.0-SNAPSHOT" 42 | 43 | libraryDependencies += "org.apache.hbase" % "hbase-server" % "2.0.0-SNAPSHOT" 44 | 45 | libraryDependencies += "org.apache.hbase" % "hbase-common" % "2.0.0-SNAPSHOT" 46 | 47 | libraryDependencies += "org.apache.hbase" % "hbase-spark" % "2.0.0-SNAPSHOT" 48 | 49 | resolvers += "Apache Snapshot Repository" at "https://repository.apache.org/content/repositories/snapshots" 50 | 51 | libraryDependencies += "org.apache.cassandra" % "cassandra-all" % "2.1.11" 52 | 53 | libraryDependencies += "com.datastax.spark" %% "spark-cassandra-connector" % "1.4.0" 54 | 55 | libraryDependencies += "redis.clients" % "jedis" % "2.7.3" 56 | 57 | resolvers += "MQTT Repository" at "https://repo.eclipse.org/content/repositories/paho-releases/" 58 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap9/src/main/scala/org/apress/prospark/L9-1LinearRegression.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import 
org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.linalg.Vectors 6 | import org.apache.spark.mllib.regression.LabeledPoint 7 | import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD 8 | import org.apache.spark.rdd.RDD 9 | import org.apache.spark.rdd.RDD.doubleRDDToDoubleRDDFunctions 10 | import org.apache.spark.streaming.Seconds 11 | import org.apache.spark.streaming.StreamingContext 12 | 13 | object LinearRegressionApp { 14 | 15 | def main(args: Array[String]) { 16 | if (args.length != 4) { 17 | System.err.println( 18 | "Usage: LinearRegressionApp ") 19 | System.exit(1) 20 | } 21 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 22 | 23 | val conf = new SparkConf() 24 | .setAppName(appName) 25 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 26 | 27 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 28 | 29 | val substream = ssc.socketTextStream(hostname, port.toInt) 30 | .filter(!_.contains("NaN")) 31 | .map(_.split(" ")) 32 | .filter(f => f(1) != "0") 33 | 34 | val datastream = substream.map(f => Array(f(2).toDouble, f(3).toDouble, f(4).toDouble, f(5).toDouble, f(6).toDouble)) 35 | .map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5)))) 36 | val test = datastream.transform(rdd => rdd.randomSplit(Array(0.3, 0.7))(0)) 37 | val train = datastream.transformWith(test, (r1: RDD[LabeledPoint], r2: RDD[LabeledPoint]) => r1.subtract(r2)).cache() 38 | val model = new StreamingLinearRegressionWithSGD() 39 | .setInitialWeights(Vectors.zeros(4)) 40 | .setStepSize(0.0001) 41 | .setNumIterations(1) 42 | 43 | model.trainOn(train) 44 | model.predictOnValues(test.map(v => (v.label, v.features))).foreachRDD(rdd => println("MSE: %f".format(rdd 45 | .map(v => math.pow((v._1 - v._2), 2)).mean()))) 46 | 47 | ssc.start() 48 | ssc.awaitTermination() 49 | } 50 | 51 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap9/src/main/scala/org/apress/prospark/L9-10KMeans.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.clustering.StreamingKMeans 6 | import org.apache.spark.mllib.linalg.Vectors 7 | import org.apache.spark.mllib.regression.LabeledPoint 8 | import org.apache.spark.rdd.RDD 9 | import org.apache.spark.rdd.RDD.doubleRDDToDoubleRDDFunctions 10 | import org.apache.spark.streaming.Seconds 11 | import org.apache.spark.streaming.StreamingContext 12 | 13 | object KMeansClusteringApp { 14 | 15 | def main(args: Array[String]) { 16 | if (args.length != 4) { 17 | System.err.println( 18 | "Usage: KMeansClusteringApp ") 19 | System.exit(1) 20 | } 21 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 22 | 23 | val conf = new SparkConf() 24 | .setAppName(appName) 25 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 26 | 27 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 28 | 29 | val substream = ssc.socketTextStream(hostname, port.toInt) 30 | .filter(!_.contains("NaN")) 31 | .map(_.split(" ")) 32 | .filter(f => f(1) != "0") 33 | 34 | val orientationStream = substream 35 | .map(f => Seq(1, 4, 5, 6, 10, 11, 12, 20, 21, 22, 26, 27, 28, 36, 37, 38, 42, 43, 44).map(i => f(i)).toArray) 36 | .map(arr => arr.map(_.toDouble)) 37 | .filter(f => f(0) == 1.0 || f(0) == 2.0 || f(0) == 3.0) 38 | .map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, f.length)))) 39 | val test 
= orientationStream.transform(rdd => rdd.randomSplit(Array(0.3, 0.7))(0)) 40 | val train = orientationStream.transformWith(test, (r1: RDD[LabeledPoint], r2: RDD[LabeledPoint]) => r1.subtract(r2)).cache() 41 | val model = new StreamingKMeans() 42 | .setK(3) 43 | .setDecayFactor(0) 44 | .setRandomCenters(18, 0.0) 45 | 46 | model.trainOn(train.map(v => v.features)) 47 | val prediction = model.predictOnValues(test.map(v => (v.label, v.features))) 48 | 49 | ssc.start() 50 | ssc.awaitTermination() 51 | } 52 | 53 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap5/src/main/java/org/apress/prospark/AbstractDriver.java: -------------------------------------------------------------------------------- 1 | package org.apress.prospark; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.IOException; 6 | import java.io.InputStreamReader; 7 | import java.util.Enumeration; 8 | import java.util.zip.ZipEntry; 9 | import java.util.zip.ZipFile; 10 | 11 | import org.apache.log4j.LogManager; 12 | import org.apache.log4j.Logger; 13 | 14 | public abstract class AbstractDriver { 15 | 16 | private static final Logger LOG = LogManager.getLogger(AbstractDriver.class); 17 | 18 | private String path; 19 | 20 | public AbstractDriver(String path) { 21 | this.path = path; 22 | } 23 | 24 | public abstract void init() throws Exception; 25 | 26 | public abstract void close() throws Exception; 27 | 28 | public abstract void sendRecord(String record) throws Exception; 29 | 30 | public void execute() throws Exception { 31 | 32 | try { 33 | init(); 34 | File dirPath = new File(path); 35 | if (dirPath.isDirectory()) { 36 | File[] files = new File(path).listFiles(); 37 | for (File f : files) { 38 | LOG.info(String.format("Feeding zipped file %s", f.getName())); 39 | ZipFile zFile = null; 40 | try { 41 | zFile = new ZipFile(f); 42 | Enumeration zEntries = zFile.entries(); 43 | 44 | while (zEntries.hasMoreElements()) { 45 | ZipEntry zEntry = zEntries.nextElement(); 46 | LOG.info(String.format("Feeding file %s", zEntry.getName())); 47 | try (BufferedReader br = new BufferedReader( 48 | new InputStreamReader(zFile.getInputStream(zEntry)))) { 49 | // skip header 50 | br.readLine(); 51 | String line; 52 | while ((line = br.readLine()) != null) { 53 | sendRecord(line); 54 | } 55 | } 56 | } 57 | } catch (IOException e) { 58 | LOG.error(e.getMessage()); 59 | } finally { 60 | if (zFile != null) { 61 | try { 62 | zFile.close(); 63 | } catch (IOException e) { 64 | LOG.error(e.getMessage()); 65 | } 66 | } 67 | } 68 | } 69 | } else { 70 | LOG.error(String.format("Path %s is not a directory", path)); 71 | } 72 | } finally { 73 | close(); 74 | } 75 | } 76 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap6/src/main/java/org/apress/prospark/AbstractDriver.java: -------------------------------------------------------------------------------- 1 | package org.apress.prospark; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.IOException; 6 | import java.io.InputStreamReader; 7 | import java.util.Enumeration; 8 | import java.util.zip.ZipEntry; 9 | import java.util.zip.ZipFile; 10 | 11 | import org.apache.log4j.LogManager; 12 | import org.apache.log4j.Logger; 13 | 14 | public abstract class AbstractDriver { 15 | 16 | private static final Logger LOG = LogManager.getLogger(AbstractDriver.class); 17 | 18 | private String path; 19 | 20 | public AbstractDriver(String 
path) { 21 | this.path = path; 22 | } 23 | 24 | public abstract void init() throws Exception; 25 | 26 | public abstract void close() throws Exception; 27 | 28 | public abstract void sendRecord(String record) throws Exception; 29 | 30 | public void execute() throws Exception { 31 | 32 | try { 33 | init(); 34 | File dirPath = new File(path); 35 | if (dirPath.isDirectory()) { 36 | File[] files = new File(path).listFiles(); 37 | for (File f : files) { 38 | LOG.info(String.format("Feeding zipped file %s", f.getName())); 39 | ZipFile zFile = null; 40 | try { 41 | zFile = new ZipFile(f); 42 | Enumeration zEntries = zFile.entries(); 43 | 44 | while (zEntries.hasMoreElements()) { 45 | ZipEntry zEntry = zEntries.nextElement(); 46 | LOG.info(String.format("Feeding file %s", zEntry.getName())); 47 | try (BufferedReader br = new BufferedReader( 48 | new InputStreamReader(zFile.getInputStream(zEntry)))) { 49 | // skip header 50 | br.readLine(); 51 | String line; 52 | while ((line = br.readLine()) != null) { 53 | sendRecord(line); 54 | } 55 | } 56 | } 57 | } catch (IOException e) { 58 | LOG.error(e.getMessage()); 59 | } finally { 60 | if (zFile != null) { 61 | try { 62 | zFile.close(); 63 | } catch (IOException e) { 64 | LOG.error(e.getMessage()); 65 | } 66 | } 67 | } 68 | } 69 | } else { 70 | LOG.error(String.format("Path %s is not a directory", path)); 71 | } 72 | } finally { 73 | close(); 74 | } 75 | } 76 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap5/src/main/java/org/apress/prospark/MqttDriver.java: -------------------------------------------------------------------------------- 1 | package org.apress.prospark; 2 | 3 | import java.nio.charset.StandardCharsets; 4 | 5 | import org.apache.log4j.LogManager; 6 | import org.apache.log4j.Logger; 7 | import org.eclipse.paho.client.mqttv3.MqttClient; 8 | import org.eclipse.paho.client.mqttv3.MqttException; 9 | import org.eclipse.paho.client.mqttv3.MqttMessage; 10 | import org.eclipse.paho.client.mqttv3.MqttTopic; 11 | import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence; 12 | 13 | public class MqttDriver extends AbstractDriver { 14 | 15 | private static final Logger LOG = LogManager.getLogger(MqttDriver.class); 16 | 17 | private final String brokerUrl; 18 | private final String topic; 19 | private MqttClient client; 20 | private MqttTopic mqttTopic; 21 | 22 | public MqttDriver(String path, String brokerUrl, String topic) { 23 | super(path); 24 | this.brokerUrl = brokerUrl; 25 | this.topic = topic; 26 | } 27 | 28 | @Override 29 | public void init() throws Exception { 30 | client = new MqttClient(brokerUrl, MqttClient.generateClientId(), new MemoryPersistence()); 31 | LOG.info(String.format("Attempting to connect to broker %s", brokerUrl)); 32 | client.connect(); 33 | mqttTopic = client.getTopic(topic); 34 | LOG.info(String.format("Connected to broker %s", brokerUrl)); 35 | } 36 | 37 | @Override 38 | public void close() throws Exception { 39 | if (client != null) { 40 | client.disconnect(); 41 | } 42 | } 43 | 44 | @Override 45 | public void sendRecord(String record) throws Exception { 46 | try { 47 | mqttTopic.publish(new MqttMessage(record.getBytes(StandardCharsets.UTF_8))); 48 | } catch (MqttException e) { 49 | if (e.getReasonCode() == MqttException.REASON_CODE_MAX_INFLIGHT) { 50 | Thread.sleep(10); 51 | } 52 | } 53 | } 54 | 55 | public static void main(String[] args) throws Exception { 56 | 57 | if (args.length != 3) { 58 | System.err.println("Usage:MqttDriver "); 59 | 
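// three arguments are required; they are read below as path (args[0]), brokerUrl (args[1]) and topic (args[2]).
// An illustrative launch (jar name, dataset path, broker URL and topic are placeholders, not from the original listing):
//   java -cp chap5-assembly.jar org.apress.prospark.MqttDriver /data/bikeshare-zips tcp://localhost:1883 bike-share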
System.exit(-1); 60 | } 61 | 62 | String path = args[0]; 63 | String brokerUrl = args[1]; 64 | String topic = args[2]; 65 | 66 | MqttDriver driver = new MqttDriver(path, brokerUrl, topic); 67 | try { 68 | driver.execute(); 69 | } finally { 70 | driver.close(); 71 | } 72 | } 73 | 74 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap6/src/main/java/org/apress/prospark/MqttDriver.java: -------------------------------------------------------------------------------- 1 | package org.apress.prospark; 2 | 3 | import java.nio.charset.StandardCharsets; 4 | 5 | import org.apache.log4j.LogManager; 6 | import org.apache.log4j.Logger; 7 | import org.eclipse.paho.client.mqttv3.MqttClient; 8 | import org.eclipse.paho.client.mqttv3.MqttException; 9 | import org.eclipse.paho.client.mqttv3.MqttMessage; 10 | import org.eclipse.paho.client.mqttv3.MqttTopic; 11 | import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence; 12 | 13 | public class MqttDriver extends AbstractDriver { 14 | 15 | private static final Logger LOG = LogManager.getLogger(MqttDriver.class); 16 | 17 | private final String brokerUrl; 18 | private final String topic; 19 | private MqttClient client; 20 | private MqttTopic mqttTopic; 21 | 22 | public MqttDriver(String path, String brokerUrl, String topic) { 23 | super(path); 24 | this.brokerUrl = brokerUrl; 25 | this.topic = topic; 26 | } 27 | 28 | @Override 29 | public void init() throws Exception { 30 | client = new MqttClient(brokerUrl, MqttClient.generateClientId(), new MemoryPersistence()); 31 | LOG.info(String.format("Attempting to connect to broker %s", brokerUrl)); 32 | client.connect(); 33 | mqttTopic = client.getTopic(topic); 34 | LOG.info(String.format("Connected to broker %s", brokerUrl)); 35 | } 36 | 37 | @Override 38 | public void close() throws Exception { 39 | if (client != null) { 40 | client.disconnect(); 41 | } 42 | } 43 | 44 | @Override 45 | public void sendRecord(String record) throws Exception { 46 | try { 47 | mqttTopic.publish(new MqttMessage(record.getBytes(StandardCharsets.UTF_8))); 48 | } catch (MqttException e) { 49 | if (e.getReasonCode() == MqttException.REASON_CODE_MAX_INFLIGHT) { 50 | Thread.sleep(10); 51 | } 52 | } 53 | } 54 | 55 | public static void main(String[] args) throws Exception { 56 | 57 | if (args.length != 3) { 58 | System.err.println("Usage:MqttDriver "); 59 | System.exit(-1); 60 | } 61 | 62 | String path = args[0]; 63 | String brokerUrl = args[1]; 64 | String topic = args[2]; 65 | 66 | MqttDriver driver = new MqttDriver(path, brokerUrl, topic); 67 | try { 68 | driver.execute(); 69 | } finally { 70 | driver.close(); 71 | } 72 | } 73 | 74 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap7/src/main/scala/org/apress/prospark/L7-2-3Tachyon.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions 6 | import org.apache.spark.storage.StorageLevel 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | import org.apache.spark.streaming.dstream.DStream 10 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 11 | 12 | object ReferrerApp { 13 | def main(args: Array[String]) { 14 | if (args.length != 7) { 15 | System.err.println( 16 | "Usage: ReferrerApp 
") 17 | System.exit(1) 18 | } 19 | val Seq(appName, hostname, port, tachyonUrl, checkpointDir, outputPathTop, outputPathSpark) = args.toSeq 20 | 21 | val conf = new SparkConf() 22 | .setAppName(appName) 23 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 24 | .set("spark.externalBlockStore.url", tachyonUrl) 25 | 26 | val ssc = new StreamingContext(conf, Seconds(10)) 27 | ssc.checkpoint(checkpointDir) 28 | 29 | val clickstream = ssc.socketTextStream(hostname, port.toInt) 30 | .map(rec => rec.split("\\t")) 31 | .persist(StorageLevel.OFF_HEAP) 32 | 33 | val topRefStream = clickstream 34 | .map(rec => { 35 | var prev_title = rec(3) 36 | if (!prev_title.startsWith("other")) { 37 | prev_title = "wikipedia" 38 | } 39 | (prev_title, 1) 40 | }) 41 | 42 | val topSparkStream = clickstream 43 | .filter(rec => rec(4).equals("Apache_Spark")) 44 | .map(rec => (rec(3), 1)) 45 | 46 | saveTopKeys(topRefStream, outputPathTop) 47 | 48 | saveTopKeys(topSparkStream, outputPathSpark) 49 | 50 | ssc.start() 51 | ssc.awaitTermination() 52 | } 53 | 54 | def saveTopKeys(clickstream: DStream[(String, Int)], outputPath: String) { 55 | clickstream.updateStateByKey((values, state: Option[Int]) => Some(values.sum + state.getOrElse(0))) 56 | .repartition(1) 57 | .map(rec => (rec._2, rec._1)) 58 | .transform(rec => rec.sortByKey(ascending = false)) 59 | .saveAsTextFiles(outputPath) 60 | } 61 | 62 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap8/src/main/scala/org/apress/prospark/L8-13HiveQL.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.sql.hive.HiveContext 9 | import org.apache.spark.streaming.Seconds 10 | import org.apache.spark.streaming.StreamingContext 11 | 12 | object CdrHiveqlApp { 13 | 14 | case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, 15 | smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, 16 | callOutActivity: Float, internetTrafficActivity: Float) 17 | 18 | def main(args: Array[String]) { 19 | if (args.length != 4) { 20 | System.err.println( 21 | "Usage: CdrHiveqlApp ") 22 | System.exit(1) 23 | } 24 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 25 | 26 | val conf = new SparkConf() 27 | .setAppName(appName) 28 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 29 | 30 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 31 | 32 | val cl = Thread.currentThread().getContextClassLoader() 33 | val hiveC = new HiveContext(ssc.sparkContext) 34 | Thread.currentThread().setContextClassLoader(cl) 35 | 36 | import hiveC.implicits._ 37 | 38 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 39 | .map(_.split("\\t", -1)) 40 | .foreachRDD(rdd => { 41 | seqToCdr(rdd).toDF().registerTempTable("cdrs") 42 | 43 | hiveC.sql("SET DATE_FMT='yy-MM-dd|HH'") 44 | hiveC.sql("SELECT from_unixtime(timeInterval, ${hiveconf:DATE_FMT}) AS TS, SUM(smsInActivity + smsOutActivity + callInActivity + callOutActivity + internetTrafficActivity) AS Activity FROM cdrs GROUP BY from_unixtime(timeInterval, ${hiveconf:DATE_FMT}) ORDER BY Activity DESC").show() 45 | }) 46 | 47 | ssc.start() 48 | ssc.awaitTermination() 49 | } 50 | 51 | def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { 52 | rdd.map(c => c.map(f => f match { 53 | case 
x if x.isEmpty() => "0" 54 | case x => x 55 | })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, 56 | c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) 57 | } 58 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap9/src/main/scala/org/apress/prospark/L9-9LogisticRegression.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.linalg.Vectors 6 | import org.apache.spark.mllib.regression.LabeledPoint 7 | import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD 8 | import org.apache.spark.rdd.RDD 9 | import org.apache.spark.rdd.RDD.doubleRDDToDoubleRDDFunctions 10 | import org.apache.spark.streaming.Seconds 11 | import org.apache.spark.streaming.StreamingContext 12 | import org.apache.spark.mllib.classification.StreamingLogisticRegressionWithSGD 13 | 14 | object LogisticRegressionApp { 15 | 16 | def main(args: Array[String]) { 17 | if (args.length != 4) { 18 | System.err.println( 19 | "Usage: LogisticRegressionApp ") 20 | System.exit(1) 21 | } 22 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 23 | 24 | val conf = new SparkConf() 25 | .setAppName(appName) 26 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 27 | 28 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 29 | 30 | val substream = ssc.socketTextStream(hostname, port.toInt) 31 | .filter(!_.contains("NaN")) 32 | .map(_.split(" ")) 33 | .filter(f => f(1) != "0") 34 | 35 | val datastream = substream.map(f => Array(f(1).toDouble, f(2).toDouble, f(4).toDouble, f(5).toDouble, f(6).toDouble)) 36 | 37 | val walkingOrRunning = datastream.filter(f => f(0) == 4.0 || f(0) == 5.0).map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5)))) 38 | val test = walkingOrRunning.transform(rdd => rdd.randomSplit(Array(0.3, 0.7))(0)) 39 | val train = walkingOrRunning.transformWith(test, (r1: RDD[LabeledPoint], r2: RDD[LabeledPoint]) => r1.subtract(r2)).cache() 40 | val model = new StreamingLogisticRegressionWithSGD() 41 | .setInitialWeights(Vectors.zeros(4)) 42 | .setStepSize(0.0001) 43 | .setNumIterations(1) 44 | 45 | model.trainOn(train) 46 | model.predictOnValues(test.map(v => (v.label, v.features))).foreachRDD(rdd => println("MSE: %f".format(rdd 47 | .map(v => math.pow((v._1 - v._2), 2)).mean()))) 48 | 49 | ssc.start() 50 | ssc.awaitTermination() 51 | } 52 | 53 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap8/src/main/scala/org/apress/prospark/L8-28DataFrameExamplesOps.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.sql.DataFrame 9 | import org.apache.spark.sql.SQLContext 10 | import org.apache.spark.streaming.Seconds 11 | import org.apache.spark.streaming.StreamingContext 12 | 13 | object CdrDataframeExamples2App { 14 | 15 | case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, 16 | smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, 17 | callOutActivity: Float, internetTrafficActivity: Float) 18 | 19 | def main(args: Array[String]) { 20 | if (args.length != 4) { 21 | 
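// four arguments are expected; they are destructured below as appName, batchInterval, hostname, and port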
System.err.println( 22 | "Usage: CdrDataframeExamples2App ") 23 | System.exit(1) 24 | } 25 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 26 | 27 | val conf = new SparkConf() 28 | .setAppName(appName) 29 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 30 | 31 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 32 | 33 | val sqlC = new SQLContext(ssc.sparkContext) 34 | import sqlC.implicits._ 35 | 36 | var previousCdrs: Option[DataFrame] = None 37 | 38 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 39 | .map(_.split("\\t", -1)) 40 | .foreachRDD(rdd => { 41 | val cdrs = seqToCdr(rdd).toDF().select("squareId", "countryCode").dropDuplicates() 42 | previousCdrs match { 43 | case Some(prevCdrs) => cdrs.unionAll(prevCdrs).show() 44 | //case Some(prevCdrs) => cdrs.intersect(prevCdrs).show() 45 | //case Some(prevCdrs) => cdrs.except(prevCdrs).show() 46 | case None => Unit 47 | } 48 | previousCdrs = Some(cdrs) 49 | }) 50 | 51 | ssc.start() 52 | ssc.awaitTermination() 53 | } 54 | 55 | def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { 56 | rdd.map(c => c.map(f => f match { 57 | case x if x.isEmpty() => "0" 58 | case x => x 59 | })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, 60 | c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) 61 | } 62 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap8/src/main/scala/org/apress/prospark/L8-3-6-7DataFrameCreation.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.sql.SQLContext 9 | import org.apache.spark.sql.functions.desc 10 | import org.apache.spark.streaming.Seconds 11 | import org.apache.spark.streaming.StreamingContext 12 | import org.json4s.native.Serialization.write 13 | import org.json4s.DefaultFormats 14 | 15 | object DataframeCreationApp { 16 | 17 | case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, 18 | smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, 19 | callOutActivity: Float, internetTrafficActivity: Float) 20 | 21 | def main(args: Array[String]) { 22 | if (args.length != 4) { 23 | System.err.println( 24 | "Usage: CdrDataframeApp ") 25 | System.exit(1) 26 | } 27 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 28 | 29 | val conf = new SparkConf() 30 | .setAppName(appName) 31 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 32 | 33 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 34 | 35 | val sqlC = new SQLContext(ssc.sparkContext) 36 | import sqlC.implicits._ 37 | 38 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 39 | .map(_.split("\\t", -1)) 40 | .foreachRDD(rdd => { 41 | //val cdrs = sqlC.createDataFrame(seqToCdr(rdd)) 42 | //val cdrs = sqlC.createDataFrame(seqToCdr(rdd).collect()) 43 | //val cdrs = seqToCdr(rdd).toDF() 44 | val cdrsJson = seqToCdr(rdd).map(r => { 45 | implicit val formats = DefaultFormats 46 | write(r) 47 | }) 48 | val cdrs = sqlC.read.json(cdrsJson) 49 | 50 | cdrs.groupBy("countryCode").count().orderBy(desc("count")).show(5) 51 | }) 52 | 53 | ssc.start() 54 | ssc.awaitTermination() 55 | 56 | } 57 | 58 | def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { 59 | rdd.map(c => c.map(f => f match { 60 | case x if x.isEmpty() => "0" 61 | 
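// Non-empty fields pass through unchanged; the preceding case substitutes "0" for empty CDR fields
// so the toInt/toLong/toFloat conversions below do not fail on blank columns.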
case x => x 62 | })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, 63 | c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) 64 | } 65 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap6/src/main/scala/org/apress/prospark/L6-5Exception.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import java.nio.charset.StandardCharsets 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | import org.eclipse.paho.client.mqttv3.MqttClient 10 | import org.eclipse.paho.client.mqttv3.MqttMessage 11 | import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence 12 | import org.json4s.DefaultFormats 13 | import org.json4s.JField 14 | import org.json4s.JsonAST.JObject 15 | import org.json4s.jvalue2extractable 16 | import org.json4s.jvalue2monadic 17 | import org.json4s.native.JsonMethods.parse 18 | import org.json4s.string2JsonInput 19 | 20 | object MqttSinkAppA { 21 | 22 | def main(args: Array[String]) { 23 | if (args.length != 3) { 24 | System.err.println( 25 | "Usage: MqttSinkApp ") 26 | System.exit(1) 27 | } 28 | 29 | val Seq(appName, outputBrokerUrl, topic) = args.toSeq 30 | 31 | val conf = new SparkConf() 32 | .setAppName(appName) 33 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 34 | 35 | val batchInterval = 10 36 | 37 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 38 | 39 | HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 40 | interval = batchInterval) 41 | .flatMap(rec => { 42 | val query = parse(rec) \ "query" 43 | ((query \ "results" \ "quote").children).map(rec => JObject(JField("Timestamp", query \ "created")).merge(rec)) 44 | }) 45 | .map(rec => { 46 | implicit val formats = DefaultFormats 47 | rec.children.map(f => f.extract[String]) mkString "," 48 | }) 49 | .foreachRDD { rdd => 50 | val client = new MqttClient(outputBrokerUrl, MqttClient.generateClientId(), new MemoryPersistence()) 51 | client.connect() 52 | rdd.foreach(rec => client.publish(topic, new MqttMessage(rec.getBytes(StandardCharsets.UTF_8)))) 53 | client.disconnect() 54 | client.close() 55 | } 56 | 57 | ssc.start() 58 | ssc.awaitTermination() 59 | } 60 | 61 | } 62 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap10/src/main/scala/org/apress/prospark/L10-9Graph.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.graphx.Edge 6 | import org.apache.spark.graphx.Graph 7 | import org.apache.spark.graphx.Graph.graphToGraphOps 8 | import org.apache.spark.streaming.Seconds 9 | import org.apache.spark.streaming.StreamingContext 10 | import org.json4s.DefaultFormats 11 | import org.json4s.jvalue2extractable 12 | import org.json4s.jvalue2monadic 13 | import org.json4s.native.JsonMethods.parse 14 | import org.json4s.string2JsonInput 15 | 16 | object UserRankApp { 17 | 18 | def main(args: Array[String]) { 19 | if (args.length != 4) { 20 | 
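// Expected arguments, going by the Seq destructuring below, appear to be
// <appName> <batchInterval> <hostname> <port>.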
System.err.println( 21 | "Usage: UserRankApp ") 22 | System.exit(1) 23 | } 24 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 25 | 26 | val conf = new SparkConf() 27 | .setAppName(appName) 28 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 29 | 30 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 31 | 32 | ssc.socketTextStream(hostname, port.toInt) 33 | .map(r => { 34 | implicit val formats = DefaultFormats 35 | parse(r) 36 | }) 37 | .foreachRDD(rdd => { 38 | val edges = rdd.map(jvalue => { 39 | implicit val formats = DefaultFormats 40 | ((jvalue \ "user_id").extract[String], (jvalue \ "friends").extract[Array[String]]) 41 | }) 42 | .flatMap(r => r._2.map(f => Edge(r._1.hashCode.toLong, f.hashCode.toLong, 1.0))) 43 | 44 | val vertices = rdd.map(jvalue => { 45 | implicit val formats = DefaultFormats 46 | ((jvalue \ "user_id").extract[String]) 47 | }) 48 | .map(r => (r.hashCode.toLong, r)) 49 | 50 | val tolerance = 0.0001 51 | val graph = Graph(vertices, edges, "defaultUser") 52 | .subgraph(vpred = (id, idStr) => idStr != "defaultUser") 53 | val pr = graph.pageRank(tolerance).cache 54 | 55 | graph.outerJoinVertices(pr.vertices) { 56 | (userId, attrs, rank) => (rank.getOrElse(0.0).asInstanceOf[Number].doubleValue, attrs) 57 | }.vertices.top(10) { 58 | Ordering.by(_._2._1) 59 | }.foreach(rec => println("User id: %s, Rank: %f".format(rec._2._2, rec._2._1))) 60 | }) 61 | 62 | ssc.start() 63 | ssc.awaitTermination() 64 | 65 | } 66 | 67 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap6/src/main/scala/org/apress/prospark/L6-7PerPartition.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import java.nio.charset.StandardCharsets 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | import org.eclipse.paho.client.mqttv3.MqttClient 10 | import org.eclipse.paho.client.mqttv3.MqttMessage 11 | import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence 12 | import org.json4s.DefaultFormats 13 | import org.json4s.JField 14 | import org.json4s.JsonAST.JObject 15 | import org.json4s.jvalue2extractable 16 | import org.json4s.jvalue2monadic 17 | import org.json4s.native.JsonMethods.parse 18 | import org.json4s.string2JsonInput 19 | 20 | object MqttSinkAppC { 21 | 22 | def main(args: Array[String]) { 23 | if (args.length != 3) { 24 | System.err.println( 25 | "Usage: MqttSinkApp ") 26 | System.exit(1) 27 | } 28 | 29 | val Seq(appName, outputBrokerUrl, topic) = args.toSeq 30 | 31 | val conf = new SparkConf() 32 | .setAppName(appName) 33 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 34 | 35 | val batchInterval = 10 36 | 37 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 38 | 39 | HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 40 | interval = batchInterval) 41 | .flatMap(rec => { 42 | val query = parse(rec) \ "query" 43 | ((query \ "results" \ "quote").children).map(rec => JObject(JField("Timestamp", query \ "created")).merge(rec)) 44 | }) 45 | .map(rec => { 46 | implicit val formats = DefaultFormats 47 | rec.children.map(f => 
f.extract[String]) mkString "," 48 | }) 49 | .foreachRDD { rdd => 50 | rdd.foreachPartition { par => 51 | val client = new MqttClient(outputBrokerUrl, MqttClient.generateClientId(), new MemoryPersistence()) 52 | client.connect() 53 | par.foreach(rec => client.publish(topic, new MqttMessage(rec.getBytes(StandardCharsets.UTF_8)))) 54 | client.disconnect() 55 | client.close() 56 | } 57 | } 58 | 59 | ssc.start() 60 | ssc.awaitTermination() 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap5/src/main/scala/org/apress/prospark/HttpInputDStreamAsync.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.ClassTag 4 | 5 | import org.apache.spark.Logging 6 | import org.apache.spark.storage.StorageLevel 7 | import org.apache.spark.streaming.StreamingContext 8 | import org.apache.spark.streaming.api.java.JavaDStream 9 | import org.apache.spark.streaming.api.java.JavaDStream.fromDStream 10 | import org.apache.spark.streaming.api.java.JavaStreamingContext 11 | import org.apache.spark.streaming.dstream.DStream 12 | import org.apache.spark.streaming.dstream.ReceiverInputDStream 13 | import org.apache.spark.streaming.receiver.Receiver 14 | 15 | import com.ning.http.client.AsyncCompletionHandler 16 | import com.ning.http.client.AsyncHttpClient 17 | import com.ning.http.client.Response 18 | 19 | class HttpInputDStreamAsync( 20 | @transient ssc_ : StreamingContext, 21 | storageLevel: StorageLevel, 22 | url: String) extends ReceiverInputDStream[String](ssc_) with Logging { 23 | 24 | def getReceiver(): Receiver[String] = { 25 | new HttpReceiverAsync(storageLevel, url) 26 | } 27 | } 28 | 29 | class HttpReceiverAsync( 30 | storageLevel: StorageLevel, 31 | url: String) extends Receiver[String](storageLevel) with Logging { 32 | 33 | var asyncHttpClient: AsyncHttpClient = _ 34 | 35 | def onStop() { 36 | asyncHttpClient.close() 37 | logInfo("Disconnected from Http Server") 38 | } 39 | 40 | def onStart() { 41 | asyncHttpClient = new AsyncHttpClient() 42 | asyncHttpClient.prepareGet(url).execute(new AsyncCompletionHandler[Response]() { 43 | 44 | override def onCompleted(response: Response): Response = { 45 | store(response.getResponseBody) 46 | return response 47 | } 48 | 49 | override def onThrowable(t: Throwable) { 50 | restart("Error! 
Problems while connecting", t) 51 | } 52 | }); 53 | logInfo("Http Connection initiated") 54 | } 55 | 56 | } 57 | 58 | object HttpUtilsAsync { 59 | def createStream( 60 | ssc: StreamingContext, 61 | storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2, 62 | url: String): DStream[String] = { 63 | new HttpInputDStreamAsync(ssc, storageLevel, url) 64 | } 65 | 66 | def createStream( 67 | jssc: JavaStreamingContext, 68 | storageLevel: StorageLevel, 69 | url: String): JavaDStream[String] = { 70 | implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[String]] 71 | createStream(jssc.ssc, storageLevel, url) 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap6/src/main/scala/org/apress/prospark/L6-6PerRecord.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import java.nio.charset.StandardCharsets 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | import org.eclipse.paho.client.mqttv3.MqttClient 10 | import org.eclipse.paho.client.mqttv3.MqttMessage 11 | import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence 12 | import org.json4s.DefaultFormats 13 | import org.json4s.JField 14 | import org.json4s.JsonAST.JObject 15 | import org.json4s.jvalue2extractable 16 | import org.json4s.jvalue2monadic 17 | import org.json4s.native.JsonMethods.parse 18 | import org.json4s.string2JsonInput 19 | 20 | object MqttSinkAppB { 21 | 22 | def main(args: Array[String]) { 23 | if (args.length != 3) { 24 | System.err.println( 25 | "Usage: MqttSinkApp ") 26 | System.exit(1) 27 | } 28 | 29 | val Seq(appName, outputBrokerUrl, topic) = args.toSeq 30 | 31 | val conf = new SparkConf() 32 | .setAppName(appName) 33 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 34 | 35 | val batchInterval = 10 36 | 37 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 38 | 39 | HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 40 | interval = batchInterval) 41 | .flatMap(rec => { 42 | val query = parse(rec) \ "query" 43 | ((query \ "results" \ "quote").children).map(rec => JObject(JField("Timestamp", query \ "created")).merge(rec)) 44 | }) 45 | .map(rec => { 46 | implicit val formats = DefaultFormats 47 | rec.children.map(f => f.extract[String]) mkString "," 48 | }) 49 | .foreachRDD { rdd => 50 | rdd.foreach { rec => 51 | { 52 | val client = new MqttClient(outputBrokerUrl, MqttClient.generateClientId(), new MemoryPersistence()) 53 | client.connect() 54 | client.publish(topic, new MqttMessage(rec.getBytes(StandardCharsets.UTF_8))) 55 | client.disconnect() 56 | client.close() 57 | } 58 | } 59 | } 60 | 61 | ssc.start() 62 | ssc.awaitTermination() 63 | } 64 | 65 | } 66 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap7/src/main/scala/org/apress/prospark/L7-4UI.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import java.util.concurrent.atomic.AtomicLong 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import 
org.apache.spark.rdd.RDD 8 | import org.apache.spark.streaming.Seconds 9 | import org.apache.spark.streaming.StreamingContext 10 | 11 | object SocialSearchApp { 12 | def main(args: Array[String]) { 13 | if (args.length != 3) { 14 | System.err.println( 15 | "Usage: SocialSearchApp ") 16 | System.exit(1) 17 | } 18 | val Seq(appName, hostname, port) = args.toSeq 19 | 20 | val conf = new SparkConf() 21 | .setAppName(appName) 22 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 23 | //.set("spark.eventLog.enabled", "true") 24 | //.set("spark.eventLog.dir", "/tmp/historical") 25 | 26 | 27 | val countSearch = new AtomicLong(0) 28 | val countSocial = new AtomicLong(0) 29 | 30 | val ssc = new StreamingContext(conf, Seconds(1)) 31 | 32 | val titleStream = ssc.socketTextStream(hostname, port.toInt) 33 | .map(rec => rec.split("\\t")) 34 | .filter(_(3) match { 35 | case "other-google" | "other-bing" | "other-yahoo" | "other-facebook" | "other-twitter" => true 36 | case _ => false 37 | }) 38 | .map(rec => (rec(3), rec(4))) 39 | .cache() 40 | 41 | val searchStream = titleStream.filter(_._1 match { 42 | case "other-google" | "other-bing" | "other-yahoo" => true 43 | case _ => false 44 | }) 45 | .map(rec => rec._2) 46 | 47 | val socialStream = titleStream.filter(_._1 match { 48 | case "other-facebook" | "other-twitter" => true 49 | case _ => false 50 | }) 51 | .map(rec => rec._2) 52 | 53 | val exclusiveSearch = searchStream.transformWith(socialStream, 54 | (searchRDD: RDD[String], socialRDD: RDD[String]) => searchRDD.subtract(socialRDD)) 55 | .foreachRDD(rdd => { 56 | countSearch.addAndGet(rdd.count()) 57 | println("Exclusive count search engines: " + countSearch) 58 | }) 59 | 60 | val exclusiveSocial = socialStream.transformWith(searchStream, 61 | (socialRDD: RDD[String], searchRDD: RDD[String]) => socialRDD.subtract(searchRDD)) 62 | .foreachRDD(rdd => { 63 | countSocial.addAndGet(rdd.count()) 64 | println("Exclusive count social media: " + countSocial) 65 | }) 66 | 67 | ssc.start() 68 | ssc.awaitTermination() 69 | } 70 | 71 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap8/src/main/scala/org/apress/prospark/L8-38SparkR.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.rdd.RDD 7 | import org.apache.spark.sql.hive.HiveContext 8 | import org.apache.spark.streaming.Seconds 9 | import org.apache.spark.streaming.StreamingContext 10 | import java.nio.file.Paths 11 | import org.apache.spark.SparkFiles 12 | 13 | object CdrStreamingSparkRApp { 14 | 15 | case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, 16 | smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, 17 | callOutActivity: Float, internetTrafficActivity: Float) 18 | 19 | def main(args: Array[String]) { 20 | if (args.length != 7) { 21 | System.err.println( 22 | "Usage: CdrStreamingSparkRApp ") 23 | System.exit(1) 24 | } 25 | val Seq(appName, batchInterval, hostname, port, tableName, rScriptPath, logsPath) = args.toSeq 26 | 27 | val conf = new SparkConf() 28 | .setAppName(appName) 29 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 30 | 31 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 32 | 33 | val cl = Thread.currentThread().getContextClassLoader() 34 | val hiveC = new HiveContext(ssc.sparkContext) 35 | 
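// Restore the context class loader saved above: constructing HiveContext can replace the thread's
// class loader, and putting the original back here looks like a workaround to keep subsequent class
// loading in the streaming job unaffected (intent inferred from the code, not documented behaviour).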
Thread.currentThread().setContextClassLoader(cl) 36 | 37 | import hiveC.implicits._ 38 | 39 | ssc.sparkContext.addFile(rScriptPath) 40 | val rScriptName = SparkFiles.get(Paths.get(rScriptPath).getFileName.toString) 41 | val master = hiveC.sparkContext.getConf.get("spark.master") 42 | 43 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 44 | .map(_.split("\\t", -1)) 45 | .foreachRDD((rdd, time) => { 46 | val iTableName = tableName + time.milliseconds 47 | seqToCdr(rdd).toDF().write.saveAsTable(iTableName) 48 | hiveC.sparkContext.parallelize(Array(iTableName)).pipe("%s %s".format(rScriptName, master)).saveAsTextFile(Paths.get(logsPath, iTableName).toString) 49 | }) 50 | 51 | ssc.start() 52 | ssc.awaitTermination() 53 | } 54 | 55 | def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { 56 | rdd.map(c => c.map(f => f match { 57 | case x if x.isEmpty() => "0" 58 | case x => x 59 | })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, 60 | c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) 61 | } 62 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap6/src/main/scala/org/apress/prospark/L6-8Static.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import java.nio.charset.StandardCharsets 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | import org.eclipse.paho.client.mqttv3.MqttClient 10 | import org.eclipse.paho.client.mqttv3.MqttMessage 11 | import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence 12 | import org.json4s.DefaultFormats 13 | import org.json4s.JField 14 | import org.json4s.JsonAST.JObject 15 | import org.json4s.jvalue2extractable 16 | import org.json4s.jvalue2monadic 17 | import org.json4s.native.JsonMethods.parse 18 | import org.json4s.string2JsonInput 19 | 20 | object MqttSinkAppD { 21 | 22 | def main(args: Array[String]) { 23 | if (args.length != 3) { 24 | System.err.println( 25 | "Usage: MqttSinkApp ") 26 | System.exit(1) 27 | } 28 | 29 | val Seq(appName, outputBrokerUrl, topic) = args.toSeq 30 | 31 | val conf = new SparkConf() 32 | .setAppName(appName) 33 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 34 | 35 | val batchInterval = 10 36 | 37 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 38 | 39 | HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 40 | interval = batchInterval) 41 | .flatMap(rec => { 42 | val query = parse(rec) \ "query" 43 | ((query \ "results" \ "quote").children).map(rec => JObject(JField("Timestamp", query \ "created")).merge(rec)) 44 | }) 45 | .map(rec => { 46 | implicit val formats = DefaultFormats 47 | rec.children.map(f => f.extract[String]) mkString "," 48 | }) 49 | .foreachRDD { rdd => 50 | rdd.foreachPartition { par => 51 | par.foreach(message => MqttSink().publish(topic, new MqttMessage(message.getBytes(StandardCharsets.UTF_8)))) 52 | } 53 | } 54 | 55 | ssc.start() 56 | ssc.awaitTermination() 57 | } 58 | } 59 | 60 | object MqttSink { 61 | val brokerUrl = "tcp://localhost:1883" 62 | val client = new MqttClient(brokerUrl, MqttClient.generateClientId(), new MemoryPersistence()) 63 | 
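// A single MqttClient is created per JVM (effectively one per executor) and shared by every task
// that calls MqttSink(); the shutdown hook below releases the connection when the process exits.
// Minimal usage sketch, mirroring the foreachPartition loop in MqttSinkAppD above and assuming the
// hard-coded broker URL tcp://localhost:1883:
//   rdd.foreachPartition(par =>
//     par.foreach(msg => MqttSink().publish(topic, new MqttMessage(msg.getBytes(StandardCharsets.UTF_8)))))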
client.connect() 64 | sys.addShutdownHook { 65 | client.disconnect() 66 | client.close() 67 | } 68 | 69 | def apply(): MqttClient = { 70 | client 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap6/src/main/scala/org/apress/prospark/L6-23UpdateState.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.streaming.Seconds 6 | import org.apache.spark.streaming.StreamingContext 7 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 8 | import org.json4s.DefaultFormats 9 | import org.json4s.jvalue2extractable 10 | import org.json4s.jvalue2monadic 11 | import org.json4s.native.JsonMethods.parse 12 | import org.json4s.string2JsonInput 13 | 14 | object StatefulUpdateStateApp { 15 | 16 | def main(args: Array[String]) { 17 | if (args.length != 2) { 18 | System.err.println( 19 | "Usage: StatefulUpdateStateApp ") 20 | System.exit(1) 21 | } 22 | 23 | val Seq(appName, checkpointDir) = args.toSeq 24 | 25 | val conf = new SparkConf() 26 | .setAppName(appName) 27 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 28 | 29 | val batchInterval = 10 30 | 31 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 32 | ssc.checkpoint(checkpointDir) 33 | 34 | HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 35 | interval = batchInterval) 36 | .flatMap(rec => { 37 | implicit val formats = DefaultFormats 38 | val query = parse(rec) \ "query" 39 | ((query \ "results" \ "quote").children) 40 | .map(rec => ((rec \ "symbol").extract[String], ((rec \ "LastTradePriceOnly").extract[String].toFloat, (rec \ "Volume").extract[String].toLong))) 41 | }) 42 | .updateStateByKey(updateState) 43 | .print() 44 | 45 | def updateState(values: Seq[(Float, Long)], state: Option[(Long, Long, Long)]): Option[(Long, Long, Long)] = { 46 | val volumes = values.map(s => s._2) 47 | val localMin = volumes.min 48 | val localMax = volumes.max 49 | val localCount500 = values.map(s => s._1).count(price => price > 500) 50 | val globalValues = state.getOrElse((Long.MaxValue, Long.MinValue, 0L)).asInstanceOf[(Long, Long, Long)] 51 | val newMin = if (localMin < globalValues._1) localMin else globalValues._1 52 | val newMax = if (localMax > globalValues._2) localMax else globalValues._2 53 | val newCount500 = globalValues._3 + localCount500 54 | return Some(newMin, newMax, newCount500) 55 | } 56 | 57 | ssc.start() 58 | ssc.awaitTermination() 59 | } 60 | } 61 | 62 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap8/src/main/scala/org/apress/prospark/L8-35DataFrameExamplesRDD.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.sql.SQLContext 9 | import org.apache.spark.sql.types.DataType 10 | import org.apache.spark.sql.types.StructType 11 | import org.apache.spark.streaming.Seconds 12 | import 
org.apache.spark.streaming.StreamingContext 13 | import org.json4s.DefaultFormats 14 | 15 | object CdrDataframeExamplesRDDApp { 16 | 17 | case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, 18 | smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, 19 | callOutActivity: Float, internetTrafficActivity: Float) 20 | 21 | def main(args: Array[String]) { 22 | if (args.length != 5) { 23 | System.err.println( 24 | "Usage: CdrDataframeExamplesRDDApp ") 25 | System.exit(1) 26 | } 27 | val Seq(appName, batchInterval, hostname, port, schemaFile) = args.toSeq 28 | 29 | val conf = new SparkConf() 30 | .setAppName(appName) 31 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 32 | 33 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 34 | 35 | val sqlC = new SQLContext(ssc.sparkContext) 36 | import sqlC.implicits._ 37 | implicit val formats = DefaultFormats 38 | 39 | val schemaJson = scala.io.Source.fromFile(schemaFile).mkString 40 | val schema = DataType.fromJson(schemaJson).asInstanceOf[StructType] 41 | 42 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 43 | .map(_.split("\\t", -1)) 44 | .foreachRDD(rdd => { 45 | val cdrs = seqToCdr(rdd).toDF() 46 | val highInternet = sqlC.createDataFrame(cdrs.rdd.filter(r => r.getFloat(3) + r.getFloat(4) >= r.getFloat(5) + r.getFloat(6)), schema) 47 | val highOther = cdrs.except(highInternet) 48 | val highInternetGrid = highInternet.select("squareId", "countryCode").dropDuplicates() 49 | val highOtherGrid = highOther.select("squareId", "countryCode").dropDuplicates() 50 | highOtherGrid.except(highInternetGrid).show() 51 | highInternetGrid.except(highOtherGrid).show() 52 | }) 53 | 54 | ssc.start() 55 | ssc.awaitTermination() 56 | } 57 | 58 | def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { 59 | rdd.map(c => c.map(f => f match { 60 | case x if x.isEmpty() => "0" 61 | case x => x 62 | })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, 63 | c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) 64 | } 65 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap6/src/main/scala/org/apress/prospark/L6-16SparkHBase.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.hadoop.hbase.HBaseConfiguration 4 | import org.apache.hadoop.hbase.TableName 5 | import org.apache.hadoop.hbase.client.Put 6 | import org.apache.hadoop.hbase.spark.HBaseContext 7 | import org.apache.hadoop.hbase.util.Bytes 8 | import org.apache.spark.SparkConf 9 | import org.apache.spark.SparkContext 10 | import org.apache.spark.streaming.Seconds 11 | import org.apache.spark.streaming.StreamingContext 12 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 13 | import org.json4s.DefaultFormats 14 | import org.json4s.jvalue2extractable 15 | import org.json4s.jvalue2monadic 16 | import org.json4s.native.JsonMethods.parse 17 | import org.json4s.string2JsonInput 18 | 19 | object SparkHBaseBulkPutApp { 20 | 21 | def main(args: Array[String]) { 22 | if (args.length != 4) { 23 | System.err.println( 24 | "Usage: SparkHBaseBulkPutApp ") 25 | System.exit(1) 26 | } 27 | 28 | val Seq(appName, tableName, columnFamilyName, columnName) = args.toSeq 29 | 30 | val conf = new SparkConf() 31 | .setAppName(appName) 32 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 33 | 34 | val batchInterval = 10 35 | val windowSize = 20 36 | val slideInterval = 10 37 | 38 | val 
ssc = new StreamingContext(conf, Seconds(batchInterval)) 39 | 40 | val hbaseConf = HBaseConfiguration.create() 41 | val hContext = new HBaseContext(ssc.sparkContext, hbaseConf) 42 | 43 | val windowed = HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 44 | interval = batchInterval) 45 | .flatMap(rec => { 46 | implicit val formats = DefaultFormats 47 | val query = parse(rec) \ "query" 48 | ((query \ "results" \ "quote").children) 49 | .map(rec => ((rec \ "symbol").extract[String], (rec \ "LastTradePriceOnly").extract[String].toFloat)) 50 | }) 51 | .reduceByKeyAndWindow((x: Float, y: Float) => (x + y), Seconds(windowSize), Seconds(slideInterval)) 52 | 53 | hContext.streamBulkPut[(String, Float)](windowed, TableName.valueOf(tableName), rec => { 54 | val put = new Put(rec._1.getBytes) 55 | put.addColumn(columnFamilyName.getBytes, columnName.getBytes, Bytes.toBytes(rec._2 / (windowSize / batchInterval))) 56 | put 57 | }) 58 | 59 | ssc.start() 60 | ssc.awaitTermination() 61 | } 62 | } 63 | 64 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap6/src/main/scala/org/apress/prospark/L6-22Counters.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import java.util.concurrent.atomic.AtomicLong 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | import org.json4s.DefaultFormats 10 | import org.json4s.jvalue2extractable 11 | import org.json4s.jvalue2monadic 12 | import org.json4s.native.JsonMethods.parse 13 | import org.json4s.string2JsonInput 14 | 15 | object StatefulCountersApp { 16 | 17 | def main(args: Array[String]) { 18 | if (args.length != 1) { 19 | System.err.println( 20 | "Usage: StatefulCountersApp ") 21 | System.exit(1) 22 | } 23 | 24 | val Seq(appName) = args.toSeq 25 | 26 | val conf = new SparkConf() 27 | .setAppName(appName) 28 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 29 | 30 | val batchInterval = 10 31 | 32 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 33 | 34 | var globalMax: AtomicLong = new AtomicLong(Long.MinValue) 35 | var globalMin: AtomicLong = new AtomicLong(Long.MaxValue) 36 | var globalCounter500: AtomicLong = new AtomicLong(0) 37 | 38 | HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 39 | interval = batchInterval) 40 | .flatMap(rec => { 41 | implicit val formats = DefaultFormats 42 | val query = parse(rec) \ "query" 43 | ((query \ "results" \ "quote").children) 44 | .map(rec => ((rec \ "symbol").extract[String], (rec \ "LastTradePriceOnly").extract[String].toFloat, (rec \ "Volume").extract[String].toLong)) 45 | }) 46 | .foreachRDD(rdd => { 47 | val stocks = rdd.take(10) 48 | stocks.foreach(stock => { 49 | val price = stock._2 50 | val volume = stock._3 51 | if (volume > globalMax.get()) { 52 | globalMax.set(volume) 53 | } 54 | if (volume < globalMin.get()) { 55 | globalMin.set(volume) 56 | } 57 | if (price > 
500) { 58 | globalCounter500.incrementAndGet() 59 | } 60 | }) 61 | if (globalCounter500.get() > 1000L) { 62 | println("Global counter has reached 1000") 63 | println("Max ----> " + globalMax.get) 64 | println("Min ----> " + globalMin.get) 65 | globalCounter500.set(0) 66 | } 67 | }) 68 | 69 | ssc.start() 70 | ssc.awaitTermination() 71 | } 72 | } 73 | 74 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap8/src/main/scala/org/apress/prospark/T8-3DataFrameExamplesNA.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.sql.SQLContext 9 | import org.apache.spark.streaming.Seconds 10 | import org.apache.spark.streaming.StreamingContext 11 | import org.json4s.DefaultFormats 12 | import org.json4s.JDouble 13 | import org.json4s.JObject 14 | import org.json4s.jvalue2extractable 15 | import org.json4s.jvalue2monadic 16 | import org.json4s.native.JsonMethods.compact 17 | import org.json4s.native.JsonMethods.parse 18 | import org.json4s.native.JsonMethods.render 19 | import org.json4s.string2JsonInput 20 | 21 | object CdrDataframeExamplesNAApp { 22 | 23 | case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, 24 | smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, 25 | callOutActivity: Float, internetTrafficActivity: Float) 26 | 27 | def main(args: Array[String]) { 28 | if (args.length != 4) { 29 | System.err.println( 30 | "Usage: CdrDataframeExamplesNAApp ") 31 | System.exit(1) 32 | } 33 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 34 | 35 | val conf = new SparkConf() 36 | .setAppName(appName) 37 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 38 | 39 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 40 | 41 | val sqlC = new SQLContext(ssc.sparkContext) 42 | import sqlC.implicits._ 43 | implicit val formats = DefaultFormats 44 | 45 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 46 | .map(_.split("\\t", -1)) 47 | .foreachRDD(rdd => { 48 | val cdrs = seqToCdr(rdd).toDF() 49 | cdrs.na.drop("any").show() 50 | cdrs.na.fill(0, Array("squareId")).show() 51 | cdrs.na.replace("squareId", Map(0 -> 1)).show() 52 | println("Correlation: " + cdrs.stat.corr("smsOutActivity", "callOutActivity")) 53 | println("Covariance: " + cdrs.stat.cov("smsInActivity", "callInActivity")) 54 | cdrs.stat.crosstab("squareId", "countryCode").show() 55 | cdrs.stat.freqItems(Array("squareId", "countryCode"), 0.1).show() 56 | cdrs.stat.crosstab("callOutActivity", "callInActivity").show() 57 | }) 58 | 59 | ssc.start() 60 | ssc.awaitTermination() 61 | } 62 | 63 | def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { 64 | rdd.map(c => c.map(f => f match { 65 | case x if x.isEmpty() => "0" 66 | case x => x 67 | })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, 68 | c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) 69 | } 70 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap3/src/main/scala/org/apress/prospark/L3-DStreamMapping.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | import 
org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext } 6 | import org.apache.hadoop.io.{ Text, LongWritable, IntWritable } 7 | import org.apache.hadoop.fs.Path 8 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat 9 | import org.apache.spark.streaming.dstream.DStream 10 | import org.apache.hadoop.mapred.TextOutputFormat 11 | import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat } 12 | import org.apache.spark.streaming.dstream.PairDStreamFunctions 13 | import org.apache.log4j.LogManager 14 | import org.json4s._ 15 | import org.json4s.native.JsonMethods._ 16 | import java.text.SimpleDateFormat 17 | import java.util.Date 18 | 19 | object RedditMappingApp { 20 | def main(args: Array[String]) { 21 | if (args.length != 2) { 22 | System.err.println( 23 | "Usage: RedditMappingApp ") 24 | System.exit(1) 25 | } 26 | val Seq(appName, inputPath) = args.toSeq 27 | val LOG = LogManager.getLogger(this.getClass) 28 | 29 | val conf = new SparkConf() 30 | .setAppName(appName) 31 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 32 | 33 | val ssc = new StreamingContext(conf, Seconds(1)) 34 | LOG.info("Started at %d".format(ssc.sparkContext.startTime)) 35 | 36 | val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) 37 | 38 | val sdf = new SimpleDateFormat("yyyy-MM-dd") 39 | val tsKey = "created_utc" 40 | val secs = 1000L 41 | val keyedByDay = comments.map(rec => { 42 | val ts = (parse(rec) \ tsKey).values 43 | (sdf.format(new Date(ts.toString.toLong * secs)), rec) 44 | }) 45 | 46 | val keyedByDayPart = comments.mapPartitions(iter => { 47 | var ret = List[(String, String)]() 48 | while (iter.hasNext) { 49 | val rec = iter.next 50 | val ts = (parse(rec) \ tsKey).values 51 | ret.::=(sdf.format(new Date(ts.toString.toLong * secs)), rec) 52 | } 53 | ret.iterator 54 | }) 55 | 56 | val wordTokens = comments.map(rec => { 57 | ((parse(rec) \ "body")).values.toString.split(" ") 58 | }) 59 | 60 | val wordTokensFlat = comments.flatMap(rec => { 61 | ((parse(rec) \ "body")).values.toString.split(" ") 62 | }) 63 | 64 | val filterSubreddit = comments.filter(rec => 65 | (parse(rec) \ "subreddit").values.toString.equals("AskReddit")) 66 | 67 | val sortedByAuthor = comments.transform(rdd => 68 | (rdd.sortBy(rec => (parse(rec) \ "author").values.toString))) 69 | 70 | ssc.start() 71 | ssc.awaitTermination() 72 | 73 | } 74 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap4/src/main/scala/org/apress/prospark/L4-3ProtonFlux.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import com.esotericsoftware.kryo.{KryoSerializable,Kryo} 4 | import com.esotericsoftware.kryo.io.{Output, Input} 5 | 6 | class ProtonFlux( 7 | var year: Int, 8 | var bin0_57to1_78: Double, 9 | var bin3_40to17_6: Double, 10 | var bin22_0to31_0: Double, 11 | var bin1_894to2_605: Double, 12 | var bin4_200to6_240: Double, 13 | var bin3_256to8_132: Double, 14 | var bin3_276to8_097: Double, 15 | var bin6_343to42_03: Double, 16 | var bin17_88to26_81: Double, 17 | var bin30_29to69_47: Double, 18 | var bin132_8to242_0: Double 19 | ) extends KryoSerializable { 20 | 21 | def this(year: String, bin0_57to1_78: String, bin3_40to17_6: String, 22 | bin22_0to31_0: String, bin1_894to2_605: String, bin4_200to6_240: String, 23 | bin3_256to8_132: String, bin3_276to8_097: String, 
bin6_343to42_03: String, 24 | bin17_88to26_81: String, bin30_29to69_47: String, bin132_8to242_0: String) { 25 | this(year.toInt, bin0_57to1_78.toDouble, bin3_40to17_6.toDouble, 26 | bin22_0to31_0.toDouble, bin1_894to2_605.toDouble, bin4_200to6_240.toDouble, 27 | bin3_256to8_132.toDouble, bin3_276to8_097.toDouble, bin6_343to42_03.toDouble, 28 | bin17_88to26_81.toDouble, bin30_29to69_47.toDouble, bin132_8to242_0.toDouble) 29 | } 30 | 31 | def isSolarStorm = (bin0_57to1_78 > 1.0 || bin3_40to17_6 > 1.0 32 | || bin22_0to31_0 > 1.0 || bin1_894to2_605 > 1.0 || bin4_200to6_240 > 1.0 33 | || bin3_256to8_132 > 1.0 || bin3_276to8_097 > 1.0 || bin6_343to42_03 > 1.0 34 | || bin17_88to26_81 > 1.0 || bin30_29to69_47 > 1.0 || bin132_8to242_0 > 1.0) 35 | 36 | override def write(kryo: Kryo, output: Output) { 37 | output.writeInt(year) 38 | output.writeDouble(bin0_57to1_78) 39 | output.writeDouble(bin3_40to17_6) 40 | output.writeDouble(bin22_0to31_0) 41 | output.writeDouble(bin1_894to2_605) 42 | output.writeDouble(bin4_200to6_240) 43 | output.writeDouble(bin3_256to8_132) 44 | output.writeDouble(bin3_276to8_097) 45 | output.writeDouble(bin6_343to42_03) 46 | output.writeDouble(bin17_88to26_81) 47 | output.writeDouble(bin30_29to69_47) 48 | output.writeDouble(bin132_8to242_0) 49 | } 50 | 51 | override def read(kryo: Kryo, input: Input) { 52 | year = input.readInt() 53 | bin0_57to1_78 = input.readDouble() 54 | bin3_40to17_6 = input.readDouble() 55 | bin22_0to31_0 = input.readDouble() 56 | bin1_894to2_605 = input.readDouble() 57 | bin4_200to6_240 = input.readDouble() 58 | bin3_256to8_132 = input.readDouble() 59 | bin3_276to8_097 = input.readDouble() 60 | bin6_343to42_03 = input.readDouble() 61 | bin17_88to26_81 = input.readDouble() 62 | bin30_29to69_47 = input.readDouble() 63 | bin132_8to242_0 = input.readDouble() 64 | } 65 | 66 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap5/src/main/scala/org/apress/prospark/HttpInputDStream.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import java.util.Timer 4 | import java.util.TimerTask 5 | 6 | import scala.reflect.ClassTag 7 | 8 | import org.apache.http.client.methods.HttpGet 9 | import org.apache.http.impl.client.CloseableHttpClient 10 | import org.apache.http.impl.client.HttpClients 11 | import org.apache.http.util.EntityUtils 12 | import org.apache.spark.Logging 13 | import org.apache.spark.storage.StorageLevel 14 | import org.apache.spark.streaming.StreamingContext 15 | import org.apache.spark.streaming.api.java.JavaDStream 16 | import org.apache.spark.streaming.api.java.JavaDStream.fromDStream 17 | import org.apache.spark.streaming.api.java.JavaStreamingContext 18 | import org.apache.spark.streaming.dstream.DStream 19 | import org.apache.spark.streaming.dstream.ReceiverInputDStream 20 | import org.apache.spark.streaming.receiver.Receiver 21 | 22 | class HttpInputDStream( 23 | @transient ssc_ : StreamingContext, 24 | storageLevel: StorageLevel, 25 | url: String, 26 | interval: Long) extends ReceiverInputDStream[String](ssc_) with Logging { 27 | 28 | def getReceiver(): Receiver[String] = { 29 | new HttpReceiver(storageLevel, url, interval) 30 | } 31 | } 32 | 33 | class HttpReceiver( 34 | storageLevel: StorageLevel, 35 | url: String, 36 | interval: Long) extends Receiver[String](storageLevel) with Logging { 37 | 38 | var httpClient: CloseableHttpClient = _ 39 | var trigger: Timer = _ 40 | 41 | def onStop() { 42 | 
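// onStop releases the HTTP client when the receiver is stopped; presumably the Timer created in
// onStart below could also be cancelled here (trigger.cancel()), although the original code leaves it running.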
httpClient.close() 43 | logInfo("Disconnected from Http Server") 44 | } 45 | 46 | def onStart() { 47 | httpClient = HttpClients.createDefault() 48 | trigger = new Timer() 49 | trigger.scheduleAtFixedRate(new TimerTask { 50 | def run() = doGet() 51 | }, 0, interval * 1000) 52 | 53 | logInfo("Http Receiver initiated") 54 | } 55 | 56 | def doGet() { 57 | logInfo("Fetching data from Http source") 58 | val response = httpClient.execute(new HttpGet(url)) 59 | try { 60 | val content = EntityUtils.toString(response.getEntity()) 61 | store(content) 62 | } catch { 63 | case e: Exception => restart("Error! Problems while connecting", e) 64 | } finally { 65 | response.close() 66 | } 67 | 68 | } 69 | 70 | } 71 | 72 | object HttpUtils { 73 | def createStream( 74 | ssc: StreamingContext, 75 | storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2, 76 | url: String, 77 | interval: Long): DStream[String] = { 78 | new HttpInputDStream(ssc, storageLevel, url, interval) 79 | } 80 | 81 | def createStream( 82 | jssc: JavaStreamingContext, 83 | storageLevel: StorageLevel, 84 | url: String, 85 | interval: Long): JavaDStream[String] = { 86 | implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[String]] 87 | createStream(jssc.ssc, storageLevel, url, interval) 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap6/src/main/scala/org/apress/prospark/HttpInputDStream.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import java.util.Timer 4 | import java.util.TimerTask 5 | 6 | import scala.reflect.ClassTag 7 | 8 | import org.apache.http.client.methods.HttpGet 9 | import org.apache.http.impl.client.CloseableHttpClient 10 | import org.apache.http.impl.client.HttpClients 11 | import org.apache.http.util.EntityUtils 12 | import org.apache.spark.Logging 13 | import org.apache.spark.storage.StorageLevel 14 | import org.apache.spark.streaming.StreamingContext 15 | import org.apache.spark.streaming.api.java.JavaDStream 16 | import org.apache.spark.streaming.api.java.JavaDStream.fromDStream 17 | import org.apache.spark.streaming.api.java.JavaStreamingContext 18 | import org.apache.spark.streaming.dstream.DStream 19 | import org.apache.spark.streaming.dstream.ReceiverInputDStream 20 | import org.apache.spark.streaming.receiver.Receiver 21 | 22 | class HttpInputDStream( 23 | @transient ssc_ : StreamingContext, 24 | storageLevel: StorageLevel, 25 | url: String, 26 | interval: Long) extends ReceiverInputDStream[String](ssc_) with Logging { 27 | 28 | def getReceiver(): Receiver[String] = { 29 | new HttpReceiver(storageLevel, url, interval) 30 | } 31 | } 32 | 33 | class HttpReceiver( 34 | storageLevel: StorageLevel, 35 | url: String, 36 | interval: Long) extends Receiver[String](storageLevel) with Logging { 37 | 38 | var httpClient: CloseableHttpClient = _ 39 | var trigger: Timer = _ 40 | 41 | def onStop() { 42 | httpClient.close() 43 | logInfo("Disconnected from Http Server") 44 | } 45 | 46 | def onStart() { 47 | httpClient = HttpClients.createDefault() 48 | trigger = new Timer() 49 | trigger.scheduleAtFixedRate(new TimerTask { 50 | def run() = doGet() 51 | }, 0, interval * 1000) 52 | 53 | logInfo("Http Receiver initiated") 54 | } 55 | 56 | def doGet() { 57 | logInfo("Fetching data from Http source") 58 | val response = httpClient.execute(new HttpGet(url)) 59 | try { 60 | val content = EntityUtils.toString(response.getEntity()) 61 | store(content) 62 | } catch { 63 | case 
e: Exception => restart("Error! Problems while connecting", e) 64 | } finally { 65 | response.close() 66 | } 67 | 68 | } 69 | 70 | } 71 | 72 | object HttpUtils { 73 | def createStream( 74 | ssc: StreamingContext, 75 | storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2, 76 | url: String, 77 | interval: Long): DStream[String] = { 78 | new HttpInputDStream(ssc, storageLevel, url, interval) 79 | } 80 | 81 | def createStream( 82 | jssc: JavaStreamingContext, 83 | storageLevel: StorageLevel, 84 | url: String, 85 | interval: Long): JavaDStream[String] = { 86 | implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[String]] 87 | createStream(jssc.ssc, storageLevel, url, interval) 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap8/src/main/scala/org/apress/prospark/L8-29DataFrameExamplesJoin.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.sql.SQLContext 9 | import org.apache.spark.streaming.Seconds 10 | import org.apache.spark.streaming.StreamingContext 11 | import org.json4s.DefaultFormats 12 | import org.json4s.JDouble 13 | import org.json4s.JObject 14 | import org.json4s.jvalue2extractable 15 | import org.json4s.jvalue2monadic 16 | import org.json4s.native.JsonMethods.compact 17 | import org.json4s.native.JsonMethods.parse 18 | import org.json4s.native.JsonMethods.render 19 | import org.json4s.string2JsonInput 20 | 21 | object CdrDataframeExamples3App { 22 | 23 | case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, 24 | smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, 25 | callOutActivity: Float, internetTrafficActivity: Float) 26 | 27 | def main(args: Array[String]) { 28 | if (args.length != 5) { 29 | System.err.println( 30 | "Usage: CdrDataframeExamples3App ") 31 | System.exit(1) 32 | } 33 | val Seq(appName, batchInterval, hostname, port, gridJsonPath) = args.toSeq 34 | 35 | val conf = new SparkConf() 36 | .setAppName(appName) 37 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 38 | 39 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 40 | 41 | val sqlC = new SQLContext(ssc.sparkContext) 42 | import sqlC.implicits._ 43 | implicit val formats = DefaultFormats 44 | 45 | val gridFile = scala.io.Source.fromFile(gridJsonPath).mkString 46 | val gridGeo = (parse(gridFile) \ "features") 47 | val gridStr = gridGeo.children.map(r => { 48 | val c = (r \ "geometry" \ "coordinates").extract[List[List[List[Float]]]].flatten.flatten.map(r => JDouble(r)) 49 | val l = List(("id", r \ "id"), ("x1", c(0)), ("y1", c(1)), ("x2", c(2)), ("y2", c(3)), 50 | ("x3", c(4)), ("y3", c(5)), ("x4", c(6)), ("y4", c(7))) 51 | compact(render(JObject(l))) 52 | }) 53 | 54 | val gridDF = sqlC.read.json(ssc.sparkContext.makeRDD(gridStr)) 55 | 56 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 57 | .map(_.split("\\t", -1)) 58 | .foreachRDD(rdd => { 59 | val cdrs = seqToCdr(rdd).toDF() 60 | cdrs.join(gridDF, $"squareId" === $"id").show() 61 | }) 62 | 63 | ssc.start() 64 | ssc.awaitTermination() 65 | } 66 | 67 | def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { 68 | rdd.map(c => c.map(f => f match { 69 | case x if x.isEmpty() => "0" 70 | case x => x 71 | })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, 
c(3).toFloat, 72 | c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) 73 | } 74 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap6/src/main/scala/org/apress/prospark/L6-12StaticPool.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import java.nio.charset.StandardCharsets 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | import org.eclipse.paho.client.mqttv3.MqttClient 10 | import org.eclipse.paho.client.mqttv3.MqttMessage 11 | import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence 12 | import org.json4s.DefaultFormats 13 | import org.json4s.JField 14 | import org.json4s.JsonAST.JObject 15 | import org.json4s.jvalue2extractable 16 | import org.json4s.jvalue2monadic 17 | import org.json4s.native.JsonMethods.parse 18 | import org.json4s.string2JsonInput 19 | 20 | object MqttSinkAppF { 21 | 22 | def main(args: Array[String]) { 23 | if (args.length != 3) { 24 | System.err.println( 25 | "Usage: MqttSinkApp ") 26 | System.exit(1) 27 | } 28 | 29 | val Seq(appName, outputBrokerUrl, topic) = args.toSeq 30 | 31 | val conf = new SparkConf() 32 | .setAppName(appName) 33 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 34 | 35 | val batchInterval = 10 36 | 37 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 38 | 39 | val mqttSink = ssc.sparkContext.broadcast(MqttSinkLazy(outputBrokerUrl)) 40 | 41 | HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 42 | interval = batchInterval) 43 | .flatMap(rec => { 44 | val query = parse(rec) \ "query" 45 | ((query \ "results" \ "quote").children).map(rec => JObject(JField("Timestamp", query \ "created")).merge(rec)) 46 | }) 47 | .map(rec => { 48 | implicit val formats = DefaultFormats 49 | rec.children.map(f => f.extract[String]) mkString "," 50 | }) 51 | .foreachRDD { rdd => 52 | rdd.foreachPartition { par => 53 | par.foreach(message => mqttSink.value.client.publish(topic, new MqttMessage(message.getBytes(StandardCharsets.UTF_8)))) 54 | } 55 | } 56 | 57 | ssc.start() 58 | ssc.awaitTermination() 59 | } 60 | 61 | } 62 | 63 | class MqttSinkLazy(brokerUrl: String) extends Serializable { 64 | lazy val client = { 65 | val client = new MqttClient(brokerUrl, MqttClient.generateClientId(), new MemoryPersistence()) 66 | client.connect() 67 | sys.addShutdownHook { 68 | client.disconnect() 69 | client.close() 70 | } 71 | client 72 | } 73 | } 74 | 75 | object MqttSinkLazy { 76 | val brokerUrl = "tcp://localhost:1883" 77 | val client = new MqttSinkLazy(brokerUrl) 78 | 79 | def apply(brokerUrl: String): MqttSinkLazy = { 80 | client 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap6/src/main/scala/org/apress/prospark/L6-20CassandraConnector.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.streaming.Seconds 8 | import 
org.apache.spark.streaming.StreamingContext 9 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 10 | import org.json4s.DefaultFormats 11 | import org.json4s.jvalue2extractable 12 | import org.json4s.jvalue2monadic 13 | import org.json4s.native.JsonMethods.parse 14 | import org.json4s.string2JsonInput 15 | 16 | import com.datastax.spark.connector.SomeColumns 17 | import com.datastax.spark.connector.cql.CassandraConnector 18 | import com.datastax.spark.connector.streaming.toDStreamFunctions 19 | import com.datastax.spark.connector.toNamedColumnRef 20 | 21 | object CassandraConnectorSinkApp { 22 | 23 | def main(args: Array[String]) { 24 | if (args.length != 6) { 25 | System.err.println( 26 | "Usage: CassandraConnectorSinkApp ") 27 | System.exit(1) 28 | } 29 | 30 | val Seq(appName, cassandraHost, cassandraPort, keyspace, tableName, columnName) = args.toSeq 31 | 32 | val conf = new SparkConf() 33 | .setAppName(appName) 34 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 35 | .set("spark.cassandra.connection.host", cassandraHost) 36 | .set("spark.cassandra.connection.port", cassandraPort) 37 | 38 | val batchInterval = 10 39 | val windowSize = 20 40 | val slideInterval = 10 41 | 42 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 43 | 44 | CassandraConnector(conf).withSessionDo { session => 45 | session.execute(s"CREATE KEYSPACE IF NOT EXISTS %s WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1 }".format(keyspace)) 46 | session.execute(s"CREATE TABLE IF NOT EXISTS %s.%s (key TEXT PRIMARY KEY, %s FLOAT)".format(keyspace, tableName, columnName)) 47 | } 48 | 49 | HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 50 | interval = batchInterval) 51 | .flatMap(rec => { 52 | implicit val formats = DefaultFormats 53 | val query = parse(rec) \ "query" 54 | ((query \ "results" \ "quote").children) 55 | .map(rec => ((rec \ "symbol").extract[String], (rec \ "LastTradePriceOnly").extract[String].toFloat)) 56 | }) 57 | .reduceByKeyAndWindow((x: Float, y: Float) => (x + y), Seconds(windowSize), Seconds(slideInterval)) 58 | .map(stock => (stock._1, stock._2 / (windowSize / batchInterval))) 59 | .saveToCassandra(keyspace, tableName) 60 | 61 | ssc.start() 62 | ssc.awaitTermination() 63 | } 64 | } 65 | 66 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap7/src/main/java/org/apress/prospark/AbstractDriver.java: -------------------------------------------------------------------------------- 1 | package org.apress.prospark; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileInputStream; 6 | import java.io.IOException; 7 | import java.io.InputStreamReader; 8 | import java.util.Enumeration; 9 | import java.util.zip.GZIPInputStream; 10 | import java.util.zip.ZipEntry; 11 | import java.util.zip.ZipFile; 12 | 13 | import org.apache.commons.io.FilenameUtils; 14 | import org.apache.log4j.LogManager; 15 | import org.apache.log4j.Logger; 16 | 17 | public abstract class AbstractDriver { 18 | 19 | private static final Logger LOG = LogManager.getLogger(AbstractDriver.class); 20 | 21 | private String path; 22 | 23 | public AbstractDriver(String path) { 24 | this.path = path; 25 | } 26 | 27 | public abstract void init() throws 
Exception; 28 | 29 | public abstract void close() throws Exception; 30 | 31 | public abstract void sendRecord(String record) throws Exception; 32 | 33 | public void execute() throws Exception { 34 | 35 | try { 36 | init(); 37 | File dirPath = new File(path); 38 | if (dirPath.isDirectory()) { 39 | File[] files = new File(path).listFiles(); 40 | for (File f : files) { 41 | String ext = FilenameUtils.getExtension(f.getPath()); 42 | if (ext.equals("zip")) { 43 | LOG.info(String.format("Feeding zipped file %s", f.getName())); 44 | ZipFile zFile = null; 45 | try { 46 | zFile = new ZipFile(f); 47 | Enumeration zEntries = zFile.entries(); 48 | 49 | while (zEntries.hasMoreElements()) { 50 | ZipEntry zEntry = zEntries.nextElement(); 51 | LOG.info(String.format("Feeding file %s", zEntry.getName())); 52 | try (BufferedReader br = new BufferedReader( 53 | new InputStreamReader(zFile.getInputStream(zEntry)))) { 54 | // skip header 55 | br.readLine(); 56 | String line; 57 | while ((line = br.readLine()) != null) { 58 | sendRecord(line); 59 | } 60 | } 61 | } 62 | } catch (IOException e) { 63 | LOG.error(e.getMessage()); 64 | } finally { 65 | if (zFile != null) { 66 | try { 67 | zFile.close(); 68 | } catch (IOException e) { 69 | LOG.error(e.getMessage()); 70 | } 71 | } 72 | } 73 | } else if (ext.equals("gz")) { 74 | LOG.info(String.format("Feeding file %s", f.getName())); 75 | try (BufferedReader br = new BufferedReader( 76 | new InputStreamReader(new GZIPInputStream(new FileInputStream(f))))) { 77 | // skip header 78 | br.readLine(); 79 | String line; 80 | while ((line = br.readLine()) != null) { 81 | sendRecord(line); 82 | } 83 | } 84 | } else { 85 | LOG.warn("Unsupported file type: " + f.getName()); 86 | } 87 | } 88 | } else { 89 | LOG.error(String.format("Path %s is not a directory", path)); 90 | } 91 | } finally { 92 | close(); 93 | } 94 | } 95 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap8/src/main/java/org/apress/prospark/AbstractDriver.java: -------------------------------------------------------------------------------- 1 | package org.apress.prospark; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileInputStream; 6 | import java.io.IOException; 7 | import java.io.InputStreamReader; 8 | import java.util.Enumeration; 9 | import java.util.zip.GZIPInputStream; 10 | import java.util.zip.ZipEntry; 11 | import java.util.zip.ZipFile; 12 | 13 | import org.apache.commons.io.FilenameUtils; 14 | import org.apache.log4j.LogManager; 15 | import org.apache.log4j.Logger; 16 | 17 | public abstract class AbstractDriver { 18 | 19 | private static final Logger LOG = LogManager.getLogger(AbstractDriver.class); 20 | 21 | private String path; 22 | 23 | public AbstractDriver(String path) { 24 | this.path = path; 25 | } 26 | 27 | public abstract void init() throws Exception; 28 | 29 | public abstract void close() throws Exception; 30 | 31 | public abstract void sendRecord(String record) throws Exception; 32 | 33 | public void execute() throws Exception { 34 | 35 | try { 36 | init(); 37 | File dirPath = new File(path); 38 | if (dirPath.isDirectory()) { 39 | File[] files = new File(path).listFiles(); 40 | for (File f : files) { 41 | String ext = FilenameUtils.getExtension(f.getPath()); 42 | if (ext.equals("zip")) { 43 | LOG.info(String.format("Feeding zipped file %s", f.getName())); 44 | ZipFile zFile = null; 45 | try { 46 | zFile = new ZipFile(f); 47 | Enumeration zEntries = zFile.entries(); 48 | 49 | while 
(zEntries.hasMoreElements()) { 50 | ZipEntry zEntry = zEntries.nextElement(); 51 | LOG.info(String.format("Feeding file %s", zEntry.getName())); 52 | try (BufferedReader br = new BufferedReader( 53 | new InputStreamReader(zFile.getInputStream(zEntry)))) { 54 | // skip header 55 | br.readLine(); 56 | String line; 57 | while ((line = br.readLine()) != null) { 58 | sendRecord(line); 59 | } 60 | } 61 | } 62 | } catch (IOException e) { 63 | LOG.error(e.getMessage()); 64 | } finally { 65 | if (zFile != null) { 66 | try { 67 | zFile.close(); 68 | } catch (IOException e) { 69 | LOG.error(e.getMessage()); 70 | } 71 | } 72 | } 73 | } else if (ext.equals("gz")) { 74 | LOG.info(String.format("Feeding file %s", f.getName())); 75 | try (BufferedReader br = new BufferedReader( 76 | new InputStreamReader(new GZIPInputStream(new FileInputStream(f))))) { 77 | // skip header 78 | br.readLine(); 79 | String line; 80 | while ((line = br.readLine()) != null) { 81 | sendRecord(line); 82 | } 83 | } 84 | } else { 85 | LOG.warn("Unsupported file type: " + f.getName()); 86 | } 87 | } 88 | } else { 89 | LOG.error(String.format("Path %s is not a directory", path)); 90 | } 91 | } finally { 92 | close(); 93 | } 94 | } 95 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap6/src/main/scala/org/apress/prospark/L6-14HBase.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.hadoop.conf.Configuration 4 | import org.apache.hadoop.hbase.HBaseConfiguration 5 | import org.apache.hadoop.hbase.client.Put 6 | import org.apache.hadoop.hbase.mapreduce.TableOutputFormat 7 | import org.apache.hadoop.hbase.util.Bytes 8 | import org.apache.hadoop.io.Text 9 | import org.apache.spark.SparkConf 10 | import org.apache.spark.SparkContext 11 | import org.apache.spark.rdd.RDD.rddToPairRDDFunctions 12 | import org.apache.spark.streaming.Seconds 13 | import org.apache.spark.streaming.StreamingContext 14 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 15 | import org.json4s.DefaultFormats 16 | import org.json4s.jvalue2extractable 17 | import org.json4s.jvalue2monadic 18 | import org.json4s.native.JsonMethods.parse 19 | import org.json4s.string2JsonInput 20 | 21 | object HBaseSinkApp { 22 | 23 | def main(args: Array[String]) { 24 | if (args.length != 5) { 25 | System.err.println( 26 | "Usage: HBaseSinkApp ") 27 | System.exit(1) 28 | } 29 | 30 | val Seq(appName, hbaseMaster, tableName, columnFamilyName, columnName) = args.toSeq 31 | 32 | val conf = new SparkConf() 33 | .setAppName(appName) 34 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 35 | 36 | val batchInterval = 10 37 | val windowSize = 20 38 | val slideInterval = 10 39 | 40 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 41 | 42 | HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 43 | interval = batchInterval) 44 | .flatMap(rec => { 45 | implicit val formats = DefaultFormats 46 | val query = parse(rec) \ "query" 47 | ((query \ "results" \ "quote").children) 48 | .map(rec => ((rec \ "symbol").extract[String], (rec \ "LastTradePriceOnly").extract[String].toFloat)) 49 | }) 50 | .reduceByKeyAndWindow((x: Float, y: Float) => (x + y), Seconds(windowSize), 
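/* 20-second window sliding every 10 seconds: per-symbol LastTradePriceOnly values are summed over two 10-second batches, and the sum is divided by windowSize / batchInterval (= 2) in the foreachRDD below before the average is written to HBase. */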
Seconds(slideInterval)) 51 | .foreachRDD(rdd => { 52 | val hbaseConf = HBaseConfiguration.create() 53 | hbaseConf.set(TableOutputFormat.OUTPUT_TABLE, tableName) 54 | hbaseConf.set("hbase.master", hbaseMaster) 55 | val jobConf = new Configuration(hbaseConf) 56 | jobConf.set("mapreduce.job.outputformat.class", classOf[TableOutputFormat[Text]].getName) 57 | rdd.map(rec => { 58 | val put = new Put(rec._1.getBytes) 59 | put.addColumn(columnFamilyName.getBytes, columnName.getBytes, Bytes.toBytes(rec._2 / (windowSize / batchInterval))) 60 | (rec._1, put) 61 | }).saveAsNewAPIHadoopDataset(jobConf) 62 | }) 63 | 64 | ssc.start() 65 | ssc.awaitTermination() 66 | } 67 | } 68 | 69 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap8/src/main/scala/org/apress/prospark/L8-10-11UDF.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.io.Source 4 | import scala.reflect.runtime.universe 5 | 6 | import org.apache.spark.SparkConf 7 | import org.apache.spark.SparkContext 8 | import org.apache.spark.rdd.RDD 9 | import org.apache.spark.sql.SQLContext 10 | import org.apache.spark.streaming.Seconds 11 | import org.apache.spark.streaming.StreamingContext 12 | import org.json4s.jackson.JsonMethods.parse 13 | import org.json4s.jvalue2extractable 14 | import org.json4s.string2JsonInput 15 | 16 | object CdrUDFApp { 17 | 18 | case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, 19 | smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, 20 | callOutActivity: Float, internetTrafficActivity: Float) 21 | 22 | def main(args: Array[String]) { 23 | if (args.length != 4) { 24 | System.err.println( 25 | "Usage: CdrUDFApp ") 26 | System.exit(1) 27 | } 28 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 29 | 30 | val conf = new SparkConf() 31 | .setAppName(appName) 32 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 33 | 34 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 35 | 36 | val sqlC = new SQLContext(ssc.sparkContext) 37 | import sqlC.implicits._ 38 | 39 | def getCountryCodeMapping() = { 40 | implicit val formats = org.json4s.DefaultFormats 41 | parse(Source.fromURL("http://country.io/phone.json").mkString).extract[Map[String, String]].map(_.swap) 42 | } 43 | 44 | def getCountryNameMapping() = { 45 | implicit val formats = org.json4s.DefaultFormats 46 | parse(Source.fromURL("http://country.io/names.json").mkString).extract[Map[String, String]] 47 | } 48 | 49 | def getCountryName(mappingPhone: Map[String, String], mappingName: Map[String, String], code: Int) = { 50 | mappingName.getOrElse(mappingPhone.getOrElse(code.toString, "NotFound"), "NotFound") 51 | } 52 | 53 | val getCountryNamePartial = getCountryName(getCountryCodeMapping(), getCountryNameMapping(), _: Int) 54 | 55 | sqlC.udf.register("getCountryNamePartial", getCountryNamePartial) 56 | 57 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 58 | .map(_.split("\\t", -1)) 59 | .foreachRDD(rdd => { 60 | val cdrs = seqToCdr(rdd).toDF() 61 | cdrs.registerTempTable("cdrs") 62 | 63 | sqlC.sql("SELECT getCountryNamePartial(countryCode) AS countryName, COUNT(countryCode) AS cCount FROM cdrs GROUP BY countryCode ORDER BY cCount DESC LIMIT 5").show() 64 | 65 | }) 66 | 67 | ssc.start() 68 | ssc.awaitTermination() 69 | } 70 | 71 | def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { 72 | rdd.map(c => c.map(f => f match { 73 | case x if x.isEmpty() => "0" 
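/* Empty fields in the tab-separated CDR records become "0" here, so the toInt/toLong/toFloat conversions below never see an empty string. */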
74 | case x => x 75 | })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, 76 | c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) 77 | } 78 | 79 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap9/src/main/scala/org/apress/prospark/L9-15MLPipeline.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.ml.Pipeline 7 | import org.apache.spark.ml.feature.Normalizer 8 | import org.apache.spark.ml.feature.VectorAssembler 9 | import org.apache.spark.ml.regression.RandomForestRegressor 10 | import org.apache.spark.sql.SQLContext 11 | import org.apache.spark.streaming.Seconds 12 | import org.apache.spark.streaming.StreamingContext 13 | import org.apache.spark.ml.param.ParamMap 14 | 15 | object MLPipelineApp { 16 | 17 | case class Activity(label: Double, 18 | accelXHand: Double, accelYHand: Double, accelZHand: Double, 19 | accelXChest: Double, accelYChest: Double, accelZChest: Double, 20 | accelXAnkle: Double, accelYAnkle: Double, accelZAnkle: Double) 21 | 22 | def main(args: Array[String]) { 23 | if (args.length != 4) { 24 | System.err.println( 25 | "Usage: MLPipelineApp ") 26 | System.exit(1) 27 | } 28 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 29 | 30 | val conf = new SparkConf() 31 | .setAppName(appName) 32 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 33 | 34 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 35 | 36 | val sqlC = new SQLContext(ssc.sparkContext) 37 | import sqlC.implicits._ 38 | 39 | val substream = ssc.socketTextStream(hostname, port.toInt) 40 | .filter(!_.contains("NaN")) 41 | .map(_.split(" ")) 42 | .filter(f => f(1) == "4" || f(1) == "5") 43 | .map(f => Array(f(1), f(4), f(5), f(6), f(20), f(21), f(22), f(36), f(37), f(38))) 44 | .map(f => f.map(v => v.toDouble)) 45 | .foreachRDD(rdd => { 46 | if (!rdd.isEmpty) { 47 | val accelerometer = rdd.map(x => Activity(x(0), x(1), x(2), x(3), x(4), x(5), x(6), x(7), x(8), x(9))).toDF() 48 | val split = accelerometer.randomSplit(Array(0.3, 0.7)) 49 | val test = split(0) 50 | val train = split(1) 51 | 52 | val assembler = new VectorAssembler() 53 | .setInputCols(Array( 54 | "accelXHand", "accelYHand", "accelZHand", 55 | "accelXChest", "accelYChest", "accelZChest", 56 | "accelXAnkle", "accelYAnkle", "accelZAnkle")) 57 | .setOutputCol("vectors") 58 | val normalizer = new Normalizer() 59 | .setInputCol(assembler.getOutputCol) 60 | .setOutputCol("features") 61 | val regressor = new RandomForestRegressor() 62 | 63 | val pipeline = new Pipeline() 64 | .setStages(Array(assembler, normalizer, regressor)) 65 | val pMap = ParamMap(normalizer.p -> 1.0) 66 | val model = pipeline.fit(train, pMap) 67 | val prediction = model.transform(test) 68 | prediction.show() 69 | } 70 | }) 71 | 72 | ssc.start() 73 | ssc.awaitTermination() 74 | } 75 | 76 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap3/src/main/scala/org/apress/prospark/L3-DStreamWindowAndAction.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext } 6 | 
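/* The Hadoop Writable and input/output-format imports below back the fileStream source and the saveAsHadoopFiles / saveAsNewAPIHadoopFiles sinks used further down in RedditWindowAndActionApp. */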
import org.apache.hadoop.io.{ Text, LongWritable, IntWritable } 7 | import org.apache.hadoop.fs.Path 8 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat 9 | import org.apache.spark.streaming.dstream.DStream 10 | import org.apache.hadoop.mapred.TextOutputFormat 11 | import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat } 12 | import org.apache.spark.streaming.dstream.PairDStreamFunctions 13 | import org.apache.log4j.LogManager 14 | import org.json4s._ 15 | import org.json4s.native.JsonMethods._ 16 | import java.text.SimpleDateFormat 17 | import java.util.Date 18 | import org.apache.spark.HashPartitioner 19 | 20 | object RedditWindowAndActionApp { 21 | def main(args: Array[String]) { 22 | if (args.length != 2) { 23 | System.err.println( 24 | "Usage: RedditWindowAndActionApp ") 25 | System.exit(1) 26 | } 27 | val Seq(appName, inputPath) = args.toSeq 28 | val LOG = LogManager.getLogger(this.getClass) 29 | 30 | val conf = new SparkConf() 31 | .setAppName(appName) 32 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 33 | 34 | val ssc = new StreamingContext(conf, Seconds(1)) 35 | LOG.info("Started at %d".format(ssc.sparkContext.startTime)) 36 | 37 | val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) 38 | 39 | val checkpointPath = "/tmp" 40 | ssc.checkpoint(checkpointPath) 41 | val updateFunc = (values: Seq[Int], state: Option[Int]) => { 42 | val currentCount = values.sum 43 | val previousCount = state.getOrElse(0) 44 | Some(currentCount + previousCount) 45 | } 46 | val keyedBySubredditState = comments.map(rec => (((parse(rec)) \ "subreddit").values.toString, 1)) 47 | val globalCount = keyedBySubredditState.updateStateByKey(updateFunc) 48 | .map(r => (r._2, r._1)) 49 | .transform(rdd => rdd.sortByKey(ascending = false)) 50 | 51 | val distinctSubreddits = comments.map(rec => ((parse(rec)) \ "subreddit").values.toString) 52 | val windowedRecs = distinctSubreddits.window(Seconds(5), Seconds(5)) 53 | val windowedCounts = windowedRecs.countByValue() 54 | 55 | windowedCounts.print(10) 56 | windowedCounts.saveAsObjectFiles("subreddit", "obj") 57 | windowedCounts.saveAsTextFiles("subreddit", "txt") 58 | 59 | globalCount.saveAsHadoopFiles("subreddit", "hadoop", 60 | classOf[IntWritable], classOf[Text], classOf[TextOutputFormat[IntWritable, Text]]) 61 | globalCount.saveAsNewAPIHadoopFiles("subreddit", "newhadoop", 62 | classOf[IntWritable], classOf[Text], classOf[NewTextOutputFormat[IntWritable, Text]]) 63 | comments.foreachRDD(rdd => { 64 | LOG.info("RDD: %s, Count: %d".format(rdd.id, rdd.count())) 65 | }) 66 | 67 | ssc.start() 68 | ssc.awaitTermination() 69 | 70 | } 71 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap9/src/main/scala/org/apress/prospark/T9-4DataTypes.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.linalg.Matrices 6 | import org.apache.spark.mllib.linalg.Vectors 7 | import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix 8 | import org.apache.spark.mllib.linalg.distributed.IndexedRow 9 | import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix 10 | import org.apache.spark.mllib.linalg.distributed.MatrixEntry 11 | import org.apache.spark.mllib.linalg.distributed.RowMatrix 
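/* RowMatrix, IndexedRowMatrix and CoordinateMatrix imported above are MLlib's distributed matrix types, built from RDDs inside the foreachRDD blocks below; LabeledPoint, imported next, pairs a label with a local feature vector. */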
12 | import org.apache.spark.mllib.regression.LabeledPoint 13 | import org.apache.spark.streaming.Seconds 14 | import org.apache.spark.streaming.StreamingContext 15 | 16 | object DataTypesApp { 17 | 18 | def main(args: Array[String]) { 19 | if (args.length != 4) { 20 | System.err.println( 21 | "Usage: DataTypesApp ") 22 | System.exit(1) 23 | } 24 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 25 | 26 | val conf = new SparkConf() 27 | .setAppName(appName) 28 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 29 | 30 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 31 | 32 | val substream = ssc.socketTextStream(hostname, port.toInt) 33 | .filter(!_.contains("NaN")) 34 | .map(_.split(" ")) 35 | .filter(f => f(1) != "0") 36 | .map(f => f.map(f => f.toDouble)) 37 | 38 | val denseV = substream.map(f => Vectors.dense(f.slice(1, 5))) 39 | denseV.print() 40 | val sparseV = substream.map(f => f.slice(1, 5).toList).map(f => f.zipWithIndex.map { case (s, i) => (i, s) }) 41 | .map(f => f.filter(v => v._2 != 0)).map(l => Vectors.sparse(l.size, l)) 42 | sparseV.print() 43 | val labeledP = substream.map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5)))) 44 | labeledP.print() 45 | val denseM = substream.map(f => Matrices.dense(3, 16, f.slice(3, 19) ++ f.slice(20, 36) ++ f.slice(37, 53))) 46 | denseM.print() 47 | denseV.foreachRDD(rdd => { 48 | val rowM = new RowMatrix(rdd) 49 | println(rowM) 50 | }) 51 | denseV.foreachRDD(rdd => { 52 | val iRdd = rdd.zipWithIndex.map(v => new IndexedRow(v._2, v._1)) 53 | val iRowM = new IndexedRowMatrix(iRdd) 54 | println(iRowM) 55 | }) 56 | substream.foreachRDD(rdd => { 57 | val entries = rdd.zipWithIndex.flatMap(v => List(3, 20, 37).zipWithIndex.map(i => (i._2.toLong, v._2, v._1.slice(i._1, i._1 + 16).toList))) 58 | .map(v => v._3.map(d => new MatrixEntry(v._1, v._2, d))).flatMap(x => x) 59 | val cRowM = new CoordinateMatrix(entries) 60 | println(cRowM) 61 | }) 62 | substream.foreachRDD(rdd => { 63 | val entries = rdd.zipWithIndex.flatMap(v => List(3, 20, 37).zipWithIndex.map(i => (i._2.toLong, v._2, v._1.slice(i._1, i._1 + 16).toList))) 64 | .map(v => v._3.map(d => new MatrixEntry(v._1, v._2, d))).flatMap(x => x) 65 | val blockM = new CoordinateMatrix(entries).toBlockMatrix 66 | println(blockM) 67 | }) 68 | 69 | ssc.start() 70 | ssc.awaitTermination() 71 | } 72 | 73 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap6/src/main/scala/org/apress/prospark/L6-26Redis.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.collection.JavaConversions.asScalaBuffer 4 | import scala.collection.JavaConversions.mutableMapAsJavaMap 5 | import scala.collection.mutable 6 | 7 | import org.apache.spark.SparkConf 8 | import org.apache.spark.SparkContext 9 | import org.apache.spark.streaming.Seconds 10 | import org.apache.spark.streaming.StreamingContext 11 | import org.json4s.DefaultFormats 12 | import org.json4s.jvalue2extractable 13 | import org.json4s.jvalue2monadic 14 | import org.json4s.native.JsonMethods.parse 15 | import org.json4s.string2JsonInput 16 | 17 | import redis.clients.jedis.Jedis 18 | 19 | object StatefulRedisApp { 20 | 21 | def main(args: Array[String]) { 22 | if (args.length != 3) { 23 | System.err.println( 24 | "Usage: StatefulRedisApp ") 25 | System.exit(1) 26 | } 27 | 28 | val Seq(appName, checkpointDir, hostname) = args.toSeq 29 | 30 | val conf = new SparkConf() 31 | 
.setAppName(appName) 32 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 33 | 34 | val batchInterval = 10 35 | 36 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 37 | 38 | HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 39 | interval = batchInterval) 40 | .flatMap(rec => { 41 | implicit val formats = DefaultFormats 42 | val query = parse(rec) \ "query" 43 | ((query \ "results" \ "quote").children) 44 | .map(rec => ((rec \ "symbol").extract[String], ((rec \ "LastTradePriceOnly").extract[String].toFloat, (rec \ "Volume").extract[String].toLong))) 45 | }) 46 | .foreachRDD(rdd => { 47 | rdd.foreachPartition({ part => 48 | val jedis = new Jedis(hostname) 49 | part.foreach(f => { 50 | val prev = jedis.hmget(f._1, "min", "max", "count") 51 | if (prev(0) == null) { 52 | jedis.hmset(f._1, mutable.HashMap("min" -> Long.MaxValue.toString, "max" -> Long.MinValue.toString, "count" -> 0.toString)) 53 | } else { 54 | val prevLong = prev.toList.map(v => v.toLong) 55 | var newCount = prevLong(2) 56 | val newPrice = f._2._1 57 | val newVolume = f._2._2 58 | if (newPrice > 500.0) { 59 | newCount += 1 60 | } 61 | val newMin = if (newVolume < prevLong(0)) newVolume else prevLong(0) 62 | val newMax = if (newVolume > prevLong(1)) newVolume else prevLong(1) 63 | jedis.hmset(f._1, mutable.HashMap("min" -> newMin.toString, "max" -> newMax.toString, "count" -> newCount.toString)) 64 | } 65 | }) 66 | jedis.close() 67 | }) 68 | 69 | val jedis = new Jedis(hostname) 70 | jedis.scan(0).getResult.foreach(sym => println("Symbol: %s, Stats: %s".format(sym, jedis.hmget(sym, "min", "max", "count").toString))) 71 | jedis.close() 72 | }) 73 | 74 | ssc.start() 75 | ssc.awaitTermination() 76 | } 77 | } 78 | 79 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap9/src/main/java/org/apress/prospark/AbstractDriver.java: -------------------------------------------------------------------------------- 1 | package org.apress.prospark; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileInputStream; 6 | import java.io.IOException; 7 | import java.io.InputStreamReader; 8 | import java.util.Enumeration; 9 | import java.util.zip.GZIPInputStream; 10 | import java.util.zip.ZipEntry; 11 | import java.util.zip.ZipFile; 12 | 13 | import org.apache.commons.io.FilenameUtils; 14 | import org.apache.log4j.LogManager; 15 | import org.apache.log4j.Logger; 16 | 17 | public abstract class AbstractDriver { 18 | 19 | private static final Logger LOG = LogManager.getLogger(AbstractDriver.class); 20 | 21 | private String path; 22 | 23 | public AbstractDriver(String path) { 24 | this.path = path; 25 | } 26 | 27 | public abstract void init() throws Exception; 28 | 29 | public abstract void close() throws Exception; 30 | 31 | public abstract void sendRecord(String record) throws Exception; 32 | 33 | public void execute() throws Exception { 34 | 35 | try { 36 | init(); 37 | File dirPath = new File(path); 38 | if (dirPath.isDirectory()) { 39 | File[] files = new File(path).listFiles(); 40 | for (File f : files) { 41 | String ext = FilenameUtils.getExtension(f.getPath()); 42 | if (ext.equals("zip")) { 43 | LOG.info(String.format("Feeding zipped file %s", f.getName())); 44 | ZipFile zFile = null; 45 | try { 46 | 
zFile = new ZipFile(f); 47 | Enumeration zEntries = zFile.entries(); 48 | 49 | while (zEntries.hasMoreElements()) { 50 | ZipEntry zEntry = zEntries.nextElement(); 51 | LOG.info(String.format("Feeding file %s", zEntry.getName())); 52 | try (BufferedReader br = new BufferedReader( 53 | new InputStreamReader(zFile.getInputStream(zEntry)))) { 54 | // skip header 55 | br.readLine(); 56 | String line; 57 | while ((line = br.readLine()) != null) { 58 | sendRecord(line); 59 | } 60 | } 61 | } 62 | } catch (IOException e) { 63 | LOG.error(e.getMessage()); 64 | } finally { 65 | if (zFile != null) { 66 | try { 67 | zFile.close(); 68 | } catch (IOException e) { 69 | LOG.error(e.getMessage()); 70 | } 71 | } 72 | } 73 | } else if (ext.equals("gz")) { 74 | LOG.info(String.format("Feeding file %s", f.getName())); 75 | try (BufferedReader br = new BufferedReader( 76 | new InputStreamReader(new GZIPInputStream(new FileInputStream(f))))) { 77 | // skip header 78 | br.readLine(); 79 | String line; 80 | while ((line = br.readLine()) != null) { 81 | sendRecord(line); 82 | } 83 | } 84 | } else if (ext.equals("dat")) { 85 | LOG.info(String.format("Feeding dat file %s", f.getName())); 86 | try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f)))) { 87 | String line; 88 | while ((line = br.readLine()) != null) { 89 | sendRecord(line); 90 | } 91 | } 92 | } else { 93 | LOG.warn("Unsupported file type: " + f.getName()); 94 | } 95 | } 96 | } else { 97 | LOG.error(String.format("Path %s is not a directory", path)); 98 | } 99 | } finally { 100 | close(); 101 | } 102 | } 103 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap8/src/main/scala/org/apress/prospark/T8-5-L8-30-34DataFrameExamplesActions.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.sql.SaveMode 9 | import org.apache.spark.sql.functions.desc 10 | import org.apache.spark.sql.hive.HiveContext 11 | import org.apache.spark.streaming.Seconds 12 | import org.apache.spark.streaming.StreamingContext 13 | import org.apress.prospark.CdrDataframeExamplesActionsApp.Cdr 14 | import org.json4s.DefaultFormats 15 | 16 | object CdrDataframeExamplesActionsApp { 17 | 18 | case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, 19 | smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, 20 | callOutActivity: Float, internetTrafficActivity: Float) 21 | 22 | def main(args: Array[String]) { 23 | if (args.length != 4) { 24 | System.err.println( 25 | "Usage: CdrDataframeExamplesActionsApp ") 26 | System.exit(1) 27 | } 28 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 29 | 30 | val conf = new SparkConf() 31 | .setAppName(appName) 32 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 33 | 34 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 35 | 36 | val cl = Thread.currentThread().getContextClassLoader() 37 | val hiveC = new HiveContext(ssc.sparkContext) 38 | Thread.currentThread().setContextClassLoader(cl) 39 | import hiveC.implicits._ 40 | implicit val formats = DefaultFormats 41 | 42 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 43 | .map(_.split("\\t", -1)) 44 | .foreachRDD(rdd => { 45 | val cdrs = seqToCdr(rdd).toDF() 46 | 47 | val counts = 
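/* A DataFrame of per-country CDR counts in descending order; the action examples below (show, head, take, first, count, collect, collectAsList) and the write examples all reuse it, while describe() runs on the full cdrs frame. */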
cdrs.groupBy("countryCode").count().orderBy(desc("count")) 48 | counts.show(5) 49 | counts.show() 50 | println("head(5): " + counts.head(5)) 51 | println("take(5): " + counts.take(5)) 52 | println("head(): " + counts.head()) 53 | println("first(5): " + counts.first()) 54 | println("count(): " + counts.count()) 55 | println("collect(): " + counts.collect()) 56 | println("collectAsList(): " + counts.collectAsList()) 57 | println("describe(): " + cdrs.describe("smsInActivity", "smsOutActivity", "callInActivity", "callOutActivity", "internetTrafficActivity").show()) 58 | counts.write.format("parquet").save("/tmp/parquent" + rdd.id) 59 | counts.write.format("json").save("/tmp/json" + rdd.id) 60 | counts.write.parquet("/tmp/parquent2" + rdd.id) 61 | counts.write.json("/tmp/json2" + rdd.id) 62 | counts.write.saveAsTable("count_table") 63 | cdrs.groupBy("countryCode").count().orderBy(desc("count")).write.mode(SaveMode.Append).save("/tmp/counts") 64 | val prop: java.util.Properties = new java.util.Properties() 65 | counts.write.jdbc("jdbc:mysql://hostname:port/cdrsdb", "count_table", prop) 66 | }) 67 | 68 | ssc.start() 69 | ssc.awaitTermination() 70 | } 71 | 72 | def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { 73 | rdd.map(c => c.map(f => f match { 74 | case x if x.isEmpty() => "0" 75 | case x => x 76 | })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, 77 | c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) 78 | } 79 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap10/src/main/java/org/apress/prospark/AbstractDriver.java: -------------------------------------------------------------------------------- 1 | package org.apress.prospark; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileInputStream; 6 | import java.io.IOException; 7 | import java.io.InputStreamReader; 8 | import java.util.Enumeration; 9 | import java.util.zip.GZIPInputStream; 10 | import java.util.zip.ZipEntry; 11 | import java.util.zip.ZipFile; 12 | 13 | import org.apache.commons.io.FilenameUtils; 14 | import org.apache.log4j.LogManager; 15 | import org.apache.log4j.Logger; 16 | 17 | public abstract class AbstractDriver { 18 | 19 | private static final Logger LOG = LogManager.getLogger(AbstractDriver.class); 20 | 21 | private String path; 22 | 23 | public AbstractDriver(String path) { 24 | this.path = path; 25 | } 26 | 27 | public abstract void init() throws Exception; 28 | 29 | public abstract void close() throws Exception; 30 | 31 | public abstract void sendRecord(String record) throws Exception; 32 | 33 | public void execute() throws Exception { 34 | 35 | try { 36 | init(); 37 | File dirPath = new File(path); 38 | if (dirPath.isDirectory()) { 39 | File[] files = new File(path).listFiles(); 40 | for (File f : files) { 41 | String ext = FilenameUtils.getExtension(f.getPath()); 42 | if (ext.equals("zip")) { 43 | LOG.info(String.format("Feeding zipped file %s", f.getName())); 44 | ZipFile zFile = null; 45 | try { 46 | zFile = new ZipFile(f); 47 | Enumeration zEntries = zFile.entries(); 48 | 49 | while (zEntries.hasMoreElements()) { 50 | ZipEntry zEntry = zEntries.nextElement(); 51 | LOG.info(String.format("Feeding file %s", zEntry.getName())); 52 | try (BufferedReader br = new BufferedReader( 53 | new InputStreamReader(zFile.getInputStream(zEntry)))) { 54 | // skip header 55 | br.readLine(); 56 | String line; 57 | while ((line = br.readLine()) != null) { 58 | sendRecord(line); 59 | } 60 | } 61 | } 62 | } 
catch (IOException e) { 63 | LOG.error(e.getMessage()); 64 | } finally { 65 | if (zFile != null) { 66 | try { 67 | zFile.close(); 68 | } catch (IOException e) { 69 | LOG.error(e.getMessage()); 70 | } 71 | } 72 | } 73 | } else if (ext.equals("gz")) { 74 | LOG.info(String.format("Feeding file %s", f.getName())); 75 | try (BufferedReader br = new BufferedReader( 76 | new InputStreamReader(new GZIPInputStream(new FileInputStream(f))))) { 77 | // skip header 78 | br.readLine(); 79 | String line; 80 | while ((line = br.readLine()) != null) { 81 | sendRecord(line); 82 | } 83 | } 84 | } else if (ext.equals("dat") || ext.equals("json")) { 85 | LOG.info(String.format("Feeding dat file %s", f.getName())); 86 | try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f)))) { 87 | String line; 88 | while ((line = br.readLine()) != null) { 89 | sendRecord(line); 90 | } 91 | } 92 | } else { 93 | LOG.warn("Unsupported file type: " + f.getName()); 94 | } 95 | } 96 | } else { 97 | LOG.error(String.format("Path %s is not a directory", path)); 98 | } 99 | } finally { 100 | close(); 101 | } 102 | } 103 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap6/src/main/scala/org/apress/prospark/L6-18Cassandra.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import java.nio.charset.StandardCharsets 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.streaming.Seconds 7 | import org.apache.spark.streaming.StreamingContext 8 | import org.json4s.DefaultFormats 9 | import org.json4s.JField 10 | import org.json4s.JsonAST.JObject 11 | import org.json4s.jvalue2extractable 12 | import org.json4s.jvalue2monadic 13 | import org.json4s.native.JsonMethods.parse 14 | import org.json4s.string2JsonInput 15 | import org.apache.hadoop.conf.Configuration 16 | import org.apache.hadoop.io.Text 17 | import java.nio.ByteBuffer 18 | import org.apache.cassandra.hadoop.ColumnFamilyOutputFormat 19 | import org.apache.cassandra.hadoop.ConfigHelper 20 | import org.apache.cassandra.thrift.ColumnOrSuperColumn 21 | import org.apache.cassandra.thrift.Column 22 | import org.apache.cassandra.utils.ByteBufferUtil 23 | import org.apache.cassandra.thrift.Mutation 24 | import java.util.Arrays 25 | 26 | object CassandraSinkApp { 27 | 28 | def main(args: Array[String]) { 29 | if (args.length != 6) { 30 | System.err.println( 31 | "Usage: CassandraSinkApp ") 32 | System.exit(1) 33 | } 34 | 35 | val Seq(appName, cassandraHost, cassandraPort, keyspace, columnFamilyName, columnName) = args.toSeq 36 | 37 | val conf = new SparkConf() 38 | .setAppName(appName) 39 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 40 | 41 | val batchInterval = 10 42 | val windowSize = 20 43 | val slideInterval = 10 44 | 45 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 46 | 47 | HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 48 | interval = batchInterval) 49 | .flatMap(rec => { 50 | implicit val formats = DefaultFormats 51 | val query = parse(rec) \ "query" 52 | ((query \ "results" \ "quote").children) 53 | .map(rec => ((rec \ "symbol").extract[String], (rec \ 
"LastTradePriceOnly").extract[String].toFloat)) 54 | }) 55 | .reduceByKeyAndWindow((x: Float, y: Float) => (x + y), Seconds(windowSize), Seconds(slideInterval)) 56 | .foreachRDD(rdd => { 57 | val jobConf = new Configuration() 58 | ConfigHelper.setOutputRpcPort(jobConf, cassandraPort) 59 | ConfigHelper.setOutputInitialAddress(jobConf, cassandraHost) 60 | ConfigHelper.setOutputColumnFamily(jobConf, keyspace, columnFamilyName) 61 | ConfigHelper.setOutputPartitioner(jobConf, "Murmur3Partitioner") 62 | rdd.map(rec => { 63 | val c = new Column() 64 | c.setName(ByteBufferUtil.bytes(columnName)) 65 | c.setValue(ByteBufferUtil.bytes(rec._2 / (windowSize / batchInterval))) 66 | c.setTimestamp(System.currentTimeMillis) 67 | val m = new Mutation() 68 | m.setColumn_or_supercolumn(new ColumnOrSuperColumn()) 69 | m.column_or_supercolumn.setColumn(c) 70 | (ByteBufferUtil.bytes(rec._1), Arrays.asList(m)) 71 | }).saveAsNewAPIHadoopFile(keyspace, classOf[ByteBuffer], classOf[List[Mutation]], classOf[ColumnFamilyOutputFormat], jobConf) 72 | }) 73 | 74 | ssc.start() 75 | ssc.awaitTermination() 76 | } 77 | } 78 | 79 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap9/src/main/scala/org/apress/prospark/L9-17MLCrossValidation.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.ml.Pipeline 8 | import org.apache.spark.ml.evaluation.RegressionEvaluator 9 | import org.apache.spark.ml.feature.Normalizer 10 | import org.apache.spark.ml.feature.VectorAssembler 11 | import org.apache.spark.ml.regression.RandomForestRegressor 12 | import org.apache.spark.ml.tuning.CrossValidator 13 | import org.apache.spark.ml.tuning.ParamGridBuilder 14 | import org.apache.spark.sql.SQLContext 15 | import org.apache.spark.streaming.Seconds 16 | import org.apache.spark.streaming.StreamingContext 17 | 18 | object MLCrossValidationApp { 19 | 20 | case class Activity(label: Double, 21 | accelXHand: Double, accelYHand: Double, accelZHand: Double, 22 | accelXChest: Double, accelYChest: Double, accelZChest: Double, 23 | accelXAnkle: Double, accelYAnkle: Double, accelZAnkle: Double) 24 | 25 | def main(args: Array[String]) { 26 | if (args.length != 4) { 27 | System.err.println( 28 | "Usage: MLCrossValidationApp ") 29 | System.exit(1) 30 | } 31 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 32 | 33 | val conf = new SparkConf() 34 | .setAppName(appName) 35 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 36 | 37 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 38 | 39 | val sqlC = new SQLContext(ssc.sparkContext) 40 | import sqlC.implicits._ 41 | 42 | val substream = ssc.socketTextStream(hostname, port.toInt) 43 | .filter(!_.contains("NaN")) 44 | .map(_.split(" ")) 45 | .filter(f => f(1) == "4" || f(1) == "5") 46 | .map(f => Array(f(1), f(4), f(5), f(6), f(20), f(21), f(22), f(36), f(37), f(38))) 47 | .map(f => f.map(v => v.toDouble)) 48 | .foreachRDD(rdd => { 49 | if (!rdd.isEmpty) { 50 | val accelerometer = rdd.map(x => Activity(x(0), x(1), x(2), x(3), x(4), x(5), x(6), x(7), x(8), x(9))).toDF() 51 | val split = accelerometer.randomSplit(Array(0.3, 0.7)) 52 | val test = split(0) 53 | val train = split(1) 54 | 55 | val assembler = new VectorAssembler() 56 | .setInputCols(Array( 57 | "accelXHand", "accelYHand", 
"accelZHand", 58 | "accelXChest", "accelYChest", "accelZChest", 59 | "accelXAnkle", "accelYAnkle", "accelZAnkle")) 60 | .setOutputCol("vectors") 61 | val normalizer = new Normalizer() 62 | .setInputCol(assembler.getOutputCol) 63 | .setOutputCol("features") 64 | val regressor = new RandomForestRegressor() 65 | 66 | val pipeline = new Pipeline() 67 | .setStages(Array(assembler, normalizer, regressor)) 68 | 69 | val validator = new CrossValidator() 70 | .setEstimator(pipeline) 71 | .setEvaluator(new RegressionEvaluator) 72 | val pGrid = new ParamGridBuilder() 73 | .addGrid(normalizer.p, Array(1.0, 5.0, 10.0)) 74 | .addGrid(regressor.numTrees, Array(10, 50, 100)) 75 | .build() 76 | validator.setEstimatorParamMaps(pGrid) 77 | validator.setNumFolds(5) 78 | 79 | val bestModel = validator.fit(train) 80 | val prediction = bestModel.transform(test) 81 | prediction.show() 82 | } 83 | }) 84 | 85 | ssc.start() 86 | ssc.awaitTermination() 87 | } 88 | 89 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap8/src/main/scala/org/apress/prospark/L8-14-27DataFrameExamples.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.sql.SQLContext 9 | import org.apache.spark.sql.functions._ 10 | import org.apache.spark.streaming.Seconds 11 | import org.apache.spark.streaming.StreamingContext 12 | 13 | object CdrDataframeExamplesApp { 14 | 15 | case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, 16 | smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, 17 | callOutActivity: Float, internetTrafficActivity: Float) 18 | 19 | def main(args: Array[String]) { 20 | if (args.length != 4) { 21 | System.err.println( 22 | "Usage: CdrDataframeExamplesApp ") 23 | System.exit(1) 24 | } 25 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 26 | 27 | val conf = new SparkConf() 28 | .setAppName(appName) 29 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 30 | 31 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 32 | 33 | val sqlC = new SQLContext(ssc.sparkContext) 34 | import sqlC.implicits._ 35 | 36 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 37 | .map(_.split("\\t", -1)) 38 | .foreachRDD(rdd => { 39 | val cdrs = seqToCdr(rdd).toDF() 40 | 41 | cdrs.select("squareId", "timeInterval", "countryCode").show() 42 | cdrs.select($"squareId", $"timeInterval", $"countryCode").show() 43 | cdrs.filter("squareId = 5").show() 44 | cdrs.drop("countryCode").show() 45 | cdrs.select($"squareId", $"timeInterval", $"countryCode").where($"squareId" === 5).show() 46 | cdrs.limit(5).show() 47 | cdrs.groupBy("squareId").count().show() 48 | cdrs.groupBy("countryCode").avg("internetTrafficActivity").show() 49 | cdrs.groupBy("countryCode").max("callOutActivity").show() 50 | cdrs.groupBy("countryCode").min("callOutActivity").show() 51 | cdrs.groupBy("squareId").sum("internetTrafficActivity").show() 52 | cdrs.groupBy("squareId").agg(sum("callOutActivity"), sum("callInActivity"), sum("smsOutActivity"), sum("smsInActivity"), sum("internetTrafficActivity")).show() 53 | cdrs.groupBy("countryCode").sum("internetTrafficActivity").orderBy(desc("SUM(internetTrafficActivity)")).show() 54 | cdrs.agg(sum("callOutActivity"), sum("callInActivity"), sum("smsOutActivity"), 
sum("smsInActivity"), sum("internetTrafficActivity")).show() 55 | cdrs.rollup("squareId", "countryCode").count().orderBy(desc("squareId"), desc("countryCode")).rdd.saveAsTextFile("/tmp/rollup" + rdd.hashCode()) 56 | cdrs.cube("squareId", "countryCode").count().orderBy(desc("squareId"), desc("countryCode")).rdd.saveAsTextFile("/tmp/cube" + rdd.hashCode()) 57 | cdrs.dropDuplicates(Array("callOutActivity", "callInActivity")).show() 58 | cdrs.select("squareId", "countryCode", "internetTrafficActivity").distinct.show() 59 | cdrs.withColumn("endTime", cdrs("timeInterval") + 600000).show() 60 | cdrs.sample(true, 0.01).show() 61 | }) 62 | 63 | ssc.start() 64 | ssc.awaitTermination() 65 | } 66 | 67 | def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { 68 | rdd.map(c => c.map(f => f match { 69 | case x if x.isEmpty() => "0" 70 | case x => x 71 | })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, 72 | c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) 73 | } 74 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap6/src/main/scala/org/apress/prospark/L6-24Accumulators.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.collection.mutable 4 | 5 | import org.apache.spark.AccumulableParam 6 | import org.apache.spark.SparkConf 7 | import org.apache.spark.SparkContext 8 | import org.apache.spark.streaming.Seconds 9 | import org.apache.spark.streaming.StreamingContext 10 | import org.json4s.DefaultFormats 11 | import org.json4s.jvalue2extractable 12 | import org.json4s.jvalue2monadic 13 | import org.json4s.native.JsonMethods.parse 14 | import org.json4s.string2JsonInput 15 | 16 | object StatefulAccumulatorsApp { 17 | 18 | object StockAccum extends AccumulableParam[mutable.HashMap[String, (Long, Long, Long)], (String, (Float, Long))] { 19 | def zero(t: mutable.HashMap[String, (Long, Long, Long)]): mutable.HashMap[String, (Long, Long, Long)] = { 20 | new mutable.HashMap[String, (Long, Long, Long)]() 21 | } 22 | def addInPlace(t1: mutable.HashMap[String, (Long, Long, Long)], t2: mutable.HashMap[String, (Long, Long, Long)]): mutable.HashMap[String, (Long, Long, Long)] = { 23 | t1 ++ t2.map { 24 | case (k, v2) => (k -> { 25 | val v1 = t1.getOrElse(k, (Long.MaxValue, Long.MinValue, 0L)) 26 | val newMin = if (v2._1 < v1._1) v2._1 else v1._1 27 | val newMax = if (v2._2 > v1._2) v2._2 else v1._2 28 | (newMin, newMax, v1._3 + v2._3) 29 | }) 30 | } 31 | } 32 | def addAccumulator(t1: mutable.HashMap[String, (Long, Long, Long)], t2: (String, (Float, Long))): mutable.HashMap[String, (Long, Long, Long)] = { 33 | val prevStats = t1.getOrElse(t2._1, (Long.MaxValue, Long.MinValue, 0L)) 34 | val newVals = t2._2 35 | var newCount = prevStats._3 36 | if (newVals._1 > 500.0) { 37 | newCount += 1 38 | } 39 | val newMin = if (newVals._2 < prevStats._1) newVals._2 else prevStats._1 40 | val newMax = if (newVals._2 > prevStats._2) newVals._2 else prevStats._2 41 | t1 += t2._1 -> (newMin, newMax, newCount) 42 | } 43 | } 44 | 45 | def main(args: Array[String]) { 46 | if (args.length != 2) { 47 | System.err.println( 48 | "Usage: StatefulAccumulatorsApp ") 49 | System.exit(1) 50 | } 51 | 52 | val Seq(appName, checkpointDir) = args.toSeq 53 | 54 | val conf = new SparkConf() 55 | .setAppName(appName) 56 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 57 | 58 | val batchInterval = 10 59 | 60 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 61 | 
62 | val stateAccum = ssc.sparkContext.accumulable(new mutable.HashMap[String, (Long, Long, Long)]())(StockAccum) 63 | 64 | HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 65 | interval = batchInterval) 66 | .flatMap(rec => { 67 | implicit val formats = DefaultFormats 68 | val query = parse(rec) \ "query" 69 | ((query \ "results" \ "quote").children) 70 | .map(rec => ((rec \ "symbol").extract[String], ((rec \ "LastTradePriceOnly").extract[String].toFloat, (rec \ "Volume").extract[String].toLong))) 71 | }) 72 | .foreachRDD(rdd => { 73 | rdd.foreach({ stock => 74 | stateAccum += (stock._1, (stock._2._1, stock._2._2)) 75 | }) 76 | for ((sym, stats) <- stateAccum.value.to) printf("Symbol: %s, Stats: %s\n", sym, stats) 77 | }) 78 | 79 | ssc.start() 80 | ssc.awaitTermination() 81 | } 82 | } 83 | 84 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap6/src/main/scala/org/apress/prospark/L6-10LazyStatic.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import java.nio.charset.StandardCharsets 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.streaming.Seconds 7 | import org.apache.spark.streaming.StreamingContext 8 | import org.eclipse.paho.client.mqttv3.MqttClient 9 | import org.eclipse.paho.client.mqttv3.MqttMessage 10 | import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence 11 | import org.json4s.DefaultFormats 12 | import org.json4s.JField 13 | import org.json4s.JsonAST.JObject 14 | import org.json4s.jvalue2extractable 15 | import org.json4s.jvalue2monadic 16 | import org.json4s.native.JsonMethods.parse 17 | import org.json4s.string2JsonInput 18 | import org.apache.commons.pool2.PooledObject 19 | import org.apache.commons.pool2.BasePooledObjectFactory 20 | import org.apache.commons.pool2.impl.DefaultPooledObject 21 | import org.apache.commons.pool2.impl.GenericObjectPool 22 | import org.apache.commons.pool2.ObjectPool 23 | 24 | object MqttSinkAppE { 25 | 26 | def main(args: Array[String]) { 27 | if (args.length != 3) { 28 | System.err.println( 29 | "Usage: MqttSinkApp ") 30 | System.exit(1) 31 | } 32 | 33 | val Seq(appName, outputBrokerUrl, topic) = args.toSeq 34 | 35 | val conf = new SparkConf() 36 | .setAppName(appName) 37 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 38 | 39 | val batchInterval = 10 40 | 41 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 42 | 43 | HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 44 | interval = batchInterval) 45 | .flatMap(rec => { 46 | val query = parse(rec) \ "query" 47 | ((query \ "results" \ "quote").children).map(rec => JObject(JField("Timestamp", query \ "created")).merge(rec)) 48 | }) 49 | .map(rec => { 50 | implicit val formats = DefaultFormats 51 | rec.children.map(f => f.extract[String]) mkString "," 52 | }) 53 | .foreachRDD { rdd => 54 | rdd.foreachPartition { par => 55 | val mqttSink = MqttSinkPool().borrowObject() 56 | par.foreach(message => 
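/* every record in this partition is published through the single pooled MqttClient borrowed above; the client is handed back to the pool right after the loop */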
mqttSink.publish(topic, new MqttMessage(message.getBytes(StandardCharsets.UTF_8)))) 57 | MqttSinkPool().returnObject(mqttSink) 58 | } 59 | } 60 | 61 | ssc.start() 62 | ssc.awaitTermination() 63 | } 64 | } 65 | 66 | object MqttSinkPool { 67 | val poolSize = 8 68 | val brokerUrl = "tcp://localhost:1883" 69 | val mqttPool = new GenericObjectPool[MqttClient](new MqttClientFactory(brokerUrl)) 70 | mqttPool.setMaxTotal(poolSize) 71 | sys.addShutdownHook { 72 | mqttPool.close() 73 | } 74 | 75 | def apply(): GenericObjectPool[MqttClient] = { 76 | mqttPool 77 | } 78 | } 79 | 80 | class MqttClientFactory(brokerUrl: String) extends BasePooledObjectFactory[MqttClient] { 81 | override def create() = { 82 | val client = new MqttClient(brokerUrl, MqttClient.generateClientId(), new MemoryPersistence()) 83 | client.connect() 84 | client 85 | } 86 | override def wrap(client: MqttClient) = new DefaultPooledObject[MqttClient](client) 87 | override def validateObject(pObj: PooledObject[MqttClient]) = pObj.getObject.isConnected() 88 | override def destroyObject(pObj: PooledObject[MqttClient]) = { 89 | pObj.getObject.disconnect() 90 | pObj.getObject.close() 91 | } 92 | override def passivateObject(pObj: PooledObject[MqttClient]) = {} 93 | } 94 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap10/src/main/java/org/apress/prospark/SocketDriver.java: -------------------------------------------------------------------------------- 1 | package org.apress.prospark; 2 | 3 | import java.io.IOException; 4 | import java.net.InetSocketAddress; 5 | import java.nio.ByteBuffer; 6 | import java.nio.channels.ServerSocketChannel; 7 | import java.nio.channels.SocketChannel; 8 | import java.nio.charset.StandardCharsets; 9 | import java.util.concurrent.ExecutionException; 10 | 11 | import org.apache.log4j.LogManager; 12 | import org.apache.log4j.Logger; 13 | 14 | public class SocketDriver extends AbstractDriver { 15 | 16 | private static final Logger LOG = LogManager.getLogger(SocketDriver.class); 17 | 18 | private String hostname; 19 | private int port; 20 | private SocketStream socketStream; 21 | 22 | public SocketDriver(String path, String hostname, int port) { 23 | super(path); 24 | this.hostname = hostname; 25 | this.port = port; 26 | } 27 | 28 | @Override 29 | public void init() throws Exception { 30 | socketStream = new SocketStream(hostname, port); 31 | LOG.info(String.format("Waiting for client to connect on port %d", port)); 32 | SocketChannel socketChan = socketStream.init(); 33 | LOG.info(String.format("Client %s connected on port %d", socketChan.getRemoteAddress(), port)); 34 | socketStream.kickOff(socketChan); 35 | socketStream.start(); 36 | } 37 | 38 | @Override 39 | public void close() throws IOException { 40 | socketStream.done(); 41 | if (socketStream != null) { 42 | socketStream.close(); 43 | } 44 | } 45 | 46 | @Override 47 | public void sendRecord(String record) throws Exception { 48 | socketStream.sendMsg(record + "\n"); 49 | } 50 | 51 | static class SocketStream extends Thread { 52 | 53 | private String hostname; 54 | private int port; 55 | private ServerSocketChannel server; 56 | private volatile boolean isDone = false; 57 | private SocketChannel socket = null; 58 | private long totalBytes; 59 | private long totalLines; 60 | 61 | public SocketStream(String hostname, int port) { 62 | this.hostname = hostname; 63 | this.port = port; 64 | totalBytes = 0; 65 | totalLines = 0; 66 | } 67 | 68 | public SocketChannel init() throws IOException { 69 | 
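/* Opens a server socket bound to hostname:port and blocks in accept() until a single client (typically the Spark socket receiver) connects; the caller then passes the accepted channel to kickOff(). */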
server = ServerSocketChannel.open(); 70 | server.bind(new InetSocketAddress(hostname, port)); 71 | LOG.info(String.format("Listening on %s", server.getLocalAddress())); 72 | return server.accept(); 73 | } 74 | 75 | public void kickOff(SocketChannel socket) { 76 | LOG.info("Kicking off data transfer"); 77 | this.socket = socket; 78 | } 79 | 80 | @Override 81 | public void run() { 82 | try { 83 | while (!isDone) { 84 | Thread.sleep(1000); 85 | } 86 | } catch (Exception e) { 87 | LOG.error(e); 88 | } 89 | } 90 | 91 | public void sendMsg(String msg) throws IOException, InterruptedException, ExecutionException { 92 | if (socket != null) { 93 | ByteBuffer buffer = ByteBuffer.wrap(msg.getBytes(StandardCharsets.UTF_8)); 94 | int bytesWritten = socket.write(buffer); 95 | totalBytes += bytesWritten; 96 | } else { 97 | throw new IOException("Client hasn't connected yet!"); 98 | } 99 | totalLines++; 100 | } 101 | 102 | public void done() { 103 | isDone = true; 104 | } 105 | 106 | public void close() throws IOException { 107 | if (socket != null) { 108 | socket.close(); 109 | socket = null; 110 | } 111 | LOG.info(String.format("SocketStream is closing after writing %d bytes and %d lines", totalBytes, 112 | totalLines)); 113 | } 114 | } 115 | 116 | public static void main(String[] args) throws Exception { 117 | 118 | if (args.length != 3) { 119 | System.err.println("Usage: SocketDriver "); 120 | System.exit(-1); 121 | } 122 | 123 | String path = args[0]; 124 | String hostname = args[1]; 125 | int port = Integer.parseInt(args[2]); 126 | 127 | SocketDriver driver = new SocketDriver(path, hostname, port); 128 | try { 129 | driver.execute(); 130 | } finally { 131 | driver.close(); 132 | } 133 | } 134 | } --------------------------------------------------------------------------------
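Most of the Chapter 6 sink examples above call HttpUtils.createStream(ssc, url, interval); its actual definition lives in HttpInputDStream.scala / HttpInputDStreamAsync.scala, which are part of the repository but not reproduced in this excerpt. Purely for orientation, the following is a minimal, hypothetical stand-in, not the book's implementation: the names HttpUtilsSketch and HttpPollingReceiver, the one-record-per-poll behaviour, and the MEMORY_ONLY storage level are all assumptions of this sketch. It wraps a custom Spark Streaming Receiver that fetches the URL once per interval and stores each response body as a single record.

import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver

import scala.io.Source

// Hypothetical stand-in for the repository's HttpUtils helper (sketch only).
object HttpUtilsSketch {
  def createStream(ssc: StreamingContext, url: String, interval: Int): ReceiverInputDStream[String] =
    ssc.receiverStream(new HttpPollingReceiver(url, interval))
}

// Polls the given URL once per interval and stores each response body as one record.
class HttpPollingReceiver(url: String, intervalSec: Int)
  extends Receiver[String](StorageLevel.MEMORY_ONLY) {

  override def onStart(): Unit = {
    new Thread("http-polling-receiver") {
      override def run(): Unit = {
        while (!isStopped()) {
          try {
            // Fetch the whole response body and hand it to Spark as a single record.
            val body = Source.fromURL(url, "UTF-8").mkString
            store(body)
          } catch {
            case e: Exception => reportError("HTTP poll failed", e)
          }
          Thread.sleep(intervalSec * 1000L)
        }
      }
    }.start()
  }

  // The polling thread exits on its own once isStopped() becomes true.
  override def onStop(): Unit = {}
}

A sink app could then be pointed at this sketch one-for-one, e.g. HttpUtilsSketch.createStream(ssc, url = ..., interval = batchInterval).flatMap(...), with the same url and interval arguments used in the listings above.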