├── LICENSE.txt ├── 9781484214800.jpg ├── prosparkstreaming-master ├── Chap10 │ ├── project │ │ ├── assembly.sbt │ │ └── plugins.sbt │ ├── src │ │ └── main │ │ │ ├── resources │ │ │ └── log4j.properties │ │ │ ├── scala │ │ │ └── org │ │ │ │ └── apress │ │ │ │ └── prospark │ │ │ │ ├── L10-2DataProc.scala │ │ │ │ └── L10-9Graph.scala │ │ │ └── java │ │ │ └── org │ │ │ └── apress │ │ │ └── prospark │ │ │ ├── AbstractDriver.java │ │ │ └── SocketDriver.java │ ├── yelp_pyspark.py │ └── spark.sbt ├── Chap2 │ ├── project │ │ ├── assembly.sbt │ │ └── plugins.sbt │ ├── spark.sbt │ └── src │ │ └── main │ │ └── scala │ │ └── org │ │ └── apress │ │ └── prospark │ │ ├── T2-6Accumulator.scala │ │ └── L2-1FirstApp.scala ├── Chap3 │ ├── project │ │ ├── assembly.sbt │ │ └── plugins.sbt │ ├── touch_files_window.sh │ ├── spark.sbt │ └── src │ │ └── main │ │ └── scala │ │ └── org │ │ └── apress │ │ └── prospark │ │ ├── L3-DStreamVariation.scala │ │ ├── L3-1DStreams.scala │ │ ├── L3-DStreamAggregation.scala │ │ ├── L3-DStreamMapping.scala │ │ └── L3-DStreamWindowAndAction.scala ├── Chap4 │ ├── project │ │ ├── assembly.sbt │ │ └── plugins.sbt │ ├── spark.sbt │ └── src │ │ └── main │ │ └── scala │ │ └── org │ │ └── apress │ │ └── prospark │ │ ├── L4-1Voyager.scala │ │ ├── L4-4Kryo.scala │ │ └── L4-3ProtonFlux.scala ├── Chap5 │ ├── project │ │ ├── assembly.sbt │ │ └── plugins.sbt │ ├── flumeConf │ │ ├── log4j.properties │ │ ├── flumeTest.conf │ │ ├── flumePush.conf │ │ └── flumePull.conf │ ├── src │ │ └── main │ │ │ ├── resources │ │ │ └── log4j.properties │ │ │ ├── scala │ │ │ └── org │ │ │ │ └── apress │ │ │ │ └── prospark │ │ │ │ ├── L5-6SocketStream.scala │ │ │ │ ├── L5-7MultipleSocketStreams.scala │ │ │ │ ├── L5-9Mqtt.scala │ │ │ │ ├── L5-16Twitter.scala │ │ │ │ ├── L5-11FlumePush.scala │ │ │ │ ├── L5-11FlumePull.scala │ │ │ │ ├── L5-13Kafka.scala │ │ │ │ ├── L5-15KafkaDirect.scala │ │ │ │ ├── L5-18Http.scala │ │ │ │ ├── L5-14KafkaCustomConf.scala │ │ │ │ ├── HttpInputDStreamAsync.scala │ │ │ │ └── HttpInputDStream.scala │ │ │ └── java │ │ │ └── org │ │ │ └── apress │ │ │ └── prospark │ │ │ ├── KafkaDriver.java │ │ │ ├── AbstractDriver.java │ │ │ └── MqttDriver.java │ └── spark.sbt ├── Chap6 │ ├── project │ │ ├── assembly.sbt │ │ └── plugins.sbt │ ├── src │ │ └── main │ │ │ ├── resources │ │ │ └── log4j.properties │ │ │ ├── java │ │ │ └── org │ │ │ │ └── apress │ │ │ │ └── prospark │ │ │ │ ├── AbstractDriver.java │ │ │ │ └── MqttDriver.java │ │ │ └── scala │ │ │ └── org │ │ │ └── apress │ │ │ └── prospark │ │ │ ├── L6-5Exception.scala │ │ │ ├── L6-7PerPartition.scala │ │ │ ├── L6-6PerRecord.scala │ │ │ ├── L6-8Static.scala │ │ │ ├── L6-23UpdateState.scala │ │ │ ├── L6-16SparkHBase.scala │ │ │ ├── L6-22Counters.scala │ │ │ ├── HttpInputDStream.scala │ │ │ ├── L6-12StaticPool.scala │ │ │ ├── L6-20CassandraConnector.scala │ │ │ ├── L6-14HBase.scala │ │ │ ├── L6-26Redis.scala │ │ │ ├── L6-18Cassandra.scala │ │ │ ├── L6-24Accumulators.scala │ │ │ └── L6-10LazyStatic.scala │ └── spark.sbt ├── Chap7 │ ├── project │ │ ├── assembly.sbt │ │ └── plugins.sbt │ ├── src │ │ └── main │ │ │ ├── resources │ │ │ └── log4j.properties │ │ │ ├── scala │ │ │ └── org │ │ │ │ └── apress │ │ │ │ └── prospark │ │ │ │ ├── L7-2-3Tachyon.scala │ │ │ │ └── L7-4UI.scala │ │ │ └── java │ │ │ └── org │ │ │ └── apress │ │ │ └── prospark │ │ │ └── AbstractDriver.java │ └── spark.sbt ├── Chap8 │ ├── project │ │ ├── assembly.sbt │ │ └── plugins.sbt │ ├── src │ │ └── main │ │ │ ├── resources │ │ │ └── log4j.properties │ │ │ ├── scala │ │ │ └── 
org │ │ │ │ └── apress │ │ │ │ └── prospark │ │ │ │ ├── L8-4DataFrameCreationSchema.scala │ │ │ │ ├── L8-1DataFrameAPI.scala │ │ │ │ ├── L8-8Sql.scala │ │ │ │ ├── L8-13HiveQL.scala │ │ │ │ ├── L8-28DataFrameExamplesOps.scala │ │ │ │ ├── L8-3-6-7DataFrameCreation.scala │ │ │ │ ├── L8-38SparkR.scala │ │ │ │ ├── L8-35DataFrameExamplesRDD.scala │ │ │ │ ├── T8-3DataFrameExamplesNA.scala │ │ │ │ ├── L8-29DataFrameExamplesJoin.scala │ │ │ │ ├── L8-10-11UDF.scala │ │ │ │ ├── T8-5-L8-30-34DataFrameExamplesActions.scala │ │ │ │ └── L8-14-27DataFrameExamples.scala │ │ │ └── java │ │ │ └── org │ │ │ └── apress │ │ │ └── prospark │ │ │ └── AbstractDriver.java │ ├── spark.sbt │ ├── L8-36CdrSparkRApp.R │ ├── L8-39CdrStreamingSparkRApp.R │ ├── cdrschema.json │ └── cdrschema2.json ├── Chap9 │ ├── project │ │ ├── assembly.sbt │ │ └── plugins.sbt │ ├── src │ │ └── main │ │ │ ├── resources │ │ │ └── log4j.properties │ │ │ ├── scala │ │ │ └── org │ │ │ │ └── apress │ │ │ │ └── prospark │ │ │ │ ├── L9-14FPMining.scala │ │ │ │ ├── L9-6Preprocessing.scala │ │ │ │ ├── L9-5ChiSq.scala │ │ │ │ ├── L9-13FPMiningPreprocessing.scala │ │ │ │ ├── L9-11CollabFilteringPreprocessing.scala │ │ │ │ ├── L9-7FeatureExtraction.scala │ │ │ │ ├── L9-12CollabFiltering.scala │ │ │ │ ├── L9-4Correlation.scala │ │ │ │ ├── L9-3Statistics.scala │ │ │ │ ├── L9-8PCA.scala │ │ │ │ ├── L9-1LinearRegression.scala │ │ │ │ ├── L9-10KMeans.scala │ │ │ │ ├── L9-9LogisticRegression.scala │ │ │ │ ├── L9-15MLPipeline.scala │ │ │ │ ├── T9-4DataTypes.scala │ │ │ │ └── L9-17MLCrossValidation.scala │ │ │ └── java │ │ │ └── org │ │ │ └── apress │ │ │ └── prospark │ │ │ └── AbstractDriver.java │ └── spark.sbt └── README.md ├── README.md └── contributing.md /LICENSE.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/pro-spark-streaming/HEAD/LICENSE.txt -------------------------------------------------------------------------------- /9781484214800.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/pro-spark-streaming/HEAD/9781484214800.jpg -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap10/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 2 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap2/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 2 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap3/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 2 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap4/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 2 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap5/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % 
"0.11.2") 2 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap6/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 2 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap7/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 2 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap8/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 2 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap9/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 2 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap2/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += Classpaths.typesafeResolver 2 | 3 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.4.0") -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap3/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += Classpaths.typesafeResolver 2 | 3 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.4.0") 4 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap4/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += Classpaths.typesafeResolver 2 | 3 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.4.0") 4 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap6/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += Classpaths.typesafeResolver 2 | 3 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.4.0") 4 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap7/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += Classpaths.typesafeResolver 2 | 3 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.4.0") 4 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap8/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += Classpaths.typesafeResolver 2 | 3 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.4.0") 4 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap9/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += Classpaths.typesafeResolver 2 | 3 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.4.0") 4 | 
-------------------------------------------------------------------------------- /prosparkstreaming-master/Chap5/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += Classpaths.typesafeResolver 2 | 3 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.4.0") 4 | 5 | addSbtPlugin("org.scala-sbt.plugins" % "sbt-onejar" % "0.8") 6 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap3/touch_files_window.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | for i in `seq 1 10`; 3 | do 4 | p=/Users/zubairnabi/Downloads/dummy/${i}.gz 5 | echo ${p} 6 | touch -c ${p} 7 | sleep 1 8 | done 9 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap10/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += Classpaths.typesafeResolver 2 | 3 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.4.0") 4 | 5 | addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.8.2") 6 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap10/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO, stdout 2 | log4j.rootCategory=INFO, stdout 3 | 4 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 5 | log4j.appender.stdout.Target=System.out 6 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 7 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 8 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap7/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO, stdout 2 | log4j.rootCategory=INFO, stdout 3 | 4 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 5 | log4j.appender.stdout.Target=System.out 6 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 7 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 8 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap8/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO, stdout 2 | log4j.rootCategory=INFO, stdout 3 | 4 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 5 | log4j.appender.stdout.Target=System.out 6 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 7 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 8 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap9/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO, stdout 2 | log4j.rootCategory=INFO, stdout 3 | 4 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 5 | log4j.appender.stdout.Target=System.out 6 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 7 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 8 | 
-------------------------------------------------------------------------------- /prosparkstreaming-master/Chap5/flumeConf/log4j.properties: -------------------------------------------------------------------------------- 1 | # Root logger option 2 | log4j.rootLogger=INFO, stdout 3 | 4 | # Direct log messages to stdout 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.Target=System.out 7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 9 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap2/spark.sbt: -------------------------------------------------------------------------------- 1 | import AssemblyKeys._ 2 | 3 | assemblySettings 4 | 5 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { mergeStrategy => { 6 | case entry => { 7 | val strategy = mergeStrategy(entry) 8 | if (strategy == MergeStrategy.deduplicate) MergeStrategy.first 9 | else strategy 10 | } 11 | }} 12 | 13 | name := "Chap2" 14 | 15 | version := "1.0" 16 | 17 | scalaVersion := "2.10.5" 18 | 19 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.4.0" 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Apress Source Code 2 | 3 | This repository accompanies [*Pro Spark Streaming*](http://www.apress.com/9781484214800) by Zubair Nabi (Apress, 2016). 4 | 5 | ![Cover image](9781484214800.jpg) 6 | 7 | Download the files as a zip using the green button, or clone the repository to your machine using Git. 8 | 9 | ## Releases 10 | 11 | Release v1.0 corresponds to the code in the published book, without corrections or updates. 12 | 13 | ## Contributions 14 | 15 | See the file Contributing.md for more information on how you can contribute to this repository. 
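For command-line users, the clone step mentioned above can be done as follows; the repository URL matches the raw file links earlier in this repository, but confirm it against the repository page before cloning:

```bash
# Clone the book's source code and inspect the per-chapter folders
git clone https://github.com/Apress/pro-spark-streaming.git
cd pro-spark-streaming
ls prosparkstreaming-master
```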
16 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap7/spark.sbt: -------------------------------------------------------------------------------- 1 | import AssemblyKeys._ 2 | 3 | assemblySettings 4 | 5 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { mergeStrategy => { 6 | case entry => { 7 | val strategy = mergeStrategy(entry) 8 | if (strategy == MergeStrategy.deduplicate) MergeStrategy.first 9 | else strategy 10 | } 11 | }} 12 | 13 | name := "Chap7" 14 | 15 | version := "1.0" 16 | 17 | scalaVersion := "2.10.5" 18 | 19 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.4.0" 20 | 21 | libraryDependencies += "org.apache.spark" %% "spark-streaming" % "1.4.0" 22 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap5/flumeConf/flumeTest.conf: -------------------------------------------------------------------------------- 1 | # Name the components on this agent 2 | a1.sources = r1 3 | a1.sinks = k1 4 | a1.channels = c1 5 | 6 | # Describe/configure the source 7 | a1.sources.r1.type = netcat 8 | a1.sources.r1.bind = localhost 9 | a1.sources.r1.port = 44444 10 | 11 | # Describe the sink 12 | a1.sinks.k1.type = logger 13 | 14 | # Use a channel which buffers events in memory 15 | a1.channels.c1.type = memory 16 | a1.channels.c1.capacity = 1000 17 | a1.channels.c1.transactionCapacity = 100 18 | 19 | # Bind the source and sink to the channel 20 | a1.sources.r1.channels = c1 21 | a1.sinks.k1.channel = c1 22 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap3/spark.sbt: -------------------------------------------------------------------------------- 1 | import AssemblyKeys._ 2 | 3 | assemblySettings 4 | 5 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { mergeStrategy => { 6 | case entry => { 7 | val strategy = mergeStrategy(entry) 8 | if (strategy == MergeStrategy.deduplicate) MergeStrategy.first 9 | else strategy 10 | } 11 | }} 12 | 13 | name := "Chap3" 14 | 15 | version := "1.0" 16 | 17 | scalaVersion := "2.10.5" 18 | 19 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.4.0" 20 | 21 | libraryDependencies += "org.apache.spark" %% "spark-streaming" % "1.4.0" 22 | 23 | libraryDependencies += "org.json4s" %% "json4s-native" % "3.2.10" 24 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap4/spark.sbt: -------------------------------------------------------------------------------- 1 | import AssemblyKeys._ 2 | 3 | assemblySettings 4 | 5 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { mergeStrategy => { 6 | case entry => { 7 | val strategy = mergeStrategy(entry) 8 | if (strategy == MergeStrategy.deduplicate) MergeStrategy.first 9 | else strategy 10 | } 11 | }} 12 | 13 | name := "Chap4" 14 | 15 | version := "1.0" 16 | 17 | scalaVersion := "2.10.5" 18 | 19 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.4.0" 20 | 21 | libraryDependencies += "org.apache.spark" %% "spark-streaming" % "1.4.0" 22 | 23 | libraryDependencies += "org.json4s" %% "json4s-native" % "3.2.10" 24 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap5/flumeConf/flumePush.conf: -------------------------------------------------------------------------------- 1 | # components on this agent 2 | a1.sources = src-1 3 | a1.sinks = snk-1 4 | a1.channels = 
ch-1 5 | 6 | # source 7 | a1.sources.src-1.type = spooldir 8 | a1.sources.src-1.channels = ch-1 9 | a1.sources.src-1.spoolDir = /Users/zubairnabi/Downloads/nyc_bikes 10 | 11 | # sink 12 | a1.sinks.snk-1.type = avro 13 | a1.sinks.snk-1.hostname = localhost 14 | a1.sinks.snk-1.port = 44444 15 | 16 | # channel 17 | a1.channels.ch-1.type = memory 18 | a1.channels.ch-1.capacity = 10000 19 | a1.channels.ch-1.transactionCapacity = 1000 20 | 21 | # bind source, sink, and channel 22 | a1.sources.src-1.channels = ch-1 23 | a1.sinks.snk-1.channel = ch-1 24 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap5/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO, FILE, stdout 2 | log4j.rootCategory=INFO, FILE, stdout 3 | 4 | log4j.logger.org.eclipse.jetty=WARN 5 | 6 | log4j.appender.FILE=org.apache.log4j.FileAppender 7 | 8 | log4j.appender.FILE.File=/tmp/spark.log 9 | 10 | log4j.appender.FILE.layout=org.apache.log4j.PatternLayout 11 | log4j.appender.FILE.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 12 | 13 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 14 | log4j.appender.stdout.Target=System.out 15 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 16 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 17 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap6/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO, FILE, stdout 2 | log4j.rootCategory=INFO, FILE, stdout 3 | 4 | log4j.logger.org.eclipse.jetty=WARN 5 | 6 | log4j.appender.FILE=org.apache.log4j.FileAppender 7 | 8 | log4j.appender.FILE.File=/tmp/spark.log 9 | 10 | log4j.appender.FILE.layout=org.apache.log4j.PatternLayout 11 | log4j.appender.FILE.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 12 | 13 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 14 | log4j.appender.stdout.Target=System.out 15 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 16 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 17 | -------------------------------------------------------------------------------- /contributing.md: -------------------------------------------------------------------------------- 1 | # Contributing to Apress Source Code 2 | 3 | Copyright for Apress source code belongs to the author(s). However, under fair use you are encouraged to fork and contribute minor corrections and updates for the benefit of the author(s) and other readers. 4 | 5 | ## How to Contribute 6 | 7 | 1. Make sure you have a GitHub account. 8 | 2. Fork the repository for the relevant book. 9 | 3. Create a new branch on which to make your change, e.g. 10 | `git checkout -b my_code_contribution` 11 | 4. Commit your change. Include a commit message describing the correction. Please note that if your commit message is not clear, the correction will not be accepted. 12 | 5. Submit a pull request. 13 | 14 | Thank you for your contribution! 
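As a concrete sketch of the workflow above (the fork URL, file path, and commit message are illustrative placeholders, not part of this repository's history):

```bash
# 1-2. Fork the book repository on GitHub, then clone your fork
git clone https://github.com/<your-username>/pro-spark-streaming.git
cd pro-spark-streaming

# 3. Create a branch for your change
git checkout -b my_code_contribution

# 4. Commit the correction with a clear, descriptive message
git add prosparkstreaming-master/Chap5/spark.sbt
git commit -m "Correct a dependency version in the Chap5 build"

# 5. Push the branch to your fork and open a pull request on GitHub
git push origin my_code_contribution
```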
-------------------------------------------------------------------------------- /prosparkstreaming-master/Chap5/flumeConf/flumePull.conf: -------------------------------------------------------------------------------- 1 | # components on this agent 2 | a1.sources = src-1 3 | a1.sinks = snk-1 4 | a1.channels = ch-1 5 | 6 | # source 7 | a1.sources.src-1.type = spooldir 8 | a1.sources.src-1.channels = ch-1 9 | a1.sources.src-1.spoolDir = /Users/zubairnabi/Downloads/nyc_bikes 10 | 11 | # sink 12 | a1.sinks.snk-1.type = org.apache.spark.streaming.flume.sink.SparkSink 13 | a1.sinks.snk-1.hostname = localhost 14 | a1.sinks.snk-1.port = 44444 15 | 16 | # channel 17 | a1.channels.ch-1.type = memory 18 | a1.channels.ch-1.capacity = 10000 19 | a1.channels.ch-1.transactionCapacity = 1000 20 | 21 | # bind source, sink, and channel 22 | a1.sources.src-1.channels = ch-1 23 | a1.sinks.snk-1.channel = ch-1 24 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap9/spark.sbt: -------------------------------------------------------------------------------- 1 | import AssemblyKeys._ 2 | 3 | assemblySettings 4 | 5 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { mergeStrategy => { 6 | case entry => { 7 | val strategy = mergeStrategy(entry) 8 | if (strategy == MergeStrategy.deduplicate) MergeStrategy.first 9 | else strategy 10 | } 11 | }} 12 | 13 | name := "Chap9" 14 | 15 | version := "1.0" 16 | 17 | scalaVersion := "2.10.5" 18 | 19 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.4.0" 20 | 21 | libraryDependencies += "org.apache.spark" %% "spark-streaming" % "1.4.0" 22 | 23 | libraryDependencies += "org.apache.spark" %% "spark-mllib" % "1.4.0" 24 | 25 | libraryDependencies += "org.json4s" %% "json4s-native" % "3.2.10" 26 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap8/spark.sbt: -------------------------------------------------------------------------------- 1 | import AssemblyKeys._ 2 | 3 | assemblySettings 4 | 5 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { mergeStrategy => { 6 | case entry => { 7 | val strategy = mergeStrategy(entry) 8 | if (strategy == MergeStrategy.deduplicate) MergeStrategy.first 9 | else strategy 10 | } 11 | }} 12 | 13 | name := "Chap8" 14 | 15 | version := "1.0" 16 | 17 | scalaVersion := "2.10.5" 18 | 19 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.4.0" 20 | 21 | libraryDependencies += "org.apache.spark" %% "spark-streaming" % "1.4.0" 22 | 23 | //libraryDependencies += "org.apache.spark" %% "spark-sql" % "1.4.0" 24 | 25 | libraryDependencies += "org.apache.spark" %% "spark-hive" % "1.4.0" 26 | 27 | libraryDependencies += "org.json4s" %% "json4s-native" % "3.2.10" 28 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap8/L8-36CdrSparkRApp.R: -------------------------------------------------------------------------------- 1 | args <- commandArgs(trailingOnly = TRUE) 2 | if(length(args) != 2) { 3 | stop("Usage: CdrSparkRApp ") 4 | } 5 | library(SparkR) 6 | Sys.setenv('SPARKR_SUBMIT_ARGS'='"--packages" "com.databricks:spark-csv_2.10:1.3.0" "sparkr-shell"') 7 | sc <- sparkR.init(master = args[1]) 8 | sqlContext <- sparkRSQL.init(sc) 9 | df <- read.df(sqlContext, args[2], source = "com.databricks.spark.csv", inferSchema = "true", delimiter = "\t") 10 | cnames <- c("squareId", "timeInterval", "countryCode", "smsInActivity", "smsOutActivity", 
"callInActivity", "callOutActivity", "internetTrafficActivity") 11 | for (i in 1:NROW(cnames)) { 12 | df <- withColumnRenamed(df, paste0("C", i - 1), cnames[i]) 13 | } 14 | counts <- count(groupBy(df, "countryCode")) 15 | showDF(orderBy(counts, desc(counts$count)), numRows = 5) 16 | sparkR.stop() -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap8/L8-39CdrStreamingSparkRApp.R: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/Rscript 2 | args <- commandArgs(trailingOnly = TRUE) 3 | if(length(args) != 1) { 4 | stop("Usage: CdrStreamingSparkRApp ") 5 | } 6 | library(SparkR) 7 | sc <- sparkR.init(master = args[1]) 8 | hiveContext <- sparkRHive.init(sc) 9 | f <- file("stdin") 10 | open(f) 11 | while(length(tableName <- readLines(f, n = 1)) > 0) { 12 | tryCatch({ 13 | tableName <- trimws(tableName) 14 | write(paste0("Processing table: ", tableName), stderr()) 15 | df <- table(hiveContext, tableName) 16 | counts <- count(groupBy(df, "countryCode")) 17 | outputTable <- paste0(tableName, "processed") 18 | write(paste0("Output written to: ", outputTable), stderr()) 19 | saveAsTable(limit(orderBy(counts, desc(counts$count)), 5), outputTable, "parquet", "error") 20 | }, error = function(e) {stop(e)}) 21 | } 22 | close(f) 23 | sparkR.stop() -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap2/src/main/scala/org/apress/prospark/T2-6Accumulator.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.collection.mutable 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | 8 | object AccumulatorApp { 9 | def main(args: Array[String]) { 10 | if (args.length != 1) { 11 | System.err.println( 12 | "Usage: AccumulatorApp ") 13 | System.exit(1) 14 | } 15 | val Seq(appName) = args.toSeq 16 | 17 | val conf = new SparkConf() 18 | .setAppName(appName) 19 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 20 | .set("spark.eventLog.enabled", true.toString) 21 | .set("spark.eventLog.dir", "/tmp") 22 | val sc = new SparkContext(conf) 23 | val setAcc = sc.accumulableCollection(mutable.HashSet[Int]()) 24 | val d = sc.parallelize(1 to 100) 25 | d.foreach(x => setAcc += x) 26 | println(setAcc.value.size) 27 | } 28 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap8/cdrschema.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "struct", 3 | "fields": [ 4 | { 5 | "name": "squareId", 6 | "nullable": false, 7 | "type": "integer" 8 | }, 9 | { 10 | "name": "timeInterval", 11 | "nullable": false, 12 | "type": "long" 13 | }, 14 | { 15 | "name": "countryCode", 16 | "nullable": true, 17 | "type": "string" 18 | }, 19 | { 20 | "name": "smsInActivity", 21 | "nullable": true, 22 | "type": "float" 23 | }, 24 | { 25 | "name": "smsOutActivity", 26 | "nullable": true, 27 | "type": "float" 28 | }, 29 | { 30 | "name": "callInActivity", 31 | "nullable": true, 32 | "type": "float" 33 | }, 34 | { 35 | "name": "callOutActivity", 36 | "nullable": true, 37 | "type": "float" 38 | }, 39 | { 40 | "name": "internetTrafficActivity", 41 | "nullable": true, 42 | "type": "float" 43 | } 44 | ] 45 | } 46 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap8/cdrschema2.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "type": "struct", 3 | "fields": [ 4 | { 5 | "name": "squareId", 6 | "nullable": false, 7 | "type": "integer" 8 | }, 9 | { 10 | "name": "timeInterval", 11 | "nullable": false, 12 | "type": "long" 13 | }, 14 | { 15 | "name": "countryCode", 16 | "nullable": true, 17 | "type": "integer" 18 | }, 19 | { 20 | "name": "smsInActivity", 21 | "nullable": true, 22 | "type": "float" 23 | }, 24 | { 25 | "name": "smsOutActivity", 26 | "nullable": true, 27 | "type": "float" 28 | }, 29 | { 30 | "name": "callInActivity", 31 | "nullable": true, 32 | "type": "float" 33 | }, 34 | { 35 | "name": "callOutActivity", 36 | "nullable": true, 37 | "type": "float" 38 | }, 39 | { 40 | "name": "internetTrafficActivity", 41 | "nullable": true, 42 | "type": "float" 43 | } 44 | ] 45 | } 46 | -------------------------------------------------------------------------------- /prosparkstreaming-master/README.md: -------------------------------------------------------------------------------- 1 | # Pro Spark Streaming 2 | 3 | Code used in "Pro Spark Streaming: The Zen of Real-time Analytics using Apache Spark" published by Apress Publishing. 4 | 5 | ISBN-13: 978-1484214800 6 | 7 | ISBN-10: 1484214803 8 | 9 | # Layout 10 | 11 | Each folder contains code for a particular chapter. The repetition of code is deliberate. While this goes against most software engineering principles (held very dearly by the author as well), it is necessary to expound a topic and keep its implementation self-contained. 12 | 13 | ## Chapters 14 | 15 | - 2: Introduction to Spark 16 | - 3: DStreams: Real-time RDDs 17 | - 4: High Velocity Streams: Parallelism and Other Stories 18 | - 5: Real-time Route 66: Linking External Data Sources 19 | - 6: The Art of Side Effects 20 | - 7: Getting Ready for Prime Time 21 | - 8: Real-time ETL and Analytics Magic 22 | - 9: Machine Learning at Scale 23 | - 10: Of Clouds, Lambdas, and Pythons 24 | 25 | # Build 26 | 27 | Jump to a particular folder and simply execute `sbt assembly`. This will generate an uber JAR that can directly be submitted to a Spark cluster. 
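As an end-to-end illustration, building and submitting one of the Chapter 5 applications might look like the sketch below. The JAR path assumes sbt-assembly's default output location for this build (name `Chap5`, version `1.0`, Scala 2.10), and the master URL, hostname, and port are placeholders to adapt to your own cluster:

```bash
# Build the uber JAR for Chapter 5
cd prosparkstreaming-master/Chap5
sbt assembly

# Submit the socket-stream example; TripByYearApp expects <appname> <hostname> <port>
spark-submit \
  --class org.apress.prospark.TripByYearApp \
  --master spark://localhost:7077 \
  target/scala-2.10/Chap5-assembly-1.0.jar \
  TripByYear localhost 9999
```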
-------------------------------------------------------------------------------- /prosparkstreaming-master/Chap10/yelp_pyspark.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkContext 2 | from pyspark.streaming import StreamingContext 3 | from sys import argv, exit 4 | try: import simplejson as json 5 | except ImportError: import json 6 | 7 | if len(argv) != 5: 8 | print 'Usage: yelp_pyspark.py ' 9 | exit(-1) 10 | 11 | appname = argv[1] 12 | batch_interval = int(argv[2]) 13 | hostname = argv[3] 14 | port = int(argv[4]) 15 | 16 | sc = SparkContext(appName=appname) 17 | ssc = StreamingContext(sc, batch_interval) 18 | 19 | records = ssc.socketTextStream(hostname, port) 20 | json_records = records.map(lambda rec: json.loads(rec)) 21 | restaurant_records = json_records.filter(lambda rec: 'attributes' in rec and 'Wi-Fi' in rec['attributes']) 22 | wifi_pairs = restaurant_records.map(lambda rec: (rec['attributes']['Wi-Fi'], rec['stars'])) 23 | wifi_counts = wifi_pairs.combineByKey(lambda v: (v, 1), 24 | lambda x, value: (x[0] + value, x[1] + 1), 25 | lambda x, y: (x[0] + y[0], x[1] + y[1])) 26 | avg_stars = wifi_counts.map(lambda (key, (sum_, count)): (key, sum_ / count)) 27 | avg_stars.pprint() 28 | 29 | ssc.start() 30 | ssc.awaitTermination() 31 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap10/spark.sbt: -------------------------------------------------------------------------------- 1 | import AssemblyKeys._ 2 | 3 | assemblySettings 4 | 5 | net.virtualvoid.sbt.graph.DependencyGraphSettings.graphSettings 6 | 7 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { mergeStrategy => { 8 | case entry => { 9 | val strategy = mergeStrategy(entry) 10 | if (strategy == MergeStrategy.deduplicate) MergeStrategy.first 11 | else strategy 12 | } 13 | }} 14 | 15 | name := "Chap10" 16 | 17 | version := "1.0" 18 | 19 | scalaVersion := "2.10.5" 20 | 21 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.4.0" 22 | 23 | libraryDependencies += "org.apache.spark" %% "spark-streaming" % "1.4.0" 24 | 25 | libraryDependencies += "org.json4s" %% "json4s-native" % "3.2.10" 26 | 27 | libraryDependencies += "com.google.cloud.bigtable" % "bigtable-hbase-1.1" % "0.2.3" exclude("com.google.guava", "guava") 28 | 29 | libraryDependencies += "org.apache.hbase" % "hbase-server" % "1.1.2" 30 | 31 | libraryDependencies += "org.apache.hbase" % "hbase-common" % "1.1.2" 32 | 33 | libraryDependencies += "com.google.guava" % "guava" % "16.0" 34 | 35 | libraryDependencies += "org.mortbay.jetty.alpn" % "alpn-boot" % "8.1.6.v20151105" 36 | 37 | libraryDependencies += "com.google.cloud.bigdataoss" % "bigquery-connector" % "0.7.4-hadoop2" 38 | 39 | libraryDependencies += "org.apache.spark" %% "spark-graphx" % "1.4.0" 40 | 41 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap5/spark.sbt: -------------------------------------------------------------------------------- 1 | import AssemblyKeys._ 2 | 3 | assemblySettings 4 | 5 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { mergeStrategy => { 6 | case entry => { 7 | val strategy = mergeStrategy(entry) 8 | if (strategy == MergeStrategy.deduplicate) MergeStrategy.first 9 | else strategy 10 | } 11 | }} 12 | 13 | name := "Chap5" 14 | 15 | version := "1.0" 16 | 17 | scalaVersion := "2.10.5" 18 | 19 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.4.0" 20 | 21 
| libraryDependencies += "org.apache.spark" %% "spark-streaming" % "1.4.0" 22 | 23 | libraryDependencies += "org.json4s" %% "json4s-native" % "3.2.10" 24 | 25 | libraryDependencies += "org.apache.spark" %% "spark-streaming-mqtt" % "1.4.0" 26 | 27 | libraryDependencies += "org.eclipse.paho" % "org.eclipse.paho.client.mqttv3" % "1.0.1" 28 | 29 | libraryDependencies += "org.apache.spark" %% "spark-streaming-flume" % "1.4.0" 30 | 31 | libraryDependencies += "org.apache.spark" %% "spark-streaming-kafka" % "1.4.0" 32 | 33 | libraryDependencies += "org.apache.spark" %% "spark-streaming-twitter" % "1.4.0" 34 | 35 | libraryDependencies += "com.ning" % "async-http-client" % "1.9.31" 36 | 37 | libraryDependencies += "org.apache.httpcomponents" % "httpclient" % "4.5.1" 38 | 39 | resolvers += "MQTT Repository" at "https://repo.eclipse.org/content/repositories/paho-releases/" 40 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap9/src/main/scala/org/apress/prospark/L9-14FPMining.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.fpm.FPGrowth 6 | import org.apache.spark.streaming.Seconds 7 | import org.apache.spark.streaming.StreamingContext 8 | 9 | object FPMiningApp { 10 | 11 | def main(args: Array[String]) { 12 | if (args.length != 3) { 13 | System.err.println( 14 | "Usage: FPMiningApp ") 15 | System.exit(1) 16 | } 17 | val Seq(appName, batchInterval, iPath) = args.toSeq 18 | 19 | val conf = new SparkConf() 20 | .setAppName(appName) 21 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 22 | 23 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 24 | 25 | val minSupport = 0.4 26 | 27 | ssc.textFileStream(iPath) 28 | .map(r => r.split(" ")) 29 | .foreachRDD(transactionRDD => { 30 | val fpg = new FPGrowth() 31 | .setMinSupport(minSupport) 32 | val model = fpg.run(transactionRDD) 33 | 34 | model.freqItemsets 35 | .collect() 36 | .foreach(itemset => println("Items: %s, Frequency: %s".format(itemset.items.mkString(" "), itemset.freq))) 37 | }) 38 | 39 | ssc.start() 40 | ssc.awaitTermination() 41 | } 42 | 43 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap5/src/main/scala/org/apress/prospark/L5-6SocketStream.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | 6 | import org.apache.spark.streaming.{ Seconds, StreamingContext } 7 | import org.apache.spark.streaming.dstream.PairDStreamFunctions 8 | 9 | import java.util.Calendar 10 | 11 | object TripByYearApp { 12 | def main(args: Array[String]) { 13 | if (args.length != 3) { 14 | System.err.println( 15 | "Usage: TripByYearApp ") 16 | System.exit(1) 17 | } 18 | val Seq(appName, hostname, port) = args.toSeq 19 | 20 | val conf = new SparkConf() 21 | .setAppName(appName) 22 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 23 | 24 | val ssc = new StreamingContext(conf, Seconds(10)) 25 | 26 | ssc.socketTextStream(hostname, port.toInt) 27 | .map(rec => rec.split(",")) 28 | .map(rec => (rec(13), rec(0).toInt)) 29 | .reduceByKey(_ + _) 30 | .map(pair => (pair._2, normalizeYear(pair._1))) 31 | .transform(rec => rec.sortByKey(ascending = false)) 32 | .saveAsTextFiles("TripByYear") 
33 | 34 | ssc.start() 35 | ssc.awaitTermination() 36 | } 37 | 38 | def normalizeYear(s: String): String = { 39 | try { 40 | (Calendar.getInstance().get(Calendar.YEAR) - s.toInt).toString 41 | } catch { 42 | case e: Exception => s 43 | } 44 | } 45 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap9/src/main/scala/org/apress/prospark/L9-6Preprocessing.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.feature.StandardScaler 6 | import org.apache.spark.mllib.linalg.Vectors 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | 10 | object PreprocessingApp { 11 | 12 | def main(args: Array[String]) { 13 | if (args.length != 4) { 14 | System.err.println( 15 | "Usage: PreprocessingAppApp ") 16 | System.exit(1) 17 | } 18 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 19 | 20 | val conf = new SparkConf() 21 | .setAppName(appName) 22 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 23 | 24 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 25 | 26 | val substream = ssc.socketTextStream(hostname, port.toInt) 27 | .filter(!_.contains("NaN")) 28 | .map(_.split(" ")) 29 | .filter(f => f(1) != "0") 30 | 31 | substream.map(f => Array(f(2), f(4), f(5), f(6))) 32 | .map(f => f.map(v => v.toDouble)) 33 | .map(f => Vectors.dense(f)) 34 | .foreachRDD(rdd => { 35 | val scalerModel = new StandardScaler().fit(rdd) 36 | val scaledRDD = scalerModel.transform(rdd) 37 | }) 38 | 39 | ssc.start() 40 | ssc.awaitTermination() 41 | } 42 | 43 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap2/src/main/scala/org/apress/prospark/L2-1FirstApp.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.io.Source 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | 8 | object TranslateApp { 9 | def main(args: Array[String]) { 10 | if (args.length != 4) { 11 | System.err.println( 12 | "Usage: TranslateApp ") 13 | System.exit(1) 14 | } 15 | val Seq(appName, bookPath, outputPath, lang) = args.toSeq 16 | 17 | val dict = getDictionary(lang) 18 | 19 | val conf = new SparkConf() 20 | .setAppName(appName) 21 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 22 | val sc = new SparkContext(conf) 23 | val book = sc.textFile(bookPath) 24 | val translated = book.map(line => line.split("\\s+").map(word => dict.getOrElse(word, word)).mkString(" ")) 25 | translated.saveAsTextFile(outputPath) 26 | } 27 | 28 | def getDictionary(lang: String): Map[String, String] = { 29 | if (!Set("German", "French", "Italian", "Spanish").contains(lang)) { 30 | System.err.println( 31 | "Unsupported language: %s".format(lang)) 32 | System.exit(1) 33 | } 34 | val url = "http://www.june29.com/IDP/files/%s.txt".format(lang) 35 | println("Grabbing dictionary from: %s".format(url)) 36 | Source.fromURL(url, "ISO-8859-1").mkString 37 | .split("\\r?\\n") 38 | .filter(line => !line.startsWith("#")) 39 | .map(line => line.split("\\t")) 40 | .map(tkns => (tkns(0).trim, tkns(1).trim)).toMap 41 | } 42 | 43 | } -------------------------------------------------------------------------------- 
/prosparkstreaming-master/Chap5/src/main/scala/org/apress/prospark/L5-7MultipleSocketStreams.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | 6 | import org.apache.spark.streaming.{ Seconds, StreamingContext } 7 | import org.apache.spark.streaming.dstream.PairDStreamFunctions 8 | 9 | import java.util.Calendar 10 | 11 | object TripByYearMultiApp { 12 | def main(args: Array[String]) { 13 | if (args.length != 4) { 14 | System.err.println( 15 | "Usage: TripByYearMultiApp ") 16 | System.exit(1) 17 | } 18 | val Seq(appName, hostname, basePort, nSockets) = args.toSeq 19 | 20 | val conf = new SparkConf() 21 | .setAppName(appName) 22 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 23 | 24 | val ssc = new StreamingContext(conf, Seconds(10)) 25 | 26 | val streams = (0 to nSockets.toInt - 1).map(i => ssc.socketTextStream(hostname, basePort.toInt + i)) 27 | val uniStream = ssc.union(streams) 28 | 29 | uniStream 30 | .map(rec => rec.split(",")) 31 | .map(rec => (rec(13), rec(0).toInt)) 32 | .reduceByKey(_ + _) 33 | .map(pair => (pair._2, normalizeYear(pair._1))) 34 | .transform(rec => rec.sortByKey(ascending = false)) 35 | .saveAsTextFiles("TripByYear") 36 | 37 | ssc.start() 38 | ssc.awaitTermination() 39 | } 40 | 41 | def normalizeYear(s: String): String = { 42 | try { 43 | (Calendar.getInstance().get(Calendar.YEAR) - s.toInt).toString 44 | } catch { 45 | case e: Exception => s 46 | } 47 | } 48 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap5/src/main/scala/org/apress/prospark/L5-9Mqtt.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions 6 | import org.apache.spark.storage.StorageLevel 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 10 | import org.apache.spark.streaming.mqtt.MQTTUtils 11 | 12 | object YearlyDistributionApp { 13 | def main(args: Array[String]) { 14 | if (args.length != 4) { 15 | System.err.println( 16 | "Usage: YearlyDistributionApp ") 17 | System.exit(1) 18 | } 19 | val Seq(appName, brokerUrl, topic, checkpointDir) = args.toSeq 20 | 21 | val conf = new SparkConf() 22 | .setAppName(appName) 23 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 24 | 25 | val ssc = new StreamingContext(conf, Seconds(10)) 26 | ssc.checkpoint(checkpointDir) 27 | 28 | MQTTUtils.createStream(ssc, brokerUrl, topic, StorageLevel.MEMORY_ONLY_SER_2) 29 | .map(rec => rec.split(",")) 30 | .map(rec => (rec(1).split(" ")(0), 1)) 31 | .updateStateByKey(statefulCount) 32 | .map(pair => (pair._2, pair._1)) 33 | .transform(rec => rec.sortByKey(ascending = false)) 34 | .saveAsTextFiles("YearlyDistribution") 35 | 36 | ssc.start() 37 | ssc.awaitTermination() 38 | } 39 | 40 | val statefulCount = (values: Seq[Int], state: Option[Int]) => Some(values.sum + state.getOrElse(0)) 41 | 42 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap9/src/main/scala/org/apress/prospark/L9-5ChiSq.scala: -------------------------------------------------------------------------------- 1 | package 
org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.linalg.Vectors 6 | import org.apache.spark.mllib.regression.LabeledPoint 7 | import org.apache.spark.mllib.stat.Statistics 8 | import org.apache.spark.streaming.Seconds 9 | import org.apache.spark.streaming.StreamingContext 10 | 11 | object ChiSqApp { 12 | 13 | def main(args: Array[String]) { 14 | if (args.length != 4) { 15 | System.err.println( 16 | "Usage: ChiSqApp ") 17 | System.exit(1) 18 | } 19 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 20 | 21 | val conf = new SparkConf() 22 | .setAppName(appName) 23 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 24 | 25 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 26 | 27 | val substream = ssc.socketTextStream(hostname, port.toInt) 28 | .filter(!_.contains("NaN")) 29 | .map(_.split(" ")) 30 | .filter(f => f(1) != "0") 31 | .map(f => f.map(f => f.toDouble)) 32 | 33 | substream.map(f => Array(f(1).toDouble, f(2).toDouble, f(4).toDouble, f(5).toDouble, f(6).toDouble)) 34 | .filter(f => f(0) == 4.0 || f(0) == 5.0) 35 | .map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5)))) 36 | .foreachRDD(rdd => { 37 | Statistics.chiSqTest(rdd).zipWithIndex.foreach(v => println("%s, column no. %d".format(v._1, v._2))) 38 | }) 39 | 40 | ssc.start() 41 | ssc.awaitTermination() 42 | } 43 | 44 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap5/src/main/scala/org/apress/prospark/L5-16Twitter.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions 6 | import org.apache.spark.streaming.Seconds 7 | import org.apache.spark.streaming.StreamingContext 8 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 9 | import org.apache.spark.streaming.twitter.TwitterUtils 10 | import org.apache.spark.storage.StorageLevel 11 | import twitter4j.conf.ConfigurationBuilder 12 | import twitter4j.TwitterFactory 13 | 14 | object TwitterApp { 15 | 16 | def main(args: Array[String]) { 17 | if (args.length != 2) { 18 | System.err.println( 19 | "Usage: TwitterApp ") 20 | System.exit(1) 21 | } 22 | 23 | val Seq(appName, outputPath) = args.toSeq 24 | 25 | val conf = new SparkConf() 26 | .setAppName(appName) 27 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 28 | 29 | val ssc = new StreamingContext(conf, Seconds(10)) 30 | 31 | val cb = new ConfigurationBuilder() 32 | cb.setOAuthConsumerKey("") 33 | cb.setOAuthConsumerSecret("") 34 | cb.setOAuthAccessToken("") 35 | cb.setOAuthAccessTokenSecret("") 36 | 37 | val twitterAuth = new TwitterFactory(cb.build()).getInstance().getAuthorization() 38 | 39 | val tweetStream = TwitterUtils.createStream(ssc, Some(twitterAuth), Array("nyc citi bike", "nyc bike share")) 40 | tweetStream.count().print() 41 | tweetStream.saveAsTextFiles(outputPath) 42 | 43 | ssc.start() 44 | ssc.awaitTermination() 45 | } 46 | 47 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap9/src/main/scala/org/apress/prospark/L9-13FPMiningPreprocessing.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.hadoop.io.LongWritable 4 | import 
org.apache.hadoop.io.Text 5 | import org.apache.hadoop.mapred.FileSplit 6 | import org.apache.hadoop.mapred.TextInputFormat 7 | import org.apache.spark.SparkConf 8 | import org.apache.spark.SparkContext 9 | import org.apache.spark.rdd.HadoopRDD 10 | import org.apache.spark.rdd.RDD.rddToPairRDDFunctions 11 | 12 | import com.google.common.io.Files 13 | 14 | object FPMiningPreprocessingApp { 15 | 16 | def main(args: Array[String]) { 17 | if (args.length != 3) { 18 | System.err.println( 19 | "Usage: FPMiningPreprocessingApp ") 20 | System.exit(1) 21 | } 22 | val Seq(appName, iPath, oPath) = args.toSeq 23 | 24 | val conf = new SparkConf() 25 | .setAppName(appName) 26 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 27 | 28 | val delim = " " 29 | 30 | val sc = new SparkContext(conf) 31 | sc.hadoopFile(iPath, classOf[TextInputFormat], classOf[LongWritable], classOf[Text], sc.defaultMinPartitions) 32 | .asInstanceOf[HadoopRDD[LongWritable, Text]] 33 | .mapPartitionsWithInputSplit((iSplit, iter) => 34 | iter.map(splitAndLine => (Files.getNameWithoutExtension(iSplit.asInstanceOf[FileSplit].getPath.toString), splitAndLine._2.toString.split(" ")(1)))) 35 | .filter(r => r._2 != "0") 36 | .map(r => (r._1, r._2)) 37 | .distinct() 38 | .groupByKey() 39 | .map(r => r._2.mkString(" ")) 40 | .sample(false, 0.7) 41 | .coalesce(1) 42 | .saveAsTextFile(oPath) 43 | } 44 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap8/src/main/scala/org/apress/prospark/L8-4DataFrameCreationSchema.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.sql.Row 6 | import org.apache.spark.sql.SQLContext 7 | import org.apache.spark.sql.functions.desc 8 | import org.apache.spark.sql.types.DataType 9 | import org.apache.spark.sql.types.StructType 10 | import org.apache.spark.streaming.Seconds 11 | import org.apache.spark.streaming.StreamingContext 12 | 13 | object DataframeCreationApp2 { 14 | 15 | def main(args: Array[String]) { 16 | if (args.length != 5) { 17 | System.err.println( 18 | "Usage: CdrDataframeApp2 ") 19 | System.exit(1) 20 | } 21 | val Seq(appName, batchInterval, hostname, port, schemaFile) = args.toSeq 22 | 23 | val conf = new SparkConf() 24 | .setAppName(appName) 25 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 26 | 27 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 28 | 29 | val sqlC = new SQLContext(ssc.sparkContext) 30 | 31 | val schemaJson = scala.io.Source.fromFile(schemaFile).mkString 32 | val schema = DataType.fromJson(schemaJson).asInstanceOf[StructType] 33 | 34 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 35 | .map(_.split("\\t", -1)) 36 | .foreachRDD(rdd => { 37 | val cdrs = sqlC.createDataFrame(rdd.map(c => Row(c: _*)), schema) 38 | 39 | cdrs.groupBy("countryCode").count().orderBy(desc("count")).show(5) 40 | }) 41 | 42 | ssc.start() 43 | ssc.awaitTermination() 44 | 45 | } 46 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap5/src/main/java/org/apress/prospark/KafkaDriver.java: -------------------------------------------------------------------------------- 1 | package org.apress.prospark; 2 | 3 | import java.util.Properties; 4 | 5 | import kafka.javaapi.producer.Producer; 6 | import kafka.producer.KeyedMessage; 7 | import 
kafka.producer.ProducerConfig; 8 | 9 | public class KafkaDriver extends AbstractDriver { 10 | 11 | private final String topic; 12 | private Producer producer; 13 | 14 | public KafkaDriver(String path, String topic, Properties props) { 15 | super(path); 16 | this.topic = topic; 17 | ProducerConfig config = new ProducerConfig(props); 18 | producer = new Producer(config); 19 | } 20 | 21 | @Override 22 | public void init() throws Exception { 23 | } 24 | 25 | @Override 26 | public void close() throws Exception { 27 | producer.close(); 28 | } 29 | 30 | @Override 31 | public void sendRecord(String record) throws Exception { 32 | producer.send(new KeyedMessage(topic, record)); 33 | } 34 | 35 | public static void main(String[] args) throws Exception { 36 | 37 | if (args.length != 3) { 38 | System.err.println("Usage: KafkaDriver "); 39 | System.exit(-1); 40 | } 41 | 42 | String path = args[0]; 43 | String brokerUrl = args[1]; 44 | String topic = args[2]; 45 | 46 | Properties props = new Properties(); 47 | props.put("metadata.broker.list", brokerUrl); 48 | props.put("serializer.class", "kafka.serializer.StringEncoder"); 49 | // props.put("request.required.acks", "1"); 50 | 51 | KafkaDriver driver = new KafkaDriver(path, topic, props); 52 | try { 53 | driver.execute(); 54 | } finally { 55 | driver.close(); 56 | } 57 | } 58 | 59 | } 60 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap5/src/main/scala/org/apress/prospark/L5-11FlumePush.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions 6 | import org.apache.spark.storage.StorageLevel 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 10 | import org.apache.spark.streaming.flume.FlumeUtils 11 | 12 | object DailyUserTypeDistributionApp { 13 | def main(args: Array[String]) { 14 | if (args.length != 5) { 15 | System.err.println( 16 | "Usage: DailyUserTypeDistributionApp ") 17 | System.exit(1) 18 | } 19 | val Seq(appName, hostname, port, checkpointDir, outputPath) = args.toSeq 20 | 21 | val conf = new SparkConf() 22 | .setAppName(appName) 23 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 24 | 25 | val ssc = new StreamingContext(conf, Seconds(10)) 26 | ssc.checkpoint(checkpointDir) 27 | 28 | FlumeUtils.createStream(ssc, hostname, port.toInt, StorageLevel.MEMORY_ONLY_SER_2) 29 | .map(rec => new String(rec.event.getBody().array()).split(",")) 30 | .map(rec => ((rec(1).split(" ")(0), rec(12)), 1)) 31 | .updateStateByKey(statefulCount) 32 | .repartition(1) 33 | .transform(rdd => rdd.sortByKey(ascending = false)) 34 | .saveAsTextFiles(outputPath) 35 | 36 | ssc.start() 37 | ssc.awaitTermination() 38 | } 39 | 40 | val statefulCount = (values: Seq[Int], state: Option[Int]) => Some(values.sum + state.getOrElse(0)) 41 | 42 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap5/src/main/scala/org/apress/prospark/L5-11FlumePull.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions 6 | import 
org.apache.spark.storage.StorageLevel 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 10 | import org.apache.spark.streaming.flume.FlumeUtils 11 | 12 | object DailyUserTypeDistributionApp2 { 13 | def main(args: Array[String]) { 14 | if (args.length != 5) { 15 | System.err.println( 16 | "Usage: DailyUserTypeDistributionApp ") 17 | System.exit(1) 18 | } 19 | val Seq(appName, hostname, port, checkpointDir, outputPath) = args.toSeq 20 | 21 | val conf = new SparkConf() 22 | .setAppName(appName) 23 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 24 | 25 | val ssc = new StreamingContext(conf, Seconds(10)) 26 | ssc.checkpoint(checkpointDir) 27 | 28 | FlumeUtils.createPollingStream(ssc, hostname, port.toInt, StorageLevel.MEMORY_ONLY_SER_2) 29 | .map(rec => new String(rec.event.getBody().array()).split(",")) 30 | .map(rec => ((rec(1).split(" ")(0), rec(12)), 1)) 31 | .updateStateByKey(statefulCount) 32 | .repartition(1) 33 | .transform(rdd => rdd.sortByKey(ascending = false)) 34 | .saveAsTextFiles(outputPath) 35 | 36 | ssc.start() 37 | ssc.awaitTermination() 38 | } 39 | 40 | val statefulCount = (values: Seq[Int], state: Option[Int]) => Some(values.sum + state.getOrElse(0)) 41 | 42 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap9/src/main/scala/org/apress/prospark/L9-11CollabFilteringPreprocessing.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.hadoop.io.LongWritable 4 | import org.apache.hadoop.io.Text 5 | import org.apache.hadoop.mapred.FileSplit 6 | import org.apache.hadoop.mapred.TextInputFormat 7 | import org.apache.spark.SparkConf 8 | import org.apache.spark.SparkContext 9 | import org.apache.spark.rdd.HadoopRDD 10 | import org.apache.spark.rdd.RDD.rddToPairRDDFunctions 11 | 12 | import com.google.common.io.Files 13 | 14 | object CollabFilteringPreprocessingApp { 15 | 16 | def main(args: Array[String]) { 17 | if (args.length != 3) { 18 | System.err.println( 19 | "Usage: CollabFilteringPreprocessingApp ") 20 | System.exit(1) 21 | } 22 | val Seq(appName, iPath, oPath) = args.toSeq 23 | 24 | val conf = new SparkConf() 25 | .setAppName(appName) 26 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 27 | 28 | val delim = " " 29 | 30 | val sc = new SparkContext(conf) 31 | sc.hadoopFile(iPath, classOf[TextInputFormat], classOf[LongWritable], classOf[Text], sc.defaultMinPartitions) 32 | .asInstanceOf[HadoopRDD[LongWritable, Text]] 33 | .mapPartitionsWithInputSplit((iSplit, iter) => 34 | iter.map(splitAndLine => (Files.getNameWithoutExtension(iSplit.asInstanceOf[FileSplit].getPath.toString), splitAndLine._2.toString.split(" ")(1)))) 35 | .filter(r => r._2 != "0") 36 | .map(r => ((r._1, r._2), 1)) 37 | .reduceByKey(_ + _) 38 | .map(r => r._1._1.replace("subject", "") + delim + r._1._2 + delim + r._2) 39 | .sample(false, 0.7) 40 | .coalesce(1) 41 | .saveAsTextFile(oPath) 42 | } 43 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap4/src/main/scala/org/apress/prospark/L4-1Voyager.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.hadoop.fs.Path 4 | import org.apache.hadoop.io.LongWritable 5 | import org.apache.hadoop.io.Text 6 | import 
org.apache.hadoop.mapreduce.lib.input.TextInputFormat 7 | import org.apache.spark.SparkConf 8 | import org.apache.spark.SparkContext 9 | import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions 10 | import org.apache.spark.streaming.Seconds 11 | import org.apache.spark.streaming.StreamingContext 12 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 13 | 14 | object VoyagerApp { 15 | def main(args: Array[String]) { 16 | if (args.length != 3) { 17 | System.err.println( 18 | "Usage: VoyagerApp ") 19 | System.exit(1) 20 | } 21 | val Seq(appName, inputPath, outputPath) = args.toSeq 22 | 23 | val conf = new SparkConf() 24 | .setAppName(appName) 25 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 26 | .set("spark.executor.extraJavaOptions", "-XX:+UseConcMarkSweepGC") 27 | 28 | val ssc = new StreamingContext(conf, Seconds(10)) 29 | 30 | val voyager1 = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) 31 | voyager1.map(rec => { 32 | val attrs = rec.split("\\s+") 33 | ((attrs(0).toInt), attrs.slice(18, 28).map(_.toDouble)) 34 | }).filter(pflux => pflux._2.exists(_ > 1.0)).map(rec => (rec._1, 1)) 35 | .reduceByKey(_ + _) 36 | .transform(rec => rec.sortByKey(ascending = false, numPartitions = 1)).saveAsTextFiles(outputPath) 37 | 38 | ssc.start() 39 | ssc.awaitTermination() 40 | } 41 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap5/src/main/scala/org/apress/prospark/L5-13Kafka.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions 6 | import org.apache.spark.storage.StorageLevel 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 10 | import org.apache.spark.streaming.kafka.KafkaUtils 11 | 12 | object StationJourneyCountApp { 13 | 14 | def main(args: Array[String]) { 15 | if (args.length != 7) { 16 | System.err.println( 17 | "Usage: StationJourneyCountApp ") 18 | System.exit(1) 19 | } 20 | 21 | val Seq(appName, brokerUrl, topic, consumerGroupId, zkQuorum, checkpointDir, outputPath) = args.toSeq 22 | 23 | val conf = new SparkConf() 24 | .setAppName(appName) 25 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 26 | //.set("spark.streaming.receiver.writeAheadLog.enable", "true") 27 | 28 | val ssc = new StreamingContext(conf, Seconds(10)) 29 | ssc.checkpoint(checkpointDir) 30 | 31 | val topics = Map[String, Int]( 32 | topic -> 1) 33 | KafkaUtils.createStream(ssc, zkQuorum, consumerGroupId, topics, StorageLevel.MEMORY_ONLY_SER).map(_._2) 34 | .map(rec => rec.split(",")) 35 | .map(rec => ((rec(3), rec(7)), 1)) 36 | .reduceByKey(_ + _) 37 | .repartition(1) 38 | .map(rec => (rec._2, rec._1)) 39 | .transform(rdd => rdd.sortByKey(ascending = false)) 40 | .saveAsTextFiles(outputPath) 41 | 42 | ssc.start() 43 | ssc.awaitTermination() 44 | } 45 | 46 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap9/src/main/scala/org/apress/prospark/L9-7FeatureExtraction.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import 
org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.feature.ChiSqSelector 6 | import org.apache.spark.mllib.linalg.Vectors 7 | import org.apache.spark.mllib.regression.LabeledPoint 8 | import org.apache.spark.streaming.Seconds 9 | import org.apache.spark.streaming.StreamingContext 10 | 11 | object FeatureExtractionApp { 12 | 13 | def main(args: Array[String]) { 14 | if (args.length != 4) { 15 | System.err.println( 16 | "Usage: FeatureExtractionApp ") 17 | System.exit(1) 18 | } 19 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 20 | 21 | val conf = new SparkConf() 22 | .setAppName(appName) 23 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 24 | 25 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 26 | 27 | val substream = ssc.socketTextStream(hostname, port.toInt) 28 | .filter(!_.contains("NaN")) 29 | .map(_.split(" ")) 30 | .filter(f => f(1) != "0") 31 | 32 | val datastream = substream.map(f => Array(f(1), f(4), f(5), f(6), f(20), f(21), f(22), f(36), f(37), f(38))) 33 | .map(f => f.map(v => v.toDouble)) 34 | .map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, f.length).map(f => f / 2048)))) 35 | 36 | datastream.foreachRDD(rdd => { 37 | val selector = new ChiSqSelector(5) 38 | val model = selector.fit(rdd) 39 | val filtered = rdd.map(p => LabeledPoint(p.label, model.transform(p.features))) 40 | filtered.take(20).foreach(println) 41 | }) 42 | 43 | ssc.start() 44 | ssc.awaitTermination() 45 | } 46 | 47 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap9/src/main/scala/org/apress/prospark/L9-12CollabFiltering.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.recommendation.ALS 6 | import org.apache.spark.mllib.recommendation.Rating 7 | import org.apache.spark.rdd.RDD.doubleRDDToDoubleRDDFunctions 8 | import org.apache.spark.rdd.RDD.rddToPairRDDFunctions 9 | import org.apache.spark.streaming.Seconds 10 | import org.apache.spark.streaming.StreamingContext 11 | 12 | object CollabFilteringApp { 13 | 14 | def main(args: Array[String]) { 15 | if (args.length != 3) { 16 | System.err.println( 17 | "Usage: CollabFilteringApp ") 18 | System.exit(1) 19 | } 20 | val Seq(appName, batchInterval, iPath) = args.toSeq 21 | 22 | val conf = new SparkConf() 23 | .setAppName(appName) 24 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 25 | 26 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 27 | 28 | val ratingStream = ssc.textFileStream(iPath).map(_.split(" ") match { 29 | case Array(subject, activity, freq) => 30 | Rating(subject.toInt, activity.toInt, freq.toDouble) 31 | }) 32 | 33 | val rank = 10 34 | val numIterations = 10 35 | val lambda = 0.01 36 | ratingStream.foreachRDD(ratingRDD => { 37 | val testTrain = ratingRDD.randomSplit(Array(0.3, 0.7)) 38 | val model = ALS.train(testTrain(1), rank, numIterations, lambda) 39 | val test = testTrain(0).map { 40 | case Rating(subject, activity, freq) => 41 | (subject, activity) 42 | } 43 | val prediction = model.predict(test) 44 | prediction.take(5).map(println) 45 | }) 46 | 47 | ssc.start() 48 | ssc.awaitTermination() 49 | } 50 | 51 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap5/src/main/scala/org/apress/prospark/L5-15KafkaDirect.scala: 
-------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions 6 | import org.apache.spark.streaming.Seconds 7 | import org.apache.spark.streaming.StreamingContext 8 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 9 | import kafka.serializer.StringDecoder 10 | import org.apache.spark.streaming.kafka.KafkaUtils 11 | 12 | object StationJourneyCountDirectApp { 13 | 14 | def main(args: Array[String]) { 15 | if (args.length != 7) { 16 | System.err.println( 17 | "Usage: StationJourneyCountApp ") 18 | System.exit(1) 19 | } 20 | 21 | val Seq(appName, brokerUrl, topic, consumerGroupId, zkQuorum, checkpointDir, outputPath) = args.toSeq 22 | 23 | val conf = new SparkConf() 24 | .setAppName(appName) 25 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 26 | 27 | val ssc = new StreamingContext(conf, Seconds(10)) 28 | ssc.checkpoint(checkpointDir) 29 | 30 | val topics = Set(topic) 31 | val params = Map[String, String]( 32 | "zookeeper.connect" -> zkQuorum, 33 | "group.id" -> consumerGroupId, 34 | "bootstrap.servers" -> brokerUrl) 35 | KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, params, topics).map(_._2) 36 | .map(rec => rec.split(",")) 37 | .map(rec => ((rec(3), rec(7)), 1)) 38 | .reduceByKey(_ + _) 39 | .repartition(1) 40 | .map(rec => (rec._2, rec._1)) 41 | .transform(rdd => rdd.sortByKey(ascending = false)) 42 | .saveAsTextFiles(outputPath) 43 | 44 | ssc.start() 45 | ssc.awaitTermination() 46 | } 47 | 48 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap9/src/main/scala/org/apress/prospark/L9-4Correlation.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.linalg.Vectors 6 | import org.apache.spark.mllib.regression.LabeledPoint 7 | import org.apache.spark.mllib.stat.Statistics 8 | import org.apache.spark.streaming.Seconds 9 | import org.apache.spark.streaming.StreamingContext 10 | 11 | object CorrelationApp { 12 | 13 | def main(args: Array[String]) { 14 | if (args.length != 4) { 15 | System.err.println( 16 | "Usage: CorrelationApp ") 17 | System.exit(1) 18 | } 19 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 20 | 21 | val conf = new SparkConf() 22 | .setAppName(appName) 23 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 24 | 25 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 26 | 27 | val substream = ssc.socketTextStream(hostname, port.toInt) 28 | .filter(!_.contains("NaN")) 29 | .map(_.split(" ")) 30 | .filter(f => f(1) != "0") 31 | .map(f => f.map(f => f.toDouble)) 32 | 33 | val datastream = substream.map(f => Array(f(1).toDouble, f(2).toDouble, f(4).toDouble, f(5).toDouble, f(6).toDouble)) 34 | 35 | val walkingOrRunning = datastream.filter(f => f(0) == 4.0 || f(0) == 5.0).map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5)))) 36 | walkingOrRunning.map(f => f.features).foreachRDD(rdd => { 37 | val corrSpearman = Statistics.corr(rdd, "spearman") 38 | val corrPearson = Statistics.corr(rdd, "pearson") 39 | println("Correlation Spearman: \n" + corrSpearman) 40 | println("Correlation Pearson: \n" + corrPearson) 41 | }) 42 | 43 | ssc.start() 44 | 
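// the call below blocks the driver until the streaming context is stopped, so the Spearman and
// Pearson correlation matrices computed in foreachRDD keep printing once per batch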
ssc.awaitTermination() 45 | } 46 | 47 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap9/src/main/scala/org/apress/prospark/L9-3Statistics.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.linalg.Vectors 6 | import org.apache.spark.mllib.stat.Statistics 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | 10 | object StatisticsApp { 11 | 12 | def main(args: Array[String]) { 13 | if (args.length != 4) { 14 | System.err.println( 15 | "Usage: StatisticsApp ") 16 | System.exit(1) 17 | } 18 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 19 | 20 | val conf = new SparkConf() 21 | .setAppName(appName) 22 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 23 | 24 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 25 | 26 | val substream = ssc.socketTextStream(hostname, port.toInt) 27 | .filter(!_.contains("NaN")) 28 | .map(_.split(" ")) 29 | .filter(f => f(1) != "0") 30 | .map(f => f.map(f => f.toDouble)) 31 | 32 | substream.map(f => Vectors.dense(f.slice(1, 5))).foreachRDD(rdd => { 33 | val stats = Statistics.colStats(rdd) 34 | println("Count: " + stats.count) 35 | println("Max: " + stats.max.toArray.mkString(" ")) 36 | println("Min: " + stats.min.toArray.mkString(" ")) 37 | println("Mean: " + stats.mean.toArray.mkString(" ")) 38 | println("L1-Norm: " + stats.normL1.toArray.mkString(" ")) 39 | println("L2-Norm: " + stats.normL2.toArray.mkString(" ")) 40 | println("Number of non-zeros: " + stats.numNonzeros.toArray.mkString(" ")) 41 | println("Varience: " + stats.variance.toArray.mkString(" ")) 42 | }) 43 | 44 | ssc.start() 45 | ssc.awaitTermination() 46 | } 47 | 48 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap5/src/main/scala/org/apress/prospark/L5-18Http.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.streaming.Seconds 6 | import org.apache.spark.streaming.StreamingContext 7 | import org.json4s.DefaultFormats 8 | import org.json4s.JField 9 | import org.json4s.jvalue2extractable 10 | import org.json4s.jvalue2monadic 11 | import org.json4s.native.JsonMethods.parse 12 | import org.json4s.string2JsonInput 13 | 14 | object HttpApp { 15 | 16 | def main(args: Array[String]) { 17 | if (args.length != 2) { 18 | System.err.println( 19 | "Usage: HttpApp ") 20 | System.exit(1) 21 | } 22 | 23 | val Seq(appName, outputPath) = args.toSeq 24 | 25 | val conf = new SparkConf() 26 | .setAppName(appName) 27 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 28 | 29 | val batchInterval = 10 30 | 31 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 32 | 33 | HttpUtils.createStream(ssc, url = "https://www.citibikenyc.com/stations/json", interval = batchInterval) 34 | .flatMap(rec => (parse(rec) \ "stationBeanList").children) 35 | .filter(rec => { 36 | implicit val formats = DefaultFormats 37 | (rec \ "statusKey").extract[Integer] != 1 38 | }) 39 | .map(rec => rec.filterField { 40 | case JField("id", _) => true 41 | case JField("stationName", _) => true 42 | case JField("statusValue", _) => true 43 | case _ => 
false 44 | }) 45 | .map(rec => { 46 | implicit val formats = DefaultFormats 47 | (rec(0)._2.extract[Integer], rec(1)._2.extract[String], rec(2)._2.extract[String]) 48 | }) 49 | .saveAsTextFiles(outputPath) 50 | 51 | ssc.start() 52 | ssc.awaitTermination() 53 | } 54 | 55 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap9/src/main/scala/org/apress/prospark/L9-8PCA.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.feature.PCA 6 | import org.apache.spark.mllib.linalg.Vectors 7 | import org.apache.spark.mllib.regression.LabeledPoint 8 | import org.apache.spark.streaming.Seconds 9 | import org.apache.spark.streaming.StreamingContext 10 | 11 | object PCAApp { 12 | 13 | def main(args: Array[String]) { 14 | if (args.length != 4) { 15 | System.err.println( 16 | "Usage: PCAApp ") 17 | System.exit(1) 18 | } 19 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 20 | 21 | val conf = new SparkConf() 22 | .setAppName(appName) 23 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 24 | 25 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 26 | 27 | val substream = ssc.socketTextStream(hostname, port.toInt) 28 | .filter(!_.contains("NaN")) 29 | .map(_.split(" ")) 30 | .filter(f => f(1) != "0") 31 | 32 | val datastream = substream.map(f => Array(f(1), f(4), f(5), f(6), f(20), f(21), f(22), f(36), f(37), f(38))) 33 | .map(f => f.map(v => v.toDouble)) 34 | .map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, f.length)))) 35 | 36 | datastream.foreachRDD(rdd => { 37 | val pca = new PCA(rdd.first().features.size / 2) 38 | .fit(rdd.map(_.features)) 39 | val testTrain = rdd.randomSplit(Array(0.3, 0.7)) 40 | val test = testTrain(0).map(lp => lp.copy(features = pca.transform(lp.features))) 41 | val train = testTrain(1).map(lp => lp.copy(features = pca.transform(lp.features))) 42 | train.take(20).foreach(println) 43 | }) 44 | 45 | ssc.start() 46 | ssc.awaitTermination() 47 | } 48 | 49 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap3/src/main/scala/org/apress/prospark/L3-DStreamVariation.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext } 6 | import org.apache.hadoop.io.{ Text, LongWritable, IntWritable } 7 | import org.apache.hadoop.fs.Path 8 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat 9 | import org.apache.spark.streaming.dstream.DStream 10 | import org.apache.hadoop.mapred.TextOutputFormat 11 | import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat } 12 | import org.apache.spark.streaming.dstream.PairDStreamFunctions 13 | import org.apache.log4j.LogManager 14 | import org.json4s._ 15 | import org.json4s.native.JsonMethods._ 16 | import java.text.SimpleDateFormat 17 | import java.util.Date 18 | 19 | object RedditVariationApp { 20 | def main(args: Array[String]) { 21 | if (args.length != 2) { 22 | System.err.println( 23 | "Usage: RedditVariationApp ") 24 | System.exit(1) 25 | } 26 | val Seq(appName, inputPath) = args.toSeq 27 | val LOG = LogManager.getLogger(this.getClass) 28 | 29 | val 
conf = new SparkConf() 30 | .setAppName(appName) 31 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 32 | 33 | val ssc = new StreamingContext(conf, Seconds(1)) 34 | LOG.info("Started at %d".format(ssc.sparkContext.startTime)) 35 | 36 | val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) 37 | 38 | val merged = comments.union(comments) 39 | 40 | val repartitionedComments = comments.repartition(4) 41 | 42 | val rddMin = comments.glom().map(arr => 43 | arr.minBy(rec => ((parse(rec) \ "created_utc").values.toString.toInt))) 44 | 45 | ssc.start() 46 | ssc.awaitTermination() 47 | 48 | } 49 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap8/src/main/scala/org/apress/prospark/L8-1DataFrameAPI.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.sql.SQLContext 9 | import org.apache.spark.sql.functions.desc 10 | import org.apache.spark.streaming.Seconds 11 | import org.apache.spark.streaming.StreamingContext 12 | 13 | object CdrDataframeApp { 14 | 15 | case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, 16 | smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, 17 | callOutActivity: Float, internetTrafficActivity: Float) 18 | 19 | def main(args: Array[String]) { 20 | if (args.length != 4) { 21 | System.err.println( 22 | "Usage: CdrDataframeApp ") 23 | System.exit(1) 24 | } 25 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 26 | 27 | val conf = new SparkConf() 28 | .setAppName(appName) 29 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 30 | 31 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 32 | 33 | val sqlC = new SQLContext(ssc.sparkContext) 34 | import sqlC.implicits._ 35 | 36 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 37 | .map(_.split("\\t", -1)) 38 | .foreachRDD(rdd => { 39 | val cdrs = seqToCdr(rdd).toDF() 40 | 41 | cdrs.groupBy("countryCode").count().orderBy(desc("count")).show(5) 42 | }) 43 | 44 | ssc.start() 45 | ssc.awaitTermination() 46 | } 47 | 48 | def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { 49 | rdd.map(c => c.map(f => f match { 50 | case x if x.isEmpty() => "0" 51 | case x => x 52 | })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, 53 | c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) 54 | } 55 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap3/src/main/scala/org/apress/prospark/L3-1DStreams.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.io.Source 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.streaming.Seconds 7 | import org.apache.spark.streaming.StreamingContext 8 | import org.apache.hadoop.io.LongWritable 9 | import org.apache.hadoop.fs.Path 10 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat 11 | import org.apache.hadoop.io.Text 12 | 13 | object StreamingTranslateApp { 14 | def main(args: Array[String]) { 15 | if (args.length != 4) { 16 | System.err.println( 17 | "Usage: StreamingTranslateApp ") 18 | 
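// four arguments are required here; they are destructured a few lines below as
// appName, bookPath, outputPath, and lang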
System.exit(1) 19 | } 20 | val Seq(appName, bookPath, outputPath, lang) = args.toSeq 21 | 22 | val dict = getDictionary(lang) 23 | 24 | val conf = new SparkConf() 25 | .setAppName(appName) 26 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 27 | val ssc = new StreamingContext(conf, Seconds(1)) 28 | 29 | val book = ssc.textFileStream(bookPath) 30 | val translated = book.map(line => line.split("\\s+").map(word => dict.getOrElse(word, word)).mkString(" ")) 31 | translated.saveAsTextFiles(outputPath) 32 | 33 | ssc.start() 34 | ssc.awaitTermination() 35 | } 36 | 37 | def getDictionary(lang: String): Map[String, String] = { 38 | if (!Set("German", "French", "Italian", "Spanish").contains(lang)) { 39 | System.err.println( 40 | "Unsupported language: %s".format(lang)) 41 | System.exit(1) 42 | } 43 | val url = "http://www.june29.com/IDP/files/%s.txt".format(lang) 44 | println("Grabbing dictionary from: %s".format(url)) 45 | Source.fromURL(url, "ISO-8859-1").mkString 46 | .split("\\r?\\n") 47 | .filter(line => !line.startsWith("#")) 48 | .map(line => line.split("\\t")) 49 | .map(tkns => (tkns(0).trim, tkns(1).trim)).toMap 50 | } 51 | 52 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap3/src/main/scala/org/apress/prospark/L3-DStreamAggregation.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext } 6 | import org.apache.hadoop.io.{ Text, LongWritable, IntWritable } 7 | import org.apache.hadoop.fs.Path 8 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat 9 | import org.apache.spark.streaming.dstream.DStream 10 | import org.apache.hadoop.mapred.TextOutputFormat 11 | import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat } 12 | import org.apache.spark.streaming.dstream.PairDStreamFunctions 13 | import org.apache.log4j.LogManager 14 | import org.json4s._ 15 | import org.json4s.native.JsonMethods._ 16 | import java.text.SimpleDateFormat 17 | import java.util.Date 18 | 19 | object RedditAggregationApp { 20 | def main(args: Array[String]) { 21 | if (args.length != 2) { 22 | System.err.println( 23 | "Usage: RedditAggregationApp ") 24 | System.exit(1) 25 | } 26 | val Seq(appName, inputPath) = args.toSeq 27 | val LOG = LogManager.getLogger(this.getClass) 28 | 29 | val conf = new SparkConf() 30 | .setAppName(appName) 31 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 32 | 33 | val ssc = new StreamingContext(conf, Seconds(1)) 34 | LOG.info("Started at %d".format(ssc.sparkContext.startTime)) 35 | 36 | val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) 37 | 38 | val recCount = comments.count() 39 | 40 | val recCountValue = comments.countByValue() 41 | 42 | val totalWords = comments.map(rec => ((parse(rec) \ "body").values.toString)) 43 | .flatMap(body => body.split(" ")) 44 | .map(word => 1) 45 | .reduce(_ + _) 46 | 47 | ssc.start() 48 | ssc.awaitTermination() 49 | 50 | } 51 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap4/src/main/scala/org/apress/prospark/L4-4Kryo.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import 
org.apache.hadoop.fs.Path 4 | import org.apache.hadoop.io.LongWritable 5 | import org.apache.hadoop.io.Text 6 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat 7 | import org.apache.spark.SparkConf 8 | import org.apache.spark.SparkContext 9 | import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions 10 | import org.apache.spark.streaming.Seconds 11 | import org.apache.spark.streaming.StreamingContext 12 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 13 | 14 | object VoyagerAppKryo { 15 | def main(args: Array[String]) { 16 | if (args.length != 3) { 17 | System.err.println( 18 | "Usage: VoyagerAppKryo ") 19 | System.exit(1) 20 | } 21 | val Seq(appName, inputPath, outputPath) = args.toSeq 22 | 23 | val conf = new SparkConf() 24 | .setAppName(appName) 25 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 26 | .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") 27 | .registerKryoClasses(Array(classOf[ProtonFlux])) 28 | 29 | val ssc = new StreamingContext(conf, Seconds(10)) 30 | 31 | val voyager1 = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) 32 | val projected = voyager1.map(rec => { 33 | val attrs = rec.split("\\s+") 34 | new ProtonFlux(attrs(0), attrs(18), attrs(19), attrs(20), attrs(21), 35 | attrs(22), attrs(23), attrs(24), attrs(25), attrs(26), attrs(27), 36 | attrs(28)) 37 | }) 38 | val filtered = projected.filter(pflux => pflux.isSolarStorm) 39 | val yearlyBreakdown = filtered.map(rec => (rec.year, 1)) 40 | .reduceByKey(_ + _) 41 | .transform(rec => rec.sortByKey(ascending = false)) 42 | yearlyBreakdown.saveAsTextFiles(outputPath) 43 | 44 | ssc.start() 45 | ssc.awaitTermination() 46 | } 47 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap5/src/main/scala/org/apress/prospark/L5-14KafkaCustomConf.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions 6 | import org.apache.spark.streaming.Seconds 7 | import org.apache.spark.streaming.StreamingContext 8 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 9 | import org.apache.spark.streaming.kafka.KafkaUtils 10 | import kafka.serializer.StringDecoder 11 | import org.apache.spark.storage.StorageLevel 12 | 13 | object StationJourneyCountCustomApp { 14 | 15 | def main(args: Array[String]) { 16 | if (args.length != 7) { 17 | System.err.println( 18 | "Usage: StationJourneyCountApp ") 19 | System.exit(1) 20 | } 21 | 22 | val Seq(appName, brokerUrl, topic, consumerGroupId, zkQuorum, checkpointDir, outputPath) = args.toSeq 23 | 24 | val conf = new SparkConf() 25 | .setAppName(appName) 26 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 27 | //.set("spark.streaming.receiver.writeAheadLog.enable", "true") 28 | 29 | val ssc = new StreamingContext(conf, Seconds(10)) 30 | ssc.checkpoint(checkpointDir) 31 | 32 | val topics = Map[String, Int]( 33 | topic -> 1) 34 | val params = Map[String, String]( 35 | "zookeeper.connect" -> zkQuorum, 36 | "group.id" -> consumerGroupId, 37 | "bootstrap.servers" -> brokerUrl) 38 | KafkaUtils.createStream[String, String, StringDecoder, StringDecoder](ssc, params, topics, StorageLevel.MEMORY_ONLY_SER).map(_._2) 39 | .map(rec => rec.split(",")) 40 | .map(rec => ((rec(3), 
rec(7)), 1)) 41 | .reduceByKey(_ + _) 42 | .repartition(1) 43 | .map(rec => (rec._2, rec._1)) 44 | .transform(rdd => rdd.sortByKey(ascending = false)) 45 | .saveAsTextFiles(outputPath) 46 | 47 | ssc.start() 48 | ssc.awaitTermination() 49 | } 50 | 51 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap8/src/main/scala/org/apress/prospark/L8-8Sql.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.sql.SQLContext 9 | import org.apache.spark.streaming.Seconds 10 | import org.apache.spark.streaming.StreamingContext 11 | 12 | object CdrSqlApp { 13 | 14 | case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, 15 | smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, 16 | callOutActivity: Float, internetTrafficActivity: Float) 17 | 18 | def main(args: Array[String]) { 19 | if (args.length != 4) { 20 | System.err.println( 21 | "Usage: CdrSqlApp ") 22 | System.exit(1) 23 | } 24 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 25 | 26 | val conf = new SparkConf() 27 | .setAppName(appName) 28 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 29 | 30 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 31 | 32 | val sqlC = new SQLContext(ssc.sparkContext) 33 | import sqlC.implicits._ 34 | 35 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 36 | .map(_.split("\\t", -1)) 37 | .foreachRDD(rdd => { 38 | val cdrs = seqToCdr(rdd).toDF() 39 | cdrs.registerTempTable("cdrs") 40 | 41 | sqlC.sql("SELECT countryCode, COUNT(countryCode) AS cCount FROM cdrs GROUP BY countryCode ORDER BY cCount DESC LIMIT 5").show() 42 | sqlC.dropTempTable("cdrs") 43 | }) 44 | 45 | ssc.start() 46 | ssc.awaitTermination() 47 | } 48 | 49 | def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { 50 | rdd.map(c => c.map(f => f match { 51 | case x if x.isEmpty() => "0" 52 | case x => x 53 | })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, 54 | c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) 55 | } 56 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap10/src/main/scala/org/apress/prospark/L10-2DataProc.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.HashPartitioner 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.streaming.Seconds 7 | import org.apache.spark.streaming.StreamingContext 8 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 9 | import org.json4s.DefaultFormats 10 | import org.json4s.JsonAST.JNothing 11 | import org.json4s.jvalue2extractable 12 | import org.json4s.jvalue2monadic 13 | import org.json4s.native.JsonMethods.parse 14 | import org.json4s.string2JsonInput 15 | 16 | object DataProcApp { 17 | 18 | def main(args: Array[String]) { 19 | if (args.length != 4) { 20 | System.err.println( 21 | "Usage: DataProcApp ") 22 | System.exit(1) 23 | } 24 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 25 | 26 | val conf = new SparkConf() 27 | .setAppName(appName) 28 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 29 | 30 | val ssc = new 
StreamingContext(conf, Seconds(batchInterval.toInt)) 31 | 32 | ssc.socketTextStream(hostname, port.toInt) 33 | .map(r => { 34 | implicit val formats = DefaultFormats 35 | parse(r) 36 | }) 37 | .filter(jvalue => { 38 | jvalue \ "attributes" \ "Wi-Fi" != JNothing 39 | }) 40 | .map(jvalue => { 41 | implicit val formats = DefaultFormats 42 | ((jvalue \ "attributes" \ "Wi-Fi").extract[String], (jvalue \ "stars").extract[Int]) 43 | }) 44 | .combineByKey( 45 | (v) => (v, 1), 46 | (accValue: (Int, Int), v) => (accValue._1 + v, accValue._2 + 1), 47 | (accCombine1: (Int, Int), accCombine2: (Int, Int)) => (accCombine1._1 + accCombine2._1, accCombine1._2 + accCombine2._2), 48 | new HashPartitioner(ssc.sparkContext.defaultParallelism)) 49 | .map({ case (k, v) => (k, v._1 / v._2.toFloat) }) 50 | .print() 51 | 52 | ssc.start() 53 | ssc.awaitTermination() 54 | } 55 | 56 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap6/spark.sbt: -------------------------------------------------------------------------------- 1 | import AssemblyKeys._ 2 | 3 | assemblySettings 4 | 5 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { mergeStrategy => { 6 | case entry => { 7 | val strategy = mergeStrategy(entry) 8 | if (strategy == MergeStrategy.deduplicate) MergeStrategy.first 9 | else strategy 10 | } 11 | }} 12 | 13 | name := "Chap6" 14 | 15 | version := "1.0" 16 | 17 | scalaVersion := "2.10.5" 18 | 19 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.4.0" 20 | 21 | libraryDependencies += "org.apache.spark" %% "spark-streaming" % "1.4.0" 22 | 23 | libraryDependencies += "org.json4s" %% "json4s-native" % "3.2.10" 24 | 25 | libraryDependencies += "org.apache.spark" %% "spark-streaming-mqtt" % "1.4.0" 26 | 27 | libraryDependencies += "org.eclipse.paho" % "org.eclipse.paho.client.mqttv3" % "1.0.1" 28 | 29 | libraryDependencies += "org.apache.httpcomponents" % "httpclient" % "4.5.1" 30 | 31 | libraryDependencies += "org.apache.commons" % "commons-pool2" % "2.4.2" 32 | 33 | libraryDependencies += "org.apache.hbase" % "hbase" % "0.98.15-hadoop2" 34 | 35 | //libraryDependencies += "org.apache.hbase" % "hbase-client" % "1.1.2" 36 | 37 | //libraryDependencies += "org.apache.hbase" % "hbase-server" % "1.1.2" 38 | 39 | //libraryDependencies += "org.apache.hbase" % "hbase-common" % "1.1.2" 40 | 41 | libraryDependencies += "org.apache.hbase" % "hbase-client" % "2.0.0-SNAPSHOT" 42 | 43 | libraryDependencies += "org.apache.hbase" % "hbase-server" % "2.0.0-SNAPSHOT" 44 | 45 | libraryDependencies += "org.apache.hbase" % "hbase-common" % "2.0.0-SNAPSHOT" 46 | 47 | libraryDependencies += "org.apache.hbase" % "hbase-spark" % "2.0.0-SNAPSHOT" 48 | 49 | resolvers += "Apache Snapshot Repository" at "https://repository.apache.org/content/repositories/snapshots" 50 | 51 | libraryDependencies += "org.apache.cassandra" % "cassandra-all" % "2.1.11" 52 | 53 | libraryDependencies += "com.datastax.spark" %% "spark-cassandra-connector" % "1.4.0" 54 | 55 | libraryDependencies += "redis.clients" % "jedis" % "2.7.3" 56 | 57 | resolvers += "MQTT Repository" at "https://repo.eclipse.org/content/repositories/paho-releases/" 58 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap9/src/main/scala/org/apress/prospark/L9-1LinearRegression.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import 
org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.linalg.Vectors 6 | import org.apache.spark.mllib.regression.LabeledPoint 7 | import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD 8 | import org.apache.spark.rdd.RDD 9 | import org.apache.spark.rdd.RDD.doubleRDDToDoubleRDDFunctions 10 | import org.apache.spark.streaming.Seconds 11 | import org.apache.spark.streaming.StreamingContext 12 | 13 | object LinearRegressionApp { 14 | 15 | def main(args: Array[String]) { 16 | if (args.length != 4) { 17 | System.err.println( 18 | "Usage: LinearRegressionApp ") 19 | System.exit(1) 20 | } 21 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 22 | 23 | val conf = new SparkConf() 24 | .setAppName(appName) 25 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 26 | 27 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 28 | 29 | val substream = ssc.socketTextStream(hostname, port.toInt) 30 | .filter(!_.contains("NaN")) 31 | .map(_.split(" ")) 32 | .filter(f => f(1) != "0") 33 | 34 | val datastream = substream.map(f => Array(f(2).toDouble, f(3).toDouble, f(4).toDouble, f(5).toDouble, f(6).toDouble)) 35 | .map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5)))) 36 | val test = datastream.transform(rdd => rdd.randomSplit(Array(0.3, 0.7))(0)) 37 | val train = datastream.transformWith(test, (r1: RDD[LabeledPoint], r2: RDD[LabeledPoint]) => r1.subtract(r2)).cache() 38 | val model = new StreamingLinearRegressionWithSGD() 39 | .setInitialWeights(Vectors.zeros(4)) 40 | .setStepSize(0.0001) 41 | .setNumIterations(1) 42 | 43 | model.trainOn(train) 44 | model.predictOnValues(test.map(v => (v.label, v.features))).foreachRDD(rdd => println("MSE: %f".format(rdd 45 | .map(v => math.pow((v._1 - v._2), 2)).mean()))) 46 | 47 | ssc.start() 48 | ssc.awaitTermination() 49 | } 50 | 51 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap9/src/main/scala/org/apress/prospark/L9-10KMeans.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.clustering.StreamingKMeans 6 | import org.apache.spark.mllib.linalg.Vectors 7 | import org.apache.spark.mllib.regression.LabeledPoint 8 | import org.apache.spark.rdd.RDD 9 | import org.apache.spark.rdd.RDD.doubleRDDToDoubleRDDFunctions 10 | import org.apache.spark.streaming.Seconds 11 | import org.apache.spark.streaming.StreamingContext 12 | 13 | object KMeansClusteringApp { 14 | 15 | def main(args: Array[String]) { 16 | if (args.length != 4) { 17 | System.err.println( 18 | "Usage: KMeansClusteringApp ") 19 | System.exit(1) 20 | } 21 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 22 | 23 | val conf = new SparkConf() 24 | .setAppName(appName) 25 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 26 | 27 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 28 | 29 | val substream = ssc.socketTextStream(hostname, port.toInt) 30 | .filter(!_.contains("NaN")) 31 | .map(_.split(" ")) 32 | .filter(f => f(1) != "0") 33 | 34 | val orientationStream = substream 35 | .map(f => Seq(1, 4, 5, 6, 10, 11, 12, 20, 21, 22, 26, 27, 28, 36, 37, 38, 42, 43, 44).map(i => f(i)).toArray) 36 | .map(arr => arr.map(_.toDouble)) 37 | .filter(f => f(0) == 1.0 || f(0) == 2.0 || f(0) == 3.0) 38 | .map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, f.length)))) 39 | val test 
= orientationStream.transform(rdd => rdd.randomSplit(Array(0.3, 0.7))(0)) 40 | val train = orientationStream.transformWith(test, (r1: RDD[LabeledPoint], r2: RDD[LabeledPoint]) => r1.subtract(r2)).cache() 41 | val model = new StreamingKMeans() 42 | .setK(3) 43 | .setDecayFactor(0) 44 | .setRandomCenters(18, 0.0) 45 | 46 | model.trainOn(train.map(v => v.features)) 47 | val prediction = model.predictOnValues(test.map(v => (v.label, v.features))) 48 | 49 | ssc.start() 50 | ssc.awaitTermination() 51 | } 52 | 53 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap5/src/main/java/org/apress/prospark/AbstractDriver.java: -------------------------------------------------------------------------------- 1 | package org.apress.prospark; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.IOException; 6 | import java.io.InputStreamReader; 7 | import java.util.Enumeration; 8 | import java.util.zip.ZipEntry; 9 | import java.util.zip.ZipFile; 10 | 11 | import org.apache.log4j.LogManager; 12 | import org.apache.log4j.Logger; 13 | 14 | public abstract class AbstractDriver { 15 | 16 | private static final Logger LOG = LogManager.getLogger(AbstractDriver.class); 17 | 18 | private String path; 19 | 20 | public AbstractDriver(String path) { 21 | this.path = path; 22 | } 23 | 24 | public abstract void init() throws Exception; 25 | 26 | public abstract void close() throws Exception; 27 | 28 | public abstract void sendRecord(String record) throws Exception; 29 | 30 | public void execute() throws Exception { 31 | 32 | try { 33 | init(); 34 | File dirPath = new File(path); 35 | if (dirPath.isDirectory()) { 36 | File[] files = new File(path).listFiles(); 37 | for (File f : files) { 38 | LOG.info(String.format("Feeding zipped file %s", f.getName())); 39 | ZipFile zFile = null; 40 | try { 41 | zFile = new ZipFile(f); 42 | Enumeration zEntries = zFile.entries(); 43 | 44 | while (zEntries.hasMoreElements()) { 45 | ZipEntry zEntry = zEntries.nextElement(); 46 | LOG.info(String.format("Feeding file %s", zEntry.getName())); 47 | try (BufferedReader br = new BufferedReader( 48 | new InputStreamReader(zFile.getInputStream(zEntry)))) { 49 | // skip header 50 | br.readLine(); 51 | String line; 52 | while ((line = br.readLine()) != null) { 53 | sendRecord(line); 54 | } 55 | } 56 | } 57 | } catch (IOException e) { 58 | LOG.error(e.getMessage()); 59 | } finally { 60 | if (zFile != null) { 61 | try { 62 | zFile.close(); 63 | } catch (IOException e) { 64 | LOG.error(e.getMessage()); 65 | } 66 | } 67 | } 68 | } 69 | } else { 70 | LOG.error(String.format("Path %s is not a directory", path)); 71 | } 72 | } finally { 73 | close(); 74 | } 75 | } 76 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap6/src/main/java/org/apress/prospark/AbstractDriver.java: -------------------------------------------------------------------------------- 1 | package org.apress.prospark; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.IOException; 6 | import java.io.InputStreamReader; 7 | import java.util.Enumeration; 8 | import java.util.zip.ZipEntry; 9 | import java.util.zip.ZipFile; 10 | 11 | import org.apache.log4j.LogManager; 12 | import org.apache.log4j.Logger; 13 | 14 | public abstract class AbstractDriver { 15 | 16 | private static final Logger LOG = LogManager.getLogger(AbstractDriver.class); 17 | 18 | private String path; 19 | 20 | public AbstractDriver(String 
path) { 21 | this.path = path; 22 | } 23 | 24 | public abstract void init() throws Exception; 25 | 26 | public abstract void close() throws Exception; 27 | 28 | public abstract void sendRecord(String record) throws Exception; 29 | 30 | public void execute() throws Exception { 31 | 32 | try { 33 | init(); 34 | File dirPath = new File(path); 35 | if (dirPath.isDirectory()) { 36 | File[] files = new File(path).listFiles(); 37 | for (File f : files) { 38 | LOG.info(String.format("Feeding zipped file %s", f.getName())); 39 | ZipFile zFile = null; 40 | try { 41 | zFile = new ZipFile(f); 42 | Enumeration zEntries = zFile.entries(); 43 | 44 | while (zEntries.hasMoreElements()) { 45 | ZipEntry zEntry = zEntries.nextElement(); 46 | LOG.info(String.format("Feeding file %s", zEntry.getName())); 47 | try (BufferedReader br = new BufferedReader( 48 | new InputStreamReader(zFile.getInputStream(zEntry)))) { 49 | // skip header 50 | br.readLine(); 51 | String line; 52 | while ((line = br.readLine()) != null) { 53 | sendRecord(line); 54 | } 55 | } 56 | } 57 | } catch (IOException e) { 58 | LOG.error(e.getMessage()); 59 | } finally { 60 | if (zFile != null) { 61 | try { 62 | zFile.close(); 63 | } catch (IOException e) { 64 | LOG.error(e.getMessage()); 65 | } 66 | } 67 | } 68 | } 69 | } else { 70 | LOG.error(String.format("Path %s is not a directory", path)); 71 | } 72 | } finally { 73 | close(); 74 | } 75 | } 76 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap5/src/main/java/org/apress/prospark/MqttDriver.java: -------------------------------------------------------------------------------- 1 | package org.apress.prospark; 2 | 3 | import java.nio.charset.StandardCharsets; 4 | 5 | import org.apache.log4j.LogManager; 6 | import org.apache.log4j.Logger; 7 | import org.eclipse.paho.client.mqttv3.MqttClient; 8 | import org.eclipse.paho.client.mqttv3.MqttException; 9 | import org.eclipse.paho.client.mqttv3.MqttMessage; 10 | import org.eclipse.paho.client.mqttv3.MqttTopic; 11 | import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence; 12 | 13 | public class MqttDriver extends AbstractDriver { 14 | 15 | private static final Logger LOG = LogManager.getLogger(MqttDriver.class); 16 | 17 | private final String brokerUrl; 18 | private final String topic; 19 | private MqttClient client; 20 | private MqttTopic mqttTopic; 21 | 22 | public MqttDriver(String path, String brokerUrl, String topic) { 23 | super(path); 24 | this.brokerUrl = brokerUrl; 25 | this.topic = topic; 26 | } 27 | 28 | @Override 29 | public void init() throws Exception { 30 | client = new MqttClient(brokerUrl, MqttClient.generateClientId(), new MemoryPersistence()); 31 | LOG.info(String.format("Attempting to connect to broker %s", brokerUrl)); 32 | client.connect(); 33 | mqttTopic = client.getTopic(topic); 34 | LOG.info(String.format("Connected to broker %s", brokerUrl)); 35 | } 36 | 37 | @Override 38 | public void close() throws Exception { 39 | if (client != null) { 40 | client.disconnect(); 41 | } 42 | } 43 | 44 | @Override 45 | public void sendRecord(String record) throws Exception { 46 | try { 47 | mqttTopic.publish(new MqttMessage(record.getBytes(StandardCharsets.UTF_8))); 48 | } catch (MqttException e) { 49 | if (e.getReasonCode() == MqttException.REASON_CODE_MAX_INFLIGHT) { 50 | Thread.sleep(10); 51 | } 52 | } 53 | } 54 | 55 | public static void main(String[] args) throws Exception { 56 | 57 | if (args.length != 3) { 58 | System.err.println("Usage:MqttDriver "); 59 | 
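// three arguments are required; they are read below as path (args[0]), brokerUrl (args[1]) and topic (args[2]).
// An illustrative launch (jar name, dataset path, broker URL and topic are placeholders, not from the original listing):
//   java -cp chap5-assembly.jar org.apress.prospark.MqttDriver /data/bikeshare-zips tcp://localhost:1883 bike-share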
System.exit(-1); 60 | } 61 | 62 | String path = args[0]; 63 | String brokerUrl = args[1]; 64 | String topic = args[2]; 65 | 66 | MqttDriver driver = new MqttDriver(path, brokerUrl, topic); 67 | try { 68 | driver.execute(); 69 | } finally { 70 | driver.close(); 71 | } 72 | } 73 | 74 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap6/src/main/java/org/apress/prospark/MqttDriver.java: -------------------------------------------------------------------------------- 1 | package org.apress.prospark; 2 | 3 | import java.nio.charset.StandardCharsets; 4 | 5 | import org.apache.log4j.LogManager; 6 | import org.apache.log4j.Logger; 7 | import org.eclipse.paho.client.mqttv3.MqttClient; 8 | import org.eclipse.paho.client.mqttv3.MqttException; 9 | import org.eclipse.paho.client.mqttv3.MqttMessage; 10 | import org.eclipse.paho.client.mqttv3.MqttTopic; 11 | import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence; 12 | 13 | public class MqttDriver extends AbstractDriver { 14 | 15 | private static final Logger LOG = LogManager.getLogger(MqttDriver.class); 16 | 17 | private final String brokerUrl; 18 | private final String topic; 19 | private MqttClient client; 20 | private MqttTopic mqttTopic; 21 | 22 | public MqttDriver(String path, String brokerUrl, String topic) { 23 | super(path); 24 | this.brokerUrl = brokerUrl; 25 | this.topic = topic; 26 | } 27 | 28 | @Override 29 | public void init() throws Exception { 30 | client = new MqttClient(brokerUrl, MqttClient.generateClientId(), new MemoryPersistence()); 31 | LOG.info(String.format("Attempting to connect to broker %s", brokerUrl)); 32 | client.connect(); 33 | mqttTopic = client.getTopic(topic); 34 | LOG.info(String.format("Connected to broker %s", brokerUrl)); 35 | } 36 | 37 | @Override 38 | public void close() throws Exception { 39 | if (client != null) { 40 | client.disconnect(); 41 | } 42 | } 43 | 44 | @Override 45 | public void sendRecord(String record) throws Exception { 46 | try { 47 | mqttTopic.publish(new MqttMessage(record.getBytes(StandardCharsets.UTF_8))); 48 | } catch (MqttException e) { 49 | if (e.getReasonCode() == MqttException.REASON_CODE_MAX_INFLIGHT) { 50 | Thread.sleep(10); 51 | } 52 | } 53 | } 54 | 55 | public static void main(String[] args) throws Exception { 56 | 57 | if (args.length != 3) { 58 | System.err.println("Usage:MqttDriver "); 59 | System.exit(-1); 60 | } 61 | 62 | String path = args[0]; 63 | String brokerUrl = args[1]; 64 | String topic = args[2]; 65 | 66 | MqttDriver driver = new MqttDriver(path, brokerUrl, topic); 67 | try { 68 | driver.execute(); 69 | } finally { 70 | driver.close(); 71 | } 72 | } 73 | 74 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap7/src/main/scala/org/apress/prospark/L7-2-3Tachyon.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions 6 | import org.apache.spark.storage.StorageLevel 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | import org.apache.spark.streaming.dstream.DStream 10 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 11 | 12 | object ReferrerApp { 13 | def main(args: Array[String]) { 14 | if (args.length != 7) { 15 | System.err.println( 16 | "Usage: ReferrerApp 
") 17 | System.exit(1) 18 | } 19 | val Seq(appName, hostname, port, tachyonUrl, checkpointDir, outputPathTop, outputPathSpark) = args.toSeq 20 | 21 | val conf = new SparkConf() 22 | .setAppName(appName) 23 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 24 | .set("spark.externalBlockStore.url", tachyonUrl) 25 | 26 | val ssc = new StreamingContext(conf, Seconds(10)) 27 | ssc.checkpoint(checkpointDir) 28 | 29 | val clickstream = ssc.socketTextStream(hostname, port.toInt) 30 | .map(rec => rec.split("\\t")) 31 | .persist(StorageLevel.OFF_HEAP) 32 | 33 | val topRefStream = clickstream 34 | .map(rec => { 35 | var prev_title = rec(3) 36 | if (!prev_title.startsWith("other")) { 37 | prev_title = "wikipedia" 38 | } 39 | (prev_title, 1) 40 | }) 41 | 42 | val topSparkStream = clickstream 43 | .filter(rec => rec(4).equals("Apache_Spark")) 44 | .map(rec => (rec(3), 1)) 45 | 46 | saveTopKeys(topRefStream, outputPathTop) 47 | 48 | saveTopKeys(topSparkStream, outputPathSpark) 49 | 50 | ssc.start() 51 | ssc.awaitTermination() 52 | } 53 | 54 | def saveTopKeys(clickstream: DStream[(String, Int)], outputPath: String) { 55 | clickstream.updateStateByKey((values, state: Option[Int]) => Some(values.sum + state.getOrElse(0))) 56 | .repartition(1) 57 | .map(rec => (rec._2, rec._1)) 58 | .transform(rec => rec.sortByKey(ascending = false)) 59 | .saveAsTextFiles(outputPath) 60 | } 61 | 62 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap8/src/main/scala/org/apress/prospark/L8-13HiveQL.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.sql.hive.HiveContext 9 | import org.apache.spark.streaming.Seconds 10 | import org.apache.spark.streaming.StreamingContext 11 | 12 | object CdrHiveqlApp { 13 | 14 | case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, 15 | smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, 16 | callOutActivity: Float, internetTrafficActivity: Float) 17 | 18 | def main(args: Array[String]) { 19 | if (args.length != 4) { 20 | System.err.println( 21 | "Usage: CdrHiveqlApp ") 22 | System.exit(1) 23 | } 24 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 25 | 26 | val conf = new SparkConf() 27 | .setAppName(appName) 28 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 29 | 30 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 31 | 32 | val cl = Thread.currentThread().getContextClassLoader() 33 | val hiveC = new HiveContext(ssc.sparkContext) 34 | Thread.currentThread().setContextClassLoader(cl) 35 | 36 | import hiveC.implicits._ 37 | 38 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 39 | .map(_.split("\\t", -1)) 40 | .foreachRDD(rdd => { 41 | seqToCdr(rdd).toDF().registerTempTable("cdrs") 42 | 43 | hiveC.sql("SET DATE_FMT='yy-MM-dd|HH'") 44 | hiveC.sql("SELECT from_unixtime(timeInterval, ${hiveconf:DATE_FMT}) AS TS, SUM(smsInActivity + smsOutActivity + callInActivity + callOutActivity + internetTrafficActivity) AS Activity FROM cdrs GROUP BY from_unixtime(timeInterval, ${hiveconf:DATE_FMT}) ORDER BY Activity DESC").show() 45 | }) 46 | 47 | ssc.start() 48 | ssc.awaitTermination() 49 | } 50 | 51 | def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { 52 | rdd.map(c => c.map(f => f match { 53 | case 
x if x.isEmpty() => "0" 54 | case x => x 55 | })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, 56 | c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) 57 | } 58 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap9/src/main/scala/org/apress/prospark/L9-9LogisticRegression.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.linalg.Vectors 6 | import org.apache.spark.mllib.regression.LabeledPoint 7 | import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD 8 | import org.apache.spark.rdd.RDD 9 | import org.apache.spark.rdd.RDD.doubleRDDToDoubleRDDFunctions 10 | import org.apache.spark.streaming.Seconds 11 | import org.apache.spark.streaming.StreamingContext 12 | import org.apache.spark.mllib.classification.StreamingLogisticRegressionWithSGD 13 | 14 | object LogisticRegressionApp { 15 | 16 | def main(args: Array[String]) { 17 | if (args.length != 4) { 18 | System.err.println( 19 | "Usage: LogisticRegressionApp ") 20 | System.exit(1) 21 | } 22 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 23 | 24 | val conf = new SparkConf() 25 | .setAppName(appName) 26 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 27 | 28 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 29 | 30 | val substream = ssc.socketTextStream(hostname, port.toInt) 31 | .filter(!_.contains("NaN")) 32 | .map(_.split(" ")) 33 | .filter(f => f(1) != "0") 34 | 35 | val datastream = substream.map(f => Array(f(1).toDouble, f(2).toDouble, f(4).toDouble, f(5).toDouble, f(6).toDouble)) 36 | 37 | val walkingOrRunning = datastream.filter(f => f(0) == 4.0 || f(0) == 5.0).map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5)))) 38 | val test = walkingOrRunning.transform(rdd => rdd.randomSplit(Array(0.3, 0.7))(0)) 39 | val train = walkingOrRunning.transformWith(test, (r1: RDD[LabeledPoint], r2: RDD[LabeledPoint]) => r1.subtract(r2)).cache() 40 | val model = new StreamingLogisticRegressionWithSGD() 41 | .setInitialWeights(Vectors.zeros(4)) 42 | .setStepSize(0.0001) 43 | .setNumIterations(1) 44 | 45 | model.trainOn(train) 46 | model.predictOnValues(test.map(v => (v.label, v.features))).foreachRDD(rdd => println("MSE: %f".format(rdd 47 | .map(v => math.pow((v._1 - v._2), 2)).mean()))) 48 | 49 | ssc.start() 50 | ssc.awaitTermination() 51 | } 52 | 53 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap8/src/main/scala/org/apress/prospark/L8-28DataFrameExamplesOps.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.sql.DataFrame 9 | import org.apache.spark.sql.SQLContext 10 | import org.apache.spark.streaming.Seconds 11 | import org.apache.spark.streaming.StreamingContext 12 | 13 | object CdrDataframeExamples2App { 14 | 15 | case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, 16 | smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, 17 | callOutActivity: Float, internetTrafficActivity: Float) 18 | 19 | def main(args: Array[String]) { 20 | if (args.length != 4) { 21 | 
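// four arguments are expected; they are destructured below as appName, batchInterval, hostname, and port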
System.err.println( 22 | "Usage: CdrDataframeExamples2App ") 23 | System.exit(1) 24 | } 25 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 26 | 27 | val conf = new SparkConf() 28 | .setAppName(appName) 29 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 30 | 31 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 32 | 33 | val sqlC = new SQLContext(ssc.sparkContext) 34 | import sqlC.implicits._ 35 | 36 | var previousCdrs: Option[DataFrame] = None 37 | 38 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 39 | .map(_.split("\\t", -1)) 40 | .foreachRDD(rdd => { 41 | val cdrs = seqToCdr(rdd).toDF().select("squareId", "countryCode").dropDuplicates() 42 | previousCdrs match { 43 | case Some(prevCdrs) => cdrs.unionAll(prevCdrs).show() 44 | //case Some(prevCdrs) => cdrs.intersect(prevCdrs).show() 45 | //case Some(prevCdrs) => cdrs.except(prevCdrs).show() 46 | case None => Unit 47 | } 48 | previousCdrs = Some(cdrs) 49 | }) 50 | 51 | ssc.start() 52 | ssc.awaitTermination() 53 | } 54 | 55 | def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { 56 | rdd.map(c => c.map(f => f match { 57 | case x if x.isEmpty() => "0" 58 | case x => x 59 | })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, 60 | c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) 61 | } 62 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap8/src/main/scala/org/apress/prospark/L8-3-6-7DataFrameCreation.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.sql.SQLContext 9 | import org.apache.spark.sql.functions.desc 10 | import org.apache.spark.streaming.Seconds 11 | import org.apache.spark.streaming.StreamingContext 12 | import org.json4s.native.Serialization.write 13 | import org.json4s.DefaultFormats 14 | 15 | object DataframeCreationApp { 16 | 17 | case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, 18 | smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, 19 | callOutActivity: Float, internetTrafficActivity: Float) 20 | 21 | def main(args: Array[String]) { 22 | if (args.length != 4) { 23 | System.err.println( 24 | "Usage: CdrDataframeApp ") 25 | System.exit(1) 26 | } 27 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 28 | 29 | val conf = new SparkConf() 30 | .setAppName(appName) 31 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 32 | 33 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 34 | 35 | val sqlC = new SQLContext(ssc.sparkContext) 36 | import sqlC.implicits._ 37 | 38 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 39 | .map(_.split("\\t", -1)) 40 | .foreachRDD(rdd => { 41 | //val cdrs = sqlC.createDataFrame(seqToCdr(rdd)) 42 | //val cdrs = sqlC.createDataFrame(seqToCdr(rdd).collect()) 43 | //val cdrs = seqToCdr(rdd).toDF() 44 | val cdrsJson = seqToCdr(rdd).map(r => { 45 | implicit val formats = DefaultFormats 46 | write(r) 47 | }) 48 | val cdrs = sqlC.read.json(cdrsJson) 49 | 50 | cdrs.groupBy("countryCode").count().orderBy(desc("count")).show(5) 51 | }) 52 | 53 | ssc.start() 54 | ssc.awaitTermination() 55 | 56 | } 57 | 58 | def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { 59 | rdd.map(c => c.map(f => f match { 60 | case x if x.isEmpty() => "0" 61 | 
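// Non-empty fields pass through unchanged; the preceding case substitutes "0" for empty CDR fields
// so the toInt/toLong/toFloat conversions below do not fail on blank columns.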
case x => x 62 | })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, 63 | c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) 64 | } 65 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap6/src/main/scala/org/apress/prospark/L6-5Exception.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import java.nio.charset.StandardCharsets 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | import org.eclipse.paho.client.mqttv3.MqttClient 10 | import org.eclipse.paho.client.mqttv3.MqttMessage 11 | import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence 12 | import org.json4s.DefaultFormats 13 | import org.json4s.JField 14 | import org.json4s.JsonAST.JObject 15 | import org.json4s.jvalue2extractable 16 | import org.json4s.jvalue2monadic 17 | import org.json4s.native.JsonMethods.parse 18 | import org.json4s.string2JsonInput 19 | 20 | object MqttSinkAppA { 21 | 22 | def main(args: Array[String]) { 23 | if (args.length != 3) { 24 | System.err.println( 25 | "Usage: MqttSinkApp ") 26 | System.exit(1) 27 | } 28 | 29 | val Seq(appName, outputBrokerUrl, topic) = args.toSeq 30 | 31 | val conf = new SparkConf() 32 | .setAppName(appName) 33 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 34 | 35 | val batchInterval = 10 36 | 37 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 38 | 39 | HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 40 | interval = batchInterval) 41 | .flatMap(rec => { 42 | val query = parse(rec) \ "query" 43 | ((query \ "results" \ "quote").children).map(rec => JObject(JField("Timestamp", query \ "created")).merge(rec)) 44 | }) 45 | .map(rec => { 46 | implicit val formats = DefaultFormats 47 | rec.children.map(f => f.extract[String]) mkString "," 48 | }) 49 | .foreachRDD { rdd => 50 | val client = new MqttClient(outputBrokerUrl, MqttClient.generateClientId(), new MemoryPersistence()) 51 | client.connect() 52 | rdd.foreach(rec => client.publish(topic, new MqttMessage(rec.getBytes(StandardCharsets.UTF_8)))) 53 | client.disconnect() 54 | client.close() 55 | } 56 | 57 | ssc.start() 58 | ssc.awaitTermination() 59 | } 60 | 61 | } 62 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap10/src/main/scala/org/apress/prospark/L10-9Graph.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.graphx.Edge 6 | import org.apache.spark.graphx.Graph 7 | import org.apache.spark.graphx.Graph.graphToGraphOps 8 | import org.apache.spark.streaming.Seconds 9 | import org.apache.spark.streaming.StreamingContext 10 | import org.json4s.DefaultFormats 11 | import org.json4s.jvalue2extractable 12 | import org.json4s.jvalue2monadic 13 | import org.json4s.native.JsonMethods.parse 14 | import org.json4s.string2JsonInput 15 | 16 | object UserRankApp { 17 | 18 | def main(args: Array[String]) { 19 | if (args.length != 4) { 20 | 
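// Expected arguments, going by the Seq destructuring below, appear to be
// <appName> <batchInterval> <hostname> <port>.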
System.err.println( 21 | "Usage: UserRankApp ") 22 | System.exit(1) 23 | } 24 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 25 | 26 | val conf = new SparkConf() 27 | .setAppName(appName) 28 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 29 | 30 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 31 | 32 | ssc.socketTextStream(hostname, port.toInt) 33 | .map(r => { 34 | implicit val formats = DefaultFormats 35 | parse(r) 36 | }) 37 | .foreachRDD(rdd => { 38 | val edges = rdd.map(jvalue => { 39 | implicit val formats = DefaultFormats 40 | ((jvalue \ "user_id").extract[String], (jvalue \ "friends").extract[Array[String]]) 41 | }) 42 | .flatMap(r => r._2.map(f => Edge(r._1.hashCode.toLong, f.hashCode.toLong, 1.0))) 43 | 44 | val vertices = rdd.map(jvalue => { 45 | implicit val formats = DefaultFormats 46 | ((jvalue \ "user_id").extract[String]) 47 | }) 48 | .map(r => (r.hashCode.toLong, r)) 49 | 50 | val tolerance = 0.0001 51 | val graph = Graph(vertices, edges, "defaultUser") 52 | .subgraph(vpred = (id, idStr) => idStr != "defaultUser") 53 | val pr = graph.pageRank(tolerance).cache 54 | 55 | graph.outerJoinVertices(pr.vertices) { 56 | (userId, attrs, rank) => (rank.getOrElse(0.0).asInstanceOf[Number].doubleValue, attrs) 57 | }.vertices.top(10) { 58 | Ordering.by(_._2._1) 59 | }.foreach(rec => println("User id: %s, Rank: %f".format(rec._2._2, rec._2._1))) 60 | }) 61 | 62 | ssc.start() 63 | ssc.awaitTermination() 64 | 65 | } 66 | 67 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap6/src/main/scala/org/apress/prospark/L6-7PerPartition.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import java.nio.charset.StandardCharsets 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | import org.eclipse.paho.client.mqttv3.MqttClient 10 | import org.eclipse.paho.client.mqttv3.MqttMessage 11 | import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence 12 | import org.json4s.DefaultFormats 13 | import org.json4s.JField 14 | import org.json4s.JsonAST.JObject 15 | import org.json4s.jvalue2extractable 16 | import org.json4s.jvalue2monadic 17 | import org.json4s.native.JsonMethods.parse 18 | import org.json4s.string2JsonInput 19 | 20 | object MqttSinkAppC { 21 | 22 | def main(args: Array[String]) { 23 | if (args.length != 3) { 24 | System.err.println( 25 | "Usage: MqttSinkApp ") 26 | System.exit(1) 27 | } 28 | 29 | val Seq(appName, outputBrokerUrl, topic) = args.toSeq 30 | 31 | val conf = new SparkConf() 32 | .setAppName(appName) 33 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 34 | 35 | val batchInterval = 10 36 | 37 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 38 | 39 | HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 40 | interval = batchInterval) 41 | .flatMap(rec => { 42 | val query = parse(rec) \ "query" 43 | ((query \ "results" \ "quote").children).map(rec => JObject(JField("Timestamp", query \ "created")).merge(rec)) 44 | }) 45 | .map(rec => { 46 | implicit val formats = DefaultFormats 47 | rec.children.map(f => 
f.extract[String]) mkString "," 48 | }) 49 | .foreachRDD { rdd => 50 | rdd.foreachPartition { par => 51 | val client = new MqttClient(outputBrokerUrl, MqttClient.generateClientId(), new MemoryPersistence()) 52 | client.connect() 53 | par.foreach(rec => client.publish(topic, new MqttMessage(rec.getBytes(StandardCharsets.UTF_8)))) 54 | client.disconnect() 55 | client.close() 56 | } 57 | } 58 | 59 | ssc.start() 60 | ssc.awaitTermination() 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap5/src/main/scala/org/apress/prospark/HttpInputDStreamAsync.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.ClassTag 4 | 5 | import org.apache.spark.Logging 6 | import org.apache.spark.storage.StorageLevel 7 | import org.apache.spark.streaming.StreamingContext 8 | import org.apache.spark.streaming.api.java.JavaDStream 9 | import org.apache.spark.streaming.api.java.JavaDStream.fromDStream 10 | import org.apache.spark.streaming.api.java.JavaStreamingContext 11 | import org.apache.spark.streaming.dstream.DStream 12 | import org.apache.spark.streaming.dstream.ReceiverInputDStream 13 | import org.apache.spark.streaming.receiver.Receiver 14 | 15 | import com.ning.http.client.AsyncCompletionHandler 16 | import com.ning.http.client.AsyncHttpClient 17 | import com.ning.http.client.Response 18 | 19 | class HttpInputDStreamAsync( 20 | @transient ssc_ : StreamingContext, 21 | storageLevel: StorageLevel, 22 | url: String) extends ReceiverInputDStream[String](ssc_) with Logging { 23 | 24 | def getReceiver(): Receiver[String] = { 25 | new HttpReceiverAsync(storageLevel, url) 26 | } 27 | } 28 | 29 | class HttpReceiverAsync( 30 | storageLevel: StorageLevel, 31 | url: String) extends Receiver[String](storageLevel) with Logging { 32 | 33 | var asyncHttpClient: AsyncHttpClient = _ 34 | 35 | def onStop() { 36 | asyncHttpClient.close() 37 | logInfo("Disconnected from Http Server") 38 | } 39 | 40 | def onStart() { 41 | asyncHttpClient = new AsyncHttpClient() 42 | asyncHttpClient.prepareGet(url).execute(new AsyncCompletionHandler[Response]() { 43 | 44 | override def onCompleted(response: Response): Response = { 45 | store(response.getResponseBody) 46 | return response 47 | } 48 | 49 | override def onThrowable(t: Throwable) { 50 | restart("Error! 
Problems while connecting", t) 51 | } 52 | }); 53 | logInfo("Http Connection initiated") 54 | } 55 | 56 | } 57 | 58 | object HttpUtilsAsync { 59 | def createStream( 60 | ssc: StreamingContext, 61 | storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2, 62 | url: String): DStream[String] = { 63 | new HttpInputDStreamAsync(ssc, storageLevel, url) 64 | } 65 | 66 | def createStream( 67 | jssc: JavaStreamingContext, 68 | storageLevel: StorageLevel, 69 | url: String): JavaDStream[String] = { 70 | implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[String]] 71 | createStream(jssc.ssc, storageLevel, url) 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap6/src/main/scala/org/apress/prospark/L6-6PerRecord.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import java.nio.charset.StandardCharsets 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | import org.eclipse.paho.client.mqttv3.MqttClient 10 | import org.eclipse.paho.client.mqttv3.MqttMessage 11 | import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence 12 | import org.json4s.DefaultFormats 13 | import org.json4s.JField 14 | import org.json4s.JsonAST.JObject 15 | import org.json4s.jvalue2extractable 16 | import org.json4s.jvalue2monadic 17 | import org.json4s.native.JsonMethods.parse 18 | import org.json4s.string2JsonInput 19 | 20 | object MqttSinkAppB { 21 | 22 | def main(args: Array[String]) { 23 | if (args.length != 3) { 24 | System.err.println( 25 | "Usage: MqttSinkApp ") 26 | System.exit(1) 27 | } 28 | 29 | val Seq(appName, outputBrokerUrl, topic) = args.toSeq 30 | 31 | val conf = new SparkConf() 32 | .setAppName(appName) 33 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 34 | 35 | val batchInterval = 10 36 | 37 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 38 | 39 | HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 40 | interval = batchInterval) 41 | .flatMap(rec => { 42 | val query = parse(rec) \ "query" 43 | ((query \ "results" \ "quote").children).map(rec => JObject(JField("Timestamp", query \ "created")).merge(rec)) 44 | }) 45 | .map(rec => { 46 | implicit val formats = DefaultFormats 47 | rec.children.map(f => f.extract[String]) mkString "," 48 | }) 49 | .foreachRDD { rdd => 50 | rdd.foreach { rec => 51 | { 52 | val client = new MqttClient(outputBrokerUrl, MqttClient.generateClientId(), new MemoryPersistence()) 53 | client.connect() 54 | client.publish(topic, new MqttMessage(rec.getBytes(StandardCharsets.UTF_8))) 55 | client.disconnect() 56 | client.close() 57 | } 58 | } 59 | } 60 | 61 | ssc.start() 62 | ssc.awaitTermination() 63 | } 64 | 65 | } 66 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap7/src/main/scala/org/apress/prospark/L7-4UI.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import java.util.concurrent.atomic.AtomicLong 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import 
org.apache.spark.rdd.RDD 8 | import org.apache.spark.streaming.Seconds 9 | import org.apache.spark.streaming.StreamingContext 10 | 11 | object SocialSearchApp { 12 | def main(args: Array[String]) { 13 | if (args.length != 3) { 14 | System.err.println( 15 | "Usage: SocialSearchApp ") 16 | System.exit(1) 17 | } 18 | val Seq(appName, hostname, port) = args.toSeq 19 | 20 | val conf = new SparkConf() 21 | .setAppName(appName) 22 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 23 | //.set("spark.eventLog.enabled", "true") 24 | //.set("spark.eventLog.dir", "/tmp/historical") 25 | 26 | 27 | val countSearch = new AtomicLong(0) 28 | val countSocial = new AtomicLong(0) 29 | 30 | val ssc = new StreamingContext(conf, Seconds(1)) 31 | 32 | val titleStream = ssc.socketTextStream(hostname, port.toInt) 33 | .map(rec => rec.split("\\t")) 34 | .filter(_(3) match { 35 | case "other-google" | "other-bing" | "other-yahoo" | "other-facebook" | "other-twitter" => true 36 | case _ => false 37 | }) 38 | .map(rec => (rec(3), rec(4))) 39 | .cache() 40 | 41 | val searchStream = titleStream.filter(_._1 match { 42 | case "other-google" | "other-bing" | "other-yahoo" => true 43 | case _ => false 44 | }) 45 | .map(rec => rec._2) 46 | 47 | val socialStream = titleStream.filter(_._1 match { 48 | case "other-facebook" | "other-twitter" => true 49 | case _ => false 50 | }) 51 | .map(rec => rec._2) 52 | 53 | val exclusiveSearch = searchStream.transformWith(socialStream, 54 | (searchRDD: RDD[String], socialRDD: RDD[String]) => searchRDD.subtract(socialRDD)) 55 | .foreachRDD(rdd => { 56 | countSearch.addAndGet(rdd.count()) 57 | println("Exclusive count search engines: " + countSearch) 58 | }) 59 | 60 | val exclusiveSocial = socialStream.transformWith(searchStream, 61 | (socialRDD: RDD[String], searchRDD: RDD[String]) => socialRDD.subtract(searchRDD)) 62 | .foreachRDD(rdd => { 63 | countSocial.addAndGet(rdd.count()) 64 | println("Exclusive count social media: " + countSocial) 65 | }) 66 | 67 | ssc.start() 68 | ssc.awaitTermination() 69 | } 70 | 71 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap8/src/main/scala/org/apress/prospark/L8-38SparkR.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.rdd.RDD 7 | import org.apache.spark.sql.hive.HiveContext 8 | import org.apache.spark.streaming.Seconds 9 | import org.apache.spark.streaming.StreamingContext 10 | import java.nio.file.Paths 11 | import org.apache.spark.SparkFiles 12 | 13 | object CdrStreamingSparkRApp { 14 | 15 | case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, 16 | smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, 17 | callOutActivity: Float, internetTrafficActivity: Float) 18 | 19 | def main(args: Array[String]) { 20 | if (args.length != 7) { 21 | System.err.println( 22 | "Usage: CdrStreamingSparkRApp ") 23 | System.exit(1) 24 | } 25 | val Seq(appName, batchInterval, hostname, port, tableName, rScriptPath, logsPath) = args.toSeq 26 | 27 | val conf = new SparkConf() 28 | .setAppName(appName) 29 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 30 | 31 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 32 | 33 | val cl = Thread.currentThread().getContextClassLoader() 34 | val hiveC = new HiveContext(ssc.sparkContext) 35 | 
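// Restore the context class loader saved above: constructing HiveContext can replace the thread's
// class loader, and putting the original back here looks like a workaround to keep subsequent class
// loading in the streaming job unaffected (intent inferred from the code, not documented behaviour).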
Thread.currentThread().setContextClassLoader(cl) 36 | 37 | import hiveC.implicits._ 38 | 39 | ssc.sparkContext.addFile(rScriptPath) 40 | val rScriptName = SparkFiles.get(Paths.get(rScriptPath).getFileName.toString) 41 | val master = hiveC.sparkContext.getConf.get("spark.master") 42 | 43 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 44 | .map(_.split("\\t", -1)) 45 | .foreachRDD((rdd, time) => { 46 | val iTableName = tableName + time.milliseconds 47 | seqToCdr(rdd).toDF().write.saveAsTable(iTableName) 48 | hiveC.sparkContext.parallelize(Array(iTableName)).pipe("%s %s".format(rScriptName, master)).saveAsTextFile(Paths.get(logsPath, iTableName).toString) 49 | }) 50 | 51 | ssc.start() 52 | ssc.awaitTermination() 53 | } 54 | 55 | def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { 56 | rdd.map(c => c.map(f => f match { 57 | case x if x.isEmpty() => "0" 58 | case x => x 59 | })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, 60 | c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) 61 | } 62 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap6/src/main/scala/org/apress/prospark/L6-8Static.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import java.nio.charset.StandardCharsets 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | import org.eclipse.paho.client.mqttv3.MqttClient 10 | import org.eclipse.paho.client.mqttv3.MqttMessage 11 | import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence 12 | import org.json4s.DefaultFormats 13 | import org.json4s.JField 14 | import org.json4s.JsonAST.JObject 15 | import org.json4s.jvalue2extractable 16 | import org.json4s.jvalue2monadic 17 | import org.json4s.native.JsonMethods.parse 18 | import org.json4s.string2JsonInput 19 | 20 | object MqttSinkAppD { 21 | 22 | def main(args: Array[String]) { 23 | if (args.length != 3) { 24 | System.err.println( 25 | "Usage: MqttSinkApp ") 26 | System.exit(1) 27 | } 28 | 29 | val Seq(appName, outputBrokerUrl, topic) = args.toSeq 30 | 31 | val conf = new SparkConf() 32 | .setAppName(appName) 33 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 34 | 35 | val batchInterval = 10 36 | 37 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 38 | 39 | HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 40 | interval = batchInterval) 41 | .flatMap(rec => { 42 | val query = parse(rec) \ "query" 43 | ((query \ "results" \ "quote").children).map(rec => JObject(JField("Timestamp", query \ "created")).merge(rec)) 44 | }) 45 | .map(rec => { 46 | implicit val formats = DefaultFormats 47 | rec.children.map(f => f.extract[String]) mkString "," 48 | }) 49 | .foreachRDD { rdd => 50 | rdd.foreachPartition { par => 51 | par.foreach(message => MqttSink().publish(topic, new MqttMessage(message.getBytes(StandardCharsets.UTF_8)))) 52 | } 53 | } 54 | 55 | ssc.start() 56 | ssc.awaitTermination() 57 | } 58 | } 59 | 60 | object MqttSink { 61 | val brokerUrl = "tcp://localhost:1883" 62 | val client = new MqttClient(brokerUrl, MqttClient.generateClientId(), new MemoryPersistence()) 63 | 
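// A single MqttClient is created per JVM (effectively one per executor) and shared by every task
// that calls MqttSink(); the shutdown hook below releases the connection when the process exits.
// Minimal usage sketch, mirroring the foreachPartition loop in MqttSinkAppD above and assuming the
// hard-coded broker URL tcp://localhost:1883:
//   rdd.foreachPartition(par =>
//     par.foreach(msg => MqttSink().publish(topic, new MqttMessage(msg.getBytes(StandardCharsets.UTF_8)))))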
client.connect() 64 | sys.addShutdownHook { 65 | client.disconnect() 66 | client.close() 67 | } 68 | 69 | def apply(): MqttClient = { 70 | client 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap6/src/main/scala/org/apress/prospark/L6-23UpdateState.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.streaming.Seconds 6 | import org.apache.spark.streaming.StreamingContext 7 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 8 | import org.json4s.DefaultFormats 9 | import org.json4s.jvalue2extractable 10 | import org.json4s.jvalue2monadic 11 | import org.json4s.native.JsonMethods.parse 12 | import org.json4s.string2JsonInput 13 | 14 | object StatefulUpdateStateApp { 15 | 16 | def main(args: Array[String]) { 17 | if (args.length != 2) { 18 | System.err.println( 19 | "Usage: StatefulUpdateStateApp ") 20 | System.exit(1) 21 | } 22 | 23 | val Seq(appName, checkpointDir) = args.toSeq 24 | 25 | val conf = new SparkConf() 26 | .setAppName(appName) 27 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 28 | 29 | val batchInterval = 10 30 | 31 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 32 | ssc.checkpoint(checkpointDir) 33 | 34 | HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 35 | interval = batchInterval) 36 | .flatMap(rec => { 37 | implicit val formats = DefaultFormats 38 | val query = parse(rec) \ "query" 39 | ((query \ "results" \ "quote").children) 40 | .map(rec => ((rec \ "symbol").extract[String], ((rec \ "LastTradePriceOnly").extract[String].toFloat, (rec \ "Volume").extract[String].toLong))) 41 | }) 42 | .updateStateByKey(updateState) 43 | .print() 44 | 45 | def updateState(values: Seq[(Float, Long)], state: Option[(Long, Long, Long)]): Option[(Long, Long, Long)] = { 46 | val volumes = values.map(s => s._2) 47 | val localMin = volumes.min 48 | val localMax = volumes.max 49 | val localCount500 = values.map(s => s._1).count(price => price > 500) 50 | val globalValues = state.getOrElse((Long.MaxValue, Long.MinValue, 0L)).asInstanceOf[(Long, Long, Long)] 51 | val newMin = if (localMin < globalValues._1) localMin else globalValues._1 52 | val newMax = if (localMax > globalValues._2) localMax else globalValues._2 53 | val newCount500 = globalValues._3 + localCount500 54 | return Some(newMin, newMax, newCount500) 55 | } 56 | 57 | ssc.start() 58 | ssc.awaitTermination() 59 | } 60 | } 61 | 62 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap8/src/main/scala/org/apress/prospark/L8-35DataFrameExamplesRDD.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.sql.SQLContext 9 | import org.apache.spark.sql.types.DataType 10 | import org.apache.spark.sql.types.StructType 11 | import org.apache.spark.streaming.Seconds 12 | import 
org.apache.spark.streaming.StreamingContext 13 | import org.json4s.DefaultFormats 14 | 15 | object CdrDataframeExamplesRDDApp { 16 | 17 | case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, 18 | smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, 19 | callOutActivity: Float, internetTrafficActivity: Float) 20 | 21 | def main(args: Array[String]) { 22 | if (args.length != 5) { 23 | System.err.println( 24 | "Usage: CdrDataframeExamplesRDDApp ") 25 | System.exit(1) 26 | } 27 | val Seq(appName, batchInterval, hostname, port, schemaFile) = args.toSeq 28 | 29 | val conf = new SparkConf() 30 | .setAppName(appName) 31 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 32 | 33 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 34 | 35 | val sqlC = new SQLContext(ssc.sparkContext) 36 | import sqlC.implicits._ 37 | implicit val formats = DefaultFormats 38 | 39 | val schemaJson = scala.io.Source.fromFile(schemaFile).mkString 40 | val schema = DataType.fromJson(schemaJson).asInstanceOf[StructType] 41 | 42 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 43 | .map(_.split("\\t", -1)) 44 | .foreachRDD(rdd => { 45 | val cdrs = seqToCdr(rdd).toDF() 46 | val highInternet = sqlC.createDataFrame(cdrs.rdd.filter(r => r.getFloat(3) + r.getFloat(4) >= r.getFloat(5) + r.getFloat(6)), schema) 47 | val highOther = cdrs.except(highInternet) 48 | val highInternetGrid = highInternet.select("squareId", "countryCode").dropDuplicates() 49 | val highOtherGrid = highOther.select("squareId", "countryCode").dropDuplicates() 50 | highOtherGrid.except(highInternetGrid).show() 51 | highInternetGrid.except(highOtherGrid).show() 52 | }) 53 | 54 | ssc.start() 55 | ssc.awaitTermination() 56 | } 57 | 58 | def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { 59 | rdd.map(c => c.map(f => f match { 60 | case x if x.isEmpty() => "0" 61 | case x => x 62 | })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, 63 | c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) 64 | } 65 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap6/src/main/scala/org/apress/prospark/L6-16SparkHBase.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.hadoop.hbase.HBaseConfiguration 4 | import org.apache.hadoop.hbase.TableName 5 | import org.apache.hadoop.hbase.client.Put 6 | import org.apache.hadoop.hbase.spark.HBaseContext 7 | import org.apache.hadoop.hbase.util.Bytes 8 | import org.apache.spark.SparkConf 9 | import org.apache.spark.SparkContext 10 | import org.apache.spark.streaming.Seconds 11 | import org.apache.spark.streaming.StreamingContext 12 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 13 | import org.json4s.DefaultFormats 14 | import org.json4s.jvalue2extractable 15 | import org.json4s.jvalue2monadic 16 | import org.json4s.native.JsonMethods.parse 17 | import org.json4s.string2JsonInput 18 | 19 | object SparkHBaseBulkPutApp { 20 | 21 | def main(args: Array[String]) { 22 | if (args.length != 4) { 23 | System.err.println( 24 | "Usage: SparkHBaseBulkPutApp ") 25 | System.exit(1) 26 | } 27 | 28 | val Seq(appName, tableName, columnFamilyName, columnName) = args.toSeq 29 | 30 | val conf = new SparkConf() 31 | .setAppName(appName) 32 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 33 | 34 | val batchInterval = 10 35 | val windowSize = 20 36 | val slideInterval = 10 37 | 38 | val 
ssc = new StreamingContext(conf, Seconds(batchInterval)) 39 | 40 | val hbaseConf = HBaseConfiguration.create() 41 | val hContext = new HBaseContext(ssc.sparkContext, hbaseConf) 42 | 43 | val windowed = HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 44 | interval = batchInterval) 45 | .flatMap(rec => { 46 | implicit val formats = DefaultFormats 47 | val query = parse(rec) \ "query" 48 | ((query \ "results" \ "quote").children) 49 | .map(rec => ((rec \ "symbol").extract[String], (rec \ "LastTradePriceOnly").extract[String].toFloat)) 50 | }) 51 | .reduceByKeyAndWindow((x: Float, y: Float) => (x + y), Seconds(windowSize), Seconds(slideInterval)) 52 | 53 | hContext.streamBulkPut[(String, Float)](windowed, TableName.valueOf(tableName), rec => { 54 | val put = new Put(rec._1.getBytes) 55 | put.addColumn(columnFamilyName.getBytes, columnName.getBytes, Bytes.toBytes(rec._2 / (windowSize / batchInterval))) 56 | put 57 | }) 58 | 59 | ssc.start() 60 | ssc.awaitTermination() 61 | } 62 | } 63 | 64 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap6/src/main/scala/org/apress/prospark/L6-22Counters.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import java.util.concurrent.atomic.AtomicLong 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | import org.json4s.DefaultFormats 10 | import org.json4s.jvalue2extractable 11 | import org.json4s.jvalue2monadic 12 | import org.json4s.native.JsonMethods.parse 13 | import org.json4s.string2JsonInput 14 | 15 | object StatefulCountersApp { 16 | 17 | def main(args: Array[String]) { 18 | if (args.length != 1) { 19 | System.err.println( 20 | "Usage: StatefulCountersApp ") 21 | System.exit(1) 22 | } 23 | 24 | val Seq(appName) = args.toSeq 25 | 26 | val conf = new SparkConf() 27 | .setAppName(appName) 28 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 29 | 30 | val batchInterval = 10 31 | 32 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 33 | 34 | var globalMax: AtomicLong = new AtomicLong(Long.MinValue) 35 | var globalMin: AtomicLong = new AtomicLong(Long.MaxValue) 36 | var globalCounter500: AtomicLong = new AtomicLong(0) 37 | 38 | HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 39 | interval = batchInterval) 40 | .flatMap(rec => { 41 | implicit val formats = DefaultFormats 42 | val query = parse(rec) \ "query" 43 | ((query \ "results" \ "quote").children) 44 | .map(rec => ((rec \ "symbol").extract[String], (rec \ "LastTradePriceOnly").extract[String].toFloat, (rec \ "Volume").extract[String].toLong)) 45 | }) 46 | .foreachRDD(rdd => { 47 | val stocks = rdd.take(10) 48 | stocks.foreach(stock => { 49 | val price = stock._2 50 | val volume = stock._3 51 | if (volume > globalMax.get()) { 52 | globalMax.set(volume) 53 | } 54 | if (volume < globalMin.get()) { 55 | globalMin.set(volume) 56 | } 57 | if (price > 
500) { 58 | globalCounter500.incrementAndGet() 59 | } 60 | }) 61 | if (globalCounter500.get() > 1000L) { 62 | println("Global counter has reached 1000") 63 | println("Max ----> " + globalMax.get) 64 | println("Min ----> " + globalMin.get) 65 | globalCounter500.set(0) 66 | } 67 | }) 68 | 69 | ssc.start() 70 | ssc.awaitTermination() 71 | } 72 | } 73 | 74 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap8/src/main/scala/org/apress/prospark/T8-3DataFrameExamplesNA.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.sql.SQLContext 9 | import org.apache.spark.streaming.Seconds 10 | import org.apache.spark.streaming.StreamingContext 11 | import org.json4s.DefaultFormats 12 | import org.json4s.JDouble 13 | import org.json4s.JObject 14 | import org.json4s.jvalue2extractable 15 | import org.json4s.jvalue2monadic 16 | import org.json4s.native.JsonMethods.compact 17 | import org.json4s.native.JsonMethods.parse 18 | import org.json4s.native.JsonMethods.render 19 | import org.json4s.string2JsonInput 20 | 21 | object CdrDataframeExamplesNAApp { 22 | 23 | case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, 24 | smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, 25 | callOutActivity: Float, internetTrafficActivity: Float) 26 | 27 | def main(args: Array[String]) { 28 | if (args.length != 4) { 29 | System.err.println( 30 | "Usage: CdrDataframeExamplesNAApp ") 31 | System.exit(1) 32 | } 33 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 34 | 35 | val conf = new SparkConf() 36 | .setAppName(appName) 37 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 38 | 39 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 40 | 41 | val sqlC = new SQLContext(ssc.sparkContext) 42 | import sqlC.implicits._ 43 | implicit val formats = DefaultFormats 44 | 45 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 46 | .map(_.split("\\t", -1)) 47 | .foreachRDD(rdd => { 48 | val cdrs = seqToCdr(rdd).toDF() 49 | cdrs.na.drop("any").show() 50 | cdrs.na.fill(0, Array("squareId")).show() 51 | cdrs.na.replace("squareId", Map(0 -> 1)).show() 52 | println("Correlation: " + cdrs.stat.corr("smsOutActivity", "callOutActivity")) 53 | println("Covariance: " + cdrs.stat.cov("smsInActivity", "callInActivity")) 54 | cdrs.stat.crosstab("squareId", "countryCode").show() 55 | cdrs.stat.freqItems(Array("squareId", "countryCode"), 0.1).show() 56 | cdrs.stat.crosstab("callOutActivity", "callInActivity").show() 57 | }) 58 | 59 | ssc.start() 60 | ssc.awaitTermination() 61 | } 62 | 63 | def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { 64 | rdd.map(c => c.map(f => f match { 65 | case x if x.isEmpty() => "0" 66 | case x => x 67 | })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, 68 | c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) 69 | } 70 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap3/src/main/scala/org/apress/prospark/L3-DStreamMapping.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | import 
org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext } 6 | import org.apache.hadoop.io.{ Text, LongWritable, IntWritable } 7 | import org.apache.hadoop.fs.Path 8 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat 9 | import org.apache.spark.streaming.dstream.DStream 10 | import org.apache.hadoop.mapred.TextOutputFormat 11 | import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat } 12 | import org.apache.spark.streaming.dstream.PairDStreamFunctions 13 | import org.apache.log4j.LogManager 14 | import org.json4s._ 15 | import org.json4s.native.JsonMethods._ 16 | import java.text.SimpleDateFormat 17 | import java.util.Date 18 | 19 | object RedditMappingApp { 20 | def main(args: Array[String]) { 21 | if (args.length != 2) { 22 | System.err.println( 23 | "Usage: RedditMappingApp ") 24 | System.exit(1) 25 | } 26 | val Seq(appName, inputPath) = args.toSeq 27 | val LOG = LogManager.getLogger(this.getClass) 28 | 29 | val conf = new SparkConf() 30 | .setAppName(appName) 31 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 32 | 33 | val ssc = new StreamingContext(conf, Seconds(1)) 34 | LOG.info("Started at %d".format(ssc.sparkContext.startTime)) 35 | 36 | val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) 37 | 38 | val sdf = new SimpleDateFormat("yyyy-MM-dd") 39 | val tsKey = "created_utc" 40 | val secs = 1000L 41 | val keyedByDay = comments.map(rec => { 42 | val ts = (parse(rec) \ tsKey).values 43 | (sdf.format(new Date(ts.toString.toLong * secs)), rec) 44 | }) 45 | 46 | val keyedByDayPart = comments.mapPartitions(iter => { 47 | var ret = List[(String, String)]() 48 | while (iter.hasNext) { 49 | val rec = iter.next 50 | val ts = (parse(rec) \ tsKey).values 51 | ret.::=(sdf.format(new Date(ts.toString.toLong * secs)), rec) 52 | } 53 | ret.iterator 54 | }) 55 | 56 | val wordTokens = comments.map(rec => { 57 | ((parse(rec) \ "body")).values.toString.split(" ") 58 | }) 59 | 60 | val wordTokensFlat = comments.flatMap(rec => { 61 | ((parse(rec) \ "body")).values.toString.split(" ") 62 | }) 63 | 64 | val filterSubreddit = comments.filter(rec => 65 | (parse(rec) \ "subreddit").values.toString.equals("AskReddit")) 66 | 67 | val sortedByAuthor = comments.transform(rdd => 68 | (rdd.sortBy(rec => (parse(rec) \ "author").values.toString))) 69 | 70 | ssc.start() 71 | ssc.awaitTermination() 72 | 73 | } 74 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap4/src/main/scala/org/apress/prospark/L4-3ProtonFlux.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import com.esotericsoftware.kryo.{KryoSerializable,Kryo} 4 | import com.esotericsoftware.kryo.io.{Output, Input} 5 | 6 | class ProtonFlux( 7 | var year: Int, 8 | var bin0_57to1_78: Double, 9 | var bin3_40to17_6: Double, 10 | var bin22_0to31_0: Double, 11 | var bin1_894to2_605: Double, 12 | var bin4_200to6_240: Double, 13 | var bin3_256to8_132: Double, 14 | var bin3_276to8_097: Double, 15 | var bin6_343to42_03: Double, 16 | var bin17_88to26_81: Double, 17 | var bin30_29to69_47: Double, 18 | var bin132_8to242_0: Double 19 | ) extends KryoSerializable { 20 | 21 | def this(year: String, bin0_57to1_78: String, bin3_40to17_6: String, 22 | bin22_0to31_0: String, bin1_894to2_605: String, bin4_200to6_240: String, 23 | bin3_256to8_132: String, bin3_276to8_097: String, 
bin6_343to42_03: String, 24 | bin17_88to26_81: String, bin30_29to69_47: String, bin132_8to242_0: String) { 25 | this(year.toInt, bin0_57to1_78.toDouble, bin3_40to17_6.toDouble, 26 | bin22_0to31_0.toDouble, bin1_894to2_605.toDouble, bin4_200to6_240.toDouble, 27 | bin3_256to8_132.toDouble, bin3_276to8_097.toDouble, bin6_343to42_03.toDouble, 28 | bin17_88to26_81.toDouble, bin30_29to69_47.toDouble, bin132_8to242_0.toDouble) 29 | } 30 | 31 | def isSolarStorm = (bin0_57to1_78 > 1.0 || bin3_40to17_6 > 1.0 32 | || bin22_0to31_0 > 1.0 || bin1_894to2_605 > 1.0 || bin4_200to6_240 > 1.0 33 | || bin3_256to8_132 > 1.0 || bin3_276to8_097 > 1.0 || bin6_343to42_03 > 1.0 34 | || bin17_88to26_81 > 1.0 || bin30_29to69_47 > 1.0 || bin132_8to242_0 > 1.0) 35 | 36 | override def write(kryo: Kryo, output: Output) { 37 | output.writeInt(year) 38 | output.writeDouble(bin0_57to1_78) 39 | output.writeDouble(bin3_40to17_6) 40 | output.writeDouble(bin22_0to31_0) 41 | output.writeDouble(bin1_894to2_605) 42 | output.writeDouble(bin4_200to6_240) 43 | output.writeDouble(bin3_256to8_132) 44 | output.writeDouble(bin3_276to8_097) 45 | output.writeDouble(bin6_343to42_03) 46 | output.writeDouble(bin17_88to26_81) 47 | output.writeDouble(bin30_29to69_47) 48 | output.writeDouble(bin132_8to242_0) 49 | } 50 | 51 | override def read(kryo: Kryo, input: Input) { 52 | year = input.readInt() 53 | bin0_57to1_78 = input.readDouble() 54 | bin3_40to17_6 = input.readDouble() 55 | bin22_0to31_0 = input.readDouble() 56 | bin1_894to2_605 = input.readDouble() 57 | bin4_200to6_240 = input.readDouble() 58 | bin3_256to8_132 = input.readDouble() 59 | bin3_276to8_097 = input.readDouble() 60 | bin6_343to42_03 = input.readDouble() 61 | bin17_88to26_81 = input.readDouble() 62 | bin30_29to69_47 = input.readDouble() 63 | bin132_8to242_0 = input.readDouble() 64 | } 65 | 66 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap5/src/main/scala/org/apress/prospark/HttpInputDStream.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import java.util.Timer 4 | import java.util.TimerTask 5 | 6 | import scala.reflect.ClassTag 7 | 8 | import org.apache.http.client.methods.HttpGet 9 | import org.apache.http.impl.client.CloseableHttpClient 10 | import org.apache.http.impl.client.HttpClients 11 | import org.apache.http.util.EntityUtils 12 | import org.apache.spark.Logging 13 | import org.apache.spark.storage.StorageLevel 14 | import org.apache.spark.streaming.StreamingContext 15 | import org.apache.spark.streaming.api.java.JavaDStream 16 | import org.apache.spark.streaming.api.java.JavaDStream.fromDStream 17 | import org.apache.spark.streaming.api.java.JavaStreamingContext 18 | import org.apache.spark.streaming.dstream.DStream 19 | import org.apache.spark.streaming.dstream.ReceiverInputDStream 20 | import org.apache.spark.streaming.receiver.Receiver 21 | 22 | class HttpInputDStream( 23 | @transient ssc_ : StreamingContext, 24 | storageLevel: StorageLevel, 25 | url: String, 26 | interval: Long) extends ReceiverInputDStream[String](ssc_) with Logging { 27 | 28 | def getReceiver(): Receiver[String] = { 29 | new HttpReceiver(storageLevel, url, interval) 30 | } 31 | } 32 | 33 | class HttpReceiver( 34 | storageLevel: StorageLevel, 35 | url: String, 36 | interval: Long) extends Receiver[String](storageLevel) with Logging { 37 | 38 | var httpClient: CloseableHttpClient = _ 39 | var trigger: Timer = _ 40 | 41 | def onStop() { 42 | 
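// onStop releases the HTTP client when the receiver is stopped; presumably the Timer created in
// onStart below could also be cancelled here (trigger.cancel()), although the original code leaves it running.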
httpClient.close() 43 | logInfo("Disconnected from Http Server") 44 | } 45 | 46 | def onStart() { 47 | httpClient = HttpClients.createDefault() 48 | trigger = new Timer() 49 | trigger.scheduleAtFixedRate(new TimerTask { 50 | def run() = doGet() 51 | }, 0, interval * 1000) 52 | 53 | logInfo("Http Receiver initiated") 54 | } 55 | 56 | def doGet() { 57 | logInfo("Fetching data from Http source") 58 | val response = httpClient.execute(new HttpGet(url)) 59 | try { 60 | val content = EntityUtils.toString(response.getEntity()) 61 | store(content) 62 | } catch { 63 | case e: Exception => restart("Error! Problems while connecting", e) 64 | } finally { 65 | response.close() 66 | } 67 | 68 | } 69 | 70 | } 71 | 72 | object HttpUtils { 73 | def createStream( 74 | ssc: StreamingContext, 75 | storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2, 76 | url: String, 77 | interval: Long): DStream[String] = { 78 | new HttpInputDStream(ssc, storageLevel, url, interval) 79 | } 80 | 81 | def createStream( 82 | jssc: JavaStreamingContext, 83 | storageLevel: StorageLevel, 84 | url: String, 85 | interval: Long): JavaDStream[String] = { 86 | implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[String]] 87 | createStream(jssc.ssc, storageLevel, url, interval) 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap6/src/main/scala/org/apress/prospark/HttpInputDStream.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import java.util.Timer 4 | import java.util.TimerTask 5 | 6 | import scala.reflect.ClassTag 7 | 8 | import org.apache.http.client.methods.HttpGet 9 | import org.apache.http.impl.client.CloseableHttpClient 10 | import org.apache.http.impl.client.HttpClients 11 | import org.apache.http.util.EntityUtils 12 | import org.apache.spark.Logging 13 | import org.apache.spark.storage.StorageLevel 14 | import org.apache.spark.streaming.StreamingContext 15 | import org.apache.spark.streaming.api.java.JavaDStream 16 | import org.apache.spark.streaming.api.java.JavaDStream.fromDStream 17 | import org.apache.spark.streaming.api.java.JavaStreamingContext 18 | import org.apache.spark.streaming.dstream.DStream 19 | import org.apache.spark.streaming.dstream.ReceiverInputDStream 20 | import org.apache.spark.streaming.receiver.Receiver 21 | 22 | class HttpInputDStream( 23 | @transient ssc_ : StreamingContext, 24 | storageLevel: StorageLevel, 25 | url: String, 26 | interval: Long) extends ReceiverInputDStream[String](ssc_) with Logging { 27 | 28 | def getReceiver(): Receiver[String] = { 29 | new HttpReceiver(storageLevel, url, interval) 30 | } 31 | } 32 | 33 | class HttpReceiver( 34 | storageLevel: StorageLevel, 35 | url: String, 36 | interval: Long) extends Receiver[String](storageLevel) with Logging { 37 | 38 | var httpClient: CloseableHttpClient = _ 39 | var trigger: Timer = _ 40 | 41 | def onStop() { 42 | httpClient.close() 43 | logInfo("Disconnected from Http Server") 44 | } 45 | 46 | def onStart() { 47 | httpClient = HttpClients.createDefault() 48 | trigger = new Timer() 49 | trigger.scheduleAtFixedRate(new TimerTask { 50 | def run() = doGet() 51 | }, 0, interval * 1000) 52 | 53 | logInfo("Http Receiver initiated") 54 | } 55 | 56 | def doGet() { 57 | logInfo("Fetching data from Http source") 58 | val response = httpClient.execute(new HttpGet(url)) 59 | try { 60 | val content = EntityUtils.toString(response.getEntity()) 61 | store(content) 62 | } catch { 63 | case 
e: Exception => restart("Error! Problems while connecting", e) 64 | } finally { 65 | response.close() 66 | } 67 | 68 | } 69 | 70 | } 71 | 72 | object HttpUtils { 73 | def createStream( 74 | ssc: StreamingContext, 75 | storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2, 76 | url: String, 77 | interval: Long): DStream[String] = { 78 | new HttpInputDStream(ssc, storageLevel, url, interval) 79 | } 80 | 81 | def createStream( 82 | jssc: JavaStreamingContext, 83 | storageLevel: StorageLevel, 84 | url: String, 85 | interval: Long): JavaDStream[String] = { 86 | implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[String]] 87 | createStream(jssc.ssc, storageLevel, url, interval) 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap8/src/main/scala/org/apress/prospark/L8-29DataFrameExamplesJoin.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.sql.SQLContext 9 | import org.apache.spark.streaming.Seconds 10 | import org.apache.spark.streaming.StreamingContext 11 | import org.json4s.DefaultFormats 12 | import org.json4s.JDouble 13 | import org.json4s.JObject 14 | import org.json4s.jvalue2extractable 15 | import org.json4s.jvalue2monadic 16 | import org.json4s.native.JsonMethods.compact 17 | import org.json4s.native.JsonMethods.parse 18 | import org.json4s.native.JsonMethods.render 19 | import org.json4s.string2JsonInput 20 | 21 | object CdrDataframeExamples3App { 22 | 23 | case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, 24 | smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, 25 | callOutActivity: Float, internetTrafficActivity: Float) 26 | 27 | def main(args: Array[String]) { 28 | if (args.length != 5) { 29 | System.err.println( 30 | "Usage: CdrDataframeExamples3App ") 31 | System.exit(1) 32 | } 33 | val Seq(appName, batchInterval, hostname, port, gridJsonPath) = args.toSeq 34 | 35 | val conf = new SparkConf() 36 | .setAppName(appName) 37 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 38 | 39 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 40 | 41 | val sqlC = new SQLContext(ssc.sparkContext) 42 | import sqlC.implicits._ 43 | implicit val formats = DefaultFormats 44 | 45 | val gridFile = scala.io.Source.fromFile(gridJsonPath).mkString 46 | val gridGeo = (parse(gridFile) \ "features") 47 | val gridStr = gridGeo.children.map(r => { 48 | val c = (r \ "geometry" \ "coordinates").extract[List[List[List[Float]]]].flatten.flatten.map(r => JDouble(r)) 49 | val l = List(("id", r \ "id"), ("x1", c(0)), ("y1", c(1)), ("x2", c(2)), ("y2", c(3)), 50 | ("x3", c(4)), ("y3", c(5)), ("x4", c(6)), ("y4", c(7))) 51 | compact(render(JObject(l))) 52 | }) 53 | 54 | val gridDF = sqlC.read.json(ssc.sparkContext.makeRDD(gridStr)) 55 | 56 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 57 | .map(_.split("\\t", -1)) 58 | .foreachRDD(rdd => { 59 | val cdrs = seqToCdr(rdd).toDF() 60 | cdrs.join(gridDF, $"squareId" === $"id").show() 61 | }) 62 | 63 | ssc.start() 64 | ssc.awaitTermination() 65 | } 66 | 67 | def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { 68 | rdd.map(c => c.map(f => f match { 69 | case x if x.isEmpty() => "0" 70 | case x => x 71 | })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, 
c(3).toFloat, 72 | c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) 73 | } 74 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap6/src/main/scala/org/apress/prospark/L6-12StaticPool.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import java.nio.charset.StandardCharsets 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | import org.eclipse.paho.client.mqttv3.MqttClient 10 | import org.eclipse.paho.client.mqttv3.MqttMessage 11 | import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence 12 | import org.json4s.DefaultFormats 13 | import org.json4s.JField 14 | import org.json4s.JsonAST.JObject 15 | import org.json4s.jvalue2extractable 16 | import org.json4s.jvalue2monadic 17 | import org.json4s.native.JsonMethods.parse 18 | import org.json4s.string2JsonInput 19 | 20 | object MqttSinkAppF { 21 | 22 | def main(args: Array[String]) { 23 | if (args.length != 3) { 24 | System.err.println( 25 | "Usage: MqttSinkApp ") 26 | System.exit(1) 27 | } 28 | 29 | val Seq(appName, outputBrokerUrl, topic) = args.toSeq 30 | 31 | val conf = new SparkConf() 32 | .setAppName(appName) 33 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 34 | 35 | val batchInterval = 10 36 | 37 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 38 | 39 | val mqttSink = ssc.sparkContext.broadcast(MqttSinkLazy(outputBrokerUrl)) 40 | 41 | HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 42 | interval = batchInterval) 43 | .flatMap(rec => { 44 | val query = parse(rec) \ "query" 45 | ((query \ "results" \ "quote").children).map(rec => JObject(JField("Timestamp", query \ "created")).merge(rec)) 46 | }) 47 | .map(rec => { 48 | implicit val formats = DefaultFormats 49 | rec.children.map(f => f.extract[String]) mkString "," 50 | }) 51 | .foreachRDD { rdd => 52 | rdd.foreachPartition { par => 53 | par.foreach(message => mqttSink.value.client.publish(topic, new MqttMessage(message.getBytes(StandardCharsets.UTF_8)))) 54 | } 55 | } 56 | 57 | ssc.start() 58 | ssc.awaitTermination() 59 | } 60 | 61 | } 62 | 63 | class MqttSinkLazy(brokerUrl: String) extends Serializable { 64 | lazy val client = { 65 | val client = new MqttClient(brokerUrl, MqttClient.generateClientId(), new MemoryPersistence()) 66 | client.connect() 67 | sys.addShutdownHook { 68 | client.disconnect() 69 | client.close() 70 | } 71 | client 72 | } 73 | } 74 | 75 | object MqttSinkLazy { 76 | val brokerUrl = "tcp://localhost:1883" 77 | val client = new MqttSinkLazy(brokerUrl) 78 | 79 | def apply(brokerUrl: String): MqttSinkLazy = { 80 | client 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap6/src/main/scala/org/apress/prospark/L6-20CassandraConnector.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.streaming.Seconds 8 | import 
org.apache.spark.streaming.StreamingContext 9 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 10 | import org.json4s.DefaultFormats 11 | import org.json4s.jvalue2extractable 12 | import org.json4s.jvalue2monadic 13 | import org.json4s.native.JsonMethods.parse 14 | import org.json4s.string2JsonInput 15 | 16 | import com.datastax.spark.connector.SomeColumns 17 | import com.datastax.spark.connector.cql.CassandraConnector 18 | import com.datastax.spark.connector.streaming.toDStreamFunctions 19 | import com.datastax.spark.connector.toNamedColumnRef 20 | 21 | object CassandraConnectorSinkApp { 22 | 23 | def main(args: Array[String]) { 24 | if (args.length != 6) { 25 | System.err.println( 26 | "Usage: CassandraConnectorSinkApp ") 27 | System.exit(1) 28 | } 29 | 30 | val Seq(appName, cassandraHost, cassandraPort, keyspace, tableName, columnName) = args.toSeq 31 | 32 | val conf = new SparkConf() 33 | .setAppName(appName) 34 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 35 | .set("spark.cassandra.connection.host", cassandraHost) 36 | .set("spark.cassandra.connection.port", cassandraPort) 37 | 38 | val batchInterval = 10 39 | val windowSize = 20 40 | val slideInterval = 10 41 | 42 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 43 | 44 | CassandraConnector(conf).withSessionDo { session => 45 | session.execute(s"CREATE KEYSPACE IF NOT EXISTS %s WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1 }".format(keyspace)) 46 | session.execute(s"CREATE TABLE IF NOT EXISTS %s.%s (key TEXT PRIMARY KEY, %s FLOAT)".format(keyspace, tableName, columnName)) 47 | } 48 | 49 | HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 50 | interval = batchInterval) 51 | .flatMap(rec => { 52 | implicit val formats = DefaultFormats 53 | val query = parse(rec) \ "query" 54 | ((query \ "results" \ "quote").children) 55 | .map(rec => ((rec \ "symbol").extract[String], (rec \ "LastTradePriceOnly").extract[String].toFloat)) 56 | }) 57 | .reduceByKeyAndWindow((x: Float, y: Float) => (x + y), Seconds(windowSize), Seconds(slideInterval)) 58 | .map(stock => (stock._1, stock._2 / (windowSize / batchInterval))) 59 | .saveToCassandra(keyspace, tableName) 60 | 61 | ssc.start() 62 | ssc.awaitTermination() 63 | } 64 | } 65 | 66 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap7/src/main/java/org/apress/prospark/AbstractDriver.java: -------------------------------------------------------------------------------- 1 | package org.apress.prospark; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileInputStream; 6 | import java.io.IOException; 7 | import java.io.InputStreamReader; 8 | import java.util.Enumeration; 9 | import java.util.zip.GZIPInputStream; 10 | import java.util.zip.ZipEntry; 11 | import java.util.zip.ZipFile; 12 | 13 | import org.apache.commons.io.FilenameUtils; 14 | import org.apache.log4j.LogManager; 15 | import org.apache.log4j.Logger; 16 | 17 | public abstract class AbstractDriver { 18 | 19 | private static final Logger LOG = LogManager.getLogger(AbstractDriver.class); 20 | 21 | private String path; 22 | 23 | public AbstractDriver(String path) { 24 | this.path = path; 25 | } 26 | 27 | public abstract void init() throws 
Exception; 28 | 29 | public abstract void close() throws Exception; 30 | 31 | public abstract void sendRecord(String record) throws Exception; 32 | 33 | public void execute() throws Exception { 34 | 35 | try { 36 | init(); 37 | File dirPath = new File(path); 38 | if (dirPath.isDirectory()) { 39 | File[] files = new File(path).listFiles(); 40 | for (File f : files) { 41 | String ext = FilenameUtils.getExtension(f.getPath()); 42 | if (ext.equals("zip")) { 43 | LOG.info(String.format("Feeding zipped file %s", f.getName())); 44 | ZipFile zFile = null; 45 | try { 46 | zFile = new ZipFile(f); 47 | Enumeration zEntries = zFile.entries(); 48 | 49 | while (zEntries.hasMoreElements()) { 50 | ZipEntry zEntry = zEntries.nextElement(); 51 | LOG.info(String.format("Feeding file %s", zEntry.getName())); 52 | try (BufferedReader br = new BufferedReader( 53 | new InputStreamReader(zFile.getInputStream(zEntry)))) { 54 | // skip header 55 | br.readLine(); 56 | String line; 57 | while ((line = br.readLine()) != null) { 58 | sendRecord(line); 59 | } 60 | } 61 | } 62 | } catch (IOException e) { 63 | LOG.error(e.getMessage()); 64 | } finally { 65 | if (zFile != null) { 66 | try { 67 | zFile.close(); 68 | } catch (IOException e) { 69 | LOG.error(e.getMessage()); 70 | } 71 | } 72 | } 73 | } else if (ext.equals("gz")) { 74 | LOG.info(String.format("Feeding file %s", f.getName())); 75 | try (BufferedReader br = new BufferedReader( 76 | new InputStreamReader(new GZIPInputStream(new FileInputStream(f))))) { 77 | // skip header 78 | br.readLine(); 79 | String line; 80 | while ((line = br.readLine()) != null) { 81 | sendRecord(line); 82 | } 83 | } 84 | } else { 85 | LOG.warn("Unsupported file type: " + f.getName()); 86 | } 87 | } 88 | } else { 89 | LOG.error(String.format("Path %s is not a directory", path)); 90 | } 91 | } finally { 92 | close(); 93 | } 94 | } 95 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap8/src/main/java/org/apress/prospark/AbstractDriver.java: -------------------------------------------------------------------------------- 1 | package org.apress.prospark; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileInputStream; 6 | import java.io.IOException; 7 | import java.io.InputStreamReader; 8 | import java.util.Enumeration; 9 | import java.util.zip.GZIPInputStream; 10 | import java.util.zip.ZipEntry; 11 | import java.util.zip.ZipFile; 12 | 13 | import org.apache.commons.io.FilenameUtils; 14 | import org.apache.log4j.LogManager; 15 | import org.apache.log4j.Logger; 16 | 17 | public abstract class AbstractDriver { 18 | 19 | private static final Logger LOG = LogManager.getLogger(AbstractDriver.class); 20 | 21 | private String path; 22 | 23 | public AbstractDriver(String path) { 24 | this.path = path; 25 | } 26 | 27 | public abstract void init() throws Exception; 28 | 29 | public abstract void close() throws Exception; 30 | 31 | public abstract void sendRecord(String record) throws Exception; 32 | 33 | public void execute() throws Exception { 34 | 35 | try { 36 | init(); 37 | File dirPath = new File(path); 38 | if (dirPath.isDirectory()) { 39 | File[] files = new File(path).listFiles(); 40 | for (File f : files) { 41 | String ext = FilenameUtils.getExtension(f.getPath()); 42 | if (ext.equals("zip")) { 43 | LOG.info(String.format("Feeding zipped file %s", f.getName())); 44 | ZipFile zFile = null; 45 | try { 46 | zFile = new ZipFile(f); 47 | Enumeration zEntries = zFile.entries(); 48 | 49 | while 
(zEntries.hasMoreElements()) { 50 | ZipEntry zEntry = zEntries.nextElement(); 51 | LOG.info(String.format("Feeding file %s", zEntry.getName())); 52 | try (BufferedReader br = new BufferedReader( 53 | new InputStreamReader(zFile.getInputStream(zEntry)))) { 54 | // skip header 55 | br.readLine(); 56 | String line; 57 | while ((line = br.readLine()) != null) { 58 | sendRecord(line); 59 | } 60 | } 61 | } 62 | } catch (IOException e) { 63 | LOG.error(e.getMessage()); 64 | } finally { 65 | if (zFile != null) { 66 | try { 67 | zFile.close(); 68 | } catch (IOException e) { 69 | LOG.error(e.getMessage()); 70 | } 71 | } 72 | } 73 | } else if (ext.equals("gz")) { 74 | LOG.info(String.format("Feeding file %s", f.getName())); 75 | try (BufferedReader br = new BufferedReader( 76 | new InputStreamReader(new GZIPInputStream(new FileInputStream(f))))) { 77 | // skip header 78 | br.readLine(); 79 | String line; 80 | while ((line = br.readLine()) != null) { 81 | sendRecord(line); 82 | } 83 | } 84 | } else { 85 | LOG.warn("Unsupported file type: " + f.getName()); 86 | } 87 | } 88 | } else { 89 | LOG.error(String.format("Path %s is not a directory", path)); 90 | } 91 | } finally { 92 | close(); 93 | } 94 | } 95 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap6/src/main/scala/org/apress/prospark/L6-14HBase.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.hadoop.conf.Configuration 4 | import org.apache.hadoop.hbase.HBaseConfiguration 5 | import org.apache.hadoop.hbase.client.Put 6 | import org.apache.hadoop.hbase.mapreduce.TableOutputFormat 7 | import org.apache.hadoop.hbase.util.Bytes 8 | import org.apache.hadoop.io.Text 9 | import org.apache.spark.SparkConf 10 | import org.apache.spark.SparkContext 11 | import org.apache.spark.rdd.RDD.rddToPairRDDFunctions 12 | import org.apache.spark.streaming.Seconds 13 | import org.apache.spark.streaming.StreamingContext 14 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 15 | import org.json4s.DefaultFormats 16 | import org.json4s.jvalue2extractable 17 | import org.json4s.jvalue2monadic 18 | import org.json4s.native.JsonMethods.parse 19 | import org.json4s.string2JsonInput 20 | 21 | object HBaseSinkApp { 22 | 23 | def main(args: Array[String]) { 24 | if (args.length != 5) { 25 | System.err.println( 26 | "Usage: HBaseSinkApp ") 27 | System.exit(1) 28 | } 29 | 30 | val Seq(appName, hbaseMaster, tableName, columnFamilyName, columnName) = args.toSeq 31 | 32 | val conf = new SparkConf() 33 | .setAppName(appName) 34 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 35 | 36 | val batchInterval = 10 37 | val windowSize = 20 38 | val slideInterval = 10 39 | 40 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 41 | 42 | HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 43 | interval = batchInterval) 44 | .flatMap(rec => { 45 | implicit val formats = DefaultFormats 46 | val query = parse(rec) \ "query" 47 | ((query \ "results" \ "quote").children) 48 | .map(rec => ((rec \ "symbol").extract[String], (rec \ "LastTradePriceOnly").extract[String].toFloat)) 49 | }) 50 | .reduceByKeyAndWindow((x: Float, y: Float) => (x + y), Seconds(windowSize), 
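/* 20-second window sliding every 10 seconds: per-symbol LastTradePriceOnly values are summed over two 10-second batches, and the sum is divided by windowSize / batchInterval (= 2) in the foreachRDD below before the average is written to HBase. */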
Seconds(slideInterval)) 51 | .foreachRDD(rdd => { 52 | val hbaseConf = HBaseConfiguration.create() 53 | hbaseConf.set(TableOutputFormat.OUTPUT_TABLE, tableName) 54 | hbaseConf.set("hbase.master", hbaseMaster) 55 | val jobConf = new Configuration(hbaseConf) 56 | jobConf.set("mapreduce.job.outputformat.class", classOf[TableOutputFormat[Text]].getName) 57 | rdd.map(rec => { 58 | val put = new Put(rec._1.getBytes) 59 | put.addColumn(columnFamilyName.getBytes, columnName.getBytes, Bytes.toBytes(rec._2 / (windowSize / batchInterval))) 60 | (rec._1, put) 61 | }).saveAsNewAPIHadoopDataset(jobConf) 62 | }) 63 | 64 | ssc.start() 65 | ssc.awaitTermination() 66 | } 67 | } 68 | 69 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap8/src/main/scala/org/apress/prospark/L8-10-11UDF.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.io.Source 4 | import scala.reflect.runtime.universe 5 | 6 | import org.apache.spark.SparkConf 7 | import org.apache.spark.SparkContext 8 | import org.apache.spark.rdd.RDD 9 | import org.apache.spark.sql.SQLContext 10 | import org.apache.spark.streaming.Seconds 11 | import org.apache.spark.streaming.StreamingContext 12 | import org.json4s.jackson.JsonMethods.parse 13 | import org.json4s.jvalue2extractable 14 | import org.json4s.string2JsonInput 15 | 16 | object CdrUDFApp { 17 | 18 | case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, 19 | smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, 20 | callOutActivity: Float, internetTrafficActivity: Float) 21 | 22 | def main(args: Array[String]) { 23 | if (args.length != 4) { 24 | System.err.println( 25 | "Usage: CdrUDFApp ") 26 | System.exit(1) 27 | } 28 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 29 | 30 | val conf = new SparkConf() 31 | .setAppName(appName) 32 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 33 | 34 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 35 | 36 | val sqlC = new SQLContext(ssc.sparkContext) 37 | import sqlC.implicits._ 38 | 39 | def getCountryCodeMapping() = { 40 | implicit val formats = org.json4s.DefaultFormats 41 | parse(Source.fromURL("http://country.io/phone.json").mkString).extract[Map[String, String]].map(_.swap) 42 | } 43 | 44 | def getCountryNameMapping() = { 45 | implicit val formats = org.json4s.DefaultFormats 46 | parse(Source.fromURL("http://country.io/names.json").mkString).extract[Map[String, String]] 47 | } 48 | 49 | def getCountryName(mappingPhone: Map[String, String], mappingName: Map[String, String], code: Int) = { 50 | mappingName.getOrElse(mappingPhone.getOrElse(code.toString, "NotFound"), "NotFound") 51 | } 52 | 53 | val getCountryNamePartial = getCountryName(getCountryCodeMapping(), getCountryNameMapping(), _: Int) 54 | 55 | sqlC.udf.register("getCountryNamePartial", getCountryNamePartial) 56 | 57 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 58 | .map(_.split("\\t", -1)) 59 | .foreachRDD(rdd => { 60 | val cdrs = seqToCdr(rdd).toDF() 61 | cdrs.registerTempTable("cdrs") 62 | 63 | sqlC.sql("SELECT getCountryNamePartial(countryCode) AS countryName, COUNT(countryCode) AS cCount FROM cdrs GROUP BY countryCode ORDER BY cCount DESC LIMIT 5").show() 64 | 65 | }) 66 | 67 | ssc.start() 68 | ssc.awaitTermination() 69 | } 70 | 71 | def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { 72 | rdd.map(c => c.map(f => f match { 73 | case x if x.isEmpty() => "0" 
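/* Empty fields in the tab-separated CDR records become "0" here, so the toInt/toLong/toFloat conversions below never see an empty string. */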
74 | case x => x 75 | })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, 76 | c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) 77 | } 78 | 79 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap9/src/main/scala/org/apress/prospark/L9-15MLPipeline.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.ml.Pipeline 7 | import org.apache.spark.ml.feature.Normalizer 8 | import org.apache.spark.ml.feature.VectorAssembler 9 | import org.apache.spark.ml.regression.RandomForestRegressor 10 | import org.apache.spark.sql.SQLContext 11 | import org.apache.spark.streaming.Seconds 12 | import org.apache.spark.streaming.StreamingContext 13 | import org.apache.spark.ml.param.ParamMap 14 | 15 | object MLPipelineApp { 16 | 17 | case class Activity(label: Double, 18 | accelXHand: Double, accelYHand: Double, accelZHand: Double, 19 | accelXChest: Double, accelYChest: Double, accelZChest: Double, 20 | accelXAnkle: Double, accelYAnkle: Double, accelZAnkle: Double) 21 | 22 | def main(args: Array[String]) { 23 | if (args.length != 4) { 24 | System.err.println( 25 | "Usage: MLPipelineApp ") 26 | System.exit(1) 27 | } 28 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 29 | 30 | val conf = new SparkConf() 31 | .setAppName(appName) 32 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 33 | 34 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 35 | 36 | val sqlC = new SQLContext(ssc.sparkContext) 37 | import sqlC.implicits._ 38 | 39 | val substream = ssc.socketTextStream(hostname, port.toInt) 40 | .filter(!_.contains("NaN")) 41 | .map(_.split(" ")) 42 | .filter(f => f(1) == "4" || f(1) == "5") 43 | .map(f => Array(f(1), f(4), f(5), f(6), f(20), f(21), f(22), f(36), f(37), f(38))) 44 | .map(f => f.map(v => v.toDouble)) 45 | .foreachRDD(rdd => { 46 | if (!rdd.isEmpty) { 47 | val accelerometer = rdd.map(x => Activity(x(0), x(1), x(2), x(3), x(4), x(5), x(6), x(7), x(8), x(9))).toDF() 48 | val split = accelerometer.randomSplit(Array(0.3, 0.7)) 49 | val test = split(0) 50 | val train = split(1) 51 | 52 | val assembler = new VectorAssembler() 53 | .setInputCols(Array( 54 | "accelXHand", "accelYHand", "accelZHand", 55 | "accelXChest", "accelYChest", "accelZChest", 56 | "accelXAnkle", "accelYAnkle", "accelZAnkle")) 57 | .setOutputCol("vectors") 58 | val normalizer = new Normalizer() 59 | .setInputCol(assembler.getOutputCol) 60 | .setOutputCol("features") 61 | val regressor = new RandomForestRegressor() 62 | 63 | val pipeline = new Pipeline() 64 | .setStages(Array(assembler, normalizer, regressor)) 65 | val pMap = ParamMap(normalizer.p -> 1.0) 66 | val model = pipeline.fit(train, pMap) 67 | val prediction = model.transform(test) 68 | prediction.show() 69 | } 70 | }) 71 | 72 | ssc.start() 73 | ssc.awaitTermination() 74 | } 75 | 76 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap3/src/main/scala/org/apress/prospark/L3-DStreamWindowAndAction.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext } 6 | 
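/* The Hadoop Writable and input/output-format imports below back the fileStream source and the saveAsHadoopFiles / saveAsNewAPIHadoopFiles sinks used further down in RedditWindowAndActionApp. */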
import org.apache.hadoop.io.{ Text, LongWritable, IntWritable } 7 | import org.apache.hadoop.fs.Path 8 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat 9 | import org.apache.spark.streaming.dstream.DStream 10 | import org.apache.hadoop.mapred.TextOutputFormat 11 | import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat } 12 | import org.apache.spark.streaming.dstream.PairDStreamFunctions 13 | import org.apache.log4j.LogManager 14 | import org.json4s._ 15 | import org.json4s.native.JsonMethods._ 16 | import java.text.SimpleDateFormat 17 | import java.util.Date 18 | import org.apache.spark.HashPartitioner 19 | 20 | object RedditWindowAndActionApp { 21 | def main(args: Array[String]) { 22 | if (args.length != 2) { 23 | System.err.println( 24 | "Usage: RedditWindowAndActionApp ") 25 | System.exit(1) 26 | } 27 | val Seq(appName, inputPath) = args.toSeq 28 | val LOG = LogManager.getLogger(this.getClass) 29 | 30 | val conf = new SparkConf() 31 | .setAppName(appName) 32 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 33 | 34 | val ssc = new StreamingContext(conf, Seconds(1)) 35 | LOG.info("Started at %d".format(ssc.sparkContext.startTime)) 36 | 37 | val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) 38 | 39 | val checkpointPath = "/tmp" 40 | ssc.checkpoint(checkpointPath) 41 | val updateFunc = (values: Seq[Int], state: Option[Int]) => { 42 | val currentCount = values.sum 43 | val previousCount = state.getOrElse(0) 44 | Some(currentCount + previousCount) 45 | } 46 | val keyedBySubredditState = comments.map(rec => (((parse(rec)) \ "subreddit").values.toString, 1)) 47 | val globalCount = keyedBySubredditState.updateStateByKey(updateFunc) 48 | .map(r => (r._2, r._1)) 49 | .transform(rdd => rdd.sortByKey(ascending = false)) 50 | 51 | val distinctSubreddits = comments.map(rec => ((parse(rec)) \ "subreddit").values.toString) 52 | val windowedRecs = distinctSubreddits.window(Seconds(5), Seconds(5)) 53 | val windowedCounts = windowedRecs.countByValue() 54 | 55 | windowedCounts.print(10) 56 | windowedCounts.saveAsObjectFiles("subreddit", "obj") 57 | windowedCounts.saveAsTextFiles("subreddit", "txt") 58 | 59 | globalCount.saveAsHadoopFiles("subreddit", "hadoop", 60 | classOf[IntWritable], classOf[Text], classOf[TextOutputFormat[IntWritable, Text]]) 61 | globalCount.saveAsNewAPIHadoopFiles("subreddit", "newhadoop", 62 | classOf[IntWritable], classOf[Text], classOf[NewTextOutputFormat[IntWritable, Text]]) 63 | comments.foreachRDD(rdd => { 64 | LOG.info("RDD: %s, Count: %d".format(rdd.id, rdd.count())) 65 | }) 66 | 67 | ssc.start() 68 | ssc.awaitTermination() 69 | 70 | } 71 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap9/src/main/scala/org/apress/prospark/T9-4DataTypes.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.linalg.Matrices 6 | import org.apache.spark.mllib.linalg.Vectors 7 | import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix 8 | import org.apache.spark.mllib.linalg.distributed.IndexedRow 9 | import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix 10 | import org.apache.spark.mllib.linalg.distributed.MatrixEntry 11 | import org.apache.spark.mllib.linalg.distributed.RowMatrix 
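/* RowMatrix, IndexedRowMatrix and CoordinateMatrix imported above are MLlib's distributed matrix types, built from RDDs inside the foreachRDD blocks below; LabeledPoint, imported next, pairs a label with a local feature vector. */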
12 | import org.apache.spark.mllib.regression.LabeledPoint 13 | import org.apache.spark.streaming.Seconds 14 | import org.apache.spark.streaming.StreamingContext 15 | 16 | object DataTypesApp { 17 | 18 | def main(args: Array[String]) { 19 | if (args.length != 4) { 20 | System.err.println( 21 | "Usage: DataTypesApp ") 22 | System.exit(1) 23 | } 24 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 25 | 26 | val conf = new SparkConf() 27 | .setAppName(appName) 28 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 29 | 30 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 31 | 32 | val substream = ssc.socketTextStream(hostname, port.toInt) 33 | .filter(!_.contains("NaN")) 34 | .map(_.split(" ")) 35 | .filter(f => f(1) != "0") 36 | .map(f => f.map(f => f.toDouble)) 37 | 38 | val denseV = substream.map(f => Vectors.dense(f.slice(1, 5))) 39 | denseV.print() 40 | val sparseV = substream.map(f => f.slice(1, 5).toList).map(f => f.zipWithIndex.map { case (s, i) => (i, s) }) 41 | .map(f => f.filter(v => v._2 != 0)).map(l => Vectors.sparse(l.size, l)) 42 | sparseV.print() 43 | val labeledP = substream.map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5)))) 44 | labeledP.print() 45 | val denseM = substream.map(f => Matrices.dense(3, 16, f.slice(3, 19) ++ f.slice(20, 36) ++ f.slice(37, 53))) 46 | denseM.print() 47 | denseV.foreachRDD(rdd => { 48 | val rowM = new RowMatrix(rdd) 49 | println(rowM) 50 | }) 51 | denseV.foreachRDD(rdd => { 52 | val iRdd = rdd.zipWithIndex.map(v => new IndexedRow(v._2, v._1)) 53 | val iRowM = new IndexedRowMatrix(iRdd) 54 | println(iRowM) 55 | }) 56 | substream.foreachRDD(rdd => { 57 | val entries = rdd.zipWithIndex.flatMap(v => List(3, 20, 37).zipWithIndex.map(i => (i._2.toLong, v._2, v._1.slice(i._1, i._1 + 16).toList))) 58 | .map(v => v._3.map(d => new MatrixEntry(v._1, v._2, d))).flatMap(x => x) 59 | val cRowM = new CoordinateMatrix(entries) 60 | println(cRowM) 61 | }) 62 | substream.foreachRDD(rdd => { 63 | val entries = rdd.zipWithIndex.flatMap(v => List(3, 20, 37).zipWithIndex.map(i => (i._2.toLong, v._2, v._1.slice(i._1, i._1 + 16).toList))) 64 | .map(v => v._3.map(d => new MatrixEntry(v._1, v._2, d))).flatMap(x => x) 65 | val blockM = new CoordinateMatrix(entries).toBlockMatrix 66 | println(blockM) 67 | }) 68 | 69 | ssc.start() 70 | ssc.awaitTermination() 71 | } 72 | 73 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap6/src/main/scala/org/apress/prospark/L6-26Redis.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.collection.JavaConversions.asScalaBuffer 4 | import scala.collection.JavaConversions.mutableMapAsJavaMap 5 | import scala.collection.mutable 6 | 7 | import org.apache.spark.SparkConf 8 | import org.apache.spark.SparkContext 9 | import org.apache.spark.streaming.Seconds 10 | import org.apache.spark.streaming.StreamingContext 11 | import org.json4s.DefaultFormats 12 | import org.json4s.jvalue2extractable 13 | import org.json4s.jvalue2monadic 14 | import org.json4s.native.JsonMethods.parse 15 | import org.json4s.string2JsonInput 16 | 17 | import redis.clients.jedis.Jedis 18 | 19 | object StatefulRedisApp { 20 | 21 | def main(args: Array[String]) { 22 | if (args.length != 3) { 23 | System.err.println( 24 | "Usage: StatefulRedisApp ") 25 | System.exit(1) 26 | } 27 | 28 | val Seq(appName, checkpointDir, hostname) = args.toSeq 29 | 30 | val conf = new SparkConf() 31 | 
.setAppName(appName) 32 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 33 | 34 | val batchInterval = 10 35 | 36 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 37 | 38 | HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 39 | interval = batchInterval) 40 | .flatMap(rec => { 41 | implicit val formats = DefaultFormats 42 | val query = parse(rec) \ "query" 43 | ((query \ "results" \ "quote").children) 44 | .map(rec => ((rec \ "symbol").extract[String], ((rec \ "LastTradePriceOnly").extract[String].toFloat, (rec \ "Volume").extract[String].toLong))) 45 | }) 46 | .foreachRDD(rdd => { 47 | rdd.foreachPartition({ part => 48 | val jedis = new Jedis(hostname) 49 | part.foreach(f => { 50 | val prev = jedis.hmget(f._1, "min", "max", "count") 51 | if (prev(0) == null) { 52 | jedis.hmset(f._1, mutable.HashMap("min" -> Long.MaxValue.toString, "max" -> Long.MinValue.toString, "count" -> 0.toString)) 53 | } else { 54 | val prevLong = prev.toList.map(v => v.toLong) 55 | var newCount = prevLong(2) 56 | val newPrice = f._2._1 57 | val newVolume = f._2._2 58 | if (newPrice > 500.0) { 59 | newCount += 1 60 | } 61 | val newMin = if (newVolume < prevLong(0)) newVolume else prevLong(0) 62 | val newMax = if (newVolume > prevLong(1)) newVolume else prevLong(1) 63 | jedis.hmset(f._1, mutable.HashMap("min" -> newMin.toString, "max" -> newMax.toString, "count" -> newCount.toString)) 64 | } 65 | }) 66 | jedis.close() 67 | }) 68 | 69 | val jedis = new Jedis(hostname) 70 | jedis.scan(0).getResult.foreach(sym => println("Symbol: %s, Stats: %s".format(sym, jedis.hmget(sym, "min", "max", "count").toString))) 71 | jedis.close() 72 | }) 73 | 74 | ssc.start() 75 | ssc.awaitTermination() 76 | } 77 | } 78 | 79 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap9/src/main/java/org/apress/prospark/AbstractDriver.java: -------------------------------------------------------------------------------- 1 | package org.apress.prospark; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileInputStream; 6 | import java.io.IOException; 7 | import java.io.InputStreamReader; 8 | import java.util.Enumeration; 9 | import java.util.zip.GZIPInputStream; 10 | import java.util.zip.ZipEntry; 11 | import java.util.zip.ZipFile; 12 | 13 | import org.apache.commons.io.FilenameUtils; 14 | import org.apache.log4j.LogManager; 15 | import org.apache.log4j.Logger; 16 | 17 | public abstract class AbstractDriver { 18 | 19 | private static final Logger LOG = LogManager.getLogger(AbstractDriver.class); 20 | 21 | private String path; 22 | 23 | public AbstractDriver(String path) { 24 | this.path = path; 25 | } 26 | 27 | public abstract void init() throws Exception; 28 | 29 | public abstract void close() throws Exception; 30 | 31 | public abstract void sendRecord(String record) throws Exception; 32 | 33 | public void execute() throws Exception { 34 | 35 | try { 36 | init(); 37 | File dirPath = new File(path); 38 | if (dirPath.isDirectory()) { 39 | File[] files = new File(path).listFiles(); 40 | for (File f : files) { 41 | String ext = FilenameUtils.getExtension(f.getPath()); 42 | if (ext.equals("zip")) { 43 | LOG.info(String.format("Feeding zipped file %s", f.getName())); 44 | ZipFile zFile = null; 45 | try { 46 | 
zFile = new ZipFile(f); 47 | Enumeration zEntries = zFile.entries(); 48 | 49 | while (zEntries.hasMoreElements()) { 50 | ZipEntry zEntry = zEntries.nextElement(); 51 | LOG.info(String.format("Feeding file %s", zEntry.getName())); 52 | try (BufferedReader br = new BufferedReader( 53 | new InputStreamReader(zFile.getInputStream(zEntry)))) { 54 | // skip header 55 | br.readLine(); 56 | String line; 57 | while ((line = br.readLine()) != null) { 58 | sendRecord(line); 59 | } 60 | } 61 | } 62 | } catch (IOException e) { 63 | LOG.error(e.getMessage()); 64 | } finally { 65 | if (zFile != null) { 66 | try { 67 | zFile.close(); 68 | } catch (IOException e) { 69 | LOG.error(e.getMessage()); 70 | } 71 | } 72 | } 73 | } else if (ext.equals("gz")) { 74 | LOG.info(String.format("Feeding file %s", f.getName())); 75 | try (BufferedReader br = new BufferedReader( 76 | new InputStreamReader(new GZIPInputStream(new FileInputStream(f))))) { 77 | // skip header 78 | br.readLine(); 79 | String line; 80 | while ((line = br.readLine()) != null) { 81 | sendRecord(line); 82 | } 83 | } 84 | } else if (ext.equals("dat")) { 85 | LOG.info(String.format("Feeding dat file %s", f.getName())); 86 | try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f)))) { 87 | String line; 88 | while ((line = br.readLine()) != null) { 89 | sendRecord(line); 90 | } 91 | } 92 | } else { 93 | LOG.warn("Unsupported file type: " + f.getName()); 94 | } 95 | } 96 | } else { 97 | LOG.error(String.format("Path %s is not a directory", path)); 98 | } 99 | } finally { 100 | close(); 101 | } 102 | } 103 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap8/src/main/scala/org/apress/prospark/T8-5-L8-30-34DataFrameExamplesActions.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.sql.SaveMode 9 | import org.apache.spark.sql.functions.desc 10 | import org.apache.spark.sql.hive.HiveContext 11 | import org.apache.spark.streaming.Seconds 12 | import org.apache.spark.streaming.StreamingContext 13 | import org.apress.prospark.CdrDataframeExamplesActionsApp.Cdr 14 | import org.json4s.DefaultFormats 15 | 16 | object CdrDataframeExamplesActionsApp { 17 | 18 | case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, 19 | smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, 20 | callOutActivity: Float, internetTrafficActivity: Float) 21 | 22 | def main(args: Array[String]) { 23 | if (args.length != 4) { 24 | System.err.println( 25 | "Usage: CdrDataframeExamplesActionsApp ") 26 | System.exit(1) 27 | } 28 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 29 | 30 | val conf = new SparkConf() 31 | .setAppName(appName) 32 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 33 | 34 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 35 | 36 | val cl = Thread.currentThread().getContextClassLoader() 37 | val hiveC = new HiveContext(ssc.sparkContext) 38 | Thread.currentThread().setContextClassLoader(cl) 39 | import hiveC.implicits._ 40 | implicit val formats = DefaultFormats 41 | 42 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 43 | .map(_.split("\\t", -1)) 44 | .foreachRDD(rdd => { 45 | val cdrs = seqToCdr(rdd).toDF() 46 | 47 | val counts = 
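/* A DataFrame of per-country CDR counts in descending order; the action examples below (show, head, take, first, count, collect, collectAsList) and the write examples all reuse it, while describe() runs on the full cdrs frame. */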
cdrs.groupBy("countryCode").count().orderBy(desc("count")) 48 | counts.show(5) 49 | counts.show() 50 | println("head(5): " + counts.head(5)) 51 | println("take(5): " + counts.take(5)) 52 | println("head(): " + counts.head()) 53 | println("first(5): " + counts.first()) 54 | println("count(): " + counts.count()) 55 | println("collect(): " + counts.collect()) 56 | println("collectAsList(): " + counts.collectAsList()) 57 | println("describe(): " + cdrs.describe("smsInActivity", "smsOutActivity", "callInActivity", "callOutActivity", "internetTrafficActivity").show()) 58 | counts.write.format("parquet").save("/tmp/parquent" + rdd.id) 59 | counts.write.format("json").save("/tmp/json" + rdd.id) 60 | counts.write.parquet("/tmp/parquent2" + rdd.id) 61 | counts.write.json("/tmp/json2" + rdd.id) 62 | counts.write.saveAsTable("count_table") 63 | cdrs.groupBy("countryCode").count().orderBy(desc("count")).write.mode(SaveMode.Append).save("/tmp/counts") 64 | val prop: java.util.Properties = new java.util.Properties() 65 | counts.write.jdbc("jdbc:mysql://hostname:port/cdrsdb", "count_table", prop) 66 | }) 67 | 68 | ssc.start() 69 | ssc.awaitTermination() 70 | } 71 | 72 | def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { 73 | rdd.map(c => c.map(f => f match { 74 | case x if x.isEmpty() => "0" 75 | case x => x 76 | })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, 77 | c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) 78 | } 79 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap10/src/main/java/org/apress/prospark/AbstractDriver.java: -------------------------------------------------------------------------------- 1 | package org.apress.prospark; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileInputStream; 6 | import java.io.IOException; 7 | import java.io.InputStreamReader; 8 | import java.util.Enumeration; 9 | import java.util.zip.GZIPInputStream; 10 | import java.util.zip.ZipEntry; 11 | import java.util.zip.ZipFile; 12 | 13 | import org.apache.commons.io.FilenameUtils; 14 | import org.apache.log4j.LogManager; 15 | import org.apache.log4j.Logger; 16 | 17 | public abstract class AbstractDriver { 18 | 19 | private static final Logger LOG = LogManager.getLogger(AbstractDriver.class); 20 | 21 | private String path; 22 | 23 | public AbstractDriver(String path) { 24 | this.path = path; 25 | } 26 | 27 | public abstract void init() throws Exception; 28 | 29 | public abstract void close() throws Exception; 30 | 31 | public abstract void sendRecord(String record) throws Exception; 32 | 33 | public void execute() throws Exception { 34 | 35 | try { 36 | init(); 37 | File dirPath = new File(path); 38 | if (dirPath.isDirectory()) { 39 | File[] files = new File(path).listFiles(); 40 | for (File f : files) { 41 | String ext = FilenameUtils.getExtension(f.getPath()); 42 | if (ext.equals("zip")) { 43 | LOG.info(String.format("Feeding zipped file %s", f.getName())); 44 | ZipFile zFile = null; 45 | try { 46 | zFile = new ZipFile(f); 47 | Enumeration zEntries = zFile.entries(); 48 | 49 | while (zEntries.hasMoreElements()) { 50 | ZipEntry zEntry = zEntries.nextElement(); 51 | LOG.info(String.format("Feeding file %s", zEntry.getName())); 52 | try (BufferedReader br = new BufferedReader( 53 | new InputStreamReader(zFile.getInputStream(zEntry)))) { 54 | // skip header 55 | br.readLine(); 56 | String line; 57 | while ((line = br.readLine()) != null) { 58 | sendRecord(line); 59 | } 60 | } 61 | } 62 | } 
catch (IOException e) { 63 | LOG.error(e.getMessage()); 64 | } finally { 65 | if (zFile != null) { 66 | try { 67 | zFile.close(); 68 | } catch (IOException e) { 69 | LOG.error(e.getMessage()); 70 | } 71 | } 72 | } 73 | } else if (ext.equals("gz")) { 74 | LOG.info(String.format("Feeding file %s", f.getName())); 75 | try (BufferedReader br = new BufferedReader( 76 | new InputStreamReader(new GZIPInputStream(new FileInputStream(f))))) { 77 | // skip header 78 | br.readLine(); 79 | String line; 80 | while ((line = br.readLine()) != null) { 81 | sendRecord(line); 82 | } 83 | } 84 | } else if (ext.equals("dat") || ext.equals("json")) { 85 | LOG.info(String.format("Feeding dat file %s", f.getName())); 86 | try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f)))) { 87 | String line; 88 | while ((line = br.readLine()) != null) { 89 | sendRecord(line); 90 | } 91 | } 92 | } else { 93 | LOG.warn("Unsupported file type: " + f.getName()); 94 | } 95 | } 96 | } else { 97 | LOG.error(String.format("Path %s is not a directory", path)); 98 | } 99 | } finally { 100 | close(); 101 | } 102 | } 103 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap6/src/main/scala/org/apress/prospark/L6-18Cassandra.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import java.nio.charset.StandardCharsets 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.streaming.Seconds 7 | import org.apache.spark.streaming.StreamingContext 8 | import org.json4s.DefaultFormats 9 | import org.json4s.JField 10 | import org.json4s.JsonAST.JObject 11 | import org.json4s.jvalue2extractable 12 | import org.json4s.jvalue2monadic 13 | import org.json4s.native.JsonMethods.parse 14 | import org.json4s.string2JsonInput 15 | import org.apache.hadoop.conf.Configuration 16 | import org.apache.hadoop.io.Text 17 | import java.nio.ByteBuffer 18 | import org.apache.cassandra.hadoop.ColumnFamilyOutputFormat 19 | import org.apache.cassandra.hadoop.ConfigHelper 20 | import org.apache.cassandra.thrift.ColumnOrSuperColumn 21 | import org.apache.cassandra.thrift.Column 22 | import org.apache.cassandra.utils.ByteBufferUtil 23 | import org.apache.cassandra.thrift.Mutation 24 | import java.util.Arrays 25 | 26 | object CassandraSinkApp { 27 | 28 | def main(args: Array[String]) { 29 | if (args.length != 6) { 30 | System.err.println( 31 | "Usage: CassandraSinkApp ") 32 | System.exit(1) 33 | } 34 | 35 | val Seq(appName, cassandraHost, cassandraPort, keyspace, columnFamilyName, columnName) = args.toSeq 36 | 37 | val conf = new SparkConf() 38 | .setAppName(appName) 39 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 40 | 41 | val batchInterval = 10 42 | val windowSize = 20 43 | val slideInterval = 10 44 | 45 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 46 | 47 | HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 48 | interval = batchInterval) 49 | .flatMap(rec => { 50 | implicit val formats = DefaultFormats 51 | val query = parse(rec) \ "query" 52 | ((query \ "results" \ "quote").children) 53 | .map(rec => ((rec \ "symbol").extract[String], (rec \ 
"LastTradePriceOnly").extract[String].toFloat)) 54 | }) 55 | .reduceByKeyAndWindow((x: Float, y: Float) => (x + y), Seconds(windowSize), Seconds(slideInterval)) 56 | .foreachRDD(rdd => { 57 | val jobConf = new Configuration() 58 | ConfigHelper.setOutputRpcPort(jobConf, cassandraPort) 59 | ConfigHelper.setOutputInitialAddress(jobConf, cassandraHost) 60 | ConfigHelper.setOutputColumnFamily(jobConf, keyspace, columnFamilyName) 61 | ConfigHelper.setOutputPartitioner(jobConf, "Murmur3Partitioner") 62 | rdd.map(rec => { 63 | val c = new Column() 64 | c.setName(ByteBufferUtil.bytes(columnName)) 65 | c.setValue(ByteBufferUtil.bytes(rec._2 / (windowSize / batchInterval))) 66 | c.setTimestamp(System.currentTimeMillis) 67 | val m = new Mutation() 68 | m.setColumn_or_supercolumn(new ColumnOrSuperColumn()) 69 | m.column_or_supercolumn.setColumn(c) 70 | (ByteBufferUtil.bytes(rec._1), Arrays.asList(m)) 71 | }).saveAsNewAPIHadoopFile(keyspace, classOf[ByteBuffer], classOf[List[Mutation]], classOf[ColumnFamilyOutputFormat], jobConf) 72 | }) 73 | 74 | ssc.start() 75 | ssc.awaitTermination() 76 | } 77 | } 78 | 79 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap9/src/main/scala/org/apress/prospark/L9-17MLCrossValidation.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.ml.Pipeline 8 | import org.apache.spark.ml.evaluation.RegressionEvaluator 9 | import org.apache.spark.ml.feature.Normalizer 10 | import org.apache.spark.ml.feature.VectorAssembler 11 | import org.apache.spark.ml.regression.RandomForestRegressor 12 | import org.apache.spark.ml.tuning.CrossValidator 13 | import org.apache.spark.ml.tuning.ParamGridBuilder 14 | import org.apache.spark.sql.SQLContext 15 | import org.apache.spark.streaming.Seconds 16 | import org.apache.spark.streaming.StreamingContext 17 | 18 | object MLCrossValidationApp { 19 | 20 | case class Activity(label: Double, 21 | accelXHand: Double, accelYHand: Double, accelZHand: Double, 22 | accelXChest: Double, accelYChest: Double, accelZChest: Double, 23 | accelXAnkle: Double, accelYAnkle: Double, accelZAnkle: Double) 24 | 25 | def main(args: Array[String]) { 26 | if (args.length != 4) { 27 | System.err.println( 28 | "Usage: MLCrossValidationApp ") 29 | System.exit(1) 30 | } 31 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 32 | 33 | val conf = new SparkConf() 34 | .setAppName(appName) 35 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 36 | 37 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 38 | 39 | val sqlC = new SQLContext(ssc.sparkContext) 40 | import sqlC.implicits._ 41 | 42 | val substream = ssc.socketTextStream(hostname, port.toInt) 43 | .filter(!_.contains("NaN")) 44 | .map(_.split(" ")) 45 | .filter(f => f(1) == "4" || f(1) == "5") 46 | .map(f => Array(f(1), f(4), f(5), f(6), f(20), f(21), f(22), f(36), f(37), f(38))) 47 | .map(f => f.map(v => v.toDouble)) 48 | .foreachRDD(rdd => { 49 | if (!rdd.isEmpty) { 50 | val accelerometer = rdd.map(x => Activity(x(0), x(1), x(2), x(3), x(4), x(5), x(6), x(7), x(8), x(9))).toDF() 51 | val split = accelerometer.randomSplit(Array(0.3, 0.7)) 52 | val test = split(0) 53 | val train = split(1) 54 | 55 | val assembler = new VectorAssembler() 56 | .setInputCols(Array( 57 | "accelXHand", "accelYHand", 
"accelZHand", 58 | "accelXChest", "accelYChest", "accelZChest", 59 | "accelXAnkle", "accelYAnkle", "accelZAnkle")) 60 | .setOutputCol("vectors") 61 | val normalizer = new Normalizer() 62 | .setInputCol(assembler.getOutputCol) 63 | .setOutputCol("features") 64 | val regressor = new RandomForestRegressor() 65 | 66 | val pipeline = new Pipeline() 67 | .setStages(Array(assembler, normalizer, regressor)) 68 | 69 | val validator = new CrossValidator() 70 | .setEstimator(pipeline) 71 | .setEvaluator(new RegressionEvaluator) 72 | val pGrid = new ParamGridBuilder() 73 | .addGrid(normalizer.p, Array(1.0, 5.0, 10.0)) 74 | .addGrid(regressor.numTrees, Array(10, 50, 100)) 75 | .build() 76 | validator.setEstimatorParamMaps(pGrid) 77 | validator.setNumFolds(5) 78 | 79 | val bestModel = validator.fit(train) 80 | val prediction = bestModel.transform(test) 81 | prediction.show() 82 | } 83 | }) 84 | 85 | ssc.start() 86 | ssc.awaitTermination() 87 | } 88 | 89 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap8/src/main/scala/org/apress/prospark/L8-14-27DataFrameExamples.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.sql.SQLContext 9 | import org.apache.spark.sql.functions._ 10 | import org.apache.spark.streaming.Seconds 11 | import org.apache.spark.streaming.StreamingContext 12 | 13 | object CdrDataframeExamplesApp { 14 | 15 | case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, 16 | smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, 17 | callOutActivity: Float, internetTrafficActivity: Float) 18 | 19 | def main(args: Array[String]) { 20 | if (args.length != 4) { 21 | System.err.println( 22 | "Usage: CdrDataframeExamplesApp ") 23 | System.exit(1) 24 | } 25 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 26 | 27 | val conf = new SparkConf() 28 | .setAppName(appName) 29 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 30 | 31 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 32 | 33 | val sqlC = new SQLContext(ssc.sparkContext) 34 | import sqlC.implicits._ 35 | 36 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 37 | .map(_.split("\\t", -1)) 38 | .foreachRDD(rdd => { 39 | val cdrs = seqToCdr(rdd).toDF() 40 | 41 | cdrs.select("squareId", "timeInterval", "countryCode").show() 42 | cdrs.select($"squareId", $"timeInterval", $"countryCode").show() 43 | cdrs.filter("squareId = 5").show() 44 | cdrs.drop("countryCode").show() 45 | cdrs.select($"squareId", $"timeInterval", $"countryCode").where($"squareId" === 5).show() 46 | cdrs.limit(5).show() 47 | cdrs.groupBy("squareId").count().show() 48 | cdrs.groupBy("countryCode").avg("internetTrafficActivity").show() 49 | cdrs.groupBy("countryCode").max("callOutActivity").show() 50 | cdrs.groupBy("countryCode").min("callOutActivity").show() 51 | cdrs.groupBy("squareId").sum("internetTrafficActivity").show() 52 | cdrs.groupBy("squareId").agg(sum("callOutActivity"), sum("callInActivity"), sum("smsOutActivity"), sum("smsInActivity"), sum("internetTrafficActivity")).show() 53 | cdrs.groupBy("countryCode").sum("internetTrafficActivity").orderBy(desc("SUM(internetTrafficActivity)")).show() 54 | cdrs.agg(sum("callOutActivity"), sum("callInActivity"), sum("smsOutActivity"), 
sum("smsInActivity"), sum("internetTrafficActivity")).show() 55 | cdrs.rollup("squareId", "countryCode").count().orderBy(desc("squareId"), desc("countryCode")).rdd.saveAsTextFile("/tmp/rollup" + rdd.hashCode()) 56 | cdrs.cube("squareId", "countryCode").count().orderBy(desc("squareId"), desc("countryCode")).rdd.saveAsTextFile("/tmp/cube" + rdd.hashCode()) 57 | cdrs.dropDuplicates(Array("callOutActivity", "callInActivity")).show() 58 | cdrs.select("squareId", "countryCode", "internetTrafficActivity").distinct.show() 59 | cdrs.withColumn("endTime", cdrs("timeInterval") + 600000).show() 60 | cdrs.sample(true, 0.01).show() 61 | }) 62 | 63 | ssc.start() 64 | ssc.awaitTermination() 65 | } 66 | 67 | def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { 68 | rdd.map(c => c.map(f => f match { 69 | case x if x.isEmpty() => "0" 70 | case x => x 71 | })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, 72 | c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) 73 | } 74 | } -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap6/src/main/scala/org/apress/prospark/L6-24Accumulators.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.collection.mutable 4 | 5 | import org.apache.spark.AccumulableParam 6 | import org.apache.spark.SparkConf 7 | import org.apache.spark.SparkContext 8 | import org.apache.spark.streaming.Seconds 9 | import org.apache.spark.streaming.StreamingContext 10 | import org.json4s.DefaultFormats 11 | import org.json4s.jvalue2extractable 12 | import org.json4s.jvalue2monadic 13 | import org.json4s.native.JsonMethods.parse 14 | import org.json4s.string2JsonInput 15 | 16 | object StatefulAccumulatorsApp { 17 | 18 | object StockAccum extends AccumulableParam[mutable.HashMap[String, (Long, Long, Long)], (String, (Float, Long))] { 19 | def zero(t: mutable.HashMap[String, (Long, Long, Long)]): mutable.HashMap[String, (Long, Long, Long)] = { 20 | new mutable.HashMap[String, (Long, Long, Long)]() 21 | } 22 | def addInPlace(t1: mutable.HashMap[String, (Long, Long, Long)], t2: mutable.HashMap[String, (Long, Long, Long)]): mutable.HashMap[String, (Long, Long, Long)] = { 23 | t1 ++ t2.map { 24 | case (k, v2) => (k -> { 25 | val v1 = t1.getOrElse(k, (Long.MaxValue, Long.MinValue, 0L)) 26 | val newMin = if (v2._1 < v1._1) v2._1 else v1._1 27 | val newMax = if (v2._2 > v1._2) v2._2 else v1._2 28 | (newMin, newMax, v1._3 + v2._3) 29 | }) 30 | } 31 | } 32 | def addAccumulator(t1: mutable.HashMap[String, (Long, Long, Long)], t2: (String, (Float, Long))): mutable.HashMap[String, (Long, Long, Long)] = { 33 | val prevStats = t1.getOrElse(t2._1, (Long.MaxValue, Long.MinValue, 0L)) 34 | val newVals = t2._2 35 | var newCount = prevStats._3 36 | if (newVals._1 > 500.0) { 37 | newCount += 1 38 | } 39 | val newMin = if (newVals._2 < prevStats._1) newVals._2 else prevStats._1 40 | val newMax = if (newVals._2 > prevStats._2) newVals._2 else prevStats._2 41 | t1 += t2._1 -> (newMin, newMax, newCount) 42 | } 43 | } 44 | 45 | def main(args: Array[String]) { 46 | if (args.length != 2) { 47 | System.err.println( 48 | "Usage: StatefulAccumulatorsApp ") 49 | System.exit(1) 50 | } 51 | 52 | val Seq(appName, checkpointDir) = args.toSeq 53 | 54 | val conf = new SparkConf() 55 | .setAppName(appName) 56 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 57 | 58 | val batchInterval = 10 59 | 60 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 61 | 
62 | val stateAccum = ssc.sparkContext.accumulable(new mutable.HashMap[String, (Long, Long, Long)]())(StockAccum) 63 | 64 | HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 65 | interval = batchInterval) 66 | .flatMap(rec => { 67 | implicit val formats = DefaultFormats 68 | val query = parse(rec) \ "query" 69 | ((query \ "results" \ "quote").children) 70 | .map(rec => ((rec \ "symbol").extract[String], ((rec \ "LastTradePriceOnly").extract[String].toFloat, (rec \ "Volume").extract[String].toLong))) 71 | }) 72 | .foreachRDD(rdd => { 73 | rdd.foreach({ stock => 74 | stateAccum += (stock._1, (stock._2._1, stock._2._2)) 75 | }) 76 | for ((sym, stats) <- stateAccum.value.to) printf("Symbol: %s, Stats: %s\n", sym, stats) 77 | }) 78 | 79 | ssc.start() 80 | ssc.awaitTermination() 81 | } 82 | } 83 | 84 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap6/src/main/scala/org/apress/prospark/L6-10LazyStatic.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import java.nio.charset.StandardCharsets 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.streaming.Seconds 7 | import org.apache.spark.streaming.StreamingContext 8 | import org.eclipse.paho.client.mqttv3.MqttClient 9 | import org.eclipse.paho.client.mqttv3.MqttMessage 10 | import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence 11 | import org.json4s.DefaultFormats 12 | import org.json4s.JField 13 | import org.json4s.JsonAST.JObject 14 | import org.json4s.jvalue2extractable 15 | import org.json4s.jvalue2monadic 16 | import org.json4s.native.JsonMethods.parse 17 | import org.json4s.string2JsonInput 18 | import org.apache.commons.pool2.PooledObject 19 | import org.apache.commons.pool2.BasePooledObjectFactory 20 | import org.apache.commons.pool2.impl.DefaultPooledObject 21 | import org.apache.commons.pool2.impl.GenericObjectPool 22 | import org.apache.commons.pool2.ObjectPool 23 | 24 | object MqttSinkAppE { 25 | 26 | def main(args: Array[String]) { 27 | if (args.length != 3) { 28 | System.err.println( 29 | "Usage: MqttSinkApp ") 30 | System.exit(1) 31 | } 32 | 33 | val Seq(appName, outputBrokerUrl, topic) = args.toSeq 34 | 35 | val conf = new SparkConf() 36 | .setAppName(appName) 37 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 38 | 39 | val batchInterval = 10 40 | 41 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 42 | 43 | HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 44 | interval = batchInterval) 45 | .flatMap(rec => { 46 | val query = parse(rec) \ "query" 47 | ((query \ "results" \ "quote").children).map(rec => JObject(JField("Timestamp", query \ "created")).merge(rec)) 48 | }) 49 | .map(rec => { 50 | implicit val formats = DefaultFormats 51 | rec.children.map(f => f.extract[String]) mkString "," 52 | }) 53 | .foreachRDD { rdd => 54 | rdd.foreachPartition { par => 55 | val mqttSink = MqttSinkPool().borrowObject() 56 | par.foreach(message => 
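/* every record in this partition is published through the single pooled MqttClient borrowed above; the client is handed back to the pool right after the loop */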
mqttSink.publish(topic, new MqttMessage(message.getBytes(StandardCharsets.UTF_8)))) 57 | MqttSinkPool().returnObject(mqttSink) 58 | } 59 | } 60 | 61 | ssc.start() 62 | ssc.awaitTermination() 63 | } 64 | } 65 | 66 | object MqttSinkPool { 67 | val poolSize = 8 68 | val brokerUrl = "tcp://localhost:1883" 69 | val mqttPool = new GenericObjectPool[MqttClient](new MqttClientFactory(brokerUrl)) 70 | mqttPool.setMaxTotal(poolSize) 71 | sys.addShutdownHook { 72 | mqttPool.close() 73 | } 74 | 75 | def apply(): GenericObjectPool[MqttClient] = { 76 | mqttPool 77 | } 78 | } 79 | 80 | class MqttClientFactory(brokerUrl: String) extends BasePooledObjectFactory[MqttClient] { 81 | override def create() = { 82 | val client = new MqttClient(brokerUrl, MqttClient.generateClientId(), new MemoryPersistence()) 83 | client.connect() 84 | client 85 | } 86 | override def wrap(client: MqttClient) = new DefaultPooledObject[MqttClient](client) 87 | override def validateObject(pObj: PooledObject[MqttClient]) = pObj.getObject.isConnected() 88 | override def destroyObject(pObj: PooledObject[MqttClient]) = { 89 | pObj.getObject.disconnect() 90 | pObj.getObject.close() 91 | } 92 | override def passivateObject(pObj: PooledObject[MqttClient]) = {} 93 | } 94 | -------------------------------------------------------------------------------- /prosparkstreaming-master/Chap10/src/main/java/org/apress/prospark/SocketDriver.java: -------------------------------------------------------------------------------- 1 | package org.apress.prospark; 2 | 3 | import java.io.IOException; 4 | import java.net.InetSocketAddress; 5 | import java.nio.ByteBuffer; 6 | import java.nio.channels.ServerSocketChannel; 7 | import java.nio.channels.SocketChannel; 8 | import java.nio.charset.StandardCharsets; 9 | import java.util.concurrent.ExecutionException; 10 | 11 | import org.apache.log4j.LogManager; 12 | import org.apache.log4j.Logger; 13 | 14 | public class SocketDriver extends AbstractDriver { 15 | 16 | private static final Logger LOG = LogManager.getLogger(SocketDriver.class); 17 | 18 | private String hostname; 19 | private int port; 20 | private SocketStream socketStream; 21 | 22 | public SocketDriver(String path, String hostname, int port) { 23 | super(path); 24 | this.hostname = hostname; 25 | this.port = port; 26 | } 27 | 28 | @Override 29 | public void init() throws Exception { 30 | socketStream = new SocketStream(hostname, port); 31 | LOG.info(String.format("Waiting for client to connect on port %d", port)); 32 | SocketChannel socketChan = socketStream.init(); 33 | LOG.info(String.format("Client %s connected on port %d", socketChan.getRemoteAddress(), port)); 34 | socketStream.kickOff(socketChan); 35 | socketStream.start(); 36 | } 37 | 38 | @Override 39 | public void close() throws IOException { 40 | socketStream.done(); 41 | if (socketStream != null) { 42 | socketStream.close(); 43 | } 44 | } 45 | 46 | @Override 47 | public void sendRecord(String record) throws Exception { 48 | socketStream.sendMsg(record + "\n"); 49 | } 50 | 51 | static class SocketStream extends Thread { 52 | 53 | private String hostname; 54 | private int port; 55 | private ServerSocketChannel server; 56 | private volatile boolean isDone = false; 57 | private SocketChannel socket = null; 58 | private long totalBytes; 59 | private long totalLines; 60 | 61 | public SocketStream(String hostname, int port) { 62 | this.hostname = hostname; 63 | this.port = port; 64 | totalBytes = 0; 65 | totalLines = 0; 66 | } 67 | 68 | public SocketChannel init() throws IOException { 69 | 
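/* Opens a server socket bound to hostname:port and blocks in accept() until a single client (typically the Spark socket receiver) connects; the caller then passes the accepted channel to kickOff(). */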
server = ServerSocketChannel.open(); 70 | server.bind(new InetSocketAddress(hostname, port)); 71 | LOG.info(String.format("Listening on %s", server.getLocalAddress())); 72 | return server.accept(); 73 | } 74 | 75 | public void kickOff(SocketChannel socket) { 76 | LOG.info("Kicking off data transfer"); 77 | this.socket = socket; 78 | } 79 | 80 | @Override 81 | public void run() { 82 | try { 83 | while (!isDone) { 84 | Thread.sleep(1000); 85 | } 86 | } catch (Exception e) { 87 | LOG.error(e); 88 | } 89 | } 90 | 91 | public void sendMsg(String msg) throws IOException, InterruptedException, ExecutionException { 92 | if (socket != null) { 93 | ByteBuffer buffer = ByteBuffer.wrap(msg.getBytes(StandardCharsets.UTF_8)); 94 | int bytesWritten = socket.write(buffer); 95 | totalBytes += bytesWritten; 96 | } else { 97 | throw new IOException("Client hasn't connected yet!"); 98 | } 99 | totalLines++; 100 | } 101 | 102 | public void done() { 103 | isDone = true; 104 | } 105 | 106 | public void close() throws IOException { 107 | if (socket != null) { 108 | socket.close(); 109 | socket = null; 110 | } 111 | LOG.info(String.format("SocketStream is closing after writing %d bytes and %d lines", totalBytes, 112 | totalLines)); 113 | } 114 | } 115 | 116 | public static void main(String[] args) throws Exception { 117 | 118 | if (args.length != 3) { 119 | System.err.println("Usage: SocketDriver "); 120 | System.exit(-1); 121 | } 122 | 123 | String path = args[0]; 124 | String hostname = args[1]; 125 | int port = Integer.parseInt(args[2]); 126 | 127 | SocketDriver driver = new SocketDriver(path, hostname, port); 128 | try { 129 | driver.execute(); 130 | } finally { 131 | driver.close(); 132 | } 133 | } 134 | } --------------------------------------------------------------------------------
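Most of the Chapter 6 sink examples above call HttpUtils.createStream(ssc, url, interval); its actual definition lives in HttpInputDStream.scala / HttpInputDStreamAsync.scala, which are part of the repository but not reproduced in this excerpt. Purely for orientation, the following is a minimal, hypothetical stand-in, not the book's implementation: the names HttpUtilsSketch and HttpPollingReceiver, the one-record-per-poll behaviour, and the MEMORY_ONLY storage level are all assumptions of this sketch. It wraps a custom Spark Streaming Receiver that fetches the URL once per interval and stores each response body as a single record.

import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver

import scala.io.Source

// Hypothetical stand-in for the repository's HttpUtils helper (sketch only).
object HttpUtilsSketch {
  def createStream(ssc: StreamingContext, url: String, interval: Int): ReceiverInputDStream[String] =
    ssc.receiverStream(new HttpPollingReceiver(url, interval))
}

// Polls the given URL once per interval and stores each response body as one record.
class HttpPollingReceiver(url: String, intervalSec: Int)
  extends Receiver[String](StorageLevel.MEMORY_ONLY) {

  override def onStart(): Unit = {
    new Thread("http-polling-receiver") {
      override def run(): Unit = {
        while (!isStopped()) {
          try {
            // Fetch the whole response body and hand it to Spark as a single record.
            val body = Source.fromURL(url, "UTF-8").mkString
            store(body)
          } catch {
            case e: Exception => reportError("HTTP poll failed", e)
          }
          Thread.sleep(intervalSec * 1000L)
        }
      }
    }.start()
  }

  // The polling thread exits on its own once isStopped() becomes true.
  override def onStop(): Unit = {}
}

A sink app could then be pointed at this sketch one-for-one, e.g. HttpUtilsSketch.createStream(ssc, url = ..., interval = batchInterval).flatMap(...), with the same url and interval arguments used in the listings above.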