├── .gitignore ├── Chap10 ├── project │ ├── assembly.sbt │ └── plugins.sbt ├── spark.sbt ├── src │ └── main │ │ ├── java │ │ └── org │ │ │ └── apress │ │ │ └── prospark │ │ │ ├── AbstractDriver.java │ │ │ └── SocketDriver.java │ │ ├── resources │ │ └── log4j.properties │ │ └── scala │ │ └── org │ │ └── apress │ │ └── prospark │ │ ├── L10-2DataProc.scala │ │ ├── L10-4LambdaDataproc.scala │ │ ├── L10-4LambdaLocal.scala │ │ └── L10-9Graph.scala └── yelp_pyspark.py ├── Chap2 ├── project │ ├── assembly.sbt │ └── plugins.sbt ├── spark.sbt └── src │ └── main │ └── scala │ └── org │ └── apress │ └── prospark │ ├── L2-1FirstApp.scala │ └── T2-6Accumulator.scala ├── Chap3 ├── project │ ├── assembly.sbt │ └── plugins.sbt ├── spark.sbt ├── src │ └── main │ │ └── scala │ │ └── org │ │ └── apress │ │ └── prospark │ │ ├── L3-1DStreams.scala │ │ ├── L3-DStreamAggregation.scala │ │ ├── L3-DStreamKeyValue.scala │ │ ├── L3-DStreamMapping.scala │ │ ├── L3-DStreamVariation.scala │ │ └── L3-DStreamWindowAndAction.scala └── touch_files_window.sh ├── Chap4 ├── project │ ├── assembly.sbt │ └── plugins.sbt ├── spark.sbt └── src │ └── main │ └── scala │ └── org │ └── apress │ └── prospark │ ├── L4-1Voyager.scala │ ├── L4-3ProtonFlux.scala │ └── L4-4Kryo.scala ├── Chap5 ├── flumeConf │ ├── flumePull.conf │ ├── flumePush.conf │ ├── flumeTest.conf │ └── log4j.properties ├── project │ ├── assembly.sbt │ └── plugins.sbt ├── spark.sbt └── src │ └── main │ ├── java │ └── org │ │ └── apress │ │ └── prospark │ │ ├── AbstractDriver.java │ │ ├── KafkaDriver.java │ │ ├── MqttDriver.java │ │ └── SocketDriver.java │ ├── resources │ └── log4j.properties │ └── scala │ └── org │ └── apress │ └── prospark │ ├── HttpInputDStream.scala │ ├── HttpInputDStreamAsync.scala │ ├── L5-11FlumePull.scala │ ├── L5-11FlumePush.scala │ ├── L5-13Kafka.scala │ ├── L5-14KafkaCustomConf.scala │ ├── L5-15KafkaDirect.scala │ ├── L5-16Twitter.scala │ ├── L5-18Http.scala │ ├── L5-6SocketStream.scala │ ├── L5-7MultipleSocketStreams.scala │ └── L5-9Mqtt.scala ├── Chap6 ├── project │ ├── assembly.sbt │ └── plugins.sbt ├── spark.sbt └── src │ └── main │ ├── java │ └── org │ │ └── apress │ │ └── prospark │ │ ├── AbstractDriver.java │ │ ├── MqttDriver.java │ │ └── SocketDriver.java │ ├── resources │ └── log4j.properties │ └── scala │ └── org │ └── apress │ └── prospark │ ├── HttpInputDStream.scala │ ├── L6-10LazyStatic.scala │ ├── L6-12StaticPool.scala │ ├── L6-14HBase.scala │ ├── L6-16SparkHBase.scala │ ├── L6-18Cassandra.scala │ ├── L6-20CassandraConnector.scala │ ├── L6-22Counters.scala │ ├── L6-23UpdateState.scala │ ├── L6-24Accumulators.scala │ ├── L6-26Redis.scala │ ├── L6-5Exception.scala │ ├── L6-6PerRecord.scala │ ├── L6-7PerPartition.scala │ └── L6-8Static.scala ├── Chap7 ├── project │ ├── assembly.sbt │ └── plugins.sbt ├── spark.sbt └── src │ └── main │ ├── java │ └── org │ │ └── apress │ │ └── prospark │ │ ├── AbstractDriver.java │ │ └── SocketDriver.java │ ├── resources │ └── log4j.properties │ └── scala │ └── org │ └── apress │ └── prospark │ ├── L7-2-3Tachyon.scala │ └── L7-4UI.scala ├── Chap8 ├── L8-36CdrSparkRApp.R ├── L8-39CdrStreamingSparkRApp.R ├── cdrschema.json ├── cdrschema2.json ├── project │ ├── assembly.sbt │ └── plugins.sbt ├── spark.sbt └── src │ └── main │ ├── java │ └── org │ │ └── apress │ │ └── prospark │ │ ├── AbstractDriver.java │ │ └── SocketDriver.java │ ├── resources │ └── log4j.properties │ └── scala │ └── org │ └── apress │ └── prospark │ ├── L8-10-11UDF.scala │ ├── L8-13HiveQL.scala │ ├── L8-14-27DataFrameExamples.scala │ 
├── L8-1DataFrameAPI.scala │ ├── L8-28DataFrameExamplesOps.scala │ ├── L8-29DataFrameExamplesJoin.scala │ ├── L8-3-6-7DataFrameCreation.scala │ ├── L8-35DataFrameExamplesRDD.scala │ ├── L8-38SparkR.scala │ ├── L8-4DataFrameCreationSchema.scala │ ├── L8-8Sql.scala │ ├── T8-3DataFrameExamplesNA.scala │ └── T8-5-L8-30-34DataFrameExamplesActions.scala ├── Chap9 ├── project │ ├── assembly.sbt │ └── plugins.sbt ├── spark.sbt └── src │ └── main │ ├── java │ └── org │ │ └── apress │ │ └── prospark │ │ ├── AbstractDriver.java │ │ └── SocketDriver.java │ ├── resources │ └── log4j.properties │ └── scala │ └── org │ └── apress │ └── prospark │ ├── L9-10KMeans.scala │ ├── L9-11CollabFilteringPreprocessing.scala │ ├── L9-12CollabFiltering.scala │ ├── L9-13FPMiningPreprocessing.scala │ ├── L9-14FPMining.scala │ ├── L9-15MLPipeline.scala │ ├── L9-17MLCrossValidation.scala │ ├── L9-1LinearRegression.scala │ ├── L9-3Statistics.scala │ ├── L9-4Correlation.scala │ ├── L9-5ChiSq.scala │ ├── L9-6Preprocessing.scala │ ├── L9-7FeatureExtraction.scala │ ├── L9-8PCA.scala │ ├── L9-9LogisticRegression.scala │ └── T9-4DataTypes.scala ├── LICENSE └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | 4 | # sbt specific 5 | .cache 6 | .history 7 | .lib/ 8 | dist/* 9 | target/ 10 | lib_managed/ 11 | src_managed/ 12 | project/boot/ 13 | project/plugins/project/ 14 | .cache-main 15 | 16 | # Scala-IDE specific 17 | .scala_dependencies 18 | .worksheet 19 | 20 | # Eclipse 21 | .classpath 22 | .project 23 | .settings/ 24 | .pydevproject 25 | -------------------------------------------------------------------------------- /Chap10/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 2 | -------------------------------------------------------------------------------- /Chap10/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += Classpaths.typesafeResolver 2 | 3 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.4.0") 4 | 5 | addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.8.2") 6 | -------------------------------------------------------------------------------- /Chap10/spark.sbt: -------------------------------------------------------------------------------- 1 | import AssemblyKeys._ 2 | 3 | assemblySettings 4 | 5 | net.virtualvoid.sbt.graph.DependencyGraphSettings.graphSettings 6 | 7 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { mergeStrategy => { 8 | case entry => { 9 | val strategy = mergeStrategy(entry) 10 | if (strategy == MergeStrategy.deduplicate) MergeStrategy.first 11 | else strategy 12 | } 13 | }} 14 | 15 | name := "Chap10" 16 | 17 | version := "1.0" 18 | 19 | scalaVersion := "2.10.5" 20 | 21 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.4.0" 22 | 23 | libraryDependencies += "org.apache.spark" %% "spark-streaming" % "1.4.0" 24 | 25 | libraryDependencies += "org.json4s" %% "json4s-native" % "3.2.10" 26 | 27 | libraryDependencies += "com.google.cloud.bigtable" % "bigtable-hbase-1.1" % "0.2.3" exclude("com.google.guava", "guava") 28 | 29 | libraryDependencies += "org.apache.hbase" % "hbase-server" % "1.1.2" 30 | 31 | libraryDependencies += "org.apache.hbase" % "hbase-common" % "1.1.2" 32 | 33 | libraryDependencies += "com.google.guava" % "guava" % "16.0" 34 | 35 | libraryDependencies += 
"org.mortbay.jetty.alpn" % "alpn-boot" % "8.1.6.v20151105" 36 | 37 | libraryDependencies += "com.google.cloud.bigdataoss" % "bigquery-connector" % "0.7.4-hadoop2" 38 | 39 | libraryDependencies += "org.apache.spark" %% "spark-graphx" % "1.4.0" 40 | 41 | -------------------------------------------------------------------------------- /Chap10/src/main/java/org/apress/prospark/AbstractDriver.java: -------------------------------------------------------------------------------- 1 | package org.apress.prospark; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileInputStream; 6 | import java.io.IOException; 7 | import java.io.InputStreamReader; 8 | import java.util.Enumeration; 9 | import java.util.zip.GZIPInputStream; 10 | import java.util.zip.ZipEntry; 11 | import java.util.zip.ZipFile; 12 | 13 | import org.apache.commons.io.FilenameUtils; 14 | import org.apache.log4j.LogManager; 15 | import org.apache.log4j.Logger; 16 | 17 | public abstract class AbstractDriver { 18 | 19 | private static final Logger LOG = LogManager.getLogger(AbstractDriver.class); 20 | 21 | private String path; 22 | 23 | public AbstractDriver(String path) { 24 | this.path = path; 25 | } 26 | 27 | public abstract void init() throws Exception; 28 | 29 | public abstract void close() throws Exception; 30 | 31 | public abstract void sendRecord(String record) throws Exception; 32 | 33 | public void execute() throws Exception { 34 | 35 | try { 36 | init(); 37 | File dirPath = new File(path); 38 | if (dirPath.isDirectory()) { 39 | File[] files = new File(path).listFiles(); 40 | for (File f : files) { 41 | String ext = FilenameUtils.getExtension(f.getPath()); 42 | if (ext.equals("zip")) { 43 | LOG.info(String.format("Feeding zipped file %s", f.getName())); 44 | ZipFile zFile = null; 45 | try { 46 | zFile = new ZipFile(f); 47 | Enumeration zEntries = zFile.entries(); 48 | 49 | while (zEntries.hasMoreElements()) { 50 | ZipEntry zEntry = zEntries.nextElement(); 51 | LOG.info(String.format("Feeding file %s", zEntry.getName())); 52 | try (BufferedReader br = new BufferedReader( 53 | new InputStreamReader(zFile.getInputStream(zEntry)))) { 54 | // skip header 55 | br.readLine(); 56 | String line; 57 | while ((line = br.readLine()) != null) { 58 | sendRecord(line); 59 | } 60 | } 61 | } 62 | } catch (IOException e) { 63 | LOG.error(e.getMessage()); 64 | } finally { 65 | if (zFile != null) { 66 | try { 67 | zFile.close(); 68 | } catch (IOException e) { 69 | LOG.error(e.getMessage()); 70 | } 71 | } 72 | } 73 | } else if (ext.equals("gz")) { 74 | LOG.info(String.format("Feeding file %s", f.getName())); 75 | try (BufferedReader br = new BufferedReader( 76 | new InputStreamReader(new GZIPInputStream(new FileInputStream(f))))) { 77 | // skip header 78 | br.readLine(); 79 | String line; 80 | while ((line = br.readLine()) != null) { 81 | sendRecord(line); 82 | } 83 | } 84 | } else if (ext.equals("dat") || ext.equals("json")) { 85 | LOG.info(String.format("Feeding dat file %s", f.getName())); 86 | try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f)))) { 87 | String line; 88 | while ((line = br.readLine()) != null) { 89 | sendRecord(line); 90 | } 91 | } 92 | } else { 93 | LOG.warn("Unsupported file type: " + f.getName()); 94 | } 95 | } 96 | } else { 97 | LOG.error(String.format("Path %s is not a directory", path)); 98 | } 99 | } finally { 100 | close(); 101 | } 102 | } 103 | } -------------------------------------------------------------------------------- 
/Chap10/src/main/java/org/apress/prospark/SocketDriver.java: -------------------------------------------------------------------------------- 1 | package org.apress.prospark; 2 | 3 | import java.io.IOException; 4 | import java.net.InetSocketAddress; 5 | import java.nio.ByteBuffer; 6 | import java.nio.channels.ServerSocketChannel; 7 | import java.nio.channels.SocketChannel; 8 | import java.nio.charset.StandardCharsets; 9 | import java.util.concurrent.ExecutionException; 10 | 11 | import org.apache.log4j.LogManager; 12 | import org.apache.log4j.Logger; 13 | 14 | public class SocketDriver extends AbstractDriver { 15 | 16 | private static final Logger LOG = LogManager.getLogger(SocketDriver.class); 17 | 18 | private String hostname; 19 | private int port; 20 | private SocketStream socketStream; 21 | 22 | public SocketDriver(String path, String hostname, int port) { 23 | super(path); 24 | this.hostname = hostname; 25 | this.port = port; 26 | } 27 | 28 | @Override 29 | public void init() throws Exception { 30 | socketStream = new SocketStream(hostname, port); 31 | LOG.info(String.format("Waiting for client to connect on port %d", port)); 32 | SocketChannel socketChan = socketStream.init(); 33 | LOG.info(String.format("Client %s connected on port %d", socketChan.getRemoteAddress(), port)); 34 | socketStream.kickOff(socketChan); 35 | socketStream.start(); 36 | } 37 | 38 | @Override 39 | public void close() throws IOException { 40 | socketStream.done(); 41 | if (socketStream != null) { 42 | socketStream.close(); 43 | } 44 | } 45 | 46 | @Override 47 | public void sendRecord(String record) throws Exception { 48 | socketStream.sendMsg(record + "\n"); 49 | } 50 | 51 | static class SocketStream extends Thread { 52 | 53 | private String hostname; 54 | private int port; 55 | private ServerSocketChannel server; 56 | private volatile boolean isDone = false; 57 | private SocketChannel socket = null; 58 | private long totalBytes; 59 | private long totalLines; 60 | 61 | public SocketStream(String hostname, int port) { 62 | this.hostname = hostname; 63 | this.port = port; 64 | totalBytes = 0; 65 | totalLines = 0; 66 | } 67 | 68 | public SocketChannel init() throws IOException { 69 | server = ServerSocketChannel.open(); 70 | server.bind(new InetSocketAddress(hostname, port)); 71 | LOG.info(String.format("Listening on %s", server.getLocalAddress())); 72 | return server.accept(); 73 | } 74 | 75 | public void kickOff(SocketChannel socket) { 76 | LOG.info("Kicking off data transfer"); 77 | this.socket = socket; 78 | } 79 | 80 | @Override 81 | public void run() { 82 | try { 83 | while (!isDone) { 84 | Thread.sleep(1000); 85 | } 86 | } catch (Exception e) { 87 | LOG.error(e); 88 | } 89 | } 90 | 91 | public void sendMsg(String msg) throws IOException, InterruptedException, ExecutionException { 92 | if (socket != null) { 93 | ByteBuffer buffer = ByteBuffer.wrap(msg.getBytes(StandardCharsets.UTF_8)); 94 | int bytesWritten = socket.write(buffer); 95 | totalBytes += bytesWritten; 96 | } else { 97 | throw new IOException("Client hasn't connected yet!"); 98 | } 99 | totalLines++; 100 | } 101 | 102 | public void done() { 103 | isDone = true; 104 | } 105 | 106 | public void close() throws IOException { 107 | if (socket != null) { 108 | socket.close(); 109 | socket = null; 110 | } 111 | LOG.info(String.format("SocketStream is closing after writing %d bytes and %d lines", totalBytes, 112 | totalLines)); 113 | } 114 | } 115 | 116 | public static void main(String[] args) throws Exception { 117 | 118 | if (args.length != 3) { 119 | 
System.err.println("Usage: SocketDriver "); 120 | System.exit(-1); 121 | } 122 | 123 | String path = args[0]; 124 | String hostname = args[1]; 125 | int port = Integer.parseInt(args[2]); 126 | 127 | SocketDriver driver = new SocketDriver(path, hostname, port); 128 | try { 129 | driver.execute(); 130 | } finally { 131 | driver.close(); 132 | } 133 | } 134 | } -------------------------------------------------------------------------------- /Chap10/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO, stdout 2 | log4j.rootCategory=INFO, stdout 3 | 4 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 5 | log4j.appender.stdout.Target=System.out 6 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 7 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 8 | -------------------------------------------------------------------------------- /Chap10/src/main/scala/org/apress/prospark/L10-2DataProc.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.HashPartitioner 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.streaming.Seconds 7 | import org.apache.spark.streaming.StreamingContext 8 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 9 | import org.json4s.DefaultFormats 10 | import org.json4s.JsonAST.JNothing 11 | import org.json4s.jvalue2extractable 12 | import org.json4s.jvalue2monadic 13 | import org.json4s.native.JsonMethods.parse 14 | import org.json4s.string2JsonInput 15 | 16 | object DataProcApp { 17 | 18 | def main(args: Array[String]) { 19 | if (args.length != 4) { 20 | System.err.println( 21 | "Usage: DataProcApp ") 22 | System.exit(1) 23 | } 24 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 25 | 26 | val conf = new SparkConf() 27 | .setAppName(appName) 28 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 29 | 30 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 31 | 32 | ssc.socketTextStream(hostname, port.toInt) 33 | .map(r => { 34 | implicit val formats = DefaultFormats 35 | parse(r) 36 | }) 37 | .filter(jvalue => { 38 | jvalue \ "attributes" \ "Wi-Fi" != JNothing 39 | }) 40 | .map(jvalue => { 41 | implicit val formats = DefaultFormats 42 | ((jvalue \ "attributes" \ "Wi-Fi").extract[String], (jvalue \ "stars").extract[Int]) 43 | }) 44 | .combineByKey( 45 | (v) => (v, 1), 46 | (accValue: (Int, Int), v) => (accValue._1 + v, accValue._2 + 1), 47 | (accCombine1: (Int, Int), accCombine2: (Int, Int)) => (accCombine1._1 + accCombine2._1, accCombine1._2 + accCombine2._2), 48 | new HashPartitioner(ssc.sparkContext.defaultParallelism)) 49 | .map({ case (k, v) => (k, v._1 / v._2.toFloat) }) 50 | .print() 51 | 52 | ssc.start() 53 | ssc.awaitTermination() 54 | } 55 | 56 | } -------------------------------------------------------------------------------- /Chap10/src/main/scala/org/apress/prospark/L10-9Graph.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.graphx.Edge 6 | import org.apache.spark.graphx.Graph 7 | import org.apache.spark.graphx.Graph.graphToGraphOps 8 | import org.apache.spark.streaming.Seconds 9 | import 
org.apache.spark.streaming.StreamingContext 10 | import org.json4s.DefaultFormats 11 | import org.json4s.jvalue2extractable 12 | import org.json4s.jvalue2monadic 13 | import org.json4s.native.JsonMethods.parse 14 | import org.json4s.string2JsonInput 15 | 16 | object UserRankApp { 17 | 18 | def main(args: Array[String]) { 19 | if (args.length != 4) { 20 | System.err.println( 21 | "Usage: UserRankApp ") 22 | System.exit(1) 23 | } 24 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 25 | 26 | val conf = new SparkConf() 27 | .setAppName(appName) 28 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 29 | 30 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 31 | 32 | ssc.socketTextStream(hostname, port.toInt) 33 | .map(r => { 34 | implicit val formats = DefaultFormats 35 | parse(r) 36 | }) 37 | .foreachRDD(rdd => { 38 | val edges = rdd.map(jvalue => { 39 | implicit val formats = DefaultFormats 40 | ((jvalue \ "user_id").extract[String], (jvalue \ "friends").extract[Array[String]]) 41 | }) 42 | .flatMap(r => r._2.map(f => Edge(r._1.hashCode.toLong, f.hashCode.toLong, 1.0))) 43 | 44 | val vertices = rdd.map(jvalue => { 45 | implicit val formats = DefaultFormats 46 | ((jvalue \ "user_id").extract[String]) 47 | }) 48 | .map(r => (r.hashCode.toLong, r)) 49 | 50 | val tolerance = 0.0001 51 | val graph = Graph(vertices, edges, "defaultUser") 52 | .subgraph(vpred = (id, idStr) => idStr != "defaultUser") 53 | val pr = graph.pageRank(tolerance).cache 54 | 55 | graph.outerJoinVertices(pr.vertices) { 56 | (userId, attrs, rank) => (rank.getOrElse(0.0).asInstanceOf[Number].doubleValue, attrs) 57 | }.vertices.top(10) { 58 | Ordering.by(_._2._1) 59 | }.foreach(rec => println("User id: %s, Rank: %f".format(rec._2._2, rec._2._1))) 60 | }) 61 | 62 | ssc.start() 63 | ssc.awaitTermination() 64 | 65 | } 66 | 67 | } -------------------------------------------------------------------------------- /Chap10/yelp_pyspark.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkContext 2 | from pyspark.streaming import StreamingContext 3 | from sys import argv, exit 4 | try: import simplejson as json 5 | except ImportError: import json 6 | 7 | if len(argv) != 5: 8 | print 'Usage: yelp_pyspark.py ' 9 | exit(-1) 10 | 11 | appname = argv[1] 12 | batch_interval = int(argv[2]) 13 | hostname = argv[3] 14 | port = int(argv[4]) 15 | 16 | sc = SparkContext(appName=appname) 17 | ssc = StreamingContext(sc, batch_interval) 18 | 19 | records = ssc.socketTextStream(hostname, port) 20 | json_records = records.map(lambda rec: json.loads(rec)) 21 | restaurant_records = json_records.filter(lambda rec: 'attributes' in rec and 'Wi-Fi' in rec['attributes']) 22 | wifi_pairs = restaurant_records.map(lambda rec: (rec['attributes']['Wi-Fi'], rec['stars'])) 23 | wifi_counts = wifi_pairs.combineByKey(lambda v: (v, 1), 24 | lambda x, value: (x[0] + value, x[1] + 1), 25 | lambda x, y: (x[0] + y[0], x[1] + y[1])) 26 | avg_stars = wifi_counts.map(lambda (key, (sum_, count)): (key, sum_ / count)) 27 | avg_stars.pprint() 28 | 29 | ssc.start() 30 | ssc.awaitTermination() 31 | -------------------------------------------------------------------------------- /Chap2/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 2 | -------------------------------------------------------------------------------- /Chap2/project/plugins.sbt: 
-------------------------------------------------------------------------------- 1 | resolvers += Classpaths.typesafeResolver 2 | 3 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.4.0") -------------------------------------------------------------------------------- /Chap2/spark.sbt: -------------------------------------------------------------------------------- 1 | import AssemblyKeys._ 2 | 3 | assemblySettings 4 | 5 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { mergeStrategy => { 6 | case entry => { 7 | val strategy = mergeStrategy(entry) 8 | if (strategy == MergeStrategy.deduplicate) MergeStrategy.first 9 | else strategy 10 | } 11 | }} 12 | 13 | name := "Chap2" 14 | 15 | version := "1.0" 16 | 17 | scalaVersion := "2.10.5" 18 | 19 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.4.0" 20 | -------------------------------------------------------------------------------- /Chap2/src/main/scala/org/apress/prospark/L2-1FirstApp.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.io.Source 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | 8 | object TranslateApp { 9 | def main(args: Array[String]) { 10 | if (args.length != 4) { 11 | System.err.println( 12 | "Usage: TranslateApp ") 13 | System.exit(1) 14 | } 15 | val Seq(appName, bookPath, outputPath, lang) = args.toSeq 16 | 17 | val dict = getDictionary(lang) 18 | 19 | val conf = new SparkConf() 20 | .setAppName(appName) 21 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 22 | val sc = new SparkContext(conf) 23 | val book = sc.textFile(bookPath) 24 | val translated = book.map(line => line.split("\\s+").map(word => dict.getOrElse(word, word)).mkString(" ")) 25 | translated.saveAsTextFile(outputPath) 26 | } 27 | 28 | def getDictionary(lang: String): Map[String, String] = { 29 | if (!Set("German", "French", "Italian", "Spanish").contains(lang)) { 30 | System.err.println( 31 | "Unsupported language: %s".format(lang)) 32 | System.exit(1) 33 | } 34 | val url = "http://www.june29.com/IDP/files/%s.txt".format(lang) 35 | println("Grabbing dictionary from: %s".format(url)) 36 | Source.fromURL(url, "ISO-8859-1").mkString 37 | .split("\\r?\\n") 38 | .filter(line => !line.startsWith("#")) 39 | .map(line => line.split("\\t")) 40 | .map(tkns => (tkns(0).trim, tkns(1).trim)).toMap 41 | } 42 | 43 | } -------------------------------------------------------------------------------- /Chap2/src/main/scala/org/apress/prospark/T2-6Accumulator.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.collection.mutable 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | 8 | object AccumulatorApp { 9 | def main(args: Array[String]) { 10 | if (args.length != 1) { 11 | System.err.println( 12 | "Usage: AccumulatorApp ") 13 | System.exit(1) 14 | } 15 | val Seq(appName) = args.toSeq 16 | 17 | val conf = new SparkConf() 18 | .setAppName(appName) 19 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 20 | .set("spark.eventLog.enabled", true.toString) 21 | .set("spark.eventLog.dir", "/tmp") 22 | val sc = new SparkContext(conf) 23 | val setAcc = sc.accumulableCollection(mutable.HashSet[Int]()) 24 | val d = sc.parallelize(1 to 100) 25 | d.foreach(x => setAcc += x) 26 | println(setAcc.value.size) 27 | } 28 | } 
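Note on T2-6Accumulator: the set accumulator is a side channel — each task adds elements and the driver reads the merged result after the action. When only the resulting number matters (here, the count of distinct elements), the same answer can be expressed with plain RDD transformations. A minimal sketch, not part of the repository, using the same Spark 1.x API:

package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

// Hypothetical sketch: the distinct-element count from AccumulatorApp written as
// RDD transformations. distinct() deduplicates across partitions and count() is
// the action, so no accumulator (and no driver-side mutable state) is needed.
object DistinctCountApp {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("DistinctCountApp")
    val sc = new SparkContext(conf)
    val d = sc.parallelize(1 to 100)
    println(d.distinct().count())   // 100, the same value as setAcc.value.size above
    sc.stop()
  }
}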
-------------------------------------------------------------------------------- /Chap3/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 2 | -------------------------------------------------------------------------------- /Chap3/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += Classpaths.typesafeResolver 2 | 3 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.4.0") 4 | -------------------------------------------------------------------------------- /Chap3/spark.sbt: -------------------------------------------------------------------------------- 1 | import AssemblyKeys._ 2 | 3 | assemblySettings 4 | 5 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { mergeStrategy => { 6 | case entry => { 7 | val strategy = mergeStrategy(entry) 8 | if (strategy == MergeStrategy.deduplicate) MergeStrategy.first 9 | else strategy 10 | } 11 | }} 12 | 13 | name := "Chap3" 14 | 15 | version := "1.0" 16 | 17 | scalaVersion := "2.10.5" 18 | 19 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.4.0" 20 | 21 | libraryDependencies += "org.apache.spark" %% "spark-streaming" % "1.4.0" 22 | 23 | libraryDependencies += "org.json4s" %% "json4s-native" % "3.2.10" 24 | -------------------------------------------------------------------------------- /Chap3/src/main/scala/org/apress/prospark/L3-1DStreams.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.io.Source 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.streaming.Seconds 7 | import org.apache.spark.streaming.StreamingContext 8 | import org.apache.hadoop.io.LongWritable 9 | import org.apache.hadoop.fs.Path 10 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat 11 | import org.apache.hadoop.io.Text 12 | 13 | object StreamingTranslateApp { 14 | def main(args: Array[String]) { 15 | if (args.length != 4) { 16 | System.err.println( 17 | "Usage: StreamingTranslateApp ") 18 | System.exit(1) 19 | } 20 | val Seq(appName, bookPath, outputPath, lang) = args.toSeq 21 | 22 | val dict = getDictionary(lang) 23 | 24 | val conf = new SparkConf() 25 | .setAppName(appName) 26 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 27 | val ssc = new StreamingContext(conf, Seconds(1)) 28 | 29 | val book = ssc.textFileStream(bookPath) 30 | val translated = book.map(line => line.split("\\s+").map(word => dict.getOrElse(word, word)).mkString(" ")) 31 | translated.saveAsTextFiles(outputPath) 32 | 33 | ssc.start() 34 | ssc.awaitTermination() 35 | } 36 | 37 | def getDictionary(lang: String): Map[String, String] = { 38 | if (!Set("German", "French", "Italian", "Spanish").contains(lang)) { 39 | System.err.println( 40 | "Unsupported language: %s".format(lang)) 41 | System.exit(1) 42 | } 43 | val url = "http://www.june29.com/IDP/files/%s.txt".format(lang) 44 | println("Grabbing dictionary from: %s".format(url)) 45 | Source.fromURL(url, "ISO-8859-1").mkString 46 | .split("\\r?\\n") 47 | .filter(line => !line.startsWith("#")) 48 | .map(line => line.split("\\t")) 49 | .map(tkns => (tkns(0).trim, tkns(1).trim)).toMap 50 | } 51 | 52 | } -------------------------------------------------------------------------------- /Chap3/src/main/scala/org/apress/prospark/L3-DStreamAggregation.scala: 
-------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext } 6 | import org.apache.hadoop.io.{ Text, LongWritable, IntWritable } 7 | import org.apache.hadoop.fs.Path 8 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat 9 | import org.apache.spark.streaming.dstream.DStream 10 | import org.apache.hadoop.mapred.TextOutputFormat 11 | import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat } 12 | import org.apache.spark.streaming.dstream.PairDStreamFunctions 13 | import org.apache.log4j.LogManager 14 | import org.json4s._ 15 | import org.json4s.native.JsonMethods._ 16 | import java.text.SimpleDateFormat 17 | import java.util.Date 18 | 19 | object RedditAggregationApp { 20 | def main(args: Array[String]) { 21 | if (args.length != 2) { 22 | System.err.println( 23 | "Usage: RedditAggregationApp ") 24 | System.exit(1) 25 | } 26 | val Seq(appName, inputPath) = args.toSeq 27 | val LOG = LogManager.getLogger(this.getClass) 28 | 29 | val conf = new SparkConf() 30 | .setAppName(appName) 31 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 32 | 33 | val ssc = new StreamingContext(conf, Seconds(1)) 34 | LOG.info("Started at %d".format(ssc.sparkContext.startTime)) 35 | 36 | val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) 37 | 38 | val recCount = comments.count() 39 | 40 | val recCountValue = comments.countByValue() 41 | 42 | val totalWords = comments.map(rec => ((parse(rec) \ "body").values.toString)) 43 | .flatMap(body => body.split(" ")) 44 | .map(word => 1) 45 | .reduce(_ + _) 46 | 47 | ssc.start() 48 | ssc.awaitTermination() 49 | 50 | } 51 | } -------------------------------------------------------------------------------- /Chap3/src/main/scala/org/apress/prospark/L3-DStreamMapping.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext } 6 | import org.apache.hadoop.io.{ Text, LongWritable, IntWritable } 7 | import org.apache.hadoop.fs.Path 8 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat 9 | import org.apache.spark.streaming.dstream.DStream 10 | import org.apache.hadoop.mapred.TextOutputFormat 11 | import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat } 12 | import org.apache.spark.streaming.dstream.PairDStreamFunctions 13 | import org.apache.log4j.LogManager 14 | import org.json4s._ 15 | import org.json4s.native.JsonMethods._ 16 | import java.text.SimpleDateFormat 17 | import java.util.Date 18 | 19 | object RedditMappingApp { 20 | def main(args: Array[String]) { 21 | if (args.length != 2) { 22 | System.err.println( 23 | "Usage: RedditMappingApp ") 24 | System.exit(1) 25 | } 26 | val Seq(appName, inputPath) = args.toSeq 27 | val LOG = LogManager.getLogger(this.getClass) 28 | 29 | val conf = new SparkConf() 30 | .setAppName(appName) 31 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 32 | 33 | val ssc = new StreamingContext(conf, Seconds(1)) 34 | LOG.info("Started at %d".format(ssc.sparkContext.startTime)) 35 | 36 | val comments = ssc.fileStream[LongWritable, 
Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) 37 | 38 | val sdf = new SimpleDateFormat("yyyy-MM-dd") 39 | val tsKey = "created_utc" 40 | val secs = 1000L 41 | val keyedByDay = comments.map(rec => { 42 | val ts = (parse(rec) \ tsKey).values 43 | (sdf.format(new Date(ts.toString.toLong * secs)), rec) 44 | }) 45 | 46 | val keyedByDayPart = comments.mapPartitions(iter => { 47 | var ret = List[(String, String)]() 48 | while (iter.hasNext) { 49 | val rec = iter.next 50 | val ts = (parse(rec) \ tsKey).values 51 | ret.::=(sdf.format(new Date(ts.toString.toLong * secs)), rec) 52 | } 53 | ret.iterator 54 | }) 55 | 56 | val wordTokens = comments.map(rec => { 57 | ((parse(rec) \ "body")).values.toString.split(" ") 58 | }) 59 | 60 | val wordTokensFlat = comments.flatMap(rec => { 61 | ((parse(rec) \ "body")).values.toString.split(" ") 62 | }) 63 | 64 | val filterSubreddit = comments.filter(rec => 65 | (parse(rec) \ "subreddit").values.toString.equals("AskReddit")) 66 | 67 | val sortedByAuthor = comments.transform(rdd => 68 | (rdd.sortBy(rec => (parse(rec) \ "author").values.toString))) 69 | 70 | ssc.start() 71 | ssc.awaitTermination() 72 | 73 | } 74 | } -------------------------------------------------------------------------------- /Chap3/src/main/scala/org/apress/prospark/L3-DStreamVariation.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext } 6 | import org.apache.hadoop.io.{ Text, LongWritable, IntWritable } 7 | import org.apache.hadoop.fs.Path 8 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat 9 | import org.apache.spark.streaming.dstream.DStream 10 | import org.apache.hadoop.mapred.TextOutputFormat 11 | import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat } 12 | import org.apache.spark.streaming.dstream.PairDStreamFunctions 13 | import org.apache.log4j.LogManager 14 | import org.json4s._ 15 | import org.json4s.native.JsonMethods._ 16 | import java.text.SimpleDateFormat 17 | import java.util.Date 18 | 19 | object RedditVariationApp { 20 | def main(args: Array[String]) { 21 | if (args.length != 2) { 22 | System.err.println( 23 | "Usage: RedditVariationApp ") 24 | System.exit(1) 25 | } 26 | val Seq(appName, inputPath) = args.toSeq 27 | val LOG = LogManager.getLogger(this.getClass) 28 | 29 | val conf = new SparkConf() 30 | .setAppName(appName) 31 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 32 | 33 | val ssc = new StreamingContext(conf, Seconds(1)) 34 | LOG.info("Started at %d".format(ssc.sparkContext.startTime)) 35 | 36 | val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) 37 | 38 | val merged = comments.union(comments) 39 | 40 | val repartitionedComments = comments.repartition(4) 41 | 42 | val rddMin = comments.glom().map(arr => 43 | arr.minBy(rec => ((parse(rec) \ "created_utc").values.toString.toInt))) 44 | 45 | ssc.start() 46 | ssc.awaitTermination() 47 | 48 | } 49 | } -------------------------------------------------------------------------------- /Chap3/src/main/scala/org/apress/prospark/L3-DStreamWindowAndAction.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import 
org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext } 6 | import org.apache.hadoop.io.{ Text, LongWritable, IntWritable } 7 | import org.apache.hadoop.fs.Path 8 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat 9 | import org.apache.spark.streaming.dstream.DStream 10 | import org.apache.hadoop.mapred.TextOutputFormat 11 | import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat } 12 | import org.apache.spark.streaming.dstream.PairDStreamFunctions 13 | import org.apache.log4j.LogManager 14 | import org.json4s._ 15 | import org.json4s.native.JsonMethods._ 16 | import java.text.SimpleDateFormat 17 | import java.util.Date 18 | import org.apache.spark.HashPartitioner 19 | 20 | object RedditWindowAndActionApp { 21 | def main(args: Array[String]) { 22 | if (args.length != 2) { 23 | System.err.println( 24 | "Usage: RedditWindowAndActionApp ") 25 | System.exit(1) 26 | } 27 | val Seq(appName, inputPath) = args.toSeq 28 | val LOG = LogManager.getLogger(this.getClass) 29 | 30 | val conf = new SparkConf() 31 | .setAppName(appName) 32 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 33 | 34 | val ssc = new StreamingContext(conf, Seconds(1)) 35 | LOG.info("Started at %d".format(ssc.sparkContext.startTime)) 36 | 37 | val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) 38 | 39 | val checkpointPath = "/tmp" 40 | ssc.checkpoint(checkpointPath) 41 | val updateFunc = (values: Seq[Int], state: Option[Int]) => { 42 | val currentCount = values.sum 43 | val previousCount = state.getOrElse(0) 44 | Some(currentCount + previousCount) 45 | } 46 | val keyedBySubredditState = comments.map(rec => (((parse(rec)) \ "subreddit").values.toString, 1)) 47 | val globalCount = keyedBySubredditState.updateStateByKey(updateFunc) 48 | .map(r => (r._2, r._1)) 49 | .transform(rdd => rdd.sortByKey(ascending = false)) 50 | 51 | val distinctSubreddits = comments.map(rec => ((parse(rec)) \ "subreddit").values.toString) 52 | val windowedRecs = distinctSubreddits.window(Seconds(5), Seconds(5)) 53 | val windowedCounts = windowedRecs.countByValue() 54 | 55 | windowedCounts.print(10) 56 | windowedCounts.saveAsObjectFiles("subreddit", "obj") 57 | windowedCounts.saveAsTextFiles("subreddit", "txt") 58 | 59 | globalCount.saveAsHadoopFiles("subreddit", "hadoop", 60 | classOf[IntWritable], classOf[Text], classOf[TextOutputFormat[IntWritable, Text]]) 61 | globalCount.saveAsNewAPIHadoopFiles("subreddit", "newhadoop", 62 | classOf[IntWritable], classOf[Text], classOf[NewTextOutputFormat[IntWritable, Text]]) 63 | comments.foreachRDD(rdd => { 64 | LOG.info("RDD: %s, Count: %d".format(rdd.id, rdd.count())) 65 | }) 66 | 67 | ssc.start() 68 | ssc.awaitTermination() 69 | 70 | } 71 | } -------------------------------------------------------------------------------- /Chap3/touch_files_window.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | for i in `seq 1 10`; 3 | do 4 | p=/Users/zubairnabi/Downloads/dummy/${i}.gz 5 | echo ${p} 6 | touch -c ${p} 7 | sleep 1 8 | done 9 | -------------------------------------------------------------------------------- /Chap4/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 2 | 
-------------------------------------------------------------------------------- /Chap4/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += Classpaths.typesafeResolver 2 | 3 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.4.0") 4 | -------------------------------------------------------------------------------- /Chap4/spark.sbt: -------------------------------------------------------------------------------- 1 | import AssemblyKeys._ 2 | 3 | assemblySettings 4 | 5 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { mergeStrategy => { 6 | case entry => { 7 | val strategy = mergeStrategy(entry) 8 | if (strategy == MergeStrategy.deduplicate) MergeStrategy.first 9 | else strategy 10 | } 11 | }} 12 | 13 | name := "Chap4" 14 | 15 | version := "1.0" 16 | 17 | scalaVersion := "2.10.5" 18 | 19 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.4.0" 20 | 21 | libraryDependencies += "org.apache.spark" %% "spark-streaming" % "1.4.0" 22 | 23 | libraryDependencies += "org.json4s" %% "json4s-native" % "3.2.10" 24 | -------------------------------------------------------------------------------- /Chap4/src/main/scala/org/apress/prospark/L4-1Voyager.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.hadoop.fs.Path 4 | import org.apache.hadoop.io.LongWritable 5 | import org.apache.hadoop.io.Text 6 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat 7 | import org.apache.spark.SparkConf 8 | import org.apache.spark.SparkContext 9 | import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions 10 | import org.apache.spark.streaming.Seconds 11 | import org.apache.spark.streaming.StreamingContext 12 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 13 | 14 | object VoyagerApp { 15 | def main(args: Array[String]) { 16 | if (args.length != 3) { 17 | System.err.println( 18 | "Usage: VoyagerApp ") 19 | System.exit(1) 20 | } 21 | val Seq(appName, inputPath, outputPath) = args.toSeq 22 | 23 | val conf = new SparkConf() 24 | .setAppName(appName) 25 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 26 | .set("spark.executor.extraJavaOptions", "-XX:+UseConcMarkSweepGC") 27 | 28 | val ssc = new StreamingContext(conf, Seconds(10)) 29 | 30 | val voyager1 = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) 31 | voyager1.map(rec => { 32 | val attrs = rec.split("\\s+") 33 | ((attrs(0).toInt), attrs.slice(18, 28).map(_.toDouble)) 34 | }).filter(pflux => pflux._2.exists(_ > 1.0)).map(rec => (rec._1, 1)) 35 | .reduceByKey(_ + _) 36 | .transform(rec => rec.sortByKey(ascending = false, numPartitions = 1)).saveAsTextFiles(outputPath) 37 | 38 | ssc.start() 39 | ssc.awaitTermination() 40 | } 41 | } -------------------------------------------------------------------------------- /Chap4/src/main/scala/org/apress/prospark/L4-3ProtonFlux.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import com.esotericsoftware.kryo.{KryoSerializable,Kryo} 4 | import com.esotericsoftware.kryo.io.{Output, Input} 5 | 6 | class ProtonFlux( 7 | var year: Int, 8 | var bin0_57to1_78: Double, 9 | var bin3_40to17_6: Double, 10 | var bin22_0to31_0: Double, 11 | var bin1_894to2_605: Double, 12 | var bin4_200to6_240: Double, 13 | var bin3_256to8_132: Double, 14 | 
var bin3_276to8_097: Double, 15 | var bin6_343to42_03: Double, 16 | var bin17_88to26_81: Double, 17 | var bin30_29to69_47: Double, 18 | var bin132_8to242_0: Double 19 | ) extends KryoSerializable { 20 | 21 | def this(year: String, bin0_57to1_78: String, bin3_40to17_6: String, 22 | bin22_0to31_0: String, bin1_894to2_605: String, bin4_200to6_240: String, 23 | bin3_256to8_132: String, bin3_276to8_097: String, bin6_343to42_03: String, 24 | bin17_88to26_81: String, bin30_29to69_47: String, bin132_8to242_0: String) { 25 | this(year.toInt, bin0_57to1_78.toDouble, bin3_40to17_6.toDouble, 26 | bin22_0to31_0.toDouble, bin1_894to2_605.toDouble, bin4_200to6_240.toDouble, 27 | bin3_256to8_132.toDouble, bin3_276to8_097.toDouble, bin6_343to42_03.toDouble, 28 | bin17_88to26_81.toDouble, bin30_29to69_47.toDouble, bin132_8to242_0.toDouble) 29 | } 30 | 31 | def isSolarStorm = (bin0_57to1_78 > 1.0 || bin3_40to17_6 > 1.0 32 | || bin22_0to31_0 > 1.0 || bin1_894to2_605 > 1.0 || bin4_200to6_240 > 1.0 33 | || bin3_256to8_132 > 1.0 || bin3_276to8_097 > 1.0 || bin6_343to42_03 > 1.0 34 | || bin17_88to26_81 > 1.0 || bin30_29to69_47 > 1.0 || bin132_8to242_0 > 1.0) 35 | 36 | override def write(kryo: Kryo, output: Output) { 37 | output.writeInt(year) 38 | output.writeDouble(bin0_57to1_78) 39 | output.writeDouble(bin3_40to17_6) 40 | output.writeDouble(bin22_0to31_0) 41 | output.writeDouble(bin1_894to2_605) 42 | output.writeDouble(bin4_200to6_240) 43 | output.writeDouble(bin3_256to8_132) 44 | output.writeDouble(bin3_276to8_097) 45 | output.writeDouble(bin6_343to42_03) 46 | output.writeDouble(bin17_88to26_81) 47 | output.writeDouble(bin30_29to69_47) 48 | output.writeDouble(bin132_8to242_0) 49 | } 50 | 51 | override def read(kryo: Kryo, input: Input) { 52 | year = input.readInt() 53 | bin0_57to1_78 = input.readDouble() 54 | bin3_40to17_6 = input.readDouble() 55 | bin22_0to31_0 = input.readDouble() 56 | bin1_894to2_605 = input.readDouble() 57 | bin4_200to6_240 = input.readDouble() 58 | bin3_256to8_132 = input.readDouble() 59 | bin3_276to8_097 = input.readDouble() 60 | bin6_343to42_03 = input.readDouble() 61 | bin17_88to26_81 = input.readDouble() 62 | bin30_29to69_47 = input.readDouble() 63 | bin132_8to242_0 = input.readDouble() 64 | } 65 | 66 | } -------------------------------------------------------------------------------- /Chap4/src/main/scala/org/apress/prospark/L4-4Kryo.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.hadoop.fs.Path 4 | import org.apache.hadoop.io.LongWritable 5 | import org.apache.hadoop.io.Text 6 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat 7 | import org.apache.spark.SparkConf 8 | import org.apache.spark.SparkContext 9 | import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions 10 | import org.apache.spark.streaming.Seconds 11 | import org.apache.spark.streaming.StreamingContext 12 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 13 | 14 | object VoyagerAppKryo { 15 | def main(args: Array[String]) { 16 | if (args.length != 3) { 17 | System.err.println( 18 | "Usage: VoyagerAppKryo ") 19 | System.exit(1) 20 | } 21 | val Seq(appName, inputPath, outputPath) = args.toSeq 22 | 23 | val conf = new SparkConf() 24 | .setAppName(appName) 25 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 26 | .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") 27 | .registerKryoClasses(Array(classOf[ProtonFlux])) 28 | 29 | val ssc = new StreamingContext(conf, 
Seconds(10)) 30 | 31 | val voyager1 = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) 32 | val projected = voyager1.map(rec => { 33 | val attrs = rec.split("\\s+") 34 | new ProtonFlux(attrs(0), attrs(18), attrs(19), attrs(20), attrs(21), 35 | attrs(22), attrs(23), attrs(24), attrs(25), attrs(26), attrs(27), 36 | attrs(28)) 37 | }) 38 | val filtered = projected.filter(pflux => pflux.isSolarStorm) 39 | val yearlyBreakdown = filtered.map(rec => (rec.year, 1)) 40 | .reduceByKey(_ + _) 41 | .transform(rec => rec.sortByKey(ascending = false)) 42 | yearlyBreakdown.saveAsTextFiles(outputPath) 43 | 44 | ssc.start() 45 | ssc.awaitTermination() 46 | } 47 | } -------------------------------------------------------------------------------- /Chap5/flumeConf/flumePull.conf: -------------------------------------------------------------------------------- 1 | # components on this agent 2 | a1.sources = src-1 3 | a1.sinks = snk-1 4 | a1.channels = ch-1 5 | 6 | # source 7 | a1.sources.src-1.type = spooldir 8 | a1.sources.src-1.channels = ch-1 9 | a1.sources.src-1.spoolDir = /Users/zubairnabi/Downloads/nyc_bikes 10 | 11 | # sink 12 | a1.sinks.snk-1.type = org.apache.spark.streaming.flume.sink.SparkSink 13 | a1.sinks.snk-1.hostname = localhost 14 | a1.sinks.snk-1.port = 44444 15 | 16 | # channel 17 | a1.channels.ch-1.type = memory 18 | a1.channels.ch-1.capacity = 10000 19 | a1.channels.ch-1.transactionCapacity = 1000 20 | 21 | # bind source, sink, and channel 22 | a1.sources.src-1.channels = ch-1 23 | a1.sinks.snk-1.channel = ch-1 24 | -------------------------------------------------------------------------------- /Chap5/flumeConf/flumePush.conf: -------------------------------------------------------------------------------- 1 | # components on this agent 2 | a1.sources = src-1 3 | a1.sinks = snk-1 4 | a1.channels = ch-1 5 | 6 | # source 7 | a1.sources.src-1.type = spooldir 8 | a1.sources.src-1.channels = ch-1 9 | a1.sources.src-1.spoolDir = /Users/zubairnabi/Downloads/nyc_bikes 10 | 11 | # sink 12 | a1.sinks.snk-1.type = avro 13 | a1.sinks.snk-1.hostname = localhost 14 | a1.sinks.snk-1.port = 44444 15 | 16 | # channel 17 | a1.channels.ch-1.type = memory 18 | a1.channels.ch-1.capacity = 10000 19 | a1.channels.ch-1.transactionCapacity = 1000 20 | 21 | # bind source, sink, and channel 22 | a1.sources.src-1.channels = ch-1 23 | a1.sinks.snk-1.channel = ch-1 24 | -------------------------------------------------------------------------------- /Chap5/flumeConf/flumeTest.conf: -------------------------------------------------------------------------------- 1 | # Name the components on this agent 2 | a1.sources = r1 3 | a1.sinks = k1 4 | a1.channels = c1 5 | 6 | # Describe/configure the source 7 | a1.sources.r1.type = netcat 8 | a1.sources.r1.bind = localhost 9 | a1.sources.r1.port = 44444 10 | 11 | # Describe the sink 12 | a1.sinks.k1.type = logger 13 | 14 | # Use a channel which buffers events in memory 15 | a1.channels.c1.type = memory 16 | a1.channels.c1.capacity = 1000 17 | a1.channels.c1.transactionCapacity = 100 18 | 19 | # Bind the source and sink to the channel 20 | a1.sources.r1.channels = c1 21 | a1.sinks.k1.channel = c1 22 | -------------------------------------------------------------------------------- /Chap5/flumeConf/log4j.properties: -------------------------------------------------------------------------------- 1 | # Root logger option 2 | log4j.rootLogger=INFO, stdout 3 | 4 | # Direct log messages to stdout 5 | 
log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.Target=System.out 7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 9 | -------------------------------------------------------------------------------- /Chap5/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 2 | -------------------------------------------------------------------------------- /Chap5/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += Classpaths.typesafeResolver 2 | 3 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.4.0") 4 | 5 | addSbtPlugin("org.scala-sbt.plugins" % "sbt-onejar" % "0.8") 6 | -------------------------------------------------------------------------------- /Chap5/spark.sbt: -------------------------------------------------------------------------------- 1 | import AssemblyKeys._ 2 | 3 | assemblySettings 4 | 5 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { mergeStrategy => { 6 | case entry => { 7 | val strategy = mergeStrategy(entry) 8 | if (strategy == MergeStrategy.deduplicate) MergeStrategy.first 9 | else strategy 10 | } 11 | }} 12 | 13 | name := "Chap5" 14 | 15 | version := "1.0" 16 | 17 | scalaVersion := "2.10.5" 18 | 19 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.4.0" 20 | 21 | libraryDependencies += "org.apache.spark" %% "spark-streaming" % "1.4.0" 22 | 23 | libraryDependencies += "org.json4s" %% "json4s-native" % "3.2.10" 24 | 25 | libraryDependencies += "org.apache.spark" %% "spark-streaming-mqtt" % "1.4.0" 26 | 27 | libraryDependencies += "org.eclipse.paho" % "org.eclipse.paho.client.mqttv3" % "1.0.1" 28 | 29 | libraryDependencies += "org.apache.spark" %% "spark-streaming-flume" % "1.4.0" 30 | 31 | libraryDependencies += "org.apache.spark" %% "spark-streaming-kafka" % "1.4.0" 32 | 33 | libraryDependencies += "org.apache.spark" %% "spark-streaming-twitter" % "1.4.0" 34 | 35 | libraryDependencies += "com.ning" % "async-http-client" % "1.9.31" 36 | 37 | libraryDependencies += "org.apache.httpcomponents" % "httpclient" % "4.5.1" 38 | 39 | resolvers += "MQTT Repository" at "https://repo.eclipse.org/content/repositories/paho-releases/" 40 | -------------------------------------------------------------------------------- /Chap5/src/main/java/org/apress/prospark/AbstractDriver.java: -------------------------------------------------------------------------------- 1 | package org.apress.prospark; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.IOException; 6 | import java.io.InputStreamReader; 7 | import java.util.Enumeration; 8 | import java.util.zip.ZipEntry; 9 | import java.util.zip.ZipFile; 10 | 11 | import org.apache.log4j.LogManager; 12 | import org.apache.log4j.Logger; 13 | 14 | public abstract class AbstractDriver { 15 | 16 | private static final Logger LOG = LogManager.getLogger(AbstractDriver.class); 17 | 18 | private String path; 19 | 20 | public AbstractDriver(String path) { 21 | this.path = path; 22 | } 23 | 24 | public abstract void init() throws Exception; 25 | 26 | public abstract void close() throws Exception; 27 | 28 | public abstract void sendRecord(String record) throws Exception; 29 | 30 | public void execute() throws Exception { 31 | 32 | try { 33 | init(); 34 | File dirPath = new 
File(path); 35 | if (dirPath.isDirectory()) { 36 | File[] files = new File(path).listFiles(); 37 | for (File f : files) { 38 | LOG.info(String.format("Feeding zipped file %s", f.getName())); 39 | ZipFile zFile = null; 40 | try { 41 | zFile = new ZipFile(f); 42 | Enumeration zEntries = zFile.entries(); 43 | 44 | while (zEntries.hasMoreElements()) { 45 | ZipEntry zEntry = zEntries.nextElement(); 46 | LOG.info(String.format("Feeding file %s", zEntry.getName())); 47 | try (BufferedReader br = new BufferedReader( 48 | new InputStreamReader(zFile.getInputStream(zEntry)))) { 49 | // skip header 50 | br.readLine(); 51 | String line; 52 | while ((line = br.readLine()) != null) { 53 | sendRecord(line); 54 | } 55 | } 56 | } 57 | } catch (IOException e) { 58 | LOG.error(e.getMessage()); 59 | } finally { 60 | if (zFile != null) { 61 | try { 62 | zFile.close(); 63 | } catch (IOException e) { 64 | LOG.error(e.getMessage()); 65 | } 66 | } 67 | } 68 | } 69 | } else { 70 | LOG.error(String.format("Path %s is not a directory", path)); 71 | } 72 | } finally { 73 | close(); 74 | } 75 | } 76 | } -------------------------------------------------------------------------------- /Chap5/src/main/java/org/apress/prospark/KafkaDriver.java: -------------------------------------------------------------------------------- 1 | package org.apress.prospark; 2 | 3 | import java.util.Properties; 4 | 5 | import kafka.javaapi.producer.Producer; 6 | import kafka.producer.KeyedMessage; 7 | import kafka.producer.ProducerConfig; 8 | 9 | public class KafkaDriver extends AbstractDriver { 10 | 11 | private final String topic; 12 | private Producer producer; 13 | 14 | public KafkaDriver(String path, String topic, Properties props) { 15 | super(path); 16 | this.topic = topic; 17 | ProducerConfig config = new ProducerConfig(props); 18 | producer = new Producer(config); 19 | } 20 | 21 | @Override 22 | public void init() throws Exception { 23 | } 24 | 25 | @Override 26 | public void close() throws Exception { 27 | producer.close(); 28 | } 29 | 30 | @Override 31 | public void sendRecord(String record) throws Exception { 32 | producer.send(new KeyedMessage(topic, record)); 33 | } 34 | 35 | public static void main(String[] args) throws Exception { 36 | 37 | if (args.length != 3) { 38 | System.err.println("Usage: KafkaDriver "); 39 | System.exit(-1); 40 | } 41 | 42 | String path = args[0]; 43 | String brokerUrl = args[1]; 44 | String topic = args[2]; 45 | 46 | Properties props = new Properties(); 47 | props.put("metadata.broker.list", brokerUrl); 48 | props.put("serializer.class", "kafka.serializer.StringEncoder"); 49 | // props.put("request.required.acks", "1"); 50 | 51 | KafkaDriver driver = new KafkaDriver(path, topic, props); 52 | try { 53 | driver.execute(); 54 | } finally { 55 | driver.close(); 56 | } 57 | } 58 | 59 | } 60 | -------------------------------------------------------------------------------- /Chap5/src/main/java/org/apress/prospark/MqttDriver.java: -------------------------------------------------------------------------------- 1 | package org.apress.prospark; 2 | 3 | import java.nio.charset.StandardCharsets; 4 | 5 | import org.apache.log4j.LogManager; 6 | import org.apache.log4j.Logger; 7 | import org.eclipse.paho.client.mqttv3.MqttClient; 8 | import org.eclipse.paho.client.mqttv3.MqttException; 9 | import org.eclipse.paho.client.mqttv3.MqttMessage; 10 | import org.eclipse.paho.client.mqttv3.MqttTopic; 11 | import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence; 12 | 13 | public class MqttDriver extends 
AbstractDriver { 14 | 15 | private static final Logger LOG = LogManager.getLogger(MqttDriver.class); 16 | 17 | private final String brokerUrl; 18 | private final String topic; 19 | private MqttClient client; 20 | private MqttTopic mqttTopic; 21 | 22 | public MqttDriver(String path, String brokerUrl, String topic) { 23 | super(path); 24 | this.brokerUrl = brokerUrl; 25 | this.topic = topic; 26 | } 27 | 28 | @Override 29 | public void init() throws Exception { 30 | client = new MqttClient(brokerUrl, MqttClient.generateClientId(), new MemoryPersistence()); 31 | LOG.info(String.format("Attempting to connect to broker %s", brokerUrl)); 32 | client.connect(); 33 | mqttTopic = client.getTopic(topic); 34 | LOG.info(String.format("Connected to broker %s", brokerUrl)); 35 | } 36 | 37 | @Override 38 | public void close() throws Exception { 39 | if (client != null) { 40 | client.disconnect(); 41 | } 42 | } 43 | 44 | @Override 45 | public void sendRecord(String record) throws Exception { 46 | try { 47 | mqttTopic.publish(new MqttMessage(record.getBytes(StandardCharsets.UTF_8))); 48 | } catch (MqttException e) { 49 | if (e.getReasonCode() == MqttException.REASON_CODE_MAX_INFLIGHT) { 50 | Thread.sleep(10); 51 | } 52 | } 53 | } 54 | 55 | public static void main(String[] args) throws Exception { 56 | 57 | if (args.length != 3) { 58 | System.err.println("Usage:MqttDriver "); 59 | System.exit(-1); 60 | } 61 | 62 | String path = args[0]; 63 | String brokerUrl = args[1]; 64 | String topic = args[2]; 65 | 66 | MqttDriver driver = new MqttDriver(path, brokerUrl, topic); 67 | try { 68 | driver.execute(); 69 | } finally { 70 | driver.close(); 71 | } 72 | } 73 | 74 | } -------------------------------------------------------------------------------- /Chap5/src/main/java/org/apress/prospark/SocketDriver.java: -------------------------------------------------------------------------------- 1 | package org.apress.prospark; 2 | 3 | import java.io.IOException; 4 | import java.net.InetSocketAddress; 5 | import java.nio.ByteBuffer; 6 | import java.nio.channels.ServerSocketChannel; 7 | import java.nio.channels.SocketChannel; 8 | import java.nio.charset.StandardCharsets; 9 | import java.util.concurrent.ExecutionException; 10 | 11 | import org.apache.log4j.LogManager; 12 | import org.apache.log4j.Logger; 13 | 14 | public class SocketDriver extends AbstractDriver { 15 | 16 | private static final Logger LOG = LogManager.getLogger(SocketDriver.class); 17 | 18 | private String hostname; 19 | private int port; 20 | private SocketStream socketStream; 21 | 22 | public SocketDriver(String path, String hostname, int port) { 23 | super(path); 24 | this.hostname = hostname; 25 | this.port = port; 26 | } 27 | 28 | @Override 29 | public void init() throws Exception { 30 | socketStream = new SocketStream(hostname, port); 31 | LOG.info(String.format("Waiting for client to connect on port %d", port)); 32 | SocketChannel socketChan = socketStream.init(); 33 | LOG.info(String.format("Client %s connected on port %d", socketChan.getRemoteAddress(), port)); 34 | socketStream.kickOff(socketChan); 35 | socketStream.start(); 36 | } 37 | 38 | @Override 39 | public void close() throws IOException { 40 | socketStream.done(); 41 | if (socketStream != null) { 42 | socketStream.close(); 43 | } 44 | } 45 | 46 | @Override 47 | public void sendRecord(String record) throws Exception { 48 | socketStream.sendMsg(record + "\n"); 49 | } 50 | 51 | static class SocketStream extends Thread { 52 | 53 | private String hostname; 54 | private int port; 55 | private 
ServerSocketChannel server; 56 | private volatile boolean isDone = false; 57 | private SocketChannel socket = null; 58 | private long totalBytes; 59 | private long totalLines; 60 | 61 | public SocketStream(String hostname, int port) { 62 | this.hostname = hostname; 63 | this.port = port; 64 | totalBytes = 0; 65 | totalLines = 0; 66 | } 67 | 68 | public SocketChannel init() throws IOException { 69 | server = ServerSocketChannel.open(); 70 | server.bind(new InetSocketAddress(hostname, port)); 71 | LOG.info(String.format("Listening on %s", server.getLocalAddress())); 72 | return server.accept(); 73 | } 74 | 75 | public void kickOff(SocketChannel socket) { 76 | LOG.info("Kicking off data transfer"); 77 | this.socket = socket; 78 | } 79 | 80 | @Override 81 | public void run() { 82 | try { 83 | while (!isDone) { 84 | Thread.sleep(1000); 85 | } 86 | } catch (Exception e) { 87 | LOG.error(e); 88 | } 89 | } 90 | 91 | public void sendMsg(String msg) throws IOException, InterruptedException, ExecutionException { 92 | if (socket != null) { 93 | ByteBuffer buffer = ByteBuffer.wrap(msg.getBytes(StandardCharsets.UTF_8)); 94 | int bytesWritten = socket.write(buffer); 95 | totalBytes += bytesWritten; 96 | } else { 97 | throw new IOException("Client hasn't connected yet!"); 98 | } 99 | totalLines++; 100 | } 101 | 102 | public void done() { 103 | isDone = true; 104 | } 105 | 106 | public void close() throws IOException { 107 | if (socket != null) { 108 | socket.close(); 109 | socket = null; 110 | } 111 | LOG.info(String.format("SocketStream is closing after writing %d bytes and %d lines", totalBytes, 112 | totalLines)); 113 | } 114 | } 115 | 116 | public static void main(String[] args) throws Exception { 117 | 118 | if (args.length != 3) { 119 | System.err.println("Usage: SocketDriver "); 120 | System.exit(-1); 121 | } 122 | 123 | String path = args[0]; 124 | String hostname = args[1]; 125 | int port = Integer.parseInt(args[2]); 126 | 127 | SocketDriver driver = new SocketDriver(path, hostname, port); 128 | try { 129 | driver.execute(); 130 | } finally { 131 | driver.close(); 132 | } 133 | } 134 | } -------------------------------------------------------------------------------- /Chap5/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO, FILE, stdout 2 | log4j.rootCategory=INFO, FILE, stdout 3 | 4 | log4j.logger.org.eclipse.jetty=WARN 5 | 6 | log4j.appender.FILE=org.apache.log4j.FileAppender 7 | 8 | log4j.appender.FILE.File=/tmp/spark.log 9 | 10 | log4j.appender.FILE.layout=org.apache.log4j.PatternLayout 11 | log4j.appender.FILE.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 12 | 13 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 14 | log4j.appender.stdout.Target=System.out 15 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 16 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 17 | -------------------------------------------------------------------------------- /Chap5/src/main/scala/org/apress/prospark/HttpInputDStream.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import java.util.Timer 4 | import java.util.TimerTask 5 | 6 | import scala.reflect.ClassTag 7 | 8 | import org.apache.http.client.methods.HttpGet 9 | import org.apache.http.impl.client.CloseableHttpClient 10 | import org.apache.http.impl.client.HttpClients 11 | import 
org.apache.http.util.EntityUtils 12 | import org.apache.spark.Logging 13 | import org.apache.spark.storage.StorageLevel 14 | import org.apache.spark.streaming.StreamingContext 15 | import org.apache.spark.streaming.api.java.JavaDStream 16 | import org.apache.spark.streaming.api.java.JavaDStream.fromDStream 17 | import org.apache.spark.streaming.api.java.JavaStreamingContext 18 | import org.apache.spark.streaming.dstream.DStream 19 | import org.apache.spark.streaming.dstream.ReceiverInputDStream 20 | import org.apache.spark.streaming.receiver.Receiver 21 | 22 | class HttpInputDStream( 23 | @transient ssc_ : StreamingContext, 24 | storageLevel: StorageLevel, 25 | url: String, 26 | interval: Long) extends ReceiverInputDStream[String](ssc_) with Logging { 27 | 28 | def getReceiver(): Receiver[String] = { 29 | new HttpReceiver(storageLevel, url, interval) 30 | } 31 | } 32 | 33 | class HttpReceiver( 34 | storageLevel: StorageLevel, 35 | url: String, 36 | interval: Long) extends Receiver[String](storageLevel) with Logging { 37 | 38 | var httpClient: CloseableHttpClient = _ 39 | var trigger: Timer = _ 40 | 41 | def onStop() { 42 | httpClient.close() 43 | logInfo("Disconnected from Http Server") 44 | } 45 | 46 | def onStart() { 47 | httpClient = HttpClients.createDefault() 48 | trigger = new Timer() 49 | trigger.scheduleAtFixedRate(new TimerTask { 50 | def run() = doGet() 51 | }, 0, interval * 1000) 52 | 53 | logInfo("Http Receiver initiated") 54 | } 55 | 56 | def doGet() { 57 | logInfo("Fetching data from Http source") 58 | val response = httpClient.execute(new HttpGet(url)) 59 | try { 60 | val content = EntityUtils.toString(response.getEntity()) 61 | store(content) 62 | } catch { 63 | case e: Exception => restart("Error! Problems while connecting", e) 64 | } finally { 65 | response.close() 66 | } 67 | 68 | } 69 | 70 | } 71 | 72 | object HttpUtils { 73 | def createStream( 74 | ssc: StreamingContext, 75 | storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2, 76 | url: String, 77 | interval: Long): DStream[String] = { 78 | new HttpInputDStream(ssc, storageLevel, url, interval) 79 | } 80 | 81 | def createStream( 82 | jssc: JavaStreamingContext, 83 | storageLevel: StorageLevel, 84 | url: String, 85 | interval: Long): JavaDStream[String] = { 86 | implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[String]] 87 | createStream(jssc.ssc, storageLevel, url, interval) 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /Chap5/src/main/scala/org/apress/prospark/HttpInputDStreamAsync.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.ClassTag 4 | 5 | import org.apache.spark.Logging 6 | import org.apache.spark.storage.StorageLevel 7 | import org.apache.spark.streaming.StreamingContext 8 | import org.apache.spark.streaming.api.java.JavaDStream 9 | import org.apache.spark.streaming.api.java.JavaDStream.fromDStream 10 | import org.apache.spark.streaming.api.java.JavaStreamingContext 11 | import org.apache.spark.streaming.dstream.DStream 12 | import org.apache.spark.streaming.dstream.ReceiverInputDStream 13 | import org.apache.spark.streaming.receiver.Receiver 14 | 15 | import com.ning.http.client.AsyncCompletionHandler 16 | import com.ning.http.client.AsyncHttpClient 17 | import com.ning.http.client.Response 18 | 19 | class HttpInputDStreamAsync( 20 | @transient ssc_ : StreamingContext, 21 | storageLevel: StorageLevel, 22 | url: String) extends 
ReceiverInputDStream[String](ssc_) with Logging { 23 | 24 | def getReceiver(): Receiver[String] = { 25 | new HttpReceiverAsync(storageLevel, url) 26 | } 27 | } 28 | 29 | class HttpReceiverAsync( 30 | storageLevel: StorageLevel, 31 | url: String) extends Receiver[String](storageLevel) with Logging { 32 | 33 | var asyncHttpClient: AsyncHttpClient = _ 34 | 35 | def onStop() { 36 | asyncHttpClient.close() 37 | logInfo("Disconnected from Http Server") 38 | } 39 | 40 | def onStart() { 41 | asyncHttpClient = new AsyncHttpClient() 42 | asyncHttpClient.prepareGet(url).execute(new AsyncCompletionHandler[Response]() { 43 | 44 | override def onCompleted(response: Response): Response = { 45 | store(response.getResponseBody) 46 | return response 47 | } 48 | 49 | override def onThrowable(t: Throwable) { 50 | restart("Error! Problems while connecting", t) 51 | } 52 | }); 53 | logInfo("Http Connection initiated") 54 | } 55 | 56 | } 57 | 58 | object HttpUtilsAsync { 59 | def createStream( 60 | ssc: StreamingContext, 61 | storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2, 62 | url: String): DStream[String] = { 63 | new HttpInputDStreamAsync(ssc, storageLevel, url) 64 | } 65 | 66 | def createStream( 67 | jssc: JavaStreamingContext, 68 | storageLevel: StorageLevel, 69 | url: String): JavaDStream[String] = { 70 | implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[String]] 71 | createStream(jssc.ssc, storageLevel, url) 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /Chap5/src/main/scala/org/apress/prospark/L5-11FlumePull.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions 6 | import org.apache.spark.storage.StorageLevel 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 10 | import org.apache.spark.streaming.flume.FlumeUtils 11 | 12 | object DailyUserTypeDistributionApp2 { 13 | def main(args: Array[String]) { 14 | if (args.length != 5) { 15 | System.err.println( 16 | "Usage: DailyUserTypeDistributionApp ") 17 | System.exit(1) 18 | } 19 | val Seq(appName, hostname, port, checkpointDir, outputPath) = args.toSeq 20 | 21 | val conf = new SparkConf() 22 | .setAppName(appName) 23 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 24 | 25 | val ssc = new StreamingContext(conf, Seconds(10)) 26 | ssc.checkpoint(checkpointDir) 27 | 28 | FlumeUtils.createPollingStream(ssc, hostname, port.toInt, StorageLevel.MEMORY_ONLY_SER_2) 29 | .map(rec => new String(rec.event.getBody().array()).split(",")) 30 | .map(rec => ((rec(1).split(" ")(0), rec(12)), 1)) 31 | .updateStateByKey(statefulCount) 32 | .repartition(1) 33 | .transform(rdd => rdd.sortByKey(ascending = false)) 34 | .saveAsTextFiles(outputPath) 35 | 36 | ssc.start() 37 | ssc.awaitTermination() 38 | } 39 | 40 | val statefulCount = (values: Seq[Int], state: Option[Int]) => Some(values.sum + state.getOrElse(0)) 41 | 42 | } -------------------------------------------------------------------------------- /Chap5/src/main/scala/org/apress/prospark/L5-11FlumePush.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import 
org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions 6 | import org.apache.spark.storage.StorageLevel 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 10 | import org.apache.spark.streaming.flume.FlumeUtils 11 | 12 | object DailyUserTypeDistributionApp { 13 | def main(args: Array[String]) { 14 | if (args.length != 5) { 15 | System.err.println( 16 | "Usage: DailyUserTypeDistributionApp ") 17 | System.exit(1) 18 | } 19 | val Seq(appName, hostname, port, checkpointDir, outputPath) = args.toSeq 20 | 21 | val conf = new SparkConf() 22 | .setAppName(appName) 23 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 24 | 25 | val ssc = new StreamingContext(conf, Seconds(10)) 26 | ssc.checkpoint(checkpointDir) 27 | 28 | FlumeUtils.createStream(ssc, hostname, port.toInt, StorageLevel.MEMORY_ONLY_SER_2) 29 | .map(rec => new String(rec.event.getBody().array()).split(",")) 30 | .map(rec => ((rec(1).split(" ")(0), rec(12)), 1)) 31 | .updateStateByKey(statefulCount) 32 | .repartition(1) 33 | .transform(rdd => rdd.sortByKey(ascending = false)) 34 | .saveAsTextFiles(outputPath) 35 | 36 | ssc.start() 37 | ssc.awaitTermination() 38 | } 39 | 40 | val statefulCount = (values: Seq[Int], state: Option[Int]) => Some(values.sum + state.getOrElse(0)) 41 | 42 | } -------------------------------------------------------------------------------- /Chap5/src/main/scala/org/apress/prospark/L5-13Kafka.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions 6 | import org.apache.spark.storage.StorageLevel 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 10 | import org.apache.spark.streaming.kafka.KafkaUtils 11 | 12 | object StationJourneyCountApp { 13 | 14 | def main(args: Array[String]) { 15 | if (args.length != 7) { 16 | System.err.println( 17 | "Usage: StationJourneyCountApp ") 18 | System.exit(1) 19 | } 20 | 21 | val Seq(appName, brokerUrl, topic, consumerGroupId, zkQuorum, checkpointDir, outputPath) = args.toSeq 22 | 23 | val conf = new SparkConf() 24 | .setAppName(appName) 25 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 26 | //.set("spark.streaming.receiver.writeAheadLog.enable", "true") 27 | 28 | val ssc = new StreamingContext(conf, Seconds(10)) 29 | ssc.checkpoint(checkpointDir) 30 | 31 | val topics = Map[String, Int]( 32 | topic -> 1) 33 | KafkaUtils.createStream(ssc, zkQuorum, consumerGroupId, topics, StorageLevel.MEMORY_ONLY_SER).map(_._2) 34 | .map(rec => rec.split(",")) 35 | .map(rec => ((rec(3), rec(7)), 1)) 36 | .reduceByKey(_ + _) 37 | .repartition(1) 38 | .map(rec => (rec._2, rec._1)) 39 | .transform(rdd => rdd.sortByKey(ascending = false)) 40 | .saveAsTextFiles(outputPath) 41 | 42 | ssc.start() 43 | ssc.awaitTermination() 44 | } 45 | 46 | } -------------------------------------------------------------------------------- /Chap5/src/main/scala/org/apress/prospark/L5-14KafkaCustomConf.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import 
org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions 6 | import org.apache.spark.streaming.Seconds 7 | import org.apache.spark.streaming.StreamingContext 8 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 9 | import org.apache.spark.streaming.kafka.KafkaUtils 10 | import kafka.serializer.StringDecoder 11 | import org.apache.spark.storage.StorageLevel 12 | 13 | object StationJourneyCountCustomApp { 14 | 15 | def main(args: Array[String]) { 16 | if (args.length != 7) { 17 | System.err.println( 18 | "Usage: StationJourneyCountApp ") 19 | System.exit(1) 20 | } 21 | 22 | val Seq(appName, brokerUrl, topic, consumerGroupId, zkQuorum, checkpointDir, outputPath) = args.toSeq 23 | 24 | val conf = new SparkConf() 25 | .setAppName(appName) 26 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 27 | //.set("spark.streaming.receiver.writeAheadLog.enable", "true") 28 | 29 | val ssc = new StreamingContext(conf, Seconds(10)) 30 | ssc.checkpoint(checkpointDir) 31 | 32 | val topics = Map[String, Int]( 33 | topic -> 1) 34 | val params = Map[String, String]( 35 | "zookeeper.connect" -> zkQuorum, 36 | "group.id" -> consumerGroupId, 37 | "bootstrap.servers" -> brokerUrl) 38 | KafkaUtils.createStream[String, String, StringDecoder, StringDecoder](ssc, params, topics, StorageLevel.MEMORY_ONLY_SER).map(_._2) 39 | .map(rec => rec.split(",")) 40 | .map(rec => ((rec(3), rec(7)), 1)) 41 | .reduceByKey(_ + _) 42 | .repartition(1) 43 | .map(rec => (rec._2, rec._1)) 44 | .transform(rdd => rdd.sortByKey(ascending = false)) 45 | .saveAsTextFiles(outputPath) 46 | 47 | ssc.start() 48 | ssc.awaitTermination() 49 | } 50 | 51 | } -------------------------------------------------------------------------------- /Chap5/src/main/scala/org/apress/prospark/L5-15KafkaDirect.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions 6 | import org.apache.spark.streaming.Seconds 7 | import org.apache.spark.streaming.StreamingContext 8 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 9 | import kafka.serializer.StringDecoder 10 | import org.apache.spark.streaming.kafka.KafkaUtils 11 | 12 | object StationJourneyCountDirectApp { 13 | 14 | def main(args: Array[String]) { 15 | if (args.length != 7) { 16 | System.err.println( 17 | "Usage: StationJourneyCountApp ") 18 | System.exit(1) 19 | } 20 | 21 | val Seq(appName, brokerUrl, topic, consumerGroupId, zkQuorum, checkpointDir, outputPath) = args.toSeq 22 | 23 | val conf = new SparkConf() 24 | .setAppName(appName) 25 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 26 | 27 | val ssc = new StreamingContext(conf, Seconds(10)) 28 | ssc.checkpoint(checkpointDir) 29 | 30 | val topics = Set(topic) 31 | val params = Map[String, String]( 32 | "zookeeper.connect" -> zkQuorum, 33 | "group.id" -> consumerGroupId, 34 | "bootstrap.servers" -> brokerUrl) 35 | KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, params, topics).map(_._2) 36 | .map(rec => rec.split(",")) 37 | .map(rec => ((rec(3), rec(7)), 1)) 38 | .reduceByKey(_ + _) 39 | .repartition(1) 40 | .map(rec => (rec._2, rec._1)) 41 | .transform(rdd => rdd.sortByKey(ascending = false)) 42 | .saveAsTextFiles(outputPath) 43 | 44 | ssc.start() 45 | ssc.awaitTermination() 46 | } 47 | 48 | } 
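Editor's note: the direct (receiver-less) connector used in L5-15KafkaDirect.scala above tracks Kafka offsets itself instead of committing them to ZooKeeper, so every RDD it produces knows exactly which topic/partition offset ranges it covers. The sketch below is not one of the book's listings; it is a minimal illustration, assuming the same Spark 1.4 / spark-streaming-kafka setup and StringDecoder types as StationJourneyCountDirectApp, of how those ranges could be read back through the HasOffsetRanges API. The object name, argument handling, and topic are hypothetical.

package org.apress.prospark

import kafka.serializer.StringDecoder

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{ Seconds, StreamingContext }
import org.apache.spark.streaming.kafka.{ HasOffsetRanges, KafkaUtils }

// Hypothetical helper, not part of the book's code: logs the Kafka offset
// ranges that back each batch produced by a direct stream.
object KafkaOffsetInspectionSketch {
  def main(args: Array[String]) {
    val Array(appName, brokerUrl, topic) = args

    val conf = new SparkConf().setAppName(appName)
    val ssc = new StreamingContext(conf, Seconds(10))

    val params = Map[String, String]("metadata.broker.list" -> brokerUrl)
    val stream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, params, Set(topic))

    stream.foreachRDD { rdd =>
      // The cast must be applied to the RDD handed to foreachRDD, before any
      // transformation; the offset metadata is available on the driver.
      val ranges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      ranges.foreach(r =>
        println(s"topic=${r.topic} partition=${r.partition} from=${r.fromOffset} until=${r.untilOffset}"))
    }

    ssc.start()
    ssc.awaitTermination()
  }
}

Because the offsets travel with each batch rather than living in ZooKeeper, this is the variant that can support end-to-end exactly-once output, provided the application stores the offset ranges atomically together with its results.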
-------------------------------------------------------------------------------- /Chap5/src/main/scala/org/apress/prospark/L5-16Twitter.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions 6 | import org.apache.spark.streaming.Seconds 7 | import org.apache.spark.streaming.StreamingContext 8 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 9 | import org.apache.spark.streaming.twitter.TwitterUtils 10 | import org.apache.spark.storage.StorageLevel 11 | import twitter4j.conf.ConfigurationBuilder 12 | import twitter4j.TwitterFactory 13 | 14 | object TwitterApp { 15 | 16 | def main(args: Array[String]) { 17 | if (args.length != 2) { 18 | System.err.println( 19 | "Usage: TwitterApp ") 20 | System.exit(1) 21 | } 22 | 23 | val Seq(appName, outputPath) = args.toSeq 24 | 25 | val conf = new SparkConf() 26 | .setAppName(appName) 27 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 28 | 29 | val ssc = new StreamingContext(conf, Seconds(10)) 30 | 31 | val cb = new ConfigurationBuilder() 32 | cb.setOAuthConsumerKey("") 33 | cb.setOAuthConsumerSecret("") 34 | cb.setOAuthAccessToken("") 35 | cb.setOAuthAccessTokenSecret("") 36 | 37 | val twitterAuth = new TwitterFactory(cb.build()).getInstance().getAuthorization() 38 | 39 | val tweetStream = TwitterUtils.createStream(ssc, Some(twitterAuth), Array("nyc citi bike", "nyc bike share")) 40 | tweetStream.count().print() 41 | tweetStream.saveAsTextFiles(outputPath) 42 | 43 | ssc.start() 44 | ssc.awaitTermination() 45 | } 46 | 47 | } -------------------------------------------------------------------------------- /Chap5/src/main/scala/org/apress/prospark/L5-18Http.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.streaming.Seconds 6 | import org.apache.spark.streaming.StreamingContext 7 | import org.json4s.DefaultFormats 8 | import org.json4s.JField 9 | import org.json4s.jvalue2extractable 10 | import org.json4s.jvalue2monadic 11 | import org.json4s.native.JsonMethods.parse 12 | import org.json4s.string2JsonInput 13 | 14 | object HttpApp { 15 | 16 | def main(args: Array[String]) { 17 | if (args.length != 2) { 18 | System.err.println( 19 | "Usage: HttpApp ") 20 | System.exit(1) 21 | } 22 | 23 | val Seq(appName, outputPath) = args.toSeq 24 | 25 | val conf = new SparkConf() 26 | .setAppName(appName) 27 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 28 | 29 | val batchInterval = 10 30 | 31 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 32 | 33 | HttpUtils.createStream(ssc, url = "https://www.citibikenyc.com/stations/json", interval = batchInterval) 34 | .flatMap(rec => (parse(rec) \ "stationBeanList").children) 35 | .filter(rec => { 36 | implicit val formats = DefaultFormats 37 | (rec \ "statusKey").extract[Integer] != 1 38 | }) 39 | .map(rec => rec.filterField { 40 | case JField("id", _) => true 41 | case JField("stationName", _) => true 42 | case JField("statusValue", _) => true 43 | case _ => false 44 | }) 45 | .map(rec => { 46 | implicit val formats = DefaultFormats 47 | (rec(0)._2.extract[Integer], rec(1)._2.extract[String], rec(2)._2.extract[String]) 48 | }) 49 | .saveAsTextFiles(outputPath) 50 | 51 | ssc.start() 52 | 
ssc.awaitTermination() 53 | } 54 | 55 | } -------------------------------------------------------------------------------- /Chap5/src/main/scala/org/apress/prospark/L5-6SocketStream.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | 6 | import org.apache.spark.streaming.{ Seconds, StreamingContext } 7 | import org.apache.spark.streaming.dstream.PairDStreamFunctions 8 | 9 | import java.util.Calendar 10 | 11 | object TripByYearApp { 12 | def main(args: Array[String]) { 13 | if (args.length != 3) { 14 | System.err.println( 15 | "Usage: TripByYearApp ") 16 | System.exit(1) 17 | } 18 | val Seq(appName, hostname, port) = args.toSeq 19 | 20 | val conf = new SparkConf() 21 | .setAppName(appName) 22 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 23 | 24 | val ssc = new StreamingContext(conf, Seconds(10)) 25 | 26 | ssc.socketTextStream(hostname, port.toInt) 27 | .map(rec => rec.split(",")) 28 | .map(rec => (rec(13), rec(0).toInt)) 29 | .reduceByKey(_ + _) 30 | .map(pair => (pair._2, normalizeYear(pair._1))) 31 | .transform(rec => rec.sortByKey(ascending = false)) 32 | .saveAsTextFiles("TripByYear") 33 | 34 | ssc.start() 35 | ssc.awaitTermination() 36 | } 37 | 38 | def normalizeYear(s: String): String = { 39 | try { 40 | (Calendar.getInstance().get(Calendar.YEAR) - s.toInt).toString 41 | } catch { 42 | case e: Exception => s 43 | } 44 | } 45 | } -------------------------------------------------------------------------------- /Chap5/src/main/scala/org/apress/prospark/L5-7MultipleSocketStreams.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | 6 | import org.apache.spark.streaming.{ Seconds, StreamingContext } 7 | import org.apache.spark.streaming.dstream.PairDStreamFunctions 8 | 9 | import java.util.Calendar 10 | 11 | object TripByYearMultiApp { 12 | def main(args: Array[String]) { 13 | if (args.length != 4) { 14 | System.err.println( 15 | "Usage: TripByYearMultiApp ") 16 | System.exit(1) 17 | } 18 | val Seq(appName, hostname, basePort, nSockets) = args.toSeq 19 | 20 | val conf = new SparkConf() 21 | .setAppName(appName) 22 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 23 | 24 | val ssc = new StreamingContext(conf, Seconds(10)) 25 | 26 | val streams = (0 to nSockets.toInt - 1).map(i => ssc.socketTextStream(hostname, basePort.toInt + i)) 27 | val uniStream = ssc.union(streams) 28 | 29 | uniStream 30 | .map(rec => rec.split(",")) 31 | .map(rec => (rec(13), rec(0).toInt)) 32 | .reduceByKey(_ + _) 33 | .map(pair => (pair._2, normalizeYear(pair._1))) 34 | .transform(rec => rec.sortByKey(ascending = false)) 35 | .saveAsTextFiles("TripByYear") 36 | 37 | ssc.start() 38 | ssc.awaitTermination() 39 | } 40 | 41 | def normalizeYear(s: String): String = { 42 | try { 43 | (Calendar.getInstance().get(Calendar.YEAR) - s.toInt).toString 44 | } catch { 45 | case e: Exception => s 46 | } 47 | } 48 | } -------------------------------------------------------------------------------- /Chap5/src/main/scala/org/apress/prospark/L5-9Mqtt.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions 6 | 
import org.apache.spark.storage.StorageLevel 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 10 | import org.apache.spark.streaming.mqtt.MQTTUtils 11 | 12 | object YearlyDistributionApp { 13 | def main(args: Array[String]) { 14 | if (args.length != 4) { 15 | System.err.println( 16 | "Usage: YearlyDistributionApp ") 17 | System.exit(1) 18 | } 19 | val Seq(appName, brokerUrl, topic, checkpointDir) = args.toSeq 20 | 21 | val conf = new SparkConf() 22 | .setAppName(appName) 23 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 24 | 25 | val ssc = new StreamingContext(conf, Seconds(10)) 26 | ssc.checkpoint(checkpointDir) 27 | 28 | MQTTUtils.createStream(ssc, brokerUrl, topic, StorageLevel.MEMORY_ONLY_SER_2) 29 | .map(rec => rec.split(",")) 30 | .map(rec => (rec(1).split(" ")(0), 1)) 31 | .updateStateByKey(statefulCount) 32 | .map(pair => (pair._2, pair._1)) 33 | .transform(rec => rec.sortByKey(ascending = false)) 34 | .saveAsTextFiles("YearlyDistribution") 35 | 36 | ssc.start() 37 | ssc.awaitTermination() 38 | } 39 | 40 | val statefulCount = (values: Seq[Int], state: Option[Int]) => Some(values.sum + state.getOrElse(0)) 41 | 42 | } -------------------------------------------------------------------------------- /Chap6/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 2 | -------------------------------------------------------------------------------- /Chap6/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += Classpaths.typesafeResolver 2 | 3 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.4.0") 4 | -------------------------------------------------------------------------------- /Chap6/spark.sbt: -------------------------------------------------------------------------------- 1 | import AssemblyKeys._ 2 | 3 | assemblySettings 4 | 5 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { mergeStrategy => { 6 | case entry => { 7 | val strategy = mergeStrategy(entry) 8 | if (strategy == MergeStrategy.deduplicate) MergeStrategy.first 9 | else strategy 10 | } 11 | }} 12 | 13 | name := "Chap6" 14 | 15 | version := "1.0" 16 | 17 | scalaVersion := "2.10.5" 18 | 19 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.4.0" 20 | 21 | libraryDependencies += "org.apache.spark" %% "spark-streaming" % "1.4.0" 22 | 23 | libraryDependencies += "org.json4s" %% "json4s-native" % "3.2.10" 24 | 25 | libraryDependencies += "org.apache.spark" %% "spark-streaming-mqtt" % "1.4.0" 26 | 27 | libraryDependencies += "org.eclipse.paho" % "org.eclipse.paho.client.mqttv3" % "1.0.1" 28 | 29 | libraryDependencies += "org.apache.httpcomponents" % "httpclient" % "4.5.1" 30 | 31 | libraryDependencies += "org.apache.commons" % "commons-pool2" % "2.4.2" 32 | 33 | libraryDependencies += "org.apache.hbase" % "hbase" % "0.98.15-hadoop2" 34 | 35 | //libraryDependencies += "org.apache.hbase" % "hbase-client" % "1.1.2" 36 | 37 | //libraryDependencies += "org.apache.hbase" % "hbase-server" % "1.1.2" 38 | 39 | //libraryDependencies += "org.apache.hbase" % "hbase-common" % "1.1.2" 40 | 41 | libraryDependencies += "org.apache.hbase" % "hbase-client" % "2.0.0-SNAPSHOT" 42 | 43 | libraryDependencies += "org.apache.hbase" % "hbase-server" % "2.0.0-SNAPSHOT" 44 | 45 | libraryDependencies += 
"org.apache.hbase" % "hbase-common" % "2.0.0-SNAPSHOT" 46 | 47 | libraryDependencies += "org.apache.hbase" % "hbase-spark" % "2.0.0-SNAPSHOT" 48 | 49 | resolvers += "Apache Snapshot Repository" at "https://repository.apache.org/content/repositories/snapshots" 50 | 51 | libraryDependencies += "org.apache.cassandra" % "cassandra-all" % "2.1.11" 52 | 53 | libraryDependencies += "com.datastax.spark" %% "spark-cassandra-connector" % "1.4.0" 54 | 55 | libraryDependencies += "redis.clients" % "jedis" % "2.7.3" 56 | 57 | resolvers += "MQTT Repository" at "https://repo.eclipse.org/content/repositories/paho-releases/" 58 | -------------------------------------------------------------------------------- /Chap6/src/main/java/org/apress/prospark/AbstractDriver.java: -------------------------------------------------------------------------------- 1 | package org.apress.prospark; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.IOException; 6 | import java.io.InputStreamReader; 7 | import java.util.Enumeration; 8 | import java.util.zip.ZipEntry; 9 | import java.util.zip.ZipFile; 10 | 11 | import org.apache.log4j.LogManager; 12 | import org.apache.log4j.Logger; 13 | 14 | public abstract class AbstractDriver { 15 | 16 | private static final Logger LOG = LogManager.getLogger(AbstractDriver.class); 17 | 18 | private String path; 19 | 20 | public AbstractDriver(String path) { 21 | this.path = path; 22 | } 23 | 24 | public abstract void init() throws Exception; 25 | 26 | public abstract void close() throws Exception; 27 | 28 | public abstract void sendRecord(String record) throws Exception; 29 | 30 | public void execute() throws Exception { 31 | 32 | try { 33 | init(); 34 | File dirPath = new File(path); 35 | if (dirPath.isDirectory()) { 36 | File[] files = new File(path).listFiles(); 37 | for (File f : files) { 38 | LOG.info(String.format("Feeding zipped file %s", f.getName())); 39 | ZipFile zFile = null; 40 | try { 41 | zFile = new ZipFile(f); 42 | Enumeration zEntries = zFile.entries(); 43 | 44 | while (zEntries.hasMoreElements()) { 45 | ZipEntry zEntry = zEntries.nextElement(); 46 | LOG.info(String.format("Feeding file %s", zEntry.getName())); 47 | try (BufferedReader br = new BufferedReader( 48 | new InputStreamReader(zFile.getInputStream(zEntry)))) { 49 | // skip header 50 | br.readLine(); 51 | String line; 52 | while ((line = br.readLine()) != null) { 53 | sendRecord(line); 54 | } 55 | } 56 | } 57 | } catch (IOException e) { 58 | LOG.error(e.getMessage()); 59 | } finally { 60 | if (zFile != null) { 61 | try { 62 | zFile.close(); 63 | } catch (IOException e) { 64 | LOG.error(e.getMessage()); 65 | } 66 | } 67 | } 68 | } 69 | } else { 70 | LOG.error(String.format("Path %s is not a directory", path)); 71 | } 72 | } finally { 73 | close(); 74 | } 75 | } 76 | } -------------------------------------------------------------------------------- /Chap6/src/main/java/org/apress/prospark/MqttDriver.java: -------------------------------------------------------------------------------- 1 | package org.apress.prospark; 2 | 3 | import java.nio.charset.StandardCharsets; 4 | 5 | import org.apache.log4j.LogManager; 6 | import org.apache.log4j.Logger; 7 | import org.eclipse.paho.client.mqttv3.MqttClient; 8 | import org.eclipse.paho.client.mqttv3.MqttException; 9 | import org.eclipse.paho.client.mqttv3.MqttMessage; 10 | import org.eclipse.paho.client.mqttv3.MqttTopic; 11 | import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence; 12 | 13 | public class MqttDriver extends 
AbstractDriver { 14 | 15 | private static final Logger LOG = LogManager.getLogger(MqttDriver.class); 16 | 17 | private final String brokerUrl; 18 | private final String topic; 19 | private MqttClient client; 20 | private MqttTopic mqttTopic; 21 | 22 | public MqttDriver(String path, String brokerUrl, String topic) { 23 | super(path); 24 | this.brokerUrl = brokerUrl; 25 | this.topic = topic; 26 | } 27 | 28 | @Override 29 | public void init() throws Exception { 30 | client = new MqttClient(brokerUrl, MqttClient.generateClientId(), new MemoryPersistence()); 31 | LOG.info(String.format("Attempting to connect to broker %s", brokerUrl)); 32 | client.connect(); 33 | mqttTopic = client.getTopic(topic); 34 | LOG.info(String.format("Connected to broker %s", brokerUrl)); 35 | } 36 | 37 | @Override 38 | public void close() throws Exception { 39 | if (client != null) { 40 | client.disconnect(); 41 | } 42 | } 43 | 44 | @Override 45 | public void sendRecord(String record) throws Exception { 46 | try { 47 | mqttTopic.publish(new MqttMessage(record.getBytes(StandardCharsets.UTF_8))); 48 | } catch (MqttException e) { 49 | if (e.getReasonCode() == MqttException.REASON_CODE_MAX_INFLIGHT) { 50 | Thread.sleep(10); 51 | } 52 | } 53 | } 54 | 55 | public static void main(String[] args) throws Exception { 56 | 57 | if (args.length != 3) { 58 | System.err.println("Usage:MqttDriver "); 59 | System.exit(-1); 60 | } 61 | 62 | String path = args[0]; 63 | String brokerUrl = args[1]; 64 | String topic = args[2]; 65 | 66 | MqttDriver driver = new MqttDriver(path, brokerUrl, topic); 67 | try { 68 | driver.execute(); 69 | } finally { 70 | driver.close(); 71 | } 72 | } 73 | 74 | } -------------------------------------------------------------------------------- /Chap6/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO, FILE, stdout 2 | log4j.rootCategory=INFO, FILE, stdout 3 | 4 | log4j.logger.org.eclipse.jetty=WARN 5 | 6 | log4j.appender.FILE=org.apache.log4j.FileAppender 7 | 8 | log4j.appender.FILE.File=/tmp/spark.log 9 | 10 | log4j.appender.FILE.layout=org.apache.log4j.PatternLayout 11 | log4j.appender.FILE.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 12 | 13 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 14 | log4j.appender.stdout.Target=System.out 15 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 16 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 17 | -------------------------------------------------------------------------------- /Chap6/src/main/scala/org/apress/prospark/HttpInputDStream.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import java.util.Timer 4 | import java.util.TimerTask 5 | 6 | import scala.reflect.ClassTag 7 | 8 | import org.apache.http.client.methods.HttpGet 9 | import org.apache.http.impl.client.CloseableHttpClient 10 | import org.apache.http.impl.client.HttpClients 11 | import org.apache.http.util.EntityUtils 12 | import org.apache.spark.Logging 13 | import org.apache.spark.storage.StorageLevel 14 | import org.apache.spark.streaming.StreamingContext 15 | import org.apache.spark.streaming.api.java.JavaDStream 16 | import org.apache.spark.streaming.api.java.JavaDStream.fromDStream 17 | import org.apache.spark.streaming.api.java.JavaStreamingContext 18 | import org.apache.spark.streaming.dstream.DStream 19 | import 
org.apache.spark.streaming.dstream.ReceiverInputDStream 20 | import org.apache.spark.streaming.receiver.Receiver 21 | 22 | class HttpInputDStream( 23 | @transient ssc_ : StreamingContext, 24 | storageLevel: StorageLevel, 25 | url: String, 26 | interval: Long) extends ReceiverInputDStream[String](ssc_) with Logging { 27 | 28 | def getReceiver(): Receiver[String] = { 29 | new HttpReceiver(storageLevel, url, interval) 30 | } 31 | } 32 | 33 | class HttpReceiver( 34 | storageLevel: StorageLevel, 35 | url: String, 36 | interval: Long) extends Receiver[String](storageLevel) with Logging { 37 | 38 | var httpClient: CloseableHttpClient = _ 39 | var trigger: Timer = _ 40 | 41 | def onStop() { 42 | httpClient.close() 43 | logInfo("Disconnected from Http Server") 44 | } 45 | 46 | def onStart() { 47 | httpClient = HttpClients.createDefault() 48 | trigger = new Timer() 49 | trigger.scheduleAtFixedRate(new TimerTask { 50 | def run() = doGet() 51 | }, 0, interval * 1000) 52 | 53 | logInfo("Http Receiver initiated") 54 | } 55 | 56 | def doGet() { 57 | logInfo("Fetching data from Http source") 58 | val response = httpClient.execute(new HttpGet(url)) 59 | try { 60 | val content = EntityUtils.toString(response.getEntity()) 61 | store(content) 62 | } catch { 63 | case e: Exception => restart("Error! Problems while connecting", e) 64 | } finally { 65 | response.close() 66 | } 67 | 68 | } 69 | 70 | } 71 | 72 | object HttpUtils { 73 | def createStream( 74 | ssc: StreamingContext, 75 | storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2, 76 | url: String, 77 | interval: Long): DStream[String] = { 78 | new HttpInputDStream(ssc, storageLevel, url, interval) 79 | } 80 | 81 | def createStream( 82 | jssc: JavaStreamingContext, 83 | storageLevel: StorageLevel, 84 | url: String, 85 | interval: Long): JavaDStream[String] = { 86 | implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[String]] 87 | createStream(jssc.ssc, storageLevel, url, interval) 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /Chap6/src/main/scala/org/apress/prospark/L6-10LazyStatic.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import java.nio.charset.StandardCharsets 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.streaming.Seconds 7 | import org.apache.spark.streaming.StreamingContext 8 | import org.eclipse.paho.client.mqttv3.MqttClient 9 | import org.eclipse.paho.client.mqttv3.MqttMessage 10 | import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence 11 | import org.json4s.DefaultFormats 12 | import org.json4s.JField 13 | import org.json4s.JsonAST.JObject 14 | import org.json4s.jvalue2extractable 15 | import org.json4s.jvalue2monadic 16 | import org.json4s.native.JsonMethods.parse 17 | import org.json4s.string2JsonInput 18 | import org.apache.commons.pool2.PooledObject 19 | import org.apache.commons.pool2.BasePooledObjectFactory 20 | import org.apache.commons.pool2.impl.DefaultPooledObject 21 | import org.apache.commons.pool2.impl.GenericObjectPool 22 | import org.apache.commons.pool2.ObjectPool 23 | 24 | object MqttSinkAppE { 25 | 26 | def main(args: Array[String]) { 27 | if (args.length != 3) { 28 | System.err.println( 29 | "Usage: MqttSinkApp ") 30 | System.exit(1) 31 | } 32 | 33 | val Seq(appName, outputBrokerUrl, topic) = args.toSeq 34 | 35 | val conf = new SparkConf() 36 | .setAppName(appName) 37 | 
.setJars(SparkContext.jarOfClass(this.getClass).toSeq) 38 | 39 | val batchInterval = 10 40 | 41 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 42 | 43 | HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 44 | interval = batchInterval) 45 | .flatMap(rec => { 46 | val query = parse(rec) \ "query" 47 | ((query \ "results" \ "quote").children).map(rec => JObject(JField("Timestamp", query \ "created")).merge(rec)) 48 | }) 49 | .map(rec => { 50 | implicit val formats = DefaultFormats 51 | rec.children.map(f => f.extract[String]) mkString "," 52 | }) 53 | .foreachRDD { rdd => 54 | rdd.foreachPartition { par => 55 | val mqttSink = MqttSinkPool().borrowObject() 56 | par.foreach(message => mqttSink.publish(topic, new MqttMessage(message.getBytes(StandardCharsets.UTF_8)))) 57 | MqttSinkPool().returnObject(mqttSink) 58 | } 59 | } 60 | 61 | ssc.start() 62 | ssc.awaitTermination() 63 | } 64 | } 65 | 66 | object MqttSinkPool { 67 | val poolSize = 8 68 | val brokerUrl = "tcp://localhost:1883" 69 | val mqttPool = new GenericObjectPool[MqttClient](new MqttClientFactory(brokerUrl)) 70 | mqttPool.setMaxTotal(poolSize) 71 | sys.addShutdownHook { 72 | mqttPool.close() 73 | } 74 | 75 | def apply(): GenericObjectPool[MqttClient] = { 76 | mqttPool 77 | } 78 | } 79 | 80 | class MqttClientFactory(brokerUrl: String) extends BasePooledObjectFactory[MqttClient] { 81 | override def create() = { 82 | val client = new MqttClient(brokerUrl, MqttClient.generateClientId(), new MemoryPersistence()) 83 | client.connect() 84 | client 85 | } 86 | override def wrap(client: MqttClient) = new DefaultPooledObject[MqttClient](client) 87 | override def validateObject(pObj: PooledObject[MqttClient]) = pObj.getObject.isConnected() 88 | override def destroyObject(pObj: PooledObject[MqttClient]) = { 89 | pObj.getObject.disconnect() 90 | pObj.getObject.close() 91 | } 92 | override def passivateObject(pObj: PooledObject[MqttClient]) = {} 93 | } 94 | -------------------------------------------------------------------------------- /Chap6/src/main/scala/org/apress/prospark/L6-12StaticPool.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import java.nio.charset.StandardCharsets 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | import org.eclipse.paho.client.mqttv3.MqttClient 10 | import org.eclipse.paho.client.mqttv3.MqttMessage 11 | import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence 12 | import org.json4s.DefaultFormats 13 | import org.json4s.JField 14 | import org.json4s.JsonAST.JObject 15 | import org.json4s.jvalue2extractable 16 | import org.json4s.jvalue2monadic 17 | import org.json4s.native.JsonMethods.parse 18 | import org.json4s.string2JsonInput 19 | 20 | object MqttSinkAppF { 21 | 22 | def main(args: Array[String]) { 23 | if (args.length != 3) { 24 | System.err.println( 25 | "Usage: MqttSinkApp ") 26 | System.exit(1) 27 | } 28 | 29 | val Seq(appName, outputBrokerUrl, topic) = args.toSeq 30 | 31 | val conf = new SparkConf() 32 | .setAppName(appName) 33 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 34 | 35 | val batchInterval = 10 36 | 37 | val ssc = new 
StreamingContext(conf, Seconds(batchInterval)) 38 | 39 | val mqttSink = ssc.sparkContext.broadcast(MqttSinkLazy(outputBrokerUrl)) 40 | 41 | HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 42 | interval = batchInterval) 43 | .flatMap(rec => { 44 | val query = parse(rec) \ "query" 45 | ((query \ "results" \ "quote").children).map(rec => JObject(JField("Timestamp", query \ "created")).merge(rec)) 46 | }) 47 | .map(rec => { 48 | implicit val formats = DefaultFormats 49 | rec.children.map(f => f.extract[String]) mkString "," 50 | }) 51 | .foreachRDD { rdd => 52 | rdd.foreachPartition { par => 53 | par.foreach(message => mqttSink.value.client.publish(topic, new MqttMessage(message.getBytes(StandardCharsets.UTF_8)))) 54 | } 55 | } 56 | 57 | ssc.start() 58 | ssc.awaitTermination() 59 | } 60 | 61 | } 62 | 63 | class MqttSinkLazy(brokerUrl: String) extends Serializable { 64 | lazy val client = { 65 | val client = new MqttClient(brokerUrl, MqttClient.generateClientId(), new MemoryPersistence()) 66 | client.connect() 67 | sys.addShutdownHook { 68 | client.disconnect() 69 | client.close() 70 | } 71 | client 72 | } 73 | } 74 | 75 | object MqttSinkLazy { 76 | val brokerUrl = "tcp://localhost:1883" 77 | val client = new MqttSinkLazy(brokerUrl) 78 | 79 | def apply(brokerUrl: String): MqttSinkLazy = { 80 | client 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /Chap6/src/main/scala/org/apress/prospark/L6-14HBase.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.hadoop.conf.Configuration 4 | import org.apache.hadoop.hbase.HBaseConfiguration 5 | import org.apache.hadoop.hbase.client.Put 6 | import org.apache.hadoop.hbase.mapreduce.TableOutputFormat 7 | import org.apache.hadoop.hbase.util.Bytes 8 | import org.apache.hadoop.io.Text 9 | import org.apache.spark.SparkConf 10 | import org.apache.spark.SparkContext 11 | import org.apache.spark.rdd.RDD.rddToPairRDDFunctions 12 | import org.apache.spark.streaming.Seconds 13 | import org.apache.spark.streaming.StreamingContext 14 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 15 | import org.json4s.DefaultFormats 16 | import org.json4s.jvalue2extractable 17 | import org.json4s.jvalue2monadic 18 | import org.json4s.native.JsonMethods.parse 19 | import org.json4s.string2JsonInput 20 | 21 | object HBaseSinkApp { 22 | 23 | def main(args: Array[String]) { 24 | if (args.length != 5) { 25 | System.err.println( 26 | "Usage: HBaseSinkApp ") 27 | System.exit(1) 28 | } 29 | 30 | val Seq(appName, hbaseMaster, tableName, columnFamilyName, columnName) = args.toSeq 31 | 32 | val conf = new SparkConf() 33 | .setAppName(appName) 34 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 35 | 36 | val batchInterval = 10 37 | val windowSize = 20 38 | val slideInterval = 10 39 | 40 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 41 | 42 | HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 43 | interval = batchInterval) 44 | 
.flatMap(rec => { 45 | implicit val formats = DefaultFormats 46 | val query = parse(rec) \ "query" 47 | ((query \ "results" \ "quote").children) 48 | .map(rec => ((rec \ "symbol").extract[String], (rec \ "LastTradePriceOnly").extract[String].toFloat)) 49 | }) 50 | .reduceByKeyAndWindow((x: Float, y: Float) => (x + y), Seconds(windowSize), Seconds(slideInterval)) 51 | .foreachRDD(rdd => { 52 | val hbaseConf = HBaseConfiguration.create() 53 | hbaseConf.set(TableOutputFormat.OUTPUT_TABLE, tableName) 54 | hbaseConf.set("hbase.master", hbaseMaster) 55 | val jobConf = new Configuration(hbaseConf) 56 | jobConf.set("mapreduce.job.outputformat.class", classOf[TableOutputFormat[Text]].getName) 57 | rdd.map(rec => { 58 | val put = new Put(rec._1.getBytes) 59 | put.addColumn(columnFamilyName.getBytes, columnName.getBytes, Bytes.toBytes(rec._2 / (windowSize / batchInterval))) 60 | (rec._1, put) 61 | }).saveAsNewAPIHadoopDataset(jobConf) 62 | }) 63 | 64 | ssc.start() 65 | ssc.awaitTermination() 66 | } 67 | } 68 | 69 | -------------------------------------------------------------------------------- /Chap6/src/main/scala/org/apress/prospark/L6-16SparkHBase.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.hadoop.hbase.HBaseConfiguration 4 | import org.apache.hadoop.hbase.TableName 5 | import org.apache.hadoop.hbase.client.Put 6 | import org.apache.hadoop.hbase.spark.HBaseContext 7 | import org.apache.hadoop.hbase.util.Bytes 8 | import org.apache.spark.SparkConf 9 | import org.apache.spark.SparkContext 10 | import org.apache.spark.streaming.Seconds 11 | import org.apache.spark.streaming.StreamingContext 12 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 13 | import org.json4s.DefaultFormats 14 | import org.json4s.jvalue2extractable 15 | import org.json4s.jvalue2monadic 16 | import org.json4s.native.JsonMethods.parse 17 | import org.json4s.string2JsonInput 18 | 19 | object SparkHBaseBulkPutApp { 20 | 21 | def main(args: Array[String]) { 22 | if (args.length != 4) { 23 | System.err.println( 24 | "Usage: SparkHBaseBulkPutApp ") 25 | System.exit(1) 26 | } 27 | 28 | val Seq(appName, tableName, columnFamilyName, columnName) = args.toSeq 29 | 30 | val conf = new SparkConf() 31 | .setAppName(appName) 32 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 33 | 34 | val batchInterval = 10 35 | val windowSize = 20 36 | val slideInterval = 10 37 | 38 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 39 | 40 | val hbaseConf = HBaseConfiguration.create() 41 | val hContext = new HBaseContext(ssc.sparkContext, hbaseConf) 42 | 43 | val windowed = HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 44 | interval = batchInterval) 45 | .flatMap(rec => { 46 | implicit val formats = DefaultFormats 47 | val query = parse(rec) \ "query" 48 | ((query \ "results" \ "quote").children) 49 | .map(rec => ((rec \ "symbol").extract[String], (rec \ "LastTradePriceOnly").extract[String].toFloat)) 50 | }) 51 | .reduceByKeyAndWindow((x: Float, y: Float) => (x + y), Seconds(windowSize), Seconds(slideInterval)) 52 | 53 | hContext.streamBulkPut[(String, Float)](windowed, TableName.valueOf(tableName), rec => { 54 | val put = new Put(rec._1.getBytes) 55 | 
put.addColumn(columnFamilyName.getBytes, columnName.getBytes, Bytes.toBytes(rec._2 / (windowSize / batchInterval))) 56 | put 57 | }) 58 | 59 | ssc.start() 60 | ssc.awaitTermination() 61 | } 62 | } 63 | 64 | -------------------------------------------------------------------------------- /Chap6/src/main/scala/org/apress/prospark/L6-18Cassandra.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import java.nio.charset.StandardCharsets 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.streaming.Seconds 7 | import org.apache.spark.streaming.StreamingContext 8 | import org.json4s.DefaultFormats 9 | import org.json4s.JField 10 | import org.json4s.JsonAST.JObject 11 | import org.json4s.jvalue2extractable 12 | import org.json4s.jvalue2monadic 13 | import org.json4s.native.JsonMethods.parse 14 | import org.json4s.string2JsonInput 15 | import org.apache.hadoop.conf.Configuration 16 | import org.apache.hadoop.io.Text 17 | import java.nio.ByteBuffer 18 | import org.apache.cassandra.hadoop.ColumnFamilyOutputFormat 19 | import org.apache.cassandra.hadoop.ConfigHelper 20 | import org.apache.cassandra.thrift.ColumnOrSuperColumn 21 | import org.apache.cassandra.thrift.Column 22 | import org.apache.cassandra.utils.ByteBufferUtil 23 | import org.apache.cassandra.thrift.Mutation 24 | import java.util.Arrays 25 | 26 | object CassandraSinkApp { 27 | 28 | def main(args: Array[String]) { 29 | if (args.length != 6) { 30 | System.err.println( 31 | "Usage: CassandraSinkApp ") 32 | System.exit(1) 33 | } 34 | 35 | val Seq(appName, cassandraHost, cassandraPort, keyspace, columnFamilyName, columnName) = args.toSeq 36 | 37 | val conf = new SparkConf() 38 | .setAppName(appName) 39 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 40 | 41 | val batchInterval = 10 42 | val windowSize = 20 43 | val slideInterval = 10 44 | 45 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 46 | 47 | HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 48 | interval = batchInterval) 49 | .flatMap(rec => { 50 | implicit val formats = DefaultFormats 51 | val query = parse(rec) \ "query" 52 | ((query \ "results" \ "quote").children) 53 | .map(rec => ((rec \ "symbol").extract[String], (rec \ "LastTradePriceOnly").extract[String].toFloat)) 54 | }) 55 | .reduceByKeyAndWindow((x: Float, y: Float) => (x + y), Seconds(windowSize), Seconds(slideInterval)) 56 | .foreachRDD(rdd => { 57 | val jobConf = new Configuration() 58 | ConfigHelper.setOutputRpcPort(jobConf, cassandraPort) 59 | ConfigHelper.setOutputInitialAddress(jobConf, cassandraHost) 60 | ConfigHelper.setOutputColumnFamily(jobConf, keyspace, columnFamilyName) 61 | ConfigHelper.setOutputPartitioner(jobConf, "Murmur3Partitioner") 62 | rdd.map(rec => { 63 | val c = new Column() 64 | c.setName(ByteBufferUtil.bytes(columnName)) 65 | c.setValue(ByteBufferUtil.bytes(rec._2 / (windowSize / batchInterval))) 66 | c.setTimestamp(System.currentTimeMillis) 67 | val m = new Mutation() 68 | m.setColumn_or_supercolumn(new ColumnOrSuperColumn()) 69 | m.column_or_supercolumn.setColumn(c) 70 | (ByteBufferUtil.bytes(rec._1), Arrays.asList(m)) 71 | }).saveAsNewAPIHadoopFile(keyspace, classOf[ByteBuffer], classOf[List[Mutation]], 
classOf[ColumnFamilyOutputFormat], jobConf) 72 | }) 73 | 74 | ssc.start() 75 | ssc.awaitTermination() 76 | } 77 | } 78 | 79 | -------------------------------------------------------------------------------- /Chap6/src/main/scala/org/apress/prospark/L6-20CassandraConnector.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 10 | import org.json4s.DefaultFormats 11 | import org.json4s.jvalue2extractable 12 | import org.json4s.jvalue2monadic 13 | import org.json4s.native.JsonMethods.parse 14 | import org.json4s.string2JsonInput 15 | 16 | import com.datastax.spark.connector.SomeColumns 17 | import com.datastax.spark.connector.cql.CassandraConnector 18 | import com.datastax.spark.connector.streaming.toDStreamFunctions 19 | import com.datastax.spark.connector.toNamedColumnRef 20 | 21 | object CassandraConnectorSinkApp { 22 | 23 | def main(args: Array[String]) { 24 | if (args.length != 6) { 25 | System.err.println( 26 | "Usage: CassandraConnectorSinkApp ") 27 | System.exit(1) 28 | } 29 | 30 | val Seq(appName, cassandraHost, cassandraPort, keyspace, tableName, columnName) = args.toSeq 31 | 32 | val conf = new SparkConf() 33 | .setAppName(appName) 34 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 35 | .set("spark.cassandra.connection.host", cassandraHost) 36 | .set("spark.cassandra.connection.port", cassandraPort) 37 | 38 | val batchInterval = 10 39 | val windowSize = 20 40 | val slideInterval = 10 41 | 42 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 43 | 44 | CassandraConnector(conf).withSessionDo { session => 45 | session.execute(s"CREATE KEYSPACE IF NOT EXISTS %s WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1 }".format(keyspace)) 46 | session.execute(s"CREATE TABLE IF NOT EXISTS %s.%s (key TEXT PRIMARY KEY, %s FLOAT)".format(keyspace, tableName, columnName)) 47 | } 48 | 49 | HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 50 | interval = batchInterval) 51 | .flatMap(rec => { 52 | implicit val formats = DefaultFormats 53 | val query = parse(rec) \ "query" 54 | ((query \ "results" \ "quote").children) 55 | .map(rec => ((rec \ "symbol").extract[String], (rec \ "LastTradePriceOnly").extract[String].toFloat)) 56 | }) 57 | .reduceByKeyAndWindow((x: Float, y: Float) => (x + y), Seconds(windowSize), Seconds(slideInterval)) 58 | .map(stock => (stock._1, stock._2 / (windowSize / batchInterval))) 59 | .saveToCassandra(keyspace, tableName) 60 | 61 | ssc.start() 62 | ssc.awaitTermination() 63 | } 64 | } 65 | 66 | -------------------------------------------------------------------------------- /Chap6/src/main/scala/org/apress/prospark/L6-22Counters.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import java.util.concurrent.atomic.AtomicLong 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.streaming.Seconds 8 | import 
org.apache.spark.streaming.StreamingContext 9 | import org.json4s.DefaultFormats 10 | import org.json4s.jvalue2extractable 11 | import org.json4s.jvalue2monadic 12 | import org.json4s.native.JsonMethods.parse 13 | import org.json4s.string2JsonInput 14 | 15 | object StatefulCountersApp { 16 | 17 | def main(args: Array[String]) { 18 | if (args.length != 1) { 19 | System.err.println( 20 | "Usage: StatefulCountersApp ") 21 | System.exit(1) 22 | } 23 | 24 | val Seq(appName) = args.toSeq 25 | 26 | val conf = new SparkConf() 27 | .setAppName(appName) 28 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 29 | 30 | val batchInterval = 10 31 | 32 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 33 | 34 | var globalMax: AtomicLong = new AtomicLong(Long.MinValue) 35 | var globalMin: AtomicLong = new AtomicLong(Long.MaxValue) 36 | var globalCounter500: AtomicLong = new AtomicLong(0) 37 | 38 | HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 39 | interval = batchInterval) 40 | .flatMap(rec => { 41 | implicit val formats = DefaultFormats 42 | val query = parse(rec) \ "query" 43 | ((query \ "results" \ "quote").children) 44 | .map(rec => ((rec \ "symbol").extract[String], (rec \ "LastTradePriceOnly").extract[String].toFloat, (rec \ "Volume").extract[String].toLong)) 45 | }) 46 | .foreachRDD(rdd => { 47 | val stocks = rdd.take(10) 48 | stocks.foreach(stock => { 49 | val price = stock._2 50 | val volume = stock._3 51 | if (volume > globalMax.get()) { 52 | globalMax.set(volume) 53 | } 54 | if (volume < globalMin.get()) { 55 | globalMin.set(volume) 56 | } 57 | if (price > 500) { 58 | globalCounter500.incrementAndGet() 59 | } 60 | }) 61 | if (globalCounter500.get() > 1000L) { 62 | println("Global counter has reached 1000") 63 | println("Max ----> " + globalMax.get) 64 | println("Min ----> " + globalMin.get) 65 | globalCounter500.set(0) 66 | } 67 | }) 68 | 69 | ssc.start() 70 | ssc.awaitTermination() 71 | } 72 | } 73 | 74 | -------------------------------------------------------------------------------- /Chap6/src/main/scala/org/apress/prospark/L6-23UpdateState.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.streaming.Seconds 6 | import org.apache.spark.streaming.StreamingContext 7 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 8 | import org.json4s.DefaultFormats 9 | import org.json4s.jvalue2extractable 10 | import org.json4s.jvalue2monadic 11 | import org.json4s.native.JsonMethods.parse 12 | import org.json4s.string2JsonInput 13 | 14 | object StatefulUpdateStateApp { 15 | 16 | def main(args: Array[String]) { 17 | if (args.length != 2) { 18 | System.err.println( 19 | "Usage: StatefulUpdateStateApp ") 20 | System.exit(1) 21 | } 22 | 23 | val Seq(appName, checkpointDir) = args.toSeq 24 | 25 | val conf = new SparkConf() 26 | .setAppName(appName) 27 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 28 | 29 | val batchInterval = 10 30 | 31 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 32 | ssc.checkpoint(checkpointDir) 33 | 34 | HttpUtils.createStream(ssc, url = 
"https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 35 | interval = batchInterval) 36 | .flatMap(rec => { 37 | implicit val formats = DefaultFormats 38 | val query = parse(rec) \ "query" 39 | ((query \ "results" \ "quote").children) 40 | .map(rec => ((rec \ "symbol").extract[String], ((rec \ "LastTradePriceOnly").extract[String].toFloat, (rec \ "Volume").extract[String].toLong))) 41 | }) 42 | .updateStateByKey(updateState) 43 | .print() 44 | 45 | def updateState(values: Seq[(Float, Long)], state: Option[(Long, Long, Long)]): Option[(Long, Long, Long)] = { 46 | val volumes = values.map(s => s._2) 47 | val localMin = volumes.min 48 | val localMax = volumes.max 49 | val localCount500 = values.map(s => s._1).count(price => price > 500) 50 | val globalValues = state.getOrElse((Long.MaxValue, Long.MinValue, 0L)).asInstanceOf[(Long, Long, Long)] 51 | val newMin = if (localMin < globalValues._1) localMin else globalValues._1 52 | val newMax = if (localMax > globalValues._2) localMax else globalValues._2 53 | val newCount500 = globalValues._3 + localCount500 54 | return Some(newMin, newMax, newCount500) 55 | } 56 | 57 | ssc.start() 58 | ssc.awaitTermination() 59 | } 60 | } 61 | 62 | -------------------------------------------------------------------------------- /Chap6/src/main/scala/org/apress/prospark/L6-24Accumulators.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.collection.mutable 4 | 5 | import org.apache.spark.AccumulableParam 6 | import org.apache.spark.SparkConf 7 | import org.apache.spark.SparkContext 8 | import org.apache.spark.streaming.Seconds 9 | import org.apache.spark.streaming.StreamingContext 10 | import org.json4s.DefaultFormats 11 | import org.json4s.jvalue2extractable 12 | import org.json4s.jvalue2monadic 13 | import org.json4s.native.JsonMethods.parse 14 | import org.json4s.string2JsonInput 15 | 16 | object StatefulAccumulatorsApp { 17 | 18 | object StockAccum extends AccumulableParam[mutable.HashMap[String, (Long, Long, Long)], (String, (Float, Long))] { 19 | def zero(t: mutable.HashMap[String, (Long, Long, Long)]): mutable.HashMap[String, (Long, Long, Long)] = { 20 | new mutable.HashMap[String, (Long, Long, Long)]() 21 | } 22 | def addInPlace(t1: mutable.HashMap[String, (Long, Long, Long)], t2: mutable.HashMap[String, (Long, Long, Long)]): mutable.HashMap[String, (Long, Long, Long)] = { 23 | t1 ++ t2.map { 24 | case (k, v2) => (k -> { 25 | val v1 = t1.getOrElse(k, (Long.MaxValue, Long.MinValue, 0L)) 26 | val newMin = if (v2._1 < v1._1) v2._1 else v1._1 27 | val newMax = if (v2._2 > v1._2) v2._2 else v1._2 28 | (newMin, newMax, v1._3 + v2._3) 29 | }) 30 | } 31 | } 32 | def addAccumulator(t1: mutable.HashMap[String, (Long, Long, Long)], t2: (String, (Float, Long))): mutable.HashMap[String, (Long, Long, Long)] = { 33 | val prevStats = t1.getOrElse(t2._1, (Long.MaxValue, Long.MinValue, 0L)) 34 | val newVals = t2._2 35 | var newCount = prevStats._3 36 | if (newVals._1 > 500.0) { 37 | newCount += 1 38 | } 39 | val newMin = if (newVals._2 < prevStats._1) newVals._2 else prevStats._1 40 | val newMax = if (newVals._2 > prevStats._2) newVals._2 else prevStats._2 41 | t1 += t2._1 -> (newMin, newMax, newCount) 42 | } 43 | } 44 | 45 | def main(args: Array[String]) { 46 | if (args.length != 2) { 47 
| System.err.println( 48 | "Usage: StatefulAccumulatorsApp ") 49 | System.exit(1) 50 | } 51 | 52 | val Seq(appName, checkpointDir) = args.toSeq 53 | 54 | val conf = new SparkConf() 55 | .setAppName(appName) 56 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 57 | 58 | val batchInterval = 10 59 | 60 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 61 | 62 | val stateAccum = ssc.sparkContext.accumulable(new mutable.HashMap[String, (Long, Long, Long)]())(StockAccum) 63 | 64 | HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 65 | interval = batchInterval) 66 | .flatMap(rec => { 67 | implicit val formats = DefaultFormats 68 | val query = parse(rec) \ "query" 69 | ((query \ "results" \ "quote").children) 70 | .map(rec => ((rec \ "symbol").extract[String], ((rec \ "LastTradePriceOnly").extract[String].toFloat, (rec \ "Volume").extract[String].toLong))) 71 | }) 72 | .foreachRDD(rdd => { 73 | rdd.foreach({ stock => 74 | stateAccum += (stock._1, (stock._2._1, stock._2._2)) 75 | }) 76 | for ((sym, stats) <- stateAccum.value.to) printf("Symbol: %s, Stats: %s\n", sym, stats) 77 | }) 78 | 79 | ssc.start() 80 | ssc.awaitTermination() 81 | } 82 | } 83 | 84 | -------------------------------------------------------------------------------- /Chap6/src/main/scala/org/apress/prospark/L6-26Redis.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.collection.JavaConversions.asScalaBuffer 4 | import scala.collection.JavaConversions.mutableMapAsJavaMap 5 | import scala.collection.mutable 6 | 7 | import org.apache.spark.SparkConf 8 | import org.apache.spark.SparkContext 9 | import org.apache.spark.streaming.Seconds 10 | import org.apache.spark.streaming.StreamingContext 11 | import org.json4s.DefaultFormats 12 | import org.json4s.jvalue2extractable 13 | import org.json4s.jvalue2monadic 14 | import org.json4s.native.JsonMethods.parse 15 | import org.json4s.string2JsonInput 16 | 17 | import redis.clients.jedis.Jedis 18 | 19 | object StatefulRedisApp { 20 | 21 | def main(args: Array[String]) { 22 | if (args.length != 3) { 23 | System.err.println( 24 | "Usage: StatefulRedisApp ") 25 | System.exit(1) 26 | } 27 | 28 | val Seq(appName, checkpointDir, hostname) = args.toSeq 29 | 30 | val conf = new SparkConf() 31 | .setAppName(appName) 32 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 33 | 34 | val batchInterval = 10 35 | 36 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 37 | 38 | HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 39 | interval = batchInterval) 40 | .flatMap(rec => { 41 | implicit val formats = DefaultFormats 42 | val query = parse(rec) \ "query" 43 | ((query \ "results" \ "quote").children) 44 | .map(rec => ((rec \ "symbol").extract[String], ((rec \ "LastTradePriceOnly").extract[String].toFloat, (rec \ "Volume").extract[String].toLong))) 45 | }) 46 | .foreachRDD(rdd => { 47 | rdd.foreachPartition({ part => 48 | val jedis = new Jedis(hostname) 49 | part.foreach(f => { 50 | val prev = jedis.hmget(f._1, 
"min", "max", "count") 51 | if (prev(0) == null) { 52 | jedis.hmset(f._1, mutable.HashMap("min" -> Long.MaxValue.toString, "max" -> Long.MinValue.toString, "count" -> 0.toString)) 53 | } else { 54 | val prevLong = prev.toList.map(v => v.toLong) 55 | var newCount = prevLong(2) 56 | val newPrice = f._2._1 57 | val newVolume = f._2._2 58 | if (newPrice > 500.0) { 59 | newCount += 1 60 | } 61 | val newMin = if (newVolume < prevLong(0)) newVolume else prevLong(0) 62 | val newMax = if (newVolume > prevLong(1)) newVolume else prevLong(1) 63 | jedis.hmset(f._1, mutable.HashMap("min" -> newMin.toString, "max" -> newMax.toString, "count" -> newCount.toString)) 64 | } 65 | }) 66 | jedis.close() 67 | }) 68 | 69 | val jedis = new Jedis(hostname) 70 | jedis.scan(0).getResult.foreach(sym => println("Symbol: %s, Stats: %s".format(sym, jedis.hmget(sym, "min", "max", "count").toString))) 71 | jedis.close() 72 | }) 73 | 74 | ssc.start() 75 | ssc.awaitTermination() 76 | } 77 | } 78 | 79 | -------------------------------------------------------------------------------- /Chap6/src/main/scala/org/apress/prospark/L6-5Exception.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import java.nio.charset.StandardCharsets 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | import org.eclipse.paho.client.mqttv3.MqttClient 10 | import org.eclipse.paho.client.mqttv3.MqttMessage 11 | import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence 12 | import org.json4s.DefaultFormats 13 | import org.json4s.JField 14 | import org.json4s.JsonAST.JObject 15 | import org.json4s.jvalue2extractable 16 | import org.json4s.jvalue2monadic 17 | import org.json4s.native.JsonMethods.parse 18 | import org.json4s.string2JsonInput 19 | 20 | object MqttSinkAppA { 21 | 22 | def main(args: Array[String]) { 23 | if (args.length != 3) { 24 | System.err.println( 25 | "Usage: MqttSinkApp ") 26 | System.exit(1) 27 | } 28 | 29 | val Seq(appName, outputBrokerUrl, topic) = args.toSeq 30 | 31 | val conf = new SparkConf() 32 | .setAppName(appName) 33 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 34 | 35 | val batchInterval = 10 36 | 37 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 38 | 39 | HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 40 | interval = batchInterval) 41 | .flatMap(rec => { 42 | val query = parse(rec) \ "query" 43 | ((query \ "results" \ "quote").children).map(rec => JObject(JField("Timestamp", query \ "created")).merge(rec)) 44 | }) 45 | .map(rec => { 46 | implicit val formats = DefaultFormats 47 | rec.children.map(f => f.extract[String]) mkString "," 48 | }) 49 | .foreachRDD { rdd => 50 | val client = new MqttClient(outputBrokerUrl, MqttClient.generateClientId(), new MemoryPersistence()) 51 | client.connect() 52 | rdd.foreach(rec => client.publish(topic, new MqttMessage(rec.getBytes(StandardCharsets.UTF_8)))) 53 | client.disconnect() 54 | client.close() 55 | } 56 | 57 | ssc.start() 58 | ssc.awaitTermination() 59 | } 60 | 61 | } 62 | -------------------------------------------------------------------------------- 
/Chap6/src/main/scala/org/apress/prospark/L6-6PerRecord.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import java.nio.charset.StandardCharsets 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | import org.eclipse.paho.client.mqttv3.MqttClient 10 | import org.eclipse.paho.client.mqttv3.MqttMessage 11 | import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence 12 | import org.json4s.DefaultFormats 13 | import org.json4s.JField 14 | import org.json4s.JsonAST.JObject 15 | import org.json4s.jvalue2extractable 16 | import org.json4s.jvalue2monadic 17 | import org.json4s.native.JsonMethods.parse 18 | import org.json4s.string2JsonInput 19 | 20 | object MqttSinkAppB { 21 | 22 | def main(args: Array[String]) { 23 | if (args.length != 3) { 24 | System.err.println( 25 | "Usage: MqttSinkApp ") 26 | System.exit(1) 27 | } 28 | 29 | val Seq(appName, outputBrokerUrl, topic) = args.toSeq 30 | 31 | val conf = new SparkConf() 32 | .setAppName(appName) 33 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 34 | 35 | val batchInterval = 10 36 | 37 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 38 | 39 | HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 40 | interval = batchInterval) 41 | .flatMap(rec => { 42 | val query = parse(rec) \ "query" 43 | ((query \ "results" \ "quote").children).map(rec => JObject(JField("Timestamp", query \ "created")).merge(rec)) 44 | }) 45 | .map(rec => { 46 | implicit val formats = DefaultFormats 47 | rec.children.map(f => f.extract[String]) mkString "," 48 | }) 49 | .foreachRDD { rdd => 50 | rdd.foreach { rec => 51 | { 52 | val client = new MqttClient(outputBrokerUrl, MqttClient.generateClientId(), new MemoryPersistence()) 53 | client.connect() 54 | client.publish(topic, new MqttMessage(rec.getBytes(StandardCharsets.UTF_8))) 55 | client.disconnect() 56 | client.close() 57 | } 58 | } 59 | } 60 | 61 | ssc.start() 62 | ssc.awaitTermination() 63 | } 64 | 65 | } 66 | -------------------------------------------------------------------------------- /Chap6/src/main/scala/org/apress/prospark/L6-7PerPartition.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import java.nio.charset.StandardCharsets 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | import org.eclipse.paho.client.mqttv3.MqttClient 10 | import org.eclipse.paho.client.mqttv3.MqttMessage 11 | import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence 12 | import org.json4s.DefaultFormats 13 | import org.json4s.JField 14 | import org.json4s.JsonAST.JObject 15 | import org.json4s.jvalue2extractable 16 | import org.json4s.jvalue2monadic 17 | import org.json4s.native.JsonMethods.parse 18 | import org.json4s.string2JsonInput 19 | 20 | object MqttSinkAppC { 21 | 22 | def main(args: Array[String]) { 23 | if (args.length != 3) { 24 | System.err.println( 25 | "Usage: MqttSinkApp ") 26 | System.exit(1) 27 | } 28 | 29 | val Seq(appName, 
outputBrokerUrl, topic) = args.toSeq 30 | 31 | val conf = new SparkConf() 32 | .setAppName(appName) 33 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 34 | 35 | val batchInterval = 10 36 | 37 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 38 | 39 | HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 40 | interval = batchInterval) 41 | .flatMap(rec => { 42 | val query = parse(rec) \ "query" 43 | ((query \ "results" \ "quote").children).map(rec => JObject(JField("Timestamp", query \ "created")).merge(rec)) 44 | }) 45 | .map(rec => { 46 | implicit val formats = DefaultFormats 47 | rec.children.map(f => f.extract[String]) mkString "," 48 | }) 49 | .foreachRDD { rdd => 50 | rdd.foreachPartition { par => 51 | val client = new MqttClient(outputBrokerUrl, MqttClient.generateClientId(), new MemoryPersistence()) 52 | client.connect() 53 | par.foreach(rec => client.publish(topic, new MqttMessage(rec.getBytes(StandardCharsets.UTF_8)))) 54 | client.disconnect() 55 | client.close() 56 | } 57 | } 58 | 59 | ssc.start() 60 | ssc.awaitTermination() 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /Chap6/src/main/scala/org/apress/prospark/L6-8Static.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import java.nio.charset.StandardCharsets 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | import org.eclipse.paho.client.mqttv3.MqttClient 10 | import org.eclipse.paho.client.mqttv3.MqttMessage 11 | import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence 12 | import org.json4s.DefaultFormats 13 | import org.json4s.JField 14 | import org.json4s.JsonAST.JObject 15 | import org.json4s.jvalue2extractable 16 | import org.json4s.jvalue2monadic 17 | import org.json4s.native.JsonMethods.parse 18 | import org.json4s.string2JsonInput 19 | 20 | object MqttSinkAppD { 21 | 22 | def main(args: Array[String]) { 23 | if (args.length != 3) { 24 | System.err.println( 25 | "Usage: MqttSinkApp ") 26 | System.exit(1) 27 | } 28 | 29 | val Seq(appName, outputBrokerUrl, topic) = args.toSeq 30 | 31 | val conf = new SparkConf() 32 | .setAppName(appName) 33 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 34 | 35 | val batchInterval = 10 36 | 37 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 38 | 39 | HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 40 | interval = batchInterval) 41 | .flatMap(rec => { 42 | val query = parse(rec) \ "query" 43 | ((query \ "results" \ "quote").children).map(rec => JObject(JField("Timestamp", query \ "created")).merge(rec)) 44 | }) 45 | .map(rec => { 46 | implicit val formats = DefaultFormats 47 | rec.children.map(f => f.extract[String]) mkString "," 48 | }) 49 | .foreachRDD { rdd => 50 | rdd.foreachPartition { par => 51 | par.foreach(message => MqttSink().publish(topic, new 
MqttMessage(message.getBytes(StandardCharsets.UTF_8)))) 52 | } 53 | } 54 | 55 | ssc.start() 56 | ssc.awaitTermination() 57 | } 58 | } 59 | 60 | object MqttSink { 61 | val brokerUrl = "tcp://localhost:1883" 62 | val client = new MqttClient(brokerUrl, MqttClient.generateClientId(), new MemoryPersistence()) 63 | client.connect() 64 | sys.addShutdownHook { 65 | client.disconnect() 66 | client.close() 67 | } 68 | 69 | def apply(): MqttClient = { 70 | client 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /Chap7/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 2 | -------------------------------------------------------------------------------- /Chap7/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += Classpaths.typesafeResolver 2 | 3 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.4.0") 4 | -------------------------------------------------------------------------------- /Chap7/spark.sbt: -------------------------------------------------------------------------------- 1 | import AssemblyKeys._ 2 | 3 | assemblySettings 4 | 5 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { mergeStrategy => { 6 | case entry => { 7 | val strategy = mergeStrategy(entry) 8 | if (strategy == MergeStrategy.deduplicate) MergeStrategy.first 9 | else strategy 10 | } 11 | }} 12 | 13 | name := "Chap7" 14 | 15 | version := "1.0" 16 | 17 | scalaVersion := "2.10.5" 18 | 19 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.4.0" 20 | 21 | libraryDependencies += "org.apache.spark" %% "spark-streaming" % "1.4.0" 22 | -------------------------------------------------------------------------------- /Chap7/src/main/java/org/apress/prospark/AbstractDriver.java: -------------------------------------------------------------------------------- 1 | package org.apress.prospark; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileInputStream; 6 | import java.io.IOException; 7 | import java.io.InputStreamReader; 8 | import java.util.Enumeration; 9 | import java.util.zip.GZIPInputStream; 10 | import java.util.zip.ZipEntry; 11 | import java.util.zip.ZipFile; 12 | 13 | import org.apache.commons.io.FilenameUtils; 14 | import org.apache.log4j.LogManager; 15 | import org.apache.log4j.Logger; 16 | 17 | public abstract class AbstractDriver { 18 | 19 | private static final Logger LOG = LogManager.getLogger(AbstractDriver.class); 20 | 21 | private String path; 22 | 23 | public AbstractDriver(String path) { 24 | this.path = path; 25 | } 26 | 27 | public abstract void init() throws Exception; 28 | 29 | public abstract void close() throws Exception; 30 | 31 | public abstract void sendRecord(String record) throws Exception; 32 | 33 | public void execute() throws Exception { 34 | 35 | try { 36 | init(); 37 | File dirPath = new File(path); 38 | if (dirPath.isDirectory()) { 39 | File[] files = new File(path).listFiles(); 40 | for (File f : files) { 41 | String ext = FilenameUtils.getExtension(f.getPath()); 42 | if (ext.equals("zip")) { 43 | LOG.info(String.format("Feeding zipped file %s", f.getName())); 44 | ZipFile zFile = null; 45 | try { 46 | zFile = new ZipFile(f); 47 | Enumeration zEntries = zFile.entries(); 48 | 49 | while (zEntries.hasMoreElements()) { 50 | ZipEntry zEntry = zEntries.nextElement(); 51 | LOG.info(String.format("Feeding file 
%s", zEntry.getName())); 52 | try (BufferedReader br = new BufferedReader( 53 | new InputStreamReader(zFile.getInputStream(zEntry)))) { 54 | // skip header 55 | br.readLine(); 56 | String line; 57 | while ((line = br.readLine()) != null) { 58 | sendRecord(line); 59 | } 60 | } 61 | } 62 | } catch (IOException e) { 63 | LOG.error(e.getMessage()); 64 | } finally { 65 | if (zFile != null) { 66 | try { 67 | zFile.close(); 68 | } catch (IOException e) { 69 | LOG.error(e.getMessage()); 70 | } 71 | } 72 | } 73 | } else if (ext.equals("gz")) { 74 | LOG.info(String.format("Feeding file %s", f.getName())); 75 | try (BufferedReader br = new BufferedReader( 76 | new InputStreamReader(new GZIPInputStream(new FileInputStream(f))))) { 77 | // skip header 78 | br.readLine(); 79 | String line; 80 | while ((line = br.readLine()) != null) { 81 | sendRecord(line); 82 | } 83 | } 84 | } else { 85 | LOG.warn("Unsupported file type: " + f.getName()); 86 | } 87 | } 88 | } else { 89 | LOG.error(String.format("Path %s is not a directory", path)); 90 | } 91 | } finally { 92 | close(); 93 | } 94 | } 95 | } -------------------------------------------------------------------------------- /Chap7/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO, stdout 2 | log4j.rootCategory=INFO, stdout 3 | 4 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 5 | log4j.appender.stdout.Target=System.out 6 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 7 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 8 | -------------------------------------------------------------------------------- /Chap7/src/main/scala/org/apress/prospark/L7-2-3Tachyon.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions 6 | import org.apache.spark.storage.StorageLevel 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | import org.apache.spark.streaming.dstream.DStream 10 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 11 | 12 | object ReferrerApp { 13 | def main(args: Array[String]) { 14 | if (args.length != 7) { 15 | System.err.println( 16 | "Usage: ReferrerApp ") 17 | System.exit(1) 18 | } 19 | val Seq(appName, hostname, port, tachyonUrl, checkpointDir, outputPathTop, outputPathSpark) = args.toSeq 20 | 21 | val conf = new SparkConf() 22 | .setAppName(appName) 23 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 24 | .set("spark.externalBlockStore.url", tachyonUrl) 25 | 26 | val ssc = new StreamingContext(conf, Seconds(10)) 27 | ssc.checkpoint(checkpointDir) 28 | 29 | val clickstream = ssc.socketTextStream(hostname, port.toInt) 30 | .map(rec => rec.split("\\t")) 31 | .persist(StorageLevel.OFF_HEAP) 32 | 33 | val topRefStream = clickstream 34 | .map(rec => { 35 | var prev_title = rec(3) 36 | if (!prev_title.startsWith("other")) { 37 | prev_title = "wikipedia" 38 | } 39 | (prev_title, 1) 40 | }) 41 | 42 | val topSparkStream = clickstream 43 | .filter(rec => rec(4).equals("Apache_Spark")) 44 | .map(rec => (rec(3), 1)) 45 | 46 | saveTopKeys(topRefStream, outputPathTop) 47 | 48 | saveTopKeys(topSparkStream, outputPathSpark) 49 | 50 | ssc.start() 51 | ssc.awaitTermination() 52 | } 53 | 54 | def saveTopKeys(clickstream: 
DStream[(String, Int)], outputPath: String) { 55 | clickstream.updateStateByKey((values, state: Option[Int]) => Some(values.sum + state.getOrElse(0))) 56 | .repartition(1) 57 | .map(rec => (rec._2, rec._1)) 58 | .transform(rec => rec.sortByKey(ascending = false)) 59 | .saveAsTextFiles(outputPath) 60 | } 61 | 62 | } -------------------------------------------------------------------------------- /Chap7/src/main/scala/org/apress/prospark/L7-4UI.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import java.util.concurrent.atomic.AtomicLong 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.streaming.Seconds 9 | import org.apache.spark.streaming.StreamingContext 10 | 11 | object SocialSearchApp { 12 | def main(args: Array[String]) { 13 | if (args.length != 3) { 14 | System.err.println( 15 | "Usage: SocialSearchApp ") 16 | System.exit(1) 17 | } 18 | val Seq(appName, hostname, port) = args.toSeq 19 | 20 | val conf = new SparkConf() 21 | .setAppName(appName) 22 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 23 | //.set("spark.eventLog.enabled", "true") 24 | //.set("spark.eventLog.dir", "/tmp/historical") 25 | 26 | 27 | val countSearch = new AtomicLong(0) 28 | val countSocial = new AtomicLong(0) 29 | 30 | val ssc = new StreamingContext(conf, Seconds(1)) 31 | 32 | val titleStream = ssc.socketTextStream(hostname, port.toInt) 33 | .map(rec => rec.split("\\t")) 34 | .filter(_(3) match { 35 | case "other-google" | "other-bing" | "other-yahoo" | "other-facebook" | "other-twitter" => true 36 | case _ => false 37 | }) 38 | .map(rec => (rec(3), rec(4))) 39 | .cache() 40 | 41 | val searchStream = titleStream.filter(_._1 match { 42 | case "other-google" | "other-bing" | "other-yahoo" => true 43 | case _ => false 44 | }) 45 | .map(rec => rec._2) 46 | 47 | val socialStream = titleStream.filter(_._1 match { 48 | case "other-facebook" | "other-twitter" => true 49 | case _ => false 50 | }) 51 | .map(rec => rec._2) 52 | 53 | val exclusiveSearch = searchStream.transformWith(socialStream, 54 | (searchRDD: RDD[String], socialRDD: RDD[String]) => searchRDD.subtract(socialRDD)) 55 | .foreachRDD(rdd => { 56 | countSearch.addAndGet(rdd.count()) 57 | println("Exclusive count search engines: " + countSearch) 58 | }) 59 | 60 | val exclusiveSocial = socialStream.transformWith(searchStream, 61 | (socialRDD: RDD[String], searchRDD: RDD[String]) => socialRDD.subtract(searchRDD)) 62 | .foreachRDD(rdd => { 63 | countSocial.addAndGet(rdd.count()) 64 | println("Exclusive count social media: " + countSocial) 65 | }) 66 | 67 | ssc.start() 68 | ssc.awaitTermination() 69 | } 70 | 71 | } -------------------------------------------------------------------------------- /Chap8/L8-36CdrSparkRApp.R: -------------------------------------------------------------------------------- 1 | args <- commandArgs(trailingOnly = TRUE) 2 | if(length(args) != 2) { 3 | stop("Usage: CdrSparkRApp ") 4 | } 5 | library(SparkR) 6 | Sys.setenv('SPARKR_SUBMIT_ARGS'='"--packages" "com.databricks:spark-csv_2.10:1.3.0" "sparkr-shell"') 7 | sc <- sparkR.init(master = args[1]) 8 | sqlContext <- sparkRSQL.init(sc) 9 | df <- read.df(sqlContext, args[2], source = "com.databricks.spark.csv", inferSchema = "true", delimiter = "\t") 10 | cnames <- c("squareId", "timeInterval", "countryCode", "smsInActivity", "smsOutActivity", "callInActivity", "callOutActivity", "internetTrafficActivity") 11 | 
for (i in 1:NROW(cnames)) { 12 | df <- withColumnRenamed(df, paste0("C", i - 1), cnames[i]) 13 | } 14 | counts <- count(groupBy(df, "countryCode")) 15 | showDF(orderBy(counts, desc(counts$count)), numRows = 5) 16 | sparkR.stop() -------------------------------------------------------------------------------- /Chap8/L8-39CdrStreamingSparkRApp.R: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/Rscript 2 | args <- commandArgs(trailingOnly = TRUE) 3 | if(length(args) != 1) { 4 | stop("Usage: CdrStreamingSparkRApp ") 5 | } 6 | library(SparkR) 7 | sc <- sparkR.init(master = args[1]) 8 | hiveContext <- sparkRHive.init(sc) 9 | f <- file("stdin") 10 | open(f) 11 | while(length(tableName <- readLines(f, n = 1)) > 0) { 12 | tryCatch({ 13 | tableName <- trimws(tableName) 14 | write(paste0("Processing table: ", tableName), stderr()) 15 | df <- table(hiveContext, tableName) 16 | counts <- count(groupBy(df, "countryCode")) 17 | outputTable <- paste0(tableName, "processed") 18 | write(paste0("Output written to: ", outputTable), stderr()) 19 | saveAsTable(limit(orderBy(counts, desc(counts$count)), 5), outputTable, "parquet", "error") 20 | }, error = function(e) {stop(e)}) 21 | } 22 | close(f) 23 | sparkR.stop() -------------------------------------------------------------------------------- /Chap8/cdrschema.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "struct", 3 | "fields": [ 4 | { 5 | "name": "squareId", 6 | "nullable": false, 7 | "type": "integer" 8 | }, 9 | { 10 | "name": "timeInterval", 11 | "nullable": false, 12 | "type": "long" 13 | }, 14 | { 15 | "name": "countryCode", 16 | "nullable": true, 17 | "type": "string" 18 | }, 19 | { 20 | "name": "smsInActivity", 21 | "nullable": true, 22 | "type": "float" 23 | }, 24 | { 25 | "name": "smsOutActivity", 26 | "nullable": true, 27 | "type": "float" 28 | }, 29 | { 30 | "name": "callInActivity", 31 | "nullable": true, 32 | "type": "float" 33 | }, 34 | { 35 | "name": "callOutActivity", 36 | "nullable": true, 37 | "type": "float" 38 | }, 39 | { 40 | "name": "internetTrafficActivity", 41 | "nullable": true, 42 | "type": "float" 43 | } 44 | ] 45 | } 46 | -------------------------------------------------------------------------------- /Chap8/cdrschema2.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "struct", 3 | "fields": [ 4 | { 5 | "name": "squareId", 6 | "nullable": false, 7 | "type": "integer" 8 | }, 9 | { 10 | "name": "timeInterval", 11 | "nullable": false, 12 | "type": "long" 13 | }, 14 | { 15 | "name": "countryCode", 16 | "nullable": true, 17 | "type": "integer" 18 | }, 19 | { 20 | "name": "smsInActivity", 21 | "nullable": true, 22 | "type": "float" 23 | }, 24 | { 25 | "name": "smsOutActivity", 26 | "nullable": true, 27 | "type": "float" 28 | }, 29 | { 30 | "name": "callInActivity", 31 | "nullable": true, 32 | "type": "float" 33 | }, 34 | { 35 | "name": "callOutActivity", 36 | "nullable": true, 37 | "type": "float" 38 | }, 39 | { 40 | "name": "internetTrafficActivity", 41 | "nullable": true, 42 | "type": "float" 43 | } 44 | ] 45 | } 46 | -------------------------------------------------------------------------------- /Chap8/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 2 | -------------------------------------------------------------------------------- 
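The two cdrschema JSON files above are Spark SQL schemas serialized in the form that DataType.fromJson expects; the later listings L8-4DataFrameCreationSchema.scala and L8-35DataFrameExamplesRDD.scala load them exactly that way. A minimal standalone sketch of that loading step follows; the relative path Chap8/cdrschema.json and the object name are illustrative assumptions.

import org.apache.spark.sql.types.DataType
import org.apache.spark.sql.types.StructType

object CdrSchemaSketch {
  def main(args: Array[String]) {
    // Path is an assumption; point it at wherever cdrschema.json actually lives.
    val schemaJson = scala.io.Source.fromFile("Chap8/cdrschema.json").mkString
    val schema = DataType.fromJson(schemaJson).asInstanceOf[StructType]
    // Prints the eight CDR fields with their types and nullability flags.
    schema.fields.foreach(f => println(s"${f.name}: ${f.dataType} (nullable=${f.nullable})"))
  }
}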
/Chap8/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += Classpaths.typesafeResolver 2 | 3 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.4.0") 4 | -------------------------------------------------------------------------------- /Chap8/spark.sbt: -------------------------------------------------------------------------------- 1 | import AssemblyKeys._ 2 | 3 | assemblySettings 4 | 5 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { mergeStrategy => { 6 | case entry => { 7 | val strategy = mergeStrategy(entry) 8 | if (strategy == MergeStrategy.deduplicate) MergeStrategy.first 9 | else strategy 10 | } 11 | }} 12 | 13 | name := "Chap8" 14 | 15 | version := "1.0" 16 | 17 | scalaVersion := "2.10.5" 18 | 19 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.4.0" 20 | 21 | libraryDependencies += "org.apache.spark" %% "spark-streaming" % "1.4.0" 22 | 23 | //libraryDependencies += "org.apache.spark" %% "spark-sql" % "1.4.0" 24 | 25 | libraryDependencies += "org.apache.spark" %% "spark-hive" % "1.4.0" 26 | 27 | libraryDependencies += "org.json4s" %% "json4s-native" % "3.2.10" 28 | -------------------------------------------------------------------------------- /Chap8/src/main/java/org/apress/prospark/AbstractDriver.java: -------------------------------------------------------------------------------- 1 | package org.apress.prospark; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileInputStream; 6 | import java.io.IOException; 7 | import java.io.InputStreamReader; 8 | import java.util.Enumeration; 9 | import java.util.zip.GZIPInputStream; 10 | import java.util.zip.ZipEntry; 11 | import java.util.zip.ZipFile; 12 | 13 | import org.apache.commons.io.FilenameUtils; 14 | import org.apache.log4j.LogManager; 15 | import org.apache.log4j.Logger; 16 | 17 | public abstract class AbstractDriver { 18 | 19 | private static final Logger LOG = LogManager.getLogger(AbstractDriver.class); 20 | 21 | private String path; 22 | 23 | public AbstractDriver(String path) { 24 | this.path = path; 25 | } 26 | 27 | public abstract void init() throws Exception; 28 | 29 | public abstract void close() throws Exception; 30 | 31 | public abstract void sendRecord(String record) throws Exception; 32 | 33 | public void execute() throws Exception { 34 | 35 | try { 36 | init(); 37 | File dirPath = new File(path); 38 | if (dirPath.isDirectory()) { 39 | File[] files = new File(path).listFiles(); 40 | for (File f : files) { 41 | String ext = FilenameUtils.getExtension(f.getPath()); 42 | if (ext.equals("zip")) { 43 | LOG.info(String.format("Feeding zipped file %s", f.getName())); 44 | ZipFile zFile = null; 45 | try { 46 | zFile = new ZipFile(f); 47 | Enumeration zEntries = zFile.entries(); 48 | 49 | while (zEntries.hasMoreElements()) { 50 | ZipEntry zEntry = zEntries.nextElement(); 51 | LOG.info(String.format("Feeding file %s", zEntry.getName())); 52 | try (BufferedReader br = new BufferedReader( 53 | new InputStreamReader(zFile.getInputStream(zEntry)))) { 54 | // skip header 55 | br.readLine(); 56 | String line; 57 | while ((line = br.readLine()) != null) { 58 | sendRecord(line); 59 | } 60 | } 61 | } 62 | } catch (IOException e) { 63 | LOG.error(e.getMessage()); 64 | } finally { 65 | if (zFile != null) { 66 | try { 67 | zFile.close(); 68 | } catch (IOException e) { 69 | LOG.error(e.getMessage()); 70 | } 71 | } 72 | } 73 | } else if (ext.equals("gz")) { 74 | LOG.info(String.format("Feeding 
file %s", f.getName())); 75 | try (BufferedReader br = new BufferedReader( 76 | new InputStreamReader(new GZIPInputStream(new FileInputStream(f))))) { 77 | // skip header 78 | br.readLine(); 79 | String line; 80 | while ((line = br.readLine()) != null) { 81 | sendRecord(line); 82 | } 83 | } 84 | } else { 85 | LOG.warn("Unsupported file type: " + f.getName()); 86 | } 87 | } 88 | } else { 89 | LOG.error(String.format("Path %s is not a directory", path)); 90 | } 91 | } finally { 92 | close(); 93 | } 94 | } 95 | } -------------------------------------------------------------------------------- /Chap8/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO, stdout 2 | log4j.rootCategory=INFO, stdout 3 | 4 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 5 | log4j.appender.stdout.Target=System.out 6 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 7 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 8 | -------------------------------------------------------------------------------- /Chap8/src/main/scala/org/apress/prospark/L8-10-11UDF.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.io.Source 4 | import scala.reflect.runtime.universe 5 | 6 | import org.apache.spark.SparkConf 7 | import org.apache.spark.SparkContext 8 | import org.apache.spark.rdd.RDD 9 | import org.apache.spark.sql.SQLContext 10 | import org.apache.spark.streaming.Seconds 11 | import org.apache.spark.streaming.StreamingContext 12 | import org.json4s.jackson.JsonMethods.parse 13 | import org.json4s.jvalue2extractable 14 | import org.json4s.string2JsonInput 15 | 16 | object CdrUDFApp { 17 | 18 | case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, 19 | smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, 20 | callOutActivity: Float, internetTrafficActivity: Float) 21 | 22 | def main(args: Array[String]) { 23 | if (args.length != 4) { 24 | System.err.println( 25 | "Usage: CdrUDFApp ") 26 | System.exit(1) 27 | } 28 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 29 | 30 | val conf = new SparkConf() 31 | .setAppName(appName) 32 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 33 | 34 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 35 | 36 | val sqlC = new SQLContext(ssc.sparkContext) 37 | import sqlC.implicits._ 38 | 39 | def getCountryCodeMapping() = { 40 | implicit val formats = org.json4s.DefaultFormats 41 | parse(Source.fromURL("http://country.io/phone.json").mkString).extract[Map[String, String]].map(_.swap) 42 | } 43 | 44 | def getCountryNameMapping() = { 45 | implicit val formats = org.json4s.DefaultFormats 46 | parse(Source.fromURL("http://country.io/names.json").mkString).extract[Map[String, String]] 47 | } 48 | 49 | def getCountryName(mappingPhone: Map[String, String], mappingName: Map[String, String], code: Int) = { 50 | mappingName.getOrElse(mappingPhone.getOrElse(code.toString, "NotFound"), "NotFound") 51 | } 52 | 53 | val getCountryNamePartial = getCountryName(getCountryCodeMapping(), getCountryNameMapping(), _: Int) 54 | 55 | sqlC.udf.register("getCountryNamePartial", getCountryNamePartial) 56 | 57 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 58 | .map(_.split("\\t", -1)) 59 | .foreachRDD(rdd => { 60 | val cdrs = seqToCdr(rdd).toDF() 61 | cdrs.registerTempTable("cdrs") 62 | 63 | 
sqlC.sql("SELECT getCountryNamePartial(countryCode) AS countryName, COUNT(countryCode) AS cCount FROM cdrs GROUP BY countryCode ORDER BY cCount DESC LIMIT 5").show() 64 | 65 | }) 66 | 67 | ssc.start() 68 | ssc.awaitTermination() 69 | } 70 | 71 | def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { 72 | rdd.map(c => c.map(f => f match { 73 | case x if x.isEmpty() => "0" 74 | case x => x 75 | })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, 76 | c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) 77 | } 78 | 79 | } -------------------------------------------------------------------------------- /Chap8/src/main/scala/org/apress/prospark/L8-13HiveQL.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.sql.hive.HiveContext 9 | import org.apache.spark.streaming.Seconds 10 | import org.apache.spark.streaming.StreamingContext 11 | 12 | object CdrHiveqlApp { 13 | 14 | case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, 15 | smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, 16 | callOutActivity: Float, internetTrafficActivity: Float) 17 | 18 | def main(args: Array[String]) { 19 | if (args.length != 4) { 20 | System.err.println( 21 | "Usage: CdrHiveqlApp ") 22 | System.exit(1) 23 | } 24 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 25 | 26 | val conf = new SparkConf() 27 | .setAppName(appName) 28 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 29 | 30 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 31 | 32 | val cl = Thread.currentThread().getContextClassLoader() 33 | val hiveC = new HiveContext(ssc.sparkContext) 34 | Thread.currentThread().setContextClassLoader(cl) 35 | 36 | import hiveC.implicits._ 37 | 38 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 39 | .map(_.split("\\t", -1)) 40 | .foreachRDD(rdd => { 41 | seqToCdr(rdd).toDF().registerTempTable("cdrs") 42 | 43 | hiveC.sql("SET DATE_FMT='yy-MM-dd|HH'") 44 | hiveC.sql("SELECT from_unixtime(timeInterval, ${hiveconf:DATE_FMT}) AS TS, SUM(smsInActivity + smsOutActivity + callInActivity + callOutActivity + internetTrafficActivity) AS Activity FROM cdrs GROUP BY from_unixtime(timeInterval, ${hiveconf:DATE_FMT}) ORDER BY Activity DESC").show() 45 | }) 46 | 47 | ssc.start() 48 | ssc.awaitTermination() 49 | } 50 | 51 | def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { 52 | rdd.map(c => c.map(f => f match { 53 | case x if x.isEmpty() => "0" 54 | case x => x 55 | })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, 56 | c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) 57 | } 58 | } -------------------------------------------------------------------------------- /Chap8/src/main/scala/org/apress/prospark/L8-14-27DataFrameExamples.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.sql.SQLContext 9 | import org.apache.spark.sql.functions._ 10 | import org.apache.spark.streaming.Seconds 11 | import org.apache.spark.streaming.StreamingContext 12 | 13 | object CdrDataframeExamplesApp { 14 | 15 | case class 
Cdr(squareId: Int, timeInterval: Long, countryCode: Int, 16 | smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, 17 | callOutActivity: Float, internetTrafficActivity: Float) 18 | 19 | def main(args: Array[String]) { 20 | if (args.length != 4) { 21 | System.err.println( 22 | "Usage: CdrDataframeExamplesApp ") 23 | System.exit(1) 24 | } 25 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 26 | 27 | val conf = new SparkConf() 28 | .setAppName(appName) 29 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 30 | 31 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 32 | 33 | val sqlC = new SQLContext(ssc.sparkContext) 34 | import sqlC.implicits._ 35 | 36 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 37 | .map(_.split("\\t", -1)) 38 | .foreachRDD(rdd => { 39 | val cdrs = seqToCdr(rdd).toDF() 40 | 41 | cdrs.select("squareId", "timeInterval", "countryCode").show() 42 | cdrs.select($"squareId", $"timeInterval", $"countryCode").show() 43 | cdrs.filter("squareId = 5").show() 44 | cdrs.drop("countryCode").show() 45 | cdrs.select($"squareId", $"timeInterval", $"countryCode").where($"squareId" === 5).show() 46 | cdrs.limit(5).show() 47 | cdrs.groupBy("squareId").count().show() 48 | cdrs.groupBy("countryCode").avg("internetTrafficActivity").show() 49 | cdrs.groupBy("countryCode").max("callOutActivity").show() 50 | cdrs.groupBy("countryCode").min("callOutActivity").show() 51 | cdrs.groupBy("squareId").sum("internetTrafficActivity").show() 52 | cdrs.groupBy("squareId").agg(sum("callOutActivity"), sum("callInActivity"), sum("smsOutActivity"), sum("smsInActivity"), sum("internetTrafficActivity")).show() 53 | cdrs.groupBy("countryCode").sum("internetTrafficActivity").orderBy(desc("SUM(internetTrafficActivity)")).show() 54 | cdrs.agg(sum("callOutActivity"), sum("callInActivity"), sum("smsOutActivity"), sum("smsInActivity"), sum("internetTrafficActivity")).show() 55 | cdrs.rollup("squareId", "countryCode").count().orderBy(desc("squareId"), desc("countryCode")).rdd.saveAsTextFile("/tmp/rollup" + rdd.hashCode()) 56 | cdrs.cube("squareId", "countryCode").count().orderBy(desc("squareId"), desc("countryCode")).rdd.saveAsTextFile("/tmp/cube" + rdd.hashCode()) 57 | cdrs.dropDuplicates(Array("callOutActivity", "callInActivity")).show() 58 | cdrs.select("squareId", "countryCode", "internetTrafficActivity").distinct.show() 59 | cdrs.withColumn("endTime", cdrs("timeInterval") + 600000).show() 60 | cdrs.sample(true, 0.01).show() 61 | }) 62 | 63 | ssc.start() 64 | ssc.awaitTermination() 65 | } 66 | 67 | def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { 68 | rdd.map(c => c.map(f => f match { 69 | case x if x.isEmpty() => "0" 70 | case x => x 71 | })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, 72 | c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) 73 | } 74 | } -------------------------------------------------------------------------------- /Chap8/src/main/scala/org/apress/prospark/L8-1DataFrameAPI.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.sql.SQLContext 9 | import org.apache.spark.sql.functions.desc 10 | import org.apache.spark.streaming.Seconds 11 | import org.apache.spark.streaming.StreamingContext 12 | 13 | object CdrDataframeApp { 14 | 15 | case class 
Cdr(squareId: Int, timeInterval: Long, countryCode: Int, 16 | smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, 17 | callOutActivity: Float, internetTrafficActivity: Float) 18 | 19 | def main(args: Array[String]) { 20 | if (args.length != 4) { 21 | System.err.println( 22 | "Usage: CdrDataframeApp ") 23 | System.exit(1) 24 | } 25 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 26 | 27 | val conf = new SparkConf() 28 | .setAppName(appName) 29 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 30 | 31 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 32 | 33 | val sqlC = new SQLContext(ssc.sparkContext) 34 | import sqlC.implicits._ 35 | 36 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 37 | .map(_.split("\\t", -1)) 38 | .foreachRDD(rdd => { 39 | val cdrs = seqToCdr(rdd).toDF() 40 | 41 | cdrs.groupBy("countryCode").count().orderBy(desc("count")).show(5) 42 | }) 43 | 44 | ssc.start() 45 | ssc.awaitTermination() 46 | } 47 | 48 | def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { 49 | rdd.map(c => c.map(f => f match { 50 | case x if x.isEmpty() => "0" 51 | case x => x 52 | })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, 53 | c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) 54 | } 55 | } -------------------------------------------------------------------------------- /Chap8/src/main/scala/org/apress/prospark/L8-28DataFrameExamplesOps.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.sql.DataFrame 9 | import org.apache.spark.sql.SQLContext 10 | import org.apache.spark.streaming.Seconds 11 | import org.apache.spark.streaming.StreamingContext 12 | 13 | object CdrDataframeExamples2App { 14 | 15 | case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, 16 | smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, 17 | callOutActivity: Float, internetTrafficActivity: Float) 18 | 19 | def main(args: Array[String]) { 20 | if (args.length != 4) { 21 | System.err.println( 22 | "Usage: CdrDataframeExamples2App ") 23 | System.exit(1) 24 | } 25 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 26 | 27 | val conf = new SparkConf() 28 | .setAppName(appName) 29 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 30 | 31 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 32 | 33 | val sqlC = new SQLContext(ssc.sparkContext) 34 | import sqlC.implicits._ 35 | 36 | var previousCdrs: Option[DataFrame] = None 37 | 38 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 39 | .map(_.split("\\t", -1)) 40 | .foreachRDD(rdd => { 41 | val cdrs = seqToCdr(rdd).toDF().select("squareId", "countryCode").dropDuplicates() 42 | previousCdrs match { 43 | case Some(prevCdrs) => cdrs.unionAll(prevCdrs).show() 44 | //case Some(prevCdrs) => cdrs.intersect(prevCdrs).show() 45 | //case Some(prevCdrs) => cdrs.except(prevCdrs).show() 46 | case None => Unit 47 | } 48 | previousCdrs = Some(cdrs) 49 | }) 50 | 51 | ssc.start() 52 | ssc.awaitTermination() 53 | } 54 | 55 | def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { 56 | rdd.map(c => c.map(f => f match { 57 | case x if x.isEmpty() => "0" 58 | case x => x 59 | })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, 60 | c(4).toFloat, c(5).toFloat, 
c(6).toFloat, c(7).toFloat)) 61 | } 62 | } -------------------------------------------------------------------------------- /Chap8/src/main/scala/org/apress/prospark/L8-29DataFrameExamplesJoin.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.sql.SQLContext 9 | import org.apache.spark.streaming.Seconds 10 | import org.apache.spark.streaming.StreamingContext 11 | import org.json4s.DefaultFormats 12 | import org.json4s.JDouble 13 | import org.json4s.JObject 14 | import org.json4s.jvalue2extractable 15 | import org.json4s.jvalue2monadic 16 | import org.json4s.native.JsonMethods.compact 17 | import org.json4s.native.JsonMethods.parse 18 | import org.json4s.native.JsonMethods.render 19 | import org.json4s.string2JsonInput 20 | 21 | object CdrDataframeExamples3App { 22 | 23 | case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, 24 | smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, 25 | callOutActivity: Float, internetTrafficActivity: Float) 26 | 27 | def main(args: Array[String]) { 28 | if (args.length != 5) { 29 | System.err.println( 30 | "Usage: CdrDataframeExamples3App ") 31 | System.exit(1) 32 | } 33 | val Seq(appName, batchInterval, hostname, port, gridJsonPath) = args.toSeq 34 | 35 | val conf = new SparkConf() 36 | .setAppName(appName) 37 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 38 | 39 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 40 | 41 | val sqlC = new SQLContext(ssc.sparkContext) 42 | import sqlC.implicits._ 43 | implicit val formats = DefaultFormats 44 | 45 | val gridFile = scala.io.Source.fromFile(gridJsonPath).mkString 46 | val gridGeo = (parse(gridFile) \ "features") 47 | val gridStr = gridGeo.children.map(r => { 48 | val c = (r \ "geometry" \ "coordinates").extract[List[List[List[Float]]]].flatten.flatten.map(r => JDouble(r)) 49 | val l = List(("id", r \ "id"), ("x1", c(0)), ("y1", c(1)), ("x2", c(2)), ("y2", c(3)), 50 | ("x3", c(4)), ("y3", c(5)), ("x4", c(6)), ("y4", c(7))) 51 | compact(render(JObject(l))) 52 | }) 53 | 54 | val gridDF = sqlC.read.json(ssc.sparkContext.makeRDD(gridStr)) 55 | 56 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 57 | .map(_.split("\\t", -1)) 58 | .foreachRDD(rdd => { 59 | val cdrs = seqToCdr(rdd).toDF() 60 | cdrs.join(gridDF, $"squareId" === $"id").show() 61 | }) 62 | 63 | ssc.start() 64 | ssc.awaitTermination() 65 | } 66 | 67 | def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { 68 | rdd.map(c => c.map(f => f match { 69 | case x if x.isEmpty() => "0" 70 | case x => x 71 | })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, 72 | c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) 73 | } 74 | } -------------------------------------------------------------------------------- /Chap8/src/main/scala/org/apress/prospark/L8-3-6-7DataFrameCreation.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.sql.SQLContext 9 | import org.apache.spark.sql.functions.desc 10 | import org.apache.spark.streaming.Seconds 11 | import 
org.apache.spark.streaming.StreamingContext 12 | import org.json4s.native.Serialization.write 13 | import org.json4s.DefaultFormats 14 | 15 | object DataframeCreationApp { 16 | 17 | case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, 18 | smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, 19 | callOutActivity: Float, internetTrafficActivity: Float) 20 | 21 | def main(args: Array[String]) { 22 | if (args.length != 4) { 23 | System.err.println( 24 | "Usage: CdrDataframeApp ") 25 | System.exit(1) 26 | } 27 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 28 | 29 | val conf = new SparkConf() 30 | .setAppName(appName) 31 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 32 | 33 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 34 | 35 | val sqlC = new SQLContext(ssc.sparkContext) 36 | import sqlC.implicits._ 37 | 38 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 39 | .map(_.split("\\t", -1)) 40 | .foreachRDD(rdd => { 41 | //val cdrs = sqlC.createDataFrame(seqToCdr(rdd)) 42 | //val cdrs = sqlC.createDataFrame(seqToCdr(rdd).collect()) 43 | //val cdrs = seqToCdr(rdd).toDF() 44 | val cdrsJson = seqToCdr(rdd).map(r => { 45 | implicit val formats = DefaultFormats 46 | write(r) 47 | }) 48 | val cdrs = sqlC.read.json(cdrsJson) 49 | 50 | cdrs.groupBy("countryCode").count().orderBy(desc("count")).show(5) 51 | }) 52 | 53 | ssc.start() 54 | ssc.awaitTermination() 55 | 56 | } 57 | 58 | def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { 59 | rdd.map(c => c.map(f => f match { 60 | case x if x.isEmpty() => "0" 61 | case x => x 62 | })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, 63 | c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) 64 | } 65 | } -------------------------------------------------------------------------------- /Chap8/src/main/scala/org/apress/prospark/L8-35DataFrameExamplesRDD.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.sql.SQLContext 9 | import org.apache.spark.sql.types.DataType 10 | import org.apache.spark.sql.types.StructType 11 | import org.apache.spark.streaming.Seconds 12 | import org.apache.spark.streaming.StreamingContext 13 | import org.json4s.DefaultFormats 14 | 15 | object CdrDataframeExamplesRDDApp { 16 | 17 | case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, 18 | smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, 19 | callOutActivity: Float, internetTrafficActivity: Float) 20 | 21 | def main(args: Array[String]) { 22 | if (args.length != 5) { 23 | System.err.println( 24 | "Usage: CdrDataframeExamplesRDDApp ") 25 | System.exit(1) 26 | } 27 | val Seq(appName, batchInterval, hostname, port, schemaFile) = args.toSeq 28 | 29 | val conf = new SparkConf() 30 | .setAppName(appName) 31 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 32 | 33 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 34 | 35 | val sqlC = new SQLContext(ssc.sparkContext) 36 | import sqlC.implicits._ 37 | implicit val formats = DefaultFormats 38 | 39 | val schemaJson = scala.io.Source.fromFile(schemaFile).mkString 40 | val schema = DataType.fromJson(schemaJson).asInstanceOf[StructType] 41 | 42 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 43 | .map(_.split("\\t", 
-1)) 44 | .foreachRDD(rdd => { 45 | val cdrs = seqToCdr(rdd).toDF() 46 | val highInternet = sqlC.createDataFrame(cdrs.rdd.filter(r => r.getFloat(3) + r.getFloat(4) >= r.getFloat(5) + r.getFloat(6)), schema) 47 | val highOther = cdrs.except(highInternet) 48 | val highInternetGrid = highInternet.select("squareId", "countryCode").dropDuplicates() 49 | val highOtherGrid = highOther.select("squareId", "countryCode").dropDuplicates() 50 | highOtherGrid.except(highInternetGrid).show() 51 | highInternetGrid.except(highOtherGrid).show() 52 | }) 53 | 54 | ssc.start() 55 | ssc.awaitTermination() 56 | } 57 | 58 | def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { 59 | rdd.map(c => c.map(f => f match { 60 | case x if x.isEmpty() => "0" 61 | case x => x 62 | })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, 63 | c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) 64 | } 65 | } -------------------------------------------------------------------------------- /Chap8/src/main/scala/org/apress/prospark/L8-38SparkR.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.rdd.RDD 7 | import org.apache.spark.sql.hive.HiveContext 8 | import org.apache.spark.streaming.Seconds 9 | import org.apache.spark.streaming.StreamingContext 10 | import java.nio.file.Paths 11 | import org.apache.spark.SparkFiles 12 | 13 | object CdrStreamingSparkRApp { 14 | 15 | case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, 16 | smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, 17 | callOutActivity: Float, internetTrafficActivity: Float) 18 | 19 | def main(args: Array[String]) { 20 | if (args.length != 7) { 21 | System.err.println( 22 | "Usage: CdrStreamingSparkRApp ") 23 | System.exit(1) 24 | } 25 | val Seq(appName, batchInterval, hostname, port, tableName, rScriptPath, logsPath) = args.toSeq 26 | 27 | val conf = new SparkConf() 28 | .setAppName(appName) 29 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 30 | 31 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 32 | 33 | val cl = Thread.currentThread().getContextClassLoader() 34 | val hiveC = new HiveContext(ssc.sparkContext) 35 | Thread.currentThread().setContextClassLoader(cl) 36 | 37 | import hiveC.implicits._ 38 | 39 | ssc.sparkContext.addFile(rScriptPath) 40 | val rScriptName = SparkFiles.get(Paths.get(rScriptPath).getFileName.toString) 41 | val master = hiveC.sparkContext.getConf.get("spark.master") 42 | 43 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 44 | .map(_.split("\\t", -1)) 45 | .foreachRDD((rdd, time) => { 46 | val iTableName = tableName + time.milliseconds 47 | seqToCdr(rdd).toDF().write.saveAsTable(iTableName) 48 | hiveC.sparkContext.parallelize(Array(iTableName)).pipe("%s %s".format(rScriptName, master)).saveAsTextFile(Paths.get(logsPath, iTableName).toString) 49 | }) 50 | 51 | ssc.start() 52 | ssc.awaitTermination() 53 | } 54 | 55 | def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { 56 | rdd.map(c => c.map(f => f match { 57 | case x if x.isEmpty() => "0" 58 | case x => x 59 | })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, 60 | c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) 61 | } 62 | } -------------------------------------------------------------------------------- 
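CdrStreamingSparkRApp above hands each per-batch Hive table name to the SparkR script through RDD.pipe: every element of the RDD is written to the child process's stdin as one line, and every line the process prints becomes an element of the resulting RDD. A minimal sketch of just that mechanism, using cat as a stand-in for the R script; the local[*] master, app name, and sample table names are assumptions.

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

object PipeSketch {
  def main(args: Array[String]) {
    // local[*] master and the app name are assumptions for a standalone run.
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("PipeSketch"))
    val tableNames = sc.parallelize(Seq("cdrs1000", "cdrs2000", "cdrs3000"))
    // cat echoes stdin to stdout, standing in for the R script that reads one
    // table name per line and writes its own status lines back.
    val piped = tableNames.pipe("cat")
    piped.collect().foreach(println)
    sc.stop()
  }
}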
/Chap8/src/main/scala/org/apress/prospark/L8-4DataFrameCreationSchema.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.sql.Row 6 | import org.apache.spark.sql.SQLContext 7 | import org.apache.spark.sql.functions.desc 8 | import org.apache.spark.sql.types.DataType 9 | import org.apache.spark.sql.types.StructType 10 | import org.apache.spark.streaming.Seconds 11 | import org.apache.spark.streaming.StreamingContext 12 | 13 | object DataframeCreationApp2 { 14 | 15 | def main(args: Array[String]) { 16 | if (args.length != 5) { 17 | System.err.println( 18 | "Usage: CdrDataframeApp2 <appName> <batchInterval> <hostname> <port> <schemaFile>") 19 | System.exit(1) 20 | } 21 | val Seq(appName, batchInterval, hostname, port, schemaFile) = args.toSeq 22 | 23 | val conf = new SparkConf() 24 | .setAppName(appName) 25 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 26 | 27 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 28 | 29 | val sqlC = new SQLContext(ssc.sparkContext) 30 | 31 | val schemaJson = scala.io.Source.fromFile(schemaFile).mkString 32 | val schema = DataType.fromJson(schemaJson).asInstanceOf[StructType] 33 | 34 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 35 | .map(_.split("\\t", -1)) 36 | .foreachRDD(rdd => { 37 | val cdrs = sqlC.createDataFrame(rdd.map(c => Row(c: _*)), schema) 38 | 39 | cdrs.groupBy("countryCode").count().orderBy(desc("count")).show(5) 40 | }) 41 | 42 | ssc.start() 43 | ssc.awaitTermination() 44 | 45 | } 46 | } -------------------------------------------------------------------------------- /Chap8/src/main/scala/org/apress/prospark/L8-8Sql.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.sql.SQLContext 9 | import org.apache.spark.streaming.Seconds 10 | import org.apache.spark.streaming.StreamingContext 11 | 12 | object CdrSqlApp { 13 | 14 | case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, 15 | smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, 16 | callOutActivity: Float, internetTrafficActivity: Float) 17 | 18 | def main(args: Array[String]) { 19 | if (args.length != 4) { 20 | System.err.println( 21 | "Usage: CdrSqlApp <appName> <batchInterval> <hostname> <port>") 22 | System.exit(1) 23 | } 24 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 25 | 26 | val conf = new SparkConf() 27 | .setAppName(appName) 28 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 29 | 30 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 31 | 32 | val sqlC = new SQLContext(ssc.sparkContext) 33 | import sqlC.implicits._ 34 | 35 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 36 | .map(_.split("\\t", -1)) 37 | .foreachRDD(rdd => { 38 | val cdrs = seqToCdr(rdd).toDF() 39 | cdrs.registerTempTable("cdrs") 40 | 41 | sqlC.sql("SELECT countryCode, COUNT(countryCode) AS cCount FROM cdrs GROUP BY countryCode ORDER BY cCount DESC LIMIT 5").show() 42 | sqlC.dropTempTable("cdrs") 43 | }) 44 | 45 | ssc.start() 46 | ssc.awaitTermination() 47 | } 48 | 49 | def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { 50 | rdd.map(c => c.map(f => f match { 51 | case x if x.isEmpty() => "0" 52 | case x => x 53 | })).map(c => Cdr(c(0).toInt, c(1).toLong,
c(2).toInt, c(3).toFloat, 54 | c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) 55 | } 56 | } -------------------------------------------------------------------------------- /Chap8/src/main/scala/org/apress/prospark/T8-3DataFrameExamplesNA.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.sql.SQLContext 9 | import org.apache.spark.streaming.Seconds 10 | import org.apache.spark.streaming.StreamingContext 11 | import org.json4s.DefaultFormats 12 | import org.json4s.JDouble 13 | import org.json4s.JObject 14 | import org.json4s.jvalue2extractable 15 | import org.json4s.jvalue2monadic 16 | import org.json4s.native.JsonMethods.compact 17 | import org.json4s.native.JsonMethods.parse 18 | import org.json4s.native.JsonMethods.render 19 | import org.json4s.string2JsonInput 20 | 21 | object CdrDataframeExamplesNAApp { 22 | 23 | case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, 24 | smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, 25 | callOutActivity: Float, internetTrafficActivity: Float) 26 | 27 | def main(args: Array[String]) { 28 | if (args.length != 4) { 29 | System.err.println( 30 | "Usage: CdrDataframeExamplesNAApp ") 31 | System.exit(1) 32 | } 33 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 34 | 35 | val conf = new SparkConf() 36 | .setAppName(appName) 37 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 38 | 39 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 40 | 41 | val sqlC = new SQLContext(ssc.sparkContext) 42 | import sqlC.implicits._ 43 | implicit val formats = DefaultFormats 44 | 45 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 46 | .map(_.split("\\t", -1)) 47 | .foreachRDD(rdd => { 48 | val cdrs = seqToCdr(rdd).toDF() 49 | cdrs.na.drop("any").show() 50 | cdrs.na.fill(0, Array("squareId")).show() 51 | cdrs.na.replace("squareId", Map(0 -> 1)).show() 52 | println("Correlation: " + cdrs.stat.corr("smsOutActivity", "callOutActivity")) 53 | println("Covariance: " + cdrs.stat.cov("smsInActivity", "callInActivity")) 54 | cdrs.stat.crosstab("squareId", "countryCode").show() 55 | cdrs.stat.freqItems(Array("squareId", "countryCode"), 0.1).show() 56 | cdrs.stat.crosstab("callOutActivity", "callInActivity").show() 57 | }) 58 | 59 | ssc.start() 60 | ssc.awaitTermination() 61 | } 62 | 63 | def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { 64 | rdd.map(c => c.map(f => f match { 65 | case x if x.isEmpty() => "0" 66 | case x => x 67 | })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, 68 | c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) 69 | } 70 | } -------------------------------------------------------------------------------- /Chap8/src/main/scala/org/apress/prospark/T8-5-L8-30-34DataFrameExamplesActions.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.sql.SaveMode 9 | import org.apache.spark.sql.functions.desc 10 | import org.apache.spark.sql.hive.HiveContext 11 | import org.apache.spark.streaming.Seconds 12 | import 
org.apache.spark.streaming.StreamingContext 13 | import org.apress.prospark.CdrDataframeExamplesActionsApp.Cdr 14 | import org.json4s.DefaultFormats 15 | 16 | object CdrDataframeExamplesActionsApp { 17 | 18 | case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, 19 | smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, 20 | callOutActivity: Float, internetTrafficActivity: Float) 21 | 22 | def main(args: Array[String]) { 23 | if (args.length != 4) { 24 | System.err.println( 25 | "Usage: CdrDataframeExamplesActionsApp <appName> <batchInterval> <hostname> <port>") 26 | System.exit(1) 27 | } 28 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 29 | 30 | val conf = new SparkConf() 31 | .setAppName(appName) 32 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 33 | 34 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 35 | 36 | val cl = Thread.currentThread().getContextClassLoader() 37 | val hiveC = new HiveContext(ssc.sparkContext) 38 | Thread.currentThread().setContextClassLoader(cl) 39 | import hiveC.implicits._ 40 | implicit val formats = DefaultFormats 41 | 42 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 43 | .map(_.split("\\t", -1)) 44 | .foreachRDD(rdd => { 45 | val cdrs = seqToCdr(rdd).toDF() 46 | 47 | val counts = cdrs.groupBy("countryCode").count().orderBy(desc("count")) 48 | counts.show(5) 49 | counts.show() 50 | println("head(5): " + counts.head(5)) 51 | println("take(5): " + counts.take(5)) 52 | println("head(): " + counts.head()) 53 | println("first(): " + counts.first()) 54 | println("count(): " + counts.count()) 55 | println("collect(): " + counts.collect()) 56 | println("collectAsList(): " + counts.collectAsList()) 57 | println("describe(): " + cdrs.describe("smsInActivity", "smsOutActivity", "callInActivity", "callOutActivity", "internetTrafficActivity").show()) 58 | counts.write.format("parquet").save("/tmp/parquet" + rdd.id) 59 | counts.write.format("json").save("/tmp/json" + rdd.id) 60 | counts.write.parquet("/tmp/parquet2" + rdd.id) 61 | counts.write.json("/tmp/json2" + rdd.id) 62 | counts.write.saveAsTable("count_table") 63 | cdrs.groupBy("countryCode").count().orderBy(desc("count")).write.mode(SaveMode.Append).save("/tmp/counts") 64 | val prop: java.util.Properties = new java.util.Properties() 65 | counts.write.jdbc("jdbc:mysql://hostname:port/cdrsdb", "count_table", prop) 66 | }) 67 | 68 | ssc.start() 69 | ssc.awaitTermination() 70 | } 71 | 72 | def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { 73 | rdd.map(c => c.map(f => f match { 74 | case x if x.isEmpty() => "0" 75 | case x => x 76 | })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, 77 | c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) 78 | } 79 | } -------------------------------------------------------------------------------- /Chap9/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 2 | -------------------------------------------------------------------------------- /Chap9/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += Classpaths.typesafeResolver 2 | 3 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.4.0") 4 | -------------------------------------------------------------------------------- /Chap9/spark.sbt: -------------------------------------------------------------------------------- 1 | import AssemblyKeys._ 2 | 3 | assemblySettings 4 | 5 |
mergeStrategy in assembly <<= (mergeStrategy in assembly) { mergeStrategy => { 6 | case entry => { 7 | val strategy = mergeStrategy(entry) 8 | if (strategy == MergeStrategy.deduplicate) MergeStrategy.first 9 | else strategy 10 | } 11 | }} 12 | 13 | name := "Chap9" 14 | 15 | version := "1.0" 16 | 17 | scalaVersion := "2.10.5" 18 | 19 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.4.0" 20 | 21 | libraryDependencies += "org.apache.spark" %% "spark-streaming" % "1.4.0" 22 | 23 | libraryDependencies += "org.apache.spark" %% "spark-mllib" % "1.4.0" 24 | 25 | libraryDependencies += "org.json4s" %% "json4s-native" % "3.2.10" 26 | -------------------------------------------------------------------------------- /Chap9/src/main/java/org/apress/prospark/AbstractDriver.java: -------------------------------------------------------------------------------- 1 | package org.apress.prospark; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileInputStream; 6 | import java.io.IOException; 7 | import java.io.InputStreamReader; 8 | import java.util.Enumeration; 9 | import java.util.zip.GZIPInputStream; 10 | import java.util.zip.ZipEntry; 11 | import java.util.zip.ZipFile; 12 | 13 | import org.apache.commons.io.FilenameUtils; 14 | import org.apache.log4j.LogManager; 15 | import org.apache.log4j.Logger; 16 | 17 | public abstract class AbstractDriver { 18 | 19 | private static final Logger LOG = LogManager.getLogger(AbstractDriver.class); 20 | 21 | private String path; 22 | 23 | public AbstractDriver(String path) { 24 | this.path = path; 25 | } 26 | 27 | public abstract void init() throws Exception; 28 | 29 | public abstract void close() throws Exception; 30 | 31 | public abstract void sendRecord(String record) throws Exception; 32 | 33 | public void execute() throws Exception { 34 | 35 | try { 36 | init(); 37 | File dirPath = new File(path); 38 | if (dirPath.isDirectory()) { 39 | File[] files = new File(path).listFiles(); 40 | for (File f : files) { 41 | String ext = FilenameUtils.getExtension(f.getPath()); 42 | if (ext.equals("zip")) { 43 | LOG.info(String.format("Feeding zipped file %s", f.getName())); 44 | ZipFile zFile = null; 45 | try { 46 | zFile = new ZipFile(f); 47 | Enumeration<? extends ZipEntry> zEntries = zFile.entries(); 48 | 49 | while (zEntries.hasMoreElements()) { 50 | ZipEntry zEntry = zEntries.nextElement(); 51 | LOG.info(String.format("Feeding file %s", zEntry.getName())); 52 | try (BufferedReader br = new BufferedReader( 53 | new InputStreamReader(zFile.getInputStream(zEntry)))) { 54 | // skip header 55 | br.readLine(); 56 | String line; 57 | while ((line = br.readLine()) != null) { 58 | sendRecord(line); 59 | } 60 | } 61 | } 62 | } catch (IOException e) { 63 | LOG.error(e.getMessage()); 64 | } finally { 65 | if (zFile != null) { 66 | try { 67 | zFile.close(); 68 | } catch (IOException e) { 69 | LOG.error(e.getMessage()); 70 | } 71 | } 72 | } 73 | } else if (ext.equals("gz")) { 74 | LOG.info(String.format("Feeding file %s", f.getName())); 75 | try (BufferedReader br = new BufferedReader( 76 | new InputStreamReader(new GZIPInputStream(new FileInputStream(f))))) { 77 | // skip header 78 | br.readLine(); 79 | String line; 80 | while ((line = br.readLine()) != null) { 81 | sendRecord(line); 82 | } 83 | } 84 | } else if (ext.equals("dat")) { 85 | LOG.info(String.format("Feeding dat file %s", f.getName())); 86 | try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f)))) { 87 | String line; 88 | while ((line = br.readLine()) != null) {
89 | sendRecord(line); 90 | } 91 | } 92 | } else { 93 | LOG.warn("Unsupported file type: " + f.getName()); 94 | } 95 | } 96 | } else { 97 | LOG.error(String.format("Path %s is not a directory", path)); 98 | } 99 | } finally { 100 | close(); 101 | } 102 | } 103 | } -------------------------------------------------------------------------------- /Chap9/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO, stdout 2 | log4j.rootCategory=INFO, stdout 3 | 4 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 5 | log4j.appender.stdout.Target=System.out 6 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 7 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 8 | -------------------------------------------------------------------------------- /Chap9/src/main/scala/org/apress/prospark/L9-10KMeans.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.clustering.StreamingKMeans 6 | import org.apache.spark.mllib.linalg.Vectors 7 | import org.apache.spark.mllib.regression.LabeledPoint 8 | import org.apache.spark.rdd.RDD 9 | import org.apache.spark.rdd.RDD.doubleRDDToDoubleRDDFunctions 10 | import org.apache.spark.streaming.Seconds 11 | import org.apache.spark.streaming.StreamingContext 12 | 13 | object KMeansClusteringApp { 14 | 15 | def main(args: Array[String]) { 16 | if (args.length != 4) { 17 | System.err.println( 18 | "Usage: KMeansClusteringApp ") 19 | System.exit(1) 20 | } 21 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 22 | 23 | val conf = new SparkConf() 24 | .setAppName(appName) 25 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 26 | 27 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 28 | 29 | val substream = ssc.socketTextStream(hostname, port.toInt) 30 | .filter(!_.contains("NaN")) 31 | .map(_.split(" ")) 32 | .filter(f => f(1) != "0") 33 | 34 | val orientationStream = substream 35 | .map(f => Seq(1, 4, 5, 6, 10, 11, 12, 20, 21, 22, 26, 27, 28, 36, 37, 38, 42, 43, 44).map(i => f(i)).toArray) 36 | .map(arr => arr.map(_.toDouble)) 37 | .filter(f => f(0) == 1.0 || f(0) == 2.0 || f(0) == 3.0) 38 | .map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, f.length)))) 39 | val test = orientationStream.transform(rdd => rdd.randomSplit(Array(0.3, 0.7))(0)) 40 | val train = orientationStream.transformWith(test, (r1: RDD[LabeledPoint], r2: RDD[LabeledPoint]) => r1.subtract(r2)).cache() 41 | val model = new StreamingKMeans() 42 | .setK(3) 43 | .setDecayFactor(0) 44 | .setRandomCenters(18, 0.0) 45 | 46 | model.trainOn(train.map(v => v.features)) 47 | val prediction = model.predictOnValues(test.map(v => (v.label, v.features))) 48 | 49 | ssc.start() 50 | ssc.awaitTermination() 51 | } 52 | 53 | } -------------------------------------------------------------------------------- /Chap9/src/main/scala/org/apress/prospark/L9-11CollabFilteringPreprocessing.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.hadoop.io.LongWritable 4 | import org.apache.hadoop.io.Text 5 | import org.apache.hadoop.mapred.FileSplit 6 | import org.apache.hadoop.mapred.TextInputFormat 7 | import org.apache.spark.SparkConf 8 | import org.apache.spark.SparkContext 9 | import 
org.apache.spark.rdd.HadoopRDD 10 | import org.apache.spark.rdd.RDD.rddToPairRDDFunctions 11 | 12 | import com.google.common.io.Files 13 | 14 | object CollabFilteringPreprocessingApp { 15 | 16 | def main(args: Array[String]) { 17 | if (args.length != 3) { 18 | System.err.println( 19 | "Usage: CollabFilteringPreprocessingApp ") 20 | System.exit(1) 21 | } 22 | val Seq(appName, iPath, oPath) = args.toSeq 23 | 24 | val conf = new SparkConf() 25 | .setAppName(appName) 26 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 27 | 28 | val delim = " " 29 | 30 | val sc = new SparkContext(conf) 31 | sc.hadoopFile(iPath, classOf[TextInputFormat], classOf[LongWritable], classOf[Text], sc.defaultMinPartitions) 32 | .asInstanceOf[HadoopRDD[LongWritable, Text]] 33 | .mapPartitionsWithInputSplit((iSplit, iter) => 34 | iter.map(splitAndLine => (Files.getNameWithoutExtension(iSplit.asInstanceOf[FileSplit].getPath.toString), splitAndLine._2.toString.split(" ")(1)))) 35 | .filter(r => r._2 != "0") 36 | .map(r => ((r._1, r._2), 1)) 37 | .reduceByKey(_ + _) 38 | .map(r => r._1._1.replace("subject", "") + delim + r._1._2 + delim + r._2) 39 | .sample(false, 0.7) 40 | .coalesce(1) 41 | .saveAsTextFile(oPath) 42 | } 43 | } -------------------------------------------------------------------------------- /Chap9/src/main/scala/org/apress/prospark/L9-12CollabFiltering.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.recommendation.ALS 6 | import org.apache.spark.mllib.recommendation.Rating 7 | import org.apache.spark.rdd.RDD.doubleRDDToDoubleRDDFunctions 8 | import org.apache.spark.rdd.RDD.rddToPairRDDFunctions 9 | import org.apache.spark.streaming.Seconds 10 | import org.apache.spark.streaming.StreamingContext 11 | 12 | object CollabFilteringApp { 13 | 14 | def main(args: Array[String]) { 15 | if (args.length != 3) { 16 | System.err.println( 17 | "Usage: CollabFilteringApp ") 18 | System.exit(1) 19 | } 20 | val Seq(appName, batchInterval, iPath) = args.toSeq 21 | 22 | val conf = new SparkConf() 23 | .setAppName(appName) 24 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 25 | 26 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 27 | 28 | val ratingStream = ssc.textFileStream(iPath).map(_.split(" ") match { 29 | case Array(subject, activity, freq) => 30 | Rating(subject.toInt, activity.toInt, freq.toDouble) 31 | }) 32 | 33 | val rank = 10 34 | val numIterations = 10 35 | val lambda = 0.01 36 | ratingStream.foreachRDD(ratingRDD => { 37 | val testTrain = ratingRDD.randomSplit(Array(0.3, 0.7)) 38 | val model = ALS.train(testTrain(1), rank, numIterations, lambda) 39 | val test = testTrain(0).map { 40 | case Rating(subject, activity, freq) => 41 | (subject, activity) 42 | } 43 | val prediction = model.predict(test) 44 | prediction.take(5).map(println) 45 | }) 46 | 47 | ssc.start() 48 | ssc.awaitTermination() 49 | } 50 | 51 | } -------------------------------------------------------------------------------- /Chap9/src/main/scala/org/apress/prospark/L9-13FPMiningPreprocessing.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.hadoop.io.LongWritable 4 | import org.apache.hadoop.io.Text 5 | import org.apache.hadoop.mapred.FileSplit 6 | import org.apache.hadoop.mapred.TextInputFormat 7 | import org.apache.spark.SparkConf 
8 | import org.apache.spark.SparkContext 9 | import org.apache.spark.rdd.HadoopRDD 10 | import org.apache.spark.rdd.RDD.rddToPairRDDFunctions 11 | 12 | import com.google.common.io.Files 13 | 14 | object FPMiningPreprocessingApp { 15 | 16 | def main(args: Array[String]) { 17 | if (args.length != 3) { 18 | System.err.println( 19 | "Usage: FPMiningPreprocessingApp ") 20 | System.exit(1) 21 | } 22 | val Seq(appName, iPath, oPath) = args.toSeq 23 | 24 | val conf = new SparkConf() 25 | .setAppName(appName) 26 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 27 | 28 | val delim = " " 29 | 30 | val sc = new SparkContext(conf) 31 | sc.hadoopFile(iPath, classOf[TextInputFormat], classOf[LongWritable], classOf[Text], sc.defaultMinPartitions) 32 | .asInstanceOf[HadoopRDD[LongWritable, Text]] 33 | .mapPartitionsWithInputSplit((iSplit, iter) => 34 | iter.map(splitAndLine => (Files.getNameWithoutExtension(iSplit.asInstanceOf[FileSplit].getPath.toString), splitAndLine._2.toString.split(" ")(1)))) 35 | .filter(r => r._2 != "0") 36 | .map(r => (r._1, r._2)) 37 | .distinct() 38 | .groupByKey() 39 | .map(r => r._2.mkString(" ")) 40 | .sample(false, 0.7) 41 | .coalesce(1) 42 | .saveAsTextFile(oPath) 43 | } 44 | } -------------------------------------------------------------------------------- /Chap9/src/main/scala/org/apress/prospark/L9-14FPMining.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.fpm.FPGrowth 6 | import org.apache.spark.streaming.Seconds 7 | import org.apache.spark.streaming.StreamingContext 8 | 9 | object FPMiningApp { 10 | 11 | def main(args: Array[String]) { 12 | if (args.length != 3) { 13 | System.err.println( 14 | "Usage: FPMiningApp ") 15 | System.exit(1) 16 | } 17 | val Seq(appName, batchInterval, iPath) = args.toSeq 18 | 19 | val conf = new SparkConf() 20 | .setAppName(appName) 21 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 22 | 23 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 24 | 25 | val minSupport = 0.4 26 | 27 | ssc.textFileStream(iPath) 28 | .map(r => r.split(" ")) 29 | .foreachRDD(transactionRDD => { 30 | val fpg = new FPGrowth() 31 | .setMinSupport(minSupport) 32 | val model = fpg.run(transactionRDD) 33 | 34 | model.freqItemsets 35 | .collect() 36 | .foreach(itemset => println("Items: %s, Frequency: %s".format(itemset.items.mkString(" "), itemset.freq))) 37 | }) 38 | 39 | ssc.start() 40 | ssc.awaitTermination() 41 | } 42 | 43 | } -------------------------------------------------------------------------------- /Chap9/src/main/scala/org/apress/prospark/L9-15MLPipeline.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.ml.Pipeline 7 | import org.apache.spark.ml.feature.Normalizer 8 | import org.apache.spark.ml.feature.VectorAssembler 9 | import org.apache.spark.ml.regression.RandomForestRegressor 10 | import org.apache.spark.sql.SQLContext 11 | import org.apache.spark.streaming.Seconds 12 | import org.apache.spark.streaming.StreamingContext 13 | import org.apache.spark.ml.param.ParamMap 14 | 15 | object MLPipelineApp { 16 | 17 | case class Activity(label: Double, 18 | accelXHand: Double, accelYHand: Double, accelZHand: Double, 19 | 
accelXChest: Double, accelYChest: Double, accelZChest: Double, 20 | accelXAnkle: Double, accelYAnkle: Double, accelZAnkle: Double) 21 | 22 | def main(args: Array[String]) { 23 | if (args.length != 4) { 24 | System.err.println( 25 | "Usage: MLPipelineApp ") 26 | System.exit(1) 27 | } 28 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 29 | 30 | val conf = new SparkConf() 31 | .setAppName(appName) 32 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 33 | 34 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 35 | 36 | val sqlC = new SQLContext(ssc.sparkContext) 37 | import sqlC.implicits._ 38 | 39 | val substream = ssc.socketTextStream(hostname, port.toInt) 40 | .filter(!_.contains("NaN")) 41 | .map(_.split(" ")) 42 | .filter(f => f(1) == "4" || f(1) == "5") 43 | .map(f => Array(f(1), f(4), f(5), f(6), f(20), f(21), f(22), f(36), f(37), f(38))) 44 | .map(f => f.map(v => v.toDouble)) 45 | .foreachRDD(rdd => { 46 | if (!rdd.isEmpty) { 47 | val accelerometer = rdd.map(x => Activity(x(0), x(1), x(2), x(3), x(4), x(5), x(6), x(7), x(8), x(9))).toDF() 48 | val split = accelerometer.randomSplit(Array(0.3, 0.7)) 49 | val test = split(0) 50 | val train = split(1) 51 | 52 | val assembler = new VectorAssembler() 53 | .setInputCols(Array( 54 | "accelXHand", "accelYHand", "accelZHand", 55 | "accelXChest", "accelYChest", "accelZChest", 56 | "accelXAnkle", "accelYAnkle", "accelZAnkle")) 57 | .setOutputCol("vectors") 58 | val normalizer = new Normalizer() 59 | .setInputCol(assembler.getOutputCol) 60 | .setOutputCol("features") 61 | val regressor = new RandomForestRegressor() 62 | 63 | val pipeline = new Pipeline() 64 | .setStages(Array(assembler, normalizer, regressor)) 65 | val pMap = ParamMap(normalizer.p -> 1.0) 66 | val model = pipeline.fit(train, pMap) 67 | val prediction = model.transform(test) 68 | prediction.show() 69 | } 70 | }) 71 | 72 | ssc.start() 73 | ssc.awaitTermination() 74 | } 75 | 76 | } -------------------------------------------------------------------------------- /Chap9/src/main/scala/org/apress/prospark/L9-17MLCrossValidation.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.ml.Pipeline 8 | import org.apache.spark.ml.evaluation.RegressionEvaluator 9 | import org.apache.spark.ml.feature.Normalizer 10 | import org.apache.spark.ml.feature.VectorAssembler 11 | import org.apache.spark.ml.regression.RandomForestRegressor 12 | import org.apache.spark.ml.tuning.CrossValidator 13 | import org.apache.spark.ml.tuning.ParamGridBuilder 14 | import org.apache.spark.sql.SQLContext 15 | import org.apache.spark.streaming.Seconds 16 | import org.apache.spark.streaming.StreamingContext 17 | 18 | object MLCrossValidationApp { 19 | 20 | case class Activity(label: Double, 21 | accelXHand: Double, accelYHand: Double, accelZHand: Double, 22 | accelXChest: Double, accelYChest: Double, accelZChest: Double, 23 | accelXAnkle: Double, accelYAnkle: Double, accelZAnkle: Double) 24 | 25 | def main(args: Array[String]) { 26 | if (args.length != 4) { 27 | System.err.println( 28 | "Usage: MLCrossValidationApp ") 29 | System.exit(1) 30 | } 31 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 32 | 33 | val conf = new SparkConf() 34 | .setAppName(appName) 35 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 36 | 37 | val ssc = new 
StreamingContext(conf, Seconds(batchInterval.toInt)) 38 | 39 | val sqlC = new SQLContext(ssc.sparkContext) 40 | import sqlC.implicits._ 41 | 42 | val substream = ssc.socketTextStream(hostname, port.toInt) 43 | .filter(!_.contains("NaN")) 44 | .map(_.split(" ")) 45 | .filter(f => f(1) == "4" || f(1) == "5") 46 | .map(f => Array(f(1), f(4), f(5), f(6), f(20), f(21), f(22), f(36), f(37), f(38))) 47 | .map(f => f.map(v => v.toDouble)) 48 | .foreachRDD(rdd => { 49 | if (!rdd.isEmpty) { 50 | val accelerometer = rdd.map(x => Activity(x(0), x(1), x(2), x(3), x(4), x(5), x(6), x(7), x(8), x(9))).toDF() 51 | val split = accelerometer.randomSplit(Array(0.3, 0.7)) 52 | val test = split(0) 53 | val train = split(1) 54 | 55 | val assembler = new VectorAssembler() 56 | .setInputCols(Array( 57 | "accelXHand", "accelYHand", "accelZHand", 58 | "accelXChest", "accelYChest", "accelZChest", 59 | "accelXAnkle", "accelYAnkle", "accelZAnkle")) 60 | .setOutputCol("vectors") 61 | val normalizer = new Normalizer() 62 | .setInputCol(assembler.getOutputCol) 63 | .setOutputCol("features") 64 | val regressor = new RandomForestRegressor() 65 | 66 | val pipeline = new Pipeline() 67 | .setStages(Array(assembler, normalizer, regressor)) 68 | 69 | val validator = new CrossValidator() 70 | .setEstimator(pipeline) 71 | .setEvaluator(new RegressionEvaluator) 72 | val pGrid = new ParamGridBuilder() 73 | .addGrid(normalizer.p, Array(1.0, 5.0, 10.0)) 74 | .addGrid(regressor.numTrees, Array(10, 50, 100)) 75 | .build() 76 | validator.setEstimatorParamMaps(pGrid) 77 | validator.setNumFolds(5) 78 | 79 | val bestModel = validator.fit(train) 80 | val prediction = bestModel.transform(test) 81 | prediction.show() 82 | } 83 | }) 84 | 85 | ssc.start() 86 | ssc.awaitTermination() 87 | } 88 | 89 | } -------------------------------------------------------------------------------- /Chap9/src/main/scala/org/apress/prospark/L9-1LinearRegression.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.linalg.Vectors 6 | import org.apache.spark.mllib.regression.LabeledPoint 7 | import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD 8 | import org.apache.spark.rdd.RDD 9 | import org.apache.spark.rdd.RDD.doubleRDDToDoubleRDDFunctions 10 | import org.apache.spark.streaming.Seconds 11 | import org.apache.spark.streaming.StreamingContext 12 | 13 | object LinearRegressionApp { 14 | 15 | def main(args: Array[String]) { 16 | if (args.length != 4) { 17 | System.err.println( 18 | "Usage: LinearRegressionApp ") 19 | System.exit(1) 20 | } 21 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 22 | 23 | val conf = new SparkConf() 24 | .setAppName(appName) 25 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 26 | 27 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 28 | 29 | val substream = ssc.socketTextStream(hostname, port.toInt) 30 | .filter(!_.contains("NaN")) 31 | .map(_.split(" ")) 32 | .filter(f => f(1) != "0") 33 | 34 | val datastream = substream.map(f => Array(f(2).toDouble, f(3).toDouble, f(4).toDouble, f(5).toDouble, f(6).toDouble)) 35 | .map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5)))) 36 | val test = datastream.transform(rdd => rdd.randomSplit(Array(0.3, 0.7))(0)) 37 | val train = datastream.transformWith(test, (r1: RDD[LabeledPoint], r2: RDD[LabeledPoint]) => r1.subtract(r2)).cache() 38 | val model = new 
StreamingLinearRegressionWithSGD() 39 | .setInitialWeights(Vectors.zeros(4)) 40 | .setStepSize(0.0001) 41 | .setNumIterations(1) 42 | 43 | model.trainOn(train) 44 | model.predictOnValues(test.map(v => (v.label, v.features))).foreachRDD(rdd => println("MSE: %f".format(rdd 45 | .map(v => math.pow((v._1 - v._2), 2)).mean()))) 46 | 47 | ssc.start() 48 | ssc.awaitTermination() 49 | } 50 | 51 | } -------------------------------------------------------------------------------- /Chap9/src/main/scala/org/apress/prospark/L9-3Statistics.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.linalg.Vectors 6 | import org.apache.spark.mllib.stat.Statistics 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | 10 | object StatisticsApp { 11 | 12 | def main(args: Array[String]) { 13 | if (args.length != 4) { 14 | System.err.println( 15 | "Usage: StatisticsApp <appName> <batchInterval> <hostname> <port>") 16 | System.exit(1) 17 | } 18 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 19 | 20 | val conf = new SparkConf() 21 | .setAppName(appName) 22 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 23 | 24 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 25 | 26 | val substream = ssc.socketTextStream(hostname, port.toInt) 27 | .filter(!_.contains("NaN")) 28 | .map(_.split(" ")) 29 | .filter(f => f(1) != "0") 30 | .map(f => f.map(f => f.toDouble)) 31 | 32 | substream.map(f => Vectors.dense(f.slice(1, 5))).foreachRDD(rdd => { 33 | val stats = Statistics.colStats(rdd) 34 | println("Count: " + stats.count) 35 | println("Max: " + stats.max.toArray.mkString(" ")) 36 | println("Min: " + stats.min.toArray.mkString(" ")) 37 | println("Mean: " + stats.mean.toArray.mkString(" ")) 38 | println("L1-Norm: " + stats.normL1.toArray.mkString(" ")) 39 | println("L2-Norm: " + stats.normL2.toArray.mkString(" ")) 40 | println("Number of non-zeros: " + stats.numNonzeros.toArray.mkString(" ")) 41 | println("Variance: " + stats.variance.toArray.mkString(" ")) 42 | }) 43 | 44 | ssc.start() 45 | ssc.awaitTermination() 46 | } 47 | 48 | } -------------------------------------------------------------------------------- /Chap9/src/main/scala/org/apress/prospark/L9-4Correlation.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.linalg.Vectors 6 | import org.apache.spark.mllib.regression.LabeledPoint 7 | import org.apache.spark.mllib.stat.Statistics 8 | import org.apache.spark.streaming.Seconds 9 | import org.apache.spark.streaming.StreamingContext 10 | 11 | object CorrelationApp { 12 | 13 | def main(args: Array[String]) { 14 | if (args.length != 4) { 15 | System.err.println( 16 | "Usage: CorrelationApp <appName> <batchInterval> <hostname> <port>") 17 | System.exit(1) 18 | } 19 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 20 | 21 | val conf = new SparkConf() 22 | .setAppName(appName) 23 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 24 | 25 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 26 | 27 | val substream = ssc.socketTextStream(hostname, port.toInt) 28 | .filter(!_.contains("NaN")) 29 | .map(_.split(" ")) 30 | .filter(f => f(1) != "0") 31 | .map(f => f.map(f => f.toDouble)) 32 | 33 | val datastream = substream.map(f
=> Array(f(1).toDouble, f(2).toDouble, f(4).toDouble, f(5).toDouble, f(6).toDouble)) 34 | 35 | val walkingOrRunning = datastream.filter(f => f(0) == 4.0 || f(0) == 5.0).map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5)))) 36 | walkingOrRunning.map(f => f.features).foreachRDD(rdd => { 37 | val corrSpearman = Statistics.corr(rdd, "spearman") 38 | val corrPearson = Statistics.corr(rdd, "pearson") 39 | println("Correlation Spearman: \n" + corrSpearman) 40 | println("Correlation Pearson: \n" + corrPearson) 41 | }) 42 | 43 | ssc.start() 44 | ssc.awaitTermination() 45 | } 46 | 47 | } -------------------------------------------------------------------------------- /Chap9/src/main/scala/org/apress/prospark/L9-5ChiSq.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.linalg.Vectors 6 | import org.apache.spark.mllib.regression.LabeledPoint 7 | import org.apache.spark.mllib.stat.Statistics 8 | import org.apache.spark.streaming.Seconds 9 | import org.apache.spark.streaming.StreamingContext 10 | 11 | object ChiSqApp { 12 | 13 | def main(args: Array[String]) { 14 | if (args.length != 4) { 15 | System.err.println( 16 | "Usage: ChiSqApp ") 17 | System.exit(1) 18 | } 19 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 20 | 21 | val conf = new SparkConf() 22 | .setAppName(appName) 23 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 24 | 25 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 26 | 27 | val substream = ssc.socketTextStream(hostname, port.toInt) 28 | .filter(!_.contains("NaN")) 29 | .map(_.split(" ")) 30 | .filter(f => f(1) != "0") 31 | .map(f => f.map(f => f.toDouble)) 32 | 33 | substream.map(f => Array(f(1).toDouble, f(2).toDouble, f(4).toDouble, f(5).toDouble, f(6).toDouble)) 34 | .filter(f => f(0) == 4.0 || f(0) == 5.0) 35 | .map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5)))) 36 | .foreachRDD(rdd => { 37 | Statistics.chiSqTest(rdd).zipWithIndex.foreach(v => println("%s, column no. 
%d".format(v._1, v._2))) 38 | }) 39 | 40 | ssc.start() 41 | ssc.awaitTermination() 42 | } 43 | 44 | } -------------------------------------------------------------------------------- /Chap9/src/main/scala/org/apress/prospark/L9-6Preprocessing.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.feature.StandardScaler 6 | import org.apache.spark.mllib.linalg.Vectors 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | 10 | object PreprocessingApp { 11 | 12 | def main(args: Array[String]) { 13 | if (args.length != 4) { 14 | System.err.println( 15 | "Usage: PreprocessingApp <appName> <batchInterval> <hostname> <port>") 16 | System.exit(1) 17 | } 18 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 19 | 20 | val conf = new SparkConf() 21 | .setAppName(appName) 22 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 23 | 24 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 25 | 26 | val substream = ssc.socketTextStream(hostname, port.toInt) 27 | .filter(!_.contains("NaN")) 28 | .map(_.split(" ")) 29 | .filter(f => f(1) != "0") 30 | 31 | substream.map(f => Array(f(2), f(4), f(5), f(6))) 32 | .map(f => f.map(v => v.toDouble)) 33 | .map(f => Vectors.dense(f)) 34 | .foreachRDD(rdd => { 35 | val scalerModel = new StandardScaler().fit(rdd) 36 | val scaledRDD = scalerModel.transform(rdd) 37 | }) 38 | 39 | ssc.start() 40 | ssc.awaitTermination() 41 | } 42 | 43 | } -------------------------------------------------------------------------------- /Chap9/src/main/scala/org/apress/prospark/L9-7FeatureExtraction.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.feature.ChiSqSelector 6 | import org.apache.spark.mllib.linalg.Vectors 7 | import org.apache.spark.mllib.regression.LabeledPoint 8 | import org.apache.spark.streaming.Seconds 9 | import org.apache.spark.streaming.StreamingContext 10 | 11 | object FeatureExtractionApp { 12 | 13 | def main(args: Array[String]) { 14 | if (args.length != 4) { 15 | System.err.println( 16 | "Usage: FeatureExtractionApp <appName> <batchInterval> <hostname> <port>") 17 | System.exit(1) 18 | } 19 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 20 | 21 | val conf = new SparkConf() 22 | .setAppName(appName) 23 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 24 | 25 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 26 | 27 | val substream = ssc.socketTextStream(hostname, port.toInt) 28 | .filter(!_.contains("NaN")) 29 | .map(_.split(" ")) 30 | .filter(f => f(1) != "0") 31 | 32 | val datastream = substream.map(f => Array(f(1), f(4), f(5), f(6), f(20), f(21), f(22), f(36), f(37), f(38))) 33 | .map(f => f.map(v => v.toDouble)) 34 | .map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, f.length).map(f => f / 2048)))) 35 | 36 | datastream.foreachRDD(rdd => { 37 | val selector = new ChiSqSelector(5) 38 | val model = selector.fit(rdd) 39 | val filtered = rdd.map(p => LabeledPoint(p.label, model.transform(p.features))) 40 | filtered.take(20).foreach(println) 41 | }) 42 | 43 | ssc.start() 44 | ssc.awaitTermination() 45 | } 46 | 47 | } -------------------------------------------------------------------------------- /Chap9/src/main/scala/org/apress/prospark/L9-8PCA.scala:
-------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.feature.PCA 6 | import org.apache.spark.mllib.linalg.Vectors 7 | import org.apache.spark.mllib.regression.LabeledPoint 8 | import org.apache.spark.streaming.Seconds 9 | import org.apache.spark.streaming.StreamingContext 10 | 11 | object PCAApp { 12 | 13 | def main(args: Array[String]) { 14 | if (args.length != 4) { 15 | System.err.println( 16 | "Usage: PCAApp ") 17 | System.exit(1) 18 | } 19 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 20 | 21 | val conf = new SparkConf() 22 | .setAppName(appName) 23 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 24 | 25 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 26 | 27 | val substream = ssc.socketTextStream(hostname, port.toInt) 28 | .filter(!_.contains("NaN")) 29 | .map(_.split(" ")) 30 | .filter(f => f(1) != "0") 31 | 32 | val datastream = substream.map(f => Array(f(1), f(4), f(5), f(6), f(20), f(21), f(22), f(36), f(37), f(38))) 33 | .map(f => f.map(v => v.toDouble)) 34 | .map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, f.length)))) 35 | 36 | datastream.foreachRDD(rdd => { 37 | val pca = new PCA(rdd.first().features.size / 2) 38 | .fit(rdd.map(_.features)) 39 | val testTrain = rdd.randomSplit(Array(0.3, 0.7)) 40 | val test = testTrain(0).map(lp => lp.copy(features = pca.transform(lp.features))) 41 | val train = testTrain(1).map(lp => lp.copy(features = pca.transform(lp.features))) 42 | train.take(20).foreach(println) 43 | }) 44 | 45 | ssc.start() 46 | ssc.awaitTermination() 47 | } 48 | 49 | } -------------------------------------------------------------------------------- /Chap9/src/main/scala/org/apress/prospark/L9-9LogisticRegression.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.linalg.Vectors 6 | import org.apache.spark.mllib.regression.LabeledPoint 7 | import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD 8 | import org.apache.spark.rdd.RDD 9 | import org.apache.spark.rdd.RDD.doubleRDDToDoubleRDDFunctions 10 | import org.apache.spark.streaming.Seconds 11 | import org.apache.spark.streaming.StreamingContext 12 | import org.apache.spark.mllib.classification.StreamingLogisticRegressionWithSGD 13 | 14 | object LogisticRegressionApp { 15 | 16 | def main(args: Array[String]) { 17 | if (args.length != 4) { 18 | System.err.println( 19 | "Usage: LogisticRegressionApp ") 20 | System.exit(1) 21 | } 22 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 23 | 24 | val conf = new SparkConf() 25 | .setAppName(appName) 26 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 27 | 28 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 29 | 30 | val substream = ssc.socketTextStream(hostname, port.toInt) 31 | .filter(!_.contains("NaN")) 32 | .map(_.split(" ")) 33 | .filter(f => f(1) != "0") 34 | 35 | val datastream = substream.map(f => Array(f(1).toDouble, f(2).toDouble, f(4).toDouble, f(5).toDouble, f(6).toDouble)) 36 | 37 | val walkingOrRunning = datastream.filter(f => f(0) == 4.0 || f(0) == 5.0).map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5)))) 38 | val test = walkingOrRunning.transform(rdd => rdd.randomSplit(Array(0.3, 0.7))(0)) 39 
| val train = walkingOrRunning.transformWith(test, (r1: RDD[LabeledPoint], r2: RDD[LabeledPoint]) => r1.subtract(r2)).cache() 40 | val model = new StreamingLogisticRegressionWithSGD() 41 | .setInitialWeights(Vectors.zeros(4)) 42 | .setStepSize(0.0001) 43 | .setNumIterations(1) 44 | 45 | model.trainOn(train) 46 | model.predictOnValues(test.map(v => (v.label, v.features))).foreachRDD(rdd => println("MSE: %f".format(rdd 47 | .map(v => math.pow((v._1 - v._2), 2)).mean()))) 48 | 49 | ssc.start() 50 | ssc.awaitTermination() 51 | } 52 | 53 | } -------------------------------------------------------------------------------- /Chap9/src/main/scala/org/apress/prospark/T9-4DataTypes.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.linalg.Matrices 6 | import org.apache.spark.mllib.linalg.Vectors 7 | import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix 8 | import org.apache.spark.mllib.linalg.distributed.IndexedRow 9 | import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix 10 | import org.apache.spark.mllib.linalg.distributed.MatrixEntry 11 | import org.apache.spark.mllib.linalg.distributed.RowMatrix 12 | import org.apache.spark.mllib.regression.LabeledPoint 13 | import org.apache.spark.streaming.Seconds 14 | import org.apache.spark.streaming.StreamingContext 15 | 16 | object DataTypesApp { 17 | 18 | def main(args: Array[String]) { 19 | if (args.length != 4) { 20 | System.err.println( 21 | "Usage: DataTypesApp ") 22 | System.exit(1) 23 | } 24 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 25 | 26 | val conf = new SparkConf() 27 | .setAppName(appName) 28 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 29 | 30 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 31 | 32 | val substream = ssc.socketTextStream(hostname, port.toInt) 33 | .filter(!_.contains("NaN")) 34 | .map(_.split(" ")) 35 | .filter(f => f(1) != "0") 36 | .map(f => f.map(f => f.toDouble)) 37 | 38 | val denseV = substream.map(f => Vectors.dense(f.slice(1, 5))) 39 | denseV.print() 40 | val sparseV = substream.map(f => f.slice(1, 5).toList).map(f => f.zipWithIndex.map { case (s, i) => (i, s) }) 41 | .map(f => f.filter(v => v._2 != 0)).map(l => Vectors.sparse(l.size, l)) 42 | sparseV.print() 43 | val labeledP = substream.map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5)))) 44 | labeledP.print() 45 | val denseM = substream.map(f => Matrices.dense(3, 16, f.slice(3, 19) ++ f.slice(20, 36) ++ f.slice(37, 53))) 46 | denseM.print() 47 | denseV.foreachRDD(rdd => { 48 | val rowM = new RowMatrix(rdd) 49 | println(rowM) 50 | }) 51 | denseV.foreachRDD(rdd => { 52 | val iRdd = rdd.zipWithIndex.map(v => new IndexedRow(v._2, v._1)) 53 | val iRowM = new IndexedRowMatrix(iRdd) 54 | println(iRowM) 55 | }) 56 | substream.foreachRDD(rdd => { 57 | val entries = rdd.zipWithIndex.flatMap(v => List(3, 20, 37).zipWithIndex.map(i => (i._2.toLong, v._2, v._1.slice(i._1, i._1 + 16).toList))) 58 | .map(v => v._3.map(d => new MatrixEntry(v._1, v._2, d))).flatMap(x => x) 59 | val cRowM = new CoordinateMatrix(entries) 60 | println(cRowM) 61 | }) 62 | substream.foreachRDD(rdd => { 63 | val entries = rdd.zipWithIndex.flatMap(v => List(3, 20, 37).zipWithIndex.map(i => (i._2.toLong, v._2, v._1.slice(i._1, i._1 + 16).toList))) 64 | .map(v => v._3.map(d => new MatrixEntry(v._1, v._2, d))).flatMap(x => x) 65 | val blockM = new 
CoordinateMatrix(entries).toBlockMatrix 66 | println(blockM) 67 | }) 68 | 69 | ssc.start() 70 | ssc.awaitTermination() 71 | } 72 | 73 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Pro Spark Streaming 2 | 3 | Code used in "Pro Spark Streaming: The Zen of Real-time Analytics using Apache Spark" published by Apress Publishing. 4 | 5 | ISBN-13: 978-1484214800 6 | 7 | ISBN-10: 1484214803 8 | 9 | # Layout 10 | 11 | Each folder contains code for a particular chapter. The repetition of code is deliberate. While this goes against most software engineering principles (held very dearly by the author as well), it is necessary to expound a topic and keep its implementation self-contained. 12 | 13 | ## Chapters 14 | 15 | - 2: Introduction to Spark 16 | - 3: DStreams: Real-time RDDs 17 | - 4: High Velocity Streams: Parallelism and Other Stories 18 | - 5: Real-time Route 66: Linking External Data Sources 19 | - 6: The Art of Side Effects 20 | - 7: Getting Ready for Prime Time 21 | - 8: Real-time ETL and Analytics Magic 22 | - 9: Machine Learning at Scale 23 | - 10: Of Clouds, Lambdas, and Pythons 24 | 25 | # Build 26 | 27 | Jump to a particular folder and simply execute `sbt assembly`. This will generate an uber JAR that can directly be submitted to a Spark cluster. --------------------------------------------------------------------------------
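For example, to build and run one of the Chapter 9 apps against a local Spark installation: the snippet below is only a sketch. The assembly JAR path assumes sbt-assembly's default naming for this build (Scala 2.10, project name "Chap9", version "1.0"), and the chosen class, master URL, and arguments are illustrative; use whatever path `sbt assembly` reports on your machine and the usage string printed by the app you pick.

```bash
# Build the uber JAR for Chapter 9 (run from the Chap9 folder).
sbt assembly

# Submit one of the example apps; StatisticsApp expects
# <appName> <batchInterval> <hostname> <port> as arguments.
spark-submit \
  --class org.apress.prospark.StatisticsApp \
  --master "local[4]" \
  target/scala-2.10/Chap9-assembly-1.0.jar \
  StatisticsApp 10 localhost 9999
```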