├── .gitignore ├── Chap10 ├── project │ ├── assembly.sbt │ └── plugins.sbt ├── spark.sbt ├── src │ └── main │ │ ├── java │ │ └── org │ │ │ └── apress │ │ │ └── prospark │ │ │ ├── AbstractDriver.java │ │ │ └── SocketDriver.java │ │ ├── resources │ │ └── log4j.properties │ │ └── scala │ │ └── org │ │ └── apress │ │ └── prospark │ │ ├── L10-2DataProc.scala │ │ ├── L10-4LambdaDataproc.scala │ │ ├── L10-4LambdaLocal.scala │ │ └── L10-9Graph.scala └── yelp_pyspark.py ├── Chap2 ├── project │ ├── assembly.sbt │ └── plugins.sbt ├── spark.sbt └── src │ └── main │ └── scala │ └── org │ └── apress │ └── prospark │ ├── L2-1FirstApp.scala │ └── T2-6Accumulator.scala ├── Chap3 ├── project │ ├── assembly.sbt │ └── plugins.sbt ├── spark.sbt ├── src │ └── main │ │ └── scala │ │ └── org │ │ └── apress │ │ └── prospark │ │ ├── L3-1DStreams.scala │ │ ├── L3-DStreamAggregation.scala │ │ ├── L3-DStreamKeyValue.scala │ │ ├── L3-DStreamMapping.scala │ │ ├── L3-DStreamVariation.scala │ │ └── L3-DStreamWindowAndAction.scala └── touch_files_window.sh ├── Chap4 ├── project │ ├── assembly.sbt │ └── plugins.sbt ├── spark.sbt └── src │ └── main │ └── scala │ └── org │ └── apress │ └── prospark │ ├── L4-1Voyager.scala │ ├── L4-3ProtonFlux.scala │ └── L4-4Kryo.scala ├── Chap5 ├── flumeConf │ ├── flumePull.conf │ ├── flumePush.conf │ ├── flumeTest.conf │ └── log4j.properties ├── project │ ├── assembly.sbt │ └── plugins.sbt ├── spark.sbt └── src │ └── main │ ├── java │ └── org │ │ └── apress │ │ └── prospark │ │ ├── AbstractDriver.java │ │ ├── KafkaDriver.java │ │ ├── MqttDriver.java │ │ └── SocketDriver.java │ ├── resources │ └── log4j.properties │ └── scala │ └── org │ └── apress │ └── prospark │ ├── HttpInputDStream.scala │ ├── HttpInputDStreamAsync.scala │ ├── L5-11FlumePull.scala │ ├── L5-11FlumePush.scala │ ├── L5-13Kafka.scala │ ├── L5-14KafkaCustomConf.scala │ ├── L5-15KafkaDirect.scala │ ├── L5-16Twitter.scala │ ├── L5-18Http.scala │ ├── L5-6SocketStream.scala │ ├── L5-7MultipleSocketStreams.scala │ └── L5-9Mqtt.scala ├── Chap6 ├── project │ ├── assembly.sbt │ └── plugins.sbt ├── spark.sbt └── src │ └── main │ ├── java │ └── org │ │ └── apress │ │ └── prospark │ │ ├── AbstractDriver.java │ │ ├── MqttDriver.java │ │ └── SocketDriver.java │ ├── resources │ └── log4j.properties │ └── scala │ └── org │ └── apress │ └── prospark │ ├── HttpInputDStream.scala │ ├── L6-10LazyStatic.scala │ ├── L6-12StaticPool.scala │ ├── L6-14HBase.scala │ ├── L6-16SparkHBase.scala │ ├── L6-18Cassandra.scala │ ├── L6-20CassandraConnector.scala │ ├── L6-22Counters.scala │ ├── L6-23UpdateState.scala │ ├── L6-24Accumulators.scala │ ├── L6-26Redis.scala │ ├── L6-5Exception.scala │ ├── L6-6PerRecord.scala │ ├── L6-7PerPartition.scala │ └── L6-8Static.scala ├── Chap7 ├── project │ ├── assembly.sbt │ └── plugins.sbt ├── spark.sbt └── src │ └── main │ ├── java │ └── org │ │ └── apress │ │ └── prospark │ │ ├── AbstractDriver.java │ │ └── SocketDriver.java │ ├── resources │ └── log4j.properties │ └── scala │ └── org │ └── apress │ └── prospark │ ├── L7-2-3Tachyon.scala │ └── L7-4UI.scala ├── Chap8 ├── L8-36CdrSparkRApp.R ├── L8-39CdrStreamingSparkRApp.R ├── cdrschema.json ├── cdrschema2.json ├── project │ ├── assembly.sbt │ └── plugins.sbt ├── spark.sbt └── src │ └── main │ ├── java │ └── org │ │ └── apress │ │ └── prospark │ │ ├── AbstractDriver.java │ │ └── SocketDriver.java │ ├── resources │ └── log4j.properties │ └── scala │ └── org │ └── apress │ └── prospark │ ├── L8-10-11UDF.scala │ ├── L8-13HiveQL.scala │ ├── L8-14-27DataFrameExamples.scala │ 
├── L8-1DataFrameAPI.scala │ ├── L8-28DataFrameExamplesOps.scala │ ├── L8-29DataFrameExamplesJoin.scala │ ├── L8-3-6-7DataFrameCreation.scala │ ├── L8-35DataFrameExamplesRDD.scala │ ├── L8-38SparkR.scala │ ├── L8-4DataFrameCreationSchema.scala │ ├── L8-8Sql.scala │ ├── T8-3DataFrameExamplesNA.scala │ └── T8-5-L8-30-34DataFrameExamplesActions.scala ├── Chap9 ├── project │ ├── assembly.sbt │ └── plugins.sbt ├── spark.sbt └── src │ └── main │ ├── java │ └── org │ │ └── apress │ │ └── prospark │ │ ├── AbstractDriver.java │ │ └── SocketDriver.java │ ├── resources │ └── log4j.properties │ └── scala │ └── org │ └── apress │ └── prospark │ ├── L9-10KMeans.scala │ ├── L9-11CollabFilteringPreprocessing.scala │ ├── L9-12CollabFiltering.scala │ ├── L9-13FPMiningPreprocessing.scala │ ├── L9-14FPMining.scala │ ├── L9-15MLPipeline.scala │ ├── L9-17MLCrossValidation.scala │ ├── L9-1LinearRegression.scala │ ├── L9-3Statistics.scala │ ├── L9-4Correlation.scala │ ├── L9-5ChiSq.scala │ ├── L9-6Preprocessing.scala │ ├── L9-7FeatureExtraction.scala │ ├── L9-8PCA.scala │ ├── L9-9LogisticRegression.scala │ └── T9-4DataTypes.scala ├── LICENSE └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | 4 | # sbt specific 5 | .cache 6 | .history 7 | .lib/ 8 | dist/* 9 | target/ 10 | lib_managed/ 11 | src_managed/ 12 | project/boot/ 13 | project/plugins/project/ 14 | .cache-main 15 | 16 | # Scala-IDE specific 17 | .scala_dependencies 18 | .worksheet 19 | 20 | # Eclipse 21 | .classpath 22 | .project 23 | .settings/ 24 | .pydevproject 25 | -------------------------------------------------------------------------------- /Chap10/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 2 | -------------------------------------------------------------------------------- /Chap10/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += Classpaths.typesafeResolver 2 | 3 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.4.0") 4 | 5 | addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.8.2") 6 | -------------------------------------------------------------------------------- /Chap10/spark.sbt: -------------------------------------------------------------------------------- 1 | import AssemblyKeys._ 2 | 3 | assemblySettings 4 | 5 | net.virtualvoid.sbt.graph.DependencyGraphSettings.graphSettings 6 | 7 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { mergeStrategy => { 8 | case entry => { 9 | val strategy = mergeStrategy(entry) 10 | if (strategy == MergeStrategy.deduplicate) MergeStrategy.first 11 | else strategy 12 | } 13 | }} 14 | 15 | name := "Chap10" 16 | 17 | version := "1.0" 18 | 19 | scalaVersion := "2.10.5" 20 | 21 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.4.0" 22 | 23 | libraryDependencies += "org.apache.spark" %% "spark-streaming" % "1.4.0" 24 | 25 | libraryDependencies += "org.json4s" %% "json4s-native" % "3.2.10" 26 | 27 | libraryDependencies += "com.google.cloud.bigtable" % "bigtable-hbase-1.1" % "0.2.3" exclude("com.google.guava", "guava") 28 | 29 | libraryDependencies += "org.apache.hbase" % "hbase-server" % "1.1.2" 30 | 31 | libraryDependencies += "org.apache.hbase" % "hbase-common" % "1.1.2" 32 | 33 | libraryDependencies += "com.google.guava" % "guava" % "16.0" 34 | 35 | libraryDependencies += 
"org.mortbay.jetty.alpn" % "alpn-boot" % "8.1.6.v20151105" 36 | 37 | libraryDependencies += "com.google.cloud.bigdataoss" % "bigquery-connector" % "0.7.4-hadoop2" 38 | 39 | libraryDependencies += "org.apache.spark" %% "spark-graphx" % "1.4.0" 40 | 41 | -------------------------------------------------------------------------------- /Chap10/src/main/java/org/apress/prospark/AbstractDriver.java: -------------------------------------------------------------------------------- 1 | package org.apress.prospark; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileInputStream; 6 | import java.io.IOException; 7 | import java.io.InputStreamReader; 8 | import java.util.Enumeration; 9 | import java.util.zip.GZIPInputStream; 10 | import java.util.zip.ZipEntry; 11 | import java.util.zip.ZipFile; 12 | 13 | import org.apache.commons.io.FilenameUtils; 14 | import org.apache.log4j.LogManager; 15 | import org.apache.log4j.Logger; 16 | 17 | public abstract class AbstractDriver { 18 | 19 | private static final Logger LOG = LogManager.getLogger(AbstractDriver.class); 20 | 21 | private String path; 22 | 23 | public AbstractDriver(String path) { 24 | this.path = path; 25 | } 26 | 27 | public abstract void init() throws Exception; 28 | 29 | public abstract void close() throws Exception; 30 | 31 | public abstract void sendRecord(String record) throws Exception; 32 | 33 | public void execute() throws Exception { 34 | 35 | try { 36 | init(); 37 | File dirPath = new File(path); 38 | if (dirPath.isDirectory()) { 39 | File[] files = new File(path).listFiles(); 40 | for (File f : files) { 41 | String ext = FilenameUtils.getExtension(f.getPath()); 42 | if (ext.equals("zip")) { 43 | LOG.info(String.format("Feeding zipped file %s", f.getName())); 44 | ZipFile zFile = null; 45 | try { 46 | zFile = new ZipFile(f); 47 | Enumeration zEntries = zFile.entries(); 48 | 49 | while (zEntries.hasMoreElements()) { 50 | ZipEntry zEntry = zEntries.nextElement(); 51 | LOG.info(String.format("Feeding file %s", zEntry.getName())); 52 | try (BufferedReader br = new BufferedReader( 53 | new InputStreamReader(zFile.getInputStream(zEntry)))) { 54 | // skip header 55 | br.readLine(); 56 | String line; 57 | while ((line = br.readLine()) != null) { 58 | sendRecord(line); 59 | } 60 | } 61 | } 62 | } catch (IOException e) { 63 | LOG.error(e.getMessage()); 64 | } finally { 65 | if (zFile != null) { 66 | try { 67 | zFile.close(); 68 | } catch (IOException e) { 69 | LOG.error(e.getMessage()); 70 | } 71 | } 72 | } 73 | } else if (ext.equals("gz")) { 74 | LOG.info(String.format("Feeding file %s", f.getName())); 75 | try (BufferedReader br = new BufferedReader( 76 | new InputStreamReader(new GZIPInputStream(new FileInputStream(f))))) { 77 | // skip header 78 | br.readLine(); 79 | String line; 80 | while ((line = br.readLine()) != null) { 81 | sendRecord(line); 82 | } 83 | } 84 | } else if (ext.equals("dat") || ext.equals("json")) { 85 | LOG.info(String.format("Feeding dat file %s", f.getName())); 86 | try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f)))) { 87 | String line; 88 | while ((line = br.readLine()) != null) { 89 | sendRecord(line); 90 | } 91 | } 92 | } else { 93 | LOG.warn("Unsupported file type: " + f.getName()); 94 | } 95 | } 96 | } else { 97 | LOG.error(String.format("Path %s is not a directory", path)); 98 | } 99 | } finally { 100 | close(); 101 | } 102 | } 103 | } -------------------------------------------------------------------------------- 
/Chap10/src/main/java/org/apress/prospark/SocketDriver.java: -------------------------------------------------------------------------------- 1 | package org.apress.prospark; 2 | 3 | import java.io.IOException; 4 | import java.net.InetSocketAddress; 5 | import java.nio.ByteBuffer; 6 | import java.nio.channels.ServerSocketChannel; 7 | import java.nio.channels.SocketChannel; 8 | import java.nio.charset.StandardCharsets; 9 | import java.util.concurrent.ExecutionException; 10 | 11 | import org.apache.log4j.LogManager; 12 | import org.apache.log4j.Logger; 13 | 14 | public class SocketDriver extends AbstractDriver { 15 | 16 | private static final Logger LOG = LogManager.getLogger(SocketDriver.class); 17 | 18 | private String hostname; 19 | private int port; 20 | private SocketStream socketStream; 21 | 22 | public SocketDriver(String path, String hostname, int port) { 23 | super(path); 24 | this.hostname = hostname; 25 | this.port = port; 26 | } 27 | 28 | @Override 29 | public void init() throws Exception { 30 | socketStream = new SocketStream(hostname, port); 31 | LOG.info(String.format("Waiting for client to connect on port %d", port)); 32 | SocketChannel socketChan = socketStream.init(); 33 | LOG.info(String.format("Client %s connected on port %d", socketChan.getRemoteAddress(), port)); 34 | socketStream.kickOff(socketChan); 35 | socketStream.start(); 36 | } 37 | 38 | @Override 39 | public void close() throws IOException { 40 | socketStream.done(); 41 | if (socketStream != null) { 42 | socketStream.close(); 43 | } 44 | } 45 | 46 | @Override 47 | public void sendRecord(String record) throws Exception { 48 | socketStream.sendMsg(record + "\n"); 49 | } 50 | 51 | static class SocketStream extends Thread { 52 | 53 | private String hostname; 54 | private int port; 55 | private ServerSocketChannel server; 56 | private volatile boolean isDone = false; 57 | private SocketChannel socket = null; 58 | private long totalBytes; 59 | private long totalLines; 60 | 61 | public SocketStream(String hostname, int port) { 62 | this.hostname = hostname; 63 | this.port = port; 64 | totalBytes = 0; 65 | totalLines = 0; 66 | } 67 | 68 | public SocketChannel init() throws IOException { 69 | server = ServerSocketChannel.open(); 70 | server.bind(new InetSocketAddress(hostname, port)); 71 | LOG.info(String.format("Listening on %s", server.getLocalAddress())); 72 | return server.accept(); 73 | } 74 | 75 | public void kickOff(SocketChannel socket) { 76 | LOG.info("Kicking off data transfer"); 77 | this.socket = socket; 78 | } 79 | 80 | @Override 81 | public void run() { 82 | try { 83 | while (!isDone) { 84 | Thread.sleep(1000); 85 | } 86 | } catch (Exception e) { 87 | LOG.error(e); 88 | } 89 | } 90 | 91 | public void sendMsg(String msg) throws IOException, InterruptedException, ExecutionException { 92 | if (socket != null) { 93 | ByteBuffer buffer = ByteBuffer.wrap(msg.getBytes(StandardCharsets.UTF_8)); 94 | int bytesWritten = socket.write(buffer); 95 | totalBytes += bytesWritten; 96 | } else { 97 | throw new IOException("Client hasn't connected yet!"); 98 | } 99 | totalLines++; 100 | } 101 | 102 | public void done() { 103 | isDone = true; 104 | } 105 | 106 | public void close() throws IOException { 107 | if (socket != null) { 108 | socket.close(); 109 | socket = null; 110 | } 111 | LOG.info(String.format("SocketStream is closing after writing %d bytes and %d lines", totalBytes, 112 | totalLines)); 113 | } 114 | } 115 | 116 | public static void main(String[] args) throws Exception { 117 | 118 | if (args.length != 3) { 119 | 
System.err.println("Usage: SocketDriver "); 120 | System.exit(-1); 121 | } 122 | 123 | String path = args[0]; 124 | String hostname = args[1]; 125 | int port = Integer.parseInt(args[2]); 126 | 127 | SocketDriver driver = new SocketDriver(path, hostname, port); 128 | try { 129 | driver.execute(); 130 | } finally { 131 | driver.close(); 132 | } 133 | } 134 | } -------------------------------------------------------------------------------- /Chap10/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO, stdout 2 | log4j.rootCategory=INFO, stdout 3 | 4 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 5 | log4j.appender.stdout.Target=System.out 6 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 7 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 8 | -------------------------------------------------------------------------------- /Chap10/src/main/scala/org/apress/prospark/L10-2DataProc.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.HashPartitioner 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.streaming.Seconds 7 | import org.apache.spark.streaming.StreamingContext 8 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 9 | import org.json4s.DefaultFormats 10 | import org.json4s.JsonAST.JNothing 11 | import org.json4s.jvalue2extractable 12 | import org.json4s.jvalue2monadic 13 | import org.json4s.native.JsonMethods.parse 14 | import org.json4s.string2JsonInput 15 | 16 | object DataProcApp { 17 | 18 | def main(args: Array[String]) { 19 | if (args.length != 4) { 20 | System.err.println( 21 | "Usage: DataProcApp ") 22 | System.exit(1) 23 | } 24 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 25 | 26 | val conf = new SparkConf() 27 | .setAppName(appName) 28 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 29 | 30 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 31 | 32 | ssc.socketTextStream(hostname, port.toInt) 33 | .map(r => { 34 | implicit val formats = DefaultFormats 35 | parse(r) 36 | }) 37 | .filter(jvalue => { 38 | jvalue \ "attributes" \ "Wi-Fi" != JNothing 39 | }) 40 | .map(jvalue => { 41 | implicit val formats = DefaultFormats 42 | ((jvalue \ "attributes" \ "Wi-Fi").extract[String], (jvalue \ "stars").extract[Int]) 43 | }) 44 | .combineByKey( 45 | (v) => (v, 1), 46 | (accValue: (Int, Int), v) => (accValue._1 + v, accValue._2 + 1), 47 | (accCombine1: (Int, Int), accCombine2: (Int, Int)) => (accCombine1._1 + accCombine2._1, accCombine1._2 + accCombine2._2), 48 | new HashPartitioner(ssc.sparkContext.defaultParallelism)) 49 | .map({ case (k, v) => (k, v._1 / v._2.toFloat) }) 50 | .print() 51 | 52 | ssc.start() 53 | ssc.awaitTermination() 54 | } 55 | 56 | } -------------------------------------------------------------------------------- /Chap10/src/main/scala/org/apress/prospark/L10-9Graph.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.graphx.Edge 6 | import org.apache.spark.graphx.Graph 7 | import org.apache.spark.graphx.Graph.graphToGraphOps 8 | import org.apache.spark.streaming.Seconds 9 | import 
org.apache.spark.streaming.StreamingContext 10 | import org.json4s.DefaultFormats 11 | import org.json4s.jvalue2extractable 12 | import org.json4s.jvalue2monadic 13 | import org.json4s.native.JsonMethods.parse 14 | import org.json4s.string2JsonInput 15 | 16 | object UserRankApp { 17 | 18 | def main(args: Array[String]) { 19 | if (args.length != 4) { 20 | System.err.println( 21 | "Usage: UserRankApp ") 22 | System.exit(1) 23 | } 24 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 25 | 26 | val conf = new SparkConf() 27 | .setAppName(appName) 28 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 29 | 30 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 31 | 32 | ssc.socketTextStream(hostname, port.toInt) 33 | .map(r => { 34 | implicit val formats = DefaultFormats 35 | parse(r) 36 | }) 37 | .foreachRDD(rdd => { 38 | val edges = rdd.map(jvalue => { 39 | implicit val formats = DefaultFormats 40 | ((jvalue \ "user_id").extract[String], (jvalue \ "friends").extract[Array[String]]) 41 | }) 42 | .flatMap(r => r._2.map(f => Edge(r._1.hashCode.toLong, f.hashCode.toLong, 1.0))) 43 | 44 | val vertices = rdd.map(jvalue => { 45 | implicit val formats = DefaultFormats 46 | ((jvalue \ "user_id").extract[String]) 47 | }) 48 | .map(r => (r.hashCode.toLong, r)) 49 | 50 | val tolerance = 0.0001 51 | val graph = Graph(vertices, edges, "defaultUser") 52 | .subgraph(vpred = (id, idStr) => idStr != "defaultUser") 53 | val pr = graph.pageRank(tolerance).cache 54 | 55 | graph.outerJoinVertices(pr.vertices) { 56 | (userId, attrs, rank) => (rank.getOrElse(0.0).asInstanceOf[Number].doubleValue, attrs) 57 | }.vertices.top(10) { 58 | Ordering.by(_._2._1) 59 | }.foreach(rec => println("User id: %s, Rank: %f".format(rec._2._2, rec._2._1))) 60 | }) 61 | 62 | ssc.start() 63 | ssc.awaitTermination() 64 | 65 | } 66 | 67 | } -------------------------------------------------------------------------------- /Chap10/yelp_pyspark.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkContext 2 | from pyspark.streaming import StreamingContext 3 | from sys import argv, exit 4 | try: import simplejson as json 5 | except ImportError: import json 6 | 7 | if len(argv) != 5: 8 | print 'Usage: yelp_pyspark.py ' 9 | exit(-1) 10 | 11 | appname = argv[1] 12 | batch_interval = int(argv[2]) 13 | hostname = argv[3] 14 | port = int(argv[4]) 15 | 16 | sc = SparkContext(appName=appname) 17 | ssc = StreamingContext(sc, batch_interval) 18 | 19 | records = ssc.socketTextStream(hostname, port) 20 | json_records = records.map(lambda rec: json.loads(rec)) 21 | restaurant_records = json_records.filter(lambda rec: 'attributes' in rec and 'Wi-Fi' in rec['attributes']) 22 | wifi_pairs = restaurant_records.map(lambda rec: (rec['attributes']['Wi-Fi'], rec['stars'])) 23 | wifi_counts = wifi_pairs.combineByKey(lambda v: (v, 1), 24 | lambda x, value: (x[0] + value, x[1] + 1), 25 | lambda x, y: (x[0] + y[0], x[1] + y[1])) 26 | avg_stars = wifi_counts.map(lambda (key, (sum_, count)): (key, sum_ / count)) 27 | avg_stars.pprint() 28 | 29 | ssc.start() 30 | ssc.awaitTermination() 31 | -------------------------------------------------------------------------------- /Chap2/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 2 | -------------------------------------------------------------------------------- /Chap2/project/plugins.sbt: 
-------------------------------------------------------------------------------- 1 | resolvers += Classpaths.typesafeResolver 2 | 3 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.4.0") -------------------------------------------------------------------------------- /Chap2/spark.sbt: -------------------------------------------------------------------------------- 1 | import AssemblyKeys._ 2 | 3 | assemblySettings 4 | 5 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { mergeStrategy => { 6 | case entry => { 7 | val strategy = mergeStrategy(entry) 8 | if (strategy == MergeStrategy.deduplicate) MergeStrategy.first 9 | else strategy 10 | } 11 | }} 12 | 13 | name := "Chap2" 14 | 15 | version := "1.0" 16 | 17 | scalaVersion := "2.10.5" 18 | 19 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.4.0" 20 | -------------------------------------------------------------------------------- /Chap2/src/main/scala/org/apress/prospark/L2-1FirstApp.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.io.Source 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | 8 | object TranslateApp { 9 | def main(args: Array[String]) { 10 | if (args.length != 4) { 11 | System.err.println( 12 | "Usage: TranslateApp ") 13 | System.exit(1) 14 | } 15 | val Seq(appName, bookPath, outputPath, lang) = args.toSeq 16 | 17 | val dict = getDictionary(lang) 18 | 19 | val conf = new SparkConf() 20 | .setAppName(appName) 21 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 22 | val sc = new SparkContext(conf) 23 | val book = sc.textFile(bookPath) 24 | val translated = book.map(line => line.split("\\s+").map(word => dict.getOrElse(word, word)).mkString(" ")) 25 | translated.saveAsTextFile(outputPath) 26 | } 27 | 28 | def getDictionary(lang: String): Map[String, String] = { 29 | if (!Set("German", "French", "Italian", "Spanish").contains(lang)) { 30 | System.err.println( 31 | "Unsupported language: %s".format(lang)) 32 | System.exit(1) 33 | } 34 | val url = "http://www.june29.com/IDP/files/%s.txt".format(lang) 35 | println("Grabbing dictionary from: %s".format(url)) 36 | Source.fromURL(url, "ISO-8859-1").mkString 37 | .split("\\r?\\n") 38 | .filter(line => !line.startsWith("#")) 39 | .map(line => line.split("\\t")) 40 | .map(tkns => (tkns(0).trim, tkns(1).trim)).toMap 41 | } 42 | 43 | } -------------------------------------------------------------------------------- /Chap2/src/main/scala/org/apress/prospark/T2-6Accumulator.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.collection.mutable 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | 8 | object AccumulatorApp { 9 | def main(args: Array[String]) { 10 | if (args.length != 1) { 11 | System.err.println( 12 | "Usage: AccumulatorApp ") 13 | System.exit(1) 14 | } 15 | val Seq(appName) = args.toSeq 16 | 17 | val conf = new SparkConf() 18 | .setAppName(appName) 19 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 20 | .set("spark.eventLog.enabled", true.toString) 21 | .set("spark.eventLog.dir", "/tmp") 22 | val sc = new SparkContext(conf) 23 | val setAcc = sc.accumulableCollection(mutable.HashSet[Int]()) 24 | val d = sc.parallelize(1 to 100) 25 | d.foreach(x => setAcc += x) 26 | println(setAcc.value.size) 27 | } 28 | } 
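Note on T2-6Accumulator: the set accumulator is a side channel — each task adds elements and the driver reads the merged result after the action. When only the resulting number matters (here, the count of distinct elements), the same answer can be expressed with plain RDD transformations. A minimal sketch, not part of the repository, using the same Spark 1.x API:

package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

// Hypothetical sketch: the distinct-element count from AccumulatorApp written as
// RDD transformations. distinct() deduplicates across partitions and count() is
// the action, so no accumulator (and no driver-side mutable state) is needed.
object DistinctCountApp {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("DistinctCountApp")
    val sc = new SparkContext(conf)
    val d = sc.parallelize(1 to 100)
    println(d.distinct().count())   // 100, the same value as setAcc.value.size above
    sc.stop()
  }
}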
-------------------------------------------------------------------------------- /Chap3/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 2 | -------------------------------------------------------------------------------- /Chap3/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += Classpaths.typesafeResolver 2 | 3 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.4.0") 4 | -------------------------------------------------------------------------------- /Chap3/spark.sbt: -------------------------------------------------------------------------------- 1 | import AssemblyKeys._ 2 | 3 | assemblySettings 4 | 5 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { mergeStrategy => { 6 | case entry => { 7 | val strategy = mergeStrategy(entry) 8 | if (strategy == MergeStrategy.deduplicate) MergeStrategy.first 9 | else strategy 10 | } 11 | }} 12 | 13 | name := "Chap3" 14 | 15 | version := "1.0" 16 | 17 | scalaVersion := "2.10.5" 18 | 19 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.4.0" 20 | 21 | libraryDependencies += "org.apache.spark" %% "spark-streaming" % "1.4.0" 22 | 23 | libraryDependencies += "org.json4s" %% "json4s-native" % "3.2.10" 24 | -------------------------------------------------------------------------------- /Chap3/src/main/scala/org/apress/prospark/L3-1DStreams.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.io.Source 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.streaming.Seconds 7 | import org.apache.spark.streaming.StreamingContext 8 | import org.apache.hadoop.io.LongWritable 9 | import org.apache.hadoop.fs.Path 10 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat 11 | import org.apache.hadoop.io.Text 12 | 13 | object StreamingTranslateApp { 14 | def main(args: Array[String]) { 15 | if (args.length != 4) { 16 | System.err.println( 17 | "Usage: StreamingTranslateApp ") 18 | System.exit(1) 19 | } 20 | val Seq(appName, bookPath, outputPath, lang) = args.toSeq 21 | 22 | val dict = getDictionary(lang) 23 | 24 | val conf = new SparkConf() 25 | .setAppName(appName) 26 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 27 | val ssc = new StreamingContext(conf, Seconds(1)) 28 | 29 | val book = ssc.textFileStream(bookPath) 30 | val translated = book.map(line => line.split("\\s+").map(word => dict.getOrElse(word, word)).mkString(" ")) 31 | translated.saveAsTextFiles(outputPath) 32 | 33 | ssc.start() 34 | ssc.awaitTermination() 35 | } 36 | 37 | def getDictionary(lang: String): Map[String, String] = { 38 | if (!Set("German", "French", "Italian", "Spanish").contains(lang)) { 39 | System.err.println( 40 | "Unsupported language: %s".format(lang)) 41 | System.exit(1) 42 | } 43 | val url = "http://www.june29.com/IDP/files/%s.txt".format(lang) 44 | println("Grabbing dictionary from: %s".format(url)) 45 | Source.fromURL(url, "ISO-8859-1").mkString 46 | .split("\\r?\\n") 47 | .filter(line => !line.startsWith("#")) 48 | .map(line => line.split("\\t")) 49 | .map(tkns => (tkns(0).trim, tkns(1).trim)).toMap 50 | } 51 | 52 | } -------------------------------------------------------------------------------- /Chap3/src/main/scala/org/apress/prospark/L3-DStreamAggregation.scala: 
-------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext } 6 | import org.apache.hadoop.io.{ Text, LongWritable, IntWritable } 7 | import org.apache.hadoop.fs.Path 8 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat 9 | import org.apache.spark.streaming.dstream.DStream 10 | import org.apache.hadoop.mapred.TextOutputFormat 11 | import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat } 12 | import org.apache.spark.streaming.dstream.PairDStreamFunctions 13 | import org.apache.log4j.LogManager 14 | import org.json4s._ 15 | import org.json4s.native.JsonMethods._ 16 | import java.text.SimpleDateFormat 17 | import java.util.Date 18 | 19 | object RedditAggregationApp { 20 | def main(args: Array[String]) { 21 | if (args.length != 2) { 22 | System.err.println( 23 | "Usage: RedditAggregationApp ") 24 | System.exit(1) 25 | } 26 | val Seq(appName, inputPath) = args.toSeq 27 | val LOG = LogManager.getLogger(this.getClass) 28 | 29 | val conf = new SparkConf() 30 | .setAppName(appName) 31 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 32 | 33 | val ssc = new StreamingContext(conf, Seconds(1)) 34 | LOG.info("Started at %d".format(ssc.sparkContext.startTime)) 35 | 36 | val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) 37 | 38 | val recCount = comments.count() 39 | 40 | val recCountValue = comments.countByValue() 41 | 42 | val totalWords = comments.map(rec => ((parse(rec) \ "body").values.toString)) 43 | .flatMap(body => body.split(" ")) 44 | .map(word => 1) 45 | .reduce(_ + _) 46 | 47 | ssc.start() 48 | ssc.awaitTermination() 49 | 50 | } 51 | } -------------------------------------------------------------------------------- /Chap3/src/main/scala/org/apress/prospark/L3-DStreamMapping.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext } 6 | import org.apache.hadoop.io.{ Text, LongWritable, IntWritable } 7 | import org.apache.hadoop.fs.Path 8 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat 9 | import org.apache.spark.streaming.dstream.DStream 10 | import org.apache.hadoop.mapred.TextOutputFormat 11 | import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat } 12 | import org.apache.spark.streaming.dstream.PairDStreamFunctions 13 | import org.apache.log4j.LogManager 14 | import org.json4s._ 15 | import org.json4s.native.JsonMethods._ 16 | import java.text.SimpleDateFormat 17 | import java.util.Date 18 | 19 | object RedditMappingApp { 20 | def main(args: Array[String]) { 21 | if (args.length != 2) { 22 | System.err.println( 23 | "Usage: RedditMappingApp ") 24 | System.exit(1) 25 | } 26 | val Seq(appName, inputPath) = args.toSeq 27 | val LOG = LogManager.getLogger(this.getClass) 28 | 29 | val conf = new SparkConf() 30 | .setAppName(appName) 31 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 32 | 33 | val ssc = new StreamingContext(conf, Seconds(1)) 34 | LOG.info("Started at %d".format(ssc.sparkContext.startTime)) 35 | 36 | val comments = ssc.fileStream[LongWritable, 
Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) 37 | 38 | val sdf = new SimpleDateFormat("yyyy-MM-dd") 39 | val tsKey = "created_utc" 40 | val secs = 1000L 41 | val keyedByDay = comments.map(rec => { 42 | val ts = (parse(rec) \ tsKey).values 43 | (sdf.format(new Date(ts.toString.toLong * secs)), rec) 44 | }) 45 | 46 | val keyedByDayPart = comments.mapPartitions(iter => { 47 | var ret = List[(String, String)]() 48 | while (iter.hasNext) { 49 | val rec = iter.next 50 | val ts = (parse(rec) \ tsKey).values 51 | ret.::=(sdf.format(new Date(ts.toString.toLong * secs)), rec) 52 | } 53 | ret.iterator 54 | }) 55 | 56 | val wordTokens = comments.map(rec => { 57 | ((parse(rec) \ "body")).values.toString.split(" ") 58 | }) 59 | 60 | val wordTokensFlat = comments.flatMap(rec => { 61 | ((parse(rec) \ "body")).values.toString.split(" ") 62 | }) 63 | 64 | val filterSubreddit = comments.filter(rec => 65 | (parse(rec) \ "subreddit").values.toString.equals("AskReddit")) 66 | 67 | val sortedByAuthor = comments.transform(rdd => 68 | (rdd.sortBy(rec => (parse(rec) \ "author").values.toString))) 69 | 70 | ssc.start() 71 | ssc.awaitTermination() 72 | 73 | } 74 | } -------------------------------------------------------------------------------- /Chap3/src/main/scala/org/apress/prospark/L3-DStreamVariation.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext } 6 | import org.apache.hadoop.io.{ Text, LongWritable, IntWritable } 7 | import org.apache.hadoop.fs.Path 8 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat 9 | import org.apache.spark.streaming.dstream.DStream 10 | import org.apache.hadoop.mapred.TextOutputFormat 11 | import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat } 12 | import org.apache.spark.streaming.dstream.PairDStreamFunctions 13 | import org.apache.log4j.LogManager 14 | import org.json4s._ 15 | import org.json4s.native.JsonMethods._ 16 | import java.text.SimpleDateFormat 17 | import java.util.Date 18 | 19 | object RedditVariationApp { 20 | def main(args: Array[String]) { 21 | if (args.length != 2) { 22 | System.err.println( 23 | "Usage: RedditVariationApp ") 24 | System.exit(1) 25 | } 26 | val Seq(appName, inputPath) = args.toSeq 27 | val LOG = LogManager.getLogger(this.getClass) 28 | 29 | val conf = new SparkConf() 30 | .setAppName(appName) 31 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 32 | 33 | val ssc = new StreamingContext(conf, Seconds(1)) 34 | LOG.info("Started at %d".format(ssc.sparkContext.startTime)) 35 | 36 | val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) 37 | 38 | val merged = comments.union(comments) 39 | 40 | val repartitionedComments = comments.repartition(4) 41 | 42 | val rddMin = comments.glom().map(arr => 43 | arr.minBy(rec => ((parse(rec) \ "created_utc").values.toString.toInt))) 44 | 45 | ssc.start() 46 | ssc.awaitTermination() 47 | 48 | } 49 | } -------------------------------------------------------------------------------- /Chap3/src/main/scala/org/apress/prospark/L3-DStreamWindowAndAction.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import 
org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext } 6 | import org.apache.hadoop.io.{ Text, LongWritable, IntWritable } 7 | import org.apache.hadoop.fs.Path 8 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat 9 | import org.apache.spark.streaming.dstream.DStream 10 | import org.apache.hadoop.mapred.TextOutputFormat 11 | import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat } 12 | import org.apache.spark.streaming.dstream.PairDStreamFunctions 13 | import org.apache.log4j.LogManager 14 | import org.json4s._ 15 | import org.json4s.native.JsonMethods._ 16 | import java.text.SimpleDateFormat 17 | import java.util.Date 18 | import org.apache.spark.HashPartitioner 19 | 20 | object RedditWindowAndActionApp { 21 | def main(args: Array[String]) { 22 | if (args.length != 2) { 23 | System.err.println( 24 | "Usage: RedditWindowAndActionApp ") 25 | System.exit(1) 26 | } 27 | val Seq(appName, inputPath) = args.toSeq 28 | val LOG = LogManager.getLogger(this.getClass) 29 | 30 | val conf = new SparkConf() 31 | .setAppName(appName) 32 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 33 | 34 | val ssc = new StreamingContext(conf, Seconds(1)) 35 | LOG.info("Started at %d".format(ssc.sparkContext.startTime)) 36 | 37 | val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) 38 | 39 | val checkpointPath = "/tmp" 40 | ssc.checkpoint(checkpointPath) 41 | val updateFunc = (values: Seq[Int], state: Option[Int]) => { 42 | val currentCount = values.sum 43 | val previousCount = state.getOrElse(0) 44 | Some(currentCount + previousCount) 45 | } 46 | val keyedBySubredditState = comments.map(rec => (((parse(rec)) \ "subreddit").values.toString, 1)) 47 | val globalCount = keyedBySubredditState.updateStateByKey(updateFunc) 48 | .map(r => (r._2, r._1)) 49 | .transform(rdd => rdd.sortByKey(ascending = false)) 50 | 51 | val distinctSubreddits = comments.map(rec => ((parse(rec)) \ "subreddit").values.toString) 52 | val windowedRecs = distinctSubreddits.window(Seconds(5), Seconds(5)) 53 | val windowedCounts = windowedRecs.countByValue() 54 | 55 | windowedCounts.print(10) 56 | windowedCounts.saveAsObjectFiles("subreddit", "obj") 57 | windowedCounts.saveAsTextFiles("subreddit", "txt") 58 | 59 | globalCount.saveAsHadoopFiles("subreddit", "hadoop", 60 | classOf[IntWritable], classOf[Text], classOf[TextOutputFormat[IntWritable, Text]]) 61 | globalCount.saveAsNewAPIHadoopFiles("subreddit", "newhadoop", 62 | classOf[IntWritable], classOf[Text], classOf[NewTextOutputFormat[IntWritable, Text]]) 63 | comments.foreachRDD(rdd => { 64 | LOG.info("RDD: %s, Count: %d".format(rdd.id, rdd.count())) 65 | }) 66 | 67 | ssc.start() 68 | ssc.awaitTermination() 69 | 70 | } 71 | } -------------------------------------------------------------------------------- /Chap3/touch_files_window.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | for i in `seq 1 10`; 3 | do 4 | p=/Users/zubairnabi/Downloads/dummy/${i}.gz 5 | echo ${p} 6 | touch -c ${p} 7 | sleep 1 8 | done 9 | -------------------------------------------------------------------------------- /Chap4/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 2 | 
-------------------------------------------------------------------------------- /Chap4/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += Classpaths.typesafeResolver 2 | 3 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.4.0") 4 | -------------------------------------------------------------------------------- /Chap4/spark.sbt: -------------------------------------------------------------------------------- 1 | import AssemblyKeys._ 2 | 3 | assemblySettings 4 | 5 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { mergeStrategy => { 6 | case entry => { 7 | val strategy = mergeStrategy(entry) 8 | if (strategy == MergeStrategy.deduplicate) MergeStrategy.first 9 | else strategy 10 | } 11 | }} 12 | 13 | name := "Chap4" 14 | 15 | version := "1.0" 16 | 17 | scalaVersion := "2.10.5" 18 | 19 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.4.0" 20 | 21 | libraryDependencies += "org.apache.spark" %% "spark-streaming" % "1.4.0" 22 | 23 | libraryDependencies += "org.json4s" %% "json4s-native" % "3.2.10" 24 | -------------------------------------------------------------------------------- /Chap4/src/main/scala/org/apress/prospark/L4-1Voyager.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.hadoop.fs.Path 4 | import org.apache.hadoop.io.LongWritable 5 | import org.apache.hadoop.io.Text 6 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat 7 | import org.apache.spark.SparkConf 8 | import org.apache.spark.SparkContext 9 | import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions 10 | import org.apache.spark.streaming.Seconds 11 | import org.apache.spark.streaming.StreamingContext 12 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 13 | 14 | object VoyagerApp { 15 | def main(args: Array[String]) { 16 | if (args.length != 3) { 17 | System.err.println( 18 | "Usage: VoyagerApp ") 19 | System.exit(1) 20 | } 21 | val Seq(appName, inputPath, outputPath) = args.toSeq 22 | 23 | val conf = new SparkConf() 24 | .setAppName(appName) 25 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 26 | .set("spark.executor.extraJavaOptions", "-XX:+UseConcMarkSweepGC") 27 | 28 | val ssc = new StreamingContext(conf, Seconds(10)) 29 | 30 | val voyager1 = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) 31 | voyager1.map(rec => { 32 | val attrs = rec.split("\\s+") 33 | ((attrs(0).toInt), attrs.slice(18, 28).map(_.toDouble)) 34 | }).filter(pflux => pflux._2.exists(_ > 1.0)).map(rec => (rec._1, 1)) 35 | .reduceByKey(_ + _) 36 | .transform(rec => rec.sortByKey(ascending = false, numPartitions = 1)).saveAsTextFiles(outputPath) 37 | 38 | ssc.start() 39 | ssc.awaitTermination() 40 | } 41 | } -------------------------------------------------------------------------------- /Chap4/src/main/scala/org/apress/prospark/L4-3ProtonFlux.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import com.esotericsoftware.kryo.{KryoSerializable,Kryo} 4 | import com.esotericsoftware.kryo.io.{Output, Input} 5 | 6 | class ProtonFlux( 7 | var year: Int, 8 | var bin0_57to1_78: Double, 9 | var bin3_40to17_6: Double, 10 | var bin22_0to31_0: Double, 11 | var bin1_894to2_605: Double, 12 | var bin4_200to6_240: Double, 13 | var bin3_256to8_132: Double, 14 | 
var bin3_276to8_097: Double, 15 | var bin6_343to42_03: Double, 16 | var bin17_88to26_81: Double, 17 | var bin30_29to69_47: Double, 18 | var bin132_8to242_0: Double 19 | ) extends KryoSerializable { 20 | 21 | def this(year: String, bin0_57to1_78: String, bin3_40to17_6: String, 22 | bin22_0to31_0: String, bin1_894to2_605: String, bin4_200to6_240: String, 23 | bin3_256to8_132: String, bin3_276to8_097: String, bin6_343to42_03: String, 24 | bin17_88to26_81: String, bin30_29to69_47: String, bin132_8to242_0: String) { 25 | this(year.toInt, bin0_57to1_78.toDouble, bin3_40to17_6.toDouble, 26 | bin22_0to31_0.toDouble, bin1_894to2_605.toDouble, bin4_200to6_240.toDouble, 27 | bin3_256to8_132.toDouble, bin3_276to8_097.toDouble, bin6_343to42_03.toDouble, 28 | bin17_88to26_81.toDouble, bin30_29to69_47.toDouble, bin132_8to242_0.toDouble) 29 | } 30 | 31 | def isSolarStorm = (bin0_57to1_78 > 1.0 || bin3_40to17_6 > 1.0 32 | || bin22_0to31_0 > 1.0 || bin1_894to2_605 > 1.0 || bin4_200to6_240 > 1.0 33 | || bin3_256to8_132 > 1.0 || bin3_276to8_097 > 1.0 || bin6_343to42_03 > 1.0 34 | || bin17_88to26_81 > 1.0 || bin30_29to69_47 > 1.0 || bin132_8to242_0 > 1.0) 35 | 36 | override def write(kryo: Kryo, output: Output) { 37 | output.writeInt(year) 38 | output.writeDouble(bin0_57to1_78) 39 | output.writeDouble(bin3_40to17_6) 40 | output.writeDouble(bin22_0to31_0) 41 | output.writeDouble(bin1_894to2_605) 42 | output.writeDouble(bin4_200to6_240) 43 | output.writeDouble(bin3_256to8_132) 44 | output.writeDouble(bin3_276to8_097) 45 | output.writeDouble(bin6_343to42_03) 46 | output.writeDouble(bin17_88to26_81) 47 | output.writeDouble(bin30_29to69_47) 48 | output.writeDouble(bin132_8to242_0) 49 | } 50 | 51 | override def read(kryo: Kryo, input: Input) { 52 | year = input.readInt() 53 | bin0_57to1_78 = input.readDouble() 54 | bin3_40to17_6 = input.readDouble() 55 | bin22_0to31_0 = input.readDouble() 56 | bin1_894to2_605 = input.readDouble() 57 | bin4_200to6_240 = input.readDouble() 58 | bin3_256to8_132 = input.readDouble() 59 | bin3_276to8_097 = input.readDouble() 60 | bin6_343to42_03 = input.readDouble() 61 | bin17_88to26_81 = input.readDouble() 62 | bin30_29to69_47 = input.readDouble() 63 | bin132_8to242_0 = input.readDouble() 64 | } 65 | 66 | } -------------------------------------------------------------------------------- /Chap4/src/main/scala/org/apress/prospark/L4-4Kryo.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.hadoop.fs.Path 4 | import org.apache.hadoop.io.LongWritable 5 | import org.apache.hadoop.io.Text 6 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat 7 | import org.apache.spark.SparkConf 8 | import org.apache.spark.SparkContext 9 | import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions 10 | import org.apache.spark.streaming.Seconds 11 | import org.apache.spark.streaming.StreamingContext 12 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 13 | 14 | object VoyagerAppKryo { 15 | def main(args: Array[String]) { 16 | if (args.length != 3) { 17 | System.err.println( 18 | "Usage: VoyagerAppKryo ") 19 | System.exit(1) 20 | } 21 | val Seq(appName, inputPath, outputPath) = args.toSeq 22 | 23 | val conf = new SparkConf() 24 | .setAppName(appName) 25 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 26 | .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") 27 | .registerKryoClasses(Array(classOf[ProtonFlux])) 28 | 29 | val ssc = new StreamingContext(conf, 
Seconds(10)) 30 | 31 | val voyager1 = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) 32 | val projected = voyager1.map(rec => { 33 | val attrs = rec.split("\\s+") 34 | new ProtonFlux(attrs(0), attrs(18), attrs(19), attrs(20), attrs(21), 35 | attrs(22), attrs(23), attrs(24), attrs(25), attrs(26), attrs(27), 36 | attrs(28)) 37 | }) 38 | val filtered = projected.filter(pflux => pflux.isSolarStorm) 39 | val yearlyBreakdown = filtered.map(rec => (rec.year, 1)) 40 | .reduceByKey(_ + _) 41 | .transform(rec => rec.sortByKey(ascending = false)) 42 | yearlyBreakdown.saveAsTextFiles(outputPath) 43 | 44 | ssc.start() 45 | ssc.awaitTermination() 46 | } 47 | } -------------------------------------------------------------------------------- /Chap5/flumeConf/flumePull.conf: -------------------------------------------------------------------------------- 1 | # components on this agent 2 | a1.sources = src-1 3 | a1.sinks = snk-1 4 | a1.channels = ch-1 5 | 6 | # source 7 | a1.sources.src-1.type = spooldir 8 | a1.sources.src-1.channels = ch-1 9 | a1.sources.src-1.spoolDir = /Users/zubairnabi/Downloads/nyc_bikes 10 | 11 | # sink 12 | a1.sinks.snk-1.type = org.apache.spark.streaming.flume.sink.SparkSink 13 | a1.sinks.snk-1.hostname = localhost 14 | a1.sinks.snk-1.port = 44444 15 | 16 | # channel 17 | a1.channels.ch-1.type = memory 18 | a1.channels.ch-1.capacity = 10000 19 | a1.channels.ch-1.transactionCapacity = 1000 20 | 21 | # bind source, sink, and channel 22 | a1.sources.src-1.channels = ch-1 23 | a1.sinks.snk-1.channel = ch-1 24 | -------------------------------------------------------------------------------- /Chap5/flumeConf/flumePush.conf: -------------------------------------------------------------------------------- 1 | # components on this agent 2 | a1.sources = src-1 3 | a1.sinks = snk-1 4 | a1.channels = ch-1 5 | 6 | # source 7 | a1.sources.src-1.type = spooldir 8 | a1.sources.src-1.channels = ch-1 9 | a1.sources.src-1.spoolDir = /Users/zubairnabi/Downloads/nyc_bikes 10 | 11 | # sink 12 | a1.sinks.snk-1.type = avro 13 | a1.sinks.snk-1.hostname = localhost 14 | a1.sinks.snk-1.port = 44444 15 | 16 | # channel 17 | a1.channels.ch-1.type = memory 18 | a1.channels.ch-1.capacity = 10000 19 | a1.channels.ch-1.transactionCapacity = 1000 20 | 21 | # bind source, sink, and channel 22 | a1.sources.src-1.channels = ch-1 23 | a1.sinks.snk-1.channel = ch-1 24 | -------------------------------------------------------------------------------- /Chap5/flumeConf/flumeTest.conf: -------------------------------------------------------------------------------- 1 | # Name the components on this agent 2 | a1.sources = r1 3 | a1.sinks = k1 4 | a1.channels = c1 5 | 6 | # Describe/configure the source 7 | a1.sources.r1.type = netcat 8 | a1.sources.r1.bind = localhost 9 | a1.sources.r1.port = 44444 10 | 11 | # Describe the sink 12 | a1.sinks.k1.type = logger 13 | 14 | # Use a channel which buffers events in memory 15 | a1.channels.c1.type = memory 16 | a1.channels.c1.capacity = 1000 17 | a1.channels.c1.transactionCapacity = 100 18 | 19 | # Bind the source and sink to the channel 20 | a1.sources.r1.channels = c1 21 | a1.sinks.k1.channel = c1 22 | -------------------------------------------------------------------------------- /Chap5/flumeConf/log4j.properties: -------------------------------------------------------------------------------- 1 | # Root logger option 2 | log4j.rootLogger=INFO, stdout 3 | 4 | # Direct log messages to stdout 5 | 
log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.Target=System.out 7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 9 | -------------------------------------------------------------------------------- /Chap5/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 2 | -------------------------------------------------------------------------------- /Chap5/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += Classpaths.typesafeResolver 2 | 3 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.4.0") 4 | 5 | addSbtPlugin("org.scala-sbt.plugins" % "sbt-onejar" % "0.8") 6 | -------------------------------------------------------------------------------- /Chap5/spark.sbt: -------------------------------------------------------------------------------- 1 | import AssemblyKeys._ 2 | 3 | assemblySettings 4 | 5 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { mergeStrategy => { 6 | case entry => { 7 | val strategy = mergeStrategy(entry) 8 | if (strategy == MergeStrategy.deduplicate) MergeStrategy.first 9 | else strategy 10 | } 11 | }} 12 | 13 | name := "Chap5" 14 | 15 | version := "1.0" 16 | 17 | scalaVersion := "2.10.5" 18 | 19 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.4.0" 20 | 21 | libraryDependencies += "org.apache.spark" %% "spark-streaming" % "1.4.0" 22 | 23 | libraryDependencies += "org.json4s" %% "json4s-native" % "3.2.10" 24 | 25 | libraryDependencies += "org.apache.spark" %% "spark-streaming-mqtt" % "1.4.0" 26 | 27 | libraryDependencies += "org.eclipse.paho" % "org.eclipse.paho.client.mqttv3" % "1.0.1" 28 | 29 | libraryDependencies += "org.apache.spark" %% "spark-streaming-flume" % "1.4.0" 30 | 31 | libraryDependencies += "org.apache.spark" %% "spark-streaming-kafka" % "1.4.0" 32 | 33 | libraryDependencies += "org.apache.spark" %% "spark-streaming-twitter" % "1.4.0" 34 | 35 | libraryDependencies += "com.ning" % "async-http-client" % "1.9.31" 36 | 37 | libraryDependencies += "org.apache.httpcomponents" % "httpclient" % "4.5.1" 38 | 39 | resolvers += "MQTT Repository" at "https://repo.eclipse.org/content/repositories/paho-releases/" 40 | -------------------------------------------------------------------------------- /Chap5/src/main/java/org/apress/prospark/AbstractDriver.java: -------------------------------------------------------------------------------- 1 | package org.apress.prospark; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.IOException; 6 | import java.io.InputStreamReader; 7 | import java.util.Enumeration; 8 | import java.util.zip.ZipEntry; 9 | import java.util.zip.ZipFile; 10 | 11 | import org.apache.log4j.LogManager; 12 | import org.apache.log4j.Logger; 13 | 14 | public abstract class AbstractDriver { 15 | 16 | private static final Logger LOG = LogManager.getLogger(AbstractDriver.class); 17 | 18 | private String path; 19 | 20 | public AbstractDriver(String path) { 21 | this.path = path; 22 | } 23 | 24 | public abstract void init() throws Exception; 25 | 26 | public abstract void close() throws Exception; 27 | 28 | public abstract void sendRecord(String record) throws Exception; 29 | 30 | public void execute() throws Exception { 31 | 32 | try { 33 | init(); 34 | File dirPath = new 
File(path); 35 | if (dirPath.isDirectory()) { 36 | File[] files = new File(path).listFiles(); 37 | for (File f : files) { 38 | LOG.info(String.format("Feeding zipped file %s", f.getName())); 39 | ZipFile zFile = null; 40 | try { 41 | zFile = new ZipFile(f); 42 | Enumeration zEntries = zFile.entries(); 43 | 44 | while (zEntries.hasMoreElements()) { 45 | ZipEntry zEntry = zEntries.nextElement(); 46 | LOG.info(String.format("Feeding file %s", zEntry.getName())); 47 | try (BufferedReader br = new BufferedReader( 48 | new InputStreamReader(zFile.getInputStream(zEntry)))) { 49 | // skip header 50 | br.readLine(); 51 | String line; 52 | while ((line = br.readLine()) != null) { 53 | sendRecord(line); 54 | } 55 | } 56 | } 57 | } catch (IOException e) { 58 | LOG.error(e.getMessage()); 59 | } finally { 60 | if (zFile != null) { 61 | try { 62 | zFile.close(); 63 | } catch (IOException e) { 64 | LOG.error(e.getMessage()); 65 | } 66 | } 67 | } 68 | } 69 | } else { 70 | LOG.error(String.format("Path %s is not a directory", path)); 71 | } 72 | } finally { 73 | close(); 74 | } 75 | } 76 | } -------------------------------------------------------------------------------- /Chap5/src/main/java/org/apress/prospark/KafkaDriver.java: -------------------------------------------------------------------------------- 1 | package org.apress.prospark; 2 | 3 | import java.util.Properties; 4 | 5 | import kafka.javaapi.producer.Producer; 6 | import kafka.producer.KeyedMessage; 7 | import kafka.producer.ProducerConfig; 8 | 9 | public class KafkaDriver extends AbstractDriver { 10 | 11 | private final String topic; 12 | private Producer producer; 13 | 14 | public KafkaDriver(String path, String topic, Properties props) { 15 | super(path); 16 | this.topic = topic; 17 | ProducerConfig config = new ProducerConfig(props); 18 | producer = new Producer(config); 19 | } 20 | 21 | @Override 22 | public void init() throws Exception { 23 | } 24 | 25 | @Override 26 | public void close() throws Exception { 27 | producer.close(); 28 | } 29 | 30 | @Override 31 | public void sendRecord(String record) throws Exception { 32 | producer.send(new KeyedMessage(topic, record)); 33 | } 34 | 35 | public static void main(String[] args) throws Exception { 36 | 37 | if (args.length != 3) { 38 | System.err.println("Usage: KafkaDriver "); 39 | System.exit(-1); 40 | } 41 | 42 | String path = args[0]; 43 | String brokerUrl = args[1]; 44 | String topic = args[2]; 45 | 46 | Properties props = new Properties(); 47 | props.put("metadata.broker.list", brokerUrl); 48 | props.put("serializer.class", "kafka.serializer.StringEncoder"); 49 | // props.put("request.required.acks", "1"); 50 | 51 | KafkaDriver driver = new KafkaDriver(path, topic, props); 52 | try { 53 | driver.execute(); 54 | } finally { 55 | driver.close(); 56 | } 57 | } 58 | 59 | } 60 | -------------------------------------------------------------------------------- /Chap5/src/main/java/org/apress/prospark/MqttDriver.java: -------------------------------------------------------------------------------- 1 | package org.apress.prospark; 2 | 3 | import java.nio.charset.StandardCharsets; 4 | 5 | import org.apache.log4j.LogManager; 6 | import org.apache.log4j.Logger; 7 | import org.eclipse.paho.client.mqttv3.MqttClient; 8 | import org.eclipse.paho.client.mqttv3.MqttException; 9 | import org.eclipse.paho.client.mqttv3.MqttMessage; 10 | import org.eclipse.paho.client.mqttv3.MqttTopic; 11 | import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence; 12 | 13 | public class MqttDriver extends 
AbstractDriver { 14 | 15 | private static final Logger LOG = LogManager.getLogger(MqttDriver.class); 16 | 17 | private final String brokerUrl; 18 | private final String topic; 19 | private MqttClient client; 20 | private MqttTopic mqttTopic; 21 | 22 | public MqttDriver(String path, String brokerUrl, String topic) { 23 | super(path); 24 | this.brokerUrl = brokerUrl; 25 | this.topic = topic; 26 | } 27 | 28 | @Override 29 | public void init() throws Exception { 30 | client = new MqttClient(brokerUrl, MqttClient.generateClientId(), new MemoryPersistence()); 31 | LOG.info(String.format("Attempting to connect to broker %s", brokerUrl)); 32 | client.connect(); 33 | mqttTopic = client.getTopic(topic); 34 | LOG.info(String.format("Connected to broker %s", brokerUrl)); 35 | } 36 | 37 | @Override 38 | public void close() throws Exception { 39 | if (client != null) { 40 | client.disconnect(); 41 | } 42 | } 43 | 44 | @Override 45 | public void sendRecord(String record) throws Exception { 46 | try { 47 | mqttTopic.publish(new MqttMessage(record.getBytes(StandardCharsets.UTF_8))); 48 | } catch (MqttException e) { 49 | if (e.getReasonCode() == MqttException.REASON_CODE_MAX_INFLIGHT) { 50 | Thread.sleep(10); 51 | } 52 | } 53 | } 54 | 55 | public static void main(String[] args) throws Exception { 56 | 57 | if (args.length != 3) { 58 | System.err.println("Usage:MqttDriver "); 59 | System.exit(-1); 60 | } 61 | 62 | String path = args[0]; 63 | String brokerUrl = args[1]; 64 | String topic = args[2]; 65 | 66 | MqttDriver driver = new MqttDriver(path, brokerUrl, topic); 67 | try { 68 | driver.execute(); 69 | } finally { 70 | driver.close(); 71 | } 72 | } 73 | 74 | } -------------------------------------------------------------------------------- /Chap5/src/main/java/org/apress/prospark/SocketDriver.java: -------------------------------------------------------------------------------- 1 | package org.apress.prospark; 2 | 3 | import java.io.IOException; 4 | import java.net.InetSocketAddress; 5 | import java.nio.ByteBuffer; 6 | import java.nio.channels.ServerSocketChannel; 7 | import java.nio.channels.SocketChannel; 8 | import java.nio.charset.StandardCharsets; 9 | import java.util.concurrent.ExecutionException; 10 | 11 | import org.apache.log4j.LogManager; 12 | import org.apache.log4j.Logger; 13 | 14 | public class SocketDriver extends AbstractDriver { 15 | 16 | private static final Logger LOG = LogManager.getLogger(SocketDriver.class); 17 | 18 | private String hostname; 19 | private int port; 20 | private SocketStream socketStream; 21 | 22 | public SocketDriver(String path, String hostname, int port) { 23 | super(path); 24 | this.hostname = hostname; 25 | this.port = port; 26 | } 27 | 28 | @Override 29 | public void init() throws Exception { 30 | socketStream = new SocketStream(hostname, port); 31 | LOG.info(String.format("Waiting for client to connect on port %d", port)); 32 | SocketChannel socketChan = socketStream.init(); 33 | LOG.info(String.format("Client %s connected on port %d", socketChan.getRemoteAddress(), port)); 34 | socketStream.kickOff(socketChan); 35 | socketStream.start(); 36 | } 37 | 38 | @Override 39 | public void close() throws IOException { 40 | socketStream.done(); 41 | if (socketStream != null) { 42 | socketStream.close(); 43 | } 44 | } 45 | 46 | @Override 47 | public void sendRecord(String record) throws Exception { 48 | socketStream.sendMsg(record + "\n"); 49 | } 50 | 51 | static class SocketStream extends Thread { 52 | 53 | private String hostname; 54 | private int port; 55 | private 
ServerSocketChannel server; 56 | private volatile boolean isDone = false; 57 | private SocketChannel socket = null; 58 | private long totalBytes; 59 | private long totalLines; 60 | 61 | public SocketStream(String hostname, int port) { 62 | this.hostname = hostname; 63 | this.port = port; 64 | totalBytes = 0; 65 | totalLines = 0; 66 | } 67 | 68 | public SocketChannel init() throws IOException { 69 | server = ServerSocketChannel.open(); 70 | server.bind(new InetSocketAddress(hostname, port)); 71 | LOG.info(String.format("Listening on %s", server.getLocalAddress())); 72 | return server.accept(); 73 | } 74 | 75 | public void kickOff(SocketChannel socket) { 76 | LOG.info("Kicking off data transfer"); 77 | this.socket = socket; 78 | } 79 | 80 | @Override 81 | public void run() { 82 | try { 83 | while (!isDone) { 84 | Thread.sleep(1000); 85 | } 86 | } catch (Exception e) { 87 | LOG.error(e); 88 | } 89 | } 90 | 91 | public void sendMsg(String msg) throws IOException, InterruptedException, ExecutionException { 92 | if (socket != null) { 93 | ByteBuffer buffer = ByteBuffer.wrap(msg.getBytes(StandardCharsets.UTF_8)); 94 | int bytesWritten = socket.write(buffer); 95 | totalBytes += bytesWritten; 96 | } else { 97 | throw new IOException("Client hasn't connected yet!"); 98 | } 99 | totalLines++; 100 | } 101 | 102 | public void done() { 103 | isDone = true; 104 | } 105 | 106 | public void close() throws IOException { 107 | if (socket != null) { 108 | socket.close(); 109 | socket = null; 110 | } 111 | LOG.info(String.format("SocketStream is closing after writing %d bytes and %d lines", totalBytes, 112 | totalLines)); 113 | } 114 | } 115 | 116 | public static void main(String[] args) throws Exception { 117 | 118 | if (args.length != 3) { 119 | System.err.println("Usage: SocketDriver "); 120 | System.exit(-1); 121 | } 122 | 123 | String path = args[0]; 124 | String hostname = args[1]; 125 | int port = Integer.parseInt(args[2]); 126 | 127 | SocketDriver driver = new SocketDriver(path, hostname, port); 128 | try { 129 | driver.execute(); 130 | } finally { 131 | driver.close(); 132 | } 133 | } 134 | } -------------------------------------------------------------------------------- /Chap5/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO, FILE, stdout 2 | log4j.rootCategory=INFO, FILE, stdout 3 | 4 | log4j.logger.org.eclipse.jetty=WARN 5 | 6 | log4j.appender.FILE=org.apache.log4j.FileAppender 7 | 8 | log4j.appender.FILE.File=/tmp/spark.log 9 | 10 | log4j.appender.FILE.layout=org.apache.log4j.PatternLayout 11 | log4j.appender.FILE.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 12 | 13 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 14 | log4j.appender.stdout.Target=System.out 15 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 16 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 17 | -------------------------------------------------------------------------------- /Chap5/src/main/scala/org/apress/prospark/HttpInputDStream.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import java.util.Timer 4 | import java.util.TimerTask 5 | 6 | import scala.reflect.ClassTag 7 | 8 | import org.apache.http.client.methods.HttpGet 9 | import org.apache.http.impl.client.CloseableHttpClient 10 | import org.apache.http.impl.client.HttpClients 11 | import 
org.apache.http.util.EntityUtils 12 | import org.apache.spark.Logging 13 | import org.apache.spark.storage.StorageLevel 14 | import org.apache.spark.streaming.StreamingContext 15 | import org.apache.spark.streaming.api.java.JavaDStream 16 | import org.apache.spark.streaming.api.java.JavaDStream.fromDStream 17 | import org.apache.spark.streaming.api.java.JavaStreamingContext 18 | import org.apache.spark.streaming.dstream.DStream 19 | import org.apache.spark.streaming.dstream.ReceiverInputDStream 20 | import org.apache.spark.streaming.receiver.Receiver 21 | 22 | class HttpInputDStream( 23 | @transient ssc_ : StreamingContext, 24 | storageLevel: StorageLevel, 25 | url: String, 26 | interval: Long) extends ReceiverInputDStream[String](ssc_) with Logging { 27 | 28 | def getReceiver(): Receiver[String] = { 29 | new HttpReceiver(storageLevel, url, interval) 30 | } 31 | } 32 | 33 | class HttpReceiver( 34 | storageLevel: StorageLevel, 35 | url: String, 36 | interval: Long) extends Receiver[String](storageLevel) with Logging { 37 | 38 | var httpClient: CloseableHttpClient = _ 39 | var trigger: Timer = _ 40 | 41 | def onStop() { 42 | httpClient.close() 43 | logInfo("Disconnected from Http Server") 44 | } 45 | 46 | def onStart() { 47 | httpClient = HttpClients.createDefault() 48 | trigger = new Timer() 49 | trigger.scheduleAtFixedRate(new TimerTask { 50 | def run() = doGet() 51 | }, 0, interval * 1000) 52 | 53 | logInfo("Http Receiver initiated") 54 | } 55 | 56 | def doGet() { 57 | logInfo("Fetching data from Http source") 58 | val response = httpClient.execute(new HttpGet(url)) 59 | try { 60 | val content = EntityUtils.toString(response.getEntity()) 61 | store(content) 62 | } catch { 63 | case e: Exception => restart("Error! Problems while connecting", e) 64 | } finally { 65 | response.close() 66 | } 67 | 68 | } 69 | 70 | } 71 | 72 | object HttpUtils { 73 | def createStream( 74 | ssc: StreamingContext, 75 | storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2, 76 | url: String, 77 | interval: Long): DStream[String] = { 78 | new HttpInputDStream(ssc, storageLevel, url, interval) 79 | } 80 | 81 | def createStream( 82 | jssc: JavaStreamingContext, 83 | storageLevel: StorageLevel, 84 | url: String, 85 | interval: Long): JavaDStream[String] = { 86 | implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[String]] 87 | createStream(jssc.ssc, storageLevel, url, interval) 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /Chap5/src/main/scala/org/apress/prospark/HttpInputDStreamAsync.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.ClassTag 4 | 5 | import org.apache.spark.Logging 6 | import org.apache.spark.storage.StorageLevel 7 | import org.apache.spark.streaming.StreamingContext 8 | import org.apache.spark.streaming.api.java.JavaDStream 9 | import org.apache.spark.streaming.api.java.JavaDStream.fromDStream 10 | import org.apache.spark.streaming.api.java.JavaStreamingContext 11 | import org.apache.spark.streaming.dstream.DStream 12 | import org.apache.spark.streaming.dstream.ReceiverInputDStream 13 | import org.apache.spark.streaming.receiver.Receiver 14 | 15 | import com.ning.http.client.AsyncCompletionHandler 16 | import com.ning.http.client.AsyncHttpClient 17 | import com.ning.http.client.Response 18 | 19 | class HttpInputDStreamAsync( 20 | @transient ssc_ : StreamingContext, 21 | storageLevel: StorageLevel, 22 | url: String) extends 
ReceiverInputDStream[String](ssc_) with Logging { 23 | 24 | def getReceiver(): Receiver[String] = { 25 | new HttpReceiverAsync(storageLevel, url) 26 | } 27 | } 28 | 29 | class HttpReceiverAsync( 30 | storageLevel: StorageLevel, 31 | url: String) extends Receiver[String](storageLevel) with Logging { 32 | 33 | var asyncHttpClient: AsyncHttpClient = _ 34 | 35 | def onStop() { 36 | asyncHttpClient.close() 37 | logInfo("Disconnected from Http Server") 38 | } 39 | 40 | def onStart() { 41 | asyncHttpClient = new AsyncHttpClient() 42 | asyncHttpClient.prepareGet(url).execute(new AsyncCompletionHandler[Response]() { 43 | 44 | override def onCompleted(response: Response): Response = { 45 | store(response.getResponseBody) 46 | return response 47 | } 48 | 49 | override def onThrowable(t: Throwable) { 50 | restart("Error! Problems while connecting", t) 51 | } 52 | }); 53 | logInfo("Http Connection initiated") 54 | } 55 | 56 | } 57 | 58 | object HttpUtilsAsync { 59 | def createStream( 60 | ssc: StreamingContext, 61 | storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2, 62 | url: String): DStream[String] = { 63 | new HttpInputDStreamAsync(ssc, storageLevel, url) 64 | } 65 | 66 | def createStream( 67 | jssc: JavaStreamingContext, 68 | storageLevel: StorageLevel, 69 | url: String): JavaDStream[String] = { 70 | implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[String]] 71 | createStream(jssc.ssc, storageLevel, url) 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /Chap5/src/main/scala/org/apress/prospark/L5-11FlumePull.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions 6 | import org.apache.spark.storage.StorageLevel 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 10 | import org.apache.spark.streaming.flume.FlumeUtils 11 | 12 | object DailyUserTypeDistributionApp2 { 13 | def main(args: Array[String]) { 14 | if (args.length != 5) { 15 | System.err.println( 16 | "Usage: DailyUserTypeDistributionApp ") 17 | System.exit(1) 18 | } 19 | val Seq(appName, hostname, port, checkpointDir, outputPath) = args.toSeq 20 | 21 | val conf = new SparkConf() 22 | .setAppName(appName) 23 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 24 | 25 | val ssc = new StreamingContext(conf, Seconds(10)) 26 | ssc.checkpoint(checkpointDir) 27 | 28 | FlumeUtils.createPollingStream(ssc, hostname, port.toInt, StorageLevel.MEMORY_ONLY_SER_2) 29 | .map(rec => new String(rec.event.getBody().array()).split(",")) 30 | .map(rec => ((rec(1).split(" ")(0), rec(12)), 1)) 31 | .updateStateByKey(statefulCount) 32 | .repartition(1) 33 | .transform(rdd => rdd.sortByKey(ascending = false)) 34 | .saveAsTextFiles(outputPath) 35 | 36 | ssc.start() 37 | ssc.awaitTermination() 38 | } 39 | 40 | val statefulCount = (values: Seq[Int], state: Option[Int]) => Some(values.sum + state.getOrElse(0)) 41 | 42 | } -------------------------------------------------------------------------------- /Chap5/src/main/scala/org/apress/prospark/L5-11FlumePush.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import 
org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions 6 | import org.apache.spark.storage.StorageLevel 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 10 | import org.apache.spark.streaming.flume.FlumeUtils 11 | 12 | object DailyUserTypeDistributionApp { 13 | def main(args: Array[String]) { 14 | if (args.length != 5) { 15 | System.err.println( 16 | "Usage: DailyUserTypeDistributionApp ") 17 | System.exit(1) 18 | } 19 | val Seq(appName, hostname, port, checkpointDir, outputPath) = args.toSeq 20 | 21 | val conf = new SparkConf() 22 | .setAppName(appName) 23 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 24 | 25 | val ssc = new StreamingContext(conf, Seconds(10)) 26 | ssc.checkpoint(checkpointDir) 27 | 28 | FlumeUtils.createStream(ssc, hostname, port.toInt, StorageLevel.MEMORY_ONLY_SER_2) 29 | .map(rec => new String(rec.event.getBody().array()).split(",")) 30 | .map(rec => ((rec(1).split(" ")(0), rec(12)), 1)) 31 | .updateStateByKey(statefulCount) 32 | .repartition(1) 33 | .transform(rdd => rdd.sortByKey(ascending = false)) 34 | .saveAsTextFiles(outputPath) 35 | 36 | ssc.start() 37 | ssc.awaitTermination() 38 | } 39 | 40 | val statefulCount = (values: Seq[Int], state: Option[Int]) => Some(values.sum + state.getOrElse(0)) 41 | 42 | } -------------------------------------------------------------------------------- /Chap5/src/main/scala/org/apress/prospark/L5-13Kafka.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions 6 | import org.apache.spark.storage.StorageLevel 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 10 | import org.apache.spark.streaming.kafka.KafkaUtils 11 | 12 | object StationJourneyCountApp { 13 | 14 | def main(args: Array[String]) { 15 | if (args.length != 7) { 16 | System.err.println( 17 | "Usage: StationJourneyCountApp ") 18 | System.exit(1) 19 | } 20 | 21 | val Seq(appName, brokerUrl, topic, consumerGroupId, zkQuorum, checkpointDir, outputPath) = args.toSeq 22 | 23 | val conf = new SparkConf() 24 | .setAppName(appName) 25 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 26 | //.set("spark.streaming.receiver.writeAheadLog.enable", "true") 27 | 28 | val ssc = new StreamingContext(conf, Seconds(10)) 29 | ssc.checkpoint(checkpointDir) 30 | 31 | val topics = Map[String, Int]( 32 | topic -> 1) 33 | KafkaUtils.createStream(ssc, zkQuorum, consumerGroupId, topics, StorageLevel.MEMORY_ONLY_SER).map(_._2) 34 | .map(rec => rec.split(",")) 35 | .map(rec => ((rec(3), rec(7)), 1)) 36 | .reduceByKey(_ + _) 37 | .repartition(1) 38 | .map(rec => (rec._2, rec._1)) 39 | .transform(rdd => rdd.sortByKey(ascending = false)) 40 | .saveAsTextFiles(outputPath) 41 | 42 | ssc.start() 43 | ssc.awaitTermination() 44 | } 45 | 46 | } -------------------------------------------------------------------------------- /Chap5/src/main/scala/org/apress/prospark/L5-14KafkaCustomConf.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import 
org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions 6 | import org.apache.spark.streaming.Seconds 7 | import org.apache.spark.streaming.StreamingContext 8 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 9 | import org.apache.spark.streaming.kafka.KafkaUtils 10 | import kafka.serializer.StringDecoder 11 | import org.apache.spark.storage.StorageLevel 12 | 13 | object StationJourneyCountCustomApp { 14 | 15 | def main(args: Array[String]) { 16 | if (args.length != 7) { 17 | System.err.println( 18 | "Usage: StationJourneyCountApp ") 19 | System.exit(1) 20 | } 21 | 22 | val Seq(appName, brokerUrl, topic, consumerGroupId, zkQuorum, checkpointDir, outputPath) = args.toSeq 23 | 24 | val conf = new SparkConf() 25 | .setAppName(appName) 26 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 27 | //.set("spark.streaming.receiver.writeAheadLog.enable", "true") 28 | 29 | val ssc = new StreamingContext(conf, Seconds(10)) 30 | ssc.checkpoint(checkpointDir) 31 | 32 | val topics = Map[String, Int]( 33 | topic -> 1) 34 | val params = Map[String, String]( 35 | "zookeeper.connect" -> zkQuorum, 36 | "group.id" -> consumerGroupId, 37 | "bootstrap.servers" -> brokerUrl) 38 | KafkaUtils.createStream[String, String, StringDecoder, StringDecoder](ssc, params, topics, StorageLevel.MEMORY_ONLY_SER).map(_._2) 39 | .map(rec => rec.split(",")) 40 | .map(rec => ((rec(3), rec(7)), 1)) 41 | .reduceByKey(_ + _) 42 | .repartition(1) 43 | .map(rec => (rec._2, rec._1)) 44 | .transform(rdd => rdd.sortByKey(ascending = false)) 45 | .saveAsTextFiles(outputPath) 46 | 47 | ssc.start() 48 | ssc.awaitTermination() 49 | } 50 | 51 | } -------------------------------------------------------------------------------- /Chap5/src/main/scala/org/apress/prospark/L5-15KafkaDirect.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions 6 | import org.apache.spark.streaming.Seconds 7 | import org.apache.spark.streaming.StreamingContext 8 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 9 | import kafka.serializer.StringDecoder 10 | import org.apache.spark.streaming.kafka.KafkaUtils 11 | 12 | object StationJourneyCountDirectApp { 13 | 14 | def main(args: Array[String]) { 15 | if (args.length != 7) { 16 | System.err.println( 17 | "Usage: StationJourneyCountApp ") 18 | System.exit(1) 19 | } 20 | 21 | val Seq(appName, brokerUrl, topic, consumerGroupId, zkQuorum, checkpointDir, outputPath) = args.toSeq 22 | 23 | val conf = new SparkConf() 24 | .setAppName(appName) 25 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 26 | 27 | val ssc = new StreamingContext(conf, Seconds(10)) 28 | ssc.checkpoint(checkpointDir) 29 | 30 | val topics = Set(topic) 31 | val params = Map[String, String]( 32 | "zookeeper.connect" -> zkQuorum, 33 | "group.id" -> consumerGroupId, 34 | "bootstrap.servers" -> brokerUrl) 35 | KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, params, topics).map(_._2) 36 | .map(rec => rec.split(",")) 37 | .map(rec => ((rec(3), rec(7)), 1)) 38 | .reduceByKey(_ + _) 39 | .repartition(1) 40 | .map(rec => (rec._2, rec._1)) 41 | .transform(rdd => rdd.sortByKey(ascending = false)) 42 | .saveAsTextFiles(outputPath) 43 | 44 | ssc.start() 45 | ssc.awaitTermination() 46 | } 47 | 48 | } 
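Editor's note: the direct (receiver-less) connector used in L5-15KafkaDirect.scala above tracks Kafka offsets itself instead of committing them to ZooKeeper, so every RDD it produces knows exactly which topic/partition offset ranges it covers. The sketch below is not one of the book's listings; it is a minimal illustration, assuming the same Spark 1.4 / spark-streaming-kafka setup and StringDecoder types as StationJourneyCountDirectApp, of how those ranges could be read back through the HasOffsetRanges API. The object name, argument handling, and topic are hypothetical.

package org.apress.prospark

import kafka.serializer.StringDecoder

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{ Seconds, StreamingContext }
import org.apache.spark.streaming.kafka.{ HasOffsetRanges, KafkaUtils }

// Hypothetical helper, not part of the book's code: logs the Kafka offset
// ranges that back each batch produced by a direct stream.
object KafkaOffsetInspectionSketch {
  def main(args: Array[String]) {
    val Array(appName, brokerUrl, topic) = args

    val conf = new SparkConf().setAppName(appName)
    val ssc = new StreamingContext(conf, Seconds(10))

    val params = Map[String, String]("metadata.broker.list" -> brokerUrl)
    val stream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, params, Set(topic))

    stream.foreachRDD { rdd =>
      // The cast must be applied to the RDD handed to foreachRDD, before any
      // transformation; the offset metadata is available on the driver.
      val ranges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      ranges.foreach(r =>
        println(s"topic=${r.topic} partition=${r.partition} from=${r.fromOffset} until=${r.untilOffset}"))
    }

    ssc.start()
    ssc.awaitTermination()
  }
}

Because the offsets travel with each batch rather than living in ZooKeeper, this is the variant that can support end-to-end exactly-once output, provided the application stores the offset ranges atomically together with its results.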
-------------------------------------------------------------------------------- /Chap5/src/main/scala/org/apress/prospark/L5-16Twitter.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions 6 | import org.apache.spark.streaming.Seconds 7 | import org.apache.spark.streaming.StreamingContext 8 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 9 | import org.apache.spark.streaming.twitter.TwitterUtils 10 | import org.apache.spark.storage.StorageLevel 11 | import twitter4j.conf.ConfigurationBuilder 12 | import twitter4j.TwitterFactory 13 | 14 | object TwitterApp { 15 | 16 | def main(args: Array[String]) { 17 | if (args.length != 2) { 18 | System.err.println( 19 | "Usage: TwitterApp ") 20 | System.exit(1) 21 | } 22 | 23 | val Seq(appName, outputPath) = args.toSeq 24 | 25 | val conf = new SparkConf() 26 | .setAppName(appName) 27 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 28 | 29 | val ssc = new StreamingContext(conf, Seconds(10)) 30 | 31 | val cb = new ConfigurationBuilder() 32 | cb.setOAuthConsumerKey("") 33 | cb.setOAuthConsumerSecret("") 34 | cb.setOAuthAccessToken("") 35 | cb.setOAuthAccessTokenSecret("") 36 | 37 | val twitterAuth = new TwitterFactory(cb.build()).getInstance().getAuthorization() 38 | 39 | val tweetStream = TwitterUtils.createStream(ssc, Some(twitterAuth), Array("nyc citi bike", "nyc bike share")) 40 | tweetStream.count().print() 41 | tweetStream.saveAsTextFiles(outputPath) 42 | 43 | ssc.start() 44 | ssc.awaitTermination() 45 | } 46 | 47 | } -------------------------------------------------------------------------------- /Chap5/src/main/scala/org/apress/prospark/L5-18Http.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.streaming.Seconds 6 | import org.apache.spark.streaming.StreamingContext 7 | import org.json4s.DefaultFormats 8 | import org.json4s.JField 9 | import org.json4s.jvalue2extractable 10 | import org.json4s.jvalue2monadic 11 | import org.json4s.native.JsonMethods.parse 12 | import org.json4s.string2JsonInput 13 | 14 | object HttpApp { 15 | 16 | def main(args: Array[String]) { 17 | if (args.length != 2) { 18 | System.err.println( 19 | "Usage: HttpApp ") 20 | System.exit(1) 21 | } 22 | 23 | val Seq(appName, outputPath) = args.toSeq 24 | 25 | val conf = new SparkConf() 26 | .setAppName(appName) 27 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 28 | 29 | val batchInterval = 10 30 | 31 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 32 | 33 | HttpUtils.createStream(ssc, url = "https://www.citibikenyc.com/stations/json", interval = batchInterval) 34 | .flatMap(rec => (parse(rec) \ "stationBeanList").children) 35 | .filter(rec => { 36 | implicit val formats = DefaultFormats 37 | (rec \ "statusKey").extract[Integer] != 1 38 | }) 39 | .map(rec => rec.filterField { 40 | case JField("id", _) => true 41 | case JField("stationName", _) => true 42 | case JField("statusValue", _) => true 43 | case _ => false 44 | }) 45 | .map(rec => { 46 | implicit val formats = DefaultFormats 47 | (rec(0)._2.extract[Integer], rec(1)._2.extract[String], rec(2)._2.extract[String]) 48 | }) 49 | .saveAsTextFiles(outputPath) 50 | 51 | ssc.start() 52 | 
ssc.awaitTermination() 53 | } 54 | 55 | } -------------------------------------------------------------------------------- /Chap5/src/main/scala/org/apress/prospark/L5-6SocketStream.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | 6 | import org.apache.spark.streaming.{ Seconds, StreamingContext } 7 | import org.apache.spark.streaming.dstream.PairDStreamFunctions 8 | 9 | import java.util.Calendar 10 | 11 | object TripByYearApp { 12 | def main(args: Array[String]) { 13 | if (args.length != 3) { 14 | System.err.println( 15 | "Usage: TripByYearApp ") 16 | System.exit(1) 17 | } 18 | val Seq(appName, hostname, port) = args.toSeq 19 | 20 | val conf = new SparkConf() 21 | .setAppName(appName) 22 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 23 | 24 | val ssc = new StreamingContext(conf, Seconds(10)) 25 | 26 | ssc.socketTextStream(hostname, port.toInt) 27 | .map(rec => rec.split(",")) 28 | .map(rec => (rec(13), rec(0).toInt)) 29 | .reduceByKey(_ + _) 30 | .map(pair => (pair._2, normalizeYear(pair._1))) 31 | .transform(rec => rec.sortByKey(ascending = false)) 32 | .saveAsTextFiles("TripByYear") 33 | 34 | ssc.start() 35 | ssc.awaitTermination() 36 | } 37 | 38 | def normalizeYear(s: String): String = { 39 | try { 40 | (Calendar.getInstance().get(Calendar.YEAR) - s.toInt).toString 41 | } catch { 42 | case e: Exception => s 43 | } 44 | } 45 | } -------------------------------------------------------------------------------- /Chap5/src/main/scala/org/apress/prospark/L5-7MultipleSocketStreams.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | 6 | import org.apache.spark.streaming.{ Seconds, StreamingContext } 7 | import org.apache.spark.streaming.dstream.PairDStreamFunctions 8 | 9 | import java.util.Calendar 10 | 11 | object TripByYearMultiApp { 12 | def main(args: Array[String]) { 13 | if (args.length != 4) { 14 | System.err.println( 15 | "Usage: TripByYearMultiApp ") 16 | System.exit(1) 17 | } 18 | val Seq(appName, hostname, basePort, nSockets) = args.toSeq 19 | 20 | val conf = new SparkConf() 21 | .setAppName(appName) 22 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 23 | 24 | val ssc = new StreamingContext(conf, Seconds(10)) 25 | 26 | val streams = (0 to nSockets.toInt - 1).map(i => ssc.socketTextStream(hostname, basePort.toInt + i)) 27 | val uniStream = ssc.union(streams) 28 | 29 | uniStream 30 | .map(rec => rec.split(",")) 31 | .map(rec => (rec(13), rec(0).toInt)) 32 | .reduceByKey(_ + _) 33 | .map(pair => (pair._2, normalizeYear(pair._1))) 34 | .transform(rec => rec.sortByKey(ascending = false)) 35 | .saveAsTextFiles("TripByYear") 36 | 37 | ssc.start() 38 | ssc.awaitTermination() 39 | } 40 | 41 | def normalizeYear(s: String): String = { 42 | try { 43 | (Calendar.getInstance().get(Calendar.YEAR) - s.toInt).toString 44 | } catch { 45 | case e: Exception => s 46 | } 47 | } 48 | } -------------------------------------------------------------------------------- /Chap5/src/main/scala/org/apress/prospark/L5-9Mqtt.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions 6 | 
import org.apache.spark.storage.StorageLevel 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 10 | import org.apache.spark.streaming.mqtt.MQTTUtils 11 | 12 | object YearlyDistributionApp { 13 | def main(args: Array[String]) { 14 | if (args.length != 4) { 15 | System.err.println( 16 | "Usage: YearlyDistributionApp ") 17 | System.exit(1) 18 | } 19 | val Seq(appName, brokerUrl, topic, checkpointDir) = args.toSeq 20 | 21 | val conf = new SparkConf() 22 | .setAppName(appName) 23 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 24 | 25 | val ssc = new StreamingContext(conf, Seconds(10)) 26 | ssc.checkpoint(checkpointDir) 27 | 28 | MQTTUtils.createStream(ssc, brokerUrl, topic, StorageLevel.MEMORY_ONLY_SER_2) 29 | .map(rec => rec.split(",")) 30 | .map(rec => (rec(1).split(" ")(0), 1)) 31 | .updateStateByKey(statefulCount) 32 | .map(pair => (pair._2, pair._1)) 33 | .transform(rec => rec.sortByKey(ascending = false)) 34 | .saveAsTextFiles("YearlyDistribution") 35 | 36 | ssc.start() 37 | ssc.awaitTermination() 38 | } 39 | 40 | val statefulCount = (values: Seq[Int], state: Option[Int]) => Some(values.sum + state.getOrElse(0)) 41 | 42 | } -------------------------------------------------------------------------------- /Chap6/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 2 | -------------------------------------------------------------------------------- /Chap6/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += Classpaths.typesafeResolver 2 | 3 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.4.0") 4 | -------------------------------------------------------------------------------- /Chap6/spark.sbt: -------------------------------------------------------------------------------- 1 | import AssemblyKeys._ 2 | 3 | assemblySettings 4 | 5 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { mergeStrategy => { 6 | case entry => { 7 | val strategy = mergeStrategy(entry) 8 | if (strategy == MergeStrategy.deduplicate) MergeStrategy.first 9 | else strategy 10 | } 11 | }} 12 | 13 | name := "Chap6" 14 | 15 | version := "1.0" 16 | 17 | scalaVersion := "2.10.5" 18 | 19 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.4.0" 20 | 21 | libraryDependencies += "org.apache.spark" %% "spark-streaming" % "1.4.0" 22 | 23 | libraryDependencies += "org.json4s" %% "json4s-native" % "3.2.10" 24 | 25 | libraryDependencies += "org.apache.spark" %% "spark-streaming-mqtt" % "1.4.0" 26 | 27 | libraryDependencies += "org.eclipse.paho" % "org.eclipse.paho.client.mqttv3" % "1.0.1" 28 | 29 | libraryDependencies += "org.apache.httpcomponents" % "httpclient" % "4.5.1" 30 | 31 | libraryDependencies += "org.apache.commons" % "commons-pool2" % "2.4.2" 32 | 33 | libraryDependencies += "org.apache.hbase" % "hbase" % "0.98.15-hadoop2" 34 | 35 | //libraryDependencies += "org.apache.hbase" % "hbase-client" % "1.1.2" 36 | 37 | //libraryDependencies += "org.apache.hbase" % "hbase-server" % "1.1.2" 38 | 39 | //libraryDependencies += "org.apache.hbase" % "hbase-common" % "1.1.2" 40 | 41 | libraryDependencies += "org.apache.hbase" % "hbase-client" % "2.0.0-SNAPSHOT" 42 | 43 | libraryDependencies += "org.apache.hbase" % "hbase-server" % "2.0.0-SNAPSHOT" 44 | 45 | libraryDependencies += 
"org.apache.hbase" % "hbase-common" % "2.0.0-SNAPSHOT" 46 | 47 | libraryDependencies += "org.apache.hbase" % "hbase-spark" % "2.0.0-SNAPSHOT" 48 | 49 | resolvers += "Apache Snapshot Repository" at "https://repository.apache.org/content/repositories/snapshots" 50 | 51 | libraryDependencies += "org.apache.cassandra" % "cassandra-all" % "2.1.11" 52 | 53 | libraryDependencies += "com.datastax.spark" %% "spark-cassandra-connector" % "1.4.0" 54 | 55 | libraryDependencies += "redis.clients" % "jedis" % "2.7.3" 56 | 57 | resolvers += "MQTT Repository" at "https://repo.eclipse.org/content/repositories/paho-releases/" 58 | -------------------------------------------------------------------------------- /Chap6/src/main/java/org/apress/prospark/AbstractDriver.java: -------------------------------------------------------------------------------- 1 | package org.apress.prospark; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.IOException; 6 | import java.io.InputStreamReader; 7 | import java.util.Enumeration; 8 | import java.util.zip.ZipEntry; 9 | import java.util.zip.ZipFile; 10 | 11 | import org.apache.log4j.LogManager; 12 | import org.apache.log4j.Logger; 13 | 14 | public abstract class AbstractDriver { 15 | 16 | private static final Logger LOG = LogManager.getLogger(AbstractDriver.class); 17 | 18 | private String path; 19 | 20 | public AbstractDriver(String path) { 21 | this.path = path; 22 | } 23 | 24 | public abstract void init() throws Exception; 25 | 26 | public abstract void close() throws Exception; 27 | 28 | public abstract void sendRecord(String record) throws Exception; 29 | 30 | public void execute() throws Exception { 31 | 32 | try { 33 | init(); 34 | File dirPath = new File(path); 35 | if (dirPath.isDirectory()) { 36 | File[] files = new File(path).listFiles(); 37 | for (File f : files) { 38 | LOG.info(String.format("Feeding zipped file %s", f.getName())); 39 | ZipFile zFile = null; 40 | try { 41 | zFile = new ZipFile(f); 42 | Enumeration zEntries = zFile.entries(); 43 | 44 | while (zEntries.hasMoreElements()) { 45 | ZipEntry zEntry = zEntries.nextElement(); 46 | LOG.info(String.format("Feeding file %s", zEntry.getName())); 47 | try (BufferedReader br = new BufferedReader( 48 | new InputStreamReader(zFile.getInputStream(zEntry)))) { 49 | // skip header 50 | br.readLine(); 51 | String line; 52 | while ((line = br.readLine()) != null) { 53 | sendRecord(line); 54 | } 55 | } 56 | } 57 | } catch (IOException e) { 58 | LOG.error(e.getMessage()); 59 | } finally { 60 | if (zFile != null) { 61 | try { 62 | zFile.close(); 63 | } catch (IOException e) { 64 | LOG.error(e.getMessage()); 65 | } 66 | } 67 | } 68 | } 69 | } else { 70 | LOG.error(String.format("Path %s is not a directory", path)); 71 | } 72 | } finally { 73 | close(); 74 | } 75 | } 76 | } -------------------------------------------------------------------------------- /Chap6/src/main/java/org/apress/prospark/MqttDriver.java: -------------------------------------------------------------------------------- 1 | package org.apress.prospark; 2 | 3 | import java.nio.charset.StandardCharsets; 4 | 5 | import org.apache.log4j.LogManager; 6 | import org.apache.log4j.Logger; 7 | import org.eclipse.paho.client.mqttv3.MqttClient; 8 | import org.eclipse.paho.client.mqttv3.MqttException; 9 | import org.eclipse.paho.client.mqttv3.MqttMessage; 10 | import org.eclipse.paho.client.mqttv3.MqttTopic; 11 | import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence; 12 | 13 | public class MqttDriver extends 
AbstractDriver { 14 | 15 | private static final Logger LOG = LogManager.getLogger(MqttDriver.class); 16 | 17 | private final String brokerUrl; 18 | private final String topic; 19 | private MqttClient client; 20 | private MqttTopic mqttTopic; 21 | 22 | public MqttDriver(String path, String brokerUrl, String topic) { 23 | super(path); 24 | this.brokerUrl = brokerUrl; 25 | this.topic = topic; 26 | } 27 | 28 | @Override 29 | public void init() throws Exception { 30 | client = new MqttClient(brokerUrl, MqttClient.generateClientId(), new MemoryPersistence()); 31 | LOG.info(String.format("Attempting to connect to broker %s", brokerUrl)); 32 | client.connect(); 33 | mqttTopic = client.getTopic(topic); 34 | LOG.info(String.format("Connected to broker %s", brokerUrl)); 35 | } 36 | 37 | @Override 38 | public void close() throws Exception { 39 | if (client != null) { 40 | client.disconnect(); 41 | } 42 | } 43 | 44 | @Override 45 | public void sendRecord(String record) throws Exception { 46 | try { 47 | mqttTopic.publish(new MqttMessage(record.getBytes(StandardCharsets.UTF_8))); 48 | } catch (MqttException e) { 49 | if (e.getReasonCode() == MqttException.REASON_CODE_MAX_INFLIGHT) { 50 | Thread.sleep(10); 51 | } 52 | } 53 | } 54 | 55 | public static void main(String[] args) throws Exception { 56 | 57 | if (args.length != 3) { 58 | System.err.println("Usage:MqttDriver "); 59 | System.exit(-1); 60 | } 61 | 62 | String path = args[0]; 63 | String brokerUrl = args[1]; 64 | String topic = args[2]; 65 | 66 | MqttDriver driver = new MqttDriver(path, brokerUrl, topic); 67 | try { 68 | driver.execute(); 69 | } finally { 70 | driver.close(); 71 | } 72 | } 73 | 74 | } -------------------------------------------------------------------------------- /Chap6/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO, FILE, stdout 2 | log4j.rootCategory=INFO, FILE, stdout 3 | 4 | log4j.logger.org.eclipse.jetty=WARN 5 | 6 | log4j.appender.FILE=org.apache.log4j.FileAppender 7 | 8 | log4j.appender.FILE.File=/tmp/spark.log 9 | 10 | log4j.appender.FILE.layout=org.apache.log4j.PatternLayout 11 | log4j.appender.FILE.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 12 | 13 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 14 | log4j.appender.stdout.Target=System.out 15 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 16 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 17 | -------------------------------------------------------------------------------- /Chap6/src/main/scala/org/apress/prospark/HttpInputDStream.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import java.util.Timer 4 | import java.util.TimerTask 5 | 6 | import scala.reflect.ClassTag 7 | 8 | import org.apache.http.client.methods.HttpGet 9 | import org.apache.http.impl.client.CloseableHttpClient 10 | import org.apache.http.impl.client.HttpClients 11 | import org.apache.http.util.EntityUtils 12 | import org.apache.spark.Logging 13 | import org.apache.spark.storage.StorageLevel 14 | import org.apache.spark.streaming.StreamingContext 15 | import org.apache.spark.streaming.api.java.JavaDStream 16 | import org.apache.spark.streaming.api.java.JavaDStream.fromDStream 17 | import org.apache.spark.streaming.api.java.JavaStreamingContext 18 | import org.apache.spark.streaming.dstream.DStream 19 | import 
org.apache.spark.streaming.dstream.ReceiverInputDStream 20 | import org.apache.spark.streaming.receiver.Receiver 21 | 22 | class HttpInputDStream( 23 | @transient ssc_ : StreamingContext, 24 | storageLevel: StorageLevel, 25 | url: String, 26 | interval: Long) extends ReceiverInputDStream[String](ssc_) with Logging { 27 | 28 | def getReceiver(): Receiver[String] = { 29 | new HttpReceiver(storageLevel, url, interval) 30 | } 31 | } 32 | 33 | class HttpReceiver( 34 | storageLevel: StorageLevel, 35 | url: String, 36 | interval: Long) extends Receiver[String](storageLevel) with Logging { 37 | 38 | var httpClient: CloseableHttpClient = _ 39 | var trigger: Timer = _ 40 | 41 | def onStop() { 42 | httpClient.close() 43 | logInfo("Disconnected from Http Server") 44 | } 45 | 46 | def onStart() { 47 | httpClient = HttpClients.createDefault() 48 | trigger = new Timer() 49 | trigger.scheduleAtFixedRate(new TimerTask { 50 | def run() = doGet() 51 | }, 0, interval * 1000) 52 | 53 | logInfo("Http Receiver initiated") 54 | } 55 | 56 | def doGet() { 57 | logInfo("Fetching data from Http source") 58 | val response = httpClient.execute(new HttpGet(url)) 59 | try { 60 | val content = EntityUtils.toString(response.getEntity()) 61 | store(content) 62 | } catch { 63 | case e: Exception => restart("Error! Problems while connecting", e) 64 | } finally { 65 | response.close() 66 | } 67 | 68 | } 69 | 70 | } 71 | 72 | object HttpUtils { 73 | def createStream( 74 | ssc: StreamingContext, 75 | storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2, 76 | url: String, 77 | interval: Long): DStream[String] = { 78 | new HttpInputDStream(ssc, storageLevel, url, interval) 79 | } 80 | 81 | def createStream( 82 | jssc: JavaStreamingContext, 83 | storageLevel: StorageLevel, 84 | url: String, 85 | interval: Long): JavaDStream[String] = { 86 | implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[String]] 87 | createStream(jssc.ssc, storageLevel, url, interval) 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /Chap6/src/main/scala/org/apress/prospark/L6-10LazyStatic.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import java.nio.charset.StandardCharsets 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.streaming.Seconds 7 | import org.apache.spark.streaming.StreamingContext 8 | import org.eclipse.paho.client.mqttv3.MqttClient 9 | import org.eclipse.paho.client.mqttv3.MqttMessage 10 | import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence 11 | import org.json4s.DefaultFormats 12 | import org.json4s.JField 13 | import org.json4s.JsonAST.JObject 14 | import org.json4s.jvalue2extractable 15 | import org.json4s.jvalue2monadic 16 | import org.json4s.native.JsonMethods.parse 17 | import org.json4s.string2JsonInput 18 | import org.apache.commons.pool2.PooledObject 19 | import org.apache.commons.pool2.BasePooledObjectFactory 20 | import org.apache.commons.pool2.impl.DefaultPooledObject 21 | import org.apache.commons.pool2.impl.GenericObjectPool 22 | import org.apache.commons.pool2.ObjectPool 23 | 24 | object MqttSinkAppE { 25 | 26 | def main(args: Array[String]) { 27 | if (args.length != 3) { 28 | System.err.println( 29 | "Usage: MqttSinkApp ") 30 | System.exit(1) 31 | } 32 | 33 | val Seq(appName, outputBrokerUrl, topic) = args.toSeq 34 | 35 | val conf = new SparkConf() 36 | .setAppName(appName) 37 | 
.setJars(SparkContext.jarOfClass(this.getClass).toSeq) 38 | 39 | val batchInterval = 10 40 | 41 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 42 | 43 | HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 44 | interval = batchInterval) 45 | .flatMap(rec => { 46 | val query = parse(rec) \ "query" 47 | ((query \ "results" \ "quote").children).map(rec => JObject(JField("Timestamp", query \ "created")).merge(rec)) 48 | }) 49 | .map(rec => { 50 | implicit val formats = DefaultFormats 51 | rec.children.map(f => f.extract[String]) mkString "," 52 | }) 53 | .foreachRDD { rdd => 54 | rdd.foreachPartition { par => 55 | val mqttSink = MqttSinkPool().borrowObject() 56 | par.foreach(message => mqttSink.publish(topic, new MqttMessage(message.getBytes(StandardCharsets.UTF_8)))) 57 | MqttSinkPool().returnObject(mqttSink) 58 | } 59 | } 60 | 61 | ssc.start() 62 | ssc.awaitTermination() 63 | } 64 | } 65 | 66 | object MqttSinkPool { 67 | val poolSize = 8 68 | val brokerUrl = "tcp://localhost:1883" 69 | val mqttPool = new GenericObjectPool[MqttClient](new MqttClientFactory(brokerUrl)) 70 | mqttPool.setMaxTotal(poolSize) 71 | sys.addShutdownHook { 72 | mqttPool.close() 73 | } 74 | 75 | def apply(): GenericObjectPool[MqttClient] = { 76 | mqttPool 77 | } 78 | } 79 | 80 | class MqttClientFactory(brokerUrl: String) extends BasePooledObjectFactory[MqttClient] { 81 | override def create() = { 82 | val client = new MqttClient(brokerUrl, MqttClient.generateClientId(), new MemoryPersistence()) 83 | client.connect() 84 | client 85 | } 86 | override def wrap(client: MqttClient) = new DefaultPooledObject[MqttClient](client) 87 | override def validateObject(pObj: PooledObject[MqttClient]) = pObj.getObject.isConnected() 88 | override def destroyObject(pObj: PooledObject[MqttClient]) = { 89 | pObj.getObject.disconnect() 90 | pObj.getObject.close() 91 | } 92 | override def passivateObject(pObj: PooledObject[MqttClient]) = {} 93 | } 94 | -------------------------------------------------------------------------------- /Chap6/src/main/scala/org/apress/prospark/L6-12StaticPool.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import java.nio.charset.StandardCharsets 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | import org.eclipse.paho.client.mqttv3.MqttClient 10 | import org.eclipse.paho.client.mqttv3.MqttMessage 11 | import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence 12 | import org.json4s.DefaultFormats 13 | import org.json4s.JField 14 | import org.json4s.JsonAST.JObject 15 | import org.json4s.jvalue2extractable 16 | import org.json4s.jvalue2monadic 17 | import org.json4s.native.JsonMethods.parse 18 | import org.json4s.string2JsonInput 19 | 20 | object MqttSinkAppF { 21 | 22 | def main(args: Array[String]) { 23 | if (args.length != 3) { 24 | System.err.println( 25 | "Usage: MqttSinkApp ") 26 | System.exit(1) 27 | } 28 | 29 | val Seq(appName, outputBrokerUrl, topic) = args.toSeq 30 | 31 | val conf = new SparkConf() 32 | .setAppName(appName) 33 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 34 | 35 | val batchInterval = 10 36 | 37 | val ssc = new 
StreamingContext(conf, Seconds(batchInterval)) 38 | 39 | val mqttSink = ssc.sparkContext.broadcast(MqttSinkLazy(outputBrokerUrl)) 40 | 41 | HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 42 | interval = batchInterval) 43 | .flatMap(rec => { 44 | val query = parse(rec) \ "query" 45 | ((query \ "results" \ "quote").children).map(rec => JObject(JField("Timestamp", query \ "created")).merge(rec)) 46 | }) 47 | .map(rec => { 48 | implicit val formats = DefaultFormats 49 | rec.children.map(f => f.extract[String]) mkString "," 50 | }) 51 | .foreachRDD { rdd => 52 | rdd.foreachPartition { par => 53 | par.foreach(message => mqttSink.value.client.publish(topic, new MqttMessage(message.getBytes(StandardCharsets.UTF_8)))) 54 | } 55 | } 56 | 57 | ssc.start() 58 | ssc.awaitTermination() 59 | } 60 | 61 | } 62 | 63 | class MqttSinkLazy(brokerUrl: String) extends Serializable { 64 | lazy val client = { 65 | val client = new MqttClient(brokerUrl, MqttClient.generateClientId(), new MemoryPersistence()) 66 | client.connect() 67 | sys.addShutdownHook { 68 | client.disconnect() 69 | client.close() 70 | } 71 | client 72 | } 73 | } 74 | 75 | object MqttSinkLazy { 76 | val brokerUrl = "tcp://localhost:1883" 77 | val client = new MqttSinkLazy(brokerUrl) 78 | 79 | def apply(brokerUrl: String): MqttSinkLazy = { 80 | client 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /Chap6/src/main/scala/org/apress/prospark/L6-14HBase.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.hadoop.conf.Configuration 4 | import org.apache.hadoop.hbase.HBaseConfiguration 5 | import org.apache.hadoop.hbase.client.Put 6 | import org.apache.hadoop.hbase.mapreduce.TableOutputFormat 7 | import org.apache.hadoop.hbase.util.Bytes 8 | import org.apache.hadoop.io.Text 9 | import org.apache.spark.SparkConf 10 | import org.apache.spark.SparkContext 11 | import org.apache.spark.rdd.RDD.rddToPairRDDFunctions 12 | import org.apache.spark.streaming.Seconds 13 | import org.apache.spark.streaming.StreamingContext 14 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 15 | import org.json4s.DefaultFormats 16 | import org.json4s.jvalue2extractable 17 | import org.json4s.jvalue2monadic 18 | import org.json4s.native.JsonMethods.parse 19 | import org.json4s.string2JsonInput 20 | 21 | object HBaseSinkApp { 22 | 23 | def main(args: Array[String]) { 24 | if (args.length != 5) { 25 | System.err.println( 26 | "Usage: HBaseSinkApp ") 27 | System.exit(1) 28 | } 29 | 30 | val Seq(appName, hbaseMaster, tableName, columnFamilyName, columnName) = args.toSeq 31 | 32 | val conf = new SparkConf() 33 | .setAppName(appName) 34 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 35 | 36 | val batchInterval = 10 37 | val windowSize = 20 38 | val slideInterval = 10 39 | 40 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 41 | 42 | HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 43 | interval = batchInterval) 44 | 
.flatMap(rec => { 45 | implicit val formats = DefaultFormats 46 | val query = parse(rec) \ "query" 47 | ((query \ "results" \ "quote").children) 48 | .map(rec => ((rec \ "symbol").extract[String], (rec \ "LastTradePriceOnly").extract[String].toFloat)) 49 | }) 50 | .reduceByKeyAndWindow((x: Float, y: Float) => (x + y), Seconds(windowSize), Seconds(slideInterval)) 51 | .foreachRDD(rdd => { 52 | val hbaseConf = HBaseConfiguration.create() 53 | hbaseConf.set(TableOutputFormat.OUTPUT_TABLE, tableName) 54 | hbaseConf.set("hbase.master", hbaseMaster) 55 | val jobConf = new Configuration(hbaseConf) 56 | jobConf.set("mapreduce.job.outputformat.class", classOf[TableOutputFormat[Text]].getName) 57 | rdd.map(rec => { 58 | val put = new Put(rec._1.getBytes) 59 | put.addColumn(columnFamilyName.getBytes, columnName.getBytes, Bytes.toBytes(rec._2 / (windowSize / batchInterval))) 60 | (rec._1, put) 61 | }).saveAsNewAPIHadoopDataset(jobConf) 62 | }) 63 | 64 | ssc.start() 65 | ssc.awaitTermination() 66 | } 67 | } 68 | 69 | -------------------------------------------------------------------------------- /Chap6/src/main/scala/org/apress/prospark/L6-16SparkHBase.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.hadoop.hbase.HBaseConfiguration 4 | import org.apache.hadoop.hbase.TableName 5 | import org.apache.hadoop.hbase.client.Put 6 | import org.apache.hadoop.hbase.spark.HBaseContext 7 | import org.apache.hadoop.hbase.util.Bytes 8 | import org.apache.spark.SparkConf 9 | import org.apache.spark.SparkContext 10 | import org.apache.spark.streaming.Seconds 11 | import org.apache.spark.streaming.StreamingContext 12 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 13 | import org.json4s.DefaultFormats 14 | import org.json4s.jvalue2extractable 15 | import org.json4s.jvalue2monadic 16 | import org.json4s.native.JsonMethods.parse 17 | import org.json4s.string2JsonInput 18 | 19 | object SparkHBaseBulkPutApp { 20 | 21 | def main(args: Array[String]) { 22 | if (args.length != 4) { 23 | System.err.println( 24 | "Usage: SparkHBaseBulkPutApp ") 25 | System.exit(1) 26 | } 27 | 28 | val Seq(appName, tableName, columnFamilyName, columnName) = args.toSeq 29 | 30 | val conf = new SparkConf() 31 | .setAppName(appName) 32 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 33 | 34 | val batchInterval = 10 35 | val windowSize = 20 36 | val slideInterval = 10 37 | 38 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 39 | 40 | val hbaseConf = HBaseConfiguration.create() 41 | val hContext = new HBaseContext(ssc.sparkContext, hbaseConf) 42 | 43 | val windowed = HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 44 | interval = batchInterval) 45 | .flatMap(rec => { 46 | implicit val formats = DefaultFormats 47 | val query = parse(rec) \ "query" 48 | ((query \ "results" \ "quote").children) 49 | .map(rec => ((rec \ "symbol").extract[String], (rec \ "LastTradePriceOnly").extract[String].toFloat)) 50 | }) 51 | .reduceByKeyAndWindow((x: Float, y: Float) => (x + y), Seconds(windowSize), Seconds(slideInterval)) 52 | 53 | hContext.streamBulkPut[(String, Float)](windowed, TableName.valueOf(tableName), rec => { 54 | val put = new Put(rec._1.getBytes) 55 | 
put.addColumn(columnFamilyName.getBytes, columnName.getBytes, Bytes.toBytes(rec._2 / (windowSize / batchInterval))) 56 | put 57 | }) 58 | 59 | ssc.start() 60 | ssc.awaitTermination() 61 | } 62 | } 63 | 64 | -------------------------------------------------------------------------------- /Chap6/src/main/scala/org/apress/prospark/L6-18Cassandra.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import java.nio.charset.StandardCharsets 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.streaming.Seconds 7 | import org.apache.spark.streaming.StreamingContext 8 | import org.json4s.DefaultFormats 9 | import org.json4s.JField 10 | import org.json4s.JsonAST.JObject 11 | import org.json4s.jvalue2extractable 12 | import org.json4s.jvalue2monadic 13 | import org.json4s.native.JsonMethods.parse 14 | import org.json4s.string2JsonInput 15 | import org.apache.hadoop.conf.Configuration 16 | import org.apache.hadoop.io.Text 17 | import java.nio.ByteBuffer 18 | import org.apache.cassandra.hadoop.ColumnFamilyOutputFormat 19 | import org.apache.cassandra.hadoop.ConfigHelper 20 | import org.apache.cassandra.thrift.ColumnOrSuperColumn 21 | import org.apache.cassandra.thrift.Column 22 | import org.apache.cassandra.utils.ByteBufferUtil 23 | import org.apache.cassandra.thrift.Mutation 24 | import java.util.Arrays 25 | 26 | object CassandraSinkApp { 27 | 28 | def main(args: Array[String]) { 29 | if (args.length != 6) { 30 | System.err.println( 31 | "Usage: CassandraSinkApp ") 32 | System.exit(1) 33 | } 34 | 35 | val Seq(appName, cassandraHost, cassandraPort, keyspace, columnFamilyName, columnName) = args.toSeq 36 | 37 | val conf = new SparkConf() 38 | .setAppName(appName) 39 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 40 | 41 | val batchInterval = 10 42 | val windowSize = 20 43 | val slideInterval = 10 44 | 45 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 46 | 47 | HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 48 | interval = batchInterval) 49 | .flatMap(rec => { 50 | implicit val formats = DefaultFormats 51 | val query = parse(rec) \ "query" 52 | ((query \ "results" \ "quote").children) 53 | .map(rec => ((rec \ "symbol").extract[String], (rec \ "LastTradePriceOnly").extract[String].toFloat)) 54 | }) 55 | .reduceByKeyAndWindow((x: Float, y: Float) => (x + y), Seconds(windowSize), Seconds(slideInterval)) 56 | .foreachRDD(rdd => { 57 | val jobConf = new Configuration() 58 | ConfigHelper.setOutputRpcPort(jobConf, cassandraPort) 59 | ConfigHelper.setOutputInitialAddress(jobConf, cassandraHost) 60 | ConfigHelper.setOutputColumnFamily(jobConf, keyspace, columnFamilyName) 61 | ConfigHelper.setOutputPartitioner(jobConf, "Murmur3Partitioner") 62 | rdd.map(rec => { 63 | val c = new Column() 64 | c.setName(ByteBufferUtil.bytes(columnName)) 65 | c.setValue(ByteBufferUtil.bytes(rec._2 / (windowSize / batchInterval))) 66 | c.setTimestamp(System.currentTimeMillis) 67 | val m = new Mutation() 68 | m.setColumn_or_supercolumn(new ColumnOrSuperColumn()) 69 | m.column_or_supercolumn.setColumn(c) 70 | (ByteBufferUtil.bytes(rec._1), Arrays.asList(m)) 71 | }).saveAsNewAPIHadoopFile(keyspace, classOf[ByteBuffer], classOf[List[Mutation]], 
classOf[ColumnFamilyOutputFormat], jobConf) 72 | }) 73 | 74 | ssc.start() 75 | ssc.awaitTermination() 76 | } 77 | } 78 | 79 | -------------------------------------------------------------------------------- /Chap6/src/main/scala/org/apress/prospark/L6-20CassandraConnector.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 10 | import org.json4s.DefaultFormats 11 | import org.json4s.jvalue2extractable 12 | import org.json4s.jvalue2monadic 13 | import org.json4s.native.JsonMethods.parse 14 | import org.json4s.string2JsonInput 15 | 16 | import com.datastax.spark.connector.SomeColumns 17 | import com.datastax.spark.connector.cql.CassandraConnector 18 | import com.datastax.spark.connector.streaming.toDStreamFunctions 19 | import com.datastax.spark.connector.toNamedColumnRef 20 | 21 | object CassandraConnectorSinkApp { 22 | 23 | def main(args: Array[String]) { 24 | if (args.length != 6) { 25 | System.err.println( 26 | "Usage: CassandraConnectorSinkApp ") 27 | System.exit(1) 28 | } 29 | 30 | val Seq(appName, cassandraHost, cassandraPort, keyspace, tableName, columnName) = args.toSeq 31 | 32 | val conf = new SparkConf() 33 | .setAppName(appName) 34 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 35 | .set("spark.cassandra.connection.host", cassandraHost) 36 | .set("spark.cassandra.connection.port", cassandraPort) 37 | 38 | val batchInterval = 10 39 | val windowSize = 20 40 | val slideInterval = 10 41 | 42 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 43 | 44 | CassandraConnector(conf).withSessionDo { session => 45 | session.execute(s"CREATE KEYSPACE IF NOT EXISTS %s WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1 }".format(keyspace)) 46 | session.execute(s"CREATE TABLE IF NOT EXISTS %s.%s (key TEXT PRIMARY KEY, %s FLOAT)".format(keyspace, tableName, columnName)) 47 | } 48 | 49 | HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 50 | interval = batchInterval) 51 | .flatMap(rec => { 52 | implicit val formats = DefaultFormats 53 | val query = parse(rec) \ "query" 54 | ((query \ "results" \ "quote").children) 55 | .map(rec => ((rec \ "symbol").extract[String], (rec \ "LastTradePriceOnly").extract[String].toFloat)) 56 | }) 57 | .reduceByKeyAndWindow((x: Float, y: Float) => (x + y), Seconds(windowSize), Seconds(slideInterval)) 58 | .map(stock => (stock._1, stock._2 / (windowSize / batchInterval))) 59 | .saveToCassandra(keyspace, tableName) 60 | 61 | ssc.start() 62 | ssc.awaitTermination() 63 | } 64 | } 65 | 66 | -------------------------------------------------------------------------------- /Chap6/src/main/scala/org/apress/prospark/L6-22Counters.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import java.util.concurrent.atomic.AtomicLong 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.streaming.Seconds 8 | import 
org.apache.spark.streaming.StreamingContext 9 | import org.json4s.DefaultFormats 10 | import org.json4s.jvalue2extractable 11 | import org.json4s.jvalue2monadic 12 | import org.json4s.native.JsonMethods.parse 13 | import org.json4s.string2JsonInput 14 | 15 | object StatefulCountersApp { 16 | 17 | def main(args: Array[String]) { 18 | if (args.length != 1) { 19 | System.err.println( 20 | "Usage: StatefulCountersApp ") 21 | System.exit(1) 22 | } 23 | 24 | val Seq(appName) = args.toSeq 25 | 26 | val conf = new SparkConf() 27 | .setAppName(appName) 28 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 29 | 30 | val batchInterval = 10 31 | 32 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 33 | 34 | var globalMax: AtomicLong = new AtomicLong(Long.MinValue) 35 | var globalMin: AtomicLong = new AtomicLong(Long.MaxValue) 36 | var globalCounter500: AtomicLong = new AtomicLong(0) 37 | 38 | HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 39 | interval = batchInterval) 40 | .flatMap(rec => { 41 | implicit val formats = DefaultFormats 42 | val query = parse(rec) \ "query" 43 | ((query \ "results" \ "quote").children) 44 | .map(rec => ((rec \ "symbol").extract[String], (rec \ "LastTradePriceOnly").extract[String].toFloat, (rec \ "Volume").extract[String].toLong)) 45 | }) 46 | .foreachRDD(rdd => { 47 | val stocks = rdd.take(10) 48 | stocks.foreach(stock => { 49 | val price = stock._2 50 | val volume = stock._3 51 | if (volume > globalMax.get()) { 52 | globalMax.set(volume) 53 | } 54 | if (volume < globalMin.get()) { 55 | globalMin.set(volume) 56 | } 57 | if (price > 500) { 58 | globalCounter500.incrementAndGet() 59 | } 60 | }) 61 | if (globalCounter500.get() > 1000L) { 62 | println("Global counter has reached 1000") 63 | println("Max ----> " + globalMax.get) 64 | println("Min ----> " + globalMin.get) 65 | globalCounter500.set(0) 66 | } 67 | }) 68 | 69 | ssc.start() 70 | ssc.awaitTermination() 71 | } 72 | } 73 | 74 | -------------------------------------------------------------------------------- /Chap6/src/main/scala/org/apress/prospark/L6-23UpdateState.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.streaming.Seconds 6 | import org.apache.spark.streaming.StreamingContext 7 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 8 | import org.json4s.DefaultFormats 9 | import org.json4s.jvalue2extractable 10 | import org.json4s.jvalue2monadic 11 | import org.json4s.native.JsonMethods.parse 12 | import org.json4s.string2JsonInput 13 | 14 | object StatefulUpdateStateApp { 15 | 16 | def main(args: Array[String]) { 17 | if (args.length != 2) { 18 | System.err.println( 19 | "Usage: StatefulUpdateStateApp ") 20 | System.exit(1) 21 | } 22 | 23 | val Seq(appName, checkpointDir) = args.toSeq 24 | 25 | val conf = new SparkConf() 26 | .setAppName(appName) 27 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 28 | 29 | val batchInterval = 10 30 | 31 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 32 | ssc.checkpoint(checkpointDir) 33 | 34 | HttpUtils.createStream(ssc, url = 
"https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 35 | interval = batchInterval) 36 | .flatMap(rec => { 37 | implicit val formats = DefaultFormats 38 | val query = parse(rec) \ "query" 39 | ((query \ "results" \ "quote").children) 40 | .map(rec => ((rec \ "symbol").extract[String], ((rec \ "LastTradePriceOnly").extract[String].toFloat, (rec \ "Volume").extract[String].toLong))) 41 | }) 42 | .updateStateByKey(updateState) 43 | .print() 44 | 45 | def updateState(values: Seq[(Float, Long)], state: Option[(Long, Long, Long)]): Option[(Long, Long, Long)] = { 46 | val volumes = values.map(s => s._2) 47 | val localMin = volumes.min 48 | val localMax = volumes.max 49 | val localCount500 = values.map(s => s._1).count(price => price > 500) 50 | val globalValues = state.getOrElse((Long.MaxValue, Long.MinValue, 0L)).asInstanceOf[(Long, Long, Long)] 51 | val newMin = if (localMin < globalValues._1) localMin else globalValues._1 52 | val newMax = if (localMax > globalValues._2) localMax else globalValues._2 53 | val newCount500 = globalValues._3 + localCount500 54 | return Some(newMin, newMax, newCount500) 55 | } 56 | 57 | ssc.start() 58 | ssc.awaitTermination() 59 | } 60 | } 61 | 62 | -------------------------------------------------------------------------------- /Chap6/src/main/scala/org/apress/prospark/L6-24Accumulators.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.collection.mutable 4 | 5 | import org.apache.spark.AccumulableParam 6 | import org.apache.spark.SparkConf 7 | import org.apache.spark.SparkContext 8 | import org.apache.spark.streaming.Seconds 9 | import org.apache.spark.streaming.StreamingContext 10 | import org.json4s.DefaultFormats 11 | import org.json4s.jvalue2extractable 12 | import org.json4s.jvalue2monadic 13 | import org.json4s.native.JsonMethods.parse 14 | import org.json4s.string2JsonInput 15 | 16 | object StatefulAccumulatorsApp { 17 | 18 | object StockAccum extends AccumulableParam[mutable.HashMap[String, (Long, Long, Long)], (String, (Float, Long))] { 19 | def zero(t: mutable.HashMap[String, (Long, Long, Long)]): mutable.HashMap[String, (Long, Long, Long)] = { 20 | new mutable.HashMap[String, (Long, Long, Long)]() 21 | } 22 | def addInPlace(t1: mutable.HashMap[String, (Long, Long, Long)], t2: mutable.HashMap[String, (Long, Long, Long)]): mutable.HashMap[String, (Long, Long, Long)] = { 23 | t1 ++ t2.map { 24 | case (k, v2) => (k -> { 25 | val v1 = t1.getOrElse(k, (Long.MaxValue, Long.MinValue, 0L)) 26 | val newMin = if (v2._1 < v1._1) v2._1 else v1._1 27 | val newMax = if (v2._2 > v1._2) v2._2 else v1._2 28 | (newMin, newMax, v1._3 + v2._3) 29 | }) 30 | } 31 | } 32 | def addAccumulator(t1: mutable.HashMap[String, (Long, Long, Long)], t2: (String, (Float, Long))): mutable.HashMap[String, (Long, Long, Long)] = { 33 | val prevStats = t1.getOrElse(t2._1, (Long.MaxValue, Long.MinValue, 0L)) 34 | val newVals = t2._2 35 | var newCount = prevStats._3 36 | if (newVals._1 > 500.0) { 37 | newCount += 1 38 | } 39 | val newMin = if (newVals._2 < prevStats._1) newVals._2 else prevStats._1 40 | val newMax = if (newVals._2 > prevStats._2) newVals._2 else prevStats._2 41 | t1 += t2._1 -> (newMin, newMax, newCount) 42 | } 43 | } 44 | 45 | def main(args: Array[String]) { 46 | if (args.length != 2) { 47 
| System.err.println( 48 | "Usage: StatefulAccumulatorsApp ") 49 | System.exit(1) 50 | } 51 | 52 | val Seq(appName, checkpointDir) = args.toSeq 53 | 54 | val conf = new SparkConf() 55 | .setAppName(appName) 56 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 57 | 58 | val batchInterval = 10 59 | 60 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 61 | 62 | val stateAccum = ssc.sparkContext.accumulable(new mutable.HashMap[String, (Long, Long, Long)]())(StockAccum) 63 | 64 | HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 65 | interval = batchInterval) 66 | .flatMap(rec => { 67 | implicit val formats = DefaultFormats 68 | val query = parse(rec) \ "query" 69 | ((query \ "results" \ "quote").children) 70 | .map(rec => ((rec \ "symbol").extract[String], ((rec \ "LastTradePriceOnly").extract[String].toFloat, (rec \ "Volume").extract[String].toLong))) 71 | }) 72 | .foreachRDD(rdd => { 73 | rdd.foreach({ stock => 74 | stateAccum += (stock._1, (stock._2._1, stock._2._2)) 75 | }) 76 | for ((sym, stats) <- stateAccum.value.to) printf("Symbol: %s, Stats: %s\n", sym, stats) 77 | }) 78 | 79 | ssc.start() 80 | ssc.awaitTermination() 81 | } 82 | } 83 | 84 | -------------------------------------------------------------------------------- /Chap6/src/main/scala/org/apress/prospark/L6-26Redis.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.collection.JavaConversions.asScalaBuffer 4 | import scala.collection.JavaConversions.mutableMapAsJavaMap 5 | import scala.collection.mutable 6 | 7 | import org.apache.spark.SparkConf 8 | import org.apache.spark.SparkContext 9 | import org.apache.spark.streaming.Seconds 10 | import org.apache.spark.streaming.StreamingContext 11 | import org.json4s.DefaultFormats 12 | import org.json4s.jvalue2extractable 13 | import org.json4s.jvalue2monadic 14 | import org.json4s.native.JsonMethods.parse 15 | import org.json4s.string2JsonInput 16 | 17 | import redis.clients.jedis.Jedis 18 | 19 | object StatefulRedisApp { 20 | 21 | def main(args: Array[String]) { 22 | if (args.length != 3) { 23 | System.err.println( 24 | "Usage: StatefulRedisApp ") 25 | System.exit(1) 26 | } 27 | 28 | val Seq(appName, checkpointDir, hostname) = args.toSeq 29 | 30 | val conf = new SparkConf() 31 | .setAppName(appName) 32 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 33 | 34 | val batchInterval = 10 35 | 36 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 37 | 38 | HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 39 | interval = batchInterval) 40 | .flatMap(rec => { 41 | implicit val formats = DefaultFormats 42 | val query = parse(rec) \ "query" 43 | ((query \ "results" \ "quote").children) 44 | .map(rec => ((rec \ "symbol").extract[String], ((rec \ "LastTradePriceOnly").extract[String].toFloat, (rec \ "Volume").extract[String].toLong))) 45 | }) 46 | .foreachRDD(rdd => { 47 | rdd.foreachPartition({ part => 48 | val jedis = new Jedis(hostname) 49 | part.foreach(f => { 50 | val prev = jedis.hmget(f._1, 
"min", "max", "count") 51 | if (prev(0) == null) { 52 | jedis.hmset(f._1, mutable.HashMap("min" -> Long.MaxValue.toString, "max" -> Long.MinValue.toString, "count" -> 0.toString)) 53 | } else { 54 | val prevLong = prev.toList.map(v => v.toLong) 55 | var newCount = prevLong(2) 56 | val newPrice = f._2._1 57 | val newVolume = f._2._2 58 | if (newPrice > 500.0) { 59 | newCount += 1 60 | } 61 | val newMin = if (newVolume < prevLong(0)) newVolume else prevLong(0) 62 | val newMax = if (newVolume > prevLong(1)) newVolume else prevLong(1) 63 | jedis.hmset(f._1, mutable.HashMap("min" -> newMin.toString, "max" -> newMax.toString, "count" -> newCount.toString)) 64 | } 65 | }) 66 | jedis.close() 67 | }) 68 | 69 | val jedis = new Jedis(hostname) 70 | jedis.scan(0).getResult.foreach(sym => println("Symbol: %s, Stats: %s".format(sym, jedis.hmget(sym, "min", "max", "count").toString))) 71 | jedis.close() 72 | }) 73 | 74 | ssc.start() 75 | ssc.awaitTermination() 76 | } 77 | } 78 | 79 | -------------------------------------------------------------------------------- /Chap6/src/main/scala/org/apress/prospark/L6-5Exception.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import java.nio.charset.StandardCharsets 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | import org.eclipse.paho.client.mqttv3.MqttClient 10 | import org.eclipse.paho.client.mqttv3.MqttMessage 11 | import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence 12 | import org.json4s.DefaultFormats 13 | import org.json4s.JField 14 | import org.json4s.JsonAST.JObject 15 | import org.json4s.jvalue2extractable 16 | import org.json4s.jvalue2monadic 17 | import org.json4s.native.JsonMethods.parse 18 | import org.json4s.string2JsonInput 19 | 20 | object MqttSinkAppA { 21 | 22 | def main(args: Array[String]) { 23 | if (args.length != 3) { 24 | System.err.println( 25 | "Usage: MqttSinkApp ") 26 | System.exit(1) 27 | } 28 | 29 | val Seq(appName, outputBrokerUrl, topic) = args.toSeq 30 | 31 | val conf = new SparkConf() 32 | .setAppName(appName) 33 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 34 | 35 | val batchInterval = 10 36 | 37 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 38 | 39 | HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 40 | interval = batchInterval) 41 | .flatMap(rec => { 42 | val query = parse(rec) \ "query" 43 | ((query \ "results" \ "quote").children).map(rec => JObject(JField("Timestamp", query \ "created")).merge(rec)) 44 | }) 45 | .map(rec => { 46 | implicit val formats = DefaultFormats 47 | rec.children.map(f => f.extract[String]) mkString "," 48 | }) 49 | .foreachRDD { rdd => 50 | val client = new MqttClient(outputBrokerUrl, MqttClient.generateClientId(), new MemoryPersistence()) 51 | client.connect() 52 | rdd.foreach(rec => client.publish(topic, new MqttMessage(rec.getBytes(StandardCharsets.UTF_8)))) 53 | client.disconnect() 54 | client.close() 55 | } 56 | 57 | ssc.start() 58 | ssc.awaitTermination() 59 | } 60 | 61 | } 62 | -------------------------------------------------------------------------------- 
/Chap6/src/main/scala/org/apress/prospark/L6-6PerRecord.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import java.nio.charset.StandardCharsets 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | import org.eclipse.paho.client.mqttv3.MqttClient 10 | import org.eclipse.paho.client.mqttv3.MqttMessage 11 | import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence 12 | import org.json4s.DefaultFormats 13 | import org.json4s.JField 14 | import org.json4s.JsonAST.JObject 15 | import org.json4s.jvalue2extractable 16 | import org.json4s.jvalue2monadic 17 | import org.json4s.native.JsonMethods.parse 18 | import org.json4s.string2JsonInput 19 | 20 | object MqttSinkAppB { 21 | 22 | def main(args: Array[String]) { 23 | if (args.length != 3) { 24 | System.err.println( 25 | "Usage: MqttSinkApp ") 26 | System.exit(1) 27 | } 28 | 29 | val Seq(appName, outputBrokerUrl, topic) = args.toSeq 30 | 31 | val conf = new SparkConf() 32 | .setAppName(appName) 33 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 34 | 35 | val batchInterval = 10 36 | 37 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 38 | 39 | HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 40 | interval = batchInterval) 41 | .flatMap(rec => { 42 | val query = parse(rec) \ "query" 43 | ((query \ "results" \ "quote").children).map(rec => JObject(JField("Timestamp", query \ "created")).merge(rec)) 44 | }) 45 | .map(rec => { 46 | implicit val formats = DefaultFormats 47 | rec.children.map(f => f.extract[String]) mkString "," 48 | }) 49 | .foreachRDD { rdd => 50 | rdd.foreach { rec => 51 | { 52 | val client = new MqttClient(outputBrokerUrl, MqttClient.generateClientId(), new MemoryPersistence()) 53 | client.connect() 54 | client.publish(topic, new MqttMessage(rec.getBytes(StandardCharsets.UTF_8))) 55 | client.disconnect() 56 | client.close() 57 | } 58 | } 59 | } 60 | 61 | ssc.start() 62 | ssc.awaitTermination() 63 | } 64 | 65 | } 66 | -------------------------------------------------------------------------------- /Chap6/src/main/scala/org/apress/prospark/L6-7PerPartition.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import java.nio.charset.StandardCharsets 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | import org.eclipse.paho.client.mqttv3.MqttClient 10 | import org.eclipse.paho.client.mqttv3.MqttMessage 11 | import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence 12 | import org.json4s.DefaultFormats 13 | import org.json4s.JField 14 | import org.json4s.JsonAST.JObject 15 | import org.json4s.jvalue2extractable 16 | import org.json4s.jvalue2monadic 17 | import org.json4s.native.JsonMethods.parse 18 | import org.json4s.string2JsonInput 19 | 20 | object MqttSinkAppC { 21 | 22 | def main(args: Array[String]) { 23 | if (args.length != 3) { 24 | System.err.println( 25 | "Usage: MqttSinkApp ") 26 | System.exit(1) 27 | } 28 | 29 | val Seq(appName, 
outputBrokerUrl, topic) = args.toSeq 30 | 31 | val conf = new SparkConf() 32 | .setAppName(appName) 33 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 34 | 35 | val batchInterval = 10 36 | 37 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 38 | 39 | HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 40 | interval = batchInterval) 41 | .flatMap(rec => { 42 | val query = parse(rec) \ "query" 43 | ((query \ "results" \ "quote").children).map(rec => JObject(JField("Timestamp", query \ "created")).merge(rec)) 44 | }) 45 | .map(rec => { 46 | implicit val formats = DefaultFormats 47 | rec.children.map(f => f.extract[String]) mkString "," 48 | }) 49 | .foreachRDD { rdd => 50 | rdd.foreachPartition { par => 51 | val client = new MqttClient(outputBrokerUrl, MqttClient.generateClientId(), new MemoryPersistence()) 52 | client.connect() 53 | par.foreach(rec => client.publish(topic, new MqttMessage(rec.getBytes(StandardCharsets.UTF_8)))) 54 | client.disconnect() 55 | client.close() 56 | } 57 | } 58 | 59 | ssc.start() 60 | ssc.awaitTermination() 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /Chap6/src/main/scala/org/apress/prospark/L6-8Static.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import java.nio.charset.StandardCharsets 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | import org.eclipse.paho.client.mqttv3.MqttClient 10 | import org.eclipse.paho.client.mqttv3.MqttMessage 11 | import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence 12 | import org.json4s.DefaultFormats 13 | import org.json4s.JField 14 | import org.json4s.JsonAST.JObject 15 | import org.json4s.jvalue2extractable 16 | import org.json4s.jvalue2monadic 17 | import org.json4s.native.JsonMethods.parse 18 | import org.json4s.string2JsonInput 19 | 20 | object MqttSinkAppD { 21 | 22 | def main(args: Array[String]) { 23 | if (args.length != 3) { 24 | System.err.println( 25 | "Usage: MqttSinkApp ") 26 | System.exit(1) 27 | } 28 | 29 | val Seq(appName, outputBrokerUrl, topic) = args.toSeq 30 | 31 | val conf = new SparkConf() 32 | .setAppName(appName) 33 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 34 | 35 | val batchInterval = 10 36 | 37 | val ssc = new StreamingContext(conf, Seconds(batchInterval)) 38 | 39 | HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", 40 | interval = batchInterval) 41 | .flatMap(rec => { 42 | val query = parse(rec) \ "query" 43 | ((query \ "results" \ "quote").children).map(rec => JObject(JField("Timestamp", query \ "created")).merge(rec)) 44 | }) 45 | .map(rec => { 46 | implicit val formats = DefaultFormats 47 | rec.children.map(f => f.extract[String]) mkString "," 48 | }) 49 | .foreachRDD { rdd => 50 | rdd.foreachPartition { par => 51 | par.foreach(message => MqttSink().publish(topic, new 
MqttMessage(message.getBytes(StandardCharsets.UTF_8)))) 52 | } 53 | } 54 | 55 | ssc.start() 56 | ssc.awaitTermination() 57 | } 58 | } 59 | 60 | object MqttSink { 61 | val brokerUrl = "tcp://localhost:1883" 62 | val client = new MqttClient(brokerUrl, MqttClient.generateClientId(), new MemoryPersistence()) 63 | client.connect() 64 | sys.addShutdownHook { 65 | client.disconnect() 66 | client.close() 67 | } 68 | 69 | def apply(): MqttClient = { 70 | client 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /Chap7/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 2 | -------------------------------------------------------------------------------- /Chap7/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += Classpaths.typesafeResolver 2 | 3 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.4.0") 4 | -------------------------------------------------------------------------------- /Chap7/spark.sbt: -------------------------------------------------------------------------------- 1 | import AssemblyKeys._ 2 | 3 | assemblySettings 4 | 5 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { mergeStrategy => { 6 | case entry => { 7 | val strategy = mergeStrategy(entry) 8 | if (strategy == MergeStrategy.deduplicate) MergeStrategy.first 9 | else strategy 10 | } 11 | }} 12 | 13 | name := "Chap7" 14 | 15 | version := "1.0" 16 | 17 | scalaVersion := "2.10.5" 18 | 19 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.4.0" 20 | 21 | libraryDependencies += "org.apache.spark" %% "spark-streaming" % "1.4.0" 22 | -------------------------------------------------------------------------------- /Chap7/src/main/java/org/apress/prospark/AbstractDriver.java: -------------------------------------------------------------------------------- 1 | package org.apress.prospark; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileInputStream; 6 | import java.io.IOException; 7 | import java.io.InputStreamReader; 8 | import java.util.Enumeration; 9 | import java.util.zip.GZIPInputStream; 10 | import java.util.zip.ZipEntry; 11 | import java.util.zip.ZipFile; 12 | 13 | import org.apache.commons.io.FilenameUtils; 14 | import org.apache.log4j.LogManager; 15 | import org.apache.log4j.Logger; 16 | 17 | public abstract class AbstractDriver { 18 | 19 | private static final Logger LOG = LogManager.getLogger(AbstractDriver.class); 20 | 21 | private String path; 22 | 23 | public AbstractDriver(String path) { 24 | this.path = path; 25 | } 26 | 27 | public abstract void init() throws Exception; 28 | 29 | public abstract void close() throws Exception; 30 | 31 | public abstract void sendRecord(String record) throws Exception; 32 | 33 | public void execute() throws Exception { 34 | 35 | try { 36 | init(); 37 | File dirPath = new File(path); 38 | if (dirPath.isDirectory()) { 39 | File[] files = new File(path).listFiles(); 40 | for (File f : files) { 41 | String ext = FilenameUtils.getExtension(f.getPath()); 42 | if (ext.equals("zip")) { 43 | LOG.info(String.format("Feeding zipped file %s", f.getName())); 44 | ZipFile zFile = null; 45 | try { 46 | zFile = new ZipFile(f); 47 | Enumeration zEntries = zFile.entries(); 48 | 49 | while (zEntries.hasMoreElements()) { 50 | ZipEntry zEntry = zEntries.nextElement(); 51 | LOG.info(String.format("Feeding file 
%s", zEntry.getName())); 52 | try (BufferedReader br = new BufferedReader( 53 | new InputStreamReader(zFile.getInputStream(zEntry)))) { 54 | // skip header 55 | br.readLine(); 56 | String line; 57 | while ((line = br.readLine()) != null) { 58 | sendRecord(line); 59 | } 60 | } 61 | } 62 | } catch (IOException e) { 63 | LOG.error(e.getMessage()); 64 | } finally { 65 | if (zFile != null) { 66 | try { 67 | zFile.close(); 68 | } catch (IOException e) { 69 | LOG.error(e.getMessage()); 70 | } 71 | } 72 | } 73 | } else if (ext.equals("gz")) { 74 | LOG.info(String.format("Feeding file %s", f.getName())); 75 | try (BufferedReader br = new BufferedReader( 76 | new InputStreamReader(new GZIPInputStream(new FileInputStream(f))))) { 77 | // skip header 78 | br.readLine(); 79 | String line; 80 | while ((line = br.readLine()) != null) { 81 | sendRecord(line); 82 | } 83 | } 84 | } else { 85 | LOG.warn("Unsupported file type: " + f.getName()); 86 | } 87 | } 88 | } else { 89 | LOG.error(String.format("Path %s is not a directory", path)); 90 | } 91 | } finally { 92 | close(); 93 | } 94 | } 95 | } -------------------------------------------------------------------------------- /Chap7/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO, stdout 2 | log4j.rootCategory=INFO, stdout 3 | 4 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 5 | log4j.appender.stdout.Target=System.out 6 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 7 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 8 | -------------------------------------------------------------------------------- /Chap7/src/main/scala/org/apress/prospark/L7-2-3Tachyon.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions 6 | import org.apache.spark.storage.StorageLevel 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | import org.apache.spark.streaming.dstream.DStream 10 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 11 | 12 | object ReferrerApp { 13 | def main(args: Array[String]) { 14 | if (args.length != 7) { 15 | System.err.println( 16 | "Usage: ReferrerApp ") 17 | System.exit(1) 18 | } 19 | val Seq(appName, hostname, port, tachyonUrl, checkpointDir, outputPathTop, outputPathSpark) = args.toSeq 20 | 21 | val conf = new SparkConf() 22 | .setAppName(appName) 23 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 24 | .set("spark.externalBlockStore.url", tachyonUrl) 25 | 26 | val ssc = new StreamingContext(conf, Seconds(10)) 27 | ssc.checkpoint(checkpointDir) 28 | 29 | val clickstream = ssc.socketTextStream(hostname, port.toInt) 30 | .map(rec => rec.split("\\t")) 31 | .persist(StorageLevel.OFF_HEAP) 32 | 33 | val topRefStream = clickstream 34 | .map(rec => { 35 | var prev_title = rec(3) 36 | if (!prev_title.startsWith("other")) { 37 | prev_title = "wikipedia" 38 | } 39 | (prev_title, 1) 40 | }) 41 | 42 | val topSparkStream = clickstream 43 | .filter(rec => rec(4).equals("Apache_Spark")) 44 | .map(rec => (rec(3), 1)) 45 | 46 | saveTopKeys(topRefStream, outputPathTop) 47 | 48 | saveTopKeys(topSparkStream, outputPathSpark) 49 | 50 | ssc.start() 51 | ssc.awaitTermination() 52 | } 53 | 54 | def saveTopKeys(clickstream: 
DStream[(String, Int)], outputPath: String) { 55 | clickstream.updateStateByKey((values, state: Option[Int]) => Some(values.sum + state.getOrElse(0))) 56 | .repartition(1) 57 | .map(rec => (rec._2, rec._1)) 58 | .transform(rec => rec.sortByKey(ascending = false)) 59 | .saveAsTextFiles(outputPath) 60 | } 61 | 62 | } -------------------------------------------------------------------------------- /Chap7/src/main/scala/org/apress/prospark/L7-4UI.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import java.util.concurrent.atomic.AtomicLong 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.streaming.Seconds 9 | import org.apache.spark.streaming.StreamingContext 10 | 11 | object SocialSearchApp { 12 | def main(args: Array[String]) { 13 | if (args.length != 3) { 14 | System.err.println( 15 | "Usage: SocialSearchApp ") 16 | System.exit(1) 17 | } 18 | val Seq(appName, hostname, port) = args.toSeq 19 | 20 | val conf = new SparkConf() 21 | .setAppName(appName) 22 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 23 | //.set("spark.eventLog.enabled", "true") 24 | //.set("spark.eventLog.dir", "/tmp/historical") 25 | 26 | 27 | val countSearch = new AtomicLong(0) 28 | val countSocial = new AtomicLong(0) 29 | 30 | val ssc = new StreamingContext(conf, Seconds(1)) 31 | 32 | val titleStream = ssc.socketTextStream(hostname, port.toInt) 33 | .map(rec => rec.split("\\t")) 34 | .filter(_(3) match { 35 | case "other-google" | "other-bing" | "other-yahoo" | "other-facebook" | "other-twitter" => true 36 | case _ => false 37 | }) 38 | .map(rec => (rec(3), rec(4))) 39 | .cache() 40 | 41 | val searchStream = titleStream.filter(_._1 match { 42 | case "other-google" | "other-bing" | "other-yahoo" => true 43 | case _ => false 44 | }) 45 | .map(rec => rec._2) 46 | 47 | val socialStream = titleStream.filter(_._1 match { 48 | case "other-facebook" | "other-twitter" => true 49 | case _ => false 50 | }) 51 | .map(rec => rec._2) 52 | 53 | val exclusiveSearch = searchStream.transformWith(socialStream, 54 | (searchRDD: RDD[String], socialRDD: RDD[String]) => searchRDD.subtract(socialRDD)) 55 | .foreachRDD(rdd => { 56 | countSearch.addAndGet(rdd.count()) 57 | println("Exclusive count search engines: " + countSearch) 58 | }) 59 | 60 | val exclusiveSocial = socialStream.transformWith(searchStream, 61 | (socialRDD: RDD[String], searchRDD: RDD[String]) => socialRDD.subtract(searchRDD)) 62 | .foreachRDD(rdd => { 63 | countSocial.addAndGet(rdd.count()) 64 | println("Exclusive count social media: " + countSocial) 65 | }) 66 | 67 | ssc.start() 68 | ssc.awaitTermination() 69 | } 70 | 71 | } -------------------------------------------------------------------------------- /Chap8/L8-36CdrSparkRApp.R: -------------------------------------------------------------------------------- 1 | args <- commandArgs(trailingOnly = TRUE) 2 | if(length(args) != 2) { 3 | stop("Usage: CdrSparkRApp ") 4 | } 5 | library(SparkR) 6 | Sys.setenv('SPARKR_SUBMIT_ARGS'='"--packages" "com.databricks:spark-csv_2.10:1.3.0" "sparkr-shell"') 7 | sc <- sparkR.init(master = args[1]) 8 | sqlContext <- sparkRSQL.init(sc) 9 | df <- read.df(sqlContext, args[2], source = "com.databricks.spark.csv", inferSchema = "true", delimiter = "\t") 10 | cnames <- c("squareId", "timeInterval", "countryCode", "smsInActivity", "smsOutActivity", "callInActivity", "callOutActivity", "internetTrafficActivity") 11 | 
for (i in 1:NROW(cnames)) { 12 | df <- withColumnRenamed(df, paste0("C", i - 1), cnames[i]) 13 | } 14 | counts <- count(groupBy(df, "countryCode")) 15 | showDF(orderBy(counts, desc(counts$count)), numRows = 5) 16 | sparkR.stop() -------------------------------------------------------------------------------- /Chap8/L8-39CdrStreamingSparkRApp.R: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/Rscript 2 | args <- commandArgs(trailingOnly = TRUE) 3 | if(length(args) != 1) { 4 | stop("Usage: CdrStreamingSparkRApp ") 5 | } 6 | library(SparkR) 7 | sc <- sparkR.init(master = args[1]) 8 | hiveContext <- sparkRHive.init(sc) 9 | f <- file("stdin") 10 | open(f) 11 | while(length(tableName <- readLines(f, n = 1)) > 0) { 12 | tryCatch({ 13 | tableName <- trimws(tableName) 14 | write(paste0("Processing table: ", tableName), stderr()) 15 | df <- table(hiveContext, tableName) 16 | counts <- count(groupBy(df, "countryCode")) 17 | outputTable <- paste0(tableName, "processed") 18 | write(paste0("Output written to: ", outputTable), stderr()) 19 | saveAsTable(limit(orderBy(counts, desc(counts$count)), 5), outputTable, "parquet", "error") 20 | }, error = function(e) {stop(e)}) 21 | } 22 | close(f) 23 | sparkR.stop() -------------------------------------------------------------------------------- /Chap8/cdrschema.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "struct", 3 | "fields": [ 4 | { 5 | "name": "squareId", 6 | "nullable": false, 7 | "type": "integer" 8 | }, 9 | { 10 | "name": "timeInterval", 11 | "nullable": false, 12 | "type": "long" 13 | }, 14 | { 15 | "name": "countryCode", 16 | "nullable": true, 17 | "type": "string" 18 | }, 19 | { 20 | "name": "smsInActivity", 21 | "nullable": true, 22 | "type": "float" 23 | }, 24 | { 25 | "name": "smsOutActivity", 26 | "nullable": true, 27 | "type": "float" 28 | }, 29 | { 30 | "name": "callInActivity", 31 | "nullable": true, 32 | "type": "float" 33 | }, 34 | { 35 | "name": "callOutActivity", 36 | "nullable": true, 37 | "type": "float" 38 | }, 39 | { 40 | "name": "internetTrafficActivity", 41 | "nullable": true, 42 | "type": "float" 43 | } 44 | ] 45 | } 46 | -------------------------------------------------------------------------------- /Chap8/cdrschema2.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "struct", 3 | "fields": [ 4 | { 5 | "name": "squareId", 6 | "nullable": false, 7 | "type": "integer" 8 | }, 9 | { 10 | "name": "timeInterval", 11 | "nullable": false, 12 | "type": "long" 13 | }, 14 | { 15 | "name": "countryCode", 16 | "nullable": true, 17 | "type": "integer" 18 | }, 19 | { 20 | "name": "smsInActivity", 21 | "nullable": true, 22 | "type": "float" 23 | }, 24 | { 25 | "name": "smsOutActivity", 26 | "nullable": true, 27 | "type": "float" 28 | }, 29 | { 30 | "name": "callInActivity", 31 | "nullable": true, 32 | "type": "float" 33 | }, 34 | { 35 | "name": "callOutActivity", 36 | "nullable": true, 37 | "type": "float" 38 | }, 39 | { 40 | "name": "internetTrafficActivity", 41 | "nullable": true, 42 | "type": "float" 43 | } 44 | ] 45 | } 46 | -------------------------------------------------------------------------------- /Chap8/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 2 | -------------------------------------------------------------------------------- 
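The two cdrschema JSON files above are Spark SQL schemas serialized in the form that DataType.fromJson expects; the later listings L8-4DataFrameCreationSchema.scala and L8-35DataFrameExamplesRDD.scala load them exactly that way. A minimal standalone sketch of that loading step follows; the relative path Chap8/cdrschema.json and the object name are illustrative assumptions.

import org.apache.spark.sql.types.DataType
import org.apache.spark.sql.types.StructType

object CdrSchemaSketch {
  def main(args: Array[String]) {
    // Path is an assumption; point it at wherever cdrschema.json actually lives.
    val schemaJson = scala.io.Source.fromFile("Chap8/cdrschema.json").mkString
    val schema = DataType.fromJson(schemaJson).asInstanceOf[StructType]
    // Prints the eight CDR fields with their types and nullability flags.
    schema.fields.foreach(f => println(s"${f.name}: ${f.dataType} (nullable=${f.nullable})"))
  }
}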
/Chap8/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += Classpaths.typesafeResolver 2 | 3 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.4.0") 4 | -------------------------------------------------------------------------------- /Chap8/spark.sbt: -------------------------------------------------------------------------------- 1 | import AssemblyKeys._ 2 | 3 | assemblySettings 4 | 5 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { mergeStrategy => { 6 | case entry => { 7 | val strategy = mergeStrategy(entry) 8 | if (strategy == MergeStrategy.deduplicate) MergeStrategy.first 9 | else strategy 10 | } 11 | }} 12 | 13 | name := "Chap8" 14 | 15 | version := "1.0" 16 | 17 | scalaVersion := "2.10.5" 18 | 19 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.4.0" 20 | 21 | libraryDependencies += "org.apache.spark" %% "spark-streaming" % "1.4.0" 22 | 23 | //libraryDependencies += "org.apache.spark" %% "spark-sql" % "1.4.0" 24 | 25 | libraryDependencies += "org.apache.spark" %% "spark-hive" % "1.4.0" 26 | 27 | libraryDependencies += "org.json4s" %% "json4s-native" % "3.2.10" 28 | -------------------------------------------------------------------------------- /Chap8/src/main/java/org/apress/prospark/AbstractDriver.java: -------------------------------------------------------------------------------- 1 | package org.apress.prospark; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileInputStream; 6 | import java.io.IOException; 7 | import java.io.InputStreamReader; 8 | import java.util.Enumeration; 9 | import java.util.zip.GZIPInputStream; 10 | import java.util.zip.ZipEntry; 11 | import java.util.zip.ZipFile; 12 | 13 | import org.apache.commons.io.FilenameUtils; 14 | import org.apache.log4j.LogManager; 15 | import org.apache.log4j.Logger; 16 | 17 | public abstract class AbstractDriver { 18 | 19 | private static final Logger LOG = LogManager.getLogger(AbstractDriver.class); 20 | 21 | private String path; 22 | 23 | public AbstractDriver(String path) { 24 | this.path = path; 25 | } 26 | 27 | public abstract void init() throws Exception; 28 | 29 | public abstract void close() throws Exception; 30 | 31 | public abstract void sendRecord(String record) throws Exception; 32 | 33 | public void execute() throws Exception { 34 | 35 | try { 36 | init(); 37 | File dirPath = new File(path); 38 | if (dirPath.isDirectory()) { 39 | File[] files = new File(path).listFiles(); 40 | for (File f : files) { 41 | String ext = FilenameUtils.getExtension(f.getPath()); 42 | if (ext.equals("zip")) { 43 | LOG.info(String.format("Feeding zipped file %s", f.getName())); 44 | ZipFile zFile = null; 45 | try { 46 | zFile = new ZipFile(f); 47 | Enumeration zEntries = zFile.entries(); 48 | 49 | while (zEntries.hasMoreElements()) { 50 | ZipEntry zEntry = zEntries.nextElement(); 51 | LOG.info(String.format("Feeding file %s", zEntry.getName())); 52 | try (BufferedReader br = new BufferedReader( 53 | new InputStreamReader(zFile.getInputStream(zEntry)))) { 54 | // skip header 55 | br.readLine(); 56 | String line; 57 | while ((line = br.readLine()) != null) { 58 | sendRecord(line); 59 | } 60 | } 61 | } 62 | } catch (IOException e) { 63 | LOG.error(e.getMessage()); 64 | } finally { 65 | if (zFile != null) { 66 | try { 67 | zFile.close(); 68 | } catch (IOException e) { 69 | LOG.error(e.getMessage()); 70 | } 71 | } 72 | } 73 | } else if (ext.equals("gz")) { 74 | LOG.info(String.format("Feeding 
file %s", f.getName())); 75 | try (BufferedReader br = new BufferedReader( 76 | new InputStreamReader(new GZIPInputStream(new FileInputStream(f))))) { 77 | // skip header 78 | br.readLine(); 79 | String line; 80 | while ((line = br.readLine()) != null) { 81 | sendRecord(line); 82 | } 83 | } 84 | } else { 85 | LOG.warn("Unsupported file type: " + f.getName()); 86 | } 87 | } 88 | } else { 89 | LOG.error(String.format("Path %s is not a directory", path)); 90 | } 91 | } finally { 92 | close(); 93 | } 94 | } 95 | } -------------------------------------------------------------------------------- /Chap8/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO, stdout 2 | log4j.rootCategory=INFO, stdout 3 | 4 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 5 | log4j.appender.stdout.Target=System.out 6 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 7 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 8 | -------------------------------------------------------------------------------- /Chap8/src/main/scala/org/apress/prospark/L8-10-11UDF.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.io.Source 4 | import scala.reflect.runtime.universe 5 | 6 | import org.apache.spark.SparkConf 7 | import org.apache.spark.SparkContext 8 | import org.apache.spark.rdd.RDD 9 | import org.apache.spark.sql.SQLContext 10 | import org.apache.spark.streaming.Seconds 11 | import org.apache.spark.streaming.StreamingContext 12 | import org.json4s.jackson.JsonMethods.parse 13 | import org.json4s.jvalue2extractable 14 | import org.json4s.string2JsonInput 15 | 16 | object CdrUDFApp { 17 | 18 | case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, 19 | smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, 20 | callOutActivity: Float, internetTrafficActivity: Float) 21 | 22 | def main(args: Array[String]) { 23 | if (args.length != 4) { 24 | System.err.println( 25 | "Usage: CdrUDFApp ") 26 | System.exit(1) 27 | } 28 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 29 | 30 | val conf = new SparkConf() 31 | .setAppName(appName) 32 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 33 | 34 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 35 | 36 | val sqlC = new SQLContext(ssc.sparkContext) 37 | import sqlC.implicits._ 38 | 39 | def getCountryCodeMapping() = { 40 | implicit val formats = org.json4s.DefaultFormats 41 | parse(Source.fromURL("http://country.io/phone.json").mkString).extract[Map[String, String]].map(_.swap) 42 | } 43 | 44 | def getCountryNameMapping() = { 45 | implicit val formats = org.json4s.DefaultFormats 46 | parse(Source.fromURL("http://country.io/names.json").mkString).extract[Map[String, String]] 47 | } 48 | 49 | def getCountryName(mappingPhone: Map[String, String], mappingName: Map[String, String], code: Int) = { 50 | mappingName.getOrElse(mappingPhone.getOrElse(code.toString, "NotFound"), "NotFound") 51 | } 52 | 53 | val getCountryNamePartial = getCountryName(getCountryCodeMapping(), getCountryNameMapping(), _: Int) 54 | 55 | sqlC.udf.register("getCountryNamePartial", getCountryNamePartial) 56 | 57 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 58 | .map(_.split("\\t", -1)) 59 | .foreachRDD(rdd => { 60 | val cdrs = seqToCdr(rdd).toDF() 61 | cdrs.registerTempTable("cdrs") 62 | 63 | 
sqlC.sql("SELECT getCountryNamePartial(countryCode) AS countryName, COUNT(countryCode) AS cCount FROM cdrs GROUP BY countryCode ORDER BY cCount DESC LIMIT 5").show() 64 | 65 | }) 66 | 67 | ssc.start() 68 | ssc.awaitTermination() 69 | } 70 | 71 | def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { 72 | rdd.map(c => c.map(f => f match { 73 | case x if x.isEmpty() => "0" 74 | case x => x 75 | })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, 76 | c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) 77 | } 78 | 79 | } -------------------------------------------------------------------------------- /Chap8/src/main/scala/org/apress/prospark/L8-13HiveQL.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.sql.hive.HiveContext 9 | import org.apache.spark.streaming.Seconds 10 | import org.apache.spark.streaming.StreamingContext 11 | 12 | object CdrHiveqlApp { 13 | 14 | case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, 15 | smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, 16 | callOutActivity: Float, internetTrafficActivity: Float) 17 | 18 | def main(args: Array[String]) { 19 | if (args.length != 4) { 20 | System.err.println( 21 | "Usage: CdrHiveqlApp ") 22 | System.exit(1) 23 | } 24 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 25 | 26 | val conf = new SparkConf() 27 | .setAppName(appName) 28 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 29 | 30 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 31 | 32 | val cl = Thread.currentThread().getContextClassLoader() 33 | val hiveC = new HiveContext(ssc.sparkContext) 34 | Thread.currentThread().setContextClassLoader(cl) 35 | 36 | import hiveC.implicits._ 37 | 38 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 39 | .map(_.split("\\t", -1)) 40 | .foreachRDD(rdd => { 41 | seqToCdr(rdd).toDF().registerTempTable("cdrs") 42 | 43 | hiveC.sql("SET DATE_FMT='yy-MM-dd|HH'") 44 | hiveC.sql("SELECT from_unixtime(timeInterval, ${hiveconf:DATE_FMT}) AS TS, SUM(smsInActivity + smsOutActivity + callInActivity + callOutActivity + internetTrafficActivity) AS Activity FROM cdrs GROUP BY from_unixtime(timeInterval, ${hiveconf:DATE_FMT}) ORDER BY Activity DESC").show() 45 | }) 46 | 47 | ssc.start() 48 | ssc.awaitTermination() 49 | } 50 | 51 | def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { 52 | rdd.map(c => c.map(f => f match { 53 | case x if x.isEmpty() => "0" 54 | case x => x 55 | })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, 56 | c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) 57 | } 58 | } -------------------------------------------------------------------------------- /Chap8/src/main/scala/org/apress/prospark/L8-14-27DataFrameExamples.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.sql.SQLContext 9 | import org.apache.spark.sql.functions._ 10 | import org.apache.spark.streaming.Seconds 11 | import org.apache.spark.streaming.StreamingContext 12 | 13 | object CdrDataframeExamplesApp { 14 | 15 | case class 
Cdr(squareId: Int, timeInterval: Long, countryCode: Int, 16 | smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, 17 | callOutActivity: Float, internetTrafficActivity: Float) 18 | 19 | def main(args: Array[String]) { 20 | if (args.length != 4) { 21 | System.err.println( 22 | "Usage: CdrDataframeExamplesApp ") 23 | System.exit(1) 24 | } 25 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 26 | 27 | val conf = new SparkConf() 28 | .setAppName(appName) 29 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 30 | 31 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 32 | 33 | val sqlC = new SQLContext(ssc.sparkContext) 34 | import sqlC.implicits._ 35 | 36 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 37 | .map(_.split("\\t", -1)) 38 | .foreachRDD(rdd => { 39 | val cdrs = seqToCdr(rdd).toDF() 40 | 41 | cdrs.select("squareId", "timeInterval", "countryCode").show() 42 | cdrs.select($"squareId", $"timeInterval", $"countryCode").show() 43 | cdrs.filter("squareId = 5").show() 44 | cdrs.drop("countryCode").show() 45 | cdrs.select($"squareId", $"timeInterval", $"countryCode").where($"squareId" === 5).show() 46 | cdrs.limit(5).show() 47 | cdrs.groupBy("squareId").count().show() 48 | cdrs.groupBy("countryCode").avg("internetTrafficActivity").show() 49 | cdrs.groupBy("countryCode").max("callOutActivity").show() 50 | cdrs.groupBy("countryCode").min("callOutActivity").show() 51 | cdrs.groupBy("squareId").sum("internetTrafficActivity").show() 52 | cdrs.groupBy("squareId").agg(sum("callOutActivity"), sum("callInActivity"), sum("smsOutActivity"), sum("smsInActivity"), sum("internetTrafficActivity")).show() 53 | cdrs.groupBy("countryCode").sum("internetTrafficActivity").orderBy(desc("SUM(internetTrafficActivity)")).show() 54 | cdrs.agg(sum("callOutActivity"), sum("callInActivity"), sum("smsOutActivity"), sum("smsInActivity"), sum("internetTrafficActivity")).show() 55 | cdrs.rollup("squareId", "countryCode").count().orderBy(desc("squareId"), desc("countryCode")).rdd.saveAsTextFile("/tmp/rollup" + rdd.hashCode()) 56 | cdrs.cube("squareId", "countryCode").count().orderBy(desc("squareId"), desc("countryCode")).rdd.saveAsTextFile("/tmp/cube" + rdd.hashCode()) 57 | cdrs.dropDuplicates(Array("callOutActivity", "callInActivity")).show() 58 | cdrs.select("squareId", "countryCode", "internetTrafficActivity").distinct.show() 59 | cdrs.withColumn("endTime", cdrs("timeInterval") + 600000).show() 60 | cdrs.sample(true, 0.01).show() 61 | }) 62 | 63 | ssc.start() 64 | ssc.awaitTermination() 65 | } 66 | 67 | def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { 68 | rdd.map(c => c.map(f => f match { 69 | case x if x.isEmpty() => "0" 70 | case x => x 71 | })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, 72 | c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) 73 | } 74 | } -------------------------------------------------------------------------------- /Chap8/src/main/scala/org/apress/prospark/L8-1DataFrameAPI.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.sql.SQLContext 9 | import org.apache.spark.sql.functions.desc 10 | import org.apache.spark.streaming.Seconds 11 | import org.apache.spark.streaming.StreamingContext 12 | 13 | object CdrDataframeApp { 14 | 15 | case class 
Cdr(squareId: Int, timeInterval: Long, countryCode: Int, 16 | smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, 17 | callOutActivity: Float, internetTrafficActivity: Float) 18 | 19 | def main(args: Array[String]) { 20 | if (args.length != 4) { 21 | System.err.println( 22 | "Usage: CdrDataframeApp ") 23 | System.exit(1) 24 | } 25 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 26 | 27 | val conf = new SparkConf() 28 | .setAppName(appName) 29 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 30 | 31 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 32 | 33 | val sqlC = new SQLContext(ssc.sparkContext) 34 | import sqlC.implicits._ 35 | 36 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 37 | .map(_.split("\\t", -1)) 38 | .foreachRDD(rdd => { 39 | val cdrs = seqToCdr(rdd).toDF() 40 | 41 | cdrs.groupBy("countryCode").count().orderBy(desc("count")).show(5) 42 | }) 43 | 44 | ssc.start() 45 | ssc.awaitTermination() 46 | } 47 | 48 | def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { 49 | rdd.map(c => c.map(f => f match { 50 | case x if x.isEmpty() => "0" 51 | case x => x 52 | })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, 53 | c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) 54 | } 55 | } -------------------------------------------------------------------------------- /Chap8/src/main/scala/org/apress/prospark/L8-28DataFrameExamplesOps.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.sql.DataFrame 9 | import org.apache.spark.sql.SQLContext 10 | import org.apache.spark.streaming.Seconds 11 | import org.apache.spark.streaming.StreamingContext 12 | 13 | object CdrDataframeExamples2App { 14 | 15 | case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, 16 | smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, 17 | callOutActivity: Float, internetTrafficActivity: Float) 18 | 19 | def main(args: Array[String]) { 20 | if (args.length != 4) { 21 | System.err.println( 22 | "Usage: CdrDataframeExamples2App ") 23 | System.exit(1) 24 | } 25 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 26 | 27 | val conf = new SparkConf() 28 | .setAppName(appName) 29 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 30 | 31 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 32 | 33 | val sqlC = new SQLContext(ssc.sparkContext) 34 | import sqlC.implicits._ 35 | 36 | var previousCdrs: Option[DataFrame] = None 37 | 38 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 39 | .map(_.split("\\t", -1)) 40 | .foreachRDD(rdd => { 41 | val cdrs = seqToCdr(rdd).toDF().select("squareId", "countryCode").dropDuplicates() 42 | previousCdrs match { 43 | case Some(prevCdrs) => cdrs.unionAll(prevCdrs).show() 44 | //case Some(prevCdrs) => cdrs.intersect(prevCdrs).show() 45 | //case Some(prevCdrs) => cdrs.except(prevCdrs).show() 46 | case None => Unit 47 | } 48 | previousCdrs = Some(cdrs) 49 | }) 50 | 51 | ssc.start() 52 | ssc.awaitTermination() 53 | } 54 | 55 | def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { 56 | rdd.map(c => c.map(f => f match { 57 | case x if x.isEmpty() => "0" 58 | case x => x 59 | })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, 60 | c(4).toFloat, c(5).toFloat, 
c(6).toFloat, c(7).toFloat)) 61 | } 62 | } -------------------------------------------------------------------------------- /Chap8/src/main/scala/org/apress/prospark/L8-29DataFrameExamplesJoin.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.sql.SQLContext 9 | import org.apache.spark.streaming.Seconds 10 | import org.apache.spark.streaming.StreamingContext 11 | import org.json4s.DefaultFormats 12 | import org.json4s.JDouble 13 | import org.json4s.JObject 14 | import org.json4s.jvalue2extractable 15 | import org.json4s.jvalue2monadic 16 | import org.json4s.native.JsonMethods.compact 17 | import org.json4s.native.JsonMethods.parse 18 | import org.json4s.native.JsonMethods.render 19 | import org.json4s.string2JsonInput 20 | 21 | object CdrDataframeExamples3App { 22 | 23 | case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, 24 | smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, 25 | callOutActivity: Float, internetTrafficActivity: Float) 26 | 27 | def main(args: Array[String]) { 28 | if (args.length != 5) { 29 | System.err.println( 30 | "Usage: CdrDataframeExamples3App ") 31 | System.exit(1) 32 | } 33 | val Seq(appName, batchInterval, hostname, port, gridJsonPath) = args.toSeq 34 | 35 | val conf = new SparkConf() 36 | .setAppName(appName) 37 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 38 | 39 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 40 | 41 | val sqlC = new SQLContext(ssc.sparkContext) 42 | import sqlC.implicits._ 43 | implicit val formats = DefaultFormats 44 | 45 | val gridFile = scala.io.Source.fromFile(gridJsonPath).mkString 46 | val gridGeo = (parse(gridFile) \ "features") 47 | val gridStr = gridGeo.children.map(r => { 48 | val c = (r \ "geometry" \ "coordinates").extract[List[List[List[Float]]]].flatten.flatten.map(r => JDouble(r)) 49 | val l = List(("id", r \ "id"), ("x1", c(0)), ("y1", c(1)), ("x2", c(2)), ("y2", c(3)), 50 | ("x3", c(4)), ("y3", c(5)), ("x4", c(6)), ("y4", c(7))) 51 | compact(render(JObject(l))) 52 | }) 53 | 54 | val gridDF = sqlC.read.json(ssc.sparkContext.makeRDD(gridStr)) 55 | 56 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 57 | .map(_.split("\\t", -1)) 58 | .foreachRDD(rdd => { 59 | val cdrs = seqToCdr(rdd).toDF() 60 | cdrs.join(gridDF, $"squareId" === $"id").show() 61 | }) 62 | 63 | ssc.start() 64 | ssc.awaitTermination() 65 | } 66 | 67 | def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { 68 | rdd.map(c => c.map(f => f match { 69 | case x if x.isEmpty() => "0" 70 | case x => x 71 | })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, 72 | c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) 73 | } 74 | } -------------------------------------------------------------------------------- /Chap8/src/main/scala/org/apress/prospark/L8-3-6-7DataFrameCreation.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.sql.SQLContext 9 | import org.apache.spark.sql.functions.desc 10 | import org.apache.spark.streaming.Seconds 11 | import 
org.apache.spark.streaming.StreamingContext 12 | import org.json4s.native.Serialization.write 13 | import org.json4s.DefaultFormats 14 | 15 | object DataframeCreationApp { 16 | 17 | case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, 18 | smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, 19 | callOutActivity: Float, internetTrafficActivity: Float) 20 | 21 | def main(args: Array[String]) { 22 | if (args.length != 4) { 23 | System.err.println( 24 | "Usage: CdrDataframeApp ") 25 | System.exit(1) 26 | } 27 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 28 | 29 | val conf = new SparkConf() 30 | .setAppName(appName) 31 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 32 | 33 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 34 | 35 | val sqlC = new SQLContext(ssc.sparkContext) 36 | import sqlC.implicits._ 37 | 38 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 39 | .map(_.split("\\t", -1)) 40 | .foreachRDD(rdd => { 41 | //val cdrs = sqlC.createDataFrame(seqToCdr(rdd)) 42 | //val cdrs = sqlC.createDataFrame(seqToCdr(rdd).collect()) 43 | //val cdrs = seqToCdr(rdd).toDF() 44 | val cdrsJson = seqToCdr(rdd).map(r => { 45 | implicit val formats = DefaultFormats 46 | write(r) 47 | }) 48 | val cdrs = sqlC.read.json(cdrsJson) 49 | 50 | cdrs.groupBy("countryCode").count().orderBy(desc("count")).show(5) 51 | }) 52 | 53 | ssc.start() 54 | ssc.awaitTermination() 55 | 56 | } 57 | 58 | def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { 59 | rdd.map(c => c.map(f => f match { 60 | case x if x.isEmpty() => "0" 61 | case x => x 62 | })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, 63 | c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) 64 | } 65 | } -------------------------------------------------------------------------------- /Chap8/src/main/scala/org/apress/prospark/L8-35DataFrameExamplesRDD.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.sql.SQLContext 9 | import org.apache.spark.sql.types.DataType 10 | import org.apache.spark.sql.types.StructType 11 | import org.apache.spark.streaming.Seconds 12 | import org.apache.spark.streaming.StreamingContext 13 | import org.json4s.DefaultFormats 14 | 15 | object CdrDataframeExamplesRDDApp { 16 | 17 | case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, 18 | smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, 19 | callOutActivity: Float, internetTrafficActivity: Float) 20 | 21 | def main(args: Array[String]) { 22 | if (args.length != 5) { 23 | System.err.println( 24 | "Usage: CdrDataframeExamplesRDDApp ") 25 | System.exit(1) 26 | } 27 | val Seq(appName, batchInterval, hostname, port, schemaFile) = args.toSeq 28 | 29 | val conf = new SparkConf() 30 | .setAppName(appName) 31 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 32 | 33 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 34 | 35 | val sqlC = new SQLContext(ssc.sparkContext) 36 | import sqlC.implicits._ 37 | implicit val formats = DefaultFormats 38 | 39 | val schemaJson = scala.io.Source.fromFile(schemaFile).mkString 40 | val schema = DataType.fromJson(schemaJson).asInstanceOf[StructType] 41 | 42 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 43 | .map(_.split("\\t", 
-1)) 44 | .foreachRDD(rdd => { 45 | val cdrs = seqToCdr(rdd).toDF() 46 | val highInternet = sqlC.createDataFrame(cdrs.rdd.filter(r => r.getFloat(3) + r.getFloat(4) >= r.getFloat(5) + r.getFloat(6)), schema) 47 | val highOther = cdrs.except(highInternet) 48 | val highInternetGrid = highInternet.select("squareId", "countryCode").dropDuplicates() 49 | val highOtherGrid = highOther.select("squareId", "countryCode").dropDuplicates() 50 | highOtherGrid.except(highInternetGrid).show() 51 | highInternetGrid.except(highOtherGrid).show() 52 | }) 53 | 54 | ssc.start() 55 | ssc.awaitTermination() 56 | } 57 | 58 | def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { 59 | rdd.map(c => c.map(f => f match { 60 | case x if x.isEmpty() => "0" 61 | case x => x 62 | })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, 63 | c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) 64 | } 65 | } -------------------------------------------------------------------------------- /Chap8/src/main/scala/org/apress/prospark/L8-38SparkR.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.rdd.RDD 7 | import org.apache.spark.sql.hive.HiveContext 8 | import org.apache.spark.streaming.Seconds 9 | import org.apache.spark.streaming.StreamingContext 10 | import java.nio.file.Paths 11 | import org.apache.spark.SparkFiles 12 | 13 | object CdrStreamingSparkRApp { 14 | 15 | case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, 16 | smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, 17 | callOutActivity: Float, internetTrafficActivity: Float) 18 | 19 | def main(args: Array[String]) { 20 | if (args.length != 7) { 21 | System.err.println( 22 | "Usage: CdrStreamingSparkRApp ") 23 | System.exit(1) 24 | } 25 | val Seq(appName, batchInterval, hostname, port, tableName, rScriptPath, logsPath) = args.toSeq 26 | 27 | val conf = new SparkConf() 28 | .setAppName(appName) 29 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 30 | 31 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 32 | 33 | val cl = Thread.currentThread().getContextClassLoader() 34 | val hiveC = new HiveContext(ssc.sparkContext) 35 | Thread.currentThread().setContextClassLoader(cl) 36 | 37 | import hiveC.implicits._ 38 | 39 | ssc.sparkContext.addFile(rScriptPath) 40 | val rScriptName = SparkFiles.get(Paths.get(rScriptPath).getFileName.toString) 41 | val master = hiveC.sparkContext.getConf.get("spark.master") 42 | 43 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 44 | .map(_.split("\\t", -1)) 45 | .foreachRDD((rdd, time) => { 46 | val iTableName = tableName + time.milliseconds 47 | seqToCdr(rdd).toDF().write.saveAsTable(iTableName) 48 | hiveC.sparkContext.parallelize(Array(iTableName)).pipe("%s %s".format(rScriptName, master)).saveAsTextFile(Paths.get(logsPath, iTableName).toString) 49 | }) 50 | 51 | ssc.start() 52 | ssc.awaitTermination() 53 | } 54 | 55 | def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { 56 | rdd.map(c => c.map(f => f match { 57 | case x if x.isEmpty() => "0" 58 | case x => x 59 | })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, 60 | c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) 61 | } 62 | } -------------------------------------------------------------------------------- 
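CdrStreamingSparkRApp above hands each per-batch Hive table name to the SparkR script through RDD.pipe: every element of the RDD is written to the child process's stdin as one line, and every line the process prints becomes an element of the resulting RDD. A minimal sketch of just that mechanism, using cat as a stand-in for the R script; the local[*] master, app name, and sample table names are assumptions.

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

object PipeSketch {
  def main(args: Array[String]) {
    // local[*] master and the app name are assumptions for a standalone run.
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("PipeSketch"))
    val tableNames = sc.parallelize(Seq("cdrs1000", "cdrs2000", "cdrs3000"))
    // cat echoes stdin to stdout, standing in for the R script that reads one
    // table name per line and writes its own status lines back.
    val piped = tableNames.pipe("cat")
    piped.collect().foreach(println)
    sc.stop()
  }
}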
/Chap8/src/main/scala/org/apress/prospark/L8-4DataFrameCreationSchema.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.sql.Row 6 | import org.apache.spark.sql.SQLContext 7 | import org.apache.spark.sql.functions.desc 8 | import org.apache.spark.sql.types.DataType 9 | import org.apache.spark.sql.types.StructType 10 | import org.apache.spark.streaming.Seconds 11 | import org.apache.spark.streaming.StreamingContext 12 | 13 | object DataframeCreationApp2 { 14 | 15 | def main(args: Array[String]) { 16 | if (args.length != 5) { 17 | System.err.println( 18 | "Usage: CdrDataframeApp2 <appName> <batchInterval> <hostname> <port> <schemaFile>") 19 | System.exit(1) 20 | } 21 | val Seq(appName, batchInterval, hostname, port, schemaFile) = args.toSeq 22 | 23 | val conf = new SparkConf() 24 | .setAppName(appName) 25 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 26 | 27 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 28 | 29 | val sqlC = new SQLContext(ssc.sparkContext) 30 | 31 | val schemaJson = scala.io.Source.fromFile(schemaFile).mkString 32 | val schema = DataType.fromJson(schemaJson).asInstanceOf[StructType] 33 | 34 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 35 | .map(_.split("\\t", -1)) 36 | .foreachRDD(rdd => { 37 | val cdrs = sqlC.createDataFrame(rdd.map(c => Row(c: _*)), schema) 38 | 39 | cdrs.groupBy("countryCode").count().orderBy(desc("count")).show(5) 40 | }) 41 | 42 | ssc.start() 43 | ssc.awaitTermination() 44 | 45 | } 46 | } -------------------------------------------------------------------------------- /Chap8/src/main/scala/org/apress/prospark/L8-8Sql.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.sql.SQLContext 9 | import org.apache.spark.streaming.Seconds 10 | import org.apache.spark.streaming.StreamingContext 11 | 12 | object CdrSqlApp { 13 | 14 | case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, 15 | smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, 16 | callOutActivity: Float, internetTrafficActivity: Float) 17 | 18 | def main(args: Array[String]) { 19 | if (args.length != 4) { 20 | System.err.println( 21 | "Usage: CdrSqlApp <appName> <batchInterval> <hostname> <port>") 22 | System.exit(1) 23 | } 24 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 25 | 26 | val conf = new SparkConf() 27 | .setAppName(appName) 28 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 29 | 30 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 31 | 32 | val sqlC = new SQLContext(ssc.sparkContext) 33 | import sqlC.implicits._ 34 | 35 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 36 | .map(_.split("\\t", -1)) 37 | .foreachRDD(rdd => { 38 | val cdrs = seqToCdr(rdd).toDF() 39 | cdrs.registerTempTable("cdrs") 40 | 41 | sqlC.sql("SELECT countryCode, COUNT(countryCode) AS cCount FROM cdrs GROUP BY countryCode ORDER BY cCount DESC LIMIT 5").show() 42 | sqlC.dropTempTable("cdrs") 43 | }) 44 | 45 | ssc.start() 46 | ssc.awaitTermination() 47 | } 48 | 49 | def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { 50 | rdd.map(c => c.map(f => f match { 51 | case x if x.isEmpty() => "0" 52 | case x => x 53 | })).map(c => Cdr(c(0).toInt, c(1).toLong,
c(2).toInt, c(3).toFloat, 54 | c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) 55 | } 56 | } -------------------------------------------------------------------------------- /Chap8/src/main/scala/org/apress/prospark/T8-3DataFrameExamplesNA.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.sql.SQLContext 9 | import org.apache.spark.streaming.Seconds 10 | import org.apache.spark.streaming.StreamingContext 11 | import org.json4s.DefaultFormats 12 | import org.json4s.JDouble 13 | import org.json4s.JObject 14 | import org.json4s.jvalue2extractable 15 | import org.json4s.jvalue2monadic 16 | import org.json4s.native.JsonMethods.compact 17 | import org.json4s.native.JsonMethods.parse 18 | import org.json4s.native.JsonMethods.render 19 | import org.json4s.string2JsonInput 20 | 21 | object CdrDataframeExamplesNAApp { 22 | 23 | case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, 24 | smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, 25 | callOutActivity: Float, internetTrafficActivity: Float) 26 | 27 | def main(args: Array[String]) { 28 | if (args.length != 4) { 29 | System.err.println( 30 | "Usage: CdrDataframeExamplesNAApp ") 31 | System.exit(1) 32 | } 33 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 34 | 35 | val conf = new SparkConf() 36 | .setAppName(appName) 37 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 38 | 39 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 40 | 41 | val sqlC = new SQLContext(ssc.sparkContext) 42 | import sqlC.implicits._ 43 | implicit val formats = DefaultFormats 44 | 45 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 46 | .map(_.split("\\t", -1)) 47 | .foreachRDD(rdd => { 48 | val cdrs = seqToCdr(rdd).toDF() 49 | cdrs.na.drop("any").show() 50 | cdrs.na.fill(0, Array("squareId")).show() 51 | cdrs.na.replace("squareId", Map(0 -> 1)).show() 52 | println("Correlation: " + cdrs.stat.corr("smsOutActivity", "callOutActivity")) 53 | println("Covariance: " + cdrs.stat.cov("smsInActivity", "callInActivity")) 54 | cdrs.stat.crosstab("squareId", "countryCode").show() 55 | cdrs.stat.freqItems(Array("squareId", "countryCode"), 0.1).show() 56 | cdrs.stat.crosstab("callOutActivity", "callInActivity").show() 57 | }) 58 | 59 | ssc.start() 60 | ssc.awaitTermination() 61 | } 62 | 63 | def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { 64 | rdd.map(c => c.map(f => f match { 65 | case x if x.isEmpty() => "0" 66 | case x => x 67 | })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, 68 | c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) 69 | } 70 | } -------------------------------------------------------------------------------- /Chap8/src/main/scala/org/apress/prospark/T8-5-L8-30-34DataFrameExamplesActions.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.sql.SaveMode 9 | import org.apache.spark.sql.functions.desc 10 | import org.apache.spark.sql.hive.HiveContext 11 | import org.apache.spark.streaming.Seconds 12 | import 
org.apache.spark.streaming.StreamingContext 13 | import org.apress.prospark.CdrDataframeExamplesActionsApp.Cdr 14 | import org.json4s.DefaultFormats 15 | 16 | object CdrDataframeExamplesActionsApp { 17 | 18 | case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, 19 | smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, 20 | callOutActivity: Float, internetTrafficActivity: Float) 21 | 22 | def main(args: Array[String]) { 23 | if (args.length != 4) { 24 | System.err.println( 25 | "Usage: CdrDataframeExamplesActionsApp <appName> <batchInterval> <hostname> <port>") 26 | System.exit(1) 27 | } 28 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 29 | 30 | val conf = new SparkConf() 31 | .setAppName(appName) 32 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 33 | 34 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 35 | 36 | val cl = Thread.currentThread().getContextClassLoader() 37 | val hiveC = new HiveContext(ssc.sparkContext) 38 | Thread.currentThread().setContextClassLoader(cl) 39 | import hiveC.implicits._ 40 | implicit val formats = DefaultFormats 41 | 42 | val cdrStream = ssc.socketTextStream(hostname, port.toInt) 43 | .map(_.split("\\t", -1)) 44 | .foreachRDD(rdd => { 45 | val cdrs = seqToCdr(rdd).toDF() 46 | 47 | val counts = cdrs.groupBy("countryCode").count().orderBy(desc("count")) 48 | counts.show(5) 49 | counts.show() 50 | println("head(5): " + counts.head(5)) 51 | println("take(5): " + counts.take(5)) 52 | println("head(): " + counts.head()) 53 | println("first(): " + counts.first()) 54 | println("count(): " + counts.count()) 55 | println("collect(): " + counts.collect()) 56 | println("collectAsList(): " + counts.collectAsList()) 57 | println("describe(): " + cdrs.describe("smsInActivity", "smsOutActivity", "callInActivity", "callOutActivity", "internetTrafficActivity").show()) 58 | counts.write.format("parquet").save("/tmp/parquet" + rdd.id) 59 | counts.write.format("json").save("/tmp/json" + rdd.id) 60 | counts.write.parquet("/tmp/parquet2" + rdd.id) 61 | counts.write.json("/tmp/json2" + rdd.id) 62 | counts.write.saveAsTable("count_table") 63 | cdrs.groupBy("countryCode").count().orderBy(desc("count")).write.mode(SaveMode.Append).save("/tmp/counts") 64 | val prop: java.util.Properties = new java.util.Properties() 65 | counts.write.jdbc("jdbc:mysql://hostname:port/cdrsdb", "count_table", prop) 66 | }) 67 | 68 | ssc.start() 69 | ssc.awaitTermination() 70 | } 71 | 72 | def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { 73 | rdd.map(c => c.map(f => f match { 74 | case x if x.isEmpty() => "0" 75 | case x => x 76 | })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, 77 | c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) 78 | } 79 | } -------------------------------------------------------------------------------- /Chap9/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 2 | -------------------------------------------------------------------------------- /Chap9/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += Classpaths.typesafeResolver 2 | 3 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.4.0") 4 | -------------------------------------------------------------------------------- /Chap9/spark.sbt: -------------------------------------------------------------------------------- 1 | import AssemblyKeys._ 2 | 3 | assemblySettings 4 | 5 |
mergeStrategy in assembly <<= (mergeStrategy in assembly) { mergeStrategy => { 6 | case entry => { 7 | val strategy = mergeStrategy(entry) 8 | if (strategy == MergeStrategy.deduplicate) MergeStrategy.first 9 | else strategy 10 | } 11 | }} 12 | 13 | name := "Chap9" 14 | 15 | version := "1.0" 16 | 17 | scalaVersion := "2.10.5" 18 | 19 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.4.0" 20 | 21 | libraryDependencies += "org.apache.spark" %% "spark-streaming" % "1.4.0" 22 | 23 | libraryDependencies += "org.apache.spark" %% "spark-mllib" % "1.4.0" 24 | 25 | libraryDependencies += "org.json4s" %% "json4s-native" % "3.2.10" 26 | -------------------------------------------------------------------------------- /Chap9/src/main/java/org/apress/prospark/AbstractDriver.java: -------------------------------------------------------------------------------- 1 | package org.apress.prospark; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileInputStream; 6 | import java.io.IOException; 7 | import java.io.InputStreamReader; 8 | import java.util.Enumeration; 9 | import java.util.zip.GZIPInputStream; 10 | import java.util.zip.ZipEntry; 11 | import java.util.zip.ZipFile; 12 | 13 | import org.apache.commons.io.FilenameUtils; 14 | import org.apache.log4j.LogManager; 15 | import org.apache.log4j.Logger; 16 | 17 | public abstract class AbstractDriver { 18 | 19 | private static final Logger LOG = LogManager.getLogger(AbstractDriver.class); 20 | 21 | private String path; 22 | 23 | public AbstractDriver(String path) { 24 | this.path = path; 25 | } 26 | 27 | public abstract void init() throws Exception; 28 | 29 | public abstract void close() throws Exception; 30 | 31 | public abstract void sendRecord(String record) throws Exception; 32 | 33 | public void execute() throws Exception { 34 | 35 | try { 36 | init(); 37 | File dirPath = new File(path); 38 | if (dirPath.isDirectory()) { 39 | File[] files = new File(path).listFiles(); 40 | for (File f : files) { 41 | String ext = FilenameUtils.getExtension(f.getPath()); 42 | if (ext.equals("zip")) { 43 | LOG.info(String.format("Feeding zipped file %s", f.getName())); 44 | ZipFile zFile = null; 45 | try { 46 | zFile = new ZipFile(f); 47 | Enumeration<? extends ZipEntry> zEntries = zFile.entries(); 48 | 49 | while (zEntries.hasMoreElements()) { 50 | ZipEntry zEntry = zEntries.nextElement(); 51 | LOG.info(String.format("Feeding file %s", zEntry.getName())); 52 | try (BufferedReader br = new BufferedReader( 53 | new InputStreamReader(zFile.getInputStream(zEntry)))) { 54 | // skip header 55 | br.readLine(); 56 | String line; 57 | while ((line = br.readLine()) != null) { 58 | sendRecord(line); 59 | } 60 | } 61 | } 62 | } catch (IOException e) { 63 | LOG.error(e.getMessage()); 64 | } finally { 65 | if (zFile != null) { 66 | try { 67 | zFile.close(); 68 | } catch (IOException e) { 69 | LOG.error(e.getMessage()); 70 | } 71 | } 72 | } 73 | } else if (ext.equals("gz")) { 74 | LOG.info(String.format("Feeding file %s", f.getName())); 75 | try (BufferedReader br = new BufferedReader( 76 | new InputStreamReader(new GZIPInputStream(new FileInputStream(f))))) { 77 | // skip header 78 | br.readLine(); 79 | String line; 80 | while ((line = br.readLine()) != null) { 81 | sendRecord(line); 82 | } 83 | } 84 | } else if (ext.equals("dat")) { 85 | LOG.info(String.format("Feeding dat file %s", f.getName())); 86 | try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f)))) { 87 | String line; 88 | while ((line = br.readLine()) != null) {
89 | sendRecord(line); 90 | } 91 | } 92 | } else { 93 | LOG.warn("Unsupported file type: " + f.getName()); 94 | } 95 | } 96 | } else { 97 | LOG.error(String.format("Path %s is not a directory", path)); 98 | } 99 | } finally { 100 | close(); 101 | } 102 | } 103 | } -------------------------------------------------------------------------------- /Chap9/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO, stdout 2 | log4j.rootCategory=INFO, stdout 3 | 4 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 5 | log4j.appender.stdout.Target=System.out 6 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 7 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 8 | -------------------------------------------------------------------------------- /Chap9/src/main/scala/org/apress/prospark/L9-10KMeans.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.clustering.StreamingKMeans 6 | import org.apache.spark.mllib.linalg.Vectors 7 | import org.apache.spark.mllib.regression.LabeledPoint 8 | import org.apache.spark.rdd.RDD 9 | import org.apache.spark.rdd.RDD.doubleRDDToDoubleRDDFunctions 10 | import org.apache.spark.streaming.Seconds 11 | import org.apache.spark.streaming.StreamingContext 12 | 13 | object KMeansClusteringApp { 14 | 15 | def main(args: Array[String]) { 16 | if (args.length != 4) { 17 | System.err.println( 18 | "Usage: KMeansClusteringApp ") 19 | System.exit(1) 20 | } 21 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 22 | 23 | val conf = new SparkConf() 24 | .setAppName(appName) 25 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 26 | 27 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 28 | 29 | val substream = ssc.socketTextStream(hostname, port.toInt) 30 | .filter(!_.contains("NaN")) 31 | .map(_.split(" ")) 32 | .filter(f => f(1) != "0") 33 | 34 | val orientationStream = substream 35 | .map(f => Seq(1, 4, 5, 6, 10, 11, 12, 20, 21, 22, 26, 27, 28, 36, 37, 38, 42, 43, 44).map(i => f(i)).toArray) 36 | .map(arr => arr.map(_.toDouble)) 37 | .filter(f => f(0) == 1.0 || f(0) == 2.0 || f(0) == 3.0) 38 | .map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, f.length)))) 39 | val test = orientationStream.transform(rdd => rdd.randomSplit(Array(0.3, 0.7))(0)) 40 | val train = orientationStream.transformWith(test, (r1: RDD[LabeledPoint], r2: RDD[LabeledPoint]) => r1.subtract(r2)).cache() 41 | val model = new StreamingKMeans() 42 | .setK(3) 43 | .setDecayFactor(0) 44 | .setRandomCenters(18, 0.0) 45 | 46 | model.trainOn(train.map(v => v.features)) 47 | val prediction = model.predictOnValues(test.map(v => (v.label, v.features))) 48 | 49 | ssc.start() 50 | ssc.awaitTermination() 51 | } 52 | 53 | } -------------------------------------------------------------------------------- /Chap9/src/main/scala/org/apress/prospark/L9-11CollabFilteringPreprocessing.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.hadoop.io.LongWritable 4 | import org.apache.hadoop.io.Text 5 | import org.apache.hadoop.mapred.FileSplit 6 | import org.apache.hadoop.mapred.TextInputFormat 7 | import org.apache.spark.SparkConf 8 | import org.apache.spark.SparkContext 9 | import 
org.apache.spark.rdd.HadoopRDD 10 | import org.apache.spark.rdd.RDD.rddToPairRDDFunctions 11 | 12 | import com.google.common.io.Files 13 | 14 | object CollabFilteringPreprocessingApp { 15 | 16 | def main(args: Array[String]) { 17 | if (args.length != 3) { 18 | System.err.println( 19 | "Usage: CollabFilteringPreprocessingApp ") 20 | System.exit(1) 21 | } 22 | val Seq(appName, iPath, oPath) = args.toSeq 23 | 24 | val conf = new SparkConf() 25 | .setAppName(appName) 26 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 27 | 28 | val delim = " " 29 | 30 | val sc = new SparkContext(conf) 31 | sc.hadoopFile(iPath, classOf[TextInputFormat], classOf[LongWritable], classOf[Text], sc.defaultMinPartitions) 32 | .asInstanceOf[HadoopRDD[LongWritable, Text]] 33 | .mapPartitionsWithInputSplit((iSplit, iter) => 34 | iter.map(splitAndLine => (Files.getNameWithoutExtension(iSplit.asInstanceOf[FileSplit].getPath.toString), splitAndLine._2.toString.split(" ")(1)))) 35 | .filter(r => r._2 != "0") 36 | .map(r => ((r._1, r._2), 1)) 37 | .reduceByKey(_ + _) 38 | .map(r => r._1._1.replace("subject", "") + delim + r._1._2 + delim + r._2) 39 | .sample(false, 0.7) 40 | .coalesce(1) 41 | .saveAsTextFile(oPath) 42 | } 43 | } -------------------------------------------------------------------------------- /Chap9/src/main/scala/org/apress/prospark/L9-12CollabFiltering.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.recommendation.ALS 6 | import org.apache.spark.mllib.recommendation.Rating 7 | import org.apache.spark.rdd.RDD.doubleRDDToDoubleRDDFunctions 8 | import org.apache.spark.rdd.RDD.rddToPairRDDFunctions 9 | import org.apache.spark.streaming.Seconds 10 | import org.apache.spark.streaming.StreamingContext 11 | 12 | object CollabFilteringApp { 13 | 14 | def main(args: Array[String]) { 15 | if (args.length != 3) { 16 | System.err.println( 17 | "Usage: CollabFilteringApp ") 18 | System.exit(1) 19 | } 20 | val Seq(appName, batchInterval, iPath) = args.toSeq 21 | 22 | val conf = new SparkConf() 23 | .setAppName(appName) 24 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 25 | 26 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 27 | 28 | val ratingStream = ssc.textFileStream(iPath).map(_.split(" ") match { 29 | case Array(subject, activity, freq) => 30 | Rating(subject.toInt, activity.toInt, freq.toDouble) 31 | }) 32 | 33 | val rank = 10 34 | val numIterations = 10 35 | val lambda = 0.01 36 | ratingStream.foreachRDD(ratingRDD => { 37 | val testTrain = ratingRDD.randomSplit(Array(0.3, 0.7)) 38 | val model = ALS.train(testTrain(1), rank, numIterations, lambda) 39 | val test = testTrain(0).map { 40 | case Rating(subject, activity, freq) => 41 | (subject, activity) 42 | } 43 | val prediction = model.predict(test) 44 | prediction.take(5).map(println) 45 | }) 46 | 47 | ssc.start() 48 | ssc.awaitTermination() 49 | } 50 | 51 | } -------------------------------------------------------------------------------- /Chap9/src/main/scala/org/apress/prospark/L9-13FPMiningPreprocessing.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.hadoop.io.LongWritable 4 | import org.apache.hadoop.io.Text 5 | import org.apache.hadoop.mapred.FileSplit 6 | import org.apache.hadoop.mapred.TextInputFormat 7 | import org.apache.spark.SparkConf 
8 | import org.apache.spark.SparkContext 9 | import org.apache.spark.rdd.HadoopRDD 10 | import org.apache.spark.rdd.RDD.rddToPairRDDFunctions 11 | 12 | import com.google.common.io.Files 13 | 14 | object FPMiningPreprocessingApp { 15 | 16 | def main(args: Array[String]) { 17 | if (args.length != 3) { 18 | System.err.println( 19 | "Usage: FPMiningPreprocessingApp ") 20 | System.exit(1) 21 | } 22 | val Seq(appName, iPath, oPath) = args.toSeq 23 | 24 | val conf = new SparkConf() 25 | .setAppName(appName) 26 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 27 | 28 | val delim = " " 29 | 30 | val sc = new SparkContext(conf) 31 | sc.hadoopFile(iPath, classOf[TextInputFormat], classOf[LongWritable], classOf[Text], sc.defaultMinPartitions) 32 | .asInstanceOf[HadoopRDD[LongWritable, Text]] 33 | .mapPartitionsWithInputSplit((iSplit, iter) => 34 | iter.map(splitAndLine => (Files.getNameWithoutExtension(iSplit.asInstanceOf[FileSplit].getPath.toString), splitAndLine._2.toString.split(" ")(1)))) 35 | .filter(r => r._2 != "0") 36 | .map(r => (r._1, r._2)) 37 | .distinct() 38 | .groupByKey() 39 | .map(r => r._2.mkString(" ")) 40 | .sample(false, 0.7) 41 | .coalesce(1) 42 | .saveAsTextFile(oPath) 43 | } 44 | } -------------------------------------------------------------------------------- /Chap9/src/main/scala/org/apress/prospark/L9-14FPMining.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.fpm.FPGrowth 6 | import org.apache.spark.streaming.Seconds 7 | import org.apache.spark.streaming.StreamingContext 8 | 9 | object FPMiningApp { 10 | 11 | def main(args: Array[String]) { 12 | if (args.length != 3) { 13 | System.err.println( 14 | "Usage: FPMiningApp ") 15 | System.exit(1) 16 | } 17 | val Seq(appName, batchInterval, iPath) = args.toSeq 18 | 19 | val conf = new SparkConf() 20 | .setAppName(appName) 21 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 22 | 23 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 24 | 25 | val minSupport = 0.4 26 | 27 | ssc.textFileStream(iPath) 28 | .map(r => r.split(" ")) 29 | .foreachRDD(transactionRDD => { 30 | val fpg = new FPGrowth() 31 | .setMinSupport(minSupport) 32 | val model = fpg.run(transactionRDD) 33 | 34 | model.freqItemsets 35 | .collect() 36 | .foreach(itemset => println("Items: %s, Frequency: %s".format(itemset.items.mkString(" "), itemset.freq))) 37 | }) 38 | 39 | ssc.start() 40 | ssc.awaitTermination() 41 | } 42 | 43 | } -------------------------------------------------------------------------------- /Chap9/src/main/scala/org/apress/prospark/L9-15MLPipeline.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.ml.Pipeline 7 | import org.apache.spark.ml.feature.Normalizer 8 | import org.apache.spark.ml.feature.VectorAssembler 9 | import org.apache.spark.ml.regression.RandomForestRegressor 10 | import org.apache.spark.sql.SQLContext 11 | import org.apache.spark.streaming.Seconds 12 | import org.apache.spark.streaming.StreamingContext 13 | import org.apache.spark.ml.param.ParamMap 14 | 15 | object MLPipelineApp { 16 | 17 | case class Activity(label: Double, 18 | accelXHand: Double, accelYHand: Double, accelZHand: Double, 19 | 
accelXChest: Double, accelYChest: Double, accelZChest: Double, 20 | accelXAnkle: Double, accelYAnkle: Double, accelZAnkle: Double) 21 | 22 | def main(args: Array[String]) { 23 | if (args.length != 4) { 24 | System.err.println( 25 | "Usage: MLPipelineApp ") 26 | System.exit(1) 27 | } 28 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 29 | 30 | val conf = new SparkConf() 31 | .setAppName(appName) 32 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 33 | 34 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 35 | 36 | val sqlC = new SQLContext(ssc.sparkContext) 37 | import sqlC.implicits._ 38 | 39 | val substream = ssc.socketTextStream(hostname, port.toInt) 40 | .filter(!_.contains("NaN")) 41 | .map(_.split(" ")) 42 | .filter(f => f(1) == "4" || f(1) == "5") 43 | .map(f => Array(f(1), f(4), f(5), f(6), f(20), f(21), f(22), f(36), f(37), f(38))) 44 | .map(f => f.map(v => v.toDouble)) 45 | .foreachRDD(rdd => { 46 | if (!rdd.isEmpty) { 47 | val accelerometer = rdd.map(x => Activity(x(0), x(1), x(2), x(3), x(4), x(5), x(6), x(7), x(8), x(9))).toDF() 48 | val split = accelerometer.randomSplit(Array(0.3, 0.7)) 49 | val test = split(0) 50 | val train = split(1) 51 | 52 | val assembler = new VectorAssembler() 53 | .setInputCols(Array( 54 | "accelXHand", "accelYHand", "accelZHand", 55 | "accelXChest", "accelYChest", "accelZChest", 56 | "accelXAnkle", "accelYAnkle", "accelZAnkle")) 57 | .setOutputCol("vectors") 58 | val normalizer = new Normalizer() 59 | .setInputCol(assembler.getOutputCol) 60 | .setOutputCol("features") 61 | val regressor = new RandomForestRegressor() 62 | 63 | val pipeline = new Pipeline() 64 | .setStages(Array(assembler, normalizer, regressor)) 65 | val pMap = ParamMap(normalizer.p -> 1.0) 66 | val model = pipeline.fit(train, pMap) 67 | val prediction = model.transform(test) 68 | prediction.show() 69 | } 70 | }) 71 | 72 | ssc.start() 73 | ssc.awaitTermination() 74 | } 75 | 76 | } -------------------------------------------------------------------------------- /Chap9/src/main/scala/org/apress/prospark/L9-17MLCrossValidation.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import scala.reflect.runtime.universe 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.ml.Pipeline 8 | import org.apache.spark.ml.evaluation.RegressionEvaluator 9 | import org.apache.spark.ml.feature.Normalizer 10 | import org.apache.spark.ml.feature.VectorAssembler 11 | import org.apache.spark.ml.regression.RandomForestRegressor 12 | import org.apache.spark.ml.tuning.CrossValidator 13 | import org.apache.spark.ml.tuning.ParamGridBuilder 14 | import org.apache.spark.sql.SQLContext 15 | import org.apache.spark.streaming.Seconds 16 | import org.apache.spark.streaming.StreamingContext 17 | 18 | object MLCrossValidationApp { 19 | 20 | case class Activity(label: Double, 21 | accelXHand: Double, accelYHand: Double, accelZHand: Double, 22 | accelXChest: Double, accelYChest: Double, accelZChest: Double, 23 | accelXAnkle: Double, accelYAnkle: Double, accelZAnkle: Double) 24 | 25 | def main(args: Array[String]) { 26 | if (args.length != 4) { 27 | System.err.println( 28 | "Usage: MLCrossValidationApp ") 29 | System.exit(1) 30 | } 31 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 32 | 33 | val conf = new SparkConf() 34 | .setAppName(appName) 35 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 36 | 37 | val ssc = new 
StreamingContext(conf, Seconds(batchInterval.toInt)) 38 | 39 | val sqlC = new SQLContext(ssc.sparkContext) 40 | import sqlC.implicits._ 41 | 42 | val substream = ssc.socketTextStream(hostname, port.toInt) 43 | .filter(!_.contains("NaN")) 44 | .map(_.split(" ")) 45 | .filter(f => f(1) == "4" || f(1) == "5") 46 | .map(f => Array(f(1), f(4), f(5), f(6), f(20), f(21), f(22), f(36), f(37), f(38))) 47 | .map(f => f.map(v => v.toDouble)) 48 | .foreachRDD(rdd => { 49 | if (!rdd.isEmpty) { 50 | val accelerometer = rdd.map(x => Activity(x(0), x(1), x(2), x(3), x(4), x(5), x(6), x(7), x(8), x(9))).toDF() 51 | val split = accelerometer.randomSplit(Array(0.3, 0.7)) 52 | val test = split(0) 53 | val train = split(1) 54 | 55 | val assembler = new VectorAssembler() 56 | .setInputCols(Array( 57 | "accelXHand", "accelYHand", "accelZHand", 58 | "accelXChest", "accelYChest", "accelZChest", 59 | "accelXAnkle", "accelYAnkle", "accelZAnkle")) 60 | .setOutputCol("vectors") 61 | val normalizer = new Normalizer() 62 | .setInputCol(assembler.getOutputCol) 63 | .setOutputCol("features") 64 | val regressor = new RandomForestRegressor() 65 | 66 | val pipeline = new Pipeline() 67 | .setStages(Array(assembler, normalizer, regressor)) 68 | 69 | val validator = new CrossValidator() 70 | .setEstimator(pipeline) 71 | .setEvaluator(new RegressionEvaluator) 72 | val pGrid = new ParamGridBuilder() 73 | .addGrid(normalizer.p, Array(1.0, 5.0, 10.0)) 74 | .addGrid(regressor.numTrees, Array(10, 50, 100)) 75 | .build() 76 | validator.setEstimatorParamMaps(pGrid) 77 | validator.setNumFolds(5) 78 | 79 | val bestModel = validator.fit(train) 80 | val prediction = bestModel.transform(test) 81 | prediction.show() 82 | } 83 | }) 84 | 85 | ssc.start() 86 | ssc.awaitTermination() 87 | } 88 | 89 | } -------------------------------------------------------------------------------- /Chap9/src/main/scala/org/apress/prospark/L9-1LinearRegression.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.linalg.Vectors 6 | import org.apache.spark.mllib.regression.LabeledPoint 7 | import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD 8 | import org.apache.spark.rdd.RDD 9 | import org.apache.spark.rdd.RDD.doubleRDDToDoubleRDDFunctions 10 | import org.apache.spark.streaming.Seconds 11 | import org.apache.spark.streaming.StreamingContext 12 | 13 | object LinearRegressionApp { 14 | 15 | def main(args: Array[String]) { 16 | if (args.length != 4) { 17 | System.err.println( 18 | "Usage: LinearRegressionApp ") 19 | System.exit(1) 20 | } 21 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 22 | 23 | val conf = new SparkConf() 24 | .setAppName(appName) 25 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 26 | 27 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 28 | 29 | val substream = ssc.socketTextStream(hostname, port.toInt) 30 | .filter(!_.contains("NaN")) 31 | .map(_.split(" ")) 32 | .filter(f => f(1) != "0") 33 | 34 | val datastream = substream.map(f => Array(f(2).toDouble, f(3).toDouble, f(4).toDouble, f(5).toDouble, f(6).toDouble)) 35 | .map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5)))) 36 | val test = datastream.transform(rdd => rdd.randomSplit(Array(0.3, 0.7))(0)) 37 | val train = datastream.transformWith(test, (r1: RDD[LabeledPoint], r2: RDD[LabeledPoint]) => r1.subtract(r2)).cache() 38 | val model = new 
StreamingLinearRegressionWithSGD() 39 | .setInitialWeights(Vectors.zeros(4)) 40 | .setStepSize(0.0001) 41 | .setNumIterations(1) 42 | 43 | model.trainOn(train) 44 | model.predictOnValues(test.map(v => (v.label, v.features))).foreachRDD(rdd => println("MSE: %f".format(rdd 45 | .map(v => math.pow((v._1 - v._2), 2)).mean()))) 46 | 47 | ssc.start() 48 | ssc.awaitTermination() 49 | } 50 | 51 | } -------------------------------------------------------------------------------- /Chap9/src/main/scala/org/apress/prospark/L9-3Statistics.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.linalg.Vectors 6 | import org.apache.spark.mllib.stat.Statistics 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | 10 | object StatisticsApp { 11 | 12 | def main(args: Array[String]) { 13 | if (args.length != 4) { 14 | System.err.println( 15 | "Usage: StatisticsApp <appName> <batchInterval> <hostname> <port>") 16 | System.exit(1) 17 | } 18 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 19 | 20 | val conf = new SparkConf() 21 | .setAppName(appName) 22 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 23 | 24 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 25 | 26 | val substream = ssc.socketTextStream(hostname, port.toInt) 27 | .filter(!_.contains("NaN")) 28 | .map(_.split(" ")) 29 | .filter(f => f(1) != "0") 30 | .map(f => f.map(f => f.toDouble)) 31 | 32 | substream.map(f => Vectors.dense(f.slice(1, 5))).foreachRDD(rdd => { 33 | val stats = Statistics.colStats(rdd) 34 | println("Count: " + stats.count) 35 | println("Max: " + stats.max.toArray.mkString(" ")) 36 | println("Min: " + stats.min.toArray.mkString(" ")) 37 | println("Mean: " + stats.mean.toArray.mkString(" ")) 38 | println("L1-Norm: " + stats.normL1.toArray.mkString(" ")) 39 | println("L2-Norm: " + stats.normL2.toArray.mkString(" ")) 40 | println("Number of non-zeros: " + stats.numNonzeros.toArray.mkString(" ")) 41 | println("Variance: " + stats.variance.toArray.mkString(" ")) 42 | }) 43 | 44 | ssc.start() 45 | ssc.awaitTermination() 46 | } 47 | 48 | } -------------------------------------------------------------------------------- /Chap9/src/main/scala/org/apress/prospark/L9-4Correlation.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.linalg.Vectors 6 | import org.apache.spark.mllib.regression.LabeledPoint 7 | import org.apache.spark.mllib.stat.Statistics 8 | import org.apache.spark.streaming.Seconds 9 | import org.apache.spark.streaming.StreamingContext 10 | 11 | object CorrelationApp { 12 | 13 | def main(args: Array[String]) { 14 | if (args.length != 4) { 15 | System.err.println( 16 | "Usage: CorrelationApp <appName> <batchInterval> <hostname> <port>") 17 | System.exit(1) 18 | } 19 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 20 | 21 | val conf = new SparkConf() 22 | .setAppName(appName) 23 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 24 | 25 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 26 | 27 | val substream = ssc.socketTextStream(hostname, port.toInt) 28 | .filter(!_.contains("NaN")) 29 | .map(_.split(" ")) 30 | .filter(f => f(1) != "0") 31 | .map(f => f.map(f => f.toDouble)) 32 | 33 | val datastream = substream.map(f
=> Array(f(1).toDouble, f(2).toDouble, f(4).toDouble, f(5).toDouble, f(6).toDouble)) 34 | 35 | val walkingOrRunning = datastream.filter(f => f(0) == 4.0 || f(0) == 5.0).map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5)))) 36 | walkingOrRunning.map(f => f.features).foreachRDD(rdd => { 37 | val corrSpearman = Statistics.corr(rdd, "spearman") 38 | val corrPearson = Statistics.corr(rdd, "pearson") 39 | println("Correlation Spearman: \n" + corrSpearman) 40 | println("Correlation Pearson: \n" + corrPearson) 41 | }) 42 | 43 | ssc.start() 44 | ssc.awaitTermination() 45 | } 46 | 47 | } -------------------------------------------------------------------------------- /Chap9/src/main/scala/org/apress/prospark/L9-5ChiSq.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.linalg.Vectors 6 | import org.apache.spark.mllib.regression.LabeledPoint 7 | import org.apache.spark.mllib.stat.Statistics 8 | import org.apache.spark.streaming.Seconds 9 | import org.apache.spark.streaming.StreamingContext 10 | 11 | object ChiSqApp { 12 | 13 | def main(args: Array[String]) { 14 | if (args.length != 4) { 15 | System.err.println( 16 | "Usage: ChiSqApp ") 17 | System.exit(1) 18 | } 19 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 20 | 21 | val conf = new SparkConf() 22 | .setAppName(appName) 23 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 24 | 25 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 26 | 27 | val substream = ssc.socketTextStream(hostname, port.toInt) 28 | .filter(!_.contains("NaN")) 29 | .map(_.split(" ")) 30 | .filter(f => f(1) != "0") 31 | .map(f => f.map(f => f.toDouble)) 32 | 33 | substream.map(f => Array(f(1).toDouble, f(2).toDouble, f(4).toDouble, f(5).toDouble, f(6).toDouble)) 34 | .filter(f => f(0) == 4.0 || f(0) == 5.0) 35 | .map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5)))) 36 | .foreachRDD(rdd => { 37 | Statistics.chiSqTest(rdd).zipWithIndex.foreach(v => println("%s, column no. 
%d".format(v._1, v._2))) 38 | }) 39 | 40 | ssc.start() 41 | ssc.awaitTermination() 42 | } 43 | 44 | } -------------------------------------------------------------------------------- /Chap9/src/main/scala/org/apress/prospark/L9-6Preprocessing.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.feature.StandardScaler 6 | import org.apache.spark.mllib.linalg.Vectors 7 | import org.apache.spark.streaming.Seconds 8 | import org.apache.spark.streaming.StreamingContext 9 | 10 | object PreprocessingApp { 11 | 12 | def main(args: Array[String]) { 13 | if (args.length != 4) { 14 | System.err.println( 15 | "Usage: PreprocessingApp <appName> <batchInterval> <hostname> <port>") 16 | System.exit(1) 17 | } 18 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 19 | 20 | val conf = new SparkConf() 21 | .setAppName(appName) 22 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 23 | 24 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 25 | 26 | val substream = ssc.socketTextStream(hostname, port.toInt) 27 | .filter(!_.contains("NaN")) 28 | .map(_.split(" ")) 29 | .filter(f => f(1) != "0") 30 | 31 | substream.map(f => Array(f(2), f(4), f(5), f(6))) 32 | .map(f => f.map(v => v.toDouble)) 33 | .map(f => Vectors.dense(f)) 34 | .foreachRDD(rdd => { 35 | val scalerModel = new StandardScaler().fit(rdd) 36 | val scaledRDD = scalerModel.transform(rdd) 37 | }) 38 | 39 | ssc.start() 40 | ssc.awaitTermination() 41 | } 42 | 43 | } -------------------------------------------------------------------------------- /Chap9/src/main/scala/org/apress/prospark/L9-7FeatureExtraction.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.feature.ChiSqSelector 6 | import org.apache.spark.mllib.linalg.Vectors 7 | import org.apache.spark.mllib.regression.LabeledPoint 8 | import org.apache.spark.streaming.Seconds 9 | import org.apache.spark.streaming.StreamingContext 10 | 11 | object FeatureExtractionApp { 12 | 13 | def main(args: Array[String]) { 14 | if (args.length != 4) { 15 | System.err.println( 16 | "Usage: FeatureExtractionApp <appName> <batchInterval> <hostname> <port>") 17 | System.exit(1) 18 | } 19 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 20 | 21 | val conf = new SparkConf() 22 | .setAppName(appName) 23 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 24 | 25 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 26 | 27 | val substream = ssc.socketTextStream(hostname, port.toInt) 28 | .filter(!_.contains("NaN")) 29 | .map(_.split(" ")) 30 | .filter(f => f(1) != "0") 31 | 32 | val datastream = substream.map(f => Array(f(1), f(4), f(5), f(6), f(20), f(21), f(22), f(36), f(37), f(38))) 33 | .map(f => f.map(v => v.toDouble)) 34 | .map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, f.length).map(f => f / 2048)))) 35 | 36 | datastream.foreachRDD(rdd => { 37 | val selector = new ChiSqSelector(5) 38 | val model = selector.fit(rdd) 39 | val filtered = rdd.map(p => LabeledPoint(p.label, model.transform(p.features))) 40 | filtered.take(20).foreach(println) 41 | }) 42 | 43 | ssc.start() 44 | ssc.awaitTermination() 45 | } 46 | 47 | } -------------------------------------------------------------------------------- /Chap9/src/main/scala/org/apress/prospark/L9-8PCA.scala:
-------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.feature.PCA 6 | import org.apache.spark.mllib.linalg.Vectors 7 | import org.apache.spark.mllib.regression.LabeledPoint 8 | import org.apache.spark.streaming.Seconds 9 | import org.apache.spark.streaming.StreamingContext 10 | 11 | object PCAApp { 12 | 13 | def main(args: Array[String]) { 14 | if (args.length != 4) { 15 | System.err.println( 16 | "Usage: PCAApp ") 17 | System.exit(1) 18 | } 19 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 20 | 21 | val conf = new SparkConf() 22 | .setAppName(appName) 23 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 24 | 25 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 26 | 27 | val substream = ssc.socketTextStream(hostname, port.toInt) 28 | .filter(!_.contains("NaN")) 29 | .map(_.split(" ")) 30 | .filter(f => f(1) != "0") 31 | 32 | val datastream = substream.map(f => Array(f(1), f(4), f(5), f(6), f(20), f(21), f(22), f(36), f(37), f(38))) 33 | .map(f => f.map(v => v.toDouble)) 34 | .map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, f.length)))) 35 | 36 | datastream.foreachRDD(rdd => { 37 | val pca = new PCA(rdd.first().features.size / 2) 38 | .fit(rdd.map(_.features)) 39 | val testTrain = rdd.randomSplit(Array(0.3, 0.7)) 40 | val test = testTrain(0).map(lp => lp.copy(features = pca.transform(lp.features))) 41 | val train = testTrain(1).map(lp => lp.copy(features = pca.transform(lp.features))) 42 | train.take(20).foreach(println) 43 | }) 44 | 45 | ssc.start() 46 | ssc.awaitTermination() 47 | } 48 | 49 | } -------------------------------------------------------------------------------- /Chap9/src/main/scala/org/apress/prospark/L9-9LogisticRegression.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.linalg.Vectors 6 | import org.apache.spark.mllib.regression.LabeledPoint 7 | import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD 8 | import org.apache.spark.rdd.RDD 9 | import org.apache.spark.rdd.RDD.doubleRDDToDoubleRDDFunctions 10 | import org.apache.spark.streaming.Seconds 11 | import org.apache.spark.streaming.StreamingContext 12 | import org.apache.spark.mllib.classification.StreamingLogisticRegressionWithSGD 13 | 14 | object LogisticRegressionApp { 15 | 16 | def main(args: Array[String]) { 17 | if (args.length != 4) { 18 | System.err.println( 19 | "Usage: LogisticRegressionApp ") 20 | System.exit(1) 21 | } 22 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 23 | 24 | val conf = new SparkConf() 25 | .setAppName(appName) 26 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 27 | 28 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 29 | 30 | val substream = ssc.socketTextStream(hostname, port.toInt) 31 | .filter(!_.contains("NaN")) 32 | .map(_.split(" ")) 33 | .filter(f => f(1) != "0") 34 | 35 | val datastream = substream.map(f => Array(f(1).toDouble, f(2).toDouble, f(4).toDouble, f(5).toDouble, f(6).toDouble)) 36 | 37 | val walkingOrRunning = datastream.filter(f => f(0) == 4.0 || f(0) == 5.0).map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5)))) 38 | val test = walkingOrRunning.transform(rdd => rdd.randomSplit(Array(0.3, 0.7))(0)) 39 
| val train = walkingOrRunning.transformWith(test, (r1: RDD[LabeledPoint], r2: RDD[LabeledPoint]) => r1.subtract(r2)).cache() 40 | val model = new StreamingLogisticRegressionWithSGD() 41 | .setInitialWeights(Vectors.zeros(4)) 42 | .setStepSize(0.0001) 43 | .setNumIterations(1) 44 | 45 | model.trainOn(train) 46 | model.predictOnValues(test.map(v => (v.label, v.features))).foreachRDD(rdd => println("MSE: %f".format(rdd 47 | .map(v => math.pow((v._1 - v._2), 2)).mean()))) 48 | 49 | ssc.start() 50 | ssc.awaitTermination() 51 | } 52 | 53 | } -------------------------------------------------------------------------------- /Chap9/src/main/scala/org/apress/prospark/T9-4DataTypes.scala: -------------------------------------------------------------------------------- 1 | package org.apress.prospark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.linalg.Matrices 6 | import org.apache.spark.mllib.linalg.Vectors 7 | import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix 8 | import org.apache.spark.mllib.linalg.distributed.IndexedRow 9 | import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix 10 | import org.apache.spark.mllib.linalg.distributed.MatrixEntry 11 | import org.apache.spark.mllib.linalg.distributed.RowMatrix 12 | import org.apache.spark.mllib.regression.LabeledPoint 13 | import org.apache.spark.streaming.Seconds 14 | import org.apache.spark.streaming.StreamingContext 15 | 16 | object DataTypesApp { 17 | 18 | def main(args: Array[String]) { 19 | if (args.length != 4) { 20 | System.err.println( 21 | "Usage: DataTypesApp ") 22 | System.exit(1) 23 | } 24 | val Seq(appName, batchInterval, hostname, port) = args.toSeq 25 | 26 | val conf = new SparkConf() 27 | .setAppName(appName) 28 | .setJars(SparkContext.jarOfClass(this.getClass).toSeq) 29 | 30 | val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) 31 | 32 | val substream = ssc.socketTextStream(hostname, port.toInt) 33 | .filter(!_.contains("NaN")) 34 | .map(_.split(" ")) 35 | .filter(f => f(1) != "0") 36 | .map(f => f.map(f => f.toDouble)) 37 | 38 | val denseV = substream.map(f => Vectors.dense(f.slice(1, 5))) 39 | denseV.print() 40 | val sparseV = substream.map(f => f.slice(1, 5).toList).map(f => f.zipWithIndex.map { case (s, i) => (i, s) }) 41 | .map(f => f.filter(v => v._2 != 0)).map(l => Vectors.sparse(l.size, l)) 42 | sparseV.print() 43 | val labeledP = substream.map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5)))) 44 | labeledP.print() 45 | val denseM = substream.map(f => Matrices.dense(3, 16, f.slice(3, 19) ++ f.slice(20, 36) ++ f.slice(37, 53))) 46 | denseM.print() 47 | denseV.foreachRDD(rdd => { 48 | val rowM = new RowMatrix(rdd) 49 | println(rowM) 50 | }) 51 | denseV.foreachRDD(rdd => { 52 | val iRdd = rdd.zipWithIndex.map(v => new IndexedRow(v._2, v._1)) 53 | val iRowM = new IndexedRowMatrix(iRdd) 54 | println(iRowM) 55 | }) 56 | substream.foreachRDD(rdd => { 57 | val entries = rdd.zipWithIndex.flatMap(v => List(3, 20, 37).zipWithIndex.map(i => (i._2.toLong, v._2, v._1.slice(i._1, i._1 + 16).toList))) 58 | .map(v => v._3.map(d => new MatrixEntry(v._1, v._2, d))).flatMap(x => x) 59 | val cRowM = new CoordinateMatrix(entries) 60 | println(cRowM) 61 | }) 62 | substream.foreachRDD(rdd => { 63 | val entries = rdd.zipWithIndex.flatMap(v => List(3, 20, 37).zipWithIndex.map(i => (i._2.toLong, v._2, v._1.slice(i._1, i._1 + 16).toList))) 64 | .map(v => v._3.map(d => new MatrixEntry(v._1, v._2, d))).flatMap(x => x) 65 | val blockM = new 
CoordinateMatrix(entries).toBlockMatrix 66 | println(blockM) 67 | }) 68 | 69 | ssc.start() 70 | ssc.awaitTermination() 71 | } 72 | 73 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Pro Spark Streaming 2 | 3 | Code used in "Pro Spark Streaming: The Zen of Real-time Analytics using Apache Spark" published by Apress Publishing. 4 | 5 | ISBN-13: 978-1484214800 6 | 7 | ISBN-10: 1484214803 8 | 9 | # Layout 10 | 11 | Each folder contains code for a particular chapter. The repetition of code is deliberate. While this goes against most software engineering principles (held very dearly by the author as well), it is necessary to expound a topic and keep its implementation self-contained. 12 | 13 | ## Chapters 14 | 15 | - 2: Introduction to Spark 16 | - 3: DStreams: Real-time RDDs 17 | - 4: High Velocity Streams: Parallelism and Other Stories 18 | - 5: Real-time Route 66: Linking External Data Sources 19 | - 6: The Art of Side Effects 20 | - 7: Getting Ready for Prime Time 21 | - 8: Real-time ETL and Analytics Magic 22 | - 9: Machine Learning at Scale 23 | - 10: Of Clouds, Lambdas, and Pythons 24 | 25 | # Build 26 | 27 | Jump to a particular folder and simply execute `sbt assembly`. This will generate an uber JAR that can directly be submitted to a Spark cluster. --------------------------------------------------------------------------------
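For example, to build and run one of the Chapter 9 apps against a local Spark installation: the snippet below is only a sketch. The assembly JAR path assumes sbt-assembly's default naming for this build (Scala 2.10, project name "Chap9", version "1.0"), and the chosen class, master URL, and arguments are illustrative; use whatever path `sbt assembly` reports on your machine and the usage string printed by the app you pick.

```bash
# Build the uber JAR for Chapter 9 (run from the Chap9 folder).
sbt assembly

# Submit one of the example apps; StatisticsApp expects
# <appName> <batchInterval> <hostname> <port> as arguments.
spark-submit \
  --class org.apress.prospark.StatisticsApp \
  --master "local[4]" \
  target/scala-2.10/Chap9-assembly-1.0.jar \
  StatisticsApp 10 localhost 9999
```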