├── example_data
│   └── testData.txt
├── README.md
├── src
│   └── main
│       └── scala
│           └── cloudera
│               └── fun
│                   └── sparklambda
│                       ├── common
│                       │   └── ErrorCount.scala
│                       ├── etl
│                       │   └── BatchErrorCount.scala
│                       └── streaming
│                           └── StreamingErrorCount.scala
└── pom.xml

/example_data/testData.txt:
--------------------------------------------------------------------------------
14/08/20 21:40:52 INFO BlockFetcherIterator$BasicBlockFetcherIterator: Getting 1 non-empty blocks out of 1 blocks
14/08/20 21:40:52 INFO BlockFetcherIterator$BasicBlockFetcherIterator: Started 0 remote fetches in 3 ms
14/08/20 21:40:52 INFO FileOutputCommitter: Saved output of task 'attempt_201408202140_0000_m_000000_1' to file:/Users/gshapira/resData2/_temporary/0/task_201408202140_0000_m_000000
14/08/20 21:40:52 INFO SparkHadoopWriter: attempt_201408202140_0000_m_000000_1: Committed
14/08/20 21:40:52 INFO Executor: Serialized size of result for 1 is 825
14/08/20 21:40:52 ERROR Executor: Sending result for 1 directly to driver
14/08/20 21:40:52 INFO Executor: Finished task ID 1
14/08/20 21:40:52 INFO DAGScheduler: Completed ResultTask(0, 0)
14/08/20 21:40:52 INFO TaskSetManager: Finished TID 1 in 98 ms on localhost (progress: 1/1)
14/08/20 21:40:52 ERROR TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool
14/08/20 21:40:52 INFO DAGScheduler: Stage 0 (saveAsTextFile at BatchErrorCount.scala:41) finished in 0.099 s
14/08/20 21:40:52 ERROR SparkContext: Job finished: saveAsTextFile at BatchErrorCount.scala:41, took 0.345168 s
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Spark Lambda Architecture
## Overview
This project shows how Spark can be used to implement a Lambda Architecture efficiently.

Lambda Architectures typically share some of the business logic between the batch layer and the speed layer.
When each layer is implemented in a different language or framework, this leads to code duplication, painful
maintenance, and errors.

If, on the other hand, Spark is used to implement the batch layer and Spark Streaming the speed layer, the two
layers can share common functions, reducing code duplication and the associated maintenance overhead.

This project demonstrates that approach.
It contains two packages for counting errors in logs: one for batch use and the other for streaming.
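
Both packages delegate to the same function in the common package. A condensed sketch of the pattern,
taken from the code in this repository (here `input` stands in for the input-path argument):

    // Shared business logic (common/ErrorCount.scala)
    def countErrors(rdd: RDD[String]): RDD[(String, Int)]

    // Batch layer (etl/BatchErrorCount.scala): apply it to an RDD read from files
    val errCount = ErrorCount.countErrors(sc.textFile(input))

    // Speed layer (streaming/StreamingErrorCount.scala): apply the same function
    // to every micro-batch RDD of a DStream
    val errCountStream = dStream.transform(rdd => ErrorCount.countErrors(rdd))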

## Build

    mvn clean package

## Usage

### ETL example:

    java -cp SparkStreamingLambda-1.0-SNAPSHOT.jar:/lib/spark-assembly-1.0.2-hadoop2.2.0.jar cloudera.fun.sparklambda.etl.BatchErrorCount <master> <input path> <output path>

### Streaming example:

    java -cp SparkStreamingLambda-1.0-SNAPSHOT.jar:/lib/spark-assembly-1.0.2-hadoop2.2.0.jar cloudera.fun.sparklambda.streaming.StreamingErrorCount <master> localhost <port>

To send data to the streaming example, use:

    nc -lk <port>
--------------------------------------------------------------------------------
/src/main/scala/cloudera/fun/sparklambda/common/ErrorCount.scala:
--------------------------------------------------------------------------------
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cloudera.fun.sparklambda.common

import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext._

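/**
 * Business logic shared by the batch (etl) and streaming packages: count ERROR
 * lines per date. The date key is the first whitespace-separated token of the
 * log line, so "14/08/20 21:40:52 ERROR Executor: ..." counts toward "14/08/20".
 */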
object ErrorCount {
  def countErrors(rdd: RDD[String]): RDD[(String, Int)] = {
    rdd
      .filter(_.contains("ERROR"))      // Keep "ERROR" lines
      .map(s => (s.split(" ")(0), 1))   // Return tuple with date & count
      .reduceByKey(_ + _)               // Sum counts for each date
  }
}
--------------------------------------------------------------------------------
/src/main/scala/cloudera/fun/sparklambda/etl/BatchErrorCount.scala:
--------------------------------------------------------------------------------
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cloudera.fun.sparklambda.etl

import cloudera.fun.sparklambda.common.ErrorCount
import org.apache.spark.{SparkConf, SparkContext}


object BatchErrorCount {
  def main(args: Array[String]): Unit = {
    if (args.length < 3) {
      System.err.println("Usage: BatchErrorCount <master> <input path> <output path>")
      System.exit(1)
    }
    val conf = new SparkConf()
      .setMaster(args(0))
      .setAppName(this.getClass.getCanonicalName)
      .setJars(Seq(SparkContext.jarOfClass(this.getClass).get))

    val sc = new SparkContext(conf)

    val lines = sc.textFile(args(1))

    val errCount = ErrorCount.countErrors(lines)

    errCount.saveAsTextFile(args(2))

    // Shut down Spark cleanly once the job is done
    sc.stop()
  }
}
--------------------------------------------------------------------------------
/src/main/scala/cloudera/fun/sparklambda/streaming/StreamingErrorCount.scala:
--------------------------------------------------------------------------------
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cloudera.fun.sparklambda.streaming

import cloudera.fun.sparklambda.common.ErrorCount
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, Logging}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.StreamingContext._
import org.apache.log4j.{Level, Logger}


/**
 * Created by gshapira on 8/20/14.
 */
object StreamingErrorCount extends Logging {

  def main(args: Array[String]): Unit = {

    if (args.length < 3) {
      System.err.println("Usage: StreamingErrorCount <master> <hostname> <port>")
      System.exit(1)
    }

    // Configure the Streaming Context
    val sparkConf = new SparkConf()
      .setMaster(args(0))
      .setAppName(this.getClass.getCanonicalName)

    setStreamingLogLevels()

    val ssc = new StreamingContext(sparkConf, Seconds(10))
    ssc.checkpoint(".")

    // Create the DStream from data sent over the network
    val dStream = ssc.socketTextStream(args(1), args(2).toInt, StorageLevel.MEMORY_AND_DISK_SER)

    // Count the errors in each RDD in the stream
    val errCountStream = dStream.transform(rdd => ErrorCount.countErrors(rdd))

    // Print the error count for the current batch (take(1) guards against empty batches,
    // where first() would throw)
    errCountStream.foreachRDD(rdd => {
      rdd.take(1).foreach { case (_, count) =>
        System.out.println("Errors this batch: %d".format(count))
      }
    })

    // Create a stream with the running error count
    val stateStream = errCountStream.updateStateByKey[Int](updateFunc)

    // Print the running error count
    stateStream.foreachRDD(rdd => {
      rdd.take(1).foreach { case (_, count) =>
        System.out.println("Errors so far: %d".format(count))
      }
    })

    // Start the action
    ssc.start()
    ssc.awaitTermination()
  }

  def setStreamingLogLevels() {
    val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements
    if (!log4jInitialized) {
      // We first log something to initialize Spark's default logging, then we override the
      // logging level.
      logInfo("Setting log level to [WARN] for streaming example." +
        " To override add a custom log4j.properties to the classpath.")
      Logger.getRootLogger.setLevel(Level.WARN)
    }
  }

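  // State update function for updateStateByKey: add this batch's counts for a
  // key (date) to the running total carried over from previous batches.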
  val updateFunc = (values: Seq[Int], state: Option[Int]) => {
    val currentCount = values.foldLeft(0)(_ + _)

    val previousCount = state.getOrElse(0)

    Some(currentCount + previousCount)
  }

}
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Licensed to the Apache Software Foundation (ASF) under one or more
  contributor license agreements. See the NOTICE file distributed with
  this work for additional information regarding copyright ownership.
  The ASF licenses this file to You under the Apache License, Version 2.0
  (the "License"); you may not use this file except in compliance with
  the License. You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>demo</groupId>
  <artifactId>SparkStreamingLambda</artifactId>
  <version>1.0-SNAPSHOT</version>

  <repositories>
    <repository>
      <id>scala-tools.org</id>
      <name>Scala-tools Maven2 Repository</name>
      <url>http://scala-tools.org/repo-releases</url>
    </repository>
    <repository>
      <id>cloudera-repos</id>
      <name>Cloudera Repos</name>
      <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
    </repository>
  </repositories>

  <dependencies>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.10</artifactId>
      <version>1.0.0-cdh5.1.0</version>
      <scope>provided</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming_2.10</artifactId>
      <version>1.0.0-cdh5.1.0</version>
      <scope>provided</scope>
    </dependency>
  </dependencies>

  <pluginRepositories>
    <pluginRepository>
      <id>scala-tools.org</id>
      <name>Scala-tools Maven2 Repository</name>
      <url>http://scala-tools.org/repo-releases</url>
    </pluginRepository>
  </pluginRepositories>

  <build>
    <sourceDirectory>src/main/scala</sourceDirectory>
    <testSourceDirectory>src/test/scala</testSourceDirectory>
    <plugins>
      <plugin>
        <groupId>org.scala-tools</groupId>
        <artifactId>maven-scala-plugin</artifactId>
        <version>2.15.2</version>
        <executions>
          <execution>
            <goals>
              <goal>compile</goal>
              <goal>testCompile</goal>
            </goals>
          </execution>
        </executions>
        <configuration>
          <scalaVersion>2.10</scalaVersion>
        </configuration>
      </plugin>
      <plugin>
        <artifactId>maven-assembly-plugin</artifactId>
        <version>2.3</version>
        <configuration>
          <descriptorRefs>
            <descriptorRef>jar-with-dependencies</descriptorRef>
          </descriptorRefs>
        </configuration>
        <executions>
          <execution>
            <id>make-assembly</id>
            <phase>package</phase>
            <goals>
              <goal>single</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
</project>
--------------------------------------------------------------------------------