├── .gitignore ├── LICENSE ├── README.md ├── accessloganalyzer ├── .gitignore ├── README.md ├── build.sbt ├── project │ ├── build.properties │ └── plugins.sbt └── src │ └── main │ ├── resources │ ├── access.log │ └── application.conf │ └── scala │ └── com │ └── supergloo │ ├── Boot.scala │ ├── models │ └── HttpStatus.scala │ └── utils │ ├── AccessLogParser.scala │ └── Utils.scala ├── got-battles ├── README.md ├── battles.csv ├── build.sbt ├── project │ └── assembly.sbt └── src │ └── main │ └── scala │ └── com │ └── supergloo │ └── SparkCassandra.scala └── kafka-streaming ├── README.md ├── build.sbt ├── cql └── create-timeseries.cql ├── data └── load │ └── ny-2008.csv ├── project ├── assembly.sbt └── build.properties └── src └── main ├── resources └── log4j.properties └── scala └── com ├── killrweather └── data │ └── Weather.scala └── supergloo ├── WeatherDataStream.scala └── package.scala /.gitignore: -------------------------------------------------------------------------------- 1 | **/target/ 2 | **/project/target/ 3 | **/project/project/ 4 | **/.DS_Store 5 | **/.idea/** 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | CC0 1.0 Universal 2 | 3 | Statement of Purpose 4 | 5 | The laws of most jurisdictions throughout the world automatically confer 6 | exclusive Copyright and Related Rights (defined below) upon the creator and 7 | subsequent owner(s) (each and all, an "owner") of an original work of 8 | authorship and/or a database (each, a "Work"). 9 | 10 | Certain owners wish to permanently relinquish those rights to a Work for the 11 | purpose of contributing to a commons of creative, cultural and scientific 12 | works ("Commons") that the public can reliably and without fear of later 13 | claims of infringement build upon, modify, incorporate in other works, reuse 14 | and redistribute as freely as possible in any form whatsoever and for any 15 | purposes, including without limitation commercial purposes. These owners may 16 | contribute to the Commons to promote the ideal of a free culture and the 17 | further production of creative, cultural and scientific works, or to gain 18 | reputation or greater distribution for their Work in part through the use and 19 | efforts of others. 20 | 21 | For these and/or other purposes and motivations, and without any expectation 22 | of additional consideration or compensation, the person associating CC0 with a 23 | Work (the "Affirmer"), to the extent that he or she is an owner of Copyright 24 | and Related Rights in the Work, voluntarily elects to apply CC0 to the Work 25 | and publicly distribute the Work under its terms, with knowledge of his or her 26 | Copyright and Related Rights in the Work and the meaning and intended legal 27 | effect of CC0 on those rights. 28 | 29 | 1. Copyright and Related Rights. A Work made available under CC0 may be 30 | protected by copyright and related or neighboring rights ("Copyright and 31 | Related Rights"). Copyright and Related Rights include, but are not limited 32 | to, the following: 33 | 34 | i. the right to reproduce, adapt, distribute, perform, display, communicate, 35 | and translate a Work; 36 | 37 | ii. moral rights retained by the original author(s) and/or performer(s); 38 | 39 | iii. publicity and privacy rights pertaining to a person's image or likeness 40 | depicted in a Work; 41 | 42 | iv. 
rights protecting against unfair competition in regards to a Work, 43 | subject to the limitations in paragraph 4(a), below; 44 | 45 | v. rights protecting the extraction, dissemination, use and reuse of data in 46 | a Work; 47 | 48 | vi. database rights (such as those arising under Directive 96/9/EC of the 49 | European Parliament and of the Council of 11 March 1996 on the legal 50 | protection of databases, and under any national implementation thereof, 51 | including any amended or successor version of such directive); and 52 | 53 | vii. other similar, equivalent or corresponding rights throughout the world 54 | based on applicable law or treaty, and any national implementations thereof. 55 | 56 | 2. Waiver. To the greatest extent permitted by, but not in contravention of, 57 | applicable law, Affirmer hereby overtly, fully, permanently, irrevocably and 58 | unconditionally waives, abandons, and surrenders all of Affirmer's Copyright 59 | and Related Rights and associated claims and causes of action, whether now 60 | known or unknown (including existing as well as future claims and causes of 61 | action), in the Work (i) in all territories worldwide, (ii) for the maximum 62 | duration provided by applicable law or treaty (including future time 63 | extensions), (iii) in any current or future medium and for any number of 64 | copies, and (iv) for any purpose whatsoever, including without limitation 65 | commercial, advertising or promotional purposes (the "Waiver"). Affirmer makes 66 | the Waiver for the benefit of each member of the public at large and to the 67 | detriment of Affirmer's heirs and successors, fully intending that such Waiver 68 | shall not be subject to revocation, rescission, cancellation, termination, or 69 | any other legal or equitable action to disrupt the quiet enjoyment of the Work 70 | by the public as contemplated by Affirmer's express Statement of Purpose. 71 | 72 | 3. Public License Fallback. Should any part of the Waiver for any reason be 73 | judged legally invalid or ineffective under applicable law, then the Waiver 74 | shall be preserved to the maximum extent permitted taking into account 75 | Affirmer's express Statement of Purpose. In addition, to the extent the Waiver 76 | is so judged Affirmer hereby grants to each affected person a royalty-free, 77 | non transferable, non sublicensable, non exclusive, irrevocable and 78 | unconditional license to exercise Affirmer's Copyright and Related Rights in 79 | the Work (i) in all territories worldwide, (ii) for the maximum duration 80 | provided by applicable law or treaty (including future time extensions), (iii) 81 | in any current or future medium and for any number of copies, and (iv) for any 82 | purpose whatsoever, including without limitation commercial, advertising or 83 | promotional purposes (the "License"). The License shall be deemed effective as 84 | of the date CC0 was applied by Affirmer to the Work. Should any part of the 85 | License for any reason be judged legally invalid or ineffective under 86 | applicable law, such partial invalidity or ineffectiveness shall not 87 | invalidate the remainder of the License, and in such case Affirmer hereby 88 | affirms that he or she will not (i) exercise any of his or her remaining 89 | Copyright and Related Rights in the Work or (ii) assert any associated claims 90 | and causes of action with respect to the Work, in either case contrary to 91 | Affirmer's express Statement of Purpose. 92 | 93 | 4. Limitations and Disclaimers. 94 | 95 | a. 
No trademark or patent rights held by Affirmer are waived, abandoned, 96 | surrendered, licensed or otherwise affected by this document. 97 | 98 | b. Affirmer offers the Work as-is and makes no representations or warranties 99 | of any kind concerning the Work, express, implied, statutory or otherwise, 100 | including without limitation warranties of title, merchantability, fitness 101 | for a particular purpose, non infringement, or the absence of latent or 102 | other defects, accuracy, or the present or absence of errors, whether or not 103 | discoverable, all to the greatest extent permissible under applicable law. 104 | 105 | c. Affirmer disclaims responsibility for clearing rights of other persons 106 | that may apply to the Work or any use thereof, including without limitation 107 | any person's Copyright and Related Rights in the Work. Further, Affirmer 108 | disclaims responsibility for obtaining any necessary consents, permissions 109 | or other rights required for any use of the Work. 110 | 111 | d. Affirmer understands and acknowledges that Creative Commons is not a 112 | party to this document and has no duty or obligation with respect to this 113 | CC0 or use of the Work. 114 | 115 | For more information, please see 116 | 117 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Spark Scala 2 | 3 | Spark with Scala example projects 4 | 5 | [Scala Spark Tutorials and Examples](https://supergloo.com/spark-tutorial/spark-tutorials-scala/) 6 | -------------------------------------------------------------------------------- /accessloganalyzer/.gitignore: -------------------------------------------------------------------------------- 1 | target/** 2 | project/target/** 3 | project/project/** 4 | .idea/** 5 | .idea_modules/** 6 | 7 | -------------------------------------------------------------------------------- /accessloganalyzer/README.md: -------------------------------------------------------------------------------- 1 | Spark Broadcast and Accumulator Examples 2 | ---------------------------------------- 3 | 4 | Example of using broadcast and accumulators in Scala for analyzing web server log files to determine HTTP status code counts. 5 | 6 | [Spark Broadcast and Accumulator Examples](https://supergloo.com/spark-scala/spark-broadcast-accumulator-examples-scala/ "Spark Broadcast and Accumulator Examples"). 
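A minimal, self-contained sketch of the same pattern is shown below (hypothetical log lines and a made-up status set; the project's real parser and status model live in `Boot.scala`, `AccessLogParser.scala`, and `HttpStatus.scala`):

```scala
import org.apache.spark.{SparkConf, SparkContext}

object BroadcastAccumulatorSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf(true).setMaster("local[2]").setAppName("sketch"))

    // Broadcast variable: read-only lookup data shipped once to every executor
    val serverErrorCodes = sc.broadcast(Set("500", "502", "503"))

    // Accumulator: a counter whose per-executor updates are relayed back to the driver
    val serverErrors = sc.accumulator(0, "HTTP 5xx")

    // Stand-in for reading access.log; each element plays the role of one log line
    sc.parallelize(Seq("GET /a 200", "GET /b 503", "GET /c 500")).foreach { line =>
      val status = line.split(" ").last
      if (serverErrorCodes.value.contains(status)) serverErrors += 1
    }

    // The accumulator value is only reliable when read back on the driver
    println(s"5xx responses: ${serverErrors.value}")
    sc.stop()
  }
}
```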
7 | -------------------------------------------------------------------------------- /accessloganalyzer/build.sbt: -------------------------------------------------------------------------------- 1 | name := "sparkAnalyzer" 2 | 3 | version := "1.0" 4 | 5 | scalaVersion := "2.11.8" 6 | 7 | // https://mvnrepository.com/artifact/org.apache.spark/spark-core_2.11 8 | libraryDependencies += "org.apache.spark" % "spark-core_2.11" % "1.6.1" 9 | -------------------------------------------------------------------------------- /accessloganalyzer/project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version = 0.13.8 -------------------------------------------------------------------------------- /accessloganalyzer/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | logLevel := Level.Warn 2 | 3 | addSbtPlugin("com.github.mpeltonen" % "sbt-idea" % "1.6.0") 4 | -------------------------------------------------------------------------------- /accessloganalyzer/src/main/resources/application.conf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmcgrath/spark-scala/42efdf17a2297b7e110b05debab62536512c4679/accessloganalyzer/src/main/resources/application.conf -------------------------------------------------------------------------------- /accessloganalyzer/src/main/scala/com/supergloo/Boot.scala: -------------------------------------------------------------------------------- 1 | package com.supergloo 2 | 3 | import com.supergloo.utils.AccessLogParser 4 | import com.supergloo.models._ 5 | import org.apache.spark.{SparkConf, SparkContext} 6 | 7 | /** 8 | * https://www.supergloo.com 9 | */ 10 | object Boot { 11 | 12 | import utils.Utils._ 13 | 14 | def main(args: Array[String]): Unit = { 15 | 16 | val sparkConf = new SparkConf(true) 17 | .setMaster("local[2]") 18 | .setAppName("SparkAnalyzer") 19 | 20 | val sparkContext = new SparkContext(sparkConf) 21 | 22 | /** 23 | * Defining list of all HTTP status codes divided into status groups 24 | * This list is read only, and it is used for parsing access log file in order to count status code groups 25 | * This example of broadcast variable shows how broadcast value can be used in computations on workers nodes 26 | */ 27 | val httpStatusList = sparkContext broadcast populateHttpStatusList 28 | 29 | /** 30 | * Definition of accumulators for counting specific HTTP status codes 31 | * Accumulator variable is used because of all the updates to this variable in every executor is relayed back to the driver. 
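      * (Spark guarantees exactly-once accumulator updates only for updates made inside actions, such as the foreach below; updates inside transformations can be re-applied if a task is retried.)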
32 | * Otherwise they are local variable on executor and it is not relayed back to driver 33 | * so driver value is not changed 34 | */ 35 | val httpInfo = sparkContext accumulator(0, "HTTP 1xx") 36 | val httpSuccess = sparkContext accumulator(0, "HTTP 2xx") 37 | val httpRedirect = sparkContext accumulator(0, "HTTP 3xx") 38 | val httpClientError = sparkContext accumulator(0, "HTTP 4xx") 39 | val httpServerError = sparkContext accumulator(0, "HTTP 5xx") 40 | 41 | /** 42 | * Iterate over access.log file and parse every line 43 | * for every line extract HTTP status code from it and update appropriate accumulator variable 44 | */ 45 | sparkContext.textFile(getClass.getResource("/access.log").getPath, 2).foreach { line => 46 | httpStatusList.value foreach { 47 | case httpInfoStatus: HttpInfoStatus if (AccessLogParser.parseHttpStatusCode(line).equals(Some(httpInfoStatus))) => httpInfo += 1 48 | case httpSuccessStatus: HttpSuccessStatus if (AccessLogParser.parseHttpStatusCode(line).equals(Some(httpSuccessStatus))) => httpSuccess += 1 49 | case httpRedirectStatus: HttpRedirectStatus if (AccessLogParser.parseHttpStatusCode(line).equals(Some(httpRedirectStatus))) => httpRedirect += 1 50 | case httpClientErrorStatus: HttpClientErrorStatus if (AccessLogParser.parseHttpStatusCode(line).equals(Some(httpClientErrorStatus))) => httpClientError += 1 51 | case httpServerErrorStatus: HttpServerErrorStatus if (AccessLogParser.parseHttpStatusCode(line).equals(Some(httpServerErrorStatus))) => httpServerError += 1 52 | case _ => 53 | } 54 | } 55 | 56 | println("########## START ##########") 57 | println("Printing HttpStatusCodes result from parsing access log") 58 | println(s"HttpStatusInfo : ${httpInfo.value}") 59 | println(s"HttpStatusSuccess : ${httpSuccess.value}") 60 | println(s"HttpStatusRedirect : ${httpRedirect.value}") 61 | println(s"HttpStatusClientError : ${httpClientError.value}") 62 | println(s"HttpStatusServerError : ${httpServerError.value}") 63 | println("########## END ##########") 64 | 65 | sparkContext.stop() 66 | } 67 | 68 | } 69 | -------------------------------------------------------------------------------- /accessloganalyzer/src/main/scala/com/supergloo/models/HttpStatus.scala: -------------------------------------------------------------------------------- 1 | package com.supergloo.models 2 | 3 | /** 4 | * Created by sromic on 19/06/16. 5 | */ 6 | sealed abstract class HttpStatus(val status: String) 7 | 8 | case class HttpInfoStatus(override val status: String) extends HttpStatus(status) 9 | case class HttpSuccessStatus(override val status: String) extends HttpStatus(status) 10 | case class HttpRedirectStatus(override val status: String) extends HttpStatus(status) 11 | case class HttpClientErrorStatus(override val status: String) extends HttpStatus(status) 12 | case class HttpServerErrorStatus(override val status: String) extends HttpStatus(status) -------------------------------------------------------------------------------- /accessloganalyzer/src/main/scala/com/supergloo/utils/AccessLogParser.scala: -------------------------------------------------------------------------------- 1 | package com.supergloo.utils 2 | 3 | import java.util.regex.Pattern 4 | 5 | import com.supergloo.models.HttpStatus 6 | 7 | /** 8 | * https://www.supergloo.com 9 | */ 10 | object AccessLogParser extends Serializable { 11 | import Utils._ 12 | 13 | private val ddd = "\\d{1,3}" 14 | private val ip = s"($ddd\\.$ddd\\.$ddd\\.$ddd)?" 
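  // The fields below combine (in accessLogRegex) into a pattern for the Apache combined log format:
  // ip, client id, user, [timestamp], "request", status, bytes, "referer", "user-agent".
  // The HTTP status code is capture group 6, which parseHttpStatusCode extracts.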
15 | private val client = "(\\S+)" 16 | private val user = "(\\S+)" 17 | private val dateTime = "(\\[.+?\\])" 18 | private val request = "\"(.*?)\"" 19 | private val status = "(\\d{3})" 20 | private val bytes = "(\\S+)" 21 | private val referer = "\"(.*?)\"" 22 | private val agent = "\"(.*?)\"" 23 | private val accessLogRegex = s"$ip $client $user $dateTime $request $status $bytes $referer $agent" 24 | private val p = Pattern.compile(accessLogRegex) 25 | 26 | /** 27 | * Extract HTTP status code and create HttpStatus instance for given status code 28 | */ 29 | def parseHttpStatusCode(logLine: String): Option[HttpStatus] = { 30 | val matcher = p.matcher(logLine) 31 | if(matcher.find) { 32 | Some(createHttpStatus(matcher.group(6))) 33 | } 34 | else { 35 | None 36 | } 37 | } 38 | 39 | } 40 | -------------------------------------------------------------------------------- /accessloganalyzer/src/main/scala/com/supergloo/utils/Utils.scala: -------------------------------------------------------------------------------- 1 | package com.supergloo.utils 2 | 3 | import com.supergloo.models._ 4 | 5 | /** 6 | * https://www.supergloo.com 7 | */ 8 | object Utils { 9 | 10 | private val httpStatuses = List( 11 | "100", "101", "103", 12 | "200", "201", "202", "203", "204", "205", "206", 13 | "300", "301", "302", "303", "304", "305", "306", "307", "308", 14 | "400", "401", "402", "403", "404", "405", "406", "407", "408", "409", "410", "411", "412", "413", "414", "415", "416", "417", 15 | "500", "501", "502", "503", "504", "505", "511" 16 | ) 17 | 18 | def populateHttpStatusList(): List[HttpStatus] = { 19 | httpStatuses map createHttpStatus 20 | } 21 | 22 | def createHttpStatus(status: String): HttpStatus = status match { 23 | case status if (status.startsWith("1")) => HttpInfoStatus(status) 24 | case status if (status.startsWith("2")) => HttpSuccessStatus(status) 25 | case status if (status.startsWith("3")) => HttpRedirectStatus(status) 26 | case status if (status.startsWith("4")) => HttpClientErrorStatus(status) 27 | case status if (status.startsWith("5")) => HttpServerErrorStatus(status) 28 | } 29 | 30 | } 31 | -------------------------------------------------------------------------------- /got-battles/README.md: -------------------------------------------------------------------------------- 1 | # Apache Spark, Cassandra and Game of Thrones Example 2 | 3 | Original post [Apache Spark Casandra Example](https://supergloo.com/spark-scala/apache-spark-cassandra/) 4 | -------------------------------------------------------------------------------- /got-battles/battles.csv: -------------------------------------------------------------------------------- 1 | name,year,battle_number,attacker_king,defender_king,attacker_1,attacker_2,attacker_3,attacker_4,defender_1,defender_2,defender_3,defender_4,attacker_outcome,battle_type,major_death,major_capture,attacker_size,defender_size,attacker_commander,defender_commander,summer,location,region,note 2 | Battle of the Golden Tooth,298,1,Joffrey/Tommen Baratheon,Robb Stark,Lannister,,,,Tully,,,,win,pitched battle,1,0,15000,4000,Jaime Lannister,"Clement Piper, Vance",1,Golden Tooth,The Westerlands, 3 | Battle at the Mummer's Ford,298,2,Joffrey/Tommen Baratheon,Robb Stark,Lannister,,,,Baratheon,,,,win,ambush,1,0,,120,Gregor Clegane,Beric Dondarrion,1,Mummer's Ford,The Riverlands, 4 | Battle of Riverrun,298,3,Joffrey/Tommen Baratheon,Robb Stark,Lannister,,,,Tully,,,,win,pitched battle,0,1,15000,10000,"Jaime Lannister, Andros Brax","Edmure Tully, Tytos 
Blackwood",1,Riverrun,The Riverlands, 5 | Battle of the Green Fork,298,4,Robb Stark,Joffrey/Tommen Baratheon,Stark,,,,Lannister,,,,loss,pitched battle,1,1,18000,20000,"Roose Bolton, Wylis Manderly, Medger Cerwyn, Harrion Karstark, Halys Hornwood","Tywin Lannister, Gregor Clegane, Kevan Lannister, Addam Marbrand",1,Green Fork,The Riverlands, 6 | Battle of the Whispering Wood,298,5,Robb Stark,Joffrey/Tommen Baratheon,Stark,Tully,,,Lannister,,,,win,ambush,1,1,1875,6000,"Robb Stark, Brynden Tully",Jaime Lannister,1,Whispering Wood,The Riverlands, 7 | Battle of the Camps,298,6,Robb Stark,Joffrey/Tommen Baratheon,Stark,Tully,,,Lannister,,,,win,ambush,0,0,6000,12625,"Robb Stark, Tytos Blackwood, Brynden Tully","Lord Andros Brax, Forley Prester",1,Riverrun,The Riverlands, 8 | Sack of Darry,298,7,Joffrey/Tommen Baratheon,Robb Stark,Lannister,,,,Darry,,,,win,pitched battle,0,0,,,Gregor Clegane,Lyman Darry,1,Darry,The Riverlands, 9 | Battle of Moat Cailin,299,8,Balon/Euron Greyjoy,Robb Stark,Greyjoy,,,,Stark,,,,win,pitched battle,0,0,,,Victarion Greyjoy,,1,Moat Cailin,The North, 10 | Battle of Deepwood Motte,299,9,Balon/Euron Greyjoy,Robb Stark,Greyjoy,,,,Stark,,,,win,siege,0,0,1000,,Asha Greyjoy,,1,Deepwood Motte,The North, 11 | Battle of the Stony Shore,299,10,Balon/Euron Greyjoy,Robb Stark,Greyjoy,,,,Stark,,,,win,ambush,0,0,264,,Theon Greyjoy,,1,Stony Shore,The North,"Greyjoy's troop number based on the Battle of Deepwood Motte, in which Asha had 1000 soldier on 30 longships. That comes out to ~33 per longship. In the Battle of the Stony Shore, Theon has 8 longships, and just we can estimate that he has 8*33 =265 troops." 12 | Battle of Torrhen's Square,299,11,Robb Stark,Balon/Euron Greyjoy,Stark,,,,Greyjoy,,,,win,pitched battle,0,0,244,900,"Rodrik Cassel, Cley Cerwyn",Dagmer Cleftjaw,1,Torrhen's Square,The North,Greyjoy's troop number comes from the 264 estimate to have arrived on the stony shore minus the 20 Theon takes to attack Winterfell. Thus 264-20=244 13 | Battle of Winterfell,299,12,Balon/Euron Greyjoy,Robb Stark,Greyjoy,,,,Stark,,,,win,ambush,0,1,20,,Theon Greyjoy,Bran Stark,1,Winterfell,The North,"It isn't mentioned how many Stark men are left in Winterfell, other than ""very few""." 14 | Sack of Torrhen's Square,299,13,Balon/Euron Greyjoy,Balon/Euron Greyjoy,Greyjoy,,,,Stark,,,,win,siege,0,1,,,Dagmer Cleftjaw,,1,Torrhen's Square,The North, 15 | Sack of Winterfell,299,14,Joffrey/Tommen Baratheon,Robb Stark,Bolton,Greyjoy,,,Stark,,,,win,ambush,1,0,618,2000,"Ramsay Snow, Theon Greyjoy ","Rodrik Cassel, Cley Cerwyn, Leobald Tallhart",1,Winterfell,The North,"Since House Bolton betrays the Starks for House Lannister, we code this battle as between these two houses. Greyjoy men, numbering only 20, don't play a major part in the fighting and end up dying anyway." 
16 | Battle of Oxcross,299,15,Robb Stark,Joffrey/Tommen Baratheon,Stark,Tully,,,Lannister,,,,win,ambush,1,1,6000,10000,"Robb Stark, Brynden Tully","Stafford Lannister, Roland Crakehall, Antario Jast",1,Oxcross,The Westerlands, 17 | Siege of Storm's End,299,16,Stannis Baratheon,Renly Baratheon,Baratheon,,,,Baratheon,,,,win,siege,1,0,5000,20000,"Stannis Baratheon, Davos Seaworth","Renly Baratheon, Cortnay Penrose, Loras Tyrell, Randyll Tarly, Mathis Rowan",1,Storm's End,The Stormlands, 18 | Battle of the Fords,299,17,Joffrey/Tommen Baratheon,Robb Stark,Lannister,,,,Tully,,,,loss,pitched battle,0,0,20000,10000,"Tywin Lannister, Flement Brax, Gregor Clegane, Addam Marbrand, Lyle Crakehall, Leo Lefford","Edmure Tully, Jason Mallister, Karyl Vance",1,Red Fork,The Riverlands, 19 | Sack of Harrenhal,299,18,Robb Stark,Joffrey/Tommen Baratheon,Stark,,,,Lannister,,,,win,ambush,1,0,100,100,"Roose Bolton, Vargo Hoat, Robett Glover",Amory Lorch,1,Harrenhal,The Riverlands, 20 | Battle of the Crag,299,19,Robb Stark,Joffrey/Tommen Baratheon,Stark,,,,Lannister,,,,win,ambush,0,0,6000,,"Robb Stark, Smalljon Umber, Black Walder Frey",Rolph Spicer,1,Crag,The Westerlands, 21 | Battle of the Blackwater,299,20,Stannis Baratheon,Joffrey/Tommen Baratheon,Baratheon,,,,Lannister,,,,loss,pitched battle,1,1,21000,7250,"Stannis Baratheon, Imry Florent, Guyard Morrigen, Rolland Storm, Salladhor Saan, Davos Seaworth","Tyrion Lannister, Jacelyn Bywater, Sandor Clegane, Tywin Lannister, Garlan Tyrell, Mace Tyrell, Randyll Tarly",1,King's Landing,The Crownlands, 22 | Siege of Darry,299,21,Robb Stark,Joffrey/Tommen Baratheon,Darry,,,,Lannister,,,,win,siege,0,0,,,Helman Tallhart,,1,Darry,The Riverlands, 23 | Battle of Duskendale,299,22,Robb Stark,Joffrey/Tommen Baratheon,Stark,,,,Lannister,,,,loss,pitched battle,1,0,3000,,"Robertt Glover, Helman Tallhart","Randyll Tarly, Gregor Clegane",1,Duskendale,The Crownlands, 24 | Battle of the Burning Septry,299,23,,,Brotherhood without Banners,,,,Brave Companions,,,,win,pitched battle,0,0,,,,,1,,The Riverlands, 25 | Battle of the Ruby Ford,299,24,Joffrey/Tommen Baratheon,Robb Stark,Lannister,,,,Stark,,,,win,pitched battle,0,0,,6000,Gregor Clegane,"Roose Bolton, Wylis Manderly",,Ruby Ford,The Riverlands, 26 | Retaking of Harrenhal,299,25,Joffrey/Tommen Baratheon,,Lannister,,,,Brave Companions,,,,win,pitched battle,1,0,,,Gregor Clegane,Vargo Hoat,1,Harrenhal,The Riverlands, 27 | The Red Wedding,299,26,Joffrey/Tommen Baratheon,Robb Stark,Frey,Bolton,,,Stark,,,,win,ambush,1,1,3500,3500,"Walder Frey, Roose Bolton, Walder Rivers",Robb Stark,1,The Twins,The Riverlands,"This observation refers to the battle against the Stark men, not the attack on the wedding" 28 | Siege of Seagard,299,27,Robb Stark,Joffrey/Tommen Baratheon,Frey,,,,Mallister,,,,win,siege,0,1,,,Walder Frey,Jason Mallister,1,Seagard,The Riverlands, 29 | Battle of Castle Black,300,28,Stannis Baratheon,Mance Rayder,Free folk,Thenns,Giants,,Night's Watch,Baratheon,,,loss,siege,1,1,100000,1240,"Mance Rayder, Tormund Giantsbane, Harma Dogshead, Magnar Styr, Varamyr","Stannis Baratheon, Jon Snow, Donal Noye, Cotter Pyke",0,Castle Black,Beyond the Wall, 30 | Fall of Moat Cailin,300,29,Joffrey/Tommen Baratheon,Balon/Euron Greyjoy,Bolton,,,,Greyjoy,,,,win,siege,0,0,,,Ramsey Bolton,,0,Moat Cailin,The North, 31 | Sack of Saltpans,300,30,,,Brave Companions,,,,,,,,win,razing,0,0,,,Rorge,,0,Saltpans,The Riverlands, 32 | Retaking of Deepwood Motte,300,31,Stannis Baratheon,Balon/Euron 
Greyjoy,Baratheon,Karstark,Mormont,Glover,Greyjoy,,,,win,pitched battle,0,0,4500,200,"Stannis Baratheon, Alysane Mormot",Asha Greyjoy,0,Deepwood Motte,The North, 33 | Battle of the Shield Islands,300,32,Balon/Euron Greyjoy,Joffrey/Tommen Baratheon,Greyjoy,,,,Tyrell,,,,win,pitched battle,0,0,,,"Euron Greyjoy, Victarion Greyjoy",,0,Shield Islands,The Reach, 34 | "Invasion of Ryamsport, Vinetown, and Starfish Harbor",300,33,Balon/Euron Greyjoy,Joffrey/Tommen Baratheon,Greyjoy,,,,Tyrell,,,,win,razing,0,0,,,"Euron Greyjoy, Victarion Greyjoy",,0,"Ryamsport, Vinetown, Starfish Harbor",The Reach, 35 | Second Seige of Storm's End,300,34,Joffrey/Tommen Baratheon,Stannis Baratheon,Baratheon,,,,Baratheon,,,,win,siege,0,0,,200,"Mace Tyrell, Mathis Rowan",Gilbert Farring,0,Storm's End,The Stormlands, 36 | Siege of Dragonstone,300,35,Joffrey/Tommen Baratheon,Stannis Baratheon,Baratheon,,,,Baratheon,,,,win,siege,0,0,2000,,"Loras Tyrell, Raxter Redwyne",Rolland Storm,0,Dragonstone,The Stormlands, 37 | Siege of Riverrun,300,36,Joffrey/Tommen Baratheon,Robb Stark,Lannister,Frey,,,Tully,,,,win,siege,0,0,3000,,"Daven Lannister, Ryman Fey, Jaime Lannister",Brynden Tully,0,Riverrun,The Riverlands, 38 | Siege of Raventree,300,37,Joffrey/Tommen Baratheon,Robb Stark,Bracken,Lannister,,,Blackwood,,,,win,siege,0,1,1500,,"Jonos Bracken, Jaime Lannister",Tytos Blackwood,0,Raventree,The Riverlands, 39 | Siege of Winterfell,300,38,Stannis Baratheon,Joffrey/Tommen Baratheon,Baratheon,Karstark,Mormont,Glover,Bolton,Frey,,,,,,,5000,8000,Stannis Baratheon,Roose Bolton,0,Winterfell,The North, -------------------------------------------------------------------------------- /got-battles/build.sbt: -------------------------------------------------------------------------------- 1 | name := "spark-cassandra-example" 2 | 3 | version := "1.0" 4 | 5 | assemblyOption in assembly := (assemblyOption in assembly).value.copy(includeScala = false) 6 | 7 | // https://groups.google.com/a/lists.datastax.com/forum/#!topic/spark-connector-user/5muNwRaCJnU 8 | assemblyMergeStrategy in assembly <<= (assemblyMergeStrategy in assembly) { 9 | (old) => { 10 | case PathList("META-INF", "io.netty.versions.properties") => MergeStrategy.last 11 | case x => old(x) 12 | } 13 | } 14 | 15 | scalaVersion := "2.10.6" 16 | 17 | resolvers += "jitpack" at "https://jitpack.io" 18 | 19 | libraryDependencies ++= Seq( 20 | // use provided line when building assembly jar 21 | // "org.apache.spark" %% "spark-sql" % "1.6.1" % "provided", 22 | // comment above line and uncomment the following to run in sbt 23 | "org.apache.spark" %% "spark-sql" % "1.6.1", 24 | "com.datastax.spark" %% "spark-cassandra-connector" % "1.5.0", 25 | "com.github.scopt" %% "scopt" % "3.5.0" 26 | ) 27 | -------------------------------------------------------------------------------- /got-battles/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.12.0") 2 | -------------------------------------------------------------------------------- /got-battles/src/main/scala/com/supergloo/SparkCassandra.scala: -------------------------------------------------------------------------------- 1 | package com.supergloo 2 | 3 | import com.datastax.spark.connector._ 4 | import org.apache.spark.{SparkConf, SparkContext} 5 | import org.apache.spark.sql.SQLContext 6 | import org.apache.spark.sql.functions._ 7 | 8 | import scopt.OptionParser 9 | 10 | 11 | /** 12 | * Simple Spark Cassandra 13 | * One example with 
Scala case class marshalling 14 | * Another example using Spark SQL 15 | */ 16 | object SparkCassandra { 17 | 18 | case class CommandLineArgs ( 19 | cassandra: String = "", // required 20 | keyspace: String = "gameofthrones", // default is gameofthrones 21 | limit: Int = 10 22 | ) 23 | 24 | case class Battle( 25 | battle_number: Integer, 26 | year: Integer, 27 | attacker_king: String, 28 | defender_king: String 29 | ) 30 | 31 | def main(args: Array[String]) { 32 | 33 | val parser = new scopt.OptionParser[CommandLineArgs]("spark-cassandra-example") { 34 | head("spark-cassandra-example", "1.0") 35 | opt[String]('c', "cassandra").required().valueName(""). 36 | action((x, c) => c.copy(cassandra = x)). 37 | text("Setting cassandra is required") 38 | opt[String]('k', "keyspace").action( (x, c) => 39 | c.copy(keyspace = x) ).text("keyspace is a string with a default of `gameofthrones`") 40 | opt[Int]('l', "limit").action( (x, c) => 41 | c.copy(limit = x) ).text("limit is an integer with default of 10") 42 | } 43 | 44 | parser.parse(args, CommandLineArgs()) match { 45 | 46 | case Some(config) => 47 | // do stuff 48 | val conf = new SparkConf().setAppName("SparkCassandraExampleApp") 49 | conf.setIfMissing("spark.master", "local[5]") 50 | 51 | conf.set("spark.cassandra.connection.host", config.cassandra) 52 | 53 | val sc = new SparkContext(conf) 54 | 55 | // Spark Cassandra Example one which marshalls to Scala case classes 56 | val battles:Array[Battle] = sc.cassandraTable[Battle](config.keyspace, "battles"). 57 | select("battle_number","year","attacker_king","defender_king").toArray 58 | 59 | battles.foreach { b: Battle => 60 | println("Battle Number %s was defended by %s.".format(b.battle_number, b.defender_king)) 61 | } 62 | 63 | 64 | // Spark Cassandra Example Two - Create DataFrame from Spark SQL read 65 | val sqlContext = new SQLContext(sc) 66 | 67 | val df = sqlContext.read 68 | .format("org.apache.spark.sql.cassandra") 69 | .options(Map( "table" -> "battles", "keyspace" -> "gameofthrones" )) 70 | .load() 71 | 72 | df.show 73 | 74 | 75 | // Game of Thrones Battle analysis 76 | 77 | // Who were the most aggressive kings? (most attacker_kings) 78 | val countsByAttack = df.groupBy("attacker_king").count().limit(config.limit).sort(desc("count")) 79 | countsByAttack.show() 80 | 81 | // Which kings were attacked the most? (most defender_kings) 82 | val countsByDefend = df.groupBy("defender_king").count().limit(config.limit).sort(desc("count")) 83 | countsByDefend.show() 84 | 85 | sc.stop() 86 | 87 | case None => 88 | // arguments are bad, error message will have been displayed 89 | } 90 | } 91 | } -------------------------------------------------------------------------------- /kafka-streaming/README.md: -------------------------------------------------------------------------------- 1 | # Spark Streaming with Kafka 2 | 3 | This is a Spark Streaming job which streams weather data from Kafka 4 | and stores into two Cassandra tables. 5 | 6 | For complete instructions, see 7 | [Spark Streaming with Kafka Example](https://supergloo.com/spark-streaming/spark-streaming-kafka-example/) 8 | 9 | Credits: 10 | Concepts and some code reused from KillrWeather application found at https://github.com/killrweather/killrweather 11 | 12 | #### To run on local machine 13 | 14 | Download Kafka and then we're going to follow similar steps as found here 15 | https://kafka.apache.org/quickstart 16 | 17 | You'll need to update your path appropriately for the following commands 18 | depending on where Kafka; i.e. 
where is Kafka `bin` dir 19 | 20 | * Start Zookeeper ```bin/zookeeper-server-start.sh config/zookeeper.properties``` 21 | * Start Kafka ```bin/kafka-server-start.sh config/server.properties``` 22 | * Create Kafka topic ``` 23 | bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic raw_weather``` 24 | * Start Streaming job in SBT similar to described above. Choose the `WeatherDataStream` option 25 | * Send Weather Data to Kafka ```kafka-console-producer.sh --broker-list localhost:9092 --topic raw_weather 26 | --new-producer < ny-2008.csv``` 27 | 28 | #### Monitor with SparkLint 29 | 30 | The SparkLint monitoring tool is included and described in TBD post 31 | 32 | To activate, pass in the extraListener arg when submitting; i.e. 33 | 34 | --conf spark.extraListeners=com.groupon.sparklint.SparklintListener 35 | -------------------------------------------------------------------------------- /kafka-streaming/build.sbt: -------------------------------------------------------------------------------- 1 | name := "kafka-streaming" 2 | 3 | version := "1.0" 4 | 5 | assemblyOption in assembly := (assemblyOption in assembly).value.copy(includeScala = false) 6 | 7 | assemblyMergeStrategy in assembly := { 8 | case PathList("org", "apache", "spark", "unused", "UnusedStubClass.class") => MergeStrategy.first 9 | case PathList(pl @ _*) if pl.contains("log4j.properties") => MergeStrategy.concat 10 | case PathList("META-INF", "io.netty.versions.properties") => MergeStrategy.last 11 | case x => 12 | val oldStrategy = (assemblyMergeStrategy in assembly).value 13 | oldStrategy(x) 14 | } 15 | 16 | scalaVersion := "2.10.6" 17 | 18 | resolvers += "jitpack" at "https://jitpack.io" 19 | 20 | // still want to be able to run in sbt 21 | // https://github.com/sbt/sbt-assembly#-provided-configuration 22 | run in Compile <<= Defaults.runTask(fullClasspath in Compile, mainClass in (Compile, run), runner in (Compile, run)) 23 | 24 | fork in run := true 25 | javaOptions in run ++= Seq( 26 | "-Dlog4j.debug=true", 27 | "-Dlog4j.configuration=log4j.properties") 28 | 29 | libraryDependencies ++= Seq( 30 | "com.groupon.sparklint" %% "sparklint-spark162" % "1.0.4" excludeAll ( 31 | ExclusionRule(organization = "org.apache.spark") 32 | ), 33 | "org.apache.spark" %% "spark-core" % "1.6.2" % "provided", 34 | "org.apache.spark" %% "spark-sql" % "1.6.2" % "provided", 35 | "org.apache.spark" %% "spark-streaming" % "1.6.2" % "provided", 36 | "org.apache.spark" %% "spark-streaming-kafka" % "1.6.2", 37 | "com.datastax.spark" %% "spark-cassandra-connector" % "1.6.0" 38 | ) 39 | -------------------------------------------------------------------------------- /kafka-streaming/cql/create-timeseries.cql: -------------------------------------------------------------------------------- 1 | /* 2 | Schema for storing raw ISD-lite hourly weather data. 3 | More can be read about these weather sets here: http://www.ncdc.noaa.gov/oa/climate/isd/ 4 | */ 5 | 6 | DROP KEYSPACE IF EXISTS isd_weather_data; 7 | CREATE KEYSPACE isd_weather_data WITH REPLICATION = { 'class' : 'SimpleStrategy', 'replication_factor' : 1 }; 8 | 9 | use isd_weather_data; 10 | 11 | /* 12 | Raw weather readings from a single station, hourly. 
13 | sky_condition_text text, // Non-coded sky conditions 14 | */ 15 | 16 | CREATE TABLE raw_weather_data ( 17 | wsid text, // Composite of Air Force Datsav3 station number and NCDC WBAN number 18 | year int, // Year collected 19 | month int, // Month collected 20 | day int, // Day collected 21 | hour int, // Hour collected 22 | temperature double, // Air temperature (degrees Celsius) 23 | dewpoint double, // Dew point temperature (degrees Celsius) 24 | pressure double, // Sea level pressure (hectopascals) 25 | wind_direction int, // Wind direction in degrees. 0-359 26 | wind_speed double, // Wind speed (meters per second) 27 | sky_condition int, // Total cloud cover (coded, see format documentation) 28 | sky_condition_text text, // Non-coded sky conditions 29 | one_hour_precip double, // One-hour accumulated liquid precipitation (millimeters) 30 | six_hour_precip double, // Six-hour accumulated liquid precipitation (millimeters) 31 | PRIMARY KEY ((wsid), year, month, day, hour) 32 | ) WITH CLUSTERING ORDER BY (year DESC, month DESC, day DESC, hour DESC); 33 | 34 | /* 35 | Sum of all one_hour_precip for one day and one weather station 36 | */ 37 | CREATE TABLE daily_aggregate_precip ( 38 | wsid text, 39 | year int, 40 | month int, 41 | day int, 42 | precipitation counter, 43 | PRIMARY KEY ((wsid), year, month, day) 44 | ) WITH CLUSTERING ORDER BY (year DESC, month DESC, day DESC); 45 | -------------------------------------------------------------------------------- /kafka-streaming/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.12.0") 2 | -------------------------------------------------------------------------------- /kafka-streaming/project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version = 0.13.11 -------------------------------------------------------------------------------- /kafka-streaming/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | log4j.logger.org.apache.spark.SparkEnv=DEBUG 18 | 19 | # Set everything to be logged to the console 20 | log4j.rootCategory=ERROR, console 21 | log4j.appender.console=org.apache.log4j.ConsoleAppender 22 | log4j.appender.console.target=System.err 23 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 24 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 25 | 26 | # Settings to quiet third party logs that are too verbose 27 | log4j.logger.org.spark-project.jetty=WARN 28 | log4j.logger.org.spark-project.jetty.util.component.AbstractLifeCycle=ERROR 29 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 30 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 31 | log4j.logger.org.apache.parquet=ERROR 32 | log4j.logger.parquet=ERROR 33 | 34 | # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support 35 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL 36 | log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR 37 | 38 | #TM 39 | #log4j.logger.org.apache.spark.sql.cassandra=DEBUG -------------------------------------------------------------------------------- /kafka-streaming/src/main/scala/com/killrweather/data/Weather.scala: -------------------------------------------------------------------------------- 1 | package com.killrweather.data 2 | 3 | object Weather { 4 | 5 | /** Base marker trait. */ 6 | @SerialVersionUID(1L) 7 | sealed trait WeatherModel extends Serializable 8 | 9 | /** 10 | * @param id Composite of Air Force Datsav3 station number and NCDC WBAN number 11 | * @param name Name of reporting station 12 | * @param countryCode 2 letter ISO Country ID // TODO restrict 13 | * @param callSign International station call sign 14 | * @param lat Latitude in decimal degrees 15 | * @param long Longitude in decimal degrees 16 | * @param elevation Elevation in meters 17 | */ 18 | case class WeatherStation( 19 | id: String, 20 | name: String, 21 | countryCode: String, 22 | callSign: String, 23 | lat: Double, 24 | long: Double, 25 | elevation: Double) extends WeatherModel 26 | 27 | /** 28 | * @param wsid Composite of Air Force Datsav3 station number and NCDC WBAN number 29 | * @param year Year collected 30 | * @param month Month collected 31 | * @param day Day collected 32 | * @param hour Hour collected 33 | * @param temperature Air temperature (degrees Celsius) 34 | * @param dewpoint Dew point temperature (degrees Celsius) 35 | * @param pressure Sea level pressure (hectopascals) 36 | * @param windDirection Wind direction in degrees. 
0-359 37 | * @param windSpeed Wind speed (meters per second) 38 | * @param skyCondition Total cloud cover (coded, see format documentation) 39 | * @param skyConditionText Non-coded sky conditions 40 | * @param oneHourPrecip One-hour accumulated liquid precipitation (millimeters) 41 | * @param sixHourPrecip Six-hour accumulated liquid precipitation (millimeters) 42 | */ 43 | case class RawWeatherData( 44 | wsid: String, 45 | year: Int, 46 | month: Int, 47 | day: Int, 48 | hour: Int, 49 | temperature: Double, 50 | dewpoint: Double, 51 | pressure: Double, 52 | windDirection: Int, 53 | windSpeed: Double, 54 | skyCondition: Int, 55 | skyConditionText: String, 56 | oneHourPrecip: Double, 57 | sixHourPrecip: Double) extends WeatherModel 58 | 59 | object RawWeatherData { 60 | def apply(array: Array[String]): RawWeatherData = { 61 | RawWeatherData( 62 | wsid = array(0), 63 | year = array(1).toInt, 64 | month = array(2).toInt, 65 | day = array(3).toInt, 66 | hour = array(4).toInt, 67 | temperature = array(5).toDouble, 68 | dewpoint = array(6).toDouble, 69 | pressure = array(7).toDouble, 70 | windDirection = array(8).toInt, 71 | windSpeed = array(9).toDouble, 72 | skyCondition = array(10).toInt, 73 | skyConditionText = array(11), 74 | oneHourPrecip = array(11).toDouble, 75 | sixHourPrecip = Option(array(12).toDouble).getOrElse(0)) 76 | } 77 | } 78 | 79 | trait WeatherAggregate extends WeatherModel with Serializable { 80 | def wsid: String 81 | } 82 | 83 | /* Precipitation */ 84 | trait Precipitation extends WeatherAggregate 85 | 86 | case class DailyPrecipitation(wsid: String, 87 | year: Int, 88 | month: Int, 89 | day: Int, 90 | precipitation: Double) extends Precipitation 91 | 92 | } 93 | -------------------------------------------------------------------------------- /kafka-streaming/src/main/scala/com/supergloo/WeatherDataStream.scala: -------------------------------------------------------------------------------- 1 | package com.supergloo 2 | 3 | import com.killrweather.data.Weather.RawWeatherData 4 | import kafka.serializer.StringDecoder 5 | import org.apache.log4j.Logger 6 | import org.apache.spark.SparkConf 7 | import org.apache.spark.streaming.{Seconds, StreamingContext} 8 | import org.apache.spark.streaming.dstream.{DStream, InputDStream} 9 | import org.apache.spark.streaming.kafka.KafkaUtils 10 | 11 | /** 12 | * Stream from Kafka 13 | */ 14 | object WeatherDataStream { 15 | 16 | val localLogger = Logger.getLogger("WeatherDataStream") 17 | 18 | def main(args: Array[String]) { 19 | 20 | // update 21 | // val checkpointDir = "./tmp" 22 | 23 | val sparkConf = new SparkConf().setAppName("Raw Weather") 24 | sparkConf.setIfMissing("spark.master", "local[5]") 25 | // sparkConf.setIfMissing("spark.checkpoint.dir", checkpointDir) 26 | sparkConf.setIfMissing("spark.cassandra.connection.host", "127.0.0.1") 27 | 28 | val ssc = new StreamingContext(sparkConf, Seconds(2)) 29 | 30 | val kafkaTopicRaw = "raw_weather" 31 | val kafkaBroker = "127.0.01:9092" 32 | 33 | val cassandraKeyspace = "isd_weather_data" 34 | val cassandraTableRaw = "raw_weather_data" 35 | val cassandraTableDailyPrecip = "daily_aggregate_precip" 36 | 37 | println(s"using cassandraTableDailyPrecip $cassandraTableDailyPrecip") 38 | 39 | val topics: Set[String] = kafkaTopicRaw.split(",").map(_.trim).toSet 40 | val kafkaParams = Map[String, String]("metadata.broker.list" -> kafkaBroker) 41 | 42 | localLogger.info(s"connecting to brokers: $kafkaBroker") 43 | localLogger.info(s"kafkaParams: $kafkaParams") 44 | localLogger.info(s"topics: 
$topics") 45 | 46 | val rawWeatherStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics) 47 | val parsedWeatherStream: DStream[RawWeatherData] = ingestStream(rawWeatherStream) 48 | 49 | persist(cassandraKeyspace, cassandraTableRaw, cassandraTableDailyPrecip, parsedWeatherStream) 50 | 51 | parsedWeatherStream.print // for demo purposes only 52 | 53 | //Kick off 54 | ssc.start() 55 | 56 | ssc.awaitTermination() 57 | 58 | ssc.stop() 59 | } 60 | 61 | def persist(CassandraKeyspace: String, CassandraTableRaw: String, 62 | CassandraTableDailyPrecip: String, 63 | parsedWeatherStream: DStream[RawWeatherData]): Unit = { 64 | 65 | import com.datastax.spark.connector.streaming._ 66 | 67 | /** Saves the raw data to Cassandra - raw table. */ 68 | parsedWeatherStream.saveToCassandra(CassandraKeyspace, CassandraTableRaw) 69 | 70 | /** For a given weather station, year, month, day, aggregates hourly precipitation values by day. 71 | * Weather station first gets you the partition key - data locality - which spark gets via the 72 | * connector, so the data transfer between spark and cassandra is very fast per node. 73 | * 74 | * Persists daily aggregate data to Cassandra daily precip table by weather station, 75 | * automatically sorted by most recent (due to how we set up the Cassandra schema: 76 | * 77 | * @see https://github.com/killrweather/killrweather/blob/master/data/create-timeseries.cql. 78 | * 79 | * Because the 'oneHourPrecip' column is a Cassandra Counter we do not have to do a spark 80 | * reduceByKey, which is expensive. We simply let Cassandra do it - not expensive and fast. 81 | */ 82 | parsedWeatherStream.map { weather => 83 | (weather.wsid, weather.year, weather.month, weather.day, weather.oneHourPrecip) 84 | }.saveToCassandra(CassandraKeyspace, CassandraTableDailyPrecip) 85 | } 86 | 87 | def ingestStream(rawWeatherStream: InputDStream[(String, String)]): DStream[RawWeatherData] = { 88 | val parsedWeatherStream = rawWeatherStream.map(_._2.split(",")) 89 | .map(RawWeatherData(_)) 90 | parsedWeatherStream 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /kafka-streaming/src/main/scala/com/supergloo/package.scala: -------------------------------------------------------------------------------- 1 | package com 2 | 3 | package object supergloo { 4 | 5 | val BASE_PATH = "/Users/toddmcgrath/Development/pioneer/" 6 | val SANDBOX_PATH = BASE_PATH + "spark-sql/sandbox/" 7 | } 8 | --------------------------------------------------------------------------------