├── .gitignore
├── LICENSE
├── README.md
├── accessloganalyzer
│   ├── .gitignore
│   ├── README.md
│   ├── build.sbt
│   ├── project
│   │   ├── build.properties
│   │   └── plugins.sbt
│   └── src
│       └── main
│           ├── resources
│           │   ├── access.log
│           │   └── application.conf
│           └── scala
│               └── com
│                   └── supergloo
│                       ├── Boot.scala
│                       ├── models
│                       │   └── HttpStatus.scala
│                       └── utils
│                           ├── AccessLogParser.scala
│                           └── Utils.scala
├── got-battles
│   ├── README.md
│   ├── battles.csv
│   ├── build.sbt
│   ├── project
│   │   └── assembly.sbt
│   └── src
│       └── main
│           └── scala
│               └── com
│                   └── supergloo
│                       └── SparkCassandra.scala
└── kafka-streaming
    ├── README.md
    ├── build.sbt
    ├── cql
    │   └── create-timeseries.cql
    ├── data
    │   └── load
    │       └── ny-2008.csv
    ├── project
    │   ├── assembly.sbt
    │   └── build.properties
    └── src
        └── main
            ├── resources
            │   └── log4j.properties
            └── scala
                └── com
                    ├── killrweather
                    │   └── data
                    │       └── Weather.scala
                    └── supergloo
                        ├── WeatherDataStream.scala
                        └── package.scala
/.gitignore:
--------------------------------------------------------------------------------
1 | **/target/
2 | **/project/target/
3 | **/project/project/
4 | **/.DS_Store
5 | **/.idea/**
6 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | CC0 1.0 Universal
2 |
3 | Statement of Purpose
4 |
5 | The laws of most jurisdictions throughout the world automatically confer
6 | exclusive Copyright and Related Rights (defined below) upon the creator and
7 | subsequent owner(s) (each and all, an "owner") of an original work of
8 | authorship and/or a database (each, a "Work").
9 |
10 | Certain owners wish to permanently relinquish those rights to a Work for the
11 | purpose of contributing to a commons of creative, cultural and scientific
12 | works ("Commons") that the public can reliably and without fear of later
13 | claims of infringement build upon, modify, incorporate in other works, reuse
14 | and redistribute as freely as possible in any form whatsoever and for any
15 | purposes, including without limitation commercial purposes. These owners may
16 | contribute to the Commons to promote the ideal of a free culture and the
17 | further production of creative, cultural and scientific works, or to gain
18 | reputation or greater distribution for their Work in part through the use and
19 | efforts of others.
20 |
21 | For these and/or other purposes and motivations, and without any expectation
22 | of additional consideration or compensation, the person associating CC0 with a
23 | Work (the "Affirmer"), to the extent that he or she is an owner of Copyright
24 | and Related Rights in the Work, voluntarily elects to apply CC0 to the Work
25 | and publicly distribute the Work under its terms, with knowledge of his or her
26 | Copyright and Related Rights in the Work and the meaning and intended legal
27 | effect of CC0 on those rights.
28 |
29 | 1. Copyright and Related Rights. A Work made available under CC0 may be
30 | protected by copyright and related or neighboring rights ("Copyright and
31 | Related Rights"). Copyright and Related Rights include, but are not limited
32 | to, the following:
33 |
34 | i. the right to reproduce, adapt, distribute, perform, display, communicate,
35 | and translate a Work;
36 |
37 | ii. moral rights retained by the original author(s) and/or performer(s);
38 |
39 | iii. publicity and privacy rights pertaining to a person's image or likeness
40 | depicted in a Work;
41 |
42 | iv. rights protecting against unfair competition in regards to a Work,
43 | subject to the limitations in paragraph 4(a), below;
44 |
45 | v. rights protecting the extraction, dissemination, use and reuse of data in
46 | a Work;
47 |
48 | vi. database rights (such as those arising under Directive 96/9/EC of the
49 | European Parliament and of the Council of 11 March 1996 on the legal
50 | protection of databases, and under any national implementation thereof,
51 | including any amended or successor version of such directive); and
52 |
53 | vii. other similar, equivalent or corresponding rights throughout the world
54 | based on applicable law or treaty, and any national implementations thereof.
55 |
56 | 2. Waiver. To the greatest extent permitted by, but not in contravention of,
57 | applicable law, Affirmer hereby overtly, fully, permanently, irrevocably and
58 | unconditionally waives, abandons, and surrenders all of Affirmer's Copyright
59 | and Related Rights and associated claims and causes of action, whether now
60 | known or unknown (including existing as well as future claims and causes of
61 | action), in the Work (i) in all territories worldwide, (ii) for the maximum
62 | duration provided by applicable law or treaty (including future time
63 | extensions), (iii) in any current or future medium and for any number of
64 | copies, and (iv) for any purpose whatsoever, including without limitation
65 | commercial, advertising or promotional purposes (the "Waiver"). Affirmer makes
66 | the Waiver for the benefit of each member of the public at large and to the
67 | detriment of Affirmer's heirs and successors, fully intending that such Waiver
68 | shall not be subject to revocation, rescission, cancellation, termination, or
69 | any other legal or equitable action to disrupt the quiet enjoyment of the Work
70 | by the public as contemplated by Affirmer's express Statement of Purpose.
71 |
72 | 3. Public License Fallback. Should any part of the Waiver for any reason be
73 | judged legally invalid or ineffective under applicable law, then the Waiver
74 | shall be preserved to the maximum extent permitted taking into account
75 | Affirmer's express Statement of Purpose. In addition, to the extent the Waiver
76 | is so judged Affirmer hereby grants to each affected person a royalty-free,
77 | non transferable, non sublicensable, non exclusive, irrevocable and
78 | unconditional license to exercise Affirmer's Copyright and Related Rights in
79 | the Work (i) in all territories worldwide, (ii) for the maximum duration
80 | provided by applicable law or treaty (including future time extensions), (iii)
81 | in any current or future medium and for any number of copies, and (iv) for any
82 | purpose whatsoever, including without limitation commercial, advertising or
83 | promotional purposes (the "License"). The License shall be deemed effective as
84 | of the date CC0 was applied by Affirmer to the Work. Should any part of the
85 | License for any reason be judged legally invalid or ineffective under
86 | applicable law, such partial invalidity or ineffectiveness shall not
87 | invalidate the remainder of the License, and in such case Affirmer hereby
88 | affirms that he or she will not (i) exercise any of his or her remaining
89 | Copyright and Related Rights in the Work or (ii) assert any associated claims
90 | and causes of action with respect to the Work, in either case contrary to
91 | Affirmer's express Statement of Purpose.
92 |
93 | 4. Limitations and Disclaimers.
94 |
95 | a. No trademark or patent rights held by Affirmer are waived, abandoned,
96 | surrendered, licensed or otherwise affected by this document.
97 |
98 | b. Affirmer offers the Work as-is and makes no representations or warranties
99 | of any kind concerning the Work, express, implied, statutory or otherwise,
100 | including without limitation warranties of title, merchantability, fitness
101 | for a particular purpose, non infringement, or the absence of latent or
102 | other defects, accuracy, or the present or absence of errors, whether or not
103 | discoverable, all to the greatest extent permissible under applicable law.
104 |
105 | c. Affirmer disclaims responsibility for clearing rights of other persons
106 | that may apply to the Work or any use thereof, including without limitation
107 | any person's Copyright and Related Rights in the Work. Further, Affirmer
108 | disclaims responsibility for obtaining any necessary consents, permissions
109 | or other rights required for any use of the Work.
110 |
111 | d. Affirmer understands and acknowledges that Creative Commons is not a
112 | party to this document and has no duty or obligation with respect to this
113 | CC0 or use of the Work.
114 |
115 | For more information, please see
116 | <http://creativecommons.org/publicdomain/zero/1.0/>
117 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Spark Scala
2 |
3 | Spark with Scala example projects
4 |
5 | [Scala Spark Tutorials and Examples](https://supergloo.com/spark-tutorial/spark-tutorials-scala/)
6 |
--------------------------------------------------------------------------------
/accessloganalyzer/.gitignore:
--------------------------------------------------------------------------------
1 | target/**
2 | project/target/**
3 | project/project/**
4 | .idea/**
5 | .idea_modules/**
6 |
7 |
--------------------------------------------------------------------------------
/accessloganalyzer/README.md:
--------------------------------------------------------------------------------
1 | Spark Broadcast and Accumulator Examples
2 | ----------------------------------------
3 |
4 | Example of using broadcast and accumulators in Scala for analyzing web server log files to determine HTTP status code counts.
5 |
6 | [Spark Broadcast and Accumulator Examples](https://supergloo.com/spark-scala/spark-broadcast-accumulator-examples-scala/ "Spark Broadcast and Accumulator Examples").
7 |
--------------------------------------------------------------------------------
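
A minimal, self-contained sketch of the broadcast/accumulator pattern this project demonstrates, before the full code below. It assumes the Spark 1.x APIs pinned in the build.sbt that follows; the object name and the sample status codes are illustrative only.

```scala
import org.apache.spark.{SparkConf, SparkContext}

object BroadcastAccumulatorSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("sketch"))

    // Broadcast: a read-only lookup value shipped once to each executor
    val interesting = sc.broadcast(Set("404", "500"))

    // Accumulator: executors add to it; only the driver reads the final value
    val hits = sc.accumulator(0, "interesting status codes")

    sc.parallelize(Seq("200", "404", "500", "200")).foreach { code =>
      if (interesting.value.contains(code)) hits += 1
    }

    println(s"interesting hits: ${hits.value}") // 2
    sc.stop()
  }
}
```
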
/accessloganalyzer/build.sbt:
--------------------------------------------------------------------------------
1 | name := "sparkAnalyzer"
2 |
3 | version := "1.0"
4 |
5 | scalaVersion := "2.11.8"
6 |
7 | // https://mvnrepository.com/artifact/org.apache.spark/spark-core_2.11
8 | libraryDependencies += "org.apache.spark" % "spark-core_2.11" % "1.6.1"
9 |
--------------------------------------------------------------------------------
/accessloganalyzer/project/build.properties:
--------------------------------------------------------------------------------
1 | sbt.version = 0.13.8
--------------------------------------------------------------------------------
/accessloganalyzer/project/plugins.sbt:
--------------------------------------------------------------------------------
1 | logLevel := Level.Warn
2 |
3 | addSbtPlugin("com.github.mpeltonen" % "sbt-idea" % "1.6.0")
4 |
--------------------------------------------------------------------------------
/accessloganalyzer/src/main/resources/application.conf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tmcgrath/spark-scala/42efdf17a2297b7e110b05debab62536512c4679/accessloganalyzer/src/main/resources/application.conf
--------------------------------------------------------------------------------
/accessloganalyzer/src/main/scala/com/supergloo/Boot.scala:
--------------------------------------------------------------------------------
1 | package com.supergloo
2 |
3 | import com.supergloo.utils.AccessLogParser
4 | import com.supergloo.models._
5 | import org.apache.spark.{SparkConf, SparkContext}
6 |
7 | /**
8 | * https://www.supergloo.com
9 | */
10 | object Boot {
11 |
12 | import utils.Utils._
13 |
14 | def main(args: Array[String]): Unit = {
15 |
16 | val sparkConf = new SparkConf(true)
17 | .setMaster("local[2]")
18 | .setAppName("SparkAnalyzer")
19 |
20 | val sparkContext = new SparkContext(sparkConf)
21 |
22 |     /**
23 |       * Define the list of all HTTP status codes, divided into status groups.
24 |       * The list is read-only; it is used while parsing the access log file to count status codes per group.
25 |       * This broadcast-variable example shows how a broadcast value can be used in computations on worker nodes.
26 |       */
27 | val httpStatusList = sparkContext broadcast populateHttpStatusList
28 |
29 |     /**
30 |       * Define accumulators for counting each HTTP status code group.
31 |       * Accumulators are used because every update made on an executor is relayed back to the driver.
32 |       * With plain local variables, the updates would stay on each executor and never reach the driver,
33 |       * so the driver's value would remain unchanged.
34 |       */
35 | val httpInfo = sparkContext accumulator(0, "HTTP 1xx")
36 | val httpSuccess = sparkContext accumulator(0, "HTTP 2xx")
37 | val httpRedirect = sparkContext accumulator(0, "HTTP 3xx")
38 | val httpClientError = sparkContext accumulator(0, "HTTP 4xx")
39 | val httpServerError = sparkContext accumulator(0, "HTTP 5xx")
40 |
41 |     /**
42 |       * Iterate over the access.log file and parse every line.
43 |       * For each line, extract the HTTP status code and update the appropriate accumulator.
44 |       */
45 | sparkContext.textFile(getClass.getResource("/access.log").getPath, 2).foreach { line =>
46 | httpStatusList.value foreach {
47 | case httpInfoStatus: HttpInfoStatus if (AccessLogParser.parseHttpStatusCode(line).equals(Some(httpInfoStatus))) => httpInfo += 1
48 | case httpSuccessStatus: HttpSuccessStatus if (AccessLogParser.parseHttpStatusCode(line).equals(Some(httpSuccessStatus))) => httpSuccess += 1
49 | case httpRedirectStatus: HttpRedirectStatus if (AccessLogParser.parseHttpStatusCode(line).equals(Some(httpRedirectStatus))) => httpRedirect += 1
50 | case httpClientErrorStatus: HttpClientErrorStatus if (AccessLogParser.parseHttpStatusCode(line).equals(Some(httpClientErrorStatus))) => httpClientError += 1
51 | case httpServerErrorStatus: HttpServerErrorStatus if (AccessLogParser.parseHttpStatusCode(line).equals(Some(httpServerErrorStatus))) => httpServerError += 1
52 | case _ =>
53 | }
54 | }
55 |
56 | println("########## START ##########")
57 | println("Printing HttpStatusCodes result from parsing access log")
58 | println(s"HttpStatusInfo : ${httpInfo.value}")
59 | println(s"HttpStatusSuccess : ${httpSuccess.value}")
60 | println(s"HttpStatusRedirect : ${httpRedirect.value}")
61 | println(s"HttpStatusClientError : ${httpClientError.value}")
62 | println(s"HttpStatusServerError : ${httpServerError.value}")
63 | println("########## END ##########")
64 |
65 | sparkContext.stop()
66 | }
67 |
68 | }
69 |
--------------------------------------------------------------------------------
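
Note that `sparkContext.accumulator(0, "name")` is the Spark 1.x API, matching the spark-core 1.6.1 dependency above. If this example were ported to Spark 2.x, the same counters would use `LongAccumulator`; a rough sketch, with illustrative data in place of the access log:

```scala
import org.apache.spark.{SparkConf, SparkContext}

object BootSpark2Sketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("SparkAnalyzer2x"))

    // Spark 2.x replaces sc.accumulator(0, "name") with AccumulatorV2;
    // longAccumulator is the drop-in replacement for the integer counters in Boot.scala.
    val httpSuccess = sc.longAccumulator("HTTP 2xx")

    sc.parallelize(Seq("200", "301", "200")).foreach { code =>
      if (code.startsWith("2")) httpSuccess.add(1) // executors call add(...)
    }

    println(s"HttpStatusSuccess : ${httpSuccess.value}") // driver reads .value => 2
    sc.stop()
  }
}
```
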
/accessloganalyzer/src/main/scala/com/supergloo/models/HttpStatus.scala:
--------------------------------------------------------------------------------
1 | package com.supergloo.models
2 |
3 | /**
4 | * Created by sromic on 19/06/16.
5 | */
6 | sealed abstract class HttpStatus(val status: String)
7 |
8 | case class HttpInfoStatus(override val status: String) extends HttpStatus(status)
9 | case class HttpSuccessStatus(override val status: String) extends HttpStatus(status)
10 | case class HttpRedirectStatus(override val status: String) extends HttpStatus(status)
11 | case class HttpClientErrorStatus(override val status: String) extends HttpStatus(status)
12 | case class HttpServerErrorStatus(override val status: String) extends HttpStatus(status)
--------------------------------------------------------------------------------
/accessloganalyzer/src/main/scala/com/supergloo/utils/AccessLogParser.scala:
--------------------------------------------------------------------------------
1 | package com.supergloo.utils
2 |
3 | import java.util.regex.Pattern
4 |
5 | import com.supergloo.models.HttpStatus
6 |
7 | /**
8 | * https://www.supergloo.com
9 | */
10 | object AccessLogParser extends Serializable {
11 | import Utils._
12 |
13 | private val ddd = "\\d{1,3}"
14 | private val ip = s"($ddd\\.$ddd\\.$ddd\\.$ddd)?"
15 | private val client = "(\\S+)"
16 | private val user = "(\\S+)"
17 | private val dateTime = "(\\[.+?\\])"
18 | private val request = "\"(.*?)\""
19 | private val status = "(\\d{3})"
20 | private val bytes = "(\\S+)"
21 | private val referer = "\"(.*?)\""
22 | private val agent = "\"(.*?)\""
23 | private val accessLogRegex = s"$ip $client $user $dateTime $request $status $bytes $referer $agent"
24 | private val p = Pattern.compile(accessLogRegex)
25 |
26 | /**
27 |     * Extract the HTTP status code from a log line and wrap it in an HttpStatus instance
28 | */
29 | def parseHttpStatusCode(logLine: String): Option[HttpStatus] = {
30 | val matcher = p.matcher(logLine)
31 | if(matcher.find) {
32 | Some(createHttpStatus(matcher.group(6)))
33 | }
34 | else {
35 | None
36 | }
37 | }
38 |
39 | }
40 |
--------------------------------------------------------------------------------
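
A quick usage sketch of the parser above; the combined-format log line is made up for illustration, and only the status field (capture group 6) is extracted:

```scala
import com.supergloo.utils.AccessLogParser

val line =
  """127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /index.html HTTP/1.0" 200 2326 "-" "Mozilla/5.0""""

AccessLogParser.parseHttpStatusCode(line)      // Some(HttpSuccessStatus("200"))
AccessLogParser.parseHttpStatusCode("garbage") // None
```
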
/accessloganalyzer/src/main/scala/com/supergloo/utils/Utils.scala:
--------------------------------------------------------------------------------
1 | package com.supergloo.utils
2 |
3 | import com.supergloo.models._
4 |
5 | /**
6 | * https://www.supergloo.com
7 | */
8 | object Utils {
9 |
10 | private val httpStatuses = List(
11 | "100", "101", "103",
12 | "200", "201", "202", "203", "204", "205", "206",
13 | "300", "301", "302", "303", "304", "305", "306", "307", "308",
14 | "400", "401", "402", "403", "404", "405", "406", "407", "408", "409", "410", "411", "412", "413", "414", "415", "416", "417",
15 | "500", "501", "502", "503", "504", "505", "511"
16 | )
17 |
18 | def populateHttpStatusList(): List[HttpStatus] = {
19 | httpStatuses map createHttpStatus
20 | }
21 |
22 | def createHttpStatus(status: String): HttpStatus = status match {
23 | case status if (status.startsWith("1")) => HttpInfoStatus(status)
24 | case status if (status.startsWith("2")) => HttpSuccessStatus(status)
25 | case status if (status.startsWith("3")) => HttpRedirectStatus(status)
26 | case status if (status.startsWith("4")) => HttpClientErrorStatus(status)
27 | case status if (status.startsWith("5")) => HttpServerErrorStatus(status)
28 | }
29 |
30 | }
31 |
--------------------------------------------------------------------------------
/got-battles/README.md:
--------------------------------------------------------------------------------
1 | # Apache Spark, Cassandra and Game of Thrones Example
2 |
3 | Original post: [Apache Spark Cassandra Example](https://supergloo.com/spark-scala/apache-spark-cassandra/)
4 |
--------------------------------------------------------------------------------
/got-battles/battles.csv:
--------------------------------------------------------------------------------
1 | name,year,battle_number,attacker_king,defender_king,attacker_1,attacker_2,attacker_3,attacker_4,defender_1,defender_2,defender_3,defender_4,attacker_outcome,battle_type,major_death,major_capture,attacker_size,defender_size,attacker_commander,defender_commander,summer,location,region,note
2 | Battle of the Golden Tooth,298,1,Joffrey/Tommen Baratheon,Robb Stark,Lannister,,,,Tully,,,,win,pitched battle,1,0,15000,4000,Jaime Lannister,"Clement Piper, Vance",1,Golden Tooth,The Westerlands,
3 | Battle at the Mummer's Ford,298,2,Joffrey/Tommen Baratheon,Robb Stark,Lannister,,,,Baratheon,,,,win,ambush,1,0,,120,Gregor Clegane,Beric Dondarrion,1,Mummer's Ford,The Riverlands,
4 | Battle of Riverrun,298,3,Joffrey/Tommen Baratheon,Robb Stark,Lannister,,,,Tully,,,,win,pitched battle,0,1,15000,10000,"Jaime Lannister, Andros Brax","Edmure Tully, Tytos Blackwood",1,Riverrun,The Riverlands,
5 | Battle of the Green Fork,298,4,Robb Stark,Joffrey/Tommen Baratheon,Stark,,,,Lannister,,,,loss,pitched battle,1,1,18000,20000,"Roose Bolton, Wylis Manderly, Medger Cerwyn, Harrion Karstark, Halys Hornwood","Tywin Lannister, Gregor Clegane, Kevan Lannister, Addam Marbrand",1,Green Fork,The Riverlands,
6 | Battle of the Whispering Wood,298,5,Robb Stark,Joffrey/Tommen Baratheon,Stark,Tully,,,Lannister,,,,win,ambush,1,1,1875,6000,"Robb Stark, Brynden Tully",Jaime Lannister,1,Whispering Wood,The Riverlands,
7 | Battle of the Camps,298,6,Robb Stark,Joffrey/Tommen Baratheon,Stark,Tully,,,Lannister,,,,win,ambush,0,0,6000,12625,"Robb Stark, Tytos Blackwood, Brynden Tully","Lord Andros Brax, Forley Prester",1,Riverrun,The Riverlands,
8 | Sack of Darry,298,7,Joffrey/Tommen Baratheon,Robb Stark,Lannister,,,,Darry,,,,win,pitched battle,0,0,,,Gregor Clegane,Lyman Darry,1,Darry,The Riverlands,
9 | Battle of Moat Cailin,299,8,Balon/Euron Greyjoy,Robb Stark,Greyjoy,,,,Stark,,,,win,pitched battle,0,0,,,Victarion Greyjoy,,1,Moat Cailin,The North,
10 | Battle of Deepwood Motte,299,9,Balon/Euron Greyjoy,Robb Stark,Greyjoy,,,,Stark,,,,win,siege,0,0,1000,,Asha Greyjoy,,1,Deepwood Motte,The North,
11 | Battle of the Stony Shore,299,10,Balon/Euron Greyjoy,Robb Stark,Greyjoy,,,,Stark,,,,win,ambush,0,0,264,,Theon Greyjoy,,1,Stony Shore,The North,"Greyjoy's troop number based on the Battle of Deepwood Motte, in which Asha had 1000 soldier on 30 longships. That comes out to ~33 per longship. In the Battle of the Stony Shore, Theon has 8 longships, and just we can estimate that he has 8*33 =265 troops."
12 | Battle of Torrhen's Square,299,11,Robb Stark,Balon/Euron Greyjoy,Stark,,,,Greyjoy,,,,win,pitched battle,0,0,244,900,"Rodrik Cassel, Cley Cerwyn",Dagmer Cleftjaw,1,Torrhen's Square,The North,Greyjoy's troop number comes from the 264 estimate to have arrived on the stony shore minus the 20 Theon takes to attack Winterfell. Thus 264-20=244
13 | Battle of Winterfell,299,12,Balon/Euron Greyjoy,Robb Stark,Greyjoy,,,,Stark,,,,win,ambush,0,1,20,,Theon Greyjoy,Bran Stark,1,Winterfell,The North,"It isn't mentioned how many Stark men are left in Winterfell, other than ""very few""."
14 | Sack of Torrhen's Square,299,13,Balon/Euron Greyjoy,Balon/Euron Greyjoy,Greyjoy,,,,Stark,,,,win,siege,0,1,,,Dagmer Cleftjaw,,1,Torrhen's Square,The North,
15 | Sack of Winterfell,299,14,Joffrey/Tommen Baratheon,Robb Stark,Bolton,Greyjoy,,,Stark,,,,win,ambush,1,0,618,2000,"Ramsay Snow, Theon Greyjoy ","Rodrik Cassel, Cley Cerwyn, Leobald Tallhart",1,Winterfell,The North,"Since House Bolton betrays the Starks for House Lannister, we code this battle as between these two houses. Greyjoy men, numbering only 20, don't play a major part in the fighting and end up dying anyway."
16 | Battle of Oxcross,299,15,Robb Stark,Joffrey/Tommen Baratheon,Stark,Tully,,,Lannister,,,,win,ambush,1,1,6000,10000,"Robb Stark, Brynden Tully","Stafford Lannister, Roland Crakehall, Antario Jast",1,Oxcross,The Westerlands,
17 | Siege of Storm's End,299,16,Stannis Baratheon,Renly Baratheon,Baratheon,,,,Baratheon,,,,win,siege,1,0,5000,20000,"Stannis Baratheon, Davos Seaworth","Renly Baratheon, Cortnay Penrose, Loras Tyrell, Randyll Tarly, Mathis Rowan",1,Storm's End,The Stormlands,
18 | Battle of the Fords,299,17,Joffrey/Tommen Baratheon,Robb Stark,Lannister,,,,Tully,,,,loss,pitched battle,0,0,20000,10000,"Tywin Lannister, Flement Brax, Gregor Clegane, Addam Marbrand, Lyle Crakehall, Leo Lefford","Edmure Tully, Jason Mallister, Karyl Vance",1,Red Fork,The Riverlands,
19 | Sack of Harrenhal,299,18,Robb Stark,Joffrey/Tommen Baratheon,Stark,,,,Lannister,,,,win,ambush,1,0,100,100,"Roose Bolton, Vargo Hoat, Robett Glover",Amory Lorch,1,Harrenhal,The Riverlands,
20 | Battle of the Crag,299,19,Robb Stark,Joffrey/Tommen Baratheon,Stark,,,,Lannister,,,,win,ambush,0,0,6000,,"Robb Stark, Smalljon Umber, Black Walder Frey",Rolph Spicer,1,Crag,The Westerlands,
21 | Battle of the Blackwater,299,20,Stannis Baratheon,Joffrey/Tommen Baratheon,Baratheon,,,,Lannister,,,,loss,pitched battle,1,1,21000,7250,"Stannis Baratheon, Imry Florent, Guyard Morrigen, Rolland Storm, Salladhor Saan, Davos Seaworth","Tyrion Lannister, Jacelyn Bywater, Sandor Clegane, Tywin Lannister, Garlan Tyrell, Mace Tyrell, Randyll Tarly",1,King's Landing,The Crownlands,
22 | Siege of Darry,299,21,Robb Stark,Joffrey/Tommen Baratheon,Darry,,,,Lannister,,,,win,siege,0,0,,,Helman Tallhart,,1,Darry,The Riverlands,
23 | Battle of Duskendale,299,22,Robb Stark,Joffrey/Tommen Baratheon,Stark,,,,Lannister,,,,loss,pitched battle,1,0,3000,,"Robertt Glover, Helman Tallhart","Randyll Tarly, Gregor Clegane",1,Duskendale,The Crownlands,
24 | Battle of the Burning Septry,299,23,,,Brotherhood without Banners,,,,Brave Companions,,,,win,pitched battle,0,0,,,,,1,,The Riverlands,
25 | Battle of the Ruby Ford,299,24,Joffrey/Tommen Baratheon,Robb Stark,Lannister,,,,Stark,,,,win,pitched battle,0,0,,6000,Gregor Clegane,"Roose Bolton, Wylis Manderly",,Ruby Ford,The Riverlands,
26 | Retaking of Harrenhal,299,25,Joffrey/Tommen Baratheon,,Lannister,,,,Brave Companions,,,,win,pitched battle,1,0,,,Gregor Clegane,Vargo Hoat,1,Harrenhal,The Riverlands,
27 | The Red Wedding,299,26,Joffrey/Tommen Baratheon,Robb Stark,Frey,Bolton,,,Stark,,,,win,ambush,1,1,3500,3500,"Walder Frey, Roose Bolton, Walder Rivers",Robb Stark,1,The Twins,The Riverlands,"This observation refers to the battle against the Stark men, not the attack on the wedding"
28 | Siege of Seagard,299,27,Robb Stark,Joffrey/Tommen Baratheon,Frey,,,,Mallister,,,,win,siege,0,1,,,Walder Frey,Jason Mallister,1,Seagard,The Riverlands,
29 | Battle of Castle Black,300,28,Stannis Baratheon,Mance Rayder,Free folk,Thenns,Giants,,Night's Watch,Baratheon,,,loss,siege,1,1,100000,1240,"Mance Rayder, Tormund Giantsbane, Harma Dogshead, Magnar Styr, Varamyr","Stannis Baratheon, Jon Snow, Donal Noye, Cotter Pyke",0,Castle Black,Beyond the Wall,
30 | Fall of Moat Cailin,300,29,Joffrey/Tommen Baratheon,Balon/Euron Greyjoy,Bolton,,,,Greyjoy,,,,win,siege,0,0,,,Ramsey Bolton,,0,Moat Cailin,The North,
31 | Sack of Saltpans,300,30,,,Brave Companions,,,,,,,,win,razing,0,0,,,Rorge,,0,Saltpans,The Riverlands,
32 | Retaking of Deepwood Motte,300,31,Stannis Baratheon,Balon/Euron Greyjoy,Baratheon,Karstark,Mormont,Glover,Greyjoy,,,,win,pitched battle,0,0,4500,200,"Stannis Baratheon, Alysane Mormot",Asha Greyjoy,0,Deepwood Motte,The North,
33 | Battle of the Shield Islands,300,32,Balon/Euron Greyjoy,Joffrey/Tommen Baratheon,Greyjoy,,,,Tyrell,,,,win,pitched battle,0,0,,,"Euron Greyjoy, Victarion Greyjoy",,0,Shield Islands,The Reach,
34 | "Invasion of Ryamsport, Vinetown, and Starfish Harbor",300,33,Balon/Euron Greyjoy,Joffrey/Tommen Baratheon,Greyjoy,,,,Tyrell,,,,win,razing,0,0,,,"Euron Greyjoy, Victarion Greyjoy",,0,"Ryamsport, Vinetown, Starfish Harbor",The Reach,
35 | Second Seige of Storm's End,300,34,Joffrey/Tommen Baratheon,Stannis Baratheon,Baratheon,,,,Baratheon,,,,win,siege,0,0,,200,"Mace Tyrell, Mathis Rowan",Gilbert Farring,0,Storm's End,The Stormlands,
36 | Siege of Dragonstone,300,35,Joffrey/Tommen Baratheon,Stannis Baratheon,Baratheon,,,,Baratheon,,,,win,siege,0,0,2000,,"Loras Tyrell, Raxter Redwyne",Rolland Storm,0,Dragonstone,The Stormlands,
37 | Siege of Riverrun,300,36,Joffrey/Tommen Baratheon,Robb Stark,Lannister,Frey,,,Tully,,,,win,siege,0,0,3000,,"Daven Lannister, Ryman Fey, Jaime Lannister",Brynden Tully,0,Riverrun,The Riverlands,
38 | Siege of Raventree,300,37,Joffrey/Tommen Baratheon,Robb Stark,Bracken,Lannister,,,Blackwood,,,,win,siege,0,1,1500,,"Jonos Bracken, Jaime Lannister",Tytos Blackwood,0,Raventree,The Riverlands,
39 | Siege of Winterfell,300,38,Stannis Baratheon,Joffrey/Tommen Baratheon,Baratheon,Karstark,Mormont,Glover,Bolton,Frey,,,,,,,5000,8000,Stannis Baratheon,Roose Bolton,0,Winterfell,The North,
--------------------------------------------------------------------------------
/got-battles/build.sbt:
--------------------------------------------------------------------------------
1 | name := "spark-cassandra-example"
2 |
3 | version := "1.0"
4 |
5 | assemblyOption in assembly := (assemblyOption in assembly).value.copy(includeScala = false)
6 |
7 | // https://groups.google.com/a/lists.datastax.com/forum/#!topic/spark-connector-user/5muNwRaCJnU
8 | assemblyMergeStrategy in assembly <<= (assemblyMergeStrategy in assembly) {
9 | (old) => {
10 | case PathList("META-INF", "io.netty.versions.properties") => MergeStrategy.last
11 | case x => old(x)
12 | }
13 | }
14 |
15 | scalaVersion := "2.10.6"
16 |
17 | resolvers += "jitpack" at "https://jitpack.io"
18 |
19 | libraryDependencies ++= Seq(
20 | // use provided line when building assembly jar
21 | // "org.apache.spark" %% "spark-sql" % "1.6.1" % "provided",
22 | // comment above line and uncomment the following to run in sbt
23 | "org.apache.spark" %% "spark-sql" % "1.6.1",
24 | "com.datastax.spark" %% "spark-cassandra-connector" % "1.5.0",
25 | "com.github.scopt" %% "scopt" % "3.5.0"
26 | )
27 |
--------------------------------------------------------------------------------
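
The `<<=` operator used for `assemblyMergeStrategy` above still works on the sbt 0.13 series but is deprecated in later 0.13.x releases; a sketch of the equivalent `:=` form, mirroring what the kafka-streaming `build.sbt` later in this repo already does:

```scala
assemblyMergeStrategy in assembly := {
  case PathList("META-INF", "io.netty.versions.properties") => MergeStrategy.last
  case x =>
    val oldStrategy = (assemblyMergeStrategy in assembly).value
    oldStrategy(x)
}
```
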
/got-battles/project/assembly.sbt:
--------------------------------------------------------------------------------
1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.12.0")
2 |
--------------------------------------------------------------------------------
/got-battles/src/main/scala/com/supergloo/SparkCassandra.scala:
--------------------------------------------------------------------------------
1 | package com.supergloo
2 |
3 | import com.datastax.spark.connector._
4 | import org.apache.spark.{SparkConf, SparkContext}
5 | import org.apache.spark.sql.SQLContext
6 | import org.apache.spark.sql.functions._
7 |
8 | import scopt.OptionParser
9 |
10 |
11 | /**
12 | * Simple Spark Cassandra
13 | * One example with Scala case class marshalling
14 | * Another example using Spark SQL
15 | */
16 | object SparkCassandra {
17 |
18 | case class CommandLineArgs (
19 | cassandra: String = "", // required
20 | keyspace: String = "gameofthrones", // default is gameofthrones
21 | limit: Int = 10
22 | )
23 |
24 | case class Battle(
25 | battle_number: Integer,
26 | year: Integer,
27 | attacker_king: String,
28 | defender_king: String
29 | )
30 |
31 | def main(args: Array[String]) {
32 |
33 | val parser = new scopt.OptionParser[CommandLineArgs]("spark-cassandra-example") {
34 | head("spark-cassandra-example", "1.0")
35 | opt[String]('c', "cassandra").required().valueName("").
36 | action((x, c) => c.copy(cassandra = x)).
37 | text("Setting cassandra is required")
38 | opt[String]('k', "keyspace").action( (x, c) =>
39 | c.copy(keyspace = x) ).text("keyspace is a string with a default of `gameofthrones`")
40 | opt[Int]('l', "limit").action( (x, c) =>
41 | c.copy(limit = x) ).text("limit is an integer with default of 10")
42 | }
43 |
44 | parser.parse(args, CommandLineArgs()) match {
45 |
46 | case Some(config) =>
47 | // do stuff
48 | val conf = new SparkConf().setAppName("SparkCassandraExampleApp")
49 | conf.setIfMissing("spark.master", "local[5]")
50 |
51 | conf.set("spark.cassandra.connection.host", config.cassandra)
52 |
53 | val sc = new SparkContext(conf)
54 |
55 | // Spark Cassandra Example one which marshalls to Scala case classes
56 | val battles:Array[Battle] = sc.cassandraTable[Battle](config.keyspace, "battles").
57 | select("battle_number","year","attacker_king","defender_king").toArray
58 |
59 | battles.foreach { b: Battle =>
60 | println("Battle Number %s was defended by %s.".format(b.battle_number, b.defender_king))
61 | }
62 |
63 |
64 | // Spark Cassandra Example Two - Create DataFrame from Spark SQL read
65 | val sqlContext = new SQLContext(sc)
66 |
67 | val df = sqlContext.read
68 | .format("org.apache.spark.sql.cassandra")
69 |           .options(Map( "table" -> "battles", "keyspace" -> config.keyspace ))
70 | .load()
71 |
72 | df.show
73 |
74 |
75 | // Game of Thrones Battle analysis
76 |
77 | // Who were the most aggressive kings? (most attacker_kings)
78 | val countsByAttack = df.groupBy("attacker_king").count().limit(config.limit).sort(desc("count"))
79 | countsByAttack.show()
80 |
81 | // Which kings were attacked the most? (most defender_kings)
82 | val countsByDefend = df.groupBy("defender_king").count().limit(config.limit).sort(desc("count"))
83 | countsByDefend.show()
84 |
85 | sc.stop()
86 |
87 | case None =>
88 | // arguments are bad, error message will have been displayed
89 | }
90 | }
91 | }
--------------------------------------------------------------------------------
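
A usage sketch for the scopt parser above. The flag names come straight from the `OptionParser` definition; the host value is illustrative, and only `--cassandra` is required:

```scala
// Run from sbt against a local Cassandra node:
//   sbt "run --cassandra 127.0.0.1"
// or override the defaults:
//   sbt "run -c 127.0.0.1 -k gameofthrones -l 5"
//
// If --cassandra is missing, parser.parse(...) returns None, scopt prints its
// usage text, and main falls through the `case None` branch without starting Spark.
```
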
/kafka-streaming/README.md:
--------------------------------------------------------------------------------
1 | # Spark Streaming with Kafka
2 |
3 | This is a Spark Streaming job which streams weather data from Kafka
4 | and stores it into two Cassandra tables.
5 |
6 | For complete instructions, see
7 | [Spark Streaming with Kafka Example](https://supergloo.com/spark-streaming/spark-streaming-kafka-example/)
8 |
9 | Credits:
10 | Concepts and some code reused from KillrWeather application found at https://github.com/killrweather/killrweather
11 |
12 | #### To run on local machine
13 |
14 | Download Kafka, then follow steps similar to those in the Kafka quickstart:
15 | https://kafka.apache.org/quickstart
16 |
17 | You'll need to adjust the paths in the following commands
18 | depending on where Kafka is installed; i.e. where the Kafka `bin` dir is.
19 |
20 | * Start Zookeeper ```bin/zookeeper-server-start.sh config/zookeeper.properties```
21 | * Start Kafka ```bin/kafka-server-start.sh config/server.properties```
22 | * Create Kafka topic ```
23 | bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic raw_weather```
24 | * Start the Streaming job in SBT; choose the `WeatherDataStream` option if prompted
25 | * Send Weather Data to Kafka ```kafka-console-producer.sh --broker-list localhost:9092 --topic raw_weather
26 | --new-producer < ny-2008.csv```
27 |
28 | #### Monitor with SparkLint
29 |
30 | The SparkLint monitoring tool is included; a companion post describing it is TBD.
31 |
32 | To activate it, pass in the `spark.extraListeners` setting when submitting; i.e.
33 |
34 | --conf spark.extraListeners=com.groupon.sparklint.SparklintListener
35 |
--------------------------------------------------------------------------------
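
When running from sbt rather than `spark-submit`, the SparkLint listener mentioned above can also be set directly on the `SparkConf`; a small sketch, assuming the `sparklint-spark162` dependency declared in the `build.sbt` below:

```scala
import org.apache.spark.SparkConf

val sparkConf = new SparkConf()
  .setAppName("Raw Weather")
  .set("spark.extraListeners", "com.groupon.sparklint.SparklintListener")
```
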
/kafka-streaming/build.sbt:
--------------------------------------------------------------------------------
1 | name := "kafka-streaming"
2 |
3 | version := "1.0"
4 |
5 | assemblyOption in assembly := (assemblyOption in assembly).value.copy(includeScala = false)
6 |
7 | assemblyMergeStrategy in assembly := {
8 | case PathList("org", "apache", "spark", "unused", "UnusedStubClass.class") => MergeStrategy.first
9 | case PathList(pl @ _*) if pl.contains("log4j.properties") => MergeStrategy.concat
10 | case PathList("META-INF", "io.netty.versions.properties") => MergeStrategy.last
11 | case x =>
12 | val oldStrategy = (assemblyMergeStrategy in assembly).value
13 | oldStrategy(x)
14 | }
15 |
16 | scalaVersion := "2.10.6"
17 |
18 | resolvers += "jitpack" at "https://jitpack.io"
19 |
20 | // still want to be able to run in sbt
21 | // https://github.com/sbt/sbt-assembly#-provided-configuration
22 | run in Compile <<= Defaults.runTask(fullClasspath in Compile, mainClass in (Compile, run), runner in (Compile, run))
23 |
24 | fork in run := true
25 | javaOptions in run ++= Seq(
26 | "-Dlog4j.debug=true",
27 | "-Dlog4j.configuration=log4j.properties")
28 |
29 | libraryDependencies ++= Seq(
30 | "com.groupon.sparklint" %% "sparklint-spark162" % "1.0.4" excludeAll (
31 | ExclusionRule(organization = "org.apache.spark")
32 | ),
33 | "org.apache.spark" %% "spark-core" % "1.6.2" % "provided",
34 | "org.apache.spark" %% "spark-sql" % "1.6.2" % "provided",
35 | "org.apache.spark" %% "spark-streaming" % "1.6.2" % "provided",
36 | "org.apache.spark" %% "spark-streaming-kafka" % "1.6.2",
37 | "com.datastax.spark" %% "spark-cassandra-connector" % "1.6.0"
38 | )
39 |
--------------------------------------------------------------------------------
/kafka-streaming/cql/create-timeseries.cql:
--------------------------------------------------------------------------------
1 | /*
2 | Schema for storing raw ISD-lite hourly weather data.
3 | More can be read about these weather sets here: http://www.ncdc.noaa.gov/oa/climate/isd/
4 | */
5 |
6 | DROP KEYSPACE IF EXISTS isd_weather_data;
7 | CREATE KEYSPACE isd_weather_data WITH REPLICATION = { 'class' : 'SimpleStrategy', 'replication_factor' : 1 };
8 |
9 | use isd_weather_data;
10 |
11 | /*
12 | Raw weather readings from a single station, hourly.
13 | sky_condition_text text, // Non-coded sky conditions
14 | */
15 |
16 | CREATE TABLE raw_weather_data (
17 | wsid text, // Composite of Air Force Datsav3 station number and NCDC WBAN number
18 | year int, // Year collected
19 | month int, // Month collected
20 | day int, // Day collected
21 | hour int, // Hour collected
22 | temperature double, // Air temperature (degrees Celsius)
23 | dewpoint double, // Dew point temperature (degrees Celsius)
24 | pressure double, // Sea level pressure (hectopascals)
25 | wind_direction int, // Wind direction in degrees. 0-359
26 | wind_speed double, // Wind speed (meters per second)
27 | sky_condition int, // Total cloud cover (coded, see format documentation)
28 | sky_condition_text text, // Non-coded sky conditions
29 | one_hour_precip double, // One-hour accumulated liquid precipitation (millimeters)
30 | six_hour_precip double, // Six-hour accumulated liquid precipitation (millimeters)
31 | PRIMARY KEY ((wsid), year, month, day, hour)
32 | ) WITH CLUSTERING ORDER BY (year DESC, month DESC, day DESC, hour DESC);
33 |
34 | /*
35 | Sum of all one_hour_precip for one day and one weather station
36 | */
37 | CREATE TABLE daily_aggregate_precip (
38 | wsid text,
39 | year int,
40 | month int,
41 | day int,
42 | precipitation counter,
43 | PRIMARY KEY ((wsid), year, month, day)
44 | ) WITH CLUSTERING ORDER BY (year DESC, month DESC, day DESC);
45 |
--------------------------------------------------------------------------------
/kafka-streaming/project/assembly.sbt:
--------------------------------------------------------------------------------
1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.12.0")
2 |
--------------------------------------------------------------------------------
/kafka-streaming/project/build.properties:
--------------------------------------------------------------------------------
1 | sbt.version = 0.13.11
--------------------------------------------------------------------------------
/kafka-streaming/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | log4j.logger.org.apache.spark.SparkEnv=DEBUG
18 |
19 | # Set everything to be logged to the console
20 | log4j.rootCategory=ERROR, console
21 | log4j.appender.console=org.apache.log4j.ConsoleAppender
22 | log4j.appender.console.target=System.err
23 | log4j.appender.console.layout=org.apache.log4j.PatternLayout
24 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
25 |
26 | # Settings to quiet third party logs that are too verbose
27 | log4j.logger.org.spark-project.jetty=WARN
28 | log4j.logger.org.spark-project.jetty.util.component.AbstractLifeCycle=ERROR
29 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
30 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
31 | log4j.logger.org.apache.parquet=ERROR
32 | log4j.logger.parquet=ERROR
33 |
34 | # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support
35 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL
36 | log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR
37 |
38 | #TM
39 | #log4j.logger.org.apache.spark.sql.cassandra=DEBUG
--------------------------------------------------------------------------------
/kafka-streaming/src/main/scala/com/killrweather/data/Weather.scala:
--------------------------------------------------------------------------------
1 | package com.killrweather.data
2 |
3 | object Weather {
4 |
5 | /** Base marker trait. */
6 | @SerialVersionUID(1L)
7 | sealed trait WeatherModel extends Serializable
8 |
9 | /**
10 | * @param id Composite of Air Force Datsav3 station number and NCDC WBAN number
11 | * @param name Name of reporting station
12 | * @param countryCode 2 letter ISO Country ID // TODO restrict
13 | * @param callSign International station call sign
14 | * @param lat Latitude in decimal degrees
15 | * @param long Longitude in decimal degrees
16 | * @param elevation Elevation in meters
17 | */
18 | case class WeatherStation(
19 | id: String,
20 | name: String,
21 | countryCode: String,
22 | callSign: String,
23 | lat: Double,
24 | long: Double,
25 | elevation: Double) extends WeatherModel
26 |
27 | /**
28 | * @param wsid Composite of Air Force Datsav3 station number and NCDC WBAN number
29 | * @param year Year collected
30 | * @param month Month collected
31 | * @param day Day collected
32 | * @param hour Hour collected
33 | * @param temperature Air temperature (degrees Celsius)
34 | * @param dewpoint Dew point temperature (degrees Celsius)
35 | * @param pressure Sea level pressure (hectopascals)
36 | * @param windDirection Wind direction in degrees. 0-359
37 | * @param windSpeed Wind speed (meters per second)
38 | * @param skyCondition Total cloud cover (coded, see format documentation)
39 | * @param skyConditionText Non-coded sky conditions
40 | * @param oneHourPrecip One-hour accumulated liquid precipitation (millimeters)
41 | * @param sixHourPrecip Six-hour accumulated liquid precipitation (millimeters)
42 | */
43 | case class RawWeatherData(
44 | wsid: String,
45 | year: Int,
46 | month: Int,
47 | day: Int,
48 | hour: Int,
49 | temperature: Double,
50 | dewpoint: Double,
51 | pressure: Double,
52 | windDirection: Int,
53 | windSpeed: Double,
54 | skyCondition: Int,
55 | skyConditionText: String,
56 | oneHourPrecip: Double,
57 | sixHourPrecip: Double) extends WeatherModel
58 |
59 | object RawWeatherData {
60 | def apply(array: Array[String]): RawWeatherData = {
61 | RawWeatherData(
62 | wsid = array(0),
63 | year = array(1).toInt,
64 | month = array(2).toInt,
65 | day = array(3).toInt,
66 | hour = array(4).toInt,
67 | temperature = array(5).toDouble,
68 | dewpoint = array(6).toDouble,
69 | pressure = array(7).toDouble,
70 | windDirection = array(8).toInt,
71 | windSpeed = array(9).toDouble,
72 | skyCondition = array(10).toInt,
73 | skyConditionText = array(11),
74 | oneHourPrecip = array(11).toDouble,
75 | sixHourPrecip = Option(array(12).toDouble).getOrElse(0))
76 | }
77 | }
78 |
79 | trait WeatherAggregate extends WeatherModel with Serializable {
80 | def wsid: String
81 | }
82 |
83 | /* Precipitation */
84 | trait Precipitation extends WeatherAggregate
85 |
86 | case class DailyPrecipitation(wsid: String,
87 | year: Int,
88 | month: Int,
89 | day: Int,
90 | precipitation: Double) extends Precipitation
91 |
92 | }
93 |
--------------------------------------------------------------------------------
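
A sketch of how `RawWeatherData.apply` parses one raw CSV record. The sample line is illustrative, following the field order the `apply` above expects (13 comma-separated values, indices 0–12):

```scala
import com.killrweather.data.Weather

val line   = "725030:14732,2008,01,01,00,5.0,-3.3,1020.6,290,4.1,8,0.0,0.0"
val parsed = Weather.RawWeatherData(line.split(","))

// parsed.wsid == "725030:14732", parsed.temperature == 5.0, parsed.oneHourPrecip == 0.0
// Note: array(11) feeds both skyConditionText and oneHourPrecip in the apply above,
// so for this layout the text field simply carries the precipitation string.
```
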
/kafka-streaming/src/main/scala/com/supergloo/WeatherDataStream.scala:
--------------------------------------------------------------------------------
1 | package com.supergloo
2 |
3 | import com.killrweather.data.Weather.RawWeatherData
4 | import kafka.serializer.StringDecoder
5 | import org.apache.log4j.Logger
6 | import org.apache.spark.SparkConf
7 | import org.apache.spark.streaming.{Seconds, StreamingContext}
8 | import org.apache.spark.streaming.dstream.{DStream, InputDStream}
9 | import org.apache.spark.streaming.kafka.KafkaUtils
10 |
11 | /**
12 | * Stream from Kafka
13 | */
14 | object WeatherDataStream {
15 |
16 | val localLogger = Logger.getLogger("WeatherDataStream")
17 |
18 | def main(args: Array[String]) {
19 |
20 | // update
21 | // val checkpointDir = "./tmp"
22 |
23 | val sparkConf = new SparkConf().setAppName("Raw Weather")
24 | sparkConf.setIfMissing("spark.master", "local[5]")
25 | // sparkConf.setIfMissing("spark.checkpoint.dir", checkpointDir)
26 | sparkConf.setIfMissing("spark.cassandra.connection.host", "127.0.0.1")
27 |
28 | val ssc = new StreamingContext(sparkConf, Seconds(2))
29 |
30 | val kafkaTopicRaw = "raw_weather"
31 |     val kafkaBroker = "127.0.0.1:9092"
32 |
33 | val cassandraKeyspace = "isd_weather_data"
34 | val cassandraTableRaw = "raw_weather_data"
35 | val cassandraTableDailyPrecip = "daily_aggregate_precip"
36 |
37 | println(s"using cassandraTableDailyPrecip $cassandraTableDailyPrecip")
38 |
39 | val topics: Set[String] = kafkaTopicRaw.split(",").map(_.trim).toSet
40 | val kafkaParams = Map[String, String]("metadata.broker.list" -> kafkaBroker)
41 |
42 | localLogger.info(s"connecting to brokers: $kafkaBroker")
43 | localLogger.info(s"kafkaParams: $kafkaParams")
44 | localLogger.info(s"topics: $topics")
45 |
46 | val rawWeatherStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)
47 | val parsedWeatherStream: DStream[RawWeatherData] = ingestStream(rawWeatherStream)
48 |
49 | persist(cassandraKeyspace, cassandraTableRaw, cassandraTableDailyPrecip, parsedWeatherStream)
50 |
51 | parsedWeatherStream.print // for demo purposes only
52 |
53 | //Kick off
54 | ssc.start()
55 |
56 | ssc.awaitTermination()
57 |
58 | ssc.stop()
59 | }
60 |
61 | def persist(CassandraKeyspace: String, CassandraTableRaw: String,
62 | CassandraTableDailyPrecip: String,
63 | parsedWeatherStream: DStream[RawWeatherData]): Unit = {
64 |
65 | import com.datastax.spark.connector.streaming._
66 |
67 | /** Saves the raw data to Cassandra - raw table. */
68 | parsedWeatherStream.saveToCassandra(CassandraKeyspace, CassandraTableRaw)
69 |
70 | /** For a given weather station, year, month, day, aggregates hourly precipitation values by day.
71 |    * The weather station id is the partition key, giving data locality, which Spark exploits via the
72 |    * connector, so the data transfer between Spark and Cassandra is very fast per node.
73 | *
74 | * Persists daily aggregate data to Cassandra daily precip table by weather station,
75 | * automatically sorted by most recent (due to how we set up the Cassandra schema:
76 | *
77 | * @see https://github.com/killrweather/killrweather/blob/master/data/create-timeseries.cql.
78 | *
79 |    * Because the 'oneHourPrecip' column is a Cassandra counter, we do not need an expensive Spark
80 |    * reduceByKey; we simply let Cassandra do the aggregation, which is fast and cheap.
81 | */
82 | parsedWeatherStream.map { weather =>
83 | (weather.wsid, weather.year, weather.month, weather.day, weather.oneHourPrecip)
84 | }.saveToCassandra(CassandraKeyspace, CassandraTableDailyPrecip)
85 | }
86 |
87 | def ingestStream(rawWeatherStream: InputDStream[(String, String)]): DStream[RawWeatherData] = {
88 | val parsedWeatherStream = rawWeatherStream.map(_._2.split(","))
89 | .map(RawWeatherData(_))
90 | parsedWeatherStream
91 | }
92 | }
93 |
--------------------------------------------------------------------------------
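
A small sketch (not part of the streaming job) for spot-checking what the job wrote, reusing the `cassandraTable` API shown in got-battles/SparkCassandra.scala; the keyspace and table names match the constants in `WeatherDataStream` above:

```scala
import com.datastax.spark.connector._
import org.apache.spark.{SparkConf, SparkContext}

object CheckDailyPrecip {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("CheckDailyPrecip")
      .setIfMissing("spark.master", "local[2]")
      .setIfMissing("spark.cassandra.connection.host", "127.0.0.1")
    val sc = new SparkContext(conf)

    // Read back the counter-aggregated daily precipitation rows
    sc.cassandraTable("isd_weather_data", "daily_aggregate_precip")
      .select("wsid", "year", "month", "day", "precipitation")
      .take(10)
      .foreach(println)

    sc.stop()
  }
}
```
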
/kafka-streaming/src/main/scala/com/supergloo/package.scala:
--------------------------------------------------------------------------------
1 | package com
2 |
3 | package object supergloo {
4 |
5 | val BASE_PATH = "/Users/toddmcgrath/Development/pioneer/"
6 | val SANDBOX_PATH = BASE_PATH + "spark-sql/sandbox/"
7 | }
8 |
--------------------------------------------------------------------------------