├── LICENSE
├── images
│   ├── attach-driver-to-cluster.png
│   ├── attach-driver.png
│   ├── autocomplete.png
│   ├── create-library-menu.png
│   ├── event-hubs-settings.png
│   ├── keyboard-shortcuts.png
│   ├── maven-create.png
│   ├── play-options.png
│   ├── run-all.png
│   ├── search-packages.png
│   └── source-maven.png
├── notebooks
│   └── Shared
│       └── tutorials
│           ├── EventHubsDemo.scala
│           ├── IntroToDataParsing.scala
│           ├── IntroToDataSources.scala
│           ├── IntroToEventHubs.scala
│           ├── IntroToNotebooks.scala
│           ├── IntroToSparkComponents.scala
│           ├── StartHere.scala
│           └── WordcountExample.scala
└── testdata
    ├── weatherdata-12345.json
    └── weatherdata-22334.json
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 David Makogon
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/images/attach-driver-to-cluster.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmakogon/iot-data-openhack-helpers/25a5854ed9902fb80751ec469c7d5b67dd890f61/images/attach-driver-to-cluster.png
--------------------------------------------------------------------------------
/images/attach-driver.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmakogon/iot-data-openhack-helpers/25a5854ed9902fb80751ec469c7d5b67dd890f61/images/attach-driver.png
--------------------------------------------------------------------------------
/images/autocomplete.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmakogon/iot-data-openhack-helpers/25a5854ed9902fb80751ec469c7d5b67dd890f61/images/autocomplete.png
--------------------------------------------------------------------------------
/images/create-library-menu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmakogon/iot-data-openhack-helpers/25a5854ed9902fb80751ec469c7d5b67dd890f61/images/create-library-menu.png
--------------------------------------------------------------------------------
/images/event-hubs-settings.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmakogon/iot-data-openhack-helpers/25a5854ed9902fb80751ec469c7d5b67dd890f61/images/event-hubs-settings.png
--------------------------------------------------------------------------------
/images/keyboard-shortcuts.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmakogon/iot-data-openhack-helpers/25a5854ed9902fb80751ec469c7d5b67dd890f61/images/keyboard-shortcuts.png
--------------------------------------------------------------------------------
/images/maven-create.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmakogon/iot-data-openhack-helpers/25a5854ed9902fb80751ec469c7d5b67dd890f61/images/maven-create.png
--------------------------------------------------------------------------------
/images/play-options.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmakogon/iot-data-openhack-helpers/25a5854ed9902fb80751ec469c7d5b67dd890f61/images/play-options.png
--------------------------------------------------------------------------------
/images/run-all.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmakogon/iot-data-openhack-helpers/25a5854ed9902fb80751ec469c7d5b67dd890f61/images/run-all.png
--------------------------------------------------------------------------------
/images/search-packages.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmakogon/iot-data-openhack-helpers/25a5854ed9902fb80751ec469c7d5b67dd890f61/images/search-packages.png
--------------------------------------------------------------------------------
/images/source-maven.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmakogon/iot-data-openhack-helpers/25a5854ed9902fb80751ec469c7d5b67dd890f61/images/source-maven.png
--------------------------------------------------------------------------------
/notebooks/Shared/tutorials/EventHubsDemo.scala:
--------------------------------------------------------------------------------
1 | // Databricks notebook source
2 | // MAGIC %md # Getting started with Event Hubs + Spark
3 | // MAGIC
4 | // MAGIC This notebook helps get you set up with various "plumbing" for Event Hubs, Blob storage, and
5 | // MAGIC Dataframes. And then you'll be able to create your own queries against data streaming through the Event Hubs endpoint.
6 | // MAGIC
7 | // MAGIC Note: This notebook is written in Scala, but you can also use Python or R for your own projects.
8 | // MAGIC
9 | // MAGIC Also note: The basics of Spark are all documented online, [here](https://spark.apache.org/docs/latest/), including full programming guides and API docs.
10 |
11 | // COMMAND ----------
12 |
13 | // MAGIC %md
14 | // MAGIC
15 | // MAGIC First, we'll set up Event Hubs (the setup is the same for IoT Hub). To do this, we first need to:
16 | // MAGIC
17 | // MAGIC - ensure that the Event Hubs SDK has been added as a library, and attached to a running cluster
18 | // MAGIC - add the required import statements (the equivalent of C#'s `using` statement)
19 | // MAGIC
20 | // MAGIC ## Spark Connector SDK
21 | // MAGIC The Spark Connector SDK may be found [here](https://github.com/Azure/azure-event-hubs-spark). But there's a much easier way to install the correct driver, if you know its Maven coordinates.
22 | // MAGIC Note: Maven is a dependency/build management tool for Java, similar to NuGet for .NET and npm for Node.js. Below are the instructions for installing the correct SDK, based on its Maven coordinates.
23 | // MAGIC
24 | // MAGIC
25 | // MAGIC ### Selecting and initializing the correct driver
26 | // MAGIC
27 | // MAGIC It's important to choose the correct Event Hubs SDK, depending on which version of Spark you're working with.
28 | // MAGIC
29 | // MAGIC For Databricks, these are the Maven coordinates for the Event Hubs SDK for Databricks:
30 | // MAGIC
31 | // MAGIC - Cluster v3.5 and above (Scala 2.11+): `com.microsoft.azure:azure-eventhubs-spark_2.11:2.3.0`
32 | // MAGIC
33 | // MAGIC To install the SDK in Databricks, traverse to the `Shared` folder (or your own personal folder) and select `Create Library`:
34 | // MAGIC
35 | // MAGIC 
36 | // MAGIC
37 | // MAGIC
38 | // MAGIC Then, choose to enter Maven coordinates, enter the correct SDK's coordinates, and choose to Create the library:
39 | // MAGIC
40 | // MAGIC 
41 | // MAGIC
42 | // MAGIC
43 | // MAGIC 
44 | // MAGIC
45 | // MAGIC At this point, you must attach the SDK to a cluster. You will be shown a list of your clusters. Choose whichever cluster(s) you are using, and select the checkbox.
46 | // MAGIC
47 | // MAGIC 
48 | // MAGIC
49 | // MAGIC ## Imports
50 | // MAGIC Next, we'll define some important import statements, required for the Spark Connector.
51 | // MAGIC
52 | // MAGIC ### A word about cells
53 | // MAGIC Notice that these imports are defined in their own *cell*. Cells are similar to functions or methods, in that they are an execution block: if you run a cell, all the instructions in the cell are run.
54 | // MAGIC
55 | // MAGIC A notebook may have many cells. They all share the same variable scope. That is, if you define variable `foo` in one cell, and run that cell, `foo` is now a valid variable that may be accessed in other cells.
56 | // MAGIC
57 | // MAGIC Likewise, once you define imports and run the cell with the definition of those imports, you may now run code in any cell that has a dependency on those imports.
58 | // MAGIC
59 | // MAGIC Bonus: Once you run a cell defining your imports, you don't have to run that cell again, until your cluster is restarted.
60 | // MAGIC
61 | // MAGIC For a bit more info on cells, take a look at the `IntroToNotebooks` notebook in this folder.
62 |
63 | // COMMAND ----------
64 |
65 | import org.apache.spark.sql._
66 | import org.apache.spark.sql.types._
67 | import org.apache.spark.sql.functions._
68 | import org.apache.spark.sql.eventhubs._
69 |
70 | // COMMAND ----------
71 |
72 | // MAGIC %md
73 | // MAGIC # Setting Up Event Hubs / IoT Hub connection
74 | // MAGIC Ok, we have our imports. Now, let's set up the Event Hubs or IoT Hub connection. You'll need all of your Event Hub (or IoT Hub) settings for this, from the Azure portal.
75 | // MAGIC
76 | // MAGIC One setting you might not have configured is the `consumerGroup`. Each Event Hubs endpoint may have multiple consumer groups, with a default consumer group defined when the endpoint is created. You'll want to create your own consumer group, as this gives you your own independent view into the incoming data stream, which does not conflict with others who might also be reading from the same stream. If you haven't done so, please create a new consumer group for yourself.
77 | // MAGIC
78 | // MAGIC Here is an example of where you'd find the Event Hubs compatible connection string for an IoT Hub, along with Event Hubs name and Consumer Group:
79 | // MAGIC
80 | // MAGIC 
81 | // MAGIC
82 | // MAGIC Now, using these properties, set up your connection below, replacing the `<...>` placeholders with your real setting values (without the `<>` brackets, of course).
83 |
84 | // COMMAND ----------
85 |
86 | // Modify to include your event hubs parameters here
87 | // Note: This code works only with the latest Event Hubs driver,
88 | // which is supported by both Databricks v3.5 & v4.0 and HDInsight v3.5
89 |
90 | import org.apache.spark.eventhubs._
91 |
92 | val iotConnString = "<Event Hubs-compatible connection string>"
93 |
94 | val ehName = "<event hub name>"
95 |
96 | val consumerGroup = "<consumer group name>"
97 |
98 | // Build connection string with the above information
99 | val connectionString = ConnectionStringBuilder(iotConnString)
100 | .setEventHubName(ehName)
101 | .build
102 |
103 | // this sets up our event hubs configuration, including consumer group
104 | val ehConf = EventHubsConf(connectionString)
105 | .setConsumerGroup(consumerGroup)
106 |
107 | // COMMAND ----------
108 |
109 | // MAGIC %md
110 | // MAGIC # Connecting to Event Hubs
111 | // MAGIC Ok, now we need to wire up a dataframe to Event Hubs. If you haven't worked with Dataframes before: for the purposes of this exercise, just imagine a very large database table that allows operations to be partitioned and performed in parallel, with data that could either be static or streaming in from a live source.
112 | // MAGIC
113 | // MAGIC For this simple example, we are using Event Hubs as the streaming source of our Dataframe, and taking advantage of the `readStream` function to read directly from Event Hubs. `readStream` is similar to a file object's `read` method that you might have seen in other languages.
114 | // MAGIC
115 | // MAGIC It is important to understand the difference between the `read` function and `readStream`. Simply stated, it is as follows:
116 | // MAGIC `read` => For reading static data or data in batches.
117 | // MAGIC `readStream` => For reading streaming data.
118 | // MAGIC
119 | // MAGIC **See also:** [reading data from event hubs](https://github.com/Azure/azure-event-hubs-spark/blob/master/docs/structured-streaming-eventhubs-integration.md#reading-data-from-event-hubs)
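// MAGIC
// MAGIC As a quick illustration (a minimal sketch: the JSON path is a hypothetical placeholder, while `ehConf` is the configuration defined above), the two look like this side by side:
// MAGIC
// MAGIC ```
// MAGIC // batch: read a static JSON file into a dataframe
// MAGIC val staticDF = spark.read.json("wasbs://<container-name>@<storage-account-name>.blob.core.windows.net/somefile.json")
// MAGIC
// MAGIC // streaming: attach to the live Event Hubs stream (same pattern as the next cell)
// MAGIC val streamingDF = spark.readStream.format("eventhubs").options(ehConf.toMap).load()
// MAGIC ```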
120 |
121 | // COMMAND ----------
122 |
123 | // First, create the data frame
124 | val df = spark
125 | .readStream
126 | .format("eventhubs")
127 | .options(ehConf.toMap)
128 | .load()
129 |
130 | // COMMAND ----------
131 |
132 | // MAGIC %md
133 | // MAGIC # Extracting data from Event Hubs
134 | // MAGIC
135 | // MAGIC Each "row" of data coming from Event Hubs has the following schema:
136 | // MAGIC
137 | // MAGIC | Column | Type |
138 | // MAGIC |----------|----------|
139 | // MAGIC | body | binary |
140 | // MAGIC | offset | string |
141 | // MAGIC | sequenceNumber | long |
142 | // MAGIC | enqueuedTime | timestamp |
143 | // MAGIC | publisher | string |
144 | // MAGIC | partitionKey | string |
145 | // MAGIC
146 | // MAGIC For our purposes, we only need `body`. The issue is, `body` is transmitted as binary data by Event Hubs by default. So, we will do a simple cast to convert this data to a string.
147 |
148 | // COMMAND ----------
149 |
150 | // create a new dataframe with decoded body
151 | val eventhubsDF = df
152 | .selectExpr("CAST(body as STRING)")
153 |
154 | // COMMAND ----------
155 |
156 | // MAGIC %md
157 | // MAGIC # Writing: To memory
158 | // MAGIC First thing we'll want to do is write our streaming data *somewhere*, so that we can query a bit of it and see what it looks like. From a dev/test standpoint, the easiest way to get started is to write to an in-memory table.
159 | // MAGIC **see also:** [Structured streaming guide: output sinks](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#output-sinks)
160 |
161 | // COMMAND ----------
162 |
163 | // now write to an in-memory table. We'll save this in a variable so we can stop it later
164 | val memoryQuery = eventhubsDF.writeStream
165 | .format("memory")
166 | .queryName("sampledata") // this is the table name to be used for our in-memory table
167 | .start()
168 |
169 | // COMMAND ----------
170 |
171 | // MAGIC %md
172 | // MAGIC # Reading: From memory
173 | // MAGIC We should now have data in our in-memory table, which we can now query, to get an idea of what our data looks like.
174 | // MAGIC
175 | // MAGIC At this point, you can experiment with this query in any way you see fit. Here are two ways to display data coming from `spark.sql()`:
176 |
177 | // COMMAND ----------
178 |
179 | // if you omit the 'truncate' parameter, it defaults to 'true',
180 | // which shortens output strings for display purposes
181 | spark.sql("SELECT * from sampledata").show(truncate=false)
182 |
183 | // COMMAND ----------
184 |
185 | display(spark.sql("SELECT * from sampledata"))
186 |
187 | // COMMAND ----------
188 |
189 | // MAGIC %md
190 | // MAGIC # Shutting down in-memory table stream
191 | // MAGIC Since we saved off the stream variable earlier, we can easily shut it down after we're done querying.
192 |
193 | // COMMAND ----------
194 |
195 | memoryQuery.stop()
196 |
197 | // COMMAND ----------
198 |
199 | // MAGIC %md
200 | // MAGIC # Data Sources
201 | // MAGIC
202 | // MAGIC With Spark, you have many options for working with data sources. See the `IntroToDataSources` notebook for more information about data sources.
--------------------------------------------------------------------------------
/notebooks/Shared/tutorials/IntroToDataParsing.scala:
--------------------------------------------------------------------------------
1 | // Databricks notebook source
2 | // MAGIC %md # Parsing JSON data
3 | // MAGIC
4 | // MAGIC
5 | // MAGIC In this notebook, we'll work with JSON content within the `Body`, and see how to extract
6 | // MAGIC individual JSON properties, so that we can execute queries on these properties.
7 | // MAGIC
8 | // MAGIC Note: To simplify this exercise, sample data has been created for you, in `weatherdata-xxxxx.json` (where `xxxxx` represents a zip code), so that you don't need to
9 | // MAGIC create your own weather data simulator. To use this data, upload the json files to an Azure Storage container, and then
10 | // MAGIC provide your storage account credentials below, along with the container you chose for storing these json files.
11 | // MAGIC
12 | // MAGIC Note: The test data, along with this notebook, is located in GitHub, at [github.com/dmakogon/iot-data-openhack-helpers](https://github.com/dmakogon/iot-data-openhack-helpers).
13 |
14 | // COMMAND ----------
15 |
16 | // MAGIC %md
17 | // MAGIC Before anything, let's import required namespaces:
18 |
19 | // COMMAND ----------
20 |
21 | // First, imports
22 | import org.apache.spark.sql._
23 | import org.apache.spark.sql.types._
24 | import org.apache.spark.sql.functions._
25 |
26 | // COMMAND ----------
27 |
28 | // MAGIC %md
29 | // MAGIC # Setting up Azure Blob storage
30 | // MAGIC
31 | // MAGIC Here, we are configuring Spark to work with your Azure Storage account, and then setting up your sample data as a streaming source.
32 | // MAGIC
33 | // MAGIC Note that this will stream all content contained in the named container. In this example, our data is partitioned by zipcode, with each zipcode's data stored in a single file. In a real-world weather data scenario, data would likely be partitioned differently, but this should suffice for demo purposes.
34 |
35 | // COMMAND ----------
36 |
37 | // Fill in your Azure Storage settings here
38 | spark.conf.set(
39 |   "fs.azure.account.key.<storage-account-name>.blob.core.windows.net",
40 |   "<storage-account-access-key>")
41 |
42 | // Connect to blob storage and read all content within the input container into a dataframe:
43 | val inputBlobDF = spark.read
44 | .json("wasbs://<container-name>@<storage-account-name>.blob.core.windows.net/")
45 |
46 | // COMMAND ----------
47 |
48 | // MAGIC %md
49 | // MAGIC Let's take a peek at a bit of the input data. Note that while it looks like JSON, it's currently just one long string:
50 |
51 | // COMMAND ----------
52 |
53 | display(inputBlobDF)
54 |
55 | // COMMAND ----------
56 |
57 | // MAGIC %md
58 | // MAGIC # File system support
59 | // MAGIC
60 | // MAGIC In case you want to peruse the contents of the blobs we're working with, you can do this directly from spark. For example, here is a file listing of our input container:
61 |
62 | // COMMAND ----------
63 |
64 | // MAGIC %fs ls "wasbs://<container-name>@<storage-account-name>.blob.core.windows.net/"
65 |
66 | // COMMAND ----------
67 |
68 | // MAGIC %md
69 | // MAGIC And we can display the first part of a specific file:
70 |
71 | // COMMAND ----------
72 |
73 | // MAGIC %fs head "wasbs://<container-name>@<storage-account-name>.blob.core.windows.net/weatherdata-12345.json"
74 |
75 | // COMMAND ----------
76 |
77 | // MAGIC %md
78 | // MAGIC # Setting up a JSON schema
79 | // MAGIC
80 | // MAGIC Let's work with sample weather data, and assume each message body contains the following JSON:
81 | // MAGIC
82 | // MAGIC `{ "timestamp": "2018-10-01", "zipcode": "12345", "temperature": 75 }`
83 | // MAGIC
84 | // MAGIC We can now define a schema which defines each of these properties. The order of the properties in the schema doesn't matter, but the spelling and case *do* matter.
85 |
86 | // COMMAND ----------
87 |
88 | // Define the schema to apply to our weather data:
89 | val schema = StructType(
90 | StructField("timestamp", TimestampType) ::
91 | StructField("zipcode", StringType) ::
92 | StructField("temperature", IntegerType) :: Nil)
93 |
94 | // Apply the schema to our data frame, creating a new data frame.
95 | // Applying the schema lets us decode the `body` field (from the original Event Hubs message) into individual properties,
96 | // as defined by the schema. No need to cast the entire `body` payload to `string`, as we originally did in the
97 | // Intro to EventHubs notebook.
98 | //
99 | // Each JSON object will be rendered into an object in our dataframe. We need to give that object
100 | // a name, for querying purposes. In this example, we're calling it "reading" (a temperature reading).
101 | val schemaDF = inputBlobDF.select(from_json(col("body"), schema).alias("reading"))
102 |
103 | // COMMAND ----------
104 |
105 | // MAGIC %md
106 | // MAGIC # Peeking at our dataframe
107 | // MAGIC
108 | // MAGIC Note that Spark has *transforms* and *actions*. Transforms are lazy: nothing happens until an action is executed.
109 | // MAGIC
110 | // MAGIC Applying a schema? Transform. Displaying content: Action!
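// MAGIC
// MAGIC As a small illustration (a sketch only; the `filter` condition is arbitrary), nothing built on top of `schemaDF` actually runs until an action such as `count()` or `display()` forces execution:
// MAGIC
// MAGIC ```
// MAGIC // transform: lazily describes a new dataframe, nothing is executed yet
// MAGIC val warmReadings = schemaDF.filter(col("reading.temperature") > 70)
// MAGIC
// MAGIC // action: triggers the actual read, parse, and filter work
// MAGIC warmReadings.count()
// MAGIC ```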
111 |
112 | // COMMAND ----------
113 |
114 | display(schemaDF)
115 |
116 | // COMMAND ----------
117 |
118 | // MAGIC %md
119 | // MAGIC Note: All of the available data types are documented [here](https://spark.apache.org/docs/2.3.1/api/java/org/apache/spark/sql/types/DataType.html). In the above example, the temperature value is set to `IntegerType`, which is a subclass of `NumericType`.
120 |
121 | // COMMAND ----------
122 |
123 | // MAGIC %md
124 | // MAGIC # Set up temporary table, for querying
125 |
126 | // COMMAND ----------
127 |
128 | schemaDF.createOrReplaceTempView("weatherdata")
129 |
130 | // COMMAND ----------
131 |
132 | // MAGIC %md
133 | // MAGIC # Reading: From table
134 | // MAGIC We should now have a temporary table filled with our sample weather data from the JSON files.
135 |
136 | // COMMAND ----------
137 |
138 | // Observe the data, as parsed into separate columns:
139 | spark.sql("SELECT reading.* from weatherdata").show(truncate=false)
140 |
141 |
142 | // COMMAND ----------
143 |
144 | // MAGIC %md
145 | // MAGIC
146 | // MAGIC Note that you can also use a `%sql` cell, as shown in the next example:
147 |
148 | // COMMAND ----------
149 |
150 | // MAGIC %sql
151 | // MAGIC SELECT reading.timestamp, reading.temperature
152 | // MAGIC from weatherdata
153 | // MAGIC where reading.zipcode=22334
154 | // MAGIC and reading.temperature > 65
155 | // MAGIC order by reading.temperature
156 |
157 | // COMMAND ----------
158 |
159 | // MAGIC %md
160 | // MAGIC You can also use traditional SQL aggregations such as `AVG` and `COUNT`:
161 |
162 | // COMMAND ----------
163 |
164 | // MAGIC %sql
165 | // MAGIC SELECT reading.zipcode,avg(reading.temperature) as AverageTemperature,count(reading.temperature) as SampleCount
166 | // MAGIC from weatherdata
167 | // MAGIC group by reading.zipcode
168 |
169 | // COMMAND ----------
170 |
171 | // MAGIC %md
172 | // MAGIC # Alternative: Use SQL instead of Scala
173 | // MAGIC
174 | // MAGIC This example creates a temporary table by reading directly from blob storage.
175 |
176 | // COMMAND ----------
177 |
178 | // MAGIC %sql
179 | // MAGIC DROP TABLE IF EXISTS sqlrawweatherdata;
180 | // MAGIC CREATE TEMPORARY TABLE sqlrawweatherdata
181 | // MAGIC USING json
182 | // MAGIC OPTIONS (path "wasbs://<container-name>@<storage-account-name>.blob.core.windows.net/", mode "FAILFAST");
183 |
184 | // COMMAND ----------
185 |
186 | // MAGIC %md
187 | // MAGIC Now that the table has been created, we can query it. Note that we will only have a `body` since we don't do any parsing of the incoming content. And since that content was a JSON-formatted document, that's exactly what we see here.
188 |
189 | // COMMAND ----------
190 |
191 | // MAGIC %sql
192 | // MAGIC SELECT * FROM sqlrawweatherdata;
193 |
194 | // COMMAND ----------
195 |
196 | // MAGIC %md
197 | // MAGIC Here is a temporary view being created on top of that table, but this time we will apply a schema.
198 |
199 | // COMMAND ----------
200 |
201 | // MAGIC %sql
202 | // MAGIC DROP VIEW IF EXISTS sqlweatherdata;
203 | // MAGIC CREATE TEMPORARY VIEW sqlweatherdata AS
204 | // MAGIC SELECT get_json_object(body,'$.temperature') AS temperature,
205 | // MAGIC get_json_object(body,'$.zipcode') AS zipcode,
206 | // MAGIC get_json_object(body,'$.timestamp') AS timestamp
207 | // MAGIC FROM sqlrawweatherdata;
208 |
209 | // COMMAND ----------
210 |
211 | // MAGIC %md
212 | // MAGIC Now if we query, we will see each individual property as a column.
213 |
214 | // COMMAND ----------
215 |
216 | // MAGIC %sql
217 | // MAGIC SELECT * FROM sqlweatherdata
218 |
219 | // COMMAND ----------
220 |
221 | // MAGIC %md
222 | // MAGIC # Writing to storage
223 | // MAGIC
224 | // MAGIC Let's say we want to write our incoming data to storage instead of a temporary table. In this example, we are reading sample data from blob storage, but in a real-world application, thousands (millions?) of weather data points would arrive via Event Hubs or IoT Hub, and we'd want to store them for later processing.
225 | // MAGIC
226 | // MAGIC When storing, we can optionally partition data by a given set of properties. In this example, we will add an additional column, `hour`, that we can include in a partitioning scheme (zipcode + day + hour). In a real-world scenario, you would likely partition by something like year, month, day, and optionally hour, and store more than just the temperature reading (maybe barometric pressure, precipitation, humidity, etc).
227 |
228 | // COMMAND ----------
229 |
230 | // Grab needed columns for partitioning. We'll parse down to hour of day within zipcode, as a simple example.
231 | // This effectively grabs 3 columns, creates an additional parsed column called "hour", and then selects
232 | // all columns (including the data we want, along with extra parsed properties for partitioning purposes)
233 | val partitionDF = schemaDF
234 | .select("reading.temperature", "reading.timestamp", "reading.zipcode")
235 | .withColumn("hour", hour(col("timestamp").cast("timestamp"))) // extracting hour from the timestamp column, into a new "hour" column
236 | .select("zipcode", "hour", "temperature") // our final set of columns to work with
237 |
238 | // COMMAND ----------
239 |
240 | // MAGIC %md
241 | // MAGIC Our new dataframe (`partitionDF`) has been enhanced to contain an `hour` column:
242 |
243 | // COMMAND ----------
244 |
245 | display(partitionDF)
246 |
247 | // COMMAND ----------
248 |
249 | // MAGIC %md
250 | // MAGIC At this point, we can write our data to blob storage. First, a simple write, without partitioning:
251 |
252 | // COMMAND ----------
253 |
254 | partitionDF.write
255 | .option("header","true")
256 | .mode("overwrite")
257 | .option("delimiter",",")
258 | .csv("wasbs://<container-name>@<storage-account-name>.blob.core.windows.net/alldata")
259 |
260 | // COMMAND ----------
261 |
262 | // MAGIC %md
263 | // MAGIC Now let's write in a partitioned way:
264 |
265 | // COMMAND ----------
266 |
267 | partitionDF.write
268 | .option("header","true")
269 | .mode("overwrite")
270 | .option("delimiter",",")
271 | .partitionBy("zipcode","hour")
272 | .csv("wasbs://<container-name>@<storage-account-name>.blob.core.windows.net/partitiondata")
273 |
274 | // COMMAND ----------
275 |
276 | // MAGIC %md
277 | // MAGIC If you now browse your storage account, you'd find several folders under the data output folder, each representing a specific zip code. Under these, you'll find additional folders for each hour. You can download and view any of these files.
278 |
279 | // COMMAND ----------
280 |
281 | // MAGIC %fs ls "wasbs://<container-name>@<storage-account-name>.blob.core.windows.net/partitiondata"
282 |
283 | // COMMAND ----------
284 |
285 | // MAGIC %fs ls "wasbs://<container-name>@<storage-account-name>.blob.core.windows.net/partitiondata/zipcode=12345"
286 |
287 | // COMMAND ----------
288 |
289 | // MAGIC %fs ls "wasbs://<container-name>@<storage-account-name>.blob.core.windows.net/partitiondata/zipcode=12345/hour=1"
290 |
--------------------------------------------------------------------------------
/notebooks/Shared/tutorials/IntroToDataSources.scala:
--------------------------------------------------------------------------------
1 | // Databricks notebook source
2 | // MAGIC %md
3 | // MAGIC # Working with Storage
4 | // MAGIC
5 | // MAGIC Spark has the ability to read content not only from streaming sources (such as Azure's Event Hubs and IoT Hub), but also from files. Out of the box, Spark supports several file formats, such as `csv`, `json`, `avro`, and `parquet`. Spark also provides the ability for you to work with custom formats.
6 | // MAGIC
7 | // MAGIC ## Methods for reading and writing
8 | // MAGIC
9 | // MAGIC Spark provides two general sets of reading & writing methods:
10 | // MAGIC - `read()`, `write()`, and `save()` - you will use these with static content
11 | // MAGIC - `readStream()`, `writeStream()`, and `start()` - you will use these with streaming content
12 | // MAGIC
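// MAGIC As a quick, hypothetical sketch of the two families (the file paths and the `incomingDF` streaming dataframe are placeholders, not part of this tutorial's data):
// MAGIC
// MAGIC ```
// MAGIC // static: read a CSV file, then write the result out as parquet
// MAGIC val staticDF = spark.read.csv("/tmp/input.csv")
// MAGIC staticDF.write.format("parquet").save("/tmp/output")
// MAGIC
// MAGIC // streaming: continuously write an incoming stream (incomingDF is a placeholder); start() sets it in motion
// MAGIC val query = incomingDF.writeStream.format("memory").queryName("streamtable").start()
// MAGIC ```
// MAGIC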
13 | // MAGIC ## Connecting to Azure
14 | // MAGIC
15 | // MAGIC When working with content in Azure, you'll first need to configure your Spark session to have a properly-authenticated connection to Azure blob storage. Databricks has fully documented the process, [here](https://docs.databricks.com/spark/latest/data-sources/azure/azure-storage.html), including details about mounting a blob container as a file system mount.
16 | // MAGIC
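// MAGIC For reference, a mount looks roughly like this (a hedged sketch; the container, account, mount point, and key are placeholders, and the exact options are described in the Databricks docs linked above):
// MAGIC
// MAGIC ```
// MAGIC // mount a blob container so it appears under /mnt/mydata in the Databricks file system
// MAGIC dbutils.fs.mount(
// MAGIC   source = "wasbs://{YOUR CONTAINER NAME}@{YOUR STORAGE ACCOUNT NAME}.blob.core.windows.net",
// MAGIC   mountPoint = "/mnt/mydata",
// MAGIC   extraConfigs = Map("fs.azure.account.key.{YOUR STORAGE ACCOUNT NAME}.blob.core.windows.net" -> "{YOUR STORAGE ACCOUNT ACCESS KEY}"))
// MAGIC ```
// MAGIC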
17 | // MAGIC Note that, in the Databricks example, they show how to read a parquet file (`spark.read.parquet()`). Just remember that this is one of many built-in formats, and there is no dependency between Azure and a specific file format.
18 | // MAGIC
19 | // MAGIC If you are working with a Storage account that is in your subscription, then you'll have access to both the account name and account key. With these two parameters, you may configure Azure storage with those two configuration elements:
20 | // MAGIC
21 | // MAGIC ```
22 | // MAGIC spark.conf.set(
23 | // MAGIC "fs.azure.account.key.{YOUR STORAGE ACCOUNT NAME}.blob.core.windows.net",
24 | // MAGIC "{YOUR STORAGE ACCOUNT ACCESS KEY}")
25 | // MAGIC ```
26 | // MAGIC
27 | // MAGIC However: If someone else is granting you access to a given container (or if you don't want to embed an entire storage account's key within your app), you'll need to use a Shared Access Signature, which is a key generated for granting access to a given blob or container. To use a SAS, the call is slightly different:
28 | // MAGIC
29 | // MAGIC ```
30 | // MAGIC spark.conf.set(
31 | // MAGIC "fs.azure.sas.{YOUR CONTAINER NAME}.{YOUR STORAGE ACCOUNT NAME}.blob.core.windows.net",
32 | // MAGIC "{COMPLETE QUERY STRING OF YOUR SAS FOR THE CONTAINER}")
33 | // MAGIC ```
34 | // MAGIC
35 | // MAGIC At this point, you're all set, and can read and write files to Azure blob storage. For example:
36 | // MAGIC
37 | // MAGIC ```
38 | // MAGIC val df = spark.read.json("wasbs://{YOUR CONTAINER NAME}@{YOUR STORAGE ACCOUNT NAME}.blob.core.windows.net/{YOUR DIRECTORY NAME}/yourfile.json")
39 | // MAGIC ```
40 | // MAGIC
41 | // MAGIC ## Additional resources
42 | // MAGIC
43 | // MAGIC - [Spark data sources](https://docs.databricks.com/spark/latest/data-sources/index.html)
44 |
45 | // COMMAND ----------
46 |
47 | // MAGIC %md
48 | // MAGIC ## Moving data from CSV to SQL Azure
49 | // MAGIC In the next section you're going to extract data from a CSV file stored in Azure Blob storage, run some basic queries using Spark SQL, and then save the loaded data into an Azure SQL database.
50 |
51 | // COMMAND ----------
52 |
53 | // MAGIC %md
54 | // MAGIC Configure Spark to access the existing Azure Blob storage account where the CSV file that we want to import is stored.
55 |
56 | // COMMAND ----------
57 |
58 | spark.conf.set("fs.azure.account.key.openhackspark.blob.core.windows.net", "xlkvzaPoN5MQvYgT/Yg70s6sEw2KBkrLpiqhbrR9IhHC8gbvP41MeMGjuljPpsAjvCzUn3MIjSaQ/w8oXDoroQ==")
59 |
60 | // COMMAND ----------
61 |
62 | // MAGIC %md
63 | // MAGIC In order to manipulate data using Spark SQL, specialized functions and types need to be imported.
64 |
65 | // COMMAND ----------
66 |
67 | import org.apache.spark.sql.functions._
68 | import org.apache.spark.sql.types._
69 |
70 | // COMMAND ----------
71 |
72 | // MAGIC %md
73 | // MAGIC Create a schema for the CSV file so that data can be manipulated more easily and also checked for inconsistencies. The approach of applying a schema *after* the data has been loaded somewhere is called **schema-on-read**, as opposed to the **schema-on-write** approach. **Schema-on-write** is more suitable when you know the shape of your data in advance and want to make sure that only data compliant with the schema is loaded.
74 | // MAGIC In an IoT scenario, **schema-on-read** is usually preferred, since it gives more flexibility and favors the idea of storing the data even if we don't yet know how to deal with it. Think, for example, of the case of adding a shiny new sensor to a set of existing ones. The new sensor may return additional data that you don't want to miss, even if your application is not yet ready to deal with it, but will be in the future.
75 |
76 | // COMMAND ----------
77 |
78 | val DecimalType = DataTypes.createDecimalType(15, 10)
79 |
80 | val schema = StructType(
81 | StructField("SepalLength", DecimalType, nullable = false) ::
82 | StructField("SepalWidth", DecimalType, nullable = false) ::
83 | StructField("PetalLength", DecimalType, nullable = false) ::
84 | StructField("PetalWidth", DecimalType, nullable = false) ::
85 | StructField("Class", StringType, nullable = false) ::
86 | Nil
87 | )
88 |
89 | // COMMAND ----------
90 |
91 | // MAGIC %md
92 | // MAGIC Read the file. The file is the famous Iris dataset, taken from https://archive.ics.uci.edu/ml/datasets/Iris. If you want to start exploring machine learning, this is a great dataset to get started with.
93 |
94 | // COMMAND ----------
95 |
96 | // file originally from
97 | // https://archive.ics.uci.edu/ml/datasets/iris
98 | val irisDF = sqlContext.read.schema(schema).format("csv").load("wasb://sample-data@openhackspark.blob.core.windows.net/iris.data")
99 |
100 | // COMMAND ----------
101 |
102 | // MAGIC %md
103 | // MAGIC Show the first 10 rows using the *take* method on the created DataFrame.
104 |
105 | // COMMAND ----------
106 |
107 | // Make sure we actually read something
108 | irisDF.take(10)
109 |
110 | // COMMAND ----------
111 |
112 | // MAGIC %md
113 | // MAGIC Create a temporary view so that we can manipulate the data using standard SQL commands, which will make data manipulation much easier.
114 |
115 | // COMMAND ----------
116 |
117 | irisDF.createOrReplaceTempView("iris")
118 |
119 | // COMMAND ----------
120 |
121 | // MAGIC %md
122 | // MAGIC Execute a very simple SQL query on the created view. Spark SQL supports ANSI SQL:2003, which allows really complex data manipulation, perfect for data science needs (https://en.wikipedia.org/wiki/SQL:2003).
123 |
124 | // COMMAND ----------
125 |
126 | // MAGIC %sql
127 | // MAGIC SELECT * FROM iris LIMIT 10
128 |
129 | // COMMAND ----------
130 |
131 | // MAGIC %md
132 | // MAGIC Note that you can also run your SQL command through `spark`, as the next example shows.
133 |
134 | // COMMAND ----------
135 |
136 | spark.sql("select * from iris limit 10").show(truncate=false)
137 |
138 | // COMMAND ----------
139 |
140 | // MAGIC %md
141 | // MAGIC Connect to an Azure SQL database using the JDBC driver.
142 | // MAGIC Make sure you create your own Azure SQL database (https://docs.microsoft.com/en-us/azure/sql-database/sql-database-get-started-portal), then get the host name, database name, login, and password, and use them in the following code.
143 |
144 | // COMMAND ----------
145 |
146 | val jdbcHostname = "<your-server-name>.database.windows.net"
147 | val jdbcPort = 1433
148 | val jdbcDatabase = ""
149 |
150 | // Create the JDBC URL without passing in the user and password parameters.
151 | val jdbcUrl = s"jdbc:sqlserver://${jdbcHostname}:${jdbcPort};database=${jdbcDatabase}"
152 |
153 | // Create a Properties() object to hold the parameters.
154 | import java.util.Properties
155 | val connectionProperties = new java.util.Properties()
156 | val jdbcUsername = ""
157 | val jdbcPassword = ""
158 | connectionProperties.put("user", s"${jdbcUsername}")
159 | connectionProperties.put("password", s"${jdbcPassword}")
160 |
161 | // Set JDBC Driver
162 | val driverClass = "com.microsoft.sqlserver.jdbc.SQLServerDriver"
163 | connectionProperties.setProperty("Driver", driverClass)
164 |
165 | // COMMAND ----------
166 |
167 | // MAGIC %md
168 | // MAGIC Execute a command to check that connection is working properly
169 |
170 | // COMMAND ----------
171 |
172 | // Let's check if connection with Azure SQL is up and running
173 | // (https://docs.azuredatabricks.net/spark/latest/data-sources/sql-databases.html#push-down-a-query-to-the-database-engine)
174 | val serverName = spark.read.jdbc(jdbcUrl, "(select @@servername as ServerName) t", connectionProperties)
175 |
176 | // COMMAND ----------
177 |
178 | // MAGIC %md
179 | // MAGIC View the result: it should be the name of the Azure SQL server you are connected to
180 |
181 | // COMMAND ----------
182 |
183 | display(serverName)
184 |
185 | // COMMAND ----------
186 |
187 | // MAGIC %md
188 | // MAGIC Copy the Iris Spark table content to Azure SQL. Depending on the specified *mode* option, the target table can be created automatically or not.
189 | // MAGIC - https://docs.azuredatabricks.net/spark/latest/data-sources/sql-databases.html#write-data-to-jdbc
190 | // MAGIC - https://spark.apache.org/docs/2.3.0/api/scala/index.html#org.apache.spark.sql.DataFrameWriter
191 |
192 | // COMMAND ----------
193 |
194 | import org.apache.spark.sql.SaveMode
195 |
196 | // Drop existing table if needed, create a new table and fill it
197 | spark.sql("select * from iris")
198 | .write
199 | .mode(SaveMode.Overwrite)
200 | .jdbc(jdbcUrl, "iris", connectionProperties)
201 |
202 | // COMMAND ----------
203 |
204 | // MAGIC %md
205 | // MAGIC Done! Let's check the table content by reading the table back from Azure SQL.
206 |
207 | // COMMAND ----------
208 |
209 | spark.read.jdbc(jdbcUrl, "dbo.Iris", connectionProperties).show()
--------------------------------------------------------------------------------
/notebooks/Shared/tutorials/IntroToEventHubs.scala:
--------------------------------------------------------------------------------
1 | // Databricks notebook source
2 | // MAGIC %md # Getting started with Event Hubs + Spark
3 | // MAGIC
4 | // MAGIC This notebook helps get you set up with various "plumbing" for Event Hubs, Blob storage, and
5 | // MAGIC Dataframes. And then you'll be able to create your own queries against data streaming through the Event Hubs endpoint.
6 | // MAGIC
7 | // MAGIC Note: This notebook is written in Scala, but you can also use Python or R for your own projects.
8 | // MAGIC
9 | // MAGIC Also note: The basics of Spark are all documented online, [here](https://spark.apache.org/docs/latest/), including full programming guides and API docs.
10 |
11 | // COMMAND ----------
12 |
13 | // MAGIC %md
14 | // MAGIC
15 | // MAGIC First, we'll set up Event Hubs (the setup is the same for IoT Hub). To do this, we first need to:
16 | // MAGIC
17 | // MAGIC - ensure that the Event Hubs SDK has been added as a library, and attached to a running cluster
18 | // MAGIC - add the required import statements (the equivalent of C#'s `using` statement)
19 | // MAGIC
20 | // MAGIC ## Spark Connector SDK
21 | // MAGIC The Spark Connector SDK may be found [here](https://github.com/Azure/azure-event-hubs-spark). But there's a much easier way to install the correct driver, if you know its Maven coordinates.
22 | // MAGIC Note: Maven is a dependency/build management tool for Java, similar to NuGet for .NET and npm for Node.js. Below are the instructions for installing the correct SDK, based on its Maven coordinates.
23 | // MAGIC
24 | // MAGIC
25 | // MAGIC ### Selecting and initializing the correct driver
26 | // MAGIC
27 | // MAGIC It's important to choose the correct Event Hubs SDK, depending on which version of Spark you're working with.
28 | // MAGIC
29 | // MAGIC For Databricks, these are the Maven coordinates for the Event Hubs SDK for Databricks:
30 | // MAGIC
31 | // MAGIC - Cluster v4.2 and above (Scala 2.11+): `com.microsoft.azure:azure-eventhubs-spark_2.11:2.3.4`
32 | // MAGIC
33 | // MAGIC Note: The driver version is updated periodically. To use the latest release, you may choose to search Maven for the most recent version.
34 | // MAGIC
35 | // MAGIC To install the SDK in Databricks, traverse to the `Shared` folder (or your own personal folder) and select `Create Library`:
36 | // MAGIC
37 | // MAGIC 
38 | // MAGIC
39 | // MAGIC Change the source to "Maven Coordinate". Then, to search for the latest driver version, choose "Search Spark Packages and Maven Central":
40 | // MAGIC
41 | // MAGIC 
42 | // MAGIC
43 | // MAGIC In the upper-right, the default set of packages is "Spark" - change this to "Maven Central" and type "`eventhubs`" in the Search Packages box. This should present you with a list of packages. Choose the one with Group Id `com.microsoft.azure` and Artifact Id `azure-eventhubs-spark_2.11`. Expand the Releases dropdown and choose the latest (version 2.3.4 currently).
44 | // MAGIC
45 | // MAGIC
46 | // MAGIC 
47 | // MAGIC
48 | // MAGIC Click the Select button on the far-right, which will return you to the Import form, with all details filled in. Click Create Library.
49 | // MAGIC
50 | // MAGIC 
51 | // MAGIC
52 | // MAGIC You must attach the SDK to a cluster. You will be shown a list of your clusters. Choose whichever cluster(s) you are using, and select the checkbox. You may also choose "Attach automatically to all clusters."
53 | // MAGIC
54 | // MAGIC 
55 | // MAGIC
56 | // MAGIC At this point, your Event Hubs Spark library is ready to use.
57 | // MAGIC
58 | // MAGIC ## Imports
59 | // MAGIC Next, we'll define some important import statements, required for the Spark Connector.
60 | // MAGIC
61 | // MAGIC ### A word about cells
62 | // MAGIC Notice that these imports are defined in their own *cell*. Cells are similar to functions or methods, in that they are an execution block: if you run a cell, all the instructions in the cell are run.
63 | // MAGIC
64 | // MAGIC A notebook may have many cells. They all share the same variable scope. That is, if you define variable `foo` in one cell, and run that cell, `foo` is now a valid variable that may be accessed in other cells.
65 | // MAGIC
66 | // MAGIC Likewise, once you define imports and run the cell with the definition of those imports, you may now run code in any cell that has a dependency on those imports.
67 | // MAGIC
68 | // MAGIC Bonus: Once you run a cell defining your imports, you don't have to run that cell again, until your cluster is restarted.
69 | // MAGIC
70 | // MAGIC For a bit more info on cells, take a look at the `IntroToNotebooks` notebook in this folder.
71 |
72 | // COMMAND ----------
73 |
74 | import org.apache.spark.sql._
75 | import org.apache.spark.sql.types._
76 | import org.apache.spark.sql.functions._
77 | import org.apache.spark.sql.eventhubs._
78 |
79 | // COMMAND ----------
80 |
81 | // MAGIC %md
82 | // MAGIC # Setting Up Event Hubs / IoT Hub connection
83 | // MAGIC Ok, we have our imports. Now, let's set up the Event Hubs or IoT Hub connection. You'll need all of your Event Hub (or IoT Hub) settings for this, from the Azure portal.
84 | // MAGIC
85 | // MAGIC One setting you might not have configured is the `consumerGroup`. Each Event Hubs endpoint may have multiple consumer groups, with a default consumer group defined when the endpoint is created. You'll want to create your own consumer group, as this gives you your own independent view into the incoming data stream, which does not conflict with others who might also be reading from the same stream. If you haven't done so, please create a new consumer group for yourself.
86 | // MAGIC
87 | // MAGIC Here is an example of where you'd find the Event Hubs compatible connection string for an IoT Hub, along with Event Hubs name and Consumer Group:
88 | // MAGIC
89 | // MAGIC 
90 | // MAGIC
91 | // MAGIC Now, using these properties, set up your connection below, replacing the `<...>` placeholders with your real setting values (without the `<>` brackets, of course).
92 |
93 | // COMMAND ----------
94 |
95 | // Modify to include your event hubs parameters here
96 | // Note: This code works only with the latest Event Hubs driver,
97 | // which is supported by both Databricks v3.5 & v4.0 and HDInsight v3.5
98 |
99 | import org.apache.spark.eventhubs._
100 |
101 | val iotConnString = "<Event Hubs-compatible connection string>"
102 |
103 | val ehName = "<event hub name>"
104 |
105 | val consumerGroup = "<consumer group name>"
106 |
107 | // Build connection string with the above information
108 | val connectionString = ConnectionStringBuilder(iotConnString)
109 | .setEventHubName(ehName)
110 | .build
111 |
112 | // this sets up our event hubs configuration, including consumer group
113 | val ehConf = EventHubsConf(connectionString)
114 | .setConsumerGroup(consumerGroup)
115 |
116 | // COMMAND ----------
117 |
118 | // MAGIC %md
119 | // MAGIC # Connecting to Event Hubs
120 | // MAGIC Ok, now we need to wire up a dataframe to Event Hubs. If you haven't worked with Dataframes before: for the purposes of this exercise, just imagine a very large database table that allows operations to be partitioned and performed in parallel, with data that could either be static or streaming in from a live source.
121 | // MAGIC
122 | // MAGIC For this simple example, we are using Event Hubs as the streaming source of our Dataframe, and taking advantage of the `readStream` function to read directly from Event Hubs. `readStream` is similar to a file object's `read` method that you might have seen in other languages.
123 | // MAGIC
124 | // MAGIC It is important to understand the difference between the `read` function and `readStream`. Simply stated:
125 | // MAGIC `read` => For reading static data or data in batches
126 | // MAGIC `readStream` => For reading streaming data
127 | // MAGIC
128 | // MAGIC **See also:** [reading data from event hubs](https://github.com/Azure/azure-event-hubs-spark/blob/master/docs/structured-streaming-eventhubs-integration.md#reading-data-from-event-hubs)
129 |
130 | // COMMAND ----------
131 |
132 | // Create a data frame representing the Event Hubs incoming stream
133 | val eventhubsDF = spark
134 | .readStream
135 | .format("eventhubs")
136 | .options(ehConf.toMap)
137 | .load()
138 |
139 | // COMMAND ----------
140 |
141 | // MAGIC %md
142 | // MAGIC # Extracting data from Event Hubs
143 | // MAGIC
144 | // MAGIC Each message coming from Event Hubs has the following schema:
145 | // MAGIC
146 | // MAGIC | Column | Type |
147 | // MAGIC |----------|----------|
148 | // MAGIC | body | binary |
149 | // MAGIC | offset | string |
150 | // MAGIC | sequenceNumber | long |
151 | // MAGIC | enqueuedTime | timestamp |
152 | // MAGIC | publisher | string |
153 | // MAGIC | partitionKey | string |
154 | // MAGIC
155 | // MAGIC For our purposes, we only need `body`. The issue is, `body` is transmitted as binary data by Event Hubs, by default. So, we will do a cast to convert this data to a string.
156 |
157 | // COMMAND ----------
158 |
159 | // create a new dataframe with decoded body as string
160 | val stringbodyDF = eventhubsDF
161 | .selectExpr("CAST(body as STRING)")
162 |
163 | // COMMAND ----------
164 |
165 | // MAGIC %md
166 | // MAGIC # Writing: To memory
167 | // MAGIC First thing we'll want to do is write our streaming data *somewhere*, so that we can query a bit of it and see what it looks like. From a dev/test standpoint, the easiest way to get started is to write to an in-memory table.
168 | // MAGIC **see also:** [Structured streaming guide: output sinks](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#output-sinks)
169 |
170 | // COMMAND ----------
171 |
172 | // Set up an in-memory table.
173 | // Note: the moment `start()` is called, everything is set into motion, and data will
174 | // begin streaming into our new in-memory table.
175 |
176 | val memoryQuery = stringbodyDF.writeStream
177 | .format("memory")
178 | .queryName("sampledata") // this is the table name to be used for our in-memory table
179 | .start()
180 |
181 | // COMMAND ----------
182 |
183 | // MAGIC %md
184 | // MAGIC # Reading: From memory
185 | // MAGIC We should now have our in-memory table filling with data from our Event Hubs source, which we can now query, to get an idea of what our data looks like.
186 | // MAGIC
187 | // MAGIC At this point, you can experiment with this query in any way you see fit. Here are two ways to display data coming from `spark.sql()`:
188 |
189 | // COMMAND ----------
190 |
191 | // if you omit the 'truncate' parameter, it defaults to 'true',
192 | // which shortens output strings for display purposes
193 | spark.sql("SELECT * from sampledata").show(truncate=false)
194 |
195 | // COMMAND ----------
196 |
197 | display(spark.sql("SELECT * from sampledata"))
198 |
199 | // COMMAND ----------
200 |
201 | // MAGIC %md
202 | // MAGIC # Shutting down in-memory table stream
203 | // MAGIC Since we saved off the stream variable earlier, we can easily shut it down after we're done querying.
204 |
205 | // COMMAND ----------
206 |
207 | memoryQuery.stop()
208 |
209 | // COMMAND ----------
210 |
211 | // MAGIC %md
212 | // MAGIC # Data Sources
213 | // MAGIC
214 | // MAGIC With Spark, you have many options for working with data sources. See the `IntroToDataSources` notebook for more information about data sources.
215 |
--------------------------------------------------------------------------------
/notebooks/Shared/tutorials/IntroToNotebooks.scala:
--------------------------------------------------------------------------------
1 | // Databricks notebook source
2 | // MAGIC %md
3 | // MAGIC # Intro to Notebooks
4 | // MAGIC
5 | // MAGIC This is a simple walkthrough of the basics of Databricks Notebooks. For more details, please see the official Databricks documentation, [here](https://docs.databricks.com/user-guide/notebooks/index.html).
6 | // MAGIC
7 | // MAGIC ## What is a Notebook?
8 | // MAGIC
9 | // MAGIC Notebooks are a simple way to interact with Databricks. They are designed to help you write and run code against your cluster.
10 | // MAGIC
11 | // MAGIC A notebook is comprised of *cells.* Each individual cell is run in its entirety, and has its own output. You can think of a cell as a function or method in common programming languages, in that all of the code within the cell is run as an atomic unit.
12 | // MAGIC
13 | // MAGIC A key thing to note: All cells of a notebook share a common memory space. That is, if you define a variable in one cell, then it's available in other cells.
14 | // MAGIC
15 | // MAGIC Let's look at a small example:
16 |
17 | // COMMAND ----------
18 |
19 | // Here, we define a simple variable
20 | val greeting = "Hello world!"
21 |
22 | // COMMAND ----------
23 |
24 | // MAGIC %md
25 | // MAGIC In the cell above, a string, called `greeting`, is defined. To run this cell, either choose the "Play" arrow button in the top-right of the cell, or use the run-cell keyboard shortcut (see the Keyboard Shortcuts section below) while your cursor is anywhere within the cell. You should then see something similar to:
26 | // MAGIC
27 | // MAGIC `greeting: String = Hello world!`
28 | // MAGIC
29 | // MAGIC Now, in the next cell, we simply print out the greeting. Because all cells share a session, the `greeting` variable should already be defined. Go ahead and run the next cell.
30 |
31 | // COMMAND ----------
32 |
33 | print(greeting)
34 |
35 | // COMMAND ----------
36 |
37 | // MAGIC %md
38 | // MAGIC ## Default programming language
39 | // MAGIC Now: when you first created your notebook, you were required to choose a *language* for the notebook: Python, Scala, SQL or R. This cannot be changed, once the notebook is created. However, for any given cell, you may override the language used within that cell.
40 | // MAGIC
41 | // MAGIC For example: This Notebook's default language is Scala. Let's say you wanted to re-create the demo above with the `greeting` string, but with Python. You can do this, by specifying the desired language at the top of the cell, with a special `%` directive, such as `%python`. The next cell demonstrates this (feel free to run it)
42 |
43 | // COMMAND ----------
44 |
45 | // MAGIC %python
46 | // MAGIC python_greeting = 'Hello world!'
47 | // MAGIC print(python_greeting)
48 |
49 | // COMMAND ----------
50 |
51 | // MAGIC %md
52 | // MAGIC Note: Even though you may mix languages, each language has its own environment and runtime. So, for example, a variable declared in Scala cannot be referenced via Python. If you run the following cell, you should see an error:
53 |
54 | // COMMAND ----------
55 |
56 | // MAGIC %python
57 | // MAGIC print(greeting) # this should fail - variable is defined in scala, not python
58 |
59 | // COMMAND ----------
60 |
61 | // MAGIC %md
62 | // MAGIC ## Using Markdown to provide formatted text like this
63 | // MAGIC If you're creating your own Notebook, and you want to provide formatted text to help document it, you may use the `%md` directive, to specify that a cell contains Markdown. This Notebook makes use of Markdown (including this cell.)
64 |
65 | // COMMAND ----------
66 |
67 | // MAGIC %md
68 | // MAGIC ## More about running code
69 | // MAGIC
70 | // MAGIC Instead of just hitting the "Play" button, you may also choose other options, by using the dropdown next to the "Play" button:
71 | // MAGIC
72 | // MAGIC 
73 | // MAGIC
74 | // MAGIC From a given cell, you can run everything above or below a cell, instead of running just the current cell. Additionally, you might want to run the entire Notebook, in order.
75 | // MAGIC
76 | // MAGIC Sometimes, you may want to get into a "clean state", sort of like clearing your cache (this is recommended when you restart your cluster). You can also clear out all the variables and then run everything. You'll see all of these options at the top of the notebook:
77 | // MAGIC
78 | // MAGIC 
79 |
80 | // COMMAND ----------
81 |
82 | // MAGIC %md
83 | // MAGIC ## Autocomplete & IntelliSense
84 | // MAGIC
85 | // MAGIC While true IntelliSense doesn't exist in a notebook environment, there is autocomplete. To use autocomplete, simply press `Tab` and it will give you all the options to apply to an object.
86 | // MAGIC
87 | // MAGIC For instance, if you press `Tab` after the period `.`, you will get a list of all the functions that can be applied to the val/var.
88 | // MAGIC
89 | // MAGIC 
90 |
91 | // COMMAND ----------
92 |
93 | // MAGIC %md
94 | // MAGIC ## Keyboard Shortcuts
95 | // MAGIC
96 | // MAGIC You don't have to use dropdowns to execute code, or work with Notebooks in general. There is an entire set of keyboard shortcuts (like the run-cell shortcut you may have already used) at your disposal. Click the keyboard icon, in the top-right of the notebook, to view all shortcuts. Here's a snippet of what you'll see when clicking the keyboard icon:
97 | // MAGIC
98 | // MAGIC 
99 |
100 | // COMMAND ----------
101 |
102 | // MAGIC %md
103 | // MAGIC ## Notebooks and Collaboration
104 | // MAGIC If you're working alone, it's fine to store your Notebooks within your user-specific folder in the `Workspace` area of Databricks. However, if you're collaborating with teammates, it might be better to place this notebook in the `Shared` folder, so that every user has access.
105 | // MAGIC
106 | // MAGIC Further: Databricks Notebooks can be linked to version control systems such as GitHub, which helps considerably when trying to track changes. In general, this lets you treat Notebooks the same way you'd treat any other source file.
107 | // MAGIC
108 | // MAGIC For more details on Databricks GitHub integration, take a look at [this article](https://docs.databricks.com/user-guide/notebooks/github-version-control.html).
--------------------------------------------------------------------------------
/notebooks/Shared/tutorials/IntroToSparkComponents.scala:
--------------------------------------------------------------------------------
1 | // Databricks notebook source
2 | // MAGIC %md
3 | // MAGIC # Intro to Spark's data components
4 | // MAGIC
5 | // MAGIC
6 | // MAGIC Spark has an entire infrastructure built out with the express purpose of optimizing data processing on large workloads. If, for example, you were just performing some simple word counts and other actions on some small text files, there are lots of tools (including command-line tools) that could probably handle this type of job without much effort. But, instead, imagine trying to analyze all of Wikipedia for common words and phrases, or the most frequently-referenced sources. Now you can imagine a command-line tool being a bit unwieldy to work with. And instead of a fixed set of data, imagine a live Twitter firehose stream, where you're trying to identify trending hashtags. Suddenly, local tools are not looking like such a good option. This is where Spark comes into play.
7 | // MAGIC
8 | // MAGIC When working with Spark, the first thing you'll likely experience, when going through various tutorials, is the notion of a Dataframe. But before jumping into Dataframes, it's worth knowing about the basic layers of functionality you'll end up working with.
9 | // MAGIC
10 | // MAGIC ## Step 0: the Driver
11 | // MAGIC
12 | // MAGIC Spark is a distributed data processing environment. If you imagine a cluster with a master node and several worker nodes, all available to help process a block or stream of data, the workload gets distributed across those nodes. The component that handles that distribution, and the general job-management interface you'll work with, is the **Driver** (running on the master node). When you execute specific actions (or submit jobs to the cluster), it's the Driver's responsibility to distribute the work across the cluster's worker nodes.
13 | // MAGIC
14 | // MAGIC Within the Spark environment, you'll have pre-defined variables. One such variable is `spark`, which is essentially your Spark session (running on the Driver).
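// MAGIC
// MAGIC As a minimal sketch (assuming a standard Databricks cluster, where `spark` is already defined for you), you can use the session directly:
// MAGIC
// MAGIC ```
// MAGIC println(spark.version)      // the Spark version your cluster is running
// MAGIC val tinyDS = spark.range(5) // a small Dataset with a single `id` column, values 0 through 4
// MAGIC tinyDS.show()
// MAGIC ```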
15 | // MAGIC
16 | // MAGIC ## RDDs
17 | // MAGIC
18 | // MAGIC Spark uses *Resilient Distributed Datasets* (RDDs) to partition data across the nodes in a cluster. Simply put, these are fault-tolerant datasets, designed so that operations can run on them in parallel. They're also built with resiliency in mind: as they progress through their processing tasks, they can checkpoint intermediate results and counters to persistent storage. This way, in case of a disruption (such as a node rebooting), the cluster can recover from a saved checkpoint and continue, without having to restart the entire operation.
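// MAGIC
// MAGIC As a quick sketch (reaching the SparkContext through the pre-defined `spark` session):
// MAGIC
// MAGIC ```
// MAGIC val rdd = spark.sparkContext.parallelize(1 to 100, 4) // distribute the numbers across 4 partitions
// MAGIC val evens = rdd.filter(_ % 2 == 0)                     // a lazy transformation -- nothing runs yet
// MAGIC println(evens.count())                                 // an action -- triggers the parallel work and prints 50
// MAGIC ```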
19 | // MAGIC
20 | // MAGIC While there are many operations and transforms you can perform on an RDD, there isn't much structure to the data. And that is where DataFrames come into play.
21 | // MAGIC
22 | // MAGIC ## Dataframes
23 | // MAGIC
24 | // MAGIC DataFrames are the more modern way of processing data in Spark. A Spark DataFrame can be thought of as a table, similar to a relational data table with columns and rows. In reality there aren't exactly *rows*: the data (static or streaming) is organized into chunks, but presented to you much the way you'd organize data in a database table.
25 | // MAGIC
26 | // MAGIC Where DataFrames shine is with their ability to let you partition, query, group, order, and aggregate content. There are many additional operations available as well: flattening arrays, substituting default values for missing data, eliminating rows with null values, replacing values within an existing column, on and on.
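// MAGIC
// MAGIC Here's a minimal sketch (using made-up in-memory readings, loosely modeled on the weather test data in this repo) showing a few of those operations chained together:
// MAGIC
// MAGIC ```
// MAGIC import org.apache.spark.sql.functions.avg
// MAGIC
// MAGIC // toDF() and $"..." work here because Databricks notebooks pre-import spark.implicits
// MAGIC val readings = Seq(("12345", 54), ("12345", 60), ("22334", 57)).toDF("zipcode", "temperature")
// MAGIC readings
// MAGIC   .groupBy("zipcode")                    // group rows by zip code
// MAGIC   .agg(avg("temperature") as "avgTemp")  // aggregate each group
// MAGIC   .orderBy($"avgTemp".desc)              // order results, highest average first
// MAGIC   .show()
// MAGIC ```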
27 | // MAGIC
28 | // MAGIC
29 | // MAGIC ## SQL
30 | // MAGIC
31 | // MAGIC As it turns out, the Structured Query Language (SQL) has become commonplace. To make sure Spark is at least as capable as the tools that preceded it (such as Hive), it now has a very powerful SQL query engine built in, available to you whenever you're performing queries.
32 | // MAGIC
33 | // MAGIC Just as `spark` is your entry point into the Driver, `spark.sql()` is your way to execute a SQL query. As an example, imagine a `People` file read into a DataFrame. Once that's done, you can create an in-memory temporary view and execute SQL against it:
34 | // MAGIC
35 | // MAGIC ```
36 | // MAGIC peopleDF.createOrReplaceTempView("people") // replaces any existing temp view named 'people'
37 | // MAGIC val adults = spark.sql("SELECT name, age FROM people WHERE age >= 18")
38 | // MAGIC ```
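// MAGIC
// MAGIC Note that `spark.sql()` returns a DataFrame, so you can keep filtering, aggregating, or `display()`-ing the result just like any other DataFrame. In Databricks you can also run the same query interactively in a `%sql` cell.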
39 |
40 | // COMMAND ----------
41 |
42 | // MAGIC %md
43 | // MAGIC ## Learning more
44 | // MAGIC
45 | // MAGIC - For a quick example of using Spark in Databricks, check out this [Word Count Notebook]($./WordcountExample).
46 | // MAGIC - Spark's introductory programming guide is [here](https://spark.apache.org/docs/latest/sql-programming-guide.html#loading-data-programmatically), giving a great overview of working with RDDs, Datasets, and SQL.
47 | // MAGIC - Download the free eBook from Databricks, "A Gentle Introduction to Apache Spark", [here](https://pages.databricks.com/gentle-intro-spark.html)
48 | // MAGIC - Spark has an entire set of programming guides at [spark.apache.org](https://spark.apache.org). A few specific pages that are very helpful:
49 | // MAGIC - [Quick start](https://spark.apache.org/docs/latest/quick-start.html)
50 | // MAGIC - [SQL, Datasets, and DataFrames](https://spark.apache.org/docs/latest/sql-programming-guide.html)
51 | // MAGIC - [Scala programming guide](https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.package)
52 | // MAGIC - [SQL programming guide](https://spark.apache.org/docs/latest/api/sql/index.html)
53 | // MAGIC
54 | // MAGIC
55 | // MAGIC
--------------------------------------------------------------------------------
/notebooks/Shared/tutorials/StartHere.scala:
--------------------------------------------------------------------------------
1 | // Databricks notebook source
2 | // MAGIC %md
3 | // MAGIC # Welcome!
4 | // MAGIC
5 | // MAGIC This set of notebooks provides short introductions to several key concepts for working with Spark and Azure via Databricks. Where applicable, these notebooks also contain links to additional resources.
6 | // MAGIC
7 | // MAGIC
8 | // MAGIC - Databricks notebooks [[link]($./IntroToNotebooks)]
9 | // MAGIC - Event Hubs / IoT Hub Integration with Spark [[link]($./IntroToEventHubs)]
10 | // MAGIC - Parsing incoming JSON data [[link]($./IntroToDataParsing)]
11 | // MAGIC - Spark core concepts: RDDs, Dataframes, SQL [[link]($./IntroToSparkComponents)]
12 | // MAGIC - Data sources: Azure Storage, Azure SQL Database [[link]($./IntroToDataSources)]
13 | // MAGIC
14 | // MAGIC Additionally, [[this notebook]($./WordcountExample)] presents a very simple "word count" example using Spark.
15 |
--------------------------------------------------------------------------------
/notebooks/Shared/tutorials/WordcountExample.scala:
--------------------------------------------------------------------------------
1 | // Databricks notebook source
2 | // MAGIC %md In this example, we take lines of text and split them up into words. Next, we count the number of occurrences of each word in the set using a variety of Spark APIs.
3 |
4 | // COMMAND ----------
5 |
6 | dbutils.fs.put("/textlines","""
7 | Hello hello world
8 | Hello how are you world
9 | """, true)
10 |
11 | // COMMAND ----------
12 |
13 | import org.apache.spark.sql.functions._
14 |
15 | // Load a text file and interpret each line as a java.lang.String
16 | val ds = spark.read.text("/textlines").as[String]
17 | val result = ds
18 |   .flatMap(_.split(" "))                 // Split on whitespace
19 |   .filter(_ != "")                       // Filter out empty words
20 |   .toDF()                                // Convert to DataFrame to perform aggregation / sorting
21 |   .groupBy($"value")                     // Group by each distinct word
22 |   .agg(count("*") as "numOccurrences")   // Count occurrences of each word
23 |   .orderBy($"numOccurrences".desc)       // Show most common words first
24 |
25 | display(result)
--------------------------------------------------------------------------------
/testdata/weatherdata-12345.json:
--------------------------------------------------------------------------------
1 | {"body":"{\"temperature\":54,\"zipcode\":\"12345\",\"timestamp\":\"2018-01-01T00:00:00.000Z\"}"}
2 | {"body":"{\"temperature\":57,\"zipcode\":\"12345\",\"timestamp\":\"2018-01-01T01:00:00.000Z\"}"}
3 | {"body":"{\"temperature\":57,\"zipcode\":\"12345\",\"timestamp\":\"2018-01-01T02:00:00.000Z\"}"}
4 | {"body":"{\"temperature\":58,\"zipcode\":\"12345\",\"timestamp\":\"2018-01-01T03:00:00.000Z\"}"}
5 | {"body":"{\"temperature\":60,\"zipcode\":\"12345\",\"timestamp\":\"2018-01-01T04:00:00.000Z\"}"}
6 | {"body":"{\"temperature\":63,\"zipcode\":\"12345\",\"timestamp\":\"2018-01-01T05:00:00.000Z\"}"}
7 | {"body":"{\"temperature\":63,\"zipcode\":\"12345\",\"timestamp\":\"2018-01-01T06:00:00.000Z\"}"}
8 | {"body":"{\"temperature\":64,\"zipcode\":\"12345\",\"timestamp\":\"2018-01-01T07:00:00.000Z\"}"}
9 | {"body":"{\"temperature\":66,\"zipcode\":\"12345\",\"timestamp\":\"2018-01-01T08:00:00.000Z\"}"}
10 | {"body":"{\"temperature\":67,\"zipcode\":\"12345\",\"timestamp\":\"2018-01-01T09:00:00.000Z\"}"}
11 | {"body":"{\"temperature\":67,\"zipcode\":\"12345\",\"timestamp\":\"2018-01-01T10:00:00.000Z\"}"}
12 | {"body":"{\"temperature\":71,\"zipcode\":\"12345\",\"timestamp\":\"2018-01-01T11:00:00.000Z\"}"}
13 | {"body":"{\"temperature\":73,\"zipcode\":\"12345\",\"timestamp\":\"2018-01-01T12:00:00.000Z\"}"}
--------------------------------------------------------------------------------
/testdata/weatherdata-22334.json:
--------------------------------------------------------------------------------
1 | {"body":"{\"temperature\":54,\"zipcode\":\"22334\",\"timestamp\":\"2018-01-01T00:00:00.000Z\"}"}
2 | {"body":"{\"temperature\":57,\"zipcode\":\"22334\",\"timestamp\":\"2018-01-01T01:00:00.000Z\"}"}
3 | {"body":"{\"temperature\":57,\"zipcode\":\"22334\",\"timestamp\":\"2018-01-01T02:00:00.000Z\"}"}
4 | {"body":"{\"temperature\":58,\"zipcode\":\"22334\",\"timestamp\":\"2018-01-01T03:00:00.000Z\"}"}
5 | {"body":"{\"temperature\":60,\"zipcode\":\"22334\",\"timestamp\":\"2018-01-01T04:00:00.000Z\"}"}
6 | {"body":"{\"temperature\":63,\"zipcode\":\"22334\",\"timestamp\":\"2018-01-01T05:00:00.000Z\"}"}
7 | {"body":"{\"temperature\":63,\"zipcode\":\"22334\",\"timestamp\":\"2018-01-01T06:00:00.000Z\"}"}
8 | {"body":"{\"temperature\":64,\"zipcode\":\"22334\",\"timestamp\":\"2018-01-01T07:00:00.000Z\"}"}
9 | {"body":"{\"temperature\":66,\"zipcode\":\"22334\",\"timestamp\":\"2018-01-01T08:00:00.000Z\"}"}
10 | {"body":"{\"temperature\":67,\"zipcode\":\"22334\",\"timestamp\":\"2018-01-01T09:00:00.000Z\"}"}
11 | {"body":"{\"temperature\":67,\"zipcode\":\"22334\",\"timestamp\":\"2018-01-01T10:00:00.000Z\"}"}
12 | {"body":"{\"temperature\":71,\"zipcode\":\"22334\",\"timestamp\":\"2018-01-01T11:00:00.000Z\"}"}
13 | {"body":"{\"temperature\":73,\"zipcode\":\"22334\",\"timestamp\":\"2018-01-01T12:00:00.000Z\"}"}
--------------------------------------------------------------------------------