├── .gitignore ├── LICENSE ├── README.md ├── notebooks ├── elt-blob-storage-cosmosdb-python.ipynb ├── elt-blob-storage-sqldw-python.ipynb ├── file-operations-python.ipynb ├── tweet-streaming-cosmosdb-python.ipynb └── tweet-streaming-eventhub-python.ipynb └── scripts ├── connect-azure-sqldw.sh ├── create-source-azure-blob-storage.sh ├── install-sql-cli.sh └── setup-cosmosdb-feed.sh /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Yoichi Kawasaki 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # databricks-notebook 2 | 3 | A collection of sample Databricks Spark notebooks (mostly for Azure Databricks). 4 | 5 | 6 | ## Sample Notebooks 7 | 8 | | Notebook | Description | Lang | 9 | | ------------- | ------------- | ----------- | 10 | | [File Operations Sample](notebooks/file-operations-python.ipynb) | Various file operation samples, such as Azure Blob Storage mount & unmount, ls/rm/cp/mv, reading a CSV file, etc. | Python | 11 | | [ELT Sample: Azure Blob Storage - Databricks - CosmosDB](notebooks/elt-blob-storage-cosmosdb-python.ipynb) | In this notebook, you extract data from Azure Blob Storage into a Databricks cluster, run transformations on the data in the Databricks cluster, and then load the transformed data into Azure Cosmos DB | Python | 12 | | [ELT Sample: Azure Blob Storage - Databricks - SQLDW](notebooks/elt-blob-storage-sqldw-python.ipynb) | In this notebook, you extract data from Azure Blob Storage into a Databricks cluster, run transformations on the data in the Databricks cluster, and then load the transformed data into Azure SQL Data Warehouse | Python | 13 | | [Streaming Sample: Cosmos DB ChangeFeed - Databricks](notebooks/tweet-streaming-cosmosdb-python.ipynb) | In this notebook, you read a live stream of tweets stored in Cosmos DB by leveraging Apache Spark to read the Cosmos DB Change Feed, and run transformations on the data in the Databricks cluster | Python | 14 | | [Streaming Sample: Azure Event Hub - Databricks](notebooks/tweet-streaming-eventhub-python.ipynb) | In this notebook, you connect Azure Event Hubs (a hyper-scale data ingestion service) with Azure Databricks to stream data into an Apache Spark cluster in near real time | Python | 15 | 16 | ## Contributing 17 | 18 | Bug reports and pull requests are welcome on GitHub at https://github.com/yokawasa/databricks-notebook 19 | -------------------------------------------------------------------------------- /notebooks/elt-blob-storage-cosmosdb-python.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","source":["# ELT Sample: Azure Blob Storage - Databricks - CosmosDB\nIn this notebook, you extract data from Azure Blob Storage into a Databricks cluster, run transformations on the data in the Databricks cluster, and then load the transformed data into Azure Cosmos DB.\n## Prerequisites\n- Azure Blob Storage Account and Containers\n- Databricks Cluster (Spark)\n- Cosmos DB Spark Connector (azure-cosmosdb-spark)\n - Create a library using Maven coordinates. 
Simply typed in `azure-cosmosdb-spark_2.2.0` in the search box and search it, or create library by simply uploading jar file that can be donwload from marven central repository\n- Azure Cosmos DB Collection\n## Sample data\n- https://github.com/Azure/usql/blob/master/Examples/Samples/Data/json/radiowebsite/small_radio_json.json\n## LINKS\n- https://docs.azuredatabricks.net/spark/latest/data-sources/azure/azure-storage.html\n- https://github.com/Azure/azure-cosmosdb-spark"],"metadata":{}},{"cell_type":"markdown","source":["# Connecting to Azure Blob Storage and access a sample Json file"],"metadata":{}},{"cell_type":"markdown","source":["## Set up an account access key"],"metadata":{}},{"cell_type":"code","source":["# spark.conf.set(\n# \"fs.azure.account.key..blob.core.windows.net\",\n# \"\")\n\nspark.conf.set(\n \"fs.azure.account.key.databrickstore.blob.core.windows.net\",\n \"S1PtMWvUw5If1Z8FMzXAxC7OMw9G5Go8BGCXJ81qpFVYpZ9dpXOnU4zlg0PbldKkbLIbmbv02WoJsgYLGKIfgg==\")"],"metadata":{},"outputs":[],"execution_count":4},{"cell_type":"markdown","source":["Once an account access key or a SAS is set up in your notebook, you can use standard Spark and Databricks APIs to read from the storage account"],"metadata":{}},{"cell_type":"code","source":["#dbutils.fs.ls(\"wasbs://@.blob.core.windows.net/\")\ndbutils.fs.ls(\"wasbs://dbdemo01@databrickstore.blob.core.windows.net\")"],"metadata":{},"outputs":[],"execution_count":6},{"cell_type":"markdown","source":["## Mount a Blob storage container or a folder inside a container"],"metadata":{}},{"cell_type":"code","source":["# Mount a Blob storage container or a folder inside a container\n# dbutils.fs.mount(\n# source = \"wasbs://@.blob.core.windows.net/\",\n# mount_point = \"\",\n# extra_configs = <\"\": \"\">)\n# [note] is a DBFS path and the path must be under /mnt\n\ndbutils.fs.mount(\n source = \"wasbs://dbdemo01@databrickstore.blob.core.windows.net\",\n mount_point = \"/mnt/dbdemo01\",\n extra_configs = {\"fs.azure.account.key.databrickstore.blob.core.windows.net\": \"S1PtMWvUw5If1Z8FMzXAxC7OMw9G5Go8BGCXJ81qpFVYpZ9dpXOnU4zlg0PbldKkbLIbmbv02WoJsgYLGKIfgg==\"})\n"],"metadata":{},"outputs":[],"execution_count":8},{"cell_type":"markdown","source":["## Access files in your container as if they were local files"],"metadata":{}},{"cell_type":"code","source":["# Access files in your container as if they were local files\n# (TEXT) df = spark.read.text(\"/mnt/%s/....\" % )\n# (JSON) df = spark.read.json(\"/mnt/%s/....\" % )\n\ndf = spark.read.json( \"/mnt/%s/small_radio_json.json\" % \"dbdemo01\" )\n\n# display(df)\ndf.show()"],"metadata":{},"outputs":[],"execution_count":10},{"cell_type":"markdown","source":["## Unmount the blob storage (if needed)"],"metadata":{}},{"cell_type":"code","source":["# unmount (if needed)\n# dbutils.fs.unmount(\"\")\n# dbutils.fs.unmount(\"/mnt/dbdemo01\")"],"metadata":{},"outputs":[],"execution_count":12},{"cell_type":"markdown","source":["# Transform data in Azure Databricks"],"metadata":{}},{"cell_type":"markdown","source":["Start by retrieving only the columns firstName, lastName, gender, location, and level from the dataframe you already created."],"metadata":{}},{"cell_type":"code","source":["specificColumnsDf = df.select(\"firstname\", \"lastname\", \"gender\", \"location\", \"level\")\nspecificColumnsDf.show()"],"metadata":{},"outputs":[],"execution_count":15},{"cell_type":"markdown","source":["You can further transform this data to rename the column level to 
subscription_type."],"metadata":{}},{"cell_type":"code","source":["renamedColumnsDF = specificColumnsDf.withColumnRenamed(\"level\", \"subscription_type\")\nrenamedColumnsDF.show()"],"metadata":{},"outputs":[],"execution_count":17},{"cell_type":"markdown","source":["# Load data into Azure Cosmos DB"],"metadata":{}},{"cell_type":"markdown","source":["Write configuration, then write to Cosmos DB from the renamedColumnsDF DataFrame"],"metadata":{}},{"cell_type":"code","source":["#writeConfig = {\n# \"Endpoint\" : \"https://.documents.azure.com:443/\",\n# \"Masterkey\" : \"\",\n# \"Database\" : \"\",\n# \"Collection\" : \"\",\n# \"Upsert\" : \"true\"\n#}\n\n# Write configuration\nwriteConfig = {\n \"Endpoint\" : \"https://dbstreamdemo.documents.azure.com:443/\",\n \"Masterkey\" : \"ekRLXkETPJ93s6XZz4YubZOw1mjSnoO5Bhz1Gk29bVxCbtgtKmiyRz4SogOSxLOGTouXbwlaAHcHOzct4JVwtQ==\",\n \"Database\" : \"etl\",\n \"Collection\" : \"outcol01\",\n \"Upsert\" : \"true\"\n}\n\n# Write to Cosmos DB from the renamedColumnsDF DataFrame\nrenamedColumnsDF.write.format(\"com.microsoft.azure.cosmosdb.spark\").options(**writeConfig).save()"],"metadata":{},"outputs":[],"execution_count":20}],"metadata":{"name":"elt-blob-storage-cosmosdb-python","notebookId":141860019630215},"nbformat":4,"nbformat_minor":0} 2 | -------------------------------------------------------------------------------- /notebooks/elt-blob-storage-sqldw-python.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","source":["# ELT Sample: Azure Blob Stroage - Databricks - SQLDW\nIn this notebook, you extract data from Azure Blob Storage into Databricks cluster, run transformations on the data in Databricks cluster, and then load the transformed data into Azure SQL Data Warehouse.\n\n## prerequisites:\n- Azure Blob Storage Account and Containers\n- Databricks Cluster (Spark)\n- Azure SQL Data Warehouse\n\n## Sample data\n- https://github.com/Azure/usql/blob/master/Examples/Samples/Data/json/radiowebsite/small_radio_json.json\n\n## LINKS\n- https://docs.azuredatabricks.net/spark/latest/data-sources/azure/azure-storage.html\n- https://docs.azuredatabricks.net/spark/latest/data-sources/azure/sql-data-warehouse.html\n- [Quickstart: Create an Azure SQL Data Warehouse](https://docs.microsoft.com/en-us/azure/storage/blobs/storage-quickstart-blobs-portal)"],"metadata":{}},{"cell_type":"markdown","source":["# Connecting to Azure Blob Storage and access a sample Json file"],"metadata":{}},{"cell_type":"markdown","source":["## Set up an account access key"],"metadata":{}},{"cell_type":"code","source":["# Set up an account access key\n# spark.conf.set(\n# \"fs.azure.account.key..blob.core.windows.net\",\n# \"\")\n\nspark.conf.set(\n \"fs.azure.account.key.databrickstore.blob.core.windows.net\",\n \"S1PtMWvUw5If1Z8FMzXAxC7OMw9G5Go8BGCXJ81qpFVYpZ9dpXOnU4zlg0PbldKkbLIbmbv02WoJsgYLGKIfgg==\")"],"metadata":{},"outputs":[],"execution_count":4},{"cell_type":"markdown","source":["Once an account access key or a SAS is set up in your notebook, you can use standard Spark and Databricks APIs to read from the storage account"],"metadata":{}},{"cell_type":"code","source":["# dbutils.fs.ls(\"wasbs://@.blob.core.windows.net/\")\ndbutils.fs.ls(\"wasbs://dbdemo01@databrickstore.blob.core.windows.net\")"],"metadata":{},"outputs":[],"execution_count":6},{"cell_type":"markdown","source":["## Mount a Blob storage container or a folder inside a container"],"metadata":{}},{"cell_type":"code","source":["# mount a 
Blob storage container or a folder inside a container\n# dbutils.fs.mount(\n# source = \"wasbs://@.blob.core.windows.net/\",\n# mount_point = \"\",\n# extra_configs = <\"\": \"\">)\n# [note] is a DBFS path and the path must be under /mnt\n\ndbutils.fs.mount(\n source = \"wasbs://dbdemo01@databrickstore.blob.core.windows.net\",\n mount_point = \"/mnt/dbdemo01\",\n extra_configs = {\"fs.azure.account.key.databrickstore.blob.core.windows.net\": \"S1PtMWvUw5If1Z8FMzXAxC7OMw9G5Go8BGCXJ81qpFVYpZ9dpXOnU4zlg0PbldKkbLIbmbv02WoJsgYLGKIfgg==\"})"],"metadata":{},"outputs":[],"execution_count":8},{"cell_type":"code","source":[""],"metadata":{},"outputs":[],"execution_count":9},{"cell_type":"markdown","source":["## Access files in your container as if they were local files"],"metadata":{}},{"cell_type":"code","source":["# Access files in your container as if they were local files\n# (TEXT) df = spark.read.text(\"/mnt/%s/....\" % )\n# (JSON) df = spark.read.json(\"/mnt/%s/....\" % )\n\ndf = spark.read.json( \"/mnt/%s/small_radio_json.json\" % \"dbdemo01\" )\n\n# display(df)\ndf.show()"],"metadata":{},"outputs":[],"execution_count":11},{"cell_type":"markdown","source":["## Unmount the blob storage (if needed)"],"metadata":{}},{"cell_type":"code","source":["# unmount\n# dbutils.fs.unmount(\"\")\n# dbutils.fs.unmount(\"/mnt/dbdemo01\")"],"metadata":{},"outputs":[],"execution_count":13},{"cell_type":"markdown","source":["# Transform data in Azure Databricks"],"metadata":{}},{"cell_type":"markdown","source":["Start by retrieving only the columns firstName, lastName, gender, location, and level from the dataframe you already created."],"metadata":{}},{"cell_type":"code","source":["specificColumnsDf = df.select(\"firstname\", \"lastname\", \"gender\", \"location\", \"level\")\nspecificColumnsDf.show()"],"metadata":{},"outputs":[],"execution_count":16},{"cell_type":"markdown","source":["You can further transform this data to rename the column level to subscription_type."],"metadata":{}},{"cell_type":"code","source":["renamedColumnsDF = specificColumnsDf.withColumnRenamed(\"level\", \"subscription_type\")\nrenamedColumnsDF.show()"],"metadata":{},"outputs":[],"execution_count":18},{"cell_type":"markdown","source":["# Load data into Azure SQL Data Warehouse"],"metadata":{}},{"cell_type":"code","source":["# Apply some transformations to the data, then use the\n# Data Source API to write the data back to another table in SQL DW.\n\n# [note] the SQL date warehouse connector uses Azure Blob Storage as a temporary storage to upload data between Azure Databricks and Azure SQL Data Warehouse.\n\n## SQL Data Warehouse related settings\ndwTable= \"mytable001\"\ndwDatabase = \"sqldwdemo001\"\ndwServer = \"sqldwdemoserver001\" \ndwUser = \"yoichika\"\ndwPass = \"P@ssw0rd____\"\ndwJdbcPort = \"1433\"\ndwJdbcExtraOptions = \"encrypt=true;trustServerCertificate=true;hostNameInCertificate=*.database.windows.net;loginTimeout=30;\"\nsqlDwUrl = \"jdbc:sqlserver://\" + dwServer + \".database.windows.net:\" + dwJdbcPort + \";database=\" + dwDatabase + \";user=\" + dwUser+\";password=\" + dwPass + \";$dwJdbcExtraOptions\"\nsqlDwUrlSmall = \"jdbc:sqlserver://\" + dwServer + \".database.windows.net:\" + dwJdbcPort + \";database=\" + dwDatabase + \";user=\" + dwUser+\";password=\" + dwPass\n\n\ntempDir = \"wasbs://dbdemo01tmp@databrickstore.blob.core.windows.net/tempDirs\"\n\n#sc._jsc.hadoopConfiguration().set(\n# \"fs.azure.account.key..blob.core.windows.net\",\n# \"\")\nacntInfo = 
\"fs.azure.account.key.databrickstore.blob.core.windows.net\"\nsc._jsc.hadoopConfiguration().set(\n acntInfo, \n \"S1PtMWvUw5If1Z8FMzXAxC7OMw9G5Go8BGCXJ81qpFVYpZ9dpXOnU4zlg0PbldKkbLIbmbv02WoJsgYLGKIfgg==\")\n\n## Loading transformed dataframe (renamedColumnsDF) into SQLDW\nspark.conf.set(\"spark.sql.parquet.writeLegacyFormat\",\"true\")\n\n## This snippet creates a table called 'dwTable' in the SQL database.\n#df.write \\\n# .format(\"com.databricks.spark.sqldw\") \\\n# .option(\"url\", \"jdbc:sqlserver://\") \\\n# .option(\"forward_spark_azure_storage_credentials\", \"true\") \\\n# .option(\"dbtable\", \"my_table_in_dw_copy\") \\\n# .option(\"tempdir\", \"wasbs://@.blob.core.windows.net/\") \\\n# .save()\n\nrenamedColumnsDF.write \\\n .format(\"com.databricks.spark.sqldw\") \\\n .option(\"url\", sqlDwUrlSmall) \\\n .option(\"dbtable\", dwTable) \\\n .option( \"forward_spark_azure_storage_credentials\",\"true\") \\\n .option(\"tempdir\", tempDir) \\\n .mode(\"overwrite\") \\\n .save()"],"metadata":{},"outputs":[],"execution_count":20}],"metadata":{"name":"elt-blob-storage-sqldw-python","notebookId":3679592423530605},"nbformat":4,"nbformat_minor":0} 2 | -------------------------------------------------------------------------------- /notebooks/file-operations-python.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","source":["# File Operations Sample\nVarious file operations sample such as Azure Blob Storage mount & umount, ls/rm/cp/mv, read CSV file, etc"],"metadata":{}},{"cell_type":"markdown","source":["## 1. Direct Connection to Azure Blob Storage"],"metadata":{}},{"cell_type":"markdown","source":["### Configure Azure Blob Storage Connection Key"],"metadata":{}},{"cell_type":"code","source":["spark.conf.set(\n \"fs.azure.account.key.databrickstore.blob.core.windows.net\",\n \"S1PtMWvUw5If1Z8FMzXAxC7OMw9G5Go8BGCXJ81qpFVYpZ9dpXOnU4zlg0PbldKkbLIbmbv02WoJsgYLGKIfgg==\")"],"metadata":{},"outputs":[],"execution_count":4},{"cell_type":"markdown","source":["### Reading Blob files directly"],"metadata":{}},{"cell_type":"code","source":["dbutils.fs.ls(\"wasbs://mountpoint001@databrickstore.blob.core.windows.net\")"],"metadata":{},"outputs":[],"execution_count":6},{"cell_type":"markdown","source":["## 2. 
Mounting Azure Blob Storage container"],"metadata":{}},{"cell_type":"markdown","source":["### Configure to mount Azure Blob Storage container onto local dir"],"metadata":{}},{"cell_type":"code","source":["dbutils.fs.mount(\n source = \"wasbs://sharedlib@databrickstore.blob.core.windows.net\",\n mount_point = \"/mnt/azstorage\",\n extra_configs = {\"fs.azure.account.key.databrickstore.blob.core.windows.net\": \"S1PtMWvUw5If1Z8FMzXAxC7OMw9G5Go8BGCXJ81qpFVYpZ9dpXOnU4zlg0PbldKkbLIbmbv02WoJsgYLGKIfgg==\"})"],"metadata":{},"outputs":[],"execution_count":9},{"cell_type":"markdown","source":["### List files"],"metadata":{}},{"cell_type":"code","source":["dbutils.fs.ls(\"/mnt/azstorage\")"],"metadata":{},"outputs":[],"execution_count":11},{"cell_type":"markdown","source":["### Copy files to local dir"],"metadata":{}},{"cell_type":"code","source":["dbutils.fs.cp(\"/mnt/azstorage/libmecab.so\", \"file:/usr/lib/libmecab.so\")"],"metadata":{},"outputs":[],"execution_count":13},{"cell_type":"markdown","source":["### Remove files in local dir"],"metadata":{}},{"cell_type":"code","source":["dbutils.fs.rm(\"file:/usr/lib/libmecab.so\")"],"metadata":{},"outputs":[],"execution_count":15},{"cell_type":"markdown","source":["### List files in local dir"],"metadata":{}},{"cell_type":"code","source":["dbutils.fs.ls(\"file:/usr/lib/\")"],"metadata":{},"outputs":[],"execution_count":17},{"cell_type":"markdown","source":["### Read json file from mounted dir using Json parser and write them into SQL table"],"metadata":{}},{"cell_type":"code","source":["%sql\nDROP TABLE IF EXISTS radio_sample_data;\nCREATE TABLE radio_sample_data\nUSING json\nOPTIONS (\n path \"/mnt/azstorage/small_radio_json.json\"\n)"],"metadata":{},"outputs":[],"execution_count":19},{"cell_type":"markdown","source":["### select from SQL table"],"metadata":{}},{"cell_type":"code","source":["%sql\nSELECT * from radio_sample_data"],"metadata":{},"outputs":[],"execution_count":21},{"cell_type":"markdown","source":["### Unmount the dir"],"metadata":{}},{"cell_type":"code","source":["# dbutils.fs.unmount(\"/mnt/azstorage\")"],"metadata":{},"outputs":[],"execution_count":23},{"cell_type":"markdown","source":["## 3. 
Read file using CSV parser"],"metadata":{}},{"cell_type":"code","source":["display(dbutils.fs.ls(\"/databricks-datasets\"))"],"metadata":{},"outputs":[],"execution_count":25},{"cell_type":"markdown","source":["### Read CSV file in the Spark CSV datasource with options specifying \n- First line of file is a header\n- Automatically infer the schema of the data"],"metadata":{}},{"cell_type":"code","source":["# Use the Spark CSV datasource with options specifying:\n# - First line of file is a header\n# - Automatically infer the schema of the data\ndata = sqlContext.read.format(\"com.databricks.spark.csv\")\\\n .option(\"header\", \"true\")\\\n .option(\"inferSchema\", \"true\")\\\n .load(\"/databricks-datasets/samples/population-vs-price/data_geo.csv\")\ndata.cache() # Cache data for faster reuse\n# data.count()\ndisplay(data)"],"metadata":{},"outputs":[],"execution_count":27}],"metadata":{"name":"file-operations-python","notebookId":4162529425452146},"nbformat":4,"nbformat_minor":0} 2 | -------------------------------------------------------------------------------- /notebooks/tweet-streaming-cosmosdb-python.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","source":["# Streaming Sample: Cosmos DB ChangeFeed - Databricks\nIn this notebook, you read a live stream of tweets that stored in Cosmos DB by leveraging Apache Spart to read the Cosmos DB's Change Feed, and run transformations on the data in Databricks cluster.\n\n## prerequisites:\n- Databricks Cluster (Spark)\n- Cosmos DB Spark Connector (azure-cosmosdb-spark)\n - Create a library using maven coordinates. Simply typed in `azure-cosmosdb-spark_2.2.0` in the search box and search it, or create library by simply uploading jar file that can be donwload from marven central repository\n- Azure Cosmos DB Collection\n\n## Test Feed Generator\n- https://github.com/tknandu/TwitterCosmosDBFeed\n\n## LINKS\n- [Working with the change feed support in Azure Cosmos DB](https://docs.microsoft.com/en-us/azure/cosmos-db/change-feed)\n- [Twitter with Spark and Azure Cosmos DB Change Feed Sample](https://github.com/Azure/azure-cosmosdb-spark/blob/master/samples/notebooks/Twitter%20with%20Spark%20and%20Azure%20Cosmos%20DB%20Change%20Feed.ipynb)\n- [Stream Processing Changes using Azure Cosmos DB Change Feed and Apache Spark](https://github.com/Azure/azure-cosmosdb-spark/wiki/Stream-Processing-Changes-using-Azure-Cosmos-DB-Change-Feed-and-Apache-Spark)\n- https://github.com/tknandu/TwitterCosmosDBFeed"],"metadata":{}},{"cell_type":"markdown","source":["## Configure Connection to Cosmos DB Change Feed using azure-cosmosdb-spark\nThe parameters below connect to the Cosmos DB Change Feed; for more information, please refer to Change Feed Test Runs."],"metadata":{}},{"cell_type":"code","source":["# Adding variables \nrollingChangeFeed = False\nstartFromTheBeginning = False\nuseNextToken = True \n\ndatabase = \"changefeedsource\"\ncollection = \"tweet_new\"\n\ntweetsConfig = {\n\"Endpoint\" : \"https://dbstreamdemo.documents.azure.com:443/\",\n\"Masterkey\" : \"ekRLXkETPJ93s6XZz4YubZOw1mjSnoO5Bhz1Gk29bVxCbtgtKmiyRz4SogOSxLOGTouXbwlaAHcHOzct4JVwtQ==\",\n#\"Database\" : database,\n#\"Collection\" : collection, \n\"Database\" : \"changefeedsource\",\n\"Collection\" : \"tweet_new\", \n\"ReadChangeFeed\" : \"true\",\n\"ChangeFeedQueryName\" : database + collection + \" \",\n\"ChangeFeedStartFromTheBeginning\" : str(startFromTheBeginning),\n\"ChangeFeedUseNextToken\" : 
str(useNextToken),\n\"RollingChangeFeed\" : str(rollingChangeFeed),\n#\"ChangeFeedCheckpointLocation\" : \"./changefeedcheckpointlocation\",\n\"SamplingRatio\" : \"1.0\"\n}# Adding"],"metadata":{},"outputs":[],"execution_count":3},{"cell_type":"markdown","source":["## Read a DataFrame"],"metadata":{}},{"cell_type":"code","source":["# Read a DataFrame\n# SparkSession available as 'spark'.\ntweets = spark.read.format(\"com.microsoft.azure.cosmosdb.spark\").options(**tweetsConfig).load()\n"],"metadata":{},"outputs":[],"execution_count":5},{"cell_type":"markdown","source":["##Get the number of tweets\nThis provides the count of tweets; it will start off 0 and then continue growing as you re-run the cell below."],"metadata":{}},{"cell_type":"code","source":["# Get the number of tweets\ntweets.count()\n# display(tweets)\n# tweets.printSchema()"],"metadata":{},"outputs":[],"execution_count":7},{"cell_type":"markdown","source":["## Create tweets TempView\nThis way we can run SQL statements within the notebook"],"metadata":{}},{"cell_type":"code","source":["# Create tweets TempView\n# This way we can run SQL statements within the notebook\ntweets.createOrReplaceTempView(\"tweets\")"],"metadata":{},"outputs":[],"execution_count":9},{"cell_type":"code","source":["%sql\nselect count(1) from tweets"],"metadata":{},"outputs":[],"execution_count":10},{"cell_type":"markdown","source":["## Show various attributes of the first 20 tweets"],"metadata":{}},{"cell_type":"code","source":["%sql\nselect \n id,\n created_at,\n user.screen_name,\n user.location,\n text,\n retweet_count,\n entities.hashtags,\n entities.user_mentions,\n favorited,\n source\nfrom tweets\nlimit 20"],"metadata":{},"outputs":[],"execution_count":12},{"cell_type":"markdown","source":["## Determine Top 10 hashtags for the tweets"],"metadata":{}},{"cell_type":"code","source":["%sql\nselect concat(concat((dense_rank() OVER (PARTITION BY 1 ORDER BY tweets DESC)-1), '. 
'), text) as hashtags, tweets\nfrom (\nselect hashtags.text, count(distinct id) as tweets\nfrom (\nselect \n explode(entities.hashtags) as hashtags,\n id\nfrom tweets\n) a\ngroup by hashtags.text\norder by tweets desc\nlimit 10\n) b"],"metadata":{},"outputs":[],"execution_count":14},{"cell_type":"markdown","source":["# [APPENDIX] Connnecting to Cosmos DB using pydocumentdb"],"metadata":{}},{"cell_type":"code","source":["# Import Necessary Libraries\nimport pydocumentdb\nfrom pydocumentdb import document_client\nfrom pydocumentdb import documents\nimport datetime\n\n# Configuring the connection policy (allowing for endpoint discovery)\nconnectionPolicy = documents.ConnectionPolicy()\nconnectionPolicy.EnableEndpointDiscovery \nconnectionPolicy.PreferredLocations = [\"Japan East\", \"Japan West\"]\n\n\n# Set keys to connect to Cosmos DB \nmasterKey = 'b3KPBHQvWTD8prYsQDiHlaM8kDzBholipD1sgshjT60ayDK9WkvRAT0Qywsi5FkcyKsYcvF4iIrUEBBzaZwJKw==' \nhost = 'https://videoanalytics.documents.azure.com:443/'\nclient = document_client.DocumentClient(host, {'masterKey': masterKey}, connectionPolicy)\n\n\n# Configure Database and Collections\ndatabaseId = 'asset'\ncollectionId = 'meta'\n\n# Configurations the Cosmos DB client will use to connect to the database and collection\ndbLink = 'dbs/' + databaseId\ncollLink = dbLink + '/colls/' + collectionId\n\n\n# Set query parameter\n#querystr = \"SELECT c.City FROM c WHERE c.State='WA'\"\nquerystr= \"SELECT * FROM c\"\n# Query documents\nquery = client.QueryDocuments(collLink, querystr, options=None, partition_key=None)\n\n# Query for partitioned collections\n# query = client.QueryDocuments(collLink, query, options= { 'enableCrossPartitionQuery': True }, partition_key=None)\n\n# Push into list `elements`\nelements = list(query)\nprint(elements)"],"metadata":{},"outputs":[],"execution_count":16}],"metadata":{"name":"tweet-streaming-cosmosdb-python","notebookId":141860019630249},"nbformat":4,"nbformat_minor":0} 2 | -------------------------------------------------------------------------------- /notebooks/tweet-streaming-eventhub-python.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","source":["# Streaming Sample: Azure Event Hub - Databricks\n you connect Azure Event Hub (hyper-scale data ingestion service) with Azure Databricks to stream data into an Apache Spark cluster in near real-time\n \n## Pre-requisites\nStructured streaming integration for Azure Event Hubs is ultimately run on the JVM, so you'll need to import the libraries `com.microsoft.azure:azure‐eventhubs‐spark_2.11:2.3.1` from the Maven coordinate. A procedure is (1) create a library using maven coordinates, then (2) simply typed in `\"azure‐eventhubs‐spark_2.11\"` in the search box and search it. 
\n\n## Test Feed Generator\n- https://github.com/yokawasa/TwitterEventHubFeed\n\n## LINKS\n- [Structured Streaming + Event Hubs Integration Guide for PySpark](https://github.com/Azure/azure-event-hubs-spark/blob/master/docs/PySpark/structured-streaming-pyspark.md)\n- [Integrating Apache Spark with Azure Event Hubs](https://docs.microsoft.com/en-us/azure/event-hubs/event-hubs-spark-connector)"],"metadata":{}},{"cell_type":"markdown","source":["# Event Hubs Configuration"],"metadata":{}},{"cell_type":"code","source":["import json\n\n# Connection String\nev_namespace =\"yoichika-eventhub01\"\nev_name =\"tweetshub\"\nev_sas_key_name =\"managepolicy\"\nev_sas_key_val = \"Lg4mzv2qIvnZ+xlam6Gbq/7Whf8ZPMqQN6fRjbqzIhc=\"\n\n\nconn_string=\"Endpoint=sb://{0}.servicebus.windows.net/;EntityPath={1};SharedAccessKeyName={2};SharedAccessKey={3}\".format(ev_namespace, ev_name, ev_sas_key_name, ev_sas_key_val)\n\nehConf = {}\nehConf['eventhubs.connectionString'] = conn_string\n#ehConf['eventhubs.maxEventsPerTrigger'] = 5\n#ehConf['eventhubs.consumerGroup'] = \"$Default\"\n\n# Start from beginning of stream\n#startOffset = \"-1\"\n\n# Create the positions\n#startingEventPosition = {\n# \"offset\": startOffset, \n# \"seqNo\": -1, #not in use\n# \"enqueuedTime\": None, #not in use\n# \"isInclusive\": True\n#}\n\n# Put the positions into the Event Hub config dictionary\n#ehConf[\"eventhubs.startingPosition\"] = json.dumps(startingEventPosition)"],"metadata":{},"outputs":[],"execution_count":3},{"cell_type":"markdown","source":["# Reading Data from Event Hubs"],"metadata":{}},{"cell_type":"code","source":["# Creating an Event Hubs Source for Streaming Queries\ndf = spark.readStream.format(\"eventhubs\").options(**ehConf).load()\n\nreadInStreamBody = df.withColumn(\"body\", df[\"body\"].cast(\"string\"))\ndisplay(readInStreamBody)"],"metadata":{},"outputs":[],"execution_count":5},{"cell_type":"code","source":["# Print Schema\ndf.printSchema()"],"metadata":{},"outputs":[],"execution_count":6},{"cell_type":"code","source":[""],"metadata":{},"outputs":[],"execution_count":7}],"metadata":{"name":"tweet-streaming-eventhub-python","notebookId":141860019630266},"nbformat":4,"nbformat_minor":0} 2 | -------------------------------------------------------------------------------- /scripts/connect-azure-sqldw.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | SERVER_NAME="" 4 | USER_NAME="" 5 | USER_PASSWORD="" 6 | SQLDW_NAME="" 7 | 8 | # mssql -s .database.windows.net -u @ -p -d -e 9 | mssql -s $SERVER_NAME.database.windows.net -u $USER_NAME@$SERVER_NAME -p $USER_PASSWORD -d $SQLDW_NAME -e 10 | -------------------------------------------------------------------------------- /scripts/create-source-azure-blob-storage.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -x -e 3 | 4 | RESOURCE_GROUP="" 5 | NAME="" 6 | 7 | # Create Azure Storage Account for Video Processing Pipeline and Blob Container in the account 8 | az storage account create \ 9 | --name $NAME \ 10 | --resource-group $RESOURCE_GROUP \ 11 | --sku Standard_LRS \ 12 | --kind Storage 13 | 14 | # Get Key 15 | ACCESS_KEY=$(az storage account keys list --account-name $NAME --resource-group $RESOURCE_GROUP --output tsv |head -1 | awk '{print $3}') 16 | 17 | # Create Container 18 | az storage container create \ 19 | --name "uploads" \ 20 | --account-name $NAME \ 21 | --account-key $ACCESS_KEY 22 | 
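The ELT notebooks read `small_radio_json.json` from a Blob container, so after creating the storage account you still need to upload that sample file. Below is a minimal sketch using the Azure CLI, reusing `$NAME` and `$ACCESS_KEY` from the script above; the raw download URL and the container/blob names are assumptions, not part of this repository:

```sh
# Sketch: download the sample data referenced by the notebooks and upload it to Blob Storage.
# The raw URL is assumed from the GitHub link in the notebooks; adjust the container name
# to whatever your notebook mounts (e.g. the "uploads" container created above, or "dbdemo01").
curl -sLO https://raw.githubusercontent.com/Azure/usql/master/Examples/Samples/Data/json/radiowebsite/small_radio_json.json

az storage blob upload \
  --container-name "uploads" \
  --file small_radio_json.json \
  --name small_radio_json.json \
  --account-name $NAME \
  --account-key $ACCESS_KEY
```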
-------------------------------------------------------------------------------- /scripts/install-sql-cli.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # https://www.npmjs.com/package/sql-cli 3 | # Install Node.js first if it is not installed yet, e.g.: 4 | # brew install node 5 | 6 | npm install -g sql-cli 7 | -------------------------------------------------------------------------------- /scripts/setup-cosmosdb-feed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -x -e 3 | 4 | # Clone the Twitter-to-Cosmos DB feed generator and install its Python dependencies: https://github.com/tknandu/TwitterCosmosDBFeed 5 | 6 | git clone https://github.com/tknandu/TwitterCosmosDBFeed.git 7 | pip install tweepy==3.3.0 8 | pip install pyDocumentDB 9 | --------------------------------------------------------------------------------
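Taken together, the helper scripts cover the environment pieces the notebooks rely on. One possible order in which to run them is sketched below; this is only a suggestion, and the placeholder variables at the top of each script must be filled in first:

```sh
# Sketch of a possible setup order; edit the variables inside each script before running.
./scripts/create-source-azure-blob-storage.sh   # storage account + "uploads" container for sample data
./scripts/setup-cosmosdb-feed.sh                # clone the Twitter feed generator and install its Python deps
./scripts/install-sql-cli.sh                    # install sql-cli (mssql) for Azure SQL Data Warehouse access
./scripts/connect-azure-sqldw.sh                # open an interactive mssql session against SQL Data Warehouse
```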