├── .gitignore
├── LICENSE
├── README.md
├── SECURITY.md
└── tutorials
    ├── data-access
    │   ├── 01-weather-to-spark-dataframe.ipynb
    │   └── 02-weather-to-pandas-dataframe.ipynb
    ├── data-join
    │   ├── 01-weather-join-in-spark.ipynb
    │   ├── 02-weather-join-in-pandas.ipynb
    │   ├── 03-nyc-taxi-join-weather-in-spark.ipynb
    │   └── 04-nyc-taxi-join-weather-in-pandas.ipynb
    ├── energy-join
    │   ├── 01-energy-join-weather-in-pandas.ipynb
    │   └── nyc_energy.csv
    └── taxi-automl
        └── 01-tutorial-opendatasets-automl.ipynb
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) Microsoft Corporation. All rights reserved.
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Open Datasets Example Notebooks
2 |
3 | This repository contains example notebooks demonstrating the [Azure Open Datasets](https://azure.microsoft.com/en-us/services/opendatasets/) Python SDK, which lets you consume curated open datasets and use them to enrich your own data. The SDK gives you the choice of local or cloud compute resources, while the complete datasets are managed and maintained in the cloud.
4 |
5 | ## Quick installation
6 | ```sh
7 | pip install azureml-opendatasets
8 | ```
9 |
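To verify the installation, here is a minimal sketch mirroring the data-access notebooks (class availability can vary by package version):

```python
from datetime import datetime

from azureml.opendatasets import NoaaIsdWeather

# Load one week of NOAA ISD weather readings into a pandas DataFrame.
isd = NoaaIsdWeather(datetime(2019, 1, 1), datetime(2019, 1, 7))
df = isd.to_pandas_dataframe()
print(df.shape)
```
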
10 | ## How to navigate and use the example notebooks?
11 |
12 | > * To learn more about Azure Open Datasets: https://docs.microsoft.com/azure/open-datasets/
13 | > * How to load open datasets into your familiar pandas/Spark DataFrame: check out the notebooks under [tutorials/data-access](./tutorials/data-access/).
14 | > * How to join your own data with open datasets: check out the notebooks under [tutorials/data-join](./tutorials/data-join/).
15 |
16 |
17 | > * For the pandas versions, either use an Azure Notebooks library you have already created, or use your own
18 | > Jupyter server. Then simply upload the notebook there to run it.
19 |
20 | > * For the Spark versions, you can create an Azure Databricks workspace in your Azure subscription, upload the notebook there, and click 'Run'.
21 | Alternatively, you can set up your own Spark cluster and run it there.
22 |
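For the Spark notebooks the pattern is the same, but the loader returns a Spark DataFrame (a sketch, assuming a Spark environment such as Azure Databricks with the package installed):

```python
from datetime import datetime

from azureml.opendatasets import NoaaIsdWeather

# Same class as above; on a Spark cluster it can also return a Spark DataFrame.
isd = NoaaIsdWeather(datetime(2019, 1, 1), datetime(2019, 1, 7))
spark_df = isd.to_spark_dataframe()
spark_df.printSchema()
```
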
23 | ## API reference
24 |
25 | Detailed API references are available [here](https://docs.microsoft.com/en-us/python/api/azureml-opendatasets/?view=azure-ml-py).
26 |
27 | # Contributing
28 |
29 | This project welcomes contributions and suggestions. Most contributions require you to agree to a
30 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
31 | the rights to use your contribution. For details, visit https://cla.microsoft.com.
32 |
33 | When you submit a pull request, a CLA-bot will automatically determine whether you need to provide
34 | a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions
35 | provided by the bot. You will only need to do this once across all repos using our CLA.
36 |
37 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
38 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
39 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
40 |
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | ## Security
4 |
5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).
6 |
7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below.
8 |
9 | ## Reporting Security Issues
10 |
11 | **Please do not report security vulnerabilities through public GitHub issues.**
12 |
13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report).
14 |
15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey).
16 |
17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc).
18 |
19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
20 |
21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
22 | * Full paths of source file(s) related to the manifestation of the issue
23 | * The location of the affected source code (tag/branch/commit or direct URL)
24 | * Any special configuration required to reproduce the issue
25 | * Step-by-step instructions to reproduce the issue
26 | * Proof-of-concept or exploit code (if possible)
27 | * Impact of the issue, including how an attacker might exploit the issue
28 |
29 | This information will help us triage your report more quickly.
30 |
31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs.
32 |
33 | ## Preferred Languages
34 |
35 | We prefer all communications to be in English.
36 |
37 | ## Policy
38 |
39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd).
40 |
41 |
42 |
--------------------------------------------------------------------------------
/tutorials/data-access/01-weather-to-spark-dataframe.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "source": [
6 | "Copyright (c) Microsoft Corporation. All rights reserved.\n\nLicensed under the MIT License."
7 | ],
8 | "metadata": {}
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "source": [
13 | "# Tutorial: Load NOAA ISD Weather Data\n\nIn this tutorial, you load the NOAA ISD Weather Data into Spark DataFrame by calling to_spark_dataframe() method.\n\nPrerequesits:\n> You must install the PyPi package on the cluster:\n> * azureml-opendatasets"
14 | ],
15 | "metadata": {}
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "source": [
20 | "Import NoaaIsdWeather class from azureml-opendatasets"
21 | ],
22 | "metadata": {}
23 | },
24 | {
25 | "cell_type": "code",
26 | "source": [
27 | "from azureml.opendatasets import NoaaIsdWeather\n\nfrom datetime import datetime\nfrom dateutil import parser\nfrom dateutil.relativedelta import relativedelta"
28 | ],
29 | "metadata": {},
30 | "outputs": [
31 | {
32 | "metadata": {},
33 | "output_type": "display_data",
34 | "data": {
35 | "text/html": [
36 |             "\n"
37 | ]
38 | }
39 | }
40 | ],
41 | "execution_count": 4
42 | },
43 | {
44 | "cell_type": "markdown",
45 | "source": [
46 | "> 1. Set start_date and end_date.\n> 2. New an instance of NoaaIsdWeather.\n> 3. Call to_spark_dataframe() method to get a Spark DataFrame for the given date range."
47 | ],
48 | "metadata": {}
49 | },
50 | {
51 | "cell_type": "code",
52 | "source": [
53 | "start_date = parser.parse('2019-1-1')\nend_date = parser.parse('2019-3-31')\nisd = NoaaIsdWeather(start_date, end_date)\ndf = isd.to_spark_dataframe()\ndisplay(df.limit(10))"
54 | ],
55 | "metadata": {
56 | "scrolled": true
57 | },
58 | "outputs": [
59 | {
60 | "metadata": {},
61 | "output_type": "display_data",
62 | "data": {
63 | "text/html": [
64 | "usaf | wban | datetime | latitude | longitude | elevation | windAngle | windSpeed | temperature | seaLvlPressure | cloudCoverage | presentWeatherIndicator | pastWeatherIndicator | precipTime | precipDepth | snowDepth | stationName | countryOrRegion | p_k | year | day | version | month |
--- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
999999 | 03055 | 2019-01-01T00:00:00.000+0000 | 36.599 | -101.595 | 995.0 | null | 9.3 | -6.4 | null | null | null | null | 1.0 | 0.0 | null | GOODWELL 2 E | US | 999999-03055 | 2019 | 1 | 1.0 | 1 |
061830 | 99999 | 2019-01-26T16:00:00.000+0000 | 55.533 | 12.717 | 6.0 | 140 | 7.7 | 0.3 | 999.4 | null | null | null | null | null | null | DROGDEN | DA | 061830-99999 | 2019 | 26 | 1.0 | 1 |
999999 | 03055 | 2019-01-01T00:05:00.000+0000 | 36.599 | -101.595 | 995.0 | null | null | -6.4 | null | null | null | null | null | null | null | GOODWELL 2 E | US | 999999-03055 | 2019 | 1 | 1.0 | 1 |
061830 | 99999 | 2019-01-26T17:00:00.000+0000 | 55.533 | 12.717 | 6.0 | 140 | 7.7 | 0.6 | 998.9 | null | null | null | null | null | null | DROGDEN | DA | 061830-99999 | 2019 | 26 | 1.0 | 1 |
999999 | 03055 | 2019-01-01T00:10:00.000+0000 | 36.599 | -101.595 | 995.0 | null | null | -6.5 | null | null | null | null | null | null | null | GOODWELL 2 E | US | 999999-03055 | 2019 | 1 | 1.0 | 1 |
061830 | 99999 | 2019-01-26T18:00:00.000+0000 | 55.533 | 12.717 | 6.0 | 140 | 8.2 | 0.8 | 998.6 | null | null | null | null | null | null | DROGDEN | DA | 061830-99999 | 2019 | 26 | 1.0 | 1 |
999999 | 03055 | 2019-01-01T00:15:00.000+0000 | 36.599 | -101.595 | 995.0 | null | null | -6.5 | null | null | null | null | null | null | null | GOODWELL 2 E | US | 999999-03055 | 2019 | 1 | 1.0 | 1 |
061830 | 99999 | 2019-01-26T19:00:00.000+0000 | 55.533 | 12.717 | 6.0 | 140 | 10.3 | 1.0 | 997.8 | null | null | null | null | null | null | DROGDEN | DA | 061830-99999 | 2019 | 26 | 1.0 | 1 |
999999 | 03055 | 2019-01-01T00:20:00.000+0000 | 36.599 | -101.595 | 995.0 | null | null | -6.5 | null | null | null | null | null | null | null | GOODWELL 2 E | US | 999999-03055 | 2019 | 1 | 1.0 | 1 |
061830 | 99999 | 2019-01-26T20:00:00.000+0000 | 55.533 | 12.717 | 6.0 | 140 | 10.3 | 1.1 | 997.7 | null | null | null | null | null | null | DROGDEN | DA | 061830-99999 | 2019 | 26 | 1.0 | 1 |
"
65 | ]
66 | }
67 | }
68 | ],
69 | "execution_count": 6
70 | }
71 | ],
72 | "metadata": {
73 | "kernelspec": {
74 | "display_name": "Python 3",
75 | "language": "python",
76 | "name": "python3"
77 | },
78 | "language_info": {
79 | "mimetype": "text/x-python",
80 | "name": "python",
81 | "pygments_lexer": "ipython3",
82 | "codemirror_mode": {
83 | "name": "ipython",
84 | "version": 3
85 | },
86 | "version": "3.6.8",
87 | "nbconvert_exporter": "python",
88 | "file_extension": ".py"
89 | },
90 | "name": "NoaaIsdWeather.to_spark_dataframe",
91 | "notebookId": 2741195231538712
92 | },
93 | "nbformat": 4,
94 | "nbformat_minor": 0
95 | }
96 |
--------------------------------------------------------------------------------
/tutorials/data-access/02-weather-to-pandas-dataframe.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "source": [
6 | "Copyright (c) Microsoft Corporation. All rights reserved.\n\nLicensed under the MIT License."
7 | ],
8 | "metadata": {}
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "source": [
13 | "# Tutorial: Load NOAA ISD Weather Data"
14 | ],
15 | "metadata": {}
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "source": [
20 | "## Install azureml-opendatasets SDK"
21 | ],
22 | "metadata": {}
23 | },
24 | {
25 | "cell_type": "code",
26 | "source": [
27 | "!pip uninstall -y azureml-opendatasets\n!pip install azureml-opendatasets"
28 | ],
29 | "metadata": {
30 | "scrolled": true
31 | },
32 | "outputs": [],
33 | "execution_count": 4
34 | },
35 | {
36 | "cell_type": "markdown",
37 | "source": [
38 | "Import NoaaIsdWeather class from azureml-opendatasets"
39 | ],
40 | "metadata": {}
41 | },
42 | {
43 | "cell_type": "code",
44 | "source": [
45 | "from azureml.opendatasets import NoaaIsdWeather\n\nfrom datetime import datetime\nfrom dateutil import parser\nfrom dateutil.relativedelta import relativedelta"
46 | ],
47 | "metadata": {},
48 | "outputs": [
49 | {
50 | "metadata": {},
51 | "output_type": "display_data",
52 | "data": {
53 | "text/html": [
54 | "\n"
55 | ]
56 | }
57 | }
58 | ],
59 | "execution_count": 6
60 | },
61 | {
62 | "cell_type": "markdown",
63 | "source": [
64 | "> 1. Set start_date and end_date.\n> 2. New an instance of NoaaIsdWeather.\n> 3. Call to_pandas_dataframe() method to get a pandas DataFrame."
65 | ],
66 | "metadata": {}
67 | },
68 | {
69 | "cell_type": "code",
70 | "source": [
71 | "start_date = parser.parse('2018-1-1')\nend_date = parser.parse('2018-2-28')\nisd = NoaaIsdWeather(start_date, end_date)\nisd.to_pandas_dataframe().info()\nprint('isd done')"
72 | ],
73 | "metadata": {
74 | "scrolled": true
75 | },
76 | "outputs": [
77 | {
78 | "metadata": {},
79 | "output_type": "display_data",
80 | "data": {
81 | "text/html": [
82 | "\nTarget paths: ['/year=2018/month=1/', '/year=2018/month=2/']\nLooking for parquet files...\nReading them into Pandas dataframe...\nReading ISDWeather/year=2018/month=1/part-00043-tid-9138739344806125380-ef942066-1d58-49f9-8ecb-3329cbe6e57e-383738.c000.snappy.parquet under container isdweatherdatacontainer\nReading ISDWeather/year=2018/month=2/part-00174-tid-9138739344806125380-ef942066-1d58-49f9-8ecb-3329cbe6e57e-383934.c000.snappy.parquet under container isdweatherdatacontainer\nDone.\n<class 'pandas.core.frame.DataFrame'>\nInt64Index: 21666673 entries, 0 to 10482057\nData columns (total 22 columns):\nusaf object\nwban object\ndatetime datetime64[ns]\nlatitude float64\nlongitude float64\nelevation float64\nwindAngle float64\nwindSpeed float64\ntemperature float64\nseaLvlPressure float64\ncloudCoverage object\npresentWeatherIndicator float64\npastWeatherIndicator float64\nprecipTime float64\nprecipDepth float64\nsnowDepth float64\nstationName object\ncountryOrRegion object\np_k object\nyear int32\nday int32\nversion float64\ndtypes: datetime64[ns](1), float64(13), int32(2), object(6)\nmemory usage: 3.6+ GB\nisd done\n
"
83 | ]
84 | }
85 | }
86 | ],
87 | "execution_count": 8
88 | }
89 | ],
90 | "metadata": {
91 | "kernelspec": {
92 | "display_name": "Python 3",
93 | "language": "python",
94 | "name": "python3"
95 | },
96 | "language_info": {
97 | "mimetype": "text/x-python",
98 | "name": "python",
99 | "pygments_lexer": "ipython3",
100 | "codemirror_mode": {
101 | "name": "ipython",
102 | "version": 3
103 | },
104 | "version": "3.6.8",
105 | "nbconvert_exporter": "python",
106 | "file_extension": ".py"
107 | },
108 | "name": "02-weather-to-pandas-dataframe",
109 | "notebookId": 2741195231538698
110 | },
111 | "nbformat": 4,
112 | "nbformat_minor": 0
113 | }
114 |
--------------------------------------------------------------------------------
/tutorials/data-join/01-weather-join-in-spark.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "source": [
6 | "Copyright (c) Microsoft Corporation. All rights reserved.\n\nLicensed under the MIT License."
7 | ],
8 | "metadata": {}
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "source": [
13 | "# Tutorial: Load demo data and enrich it with NOAA ISD Weather data.\n\nIn this tutorial, you load the demo data (a parquet file in Azure Blob), check the data schema, enrich it with NOAA ISD Weather data.\n\nPrerequisites:\n> You must install the PyPI package on the cluster:\n> * azureml-opendatasets\n\nLearn how to:\n> * Load the demo data from Azure Blob\n> * Check the demo data schema\n> * Initialize NoaaIsdWeather class to load weather data\n> * Enrich the demo data with weather data\n> * Display the joined result annd stats"
14 | ],
15 | "metadata": {}
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "source": [
20 | "## Load demo parquet file from Azure Blob"
21 | ],
22 | "metadata": {}
23 | },
24 | {
25 | "cell_type": "code",
26 | "source": [
27 | "from azure.storage.blob import BlockBlobService\nfrom pyspark.sql import SparkSession\n\nspark = SparkSession.builder.getOrCreate()\n\ncontainer_name = 'tutorials'\naccount_name = 'azureopendatastorage'\nrelative_path = 'noaa_isd_weather/demo.parquet'\ndf = spark.read.parquet('wasbs://%s@%s.blob.core.windows.net/%s' % (\n container_name,\n account_name,\n relative_path))\ndf.count()"
28 | ],
29 | "metadata": {},
30 | "outputs": [
31 | {
32 | "metadata": {},
33 | "output_type": "display_data",
34 | "data": {
35 | "text/html": [
36 |             "\nOut[1]: 1850\n"
37 | ]
38 | }
39 | }
40 | ],
41 | "execution_count": 4
42 | },
43 | {
44 | "cell_type": "markdown",
45 | "source": [
46 | "# Display the demo data"
47 | ],
48 | "metadata": {}
49 | },
50 | {
51 | "cell_type": "code",
52 | "source": [
53 | "display(df.limit(10))"
54 | ],
55 | "metadata": {},
56 | "outputs": [
57 | {
58 | "metadata": {},
59 | "output_type": "display_data",
60 | "data": {
61 | "text/html": [
62 | "datetime | lat | long | stations.city | count | stations.dock_count |
--- | --- | --- | --- | --- | --- |
2015-05-01T00:00:00.000+0000 | 37.787152 | -122.38801299999999 | San Francisco | 28 | 15 |
2015-05-02T00:00:00.000+0000 | 37.787152 | -122.38801299999999 | San Francisco | 5 | 15 |
2015-05-03T00:00:00.000+0000 | 37.787152 | -122.38801299999999 | San Francisco | 11 | 15 |
2015-05-04T00:00:00.000+0000 | 37.787152 | -122.38801299999999 | San Francisco | 24 | 15 |
2015-05-05T00:00:00.000+0000 | 37.787152 | -122.38801299999999 | San Francisco | 24 | 15 |
2015-05-06T00:00:00.000+0000 | 37.787152 | -122.38801299999999 | San Francisco | 28 | 15 |
2015-05-07T00:00:00.000+0000 | 37.787152 | -122.38801299999999 | San Francisco | 20 | 15 |
2015-05-08T00:00:00.000+0000 | 37.787152 | -122.38801299999999 | San Francisco | 21 | 15 |
2015-05-09T00:00:00.000+0000 | 37.787152 | -122.38801299999999 | San Francisco | 9 | 15 |
2015-05-10T00:00:00.000+0000 | 37.787152 | -122.38801299999999 | San Francisco | 10 | 15 |
"
63 | ]
64 | }
65 | }
66 | ],
67 | "execution_count": 6
68 | },
69 | {
70 | "cell_type": "markdown",
71 | "source": [
72 | "# Initialize NoaaIsdWeather class, get the enricher from it and enrich demo data"
73 | ],
74 | "metadata": {}
75 | },
76 | {
77 | "cell_type": "code",
78 | "source": [
79 | "from azureml.opendatasets.accessories.location_data import LatLongColumn\nfrom azureml.opendatasets.accessories.location_time_customer_data import LocationTimeCustomerData\nfrom azureml.opendatasets import NoaaIsdWeather\n\n\n_customer_data = LocationTimeCustomerData(df, LatLongColumn('lat', 'long'), 'datetime')\nweather = NoaaIsdWeather(cols=[\"temperature\", \"windSpeed\", \"seaLvlPressure\"])\nweather_enricher = weather.get_enricher()\njoined_data = weather_enricher.enrich_customer_data_with_agg(\n customer_data_object=_customer_data,\n location_match_granularity=5,\n time_round_granularity='day',\n agg='avg')"
80 | ],
81 | "metadata": {},
82 | "outputs": [
83 | {
84 | "metadata": {},
85 | "output_type": "display_data",
86 | "data": {
87 | "text/html": [
88 | "\nActivityStarted, get_enricher\nActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=3.75 [ms]\nActivityStarted, enrich_customer_data_with_agg\nActivityStarted, enrich\nActivityCompleted: Activity=enrich, HowEnded=Success, Duration=109171.59 [ms]\nActivityCompleted: Activity=enrich_customer_data_with_agg, HowEnded=Success, Duration=109174.13 [ms]\n
"
89 | ]
90 | }
91 | }
92 | ],
93 | "execution_count": 8
94 | },
95 | {
96 | "cell_type": "markdown",
97 | "source": [
98 | "# Display the joined result"
99 | ],
100 | "metadata": {}
101 | },
102 | {
103 | "cell_type": "code",
104 | "source": [
105 | "display(joined_data.data.limit(10))"
106 | ],
107 | "metadata": {},
108 | "outputs": [
109 | {
110 | "metadata": {},
111 | "output_type": "display_data",
112 | "data": {
113 | "text/html": [
114 | "lat | long | datetime | stations.city | count | stations.dock_count | row_id | avg(seaLvlPressure) | avg(temperature) | avg(windSpeed) |
--- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
37.330165 | -121.88583100000001 | 2015-05-27T00:00:00.000+0000 | San Jose | 2 | 15 | 555 | 1016.0708333333332 | 17.041747572815535 | 4.40576923076923 |
37.389218 | -122.081896 | 2015-05-24T00:00:00.000+0000 | Mountain View | 1 | 15 | 607 | 1016.9791666666666 | 14.564999999999998 | 3.6208333333333345 |
37.444521 | -122.16309299999999 | 2015-05-27T00:00:00.000+0000 | Palo Alto | 1 | 11 | 1848 | 1016.3666666666668 | 14.850344827586207 | 3.705442176870746 |
37.781039 | -122.411748 | 2015-05-21T00:00:00.000+0000 | San Francisco | 38 | 23 | 1569 | 1014.4471153846152 | 13.967088607594935 | 3.2710638297872303 |
37.791464000000005 | -122.391034 | 2015-05-23T00:00:00.000+0000 | San Francisco | 9 | 19 | 496 | 1018.7134615384616 | 13.847904191616768 | 4.965697674418601 |
37.794139 | -122.394434 | 2015-05-14T00:00:00.000+0000 | San Francisco | 50 | 23 | 430 | 1011.5009615384613 | 14.23948717948718 | 3.145641025641024 |
37.795392 | -122.394203 | 2015-05-24T00:00:00.000+0000 | San Francisco | 30 | 23 | 1264 | 1017.2605769230772 | 13.388111888111888 | 4.8027972027971995 |
37.795392 | -122.394203 | 2015-05-31T00:00:00.000+0000 | San Francisco | 13 | 23 | 1271 | 1013.8009615384617 | 13.20612244897959 | 4.1484693877551 |
37.337391 | -121.886995 | 2015-05-24T00:00:00.000+0000 | San Jose | 5 | 15 | 1800 | 1016.9791666666666 | 16.338297872340423 | 3.8212765957446813 |
37.348742 | -121.89471499999999 | 2015-05-26T00:00:00.000+0000 | San Jose | 5 | 15 | 581 | 1015.5375 | 16.492473118279573 | 4.138709677419354 |
"
115 | ]
116 | }
117 | }
118 | ],
119 | "execution_count": 10
120 | },
121 | {
122 | "cell_type": "markdown",
123 | "source": [
124 | "# Convert the joined spark dataframe to pandas dataframe"
125 | ],
126 | "metadata": {}
127 | },
128 | {
129 | "cell_type": "code",
130 | "source": [
131 | "joined_data_pandas = joined_data.data.toPandas()"
132 | ],
133 | "metadata": {},
134 | "outputs": [
135 | {
136 | "metadata": {},
137 | "output_type": "display_data",
138 | "data": {
139 | "text/html": [
140 | "\n"
141 | ]
142 | }
143 | }
144 | ],
145 | "execution_count": 12
146 | },
147 | {
148 | "cell_type": "markdown",
149 | "source": [
150 | "# Check the stats of joined result"
151 | ],
152 | "metadata": {}
153 | },
154 | {
155 | "cell_type": "code",
156 | "source": [
157 | "print(joined_data_pandas.info())"
158 | ],
159 | "metadata": {},
160 | "outputs": [
161 | {
162 | "metadata": {},
163 | "output_type": "display_data",
164 | "data": {
165 | "text/html": [
166 | "\n<class 'pandas.core.frame.DataFrame'>\nRangeIndex: 1850 entries, 0 to 1849\nData columns (total 10 columns):\nlat 1850 non-null float64\nlong 1850 non-null float64\ndatetime 1850 non-null datetime64[ns]\nstations.city 1850 non-null object\ncount 1850 non-null int32\nstations.dock_count 1850 non-null int32\nrow_id 1850 non-null int64\navg(seaLvlPressure) 1850 non-null float64\navg(temperature) 1850 non-null float64\navg(windSpeed) 1850 non-null float64\ndtypes: datetime64[ns](1), float64(5), int32(2), int64(1), object(1)\nmemory usage: 130.2+ KB\nNone\n
"
167 | ]
168 | }
169 | }
170 | ],
171 | "execution_count": 14
172 | }
173 | ],
174 | "metadata": {
175 | "kernelspec": {
176 | "display_name": "Python 3",
177 | "language": "python",
178 | "name": "python3"
179 | },
180 | "language_info": {
181 | "mimetype": "text/x-python",
182 | "name": "python",
183 | "pygments_lexer": "ipython3",
184 | "codemirror_mode": {
185 | "name": "ipython",
186 | "version": 3
187 | },
188 | "version": "3.6.8",
189 | "nbconvert_exporter": "python",
190 | "file_extension": ".py"
191 | },
192 | "name": "enrich_demo_data_spark",
193 | "notebookId": 2741195231538736
194 | },
195 | "nbformat": 4,
196 | "nbformat_minor": 0
197 | }
198 |
--------------------------------------------------------------------------------
/tutorials/data-join/02-weather-join-in-pandas.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "source": [
6 | "Copyright (c) Microsoft Corporation. All rights reserved.\n\nLicensed under the MIT License."
7 | ],
8 | "metadata": {}
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "source": [
13 | "# Tutorial: Load demo data and enrich it with NOAA ISD Weather data.\n\nIn this tutorial, you load the demo data (a parquet file in Azure Blob), check the data schema, enrich it with NOAA ISD Weather data.\n\nPrerequisites:\n> You must install the PyPi package on the cluster:\n> * azureml-contrib-opendatasets\n\nLearn how to:\n> * Load the demo data from Azure Blob\n> * Check the demo data schema\n> * Initialize NoaaIsdWeather class to load weather data\n> * Enrich the demo data with weather data\n> * Display the joined result annd stats"
14 | ],
15 | "metadata": {}
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "source": [
20 | "## Install azureml-opendatasets package"
21 | ],
22 | "metadata": {}
23 | },
24 | {
25 | "cell_type": "code",
26 | "source": [
27 | "!pip uninstall -y azureml-opendatasets\n!pip install azureml-opendatasets"
28 | ],
29 | "metadata": {
30 | "scrolled": true
31 | },
32 | "outputs": [],
33 | "execution_count": 4
34 | },
35 | {
36 | "cell_type": "markdown",
37 | "source": [
38 | "## Define a DemoData class to load demo parquet from Azure Blob"
39 | ],
40 | "metadata": {}
41 | },
42 | {
43 | "cell_type": "code",
44 | "source": [
45 | "from azure.storage.blob import BlockBlobService\nimport pyarrow.parquet as pq\nfrom io import BytesIO\n\nclass DemoData:\n def __init__(self):\n self.blob_account_name = \"azureopendatastorage\"\n self.blob_container_name = \"tutorials\"\n self.blob_relative_path = 'noaa_isd_weather/demo.parquet'\n\n def to_pandas_dataframe(self):\n blob_service = BlockBlobService(account_name=self.blob_account_name)\n byte_stream = BytesIO()\n blob = blob_service.get_blob_to_stream(\n container_name=self.blob_container_name,\n blob_name=self.blob_relative_path,\n stream=byte_stream)\n\n return pq.read_table(source=byte_stream).to_pandas()"
46 | ],
47 | "metadata": {},
48 | "outputs": [
49 | {
50 | "metadata": {},
51 | "output_type": "display_data",
52 | "data": {
53 | "text/html": [
54 | "\n"
55 | ]
56 | }
57 | }
58 | ],
59 | "execution_count": 6
60 | },
61 | {
62 | "cell_type": "markdown",
63 | "source": [
64 | "## Initialize a DemoData instance and load the pandas DataFrame and check the schema"
65 | ],
66 | "metadata": {}
67 | },
68 | {
69 | "cell_type": "code",
70 | "source": [
71 | "df = DemoData().to_pandas_dataframe()\ndf.dtypes"
72 | ],
73 | "metadata": {
74 | "scrolled": false
75 | },
76 | "outputs": [
77 | {
78 | "metadata": {},
79 | "output_type": "display_data",
80 | "data": {
81 | "text/html": [
82 | "\nOut[2]: \ndatetime datetime64[ns]\nlat float64\nlong float64\nstations.city object\ncount int32\nstations.dock_count int32\ndtype: object\n
"
83 | ]
84 | }
85 | }
86 | ],
87 | "execution_count": 8
88 | },
89 | {
90 | "cell_type": "markdown",
91 | "source": [
92 | "## Display the top 5 rows in the demo data dataframe"
93 | ],
94 | "metadata": {}
95 | },
96 | {
97 | "cell_type": "code",
98 | "source": [
99 | "df.head(5)"
100 | ],
101 | "metadata": {
102 | "collapsed": true
103 | },
104 | "outputs": [
105 | {
106 | "metadata": {},
107 | "output_type": "display_data",
108 | "data": {
109 | "text/html": [
110 | "\nOut[3]: \n datetime lat ... count stations.dock_count\n0 2015-05-01 37.787152 ... 28 15\n1 2015-05-02 37.787152 ... 5 15\n2 2015-05-03 37.787152 ... 11 15\n3 2015-05-04 37.787152 ... 24 15\n4 2015-05-05 37.787152 ... 24 15\n\n[5 rows x 6 columns]\n
"
111 | ]
112 | }
113 | }
114 | ],
115 | "execution_count": 10
116 | },
117 | {
118 | "cell_type": "markdown",
119 | "source": [
120 | "## Initialize NoaaIsdWeather class, get the enricher from it and enrich demo data\nFor weather data, due to size, by default we allow reading from the last month if multiple months are passed.If you want to load more, please refer to `04-nyc-taxi-join-weather-in-pandas.ipynb.ipynb` under this folder for how.\n\nThe logic for join:\n\nThe join logic for Pandas version is using cKDTree to accelerate the speed of the process. We gather the public weather dataset as long/lat point array, pass it to create cKDTree. Then gather the customer dataset as long/lat point array, pass it to cKDTree query function, to find the closest point in cKDTree. After querying cKDTree, we join public weather dataset with customer dataset by the querying result, then grant ranking group id."
121 | ],
122 | "metadata": {}
123 | },
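    {
      "cell_type": "markdown",
      "source": [
        "The next cell is an illustrative sketch of the cKDTree matching idea described above (a toy example, not the enricher's internal code), assuming numpy and scipy are installed."
      ],
      "metadata": {}
    },
    {
      "cell_type": "code",
      "source": [
        "# Illustrative sketch of the cKDTree nearest-location match described above.\n# Toy data for demonstration; the enricher performs this internally at scale.\nimport numpy as np\nfrom scipy.spatial import cKDTree\n\n# Public weather dataset locations as a lat/long point array.\nweather_points = np.array([[40.71, -74.00], [37.79, -122.39], [41.88, -87.63]])\n# Customer dataset locations as a lat/long point array.\ncustomer_points = np.array([[37.787152, -122.388013], [40.7306, -73.9866]])\n\ntree = cKDTree(weather_points)\ndistances, indices = tree.query(customer_points)\nprint(indices)  # index of the closest weather point for each customer row"
      ],
      "metadata": {},
      "outputs": [],
      "execution_count": null
    },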
124 | {
125 | "cell_type": "code",
126 | "source": [
127 | "from azureml.opendatasets.accessories.location_data import LatLongColumn\nfrom azureml.opendatasets.accessories.location_time_customer_data import LocationTimeCustomerData\nfrom azureml.opendatasets import NoaaIsdWeather\nfrom datetime import datetime\n\n\n_customer_data = LocationTimeCustomerData(df, LatLongColumn('lat', 'long'), 'datetime')\nweather = NoaaIsdWeather(\n cols=[\"temperature\", \"windSpeed\", \"seaLvlPressure\"],\n start_date=datetime(2015, 5, 1, 0, 0),\n end_date=datetime(2015, 5, 31, 23, 59))\nweather_enricher = weather.get_enricher()\njoined_data = weather_enricher.enrich_customer_data_with_agg(\n customer_data_object=_customer_data,\n location_match_granularity=5,\n time_round_granularity='day',\n agg='avg')"
128 | ],
129 | "metadata": {
130 | "scrolled": true
131 | },
132 | "outputs": [
133 | {
134 | "metadata": {},
135 | "output_type": "display_data",
136 | "data": {
137 | "text/html": [
138 | "\nActivityStarted, get_enricher\nActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=1.65 [ms]\nActivityStarted, enrich_customer_data_with_agg\nActivityStarted, enrich\nTarget paths: ['/year=2015/month=5/']\nLooking for parquet files...\nReading them into Pandas dataframe...\nReading ISDWeather/year=2015/month=5/part-00001-tid-2198075741767757560-e3eb994e-d560-4dfc-941e-0aae74c8d9ed-93.c000.snappy.parquet under container isdweatherdatacontainer\nDone.\nActivityStarted, _get_closest_location_kdTree\nActivityCompleted: Activity=_get_closest_location_kdTree, HowEnded=Success, Duration=26.44 [ms]\nActivityCompleted: Activity=enrich, HowEnded=Success, Duration=36200.08 [ms]\nActivityCompleted: Activity=enrich_customer_data_with_agg, HowEnded=Success, Duration=36201.54 [ms]\n
"
139 | ]
140 | }
141 | }
142 | ],
143 | "execution_count": 12
144 | },
145 | {
146 | "cell_type": "markdown",
147 | "source": [
148 | "## Display the top 10 rows of the joined result"
149 | ],
150 | "metadata": {}
151 | },
152 | {
153 | "cell_type": "code",
154 | "source": [
155 | "joined_data.data.head(10)"
156 | ],
157 | "metadata": {},
158 | "outputs": [
159 | {
160 | "metadata": {},
161 | "output_type": "display_data",
162 | "data": {
163 | "text/html": [
164 | "\nOut[6]: \n datetime lat ... windSpeed seaLvlPressure\n0 2015-05-01 37.787152 ... 3.123711 1010.11125\n1 2015-05-02 37.787152 ... 3.690777 1011.76125\n2 2015-05-03 37.787152 ... 3.579512 1011.81125\n3 2015-05-04 37.787152 ... 4.349289 1014.19500\n4 2015-05-05 37.787152 ... 4.650739 1014.70625\n5 2015-05-06 37.787152 ... 5.280488 1012.26000\n6 2015-05-07 37.787152 ... 4.492424 1008.61000\n7 2015-05-08 37.787152 ... 2.853769 1009.97250\n8 2015-05-09 37.787152 ... 3.574510 1017.74359\n9 2015-05-10 37.787152 ... 4.777209 1018.53500\n\n[10 rows x 9 columns]\n
"
165 | ]
166 | }
167 | }
168 | ],
169 | "execution_count": 14
170 | },
171 | {
172 | "cell_type": "markdown",
173 | "source": [
174 | "## Check the stats of joined result"
175 | ],
176 | "metadata": {}
177 | },
178 | {
179 | "cell_type": "code",
180 | "source": [
181 | "joined_data.data.info()"
182 | ],
183 | "metadata": {
184 | "scrolled": true
185 | },
186 | "outputs": [
187 | {
188 | "metadata": {},
189 | "output_type": "display_data",
190 | "data": {
191 | "text/html": [
192 | "\n<class 'pandas.core.frame.DataFrame'>\nInt64Index: 1850 entries, 0 to 1849\nData columns (total 9 columns):\ndatetime 1850 non-null datetime64[ns]\nlat 1850 non-null float64\nlong 1850 non-null float64\nstations.city 1850 non-null object\ncount 1850 non-null int32\nstations.dock_count 1850 non-null int32\ntemperature 1850 non-null float64\nwindSpeed 1850 non-null float64\nseaLvlPressure 1850 non-null float64\ndtypes: datetime64[ns](1), float64(5), int32(2), object(1)\nmemory usage: 130.1+ KB\n
"
193 | ]
194 | }
195 | }
196 | ],
197 | "execution_count": 16
198 | }
199 | ],
200 | "metadata": {
201 | "language_info": {
202 | "mimetype": "text/x-python",
203 | "name": "python",
204 | "pygments_lexer": "ipython3",
205 | "codemirror_mode": {
206 | "name": "ipython",
207 | "version": 3
208 | },
209 | "version": "3.6.8",
210 | "nbconvert_exporter": "python",
211 | "file_extension": ".py"
212 | },
213 | "name": "02-weather-join-in-pandas",
214 | "notebookId": 1709144033725327,
215 | "kernelspec": {
216 | "display_name": "Python 3",
217 | "language": "python",
218 | "name": "python3"
219 | },
220 | "celltoolbar": "Raw Cell Format"
221 | },
222 | "nbformat": 4,
223 | "nbformat_minor": 0
224 | }
225 |
--------------------------------------------------------------------------------
/tutorials/data-join/03-nyc-taxi-join-weather-in-spark.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "source": [
6 | "Copyright (c) Microsoft Corporation. All rights reserved.\n\nLicensed under the MIT License."
7 | ],
8 | "metadata": {}
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "source": [
13 | "# Tutorial: Load TAXI data and enrich it with Weather data in Pandas DataFrame"
14 | ],
15 | "metadata": {}
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "source": [
20 | "Begin by creating a dataframe to hold the taxi data. To download 2 months of taxi data, iteratively fetch one month at a time, and before appending it to green_taxi_df randomly sample 0.1% records from the specific month to avoid bloating the dataframe."
21 | ],
22 | "metadata": {}
23 | },
24 | {
25 | "cell_type": "code",
26 | "source": [
27 | "import pandas as pd\nfrom datetime import datetime\nfrom dateutil.relativedelta import relativedelta\nfrom azureml.opendatasets import NycTlcGreen\nfrom functools import reduce # For Python 3.x\nfrom pyspark.sql import DataFrame\n\n\nstart = datetime.strptime(\"1/1/2016\", \"%m/%d/%Y\")\nend = datetime.strptime(\"1/31/2016\", \"%m/%d/%Y\")\n\ndfs = []\nfor sample_month in range(2):\n temp_df_green = NycTlcGreen(\n start + relativedelta(months=sample_month),\n end + relativedelta(months=sample_month)).to_spark_dataframe()\n dfs.append(temp_df_green.sample(False, 0.001, 3))\n\ngreen_taxi_df = reduce(DataFrame.unionAll, dfs)"
28 | ],
29 | "metadata": {},
30 | "outputs": [
31 | {
32 | "metadata": {},
33 | "output_type": "display_data",
34 | "data": {
35 | "text/html": [
36 | "\nActivityStarted, to_spark_dataframe\nActivityStarted, to_spark_dataframe_in_worker\nActivityCompleted: Activity=to_spark_dataframe_in_worker, HowEnded=Success, Duration=53785.29 [ms]\nActivityCompleted: Activity=to_spark_dataframe, HowEnded=Success, Duration=53786.87 [ms]\nActivityStarted, to_spark_dataframe\nActivityStarted, to_spark_dataframe_in_worker\nActivityCompleted: Activity=to_spark_dataframe_in_worker, HowEnded=Success, Duration=52593.31 [ms]\nActivityCompleted: Activity=to_spark_dataframe, HowEnded=Success, Duration=52594.72 [ms]\n
"
37 | ]
38 | }
39 | }
40 | ],
41 | "execution_count": 4
42 | },
43 | {
44 | "cell_type": "markdown",
45 | "source": [
46 | "Save a copy of the raw_columns name list for clean up at the last step."
47 | ],
48 | "metadata": {}
49 | },
50 | {
51 | "cell_type": "code",
52 | "source": [
53 | "raw_columns = list(green_taxi_df.columns)"
54 | ],
55 | "metadata": {},
56 | "outputs": [
57 | {
58 | "metadata": {},
59 | "output_type": "display_data",
60 | "data": {
61 | "text/html": [
62 | "\n"
63 | ]
64 | }
65 | }
66 | ],
67 | "execution_count": 6
68 | },
69 | {
70 | "cell_type": "markdown",
71 | "source": [
72 | "NYC Latitude & Longitude: (40.71455, -74.00712) found by Bing search.\n\nAdd to taxi dataframe"
73 | ],
74 | "metadata": {}
75 | },
76 | {
77 | "cell_type": "markdown",
78 | "source": [
79 | "Make all Latitude and Longitude be the location of New York City."
80 | ],
81 | "metadata": {}
82 | },
83 | {
84 | "cell_type": "code",
85 | "source": [
86 | "from pyspark.sql.functions import lit\n\nnyc_lat, nyc_long = (40.71455, -74.00712)\ngreen_taxi_df = green_taxi_df.withColumn('lat', lit(nyc_lat)).withColumn('long', lit(nyc_long))\ndisplay(green_taxi_df.limit(5))"
87 | ],
88 | "metadata": {},
89 | "outputs": [
90 | {
91 | "metadata": {},
92 | "output_type": "display_data",
93 | "data": {
94 | "text/html": [
95 | "vendorID | lpepPickupDatetime | lpepDropoffDatetime | passengerCount | tripDistance | puLocationId | doLocationId | pickupLongitude | pickupLatitude | dropoffLongitude | dropoffLatitude | rateCodeID | storeAndFwdFlag | paymentType | fareAmount | extra | mtaTax | improvementSurcharge | tipAmount | tollsAmount | ehailFee | totalAmount | tripType | puYear | puMonth | lat | long |
--- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
1 | 2016-01-18T17:14:03.000+0000 | 2016-01-18T17:26:34.000+0000 | 1 | 1.6 | null | null | -73.95868682861328 | 40.71489334106445 | -73.95039367675781 | 40.69751739501953 | 1 | N | 2 | 9.5 | 0.0 | 0.5 | 0.3 | 0.0 | 0.0 | null | 10.3 | 1 | 2016 | 1 | 40.71455 | -74.00712 |
2 | 2016-01-18T18:00:42.000+0000 | 2016-01-18T18:08:53.000+0000 | 1 | 1.37 | null | null | -73.95466613769531 | 40.789363861083984 | -73.95391082763672 | 40.77495574951172 | 1 | N | 1 | 7.5 | 0.0 | 0.5 | 0.3 | 1.24 | 0.0 | null | 9.54 | 1 | 2016 | 1 | 40.71455 | -74.00712 |
2 | 2016-01-18T18:28:33.000+0000 | 2016-01-18T18:51:43.000+0000 | 1 | 5.52 | null | null | -73.99703216552734 | 40.68907165527344 | -73.9892349243164 | 40.745548248291016 | 1 | N | 1 | 20.0 | 0.0 | 0.5 | 0.3 | 5.0 | 0.0 | null | 25.8 | 1 | 2016 | 1 | 40.71455 | -74.00712 |
2 | 2016-01-18T18:50:23.000+0000 | 2016-01-18T18:56:13.000+0000 | 2 | 1.16 | null | null | -73.90315246582031 | 40.745941162109375 | -73.91759490966797 | 40.744773864746094 | 1 | N | 2 | 6.0 | 0.0 | 0.5 | 0.3 | 0.0 | 0.0 | null | 6.8 | 1 | 2016 | 1 | 40.71455 | -74.00712 |
1 | 2016-01-18T18:51:36.000+0000 | 2016-01-18T19:00:36.000+0000 | 1 | 1.6 | null | null | -73.98666381835938 | 40.70247268676758 | -73.97904968261719 | 40.68352127075195 | 1 | N | 1 | 8.0 | 0.0 | 0.5 | 0.3 | 1.2 | 0.0 | null | 10.0 | 1 | 2016 | 1 | 40.71455 | -74.00712 |
"
96 | ]
97 | }
98 | }
99 | ],
100 | "execution_count": 9
101 | },
102 | {
103 | "cell_type": "markdown",
104 | "source": [
105 | "Initialize LocationTimeCustomerData using pandas dataframe green_taxi."
106 | ],
107 | "metadata": {}
108 | },
109 | {
110 | "cell_type": "code",
111 | "source": [
112 | "from azureml.opendatasets.accessories.location_data import LatLongColumn\nfrom azureml.opendatasets.accessories.location_time_customer_data \\\n import LocationTimeCustomerData\nfrom azureml.opendatasets import NoaaIsdWeather\n\n\ngreen_taxi = LocationTimeCustomerData(\n green_taxi_df,\n LatLongColumn('lat', 'long'),\n 'lpepPickupDatetime')"
113 | ],
114 | "metadata": {},
115 | "outputs": [
116 | {
117 | "metadata": {},
118 | "output_type": "display_data",
119 | "data": {
120 | "text/html": [
121 | "\n"
122 | ]
123 | }
124 | }
125 | ],
126 | "execution_count": 11
127 | },
128 | {
129 | "cell_type": "code",
130 | "source": [
131 | "spark.conf.set('spark.sql.crossJoin.enabled', 'true')"
132 | ],
133 | "metadata": {},
134 | "outputs": [],
135 | "execution_count": 12
136 | },
137 | {
138 | "cell_type": "markdown",
139 | "source": [
140 | "Initialize NoaaIsdWeather class, get enricher from it, and enrich the taxi data without aggregation"
141 | ],
142 | "metadata": {}
143 | },
144 | {
145 | "cell_type": "code",
146 | "source": [
147 | "weather = NoaaIsdWeather(\n cols=[\"temperature\", \"precipTime\", \"precipDepth\", \"snowDepth\"],\n start_date=datetime(2016, 1, 1, 0, 0),\n end_date=datetime(2016, 2, 28, 23, 59))\nweather_enricher = weather.get_enricher()\nnew_green_taxi, processed_weather = weather_enricher.enrich_customer_data_no_agg(\n customer_data_object=green_taxi,\n location_match_granularity=5,\n time_round_granularity='day')"
148 | ],
149 | "metadata": {},
150 | "outputs": [
151 | {
152 | "metadata": {},
153 | "output_type": "display_data",
154 | "data": {
155 | "text/html": [
156 | "\nActivityStarted, get_enricher\nActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=1.93 [ms]\nActivityStarted, enrich_customer_data_no_agg\nActivityStarted, enrich\nActivityCompleted: Activity=enrich, HowEnded=Success, Duration=217158.82 [ms]\nActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=217161.48 [ms]\n
"
157 | ]
158 | }
159 | }
160 | ],
161 | "execution_count": 14
162 | },
163 | {
164 | "cell_type": "markdown",
165 | "source": [
166 | "Preview the pandas dataframe new_green_taxi.data"
167 | ],
168 | "metadata": {}
169 | },
170 | {
171 | "cell_type": "code",
172 | "source": [
173 | "display(new_green_taxi.data.limit(3))"
174 | ],
175 | "metadata": {},
176 | "outputs": [
177 | {
178 | "metadata": {},
179 | "output_type": "display_data",
180 | "data": {
181 | "text/html": [
182 | "lat | long | vendorID | lpepPickupDatetime | lpepDropoffDatetime | passengerCount | tripDistance | puLocationId | doLocationId | pickupLongitude | pickupLatitude | dropoffLongitude | dropoffLatitude | rateCodeID | storeAndFwdFlag | paymentType | fareAmount | extra | mtaTax | improvementSurcharge | tipAmount | tollsAmount | ehailFee | totalAmount | tripType | puYear | puMonth | row_id | customer_rankgrouprbzmn | customer_join_time1v3cp |
--- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
40.71455 | -74.00712 | 1 | 2016-01-18T17:14:03.000+0000 | 2016-01-18T17:26:34.000+0000 | 1 | 1.6 | null | null | -73.95868682861328 | 40.71489334106445 | -73.95039367675781 | 40.69751739501953 | 1 | N | 2 | 9.5 | 0.0 | 0.5 | 0.3 | 0.0 | 0.0 | null | 10.3 | 1 | 2016 | 1 | 77309411328 | 1 | 2016-01-18T00:00:00.000+0000 |
40.71455 | -74.00712 | 2 | 2016-01-18T18:00:42.000+0000 | 2016-01-18T18:08:53.000+0000 | 1 | 1.37 | null | null | -73.95466613769531 | 40.789363861083984 | -73.95391082763672 | 40.77495574951172 | 1 | N | 1 | 7.5 | 0.0 | 0.5 | 0.3 | 1.24 | 0.0 | null | 9.54 | 1 | 2016 | 1 | 77309411329 | 1 | 2016-01-18T00:00:00.000+0000 |
40.71455 | -74.00712 | 2 | 2016-01-18T18:28:33.000+0000 | 2016-01-18T18:51:43.000+0000 | 1 | 5.52 | null | null | -73.99703216552734 | 40.68907165527344 | -73.9892349243164 | 40.745548248291016 | 1 | N | 1 | 20.0 | 0.0 | 0.5 | 0.3 | 5.0 | 0.0 | null | 25.8 | 1 | 2016 | 1 | 77309411330 | 1 | 2016-01-18T00:00:00.000+0000 |
"
183 | ]
184 | }
185 | }
186 | ],
187 | "execution_count": 16
188 | },
189 | {
190 | "cell_type": "markdown",
191 | "source": [
192 | "Define a dict `aggregations` to define how to aggregate each field at a hour level. For `snowDepth` and `temperature` we'll take the mean and for `precipTime` and `precipDepth` we'll take the hourly maximum. Use the groupby() function along with the aggregations to group data."
193 | ],
194 | "metadata": {}
195 | },
196 | {
197 | "cell_type": "code",
198 | "source": [
199 | "aggregations = {\n \"snowDepth\": \"mean\",\n \"precipTime\": \"max\",\n \"temperature\": \"mean\",\n \"precipDepth\": \"max\"}"
200 | ],
201 | "metadata": {},
202 | "outputs": [
203 | {
204 | "metadata": {},
205 | "output_type": "display_data",
206 | "data": {
207 | "text/html": [
208 | "\n"
209 | ]
210 | }
211 | }
212 | ],
213 | "execution_count": 18
214 | },
215 | {
216 | "cell_type": "markdown",
217 | "source": [
218 | "The keys (`public_rankgroup`, `public_join_time`, `customer_rankgroup`, `customer_join_time`) used by groupby() and later merge() must be hacked here due to the current design."
219 | ],
220 | "metadata": {}
221 | },
222 | {
223 | "cell_type": "code",
224 | "source": [
225 | "public_rankgroup = processed_weather.id\n\npublic_join_time = [\n s for s in list(processed_weather.data.columns)\n if s.startswith('ds_join_time')][0]\n\ncustomer_rankgroup = weather_enricher.location_selector.customer_rankgroup\n\ncustomer_join_time = [\n s for s in list(new_green_taxi.data.columns)\n if type(s) is str and s.startswith('customer_join_time')][0]\n\nweather_df_grouped = processed_weather.data.groupby(public_rankgroup, public_join_time).agg(aggregations)\ndisplay(weather_df_grouped.limit(3))"
226 | ],
227 | "metadata": {},
228 | "outputs": [
229 | {
230 | "metadata": {},
231 | "output_type": "display_data",
232 | "data": {
233 | "text/html": [
234 | "public_rankgroup1qf8h | ds_join_timevotkg | avg(snowDepth) | avg(temperature) | max(precipDepth) | max(precipTime) |
--- | --- | --- | --- | --- | --- |
1 | 2016-01-13T00:00:00.000+0000 | 0.0 | -2.2664285714285732 | 3.0 | 24.0 |
1 | 2016-02-07T00:00:00.000+0000 | 1.8 | 3.8400000000000007 | 0.0 | 24.0 |
1 | 2016-01-21T00:00:00.000+0000 | 0.0 | -0.07226277372262757 | 0.0 | 24.0 |
"
235 | ]
236 | }
237 | }
238 | ],
239 | "execution_count": 20
240 | },
241 | {
242 | "cell_type": "markdown",
243 | "source": [
244 | "Join the final dataframe, and preview the joined result."
245 | ],
246 | "metadata": {}
247 | },
248 | {
249 | "cell_type": "code",
250 | "source": [
251 | "taxi_df = new_green_taxi.data\njoined_dataset = taxi_df.join(\n weather_df_grouped,\n [taxi_df[customer_rankgroup] == weather_df_grouped[public_rankgroup],\n taxi_df[customer_join_time] == weather_df_grouped[public_join_time]],\n how='left')\n\nfinal_df = joined_dataset.select(raw_columns + [\n \"avg(temperature)\", \"max(precipTime)\", \"max(precipDepth)\", \"avg(snowDepth)\"])\ndisplay(final_df.limit(5))"
252 | ],
253 | "metadata": {},
254 | "outputs": [
255 | {
256 | "metadata": {},
257 | "output_type": "display_data",
258 | "data": {
259 | "text/html": [
260 | "vendorID | lpepPickupDatetime | lpepDropoffDatetime | passengerCount | tripDistance | puLocationId | doLocationId | pickupLongitude | pickupLatitude | dropoffLongitude | dropoffLatitude | rateCodeID | storeAndFwdFlag | paymentType | fareAmount | extra | mtaTax | improvementSurcharge | tipAmount | tollsAmount | ehailFee | totalAmount | tripType | puYear | puMonth | avg(temperature) | max(precipTime) | max(precipDepth) | avg(snowDepth) |
--- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
2 | 2016-01-13T00:56:41.000+0000 | 2016-01-13T01:05:00.000+0000 | 1 | 1.85 | null | null | -73.95475006103516 | 40.687801361083984 | -73.95307922363281 | 40.70832824707031 | 1 | N | 1 | 8.5 | 0.5 | 0.5 | 0.3 | 1.2 | 0.0 | null | 11.0 | 1 | 2016 | 1 | -2.2664285714285732 | 24.0 | 3.0 | 0.0 |
2 | 2016-01-13T01:46:31.000+0000 | 2016-01-13T01:53:55.000+0000 | 1 | 1.75 | null | null | -73.8910903930664 | 40.74677658081055 | -73.88246154785156 | 40.7307014465332 | 1 | N | 2 | 8.0 | 0.5 | 0.5 | 0.3 | 0.0 | 0.0 | null | 9.3 | 1 | 2016 | 1 | -2.2664285714285732 | 24.0 | 3.0 | 0.0 |
1 | 2016-01-13T01:49:57.000+0000 | 2016-01-13T01:56:53.000+0000 | 2 | 1.4 | null | null | -73.8910903930664 | 40.74702453613281 | -73.86837768554688 | 40.75227355957031 | 1 | N | 2 | 7.0 | 0.5 | 0.5 | 0.3 | 0.0 | 0.0 | null | 8.3 | 1 | 2016 | 1 | -2.2664285714285732 | 24.0 | 3.0 | 0.0 |
1 | 2016-01-13T01:41:11.000+0000 | 2016-01-13T02:02:37.000+0000 | 3 | 6.1 | null | null | -73.95850372314453 | 40.719234466552734 | -74.001708984375 | 40.73370361328125 | 1 | N | 2 | 21.0 | 0.5 | 0.5 | 0.3 | 0.0 | 0.0 | null | 22.3 | 1 | 2016 | 1 | -2.2664285714285732 | 24.0 | 3.0 | 0.0 |
2 | 2016-01-13T04:17:12.000+0000 | 2016-01-13T04:25:11.000+0000 | 1 | 1.07 | null | null | -73.89081573486328 | 40.746795654296875 | -73.87198638916016 | 40.746891021728516 | 1 | N | 2 | 6.5 | 0.5 | 0.5 | 0.3 | 0.0 | 0.0 | null | 7.8 | 1 | 2016 | 1 | -2.2664285714285732 | 24.0 | 3.0 | 0.0 |
"
261 | ]
262 | }
263 | }
264 | ],
265 | "execution_count": 22
266 | },
267 | {
268 | "cell_type": "markdown",
269 | "source": [
270 | "Check the join success rate."
271 | ],
272 | "metadata": {}
273 | },
274 | {
275 | "cell_type": "code",
276 | "source": [
277 | "final_df.toPandas().info()"
278 | ],
279 | "metadata": {},
280 | "outputs": [
281 | {
282 | "metadata": {},
283 | "output_type": "display_data",
284 | "data": {
285 | "text/html": [
286 | "\n<class 'pandas.core.frame.DataFrame'>\nRangeIndex: 2842 entries, 0 to 2841\nData columns (total 29 columns):\nvendorID 2842 non-null int32\nlpepPickupDatetime 2842 non-null datetime64[ns]\nlpepDropoffDatetime 2842 non-null datetime64[ns]\npassengerCount 2842 non-null int32\ntripDistance 2842 non-null float64\npuLocationId 0 non-null object\ndoLocationId 0 non-null object\npickupLongitude 2842 non-null float64\npickupLatitude 2842 non-null float64\ndropoffLongitude 2842 non-null float64\ndropoffLatitude 2842 non-null float64\nrateCodeID 2842 non-null int32\nstoreAndFwdFlag 2842 non-null object\npaymentType 2842 non-null int32\nfareAmount 2842 non-null float64\nextra 2842 non-null float64\nmtaTax 2842 non-null float64\nimprovementSurcharge 2842 non-null object\ntipAmount 2842 non-null float64\ntollsAmount 2842 non-null float64\nehailFee 0 non-null object\ntotalAmount 2842 non-null float64\ntripType 2842 non-null int32\npuYear 2842 non-null int32\npuMonth 2842 non-null int32\navg(temperature) 2842 non-null float64\nmax(precipTime) 2842 non-null float64\nmax(precipDepth) 2842 non-null float64\navg(snowDepth) 2842 non-null float64\ndtypes: datetime64[ns](2), float64(15), int32(7), object(5)\nmemory usage: 566.3+ KB\n
"
287 | ]
288 | }
289 | }
290 | ],
291 | "execution_count": 24
292 | },
293 | {
294 | "cell_type": "code",
295 | "source": [
296 | "final_df.createOrReplaceTempView('joined_df')"
297 | ],
298 | "metadata": {},
299 | "outputs": [
300 | {
301 | "metadata": {},
302 | "output_type": "display_data",
303 | "data": {
304 | "text/html": [
305 | "\n"
306 | ]
307 | }
308 | }
309 | ],
310 | "execution_count": 25
311 | },
312 | {
313 | "cell_type": "code",
314 | "source": [
315 | "%sql\nselect * from joined_df\nwhere lpepPickupDatetime >= '2016-01-26' and lpepPickupDatetime < '2016-01-27'\norder by lpepPickupDatetime limit 5"
316 | ],
317 | "metadata": {},
318 | "outputs": [
319 | {
320 | "metadata": {},
321 | "output_type": "display_data",
322 | "data": {
323 | "text/html": [
324 | "vendorID | lpepPickupDatetime | lpepDropoffDatetime | passengerCount | tripDistance | puLocationId | doLocationId | pickupLongitude | pickupLatitude | dropoffLongitude | dropoffLatitude | rateCodeID | storeAndFwdFlag | paymentType | fareAmount | extra | mtaTax | improvementSurcharge | tipAmount | tollsAmount | ehailFee | totalAmount | tripType | puYear | puMonth | avg(temperature) | max(precipTime) | max(precipDepth) | avg(snowDepth) |
--- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
2 | 2016-01-26T00:02:33.000+0000 | 2016-01-26T00:16:54.000+0000 | 2 | 3.27 | null | null | -73.95603942871094 | 40.71393966674805 | -73.90204620361328 | 40.70497131347656 | 1 | N | 2 | 12.5 | 0.5 | 0.5 | 0.3 | 0.0 | 0.0 | null | 13.8 | 1 | 2016 | 1 | 4.209285714285715 | 24.0 | 0.0 | 40.06896551724138 |
2 | 2016-01-26T01:50:11.000+0000 | 2016-01-26T01:56:21.000+0000 | 1 | 2.06 | null | null | -73.90959930419922 | 40.77006530761719 | -73.88544464111328 | 40.75567626953125 | 1 | N | 2 | 8.0 | 0.5 | 0.5 | 0.3 | 0.0 | 0.0 | null | 9.3 | 1 | 2016 | 1 | 4.209285714285715 | 24.0 | 0.0 | 40.06896551724138 |
2 | 2016-01-26T02:45:59.000+0000 | 2016-01-26T03:04:32.000+0000 | 1 | 5.59 | null | null | -73.9578857421875 | 40.800941467285156 | -73.93751525878906 | 40.84716033935547 | 1 | N | 1 | 19.5 | 0.5 | 0.5 | 0.3 | 2.0 | 0.0 | null | 22.8 | 1 | 2016 | 1 | 4.209285714285715 | 24.0 | 0.0 | 40.06896551724138 |
2 | 2016-01-26T07:35:27.000+0000 | 2016-01-26T08:04:34.000+0000 | 1 | 2.54 | null | null | -73.95887756347656 | 40.6507453918457 | -73.97756958007812 | 40.684326171875 | 1 | N | 1 | 18.5 | 0.0 | 0.5 | 0.3 | 0.0 | 0.0 | null | 19.3 | 1 | 2016 | 1 | 4.209285714285715 | 24.0 | 0.0 | 40.06896551724138 |
2 | 2016-01-26T08:40:46.000+0000 | 2016-01-26T09:24:24.000+0000 | 2 | 6.21 | null | null | -73.93299102783203 | 40.679508209228516 | -74.00007629394531 | 40.73252868652344 | 1 | N | 1 | 29.5 | 0.0 | 0.5 | 0.3 | 6.06 | 0.0 | null | 36.36 | 1 | 2016 | 1 | 4.209285714285715 | 24.0 | 0.0 | 40.06896551724138 |
"
325 | ]
326 | }
327 | }
328 | ],
329 | "execution_count": 26
330 | }
331 | ],
332 | "metadata": {
333 | "kernelspec": {
334 | "display_name": "Python 3",
335 | "language": "python",
336 | "name": "python3"
337 | },
338 | "language_info": {
339 | "mimetype": "text/x-python",
340 | "name": "python",
341 | "pygments_lexer": "ipython3",
342 | "codemirror_mode": {
343 | "name": "ipython",
344 | "version": 3
345 | },
346 | "version": "3.6.7",
347 | "nbconvert_exporter": "python",
348 | "file_extension": ".py"
349 | },
350 |     "name": "03-nyc-taxi-join-weather-in-spark",
351 | "notebookId": 2741195231538819
352 | },
353 | "nbformat": 4,
354 | "nbformat_minor": 0
355 | }
356 |
--------------------------------------------------------------------------------
/tutorials/data-join/04-nyc-taxi-join-weather-in-pandas.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "source": [
6 | "Copyright (c) Microsoft Corporation. All rights reserved.\n\nLicensed under the MIT License."
7 | ],
8 | "metadata": {}
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "source": [
13 | "# Tutorial: Load TAXI data and enrich it with Weather data in Pandas DataFrame"
14 | ],
15 | "metadata": {}
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "source": [
20 | "Install azureml-opendatasets package"
21 | ],
22 | "metadata": {}
23 | },
24 | {
25 | "cell_type": "code",
26 | "source": [
27 | "!pip uninstall -y azureml-opendatasets\n!pip install azureml-opendatasets"
28 | ],
29 | "metadata": {
30 | "scrolled": true
31 | },
32 | "outputs": [],
33 | "execution_count": 4
34 | },
35 | {
36 | "cell_type": "markdown",
37 | "source": [
38 | "Begin by creating a dataframe to hold the taxi data. When working in a non-Spark environment, Open Datasets only allows downloading one month of data at a time with certain classes to avoid MemoryError with large datasets. To download 2 months of taxi data, iteratively fetch one month at a time, and before appending it to green_taxi_df randomly sample 2000 records from the specific month to avoid bloating the dataframe."
39 | ],
40 | "metadata": {}
41 | },
42 | {
43 | "cell_type": "code",
44 | "source": [
45 | "import pandas as pd\nfrom datetime import datetime\nfrom dateutil.relativedelta import relativedelta\nfrom azureml.opendatasets import NycTlcGreen\n\n\ngreen_taxi_df = pd.DataFrame([])\nstart = datetime.strptime(\"1/1/2016\", \"%m/%d/%Y\")\nend = datetime.strptime(\"1/31/2016\", \"%m/%d/%Y\")\n\nfor sample_month in range(2):\n temp_df_green = NycTlcGreen(\n start + relativedelta(months=sample_month),\n end + relativedelta(months=sample_month)).to_pandas_dataframe()\n green_taxi_df = green_taxi_df.append(temp_df_green.sample(2000))"
46 | ],
47 | "metadata": {
48 | "scrolled": true
49 | },
50 | "outputs": [
51 | {
52 | "metadata": {},
53 | "output_type": "display_data",
54 | "data": {
55 | "text/html": [
56 | "\nActivityStarted, to_pandas_dataframe\nActivityStarted, to_pandas_dataframe_in_worker\nTarget paths: ['/puYear=2016/puMonth=1/']\nLooking for parquet files...\nReading them into Pandas dataframe...\nReading green/puYear=2016/puMonth=1/part-00119-tid-6037743401120983271-619c4849-c957-4290-a1b8-66832cb385b6-12538.c000.snappy.parquet under container nyctlc\nDone.\nActivityCompleted: Activity=to_pandas_dataframe_in_worker, HowEnded=Success, Duration=5911.64 [ms]\nActivityCompleted: Activity=to_pandas_dataframe, HowEnded=Success, Duration=5913.04 [ms]\nActivityStarted, to_pandas_dataframe\nActivityStarted, to_pandas_dataframe_in_worker\nTarget paths: ['/puYear=2016/puMonth=2/']\nLooking for parquet files...\nReading them into Pandas dataframe...\nReading green/puYear=2016/puMonth=2/part-00060-tid-6037743401120983271-619c4849-c957-4290-a1b8-66832cb385b6-12479.c000.snappy.parquet under container nyctlc\nDone.\nActivityCompleted: Activity=to_pandas_dataframe_in_worker, HowEnded=Success, Duration=5719.32 [ms]\nActivityCompleted: Activity=to_pandas_dataframe, HowEnded=Success, Duration=5720.51 [ms]\n
"
57 | ]
58 | }
59 | }
60 | ],
61 | "execution_count": 6
62 | },
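A note for anyone re-running this cell on a newer environment: DataFrame.append was deprecated and then removed in pandas 2.0. A sketch of the same loop using pd.concat instead, with the same NycTlcGreen class and dates as above:

```python
# Same monthly download-and-sample loop, written for pandas >= 2.0,
# where DataFrame.append no longer exists.
import pandas as pd
from datetime import datetime
from dateutil.relativedelta import relativedelta
from azureml.opendatasets import NycTlcGreen

start = datetime.strptime("1/1/2016", "%m/%d/%Y")
end = datetime.strptime("1/31/2016", "%m/%d/%Y")

monthly_samples = []
for sample_month in range(2):
    temp_df_green = NycTlcGreen(
        start + relativedelta(months=sample_month),
        end + relativedelta(months=sample_month)).to_pandas_dataframe()
    monthly_samples.append(temp_df_green.sample(2000))  # 2000 trips per month

green_taxi_df = pd.concat(monthly_samples)
```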
63 | {
64 | "cell_type": "markdown",
65 | "source": [
66 | "Save a copy of the raw_columns name list for clean up at the last step."
67 | ],
68 | "metadata": {}
69 | },
70 | {
71 | "cell_type": "code",
72 | "source": [
73 | "raw_columns = list(green_taxi_df.columns)"
74 | ],
75 | "metadata": {},
76 | "outputs": [
77 | {
78 | "metadata": {},
79 | "output_type": "display_data",
80 | "data": {
81 | "text/html": [
82 | "\n"
83 | ]
84 | }
85 | }
86 | ],
87 | "execution_count": 8
88 | },
89 | {
90 | "cell_type": "markdown",
91 | "source": [
92 | "Get mean values of pickupLatitude and pickupLongitude"
93 | ],
94 | "metadata": {}
95 | },
96 | {
97 | "cell_type": "code",
98 | "source": [
99 | "info = green_taxi_df.describe()\ninfo['pickupLatitude']['mean'], info['pickupLongitude']['mean']"
100 | ],
101 | "metadata": {},
102 | "outputs": [
103 | {
104 | "metadata": {},
105 | "output_type": "display_data",
106 | "data": {
107 | "text/html": [
108 | "\nOut[4]: (40.707583425521854, -73.861531415939325)\n
"
109 | ]
110 | }
111 | }
112 | ],
113 | "execution_count": 10
114 | },
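describe() computes a full summary table just to read off two numbers; selecting the means directly is equivalent. A tiny sketch:

```python
# Equivalent without the full describe() summary.
mean_lat = green_taxi_df['pickupLatitude'].mean()
mean_long = green_taxi_df['pickupLongitude'].mean()
mean_lat, mean_long
```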
115 | {
116 | "cell_type": "markdown",
117 | "source": [
118 | "Drop the rows that both lat/long are NaN, especially all columns in the first row are NaN."
119 | ],
120 | "metadata": {}
121 | },
122 | {
123 | "cell_type": "code",
124 | "source": [
125 | "green_taxi_df = green_taxi_df.dropna(how='all', subset=['lpepPickupDatetime', 'pickupLatitude', 'pickupLongitude'])"
126 | ],
127 | "metadata": {},
128 | "outputs": [
129 | {
130 | "metadata": {},
131 | "output_type": "display_data",
132 | "data": {
133 | "text/html": [
134 | "\n"
135 | ]
136 | }
137 | }
138 | ],
139 | "execution_count": 12
140 | },
141 | {
142 | "cell_type": "markdown",
143 | "source": [
144 | "Make all pickupLatitude and pickupLongitude be the center location of the city."
145 | ],
146 | "metadata": {}
147 | },
148 | {
149 | "cell_type": "code",
150 | "source": [
151 | "def set_lat(x):\n return info['pickupLatitude']['mean']\ndef set_long(x):\n return info['pickupLongitude']['mean']\ngreen_taxi_df['pickupLatitude'] = green_taxi_df[['pickupLatitude']].apply(set_lat, axis=1)\ngreen_taxi_df['pickupLongitude'] = green_taxi_df[['pickupLongitude']].apply(set_long, axis=1)\ngreen_taxi_df.head(5)"
152 | ],
153 | "metadata": {},
154 | "outputs": [
155 | {
156 | "metadata": {},
157 | "output_type": "display_data",
158 | "data": {
159 | "text/html": [
160 | "\nOut[6]: \n vendorID lpepPickupDatetime ... totalAmount tripType\n456868 1 2016-01-09 20:13:16 ... 10.30 1.0\n802793 2 2016-01-16 23:55:50 ... 11.80 1.0\n1341644 2 2016-01-04 16:30:30 ... 11.30 1.0\n49923 2 2016-01-19 02:37:18 ... 5.30 1.0\n750222 2 2016-01-16 01:57:20 ... 8.16 1.0\n\n[5 rows x 23 columns]\n
"
161 | ]
162 | }
163 | }
164 | ],
165 | "execution_count": 14
166 | },
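Since both columns are simply overwritten with constants, a plain scalar assignment does the same work as the row-wise apply above. A sketch, using the info summary from earlier:

```python
# Vectorized equivalent of the apply-based cell above: assigning a
# scalar broadcasts it to every row.
green_taxi_df['pickupLatitude'] = info['pickupLatitude']['mean']
green_taxi_df['pickupLongitude'] = info['pickupLongitude']['mean']
```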
167 | {
168 | "cell_type": "markdown",
169 | "source": [
170 | "The original index can fail the initialization of class LocationTimeCustomerData at below, so this is a workaround to add a monotonically increasing id column."
171 | ],
172 | "metadata": {}
173 | },
174 | {
175 | "cell_type": "code",
176 | "source": [
177 | "green_taxi_df['idx'] = list(range(len(green_taxi_df.index)))\ngreen_taxi_df_idx = green_taxi_df.set_index('idx')\ngreen_taxi_df_idx.head(5)"
178 | ],
179 | "metadata": {},
180 | "outputs": [
181 | {
182 | "metadata": {},
183 | "output_type": "display_data",
184 | "data": {
185 | "text/html": [
186 | "\nOut[7]: \n vendorID lpepPickupDatetime ... totalAmount tripType\nidx ... \n0 1 2016-01-09 20:13:16 ... 10.30 1.0\n1 2 2016-01-16 23:55:50 ... 11.80 1.0\n2 2 2016-01-04 16:30:30 ... 11.30 1.0\n3 2 2016-01-19 02:37:18 ... 5.30 1.0\n4 2 2016-01-16 01:57:20 ... 8.16 1.0\n\n[5 rows x 23 columns]\n
"
187 | ]
188 | }
189 | }
190 | ],
191 | "execution_count": 16
192 | },
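The same clean 0..n-1 index can also be obtained with reset_index. A sketch (drop=True discards the old index instead of keeping it as a column):

```python
# Alternative workaround: let pandas assign a fresh monotonically
# increasing RangeIndex instead of building an 'idx' column by hand.
green_taxi_df_idx = green_taxi_df.reset_index(drop=True)
green_taxi_df_idx.index.name = 'idx'
```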
193 | {
194 | "cell_type": "markdown",
195 | "source": [
196 | "Initialize LocationTimeCustomerData using pandas dataframe green_taxi."
197 | ],
198 | "metadata": {}
199 | },
200 | {
201 | "cell_type": "code",
202 | "source": [
203 | "from azureml.opendatasets.accessories.location_data import LatLongColumn\nfrom azureml.opendatasets.accessories.location_time_customer_data \\\n import LocationTimeCustomerData\nfrom azureml.opendatasets import NoaaIsdWeather\n\n\ngreen_taxi = LocationTimeCustomerData(\n green_taxi_df_idx,\n LatLongColumn('pickupLatitude', 'pickupLongitude'),\n 'lpepPickupDatetime')"
204 | ],
205 | "metadata": {
206 | "scrolled": true
207 | },
208 | "outputs": [
209 | {
210 | "metadata": {},
211 | "output_type": "display_data",
212 | "data": {
213 | "text/html": [
214 | "\n"
215 | ]
216 | }
217 | }
218 | ],
219 | "execution_count": 18
220 | },
221 | {
222 | "cell_type": "markdown",
223 | "source": [
224 | "Initialize NoaaIsdWeather class, get enricher from it, and enrich the taxi data without aggregation\n\nThe logic for join:\n\nThe join logic for Pandas version is using cKDTree to accelerate the speed of the process. We gather the public weather dataset as long/lat point array, pass it to create cKDTree. Then gather the customer dataset as long/lat point array, pass it to cKDTree query function, to find the closest point in cKDTree. After querying cKDTree, we join public weather dataset with customer dataset by the querying result, then grant ranking group id."
225 | ],
226 | "metadata": {}
227 | },
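To make that description concrete, here is a minimal standalone sketch of the nearest-neighbor step with scipy's cKDTree. It is illustrative only: the coordinates below are made up, and the real matching happens inside the enricher called in the next cell.

```python
# Illustrative sketch of the cKDTree matching described above; not the
# enricher's actual code, and the coordinates are hypothetical.
import numpy as np
from scipy.spatial import cKDTree

# Public weather stations as (lat, long) points.
station_points = np.array([[40.71, -74.01], [40.77, -73.87], [40.64, -73.78]])
tree = cKDTree(station_points)

# Customer (taxi pickup) points.
pickup_points = np.array([[40.7076, -73.8615], [40.7139, -73.9560]])

# For each pickup, the index of the closest weather station.
distances, station_idx = tree.query(pickup_points, k=1)
print(station_idx)  # each trip can now be joined to its nearest station
```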
228 | {
229 | "cell_type": "code",
230 | "source": [
231 | "weather = NoaaIsdWeather(\n cols=[\"temperature\", \"precipTime\", \"precipDepth\", \"snowDepth\"],\n start_date=datetime(2016, 1, 1, 0, 0),\n end_date=datetime(2016, 2, 28, 23, 59))\nweather_enricher = weather.get_enricher()\nnew_green_taxi, processed_weather = weather_enricher.enrich_customer_data_no_agg(\n customer_data_object=green_taxi,\n location_match_granularity=1,\n time_round_granularity='day')"
232 | ],
233 | "metadata": {
234 | "scrolled": false
235 | },
236 | "outputs": [
237 | {
238 | "metadata": {},
239 | "output_type": "display_data",
240 | "data": {
241 | "text/html": [
242 | "\nActivityStarted, get_enricher\nActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=1.77 [ms]\nActivityStarted, enrich_customer_data_no_agg\nActivityStarted, enrich\nTarget paths: ['/year=2016/month=1/', '/year=2016/month=2/']\nLooking for parquet files...\nReading them into Pandas dataframe...\nReading ISDWeather/year=2016/month=1/part-00005-tid-6700213360605767691-4491b75c-f137-489b-b5df-4204b9326fda-110.c000.snappy.parquet under container isdweatherdatacontainer\nReading ISDWeather/year=2016/month=2/part-00011-tid-6700213360605767691-4491b75c-f137-489b-b5df-4204b9326fda-116.c000.snappy.parquet under container isdweatherdatacontainer\nDone.\nActivityStarted, _get_closest_location_kdTree\nActivityCompleted: Activity=_get_closest_location_kdTree, HowEnded=Success, Duration=231.73 [ms]\nActivityCompleted: Activity=enrich, HowEnded=Success, Duration=212029.49 [ms]\nActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=212298.99 [ms]\n
"
243 | ]
244 | }
245 | }
246 | ],
247 | "execution_count": 20
248 | },
249 | {
250 | "cell_type": "markdown",
251 | "source": [
252 | "Preview the pandas dataframe new_green_taxi.data"
253 | ],
254 | "metadata": {}
255 | },
256 | {
257 | "cell_type": "code",
258 | "source": [
259 | "new_green_taxi.data.head(3)"
260 | ],
261 | "metadata": {},
262 | "outputs": [
263 | {
264 | "metadata": {},
265 | "output_type": "display_data",
266 | "data": {
267 | "text/html": [
268 | "\nOut[10]: \n vendorID ... customer_join_timeifj7t\n0 1 ... 2016-01-09\n1 2 ... 2016-01-16\n2 2 ... 2016-01-04\n\n[3 rows x 25 columns]\n
"
269 | ]
270 | }
271 | }
272 | ],
273 | "execution_count": 22
274 | },
275 | {
276 | "cell_type": "markdown",
277 | "source": [
278 | "Define a dict `aggregations` to define how to aggregate each field at a hour level. For `snowDepth` and `temperature` we'll take the mean and for `precipTime` and `precipDepth` we'll take the hourly maximum. Use the groupby() function along with the aggregations to group data."
279 | ],
280 | "metadata": {}
281 | },
282 | {
283 | "cell_type": "code",
284 | "source": [
285 | "aggregations = {\n \"snowDepth\": \"mean\",\n \"precipTime\": \"max\",\n \"temperature\": \"mean\",\n \"precipDepth\": \"max\"}"
286 | ],
287 | "metadata": {},
288 | "outputs": [
289 | {
290 | "metadata": {},
291 | "output_type": "display_data",
292 | "data": {
293 | "text/html": [
294 | "\n"
295 | ]
296 | }
297 | }
298 | ],
299 | "execution_count": 24
300 | },
301 | {
302 | "cell_type": "markdown",
303 | "source": [
304 | "The keys (`public_rankgroup`, `public_join_time`, `customer_rankgroup`, `customer_join_time`) used by groupby() and later merge() must be hacked here due to the current design."
305 | ],
306 | "metadata": {}
307 | },
308 | {
309 | "cell_type": "code",
310 | "source": [
311 | "public_rankgroup = processed_weather.id\n\npublic_join_time = [\n s for s in list(processed_weather.data.columns)\n if s.startswith('ds_join_time')][0]\n\ncustomer_rankgroup = weather_enricher.location_selector.customer_rankgroup\n\ncustomer_join_time = [\n s for s in list(new_green_taxi.data.columns)\n if type(s) is str and s.startswith('customer_join_time')][0]\n\nweather_df_grouped = processed_weather.data.groupby(by=[public_rankgroup, public_join_time]).agg(aggregations)\nweather_df_grouped.head(3)"
312 | ],
313 | "metadata": {},
314 | "outputs": [
315 | {
316 | "metadata": {},
317 | "output_type": "display_data",
318 | "data": {
319 | "text/html": [
320 | "\nOut[12]: \n snowDepth ... precipDepth\npublic_rankgroupbshjb ds_join_time73425 ... \n1 2016-01-01 0.0 ... 8.0\n 2016-01-02 0.0 ... 0.0\n 2016-01-03 0.0 ... 0.0\n\n[3 rows x 4 columns]\n
"
321 | ]
322 | }
323 | }
324 | ],
325 | "execution_count": 26
326 | },
327 | {
328 | "cell_type": "markdown",
329 | "source": [
330 | "Join the final dataframe, and preview the joined result."
331 | ],
332 | "metadata": {}
333 | },
334 | {
335 | "cell_type": "code",
336 | "source": [
337 | "joined_dataset = new_green_taxi.data.merge(\n weather_df_grouped,\n left_on=[customer_rankgroup, customer_join_time],\n right_on=[public_rankgroup, public_join_time],\n how='left')\n\nfinal_df = joined_dataset[raw_columns + [\n \"temperature\", \"precipTime\", \"precipDepth\", \"snowDepth\"]]\nfinal_df.head(5)"
338 | ],
339 | "metadata": {},
340 | "outputs": [
341 | {
342 | "metadata": {},
343 | "output_type": "display_data",
344 | "data": {
345 | "text/html": [
346 | "\nOut[13]: \n vendorID lpepPickupDatetime ... precipDepth snowDepth\n0 1 2016-01-09 20:13:16 ... 0.0 0.0\n1 2 2016-01-16 23:55:50 ... 61.0 0.0\n2 2 2016-01-04 16:30:30 ... 0.0 0.0\n3 2 2016-01-19 02:37:18 ... 0.0 0.0\n4 2 2016-01-16 01:57:20 ... 61.0 0.0\n\n[5 rows x 27 columns]\n
"
347 | ]
348 | }
349 | }
350 | ],
351 | "execution_count": 28
352 | },
353 | {
354 | "cell_type": "markdown",
355 | "source": [
356 | "Check the join success rate."
357 | ],
358 | "metadata": {}
359 | },
360 | {
361 | "cell_type": "code",
362 | "source": [
363 | "final_df.info()"
364 | ],
365 | "metadata": {},
366 | "outputs": [
367 | {
368 | "metadata": {},
369 | "output_type": "display_data",
370 | "data": {
371 | "text/html": [
372 | "\n<class 'pandas.core.frame.DataFrame'>\nInt64Index: 4000 entries, 0 to 3999\nData columns (total 27 columns):\nvendorID 4000 non-null int32\nlpepPickupDatetime 4000 non-null datetime64[ns]\nlpepDropoffDatetime 4000 non-null datetime64[ns]\npassengerCount 4000 non-null int32\ntripDistance 4000 non-null float64\npuLocationId 0 non-null object\ndoLocationId 0 non-null object\npickupLongitude 4000 non-null float64\npickupLatitude 4000 non-null float64\ndropoffLongitude 4000 non-null float64\ndropoffLatitude 4000 non-null float64\nrateCodeID 4000 non-null int32\nstoreAndFwdFlag 4000 non-null object\npaymentType 4000 non-null int32\nfareAmount 4000 non-null float64\nextra 4000 non-null float64\nmtaTax 4000 non-null float64\nimprovementSurcharge 4000 non-null object\ntipAmount 4000 non-null float64\ntollsAmount 4000 non-null float64\nehailFee 0 non-null float64\ntotalAmount 4000 non-null float64\ntripType 4000 non-null float64\ntemperature 4000 non-null float64\nprecipTime 4000 non-null float64\nprecipDepth 4000 non-null float64\nsnowDepth 4000 non-null float64\ndtypes: datetime64[ns](2), float64(17), int32(4), object(4)\nmemory usage: 812.5+ KB\n
"
373 | ]
374 | }
375 | }
376 | ],
377 | "execution_count": 30
378 | }
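The info() output above shows 4000 non-null values in every weather column, i.e. the join matched every row. An explicit check, as a small sketch over final_df:

```python
# Fraction of taxi rows that received weather values from the join.
weather_cols = ["temperature", "precipTime", "precipDepth", "snowDepth"]
join_success_rate = final_df[weather_cols].notnull().all(axis=1).mean()
print("Join success rate: {:.1%}".format(join_success_rate))  # expect 100.0%
```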
379 | ],
380 | "metadata": {
381 | "kernelspec": {
382 | "display_name": "Python 3",
383 | "language": "python",
384 | "name": "python3"
385 | },
386 | "language_info": {
387 | "mimetype": "text/x-python",
388 | "name": "python",
389 | "pygments_lexer": "ipython3",
390 | "codemirror_mode": {
391 | "name": "ipython",
392 | "version": 3
393 | },
394 | "version": "3.6.7",
395 | "nbconvert_exporter": "python",
396 | "file_extension": ".py"
397 | },
398 | "name": "04-nyc-taxi-join-weather-in-pandas",
399 | "notebookId": 1709144033725344
400 | },
401 | "nbformat": 4,
402 | "nbformat_minor": 0
403 | }
404 |
--------------------------------------------------------------------------------
/tutorials/energy-join/01-energy-join-weather-in-pandas.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Load nyc_energy and enrich it with weather data\n",
8 | "\n",
9 | "In this notebook, we try to enrich the NYC Energy data in Jupyter Notebook in a scalable way.\n",
10 | "We enrich the input data by month, put the monthly enriched data in the temp folder, and save the final result in the current folder every time we have done one month.\n",
11 | "\n",
12 | "* Load csv file which is downloaded from: https://notebooks.azure.com/frlazzeri/projects/automatedml-ms-build/html/nyc_energy.csv\n",
13 | "* Time range: 1/1/2012 to 8/12/2017\n",
14 | "* Location: 'PORT AUTH DOWNTN MANHATTAN WALL ST' station at lat: 40.701, long: -74.009"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 1,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "# install packages if it's not availble.\n",
24 | "\n",
25 | "# !pip uninstall -y azureml-opendatasets\n",
26 | "# !pip install azureml-opendatasets"
27 | ]
28 | },
29 | {
30 | "cell_type": "markdown",
31 | "metadata": {},
32 | "source": [
33 | "### Initialize global variables."
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 2,
39 | "metadata": {},
40 | "outputs": [
41 | {
42 | "data": {
43 | "text/plain": [
44 | "(datetime.datetime(2012, 1, 1, 0, 0), datetime.datetime(2017, 8, 12, 23, 59))"
45 | ]
46 | },
47 | "execution_count": 2,
48 | "metadata": {},
49 | "output_type": "execute_result"
50 | }
51 | ],
52 | "source": [
53 | "from datetime import datetime\n",
54 | "\n",
55 | "\n",
56 | "start_date = datetime(2012, 1, 1, 0, 0)\n",
57 | "end_date = datetime(2017, 8, 12, 23, 59)\n",
58 | "\n",
59 | "start_date, end_date"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": 3,
65 | "metadata": {},
66 | "outputs": [
67 | {
68 | "data": {
69 | "text/plain": [
70 | "68"
71 | ]
72 | },
73 | "execution_count": 3,
74 | "metadata": {},
75 | "output_type": "execute_result"
76 | }
77 | ],
78 | "source": [
79 | "from datetime import timedelta\n",
80 | "from dateutil.relativedelta import relativedelta\n",
81 | "\n",
82 | "import math\n",
83 | "\n",
84 | "\n",
85 | "r = relativedelta(end_date, start_date)\n",
86 | "months = r.years * 12 + r.months + math.floor((r.days + 30)/31)\n",
87 | "months"
88 | ]
89 | },
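The floor term rounds the trailing partial month up to a whole month: relativedelta here yields years=5, months=7, days=11, so 5*12 + 7 + floor((11+30)/31) = 68. A quick check of that arithmetic, as a sketch:

```python
# Verify the month count: the partial month (Aug 1-12, 2017) counts as one.
import math
from datetime import datetime
from dateutil.relativedelta import relativedelta

r = relativedelta(datetime(2017, 8, 12, 23, 59), datetime(2012, 1, 1, 0, 0))
assert (r.years, r.months, r.days) == (5, 7, 11)
assert r.years * 12 + r.months + math.floor((r.days + 30) / 31) == 68
```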
90 | {
91 | "cell_type": "code",
92 | "execution_count": 4,
93 | "metadata": {},
94 | "outputs": [
95 | {
96 | "data": {
97 | "text/plain": [
98 | "(40.701, -74.009)"
99 | ]
100 | },
101 | "execution_count": 4,
102 | "metadata": {},
103 | "output_type": "execute_result"
104 | }
105 | ],
106 | "source": [
107 | "lat, long = 40.701, -74.009\n",
108 | "lat, long"
109 | ]
110 | },
111 | {
112 | "cell_type": "markdown",
113 | "metadata": {},
114 | "source": [
115 | "### Load ``\"./nyc_energy.csv\"`` (download and save to local) and preview the data."
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": 5,
121 | "metadata": {},
122 | "outputs": [
123 | {
124 | "data": {
125 | "text/html": [
126 | "\n",
127 | "\n",
140 | "
\n",
141 | " \n",
142 | " \n",
143 | " | \n",
144 | " timeStamp | \n",
145 | " demand | \n",
146 | " lat | \n",
147 | " long | \n",
148 | "
\n",
149 | " \n",
150 | " \n",
151 | " \n",
152 | " 0 | \n",
153 | " 2012-01-01 00:00:00 | \n",
154 | " 4937.5 | \n",
155 | " 40.701 | \n",
156 | " -74.009 | \n",
157 | "
\n",
158 | " \n",
159 | " 1 | \n",
160 | " 2012-01-01 01:00:00 | \n",
161 | " 4752.1 | \n",
162 | " 40.701 | \n",
163 | " -74.009 | \n",
164 | "
\n",
165 | " \n",
166 | " 2 | \n",
167 | " 2012-01-01 02:00:00 | \n",
168 | " 4542.6 | \n",
169 | " 40.701 | \n",
170 | " -74.009 | \n",
171 | "
\n",
172 | " \n",
173 | " 3 | \n",
174 | " 2012-01-01 03:00:00 | \n",
175 | " 4357.7 | \n",
176 | " 40.701 | \n",
177 | " -74.009 | \n",
178 | "
\n",
179 | " \n",
180 | " 4 | \n",
181 | " 2012-01-01 04:00:00 | \n",
182 | " 4275.5 | \n",
183 | " 40.701 | \n",
184 | " -74.009 | \n",
185 | "
\n",
186 | " \n",
187 | "
\n",
188 | "
"
189 | ],
190 | "text/plain": [
191 | " timeStamp demand lat long\n",
192 | "0 2012-01-01 00:00:00 4937.5 40.701 -74.009\n",
193 | "1 2012-01-01 01:00:00 4752.1 40.701 -74.009\n",
194 | "2 2012-01-01 02:00:00 4542.6 40.701 -74.009\n",
195 | "3 2012-01-01 03:00:00 4357.7 40.701 -74.009\n",
196 | "4 2012-01-01 04:00:00 4275.5 40.701 -74.009"
197 | ]
198 | },
199 | "execution_count": 5,
200 | "metadata": {},
201 | "output_type": "execute_result"
202 | }
203 | ],
204 | "source": [
205 | "from pandas import read_csv\n",
206 | "\n",
207 | "\n",
208 | "df = read_csv('./nyc_energy.csv').drop(columns=['precip', 'temp'], axis=1)\n",
209 | "df['lat'] = lat\n",
210 | "df['long'] = long\n",
211 | "df.head(5)"
212 | ]
213 | },
214 | {
215 | "cell_type": "markdown",
216 | "metadata": {},
217 | "source": [
218 | "### Extend the timeStamp column so that we can filter it easily."
219 | ]
220 | },
221 | {
222 | "cell_type": "code",
223 | "execution_count": 6,
224 | "metadata": {},
225 | "outputs": [
226 | {
227 | "data": {
228 | "text/html": [
229 | "\n",
230 | "\n",
243 | "
\n",
244 | " \n",
245 | " \n",
246 | " | \n",
247 | " timeStamp | \n",
248 | " demand | \n",
249 | " lat | \n",
250 | " long | \n",
251 | " new_datetime | \n",
252 | "
\n",
253 | " \n",
254 | " \n",
255 | " \n",
256 | " 0 | \n",
257 | " 2012-01-01 00:00:00 | \n",
258 | " 4937.5 | \n",
259 | " 40.701 | \n",
260 | " -74.009 | \n",
261 | " 2012-01-01 00:00:00 | \n",
262 | "
\n",
263 | " \n",
264 | " 1 | \n",
265 | " 2012-01-01 01:00:00 | \n",
266 | " 4752.1 | \n",
267 | " 40.701 | \n",
268 | " -74.009 | \n",
269 | " 2012-01-01 01:00:00 | \n",
270 | "
\n",
271 | " \n",
272 | " 2 | \n",
273 | " 2012-01-01 02:00:00 | \n",
274 | " 4542.6 | \n",
275 | " 40.701 | \n",
276 | " -74.009 | \n",
277 | " 2012-01-01 02:00:00 | \n",
278 | "
\n",
279 | " \n",
280 | " 3 | \n",
281 | " 2012-01-01 03:00:00 | \n",
282 | " 4357.7 | \n",
283 | " 40.701 | \n",
284 | " -74.009 | \n",
285 | " 2012-01-01 03:00:00 | \n",
286 | "
\n",
287 | " \n",
288 | " 4 | \n",
289 | " 2012-01-01 04:00:00 | \n",
290 | " 4275.5 | \n",
291 | " 40.701 | \n",
292 | " -74.009 | \n",
293 | " 2012-01-01 04:00:00 | \n",
294 | "
\n",
295 | " \n",
296 | "
\n",
297 | "
"
298 | ],
299 | "text/plain": [
300 | " timeStamp demand lat long new_datetime\n",
301 | "0 2012-01-01 00:00:00 4937.5 40.701 -74.009 2012-01-01 00:00:00\n",
302 | "1 2012-01-01 01:00:00 4752.1 40.701 -74.009 2012-01-01 01:00:00\n",
303 | "2 2012-01-01 02:00:00 4542.6 40.701 -74.009 2012-01-01 02:00:00\n",
304 | "3 2012-01-01 03:00:00 4357.7 40.701 -74.009 2012-01-01 03:00:00\n",
305 | "4 2012-01-01 04:00:00 4275.5 40.701 -74.009 2012-01-01 04:00:00"
306 | ]
307 | },
308 | "execution_count": 6,
309 | "metadata": {},
310 | "output_type": "execute_result"
311 | }
312 | ],
313 | "source": [
314 | "from dateutil import parser\n",
315 | "\n",
316 | "\n",
317 | "df['new_datetime'] = df['timeStamp'].apply(parser.parse)\n",
318 | "raw_columns = list(df.columns)\n",
319 | "df.head(5)"
320 | ]
321 | },
322 | {
323 | "cell_type": "markdown",
324 | "metadata": {},
325 | "source": [
326 | "### Create the temp folder in which we save the enriched data per month."
327 | ]
328 | },
329 | {
330 | "cell_type": "code",
331 | "execution_count": 7,
332 | "metadata": {},
333 | "outputs": [],
334 | "source": [
335 | "!if [ ! -d \"./temp\" ]; then mkdir temp; fi"
336 | ]
337 | },
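The shell one-liner above assumes a Unix-style environment; a portable Python equivalent, as a sketch:

```python
# Portable alternative to the shell command above.
import os
os.makedirs('./temp', exist_ok=True)
```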
338 | {
339 | "cell_type": "markdown",
340 | "metadata": {},
341 | "source": [
342 | "### Enriching..."
343 | ]
344 | },
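Before the long log below, here is a hypothetical sketch of the per-month driver described in the intro. The real implementation is the notebook's own code cell (its source follows the logged output); the helper structure and the output file name here are illustrative, the enricher calls mirror the taxi notebook above, and the time_round_granularity value is an assumption.

```python
# Hypothetical sketch of the monthly enrichment loop (illustrative only;
# the notebook's actual cell follows the logged output below). Relies on
# months, start_date, end_date, and df from the earlier cells.
import pandas as pd
from dateutil.relativedelta import relativedelta
from azureml.opendatasets import NoaaIsdWeather
from azureml.opendatasets.accessories.location_data import LatLongColumn
from azureml.opendatasets.accessories.location_time_customer_data \
    import LocationTimeCustomerData

enriched_parts = []
for m in range(months):
    month_start = start_date + relativedelta(months=m)
    month_end = min(month_start + relativedelta(months=1), end_date)
    mask = (df['new_datetime'] >= month_start) & (df['new_datetime'] < month_end)

    customer = LocationTimeCustomerData(
        df[mask], LatLongColumn('lat', 'long'), 'new_datetime')
    weather = NoaaIsdWeather(
        cols=['temperature', 'precipTime', 'precipDepth', 'snowDepth'],
        start_date=month_start, end_date=month_end)
    enriched, _ = weather.get_enricher().enrich_customer_data_no_agg(
        customer_data_object=customer,
        location_match_granularity=1,
        time_round_granularity='hour')  # assumption; the taxi notebook used 'day'

    enriched.data.to_csv('./temp/enriched_%03d.csv' % m, index=False)
    enriched_parts.append(enriched.data)
    # Update the running final result after each month (hypothetical file name).
    pd.concat(enriched_parts).to_csv('./nyc_energy_enriched.csv', index=False)
```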
345 | {
346 | "cell_type": "code",
347 | "execution_count": 8,
348 | "metadata": {
349 | "scrolled": true
350 | },
351 | "outputs": [
352 | {
353 | "name": "stdout",
354 | "output_type": "stream",
355 | "text": [
356 | "[2019-04-29 09:00:03.181218] Start enriching...\n",
357 | "ActivityStarted, get_enricher\n",
358 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=21.29 [ms]\n",
359 | "ActivityStarted, enrich_customer_data_no_agg\n",
360 | "ActivityStarted, enrich\n",
361 | "Target paths: ['/year=2012/month=1/']\n",
362 | "Looking for parquet files...\n",
363 | "Reading them into Pandas dataframe...\n",
364 | "Reading ISDWeather/year=2012/month=1/part-00004-tid-7816671341480880202-0b49e80b-f206-4731-ab5a-61d53f99b595-57.c000.snappy.parquet under container isdweatherdatacontainer\n",
365 | "Done.\n",
366 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=84996.24 [ms]\n",
367 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=85094.1 [ms]\n",
368 | "ActivityStarted, get_enricher\n",
369 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=18.5 [ms]\n",
370 | "ActivityStarted, enrich_customer_data_no_agg\n",
371 | "ActivityStarted, enrich\n",
372 | "Target paths: ['/year=2012/month=2/']\n",
373 | "Looking for parquet files...\n",
374 | "Reading them into Pandas dataframe...\n",
375 | "Reading ISDWeather/year=2012/month=2/part-00011-tid-7816671341480880202-0b49e80b-f206-4731-ab5a-61d53f99b595-64.c000.snappy.parquet under container isdweatherdatacontainer\n",
376 | "Done.\n",
377 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=43456.5 [ms]\n",
378 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=43512.0 [ms]\n",
379 | "ActivityStarted, get_enricher\n",
380 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=16.26 [ms]\n",
381 | "ActivityStarted, enrich_customer_data_no_agg\n",
382 | "ActivityStarted, enrich\n",
383 | "Target paths: ['/year=2012/month=3/']\n",
384 | "Looking for parquet files...\n",
385 | "Reading them into Pandas dataframe...\n",
386 | "Reading ISDWeather/year=2012/month=3/part-00001-tid-7816671341480880202-0b49e80b-f206-4731-ab5a-61d53f99b595-54.c000.snappy.parquet under container isdweatherdatacontainer\n",
387 | "Done.\n",
388 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=45698.45 [ms]\n",
389 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=45803.06 [ms]\n",
390 | "ActivityStarted, get_enricher\n",
391 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=17.27 [ms]\n",
392 | "ActivityStarted, enrich_customer_data_no_agg\n",
393 | "ActivityStarted, enrich\n",
394 | "Target paths: ['/year=2012/month=4/']\n",
395 | "Looking for parquet files...\n",
396 | "Reading them into Pandas dataframe...\n",
397 | "Reading ISDWeather/year=2012/month=4/part-00005-tid-7816671341480880202-0b49e80b-f206-4731-ab5a-61d53f99b595-58.c000.snappy.parquet under container isdweatherdatacontainer\n",
398 | "Done.\n",
399 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=54456.14 [ms]\n",
400 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=54511.25 [ms]\n",
401 | "ActivityStarted, get_enricher\n",
402 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=19.45 [ms]\n",
403 | "ActivityStarted, enrich_customer_data_no_agg\n",
404 | "ActivityStarted, enrich\n",
405 | "Target paths: ['/year=2012/month=5/']\n",
406 | "Looking for parquet files...\n",
407 | "Reading them into Pandas dataframe...\n",
408 | "Reading ISDWeather/year=2012/month=5/part-00003-tid-7816671341480880202-0b49e80b-f206-4731-ab5a-61d53f99b595-56.c000.snappy.parquet under container isdweatherdatacontainer\n",
409 | "Done.\n",
410 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=44063.01 [ms]\n",
411 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=44118.81 [ms]\n",
412 | "ActivityStarted, get_enricher\n",
413 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=17.76 [ms]\n",
414 | "ActivityStarted, enrich_customer_data_no_agg\n",
415 | "ActivityStarted, enrich\n",
416 | "Target paths: ['/year=2012/month=6/']\n",
417 | "Looking for parquet files...\n",
418 | "Reading them into Pandas dataframe...\n",
419 | "Reading ISDWeather/year=2012/month=6/part-00010-tid-7816671341480880202-0b49e80b-f206-4731-ab5a-61d53f99b595-63.c000.snappy.parquet under container isdweatherdatacontainer\n",
420 | "Done.\n",
421 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=45127.85 [ms]\n",
422 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=45182.84 [ms]\n",
423 | "ActivityStarted, get_enricher\n",
424 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=17.56 [ms]\n",
425 | "ActivityStarted, enrich_customer_data_no_agg\n",
426 | "ActivityStarted, enrich\n",
427 | "Target paths: ['/year=2012/month=7/']\n",
428 | "Looking for parquet files...\n",
429 | "Reading them into Pandas dataframe...\n",
430 | "Reading ISDWeather/year=2012/month=7/part-00006-tid-7816671341480880202-0b49e80b-f206-4731-ab5a-61d53f99b595-59.c000.snappy.parquet under container isdweatherdatacontainer\n",
431 | "Done.\n",
432 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=75718.52 [ms]\n",
433 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=75780.16 [ms]\n",
434 | "ActivityStarted, get_enricher\n",
435 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=17.76 [ms]\n",
436 | "ActivityStarted, enrich_customer_data_no_agg\n",
437 | "ActivityStarted, enrich\n",
438 | "Target paths: ['/year=2012/month=8/']\n",
439 | "Looking for parquet files...\n",
440 | "Reading them into Pandas dataframe...\n",
441 | "Reading ISDWeather/year=2012/month=8/part-00007-tid-7816671341480880202-0b49e80b-f206-4731-ab5a-61d53f99b595-60.c000.snappy.parquet under container isdweatherdatacontainer\n",
442 | "Done.\n",
443 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=129134.28 [ms]\n",
444 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=129187.24 [ms]\n",
445 | "ActivityStarted, get_enricher\n",
446 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=21.15 [ms]\n",
447 | "ActivityStarted, enrich_customer_data_no_agg\n",
448 | "ActivityStarted, enrich\n",
449 | "Target paths: ['/year=2012/month=9/']\n",
450 | "Looking for parquet files...\n",
451 | "Reading them into Pandas dataframe...\n",
452 | "Reading ISDWeather/year=2012/month=9/part-00008-tid-7816671341480880202-0b49e80b-f206-4731-ab5a-61d53f99b595-61.c000.snappy.parquet under container isdweatherdatacontainer\n",
453 | "Done.\n",
454 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=42776.43 [ms]\n",
455 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=42830.6 [ms]\n",
456 | "ActivityStarted, get_enricher\n",
457 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=20.45 [ms]\n",
458 | "ActivityStarted, enrich_customer_data_no_agg\n",
459 | "ActivityStarted, enrich\n",
460 | "Target paths: ['/year=2012/month=10/']\n",
461 | "Looking for parquet files...\n",
462 | "Reading them into Pandas dataframe...\n",
463 | "Reading ISDWeather/year=2012/month=10/part-00002-tid-7816671341480880202-0b49e80b-f206-4731-ab5a-61d53f99b595-55.c000.snappy.parquet under container isdweatherdatacontainer\n",
464 | "Done.\n",
465 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=45559.84 [ms]\n",
466 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=45615.31 [ms]\n",
467 | "ActivityStarted, get_enricher\n",
468 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=17.28 [ms]\n",
469 | "ActivityStarted, enrich_customer_data_no_agg\n",
470 | "ActivityStarted, enrich\n",
471 | "Target paths: ['/year=2012/month=11/']\n",
472 | "Looking for parquet files...\n",
473 | "Reading them into Pandas dataframe...\n",
474 | "Reading ISDWeather/year=2012/month=11/part-00009-tid-7816671341480880202-0b49e80b-f206-4731-ab5a-61d53f99b595-62.c000.snappy.parquet under container isdweatherdatacontainer\n",
475 | "Done.\n",
476 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=44528.69 [ms]\n",
477 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=44578.58 [ms]\n",
478 | "ActivityStarted, get_enricher\n",
479 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=18.67 [ms]\n",
480 | "ActivityStarted, enrich_customer_data_no_agg\n",
481 | "ActivityStarted, enrich\n",
482 | "Target paths: ['/year=2012/month=12/']\n",
483 | "Looking for parquet files...\n",
484 | "Reading them into Pandas dataframe...\n",
485 | "Reading ISDWeather/year=2012/month=12/part-00000-tid-7816671341480880202-0b49e80b-f206-4731-ab5a-61d53f99b595-53.c000.snappy.parquet under container isdweatherdatacontainer\n",
486 | "Done.\n",
487 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=88248.81 [ms]\n",
488 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=88308.84 [ms]\n",
489 | "ActivityStarted, get_enricher\n",
490 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=17.71 [ms]\n",
491 | "ActivityStarted, enrich_customer_data_no_agg\n",
492 | "ActivityStarted, enrich\n",
493 | "Target paths: ['/year=2013/month=1/']\n",
494 | "Looking for parquet files...\n",
495 | "Reading them into Pandas dataframe...\n",
496 | "Reading ISDWeather/year=2013/month=1/part-00007-tid-236689213593784421-264283c4-dffb-42b8-9bbf-d912ec6814af-73.c000.snappy.parquet under container isdweatherdatacontainer\n",
497 | "Done.\n",
498 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=44205.78 [ms]\n"
499 | ]
500 | },
501 | {
502 | "name": "stdout",
503 | "output_type": "stream",
504 | "text": [
505 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=44269.14 [ms]\n",
506 | "ActivityStarted, get_enricher\n",
507 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=17.68 [ms]\n",
508 | "ActivityStarted, enrich_customer_data_no_agg\n",
509 | "ActivityStarted, enrich\n",
510 | "Target paths: ['/year=2013/month=2/']\n",
511 | "Looking for parquet files...\n",
512 | "Reading them into Pandas dataframe...\n",
513 | "Reading ISDWeather/year=2013/month=2/part-00011-tid-236689213593784421-264283c4-dffb-42b8-9bbf-d912ec6814af-77.c000.snappy.parquet under container isdweatherdatacontainer\n",
514 | "Done.\n",
515 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=62644.02 [ms]\n",
516 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=62696.92 [ms]\n",
517 | "ActivityStarted, get_enricher\n",
518 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=21.12 [ms]\n",
519 | "ActivityStarted, enrich_customer_data_no_agg\n",
520 | "ActivityStarted, enrich\n",
521 | "Target paths: ['/year=2013/month=3/']\n",
522 | "Looking for parquet files...\n",
523 | "Reading them into Pandas dataframe...\n",
524 | "Reading ISDWeather/year=2013/month=3/part-00008-tid-236689213593784421-264283c4-dffb-42b8-9bbf-d912ec6814af-74.c000.snappy.parquet under container isdweatherdatacontainer\n",
525 | "Done.\n",
526 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=63616.67 [ms]\n",
527 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=63668.2 [ms]\n",
528 | "ActivityStarted, get_enricher\n",
529 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=18.23 [ms]\n",
530 | "ActivityStarted, enrich_customer_data_no_agg\n",
531 | "ActivityStarted, enrich\n",
532 | "Target paths: ['/year=2013/month=4/']\n",
533 | "Looking for parquet files...\n",
534 | "Reading them into Pandas dataframe...\n",
535 | "Reading ISDWeather/year=2013/month=4/part-00010-tid-236689213593784421-264283c4-dffb-42b8-9bbf-d912ec6814af-76.c000.snappy.parquet under container isdweatherdatacontainer\n",
536 | "Done.\n",
537 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=44978.72 [ms]\n",
538 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=45061.97 [ms]\n",
539 | "ActivityStarted, get_enricher\n",
540 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=17.39 [ms]\n",
541 | "ActivityStarted, enrich_customer_data_no_agg\n",
542 | "ActivityStarted, enrich\n",
543 | "Target paths: ['/year=2013/month=5/']\n",
544 | "Looking for parquet files...\n",
545 | "Reading them into Pandas dataframe...\n",
546 | "Reading ISDWeather/year=2013/month=5/part-00003-tid-236689213593784421-264283c4-dffb-42b8-9bbf-d912ec6814af-69.c000.snappy.parquet under container isdweatherdatacontainer\n",
547 | "Done.\n",
548 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=44434.92 [ms]\n",
549 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=44490.32 [ms]\n",
550 | "ActivityStarted, get_enricher\n",
551 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=16.25 [ms]\n",
552 | "ActivityStarted, enrich_customer_data_no_agg\n",
553 | "ActivityStarted, enrich\n",
554 | "Target paths: ['/year=2013/month=6/']\n",
555 | "Looking for parquet files...\n",
556 | "Reading them into Pandas dataframe...\n",
557 | "Reading ISDWeather/year=2013/month=6/part-00005-tid-236689213593784421-264283c4-dffb-42b8-9bbf-d912ec6814af-71.c000.snappy.parquet under container isdweatherdatacontainer\n",
558 | "Done.\n",
559 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=42086.7 [ms]\n",
560 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=42142.35 [ms]\n",
561 | "ActivityStarted, get_enricher\n",
562 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=18.83 [ms]\n",
563 | "ActivityStarted, enrich_customer_data_no_agg\n",
564 | "ActivityStarted, enrich\n",
565 | "Target paths: ['/year=2013/month=7/']\n",
566 | "Looking for parquet files...\n",
567 | "Reading them into Pandas dataframe...\n",
568 | "Reading ISDWeather/year=2013/month=7/part-00004-tid-236689213593784421-264283c4-dffb-42b8-9bbf-d912ec6814af-70.c000.snappy.parquet under container isdweatherdatacontainer\n",
569 | "Done.\n",
570 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=41910.92 [ms]\n",
571 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=41964.88 [ms]\n",
572 | "ActivityStarted, get_enricher\n",
573 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=13.26 [ms]\n",
574 | "ActivityStarted, enrich_customer_data_no_agg\n",
575 | "ActivityStarted, enrich\n",
576 | "Target paths: ['/year=2013/month=8/']\n",
577 | "Looking for parquet files...\n",
578 | "Reading them into Pandas dataframe...\n",
579 | "Reading ISDWeather/year=2013/month=8/part-00001-tid-236689213593784421-264283c4-dffb-42b8-9bbf-d912ec6814af-67.c000.snappy.parquet under container isdweatherdatacontainer\n",
580 | "Done.\n",
581 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=46636.82 [ms]\n",
582 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=46692.14 [ms]\n",
583 | "ActivityStarted, get_enricher\n",
584 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=17.81 [ms]\n",
585 | "ActivityStarted, enrich_customer_data_no_agg\n",
586 | "ActivityStarted, enrich\n",
587 | "Target paths: ['/year=2013/month=9/']\n",
588 | "Looking for parquet files...\n",
589 | "Reading them into Pandas dataframe...\n",
590 | "Reading ISDWeather/year=2013/month=9/part-00006-tid-236689213593784421-264283c4-dffb-42b8-9bbf-d912ec6814af-72.c000.snappy.parquet under container isdweatherdatacontainer\n",
591 | "Done.\n",
592 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=46989.72 [ms]\n",
593 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=47071.96 [ms]\n",
594 | "ActivityStarted, get_enricher\n",
595 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=11.18 [ms]\n",
596 | "ActivityStarted, enrich_customer_data_no_agg\n",
597 | "ActivityStarted, enrich\n",
598 | "Target paths: ['/year=2013/month=10/']\n",
599 | "Looking for parquet files...\n",
600 | "Reading them into Pandas dataframe...\n",
601 | "Reading ISDWeather/year=2013/month=10/part-00000-tid-236689213593784421-264283c4-dffb-42b8-9bbf-d912ec6814af-66.c000.snappy.parquet under container isdweatherdatacontainer\n",
602 | "Done.\n",
603 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=162179.34 [ms]\n",
604 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=162226.64 [ms]\n",
605 | "ActivityStarted, get_enricher\n",
606 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=16.66 [ms]\n",
607 | "ActivityStarted, enrich_customer_data_no_agg\n",
608 | "ActivityStarted, enrich\n",
609 | "Target paths: ['/year=2013/month=11/']\n",
610 | "Looking for parquet files...\n",
611 | "Reading them into Pandas dataframe...\n",
612 | "Reading ISDWeather/year=2013/month=11/part-00009-tid-236689213593784421-264283c4-dffb-42b8-9bbf-d912ec6814af-75.c000.snappy.parquet under container isdweatherdatacontainer\n",
613 | "Done.\n",
614 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=43066.49 [ms]\n",
615 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=43117.76 [ms]\n",
616 | "ActivityStarted, get_enricher\n",
617 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=21.52 [ms]\n",
618 | "ActivityStarted, enrich_customer_data_no_agg\n",
619 | "ActivityStarted, enrich\n",
620 | "Target paths: ['/year=2013/month=12/']\n",
621 | "Looking for parquet files...\n",
622 | "Reading them into Pandas dataframe...\n",
623 | "Reading ISDWeather/year=2013/month=12/part-00002-tid-236689213593784421-264283c4-dffb-42b8-9bbf-d912ec6814af-68.c000.snappy.parquet under container isdweatherdatacontainer\n",
624 | "Done.\n",
625 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=45187.0 [ms]\n",
626 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=45230.84 [ms]\n",
627 | "ActivityStarted, get_enricher\n",
628 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=21.41 [ms]\n",
629 | "ActivityStarted, enrich_customer_data_no_agg\n",
630 | "ActivityStarted, enrich\n",
631 | "Target paths: ['/year=2014/month=1/']\n",
632 | "Looking for parquet files...\n",
633 | "Reading them into Pandas dataframe...\n",
634 | "Reading ISDWeather/year=2014/month=1/part-00000-tid-9219175779481662582-3729dfdb-ab32-4767-b9b6-11d2d644c3ce-79.c000.snappy.parquet under container isdweatherdatacontainer\n",
635 | "Done.\n",
636 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=49615.68 [ms]\n",
637 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=49671.23 [ms]\n",
638 | "ActivityStarted, get_enricher\n",
639 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=13.47 [ms]\n",
640 | "ActivityStarted, enrich_customer_data_no_agg\n",
641 | "ActivityStarted, enrich\n",
642 | "Target paths: ['/year=2014/month=2/']\n",
643 | "Looking for parquet files...\n",
644 | "Reading them into Pandas dataframe...\n",
645 | "Reading ISDWeather/year=2014/month=2/part-00011-tid-9219175779481662582-3729dfdb-ab32-4767-b9b6-11d2d644c3ce-90.c000.snappy.parquet under container isdweatherdatacontainer\n",
646 | "Done.\n"
647 | ]
648 | },
649 | {
650 | "name": "stdout",
651 | "output_type": "stream",
652 | "text": [
653 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=44652.11 [ms]\n",
654 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=44703.73 [ms]\n",
655 | "ActivityStarted, get_enricher\n",
656 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=17.69 [ms]\n",
657 | "ActivityStarted, enrich_customer_data_no_agg\n",
658 | "ActivityStarted, enrich\n",
659 | "Target paths: ['/year=2014/month=3/']\n",
660 | "Looking for parquet files...\n",
661 | "Reading them into Pandas dataframe...\n",
662 | "Reading ISDWeather/year=2014/month=3/part-00001-tid-9219175779481662582-3729dfdb-ab32-4767-b9b6-11d2d644c3ce-80.c000.snappy.parquet under container isdweatherdatacontainer\n",
663 | "Done.\n",
664 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=43063.67 [ms]\n",
665 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=43116.56 [ms]\n",
666 | "ActivityStarted, get_enricher\n",
667 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=16.72 [ms]\n",
668 | "ActivityStarted, enrich_customer_data_no_agg\n",
669 | "ActivityStarted, enrich\n",
670 | "Target paths: ['/year=2014/month=4/']\n",
671 | "Looking for parquet files...\n",
672 | "Reading them into Pandas dataframe...\n",
673 | "Reading ISDWeather/year=2014/month=4/part-00006-tid-9219175779481662582-3729dfdb-ab32-4767-b9b6-11d2d644c3ce-85.c000.snappy.parquet under container isdweatherdatacontainer\n",
674 | "Done.\n",
675 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=57670.61 [ms]\n",
676 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=57753.74 [ms]\n",
677 | "ActivityStarted, get_enricher\n",
678 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=11.64 [ms]\n",
679 | "ActivityStarted, enrich_customer_data_no_agg\n",
680 | "ActivityStarted, enrich\n",
681 | "Target paths: ['/year=2014/month=5/']\n",
682 | "Looking for parquet files...\n",
683 | "Reading them into Pandas dataframe...\n",
684 | "Reading ISDWeather/year=2014/month=5/part-00003-tid-9219175779481662582-3729dfdb-ab32-4767-b9b6-11d2d644c3ce-82.c000.snappy.parquet under container isdweatherdatacontainer\n",
685 | "Done.\n",
686 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=47830.11 [ms]\n",
687 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=47883.77 [ms]\n",
688 | "ActivityStarted, get_enricher\n",
689 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=20.26 [ms]\n",
690 | "ActivityStarted, enrich_customer_data_no_agg\n",
691 | "ActivityStarted, enrich\n",
692 | "Target paths: ['/year=2014/month=6/']\n",
693 | "Looking for parquet files...\n",
694 | "Reading them into Pandas dataframe...\n",
695 | "Reading ISDWeather/year=2014/month=6/part-00010-tid-9219175779481662582-3729dfdb-ab32-4767-b9b6-11d2d644c3ce-89.c000.snappy.parquet under container isdweatherdatacontainer\n",
696 | "Done.\n",
697 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=41722.47 [ms]\n",
698 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=41783.46 [ms]\n",
699 | "ActivityStarted, get_enricher\n",
700 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=17.44 [ms]\n",
701 | "ActivityStarted, enrich_customer_data_no_agg\n",
702 | "ActivityStarted, enrich\n",
703 | "Target paths: ['/year=2014/month=7/']\n",
704 | "Looking for parquet files...\n",
705 | "Reading them into Pandas dataframe...\n",
706 | "Reading ISDWeather/year=2014/month=7/part-00009-tid-9219175779481662582-3729dfdb-ab32-4767-b9b6-11d2d644c3ce-88.c000.snappy.parquet under container isdweatherdatacontainer\n",
707 | "Done.\n",
708 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=42955.4 [ms]\n",
709 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=43005.24 [ms]\n",
710 | "ActivityStarted, get_enricher\n",
711 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=21.54 [ms]\n",
712 | "ActivityStarted, enrich_customer_data_no_agg\n",
713 | "ActivityStarted, enrich\n",
714 | "Target paths: ['/year=2014/month=8/']\n",
715 | "Looking for parquet files...\n",
716 | "Reading them into Pandas dataframe...\n",
717 | "Reading ISDWeather/year=2014/month=8/part-00005-tid-9219175779481662582-3729dfdb-ab32-4767-b9b6-11d2d644c3ce-84.c000.snappy.parquet under container isdweatherdatacontainer\n",
718 | "Done.\n",
719 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=42296.2 [ms]\n",
720 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=42380.61 [ms]\n",
721 | "ActivityStarted, get_enricher\n",
722 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=19.23 [ms]\n",
723 | "ActivityStarted, enrich_customer_data_no_agg\n",
724 | "ActivityStarted, enrich\n",
725 | "Target paths: ['/year=2014/month=9/']\n",
726 | "Looking for parquet files...\n",
727 | "Reading them into Pandas dataframe...\n",
728 | "Reading ISDWeather/year=2014/month=9/part-00007-tid-9219175779481662582-3729dfdb-ab32-4767-b9b6-11d2d644c3ce-86.c000.snappy.parquet under container isdweatherdatacontainer\n",
729 | "Done.\n",
730 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=48054.2 [ms]\n",
731 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=48104.44 [ms]\n",
732 | "ActivityStarted, get_enricher\n",
733 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=17.48 [ms]\n",
734 | "ActivityStarted, enrich_customer_data_no_agg\n",
735 | "ActivityStarted, enrich\n",
736 | "Target paths: ['/year=2014/month=10/']\n",
737 | "Looking for parquet files...\n",
738 | "Reading them into Pandas dataframe...\n",
739 | "Reading ISDWeather/year=2014/month=10/part-00002-tid-9219175779481662582-3729dfdb-ab32-4767-b9b6-11d2d644c3ce-81.c000.snappy.parquet under container isdweatherdatacontainer\n",
740 | "Done.\n",
741 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=44798.42 [ms]\n",
742 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=44854.57 [ms]\n",
743 | "ActivityStarted, get_enricher\n",
744 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=19.53 [ms]\n",
745 | "ActivityStarted, enrich_customer_data_no_agg\n",
746 | "ActivityStarted, enrich\n",
747 | "Target paths: ['/year=2014/month=11/']\n",
748 | "Looking for parquet files...\n",
749 | "Reading them into Pandas dataframe...\n",
750 | "Reading ISDWeather/year=2014/month=11/part-00008-tid-9219175779481662582-3729dfdb-ab32-4767-b9b6-11d2d644c3ce-87.c000.snappy.parquet under container isdweatherdatacontainer\n",
751 | "Done.\n",
752 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=47152.97 [ms]\n",
753 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=47238.21 [ms]\n",
754 | "ActivityStarted, get_enricher\n",
755 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=18.0 [ms]\n",
756 | "ActivityStarted, enrich_customer_data_no_agg\n",
757 | "ActivityStarted, enrich\n",
758 | "Target paths: ['/year=2014/month=12/']\n",
759 | "Looking for parquet files...\n",
760 | "Reading them into Pandas dataframe...\n",
761 | "Reading ISDWeather/year=2014/month=12/part-00004-tid-9219175779481662582-3729dfdb-ab32-4767-b9b6-11d2d644c3ce-83.c000.snappy.parquet under container isdweatherdatacontainer\n",
762 | "Done.\n",
763 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=41111.37 [ms]\n",
764 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=41203.44 [ms]\n",
765 | "ActivityStarted, get_enricher\n",
766 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=17.96 [ms]\n",
767 | "ActivityStarted, enrich_customer_data_no_agg\n",
768 | "ActivityStarted, enrich\n",
769 | "Target paths: ['/year=2015/month=1/']\n",
770 | "Looking for parquet files...\n",
771 | "Reading them into Pandas dataframe...\n",
772 | "Reading ISDWeather/year=2015/month=1/part-00006-tid-2198075741767757560-e3eb994e-d560-4dfc-941e-0aae74c8d9ed-98.c000.snappy.parquet under container isdweatherdatacontainer\n",
773 | "Done.\n",
774 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=43527.36 [ms]\n",
775 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=43580.11 [ms]\n",
776 | "ActivityStarted, get_enricher\n",
777 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=19.69 [ms]\n",
778 | "ActivityStarted, enrich_customer_data_no_agg\n",
779 | "ActivityStarted, enrich\n",
780 | "Target paths: ['/year=2015/month=2/']\n",
781 | "Looking for parquet files...\n",
782 | "Reading them into Pandas dataframe...\n",
783 | "Reading ISDWeather/year=2015/month=2/part-00010-tid-2198075741767757560-e3eb994e-d560-4dfc-941e-0aae74c8d9ed-102.c000.snappy.parquet under container isdweatherdatacontainer\n",
784 | "Done.\n",
785 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=39870.61 [ms]\n",
786 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=39917.0 [ms]\n",
787 | "ActivityStarted, get_enricher\n",
788 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=20.74 [ms]\n",
789 | "ActivityStarted, enrich_customer_data_no_agg\n",
790 | "ActivityStarted, enrich\n",
791 | "Target paths: ['/year=2015/month=3/']\n",
792 | "Looking for parquet files...\n",
793 | "Reading them into Pandas dataframe...\n",
794 | "Reading ISDWeather/year=2015/month=3/part-00005-tid-2198075741767757560-e3eb994e-d560-4dfc-941e-0aae74c8d9ed-97.c000.snappy.parquet under container isdweatherdatacontainer\n"
795 | ]
796 | },
797 | {
798 | "name": "stdout",
799 | "output_type": "stream",
800 | "text": [
801 | "Done.\n",
802 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=44280.98 [ms]\n",
803 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=44339.07 [ms]\n",
804 | "ActivityStarted, get_enricher\n",
805 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=17.61 [ms]\n",
806 | "ActivityStarted, enrich_customer_data_no_agg\n",
807 | "ActivityStarted, enrich\n",
808 | "Target paths: ['/year=2015/month=4/']\n",
809 | "Looking for parquet files...\n",
810 | "Reading them into Pandas dataframe...\n",
811 | "Reading ISDWeather/year=2015/month=4/part-00011-tid-2198075741767757560-e3eb994e-d560-4dfc-941e-0aae74c8d9ed-103.c000.snappy.parquet under container isdweatherdatacontainer\n",
812 | "Done.\n",
813 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=43311.91 [ms]\n",
814 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=43347.73 [ms]\n",
815 | "ActivityStarted, get_enricher\n",
816 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=17.16 [ms]\n",
817 | "ActivityStarted, enrich_customer_data_no_agg\n",
818 | "ActivityStarted, enrich\n",
819 | "Target paths: ['/year=2015/month=5/']\n",
820 | "Looking for parquet files...\n",
821 | "Reading them into Pandas dataframe...\n",
822 | "Reading ISDWeather/year=2015/month=5/part-00001-tid-2198075741767757560-e3eb994e-d560-4dfc-941e-0aae74c8d9ed-93.c000.snappy.parquet under container isdweatherdatacontainer\n",
823 | "Done.\n",
824 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=42948.68 [ms]\n",
825 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=42999.33 [ms]\n",
826 | "ActivityStarted, get_enricher\n",
827 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=20.54 [ms]\n",
828 | "ActivityStarted, enrich_customer_data_no_agg\n",
829 | "ActivityStarted, enrich\n",
830 | "Target paths: ['/year=2015/month=6/']\n",
831 | "Looking for parquet files...\n",
832 | "Reading them into Pandas dataframe...\n",
833 | "Reading ISDWeather/year=2015/month=6/part-00008-tid-2198075741767757560-e3eb994e-d560-4dfc-941e-0aae74c8d9ed-100.c000.snappy.parquet under container isdweatherdatacontainer\n",
834 | "Done.\n",
835 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=40811.53 [ms]\n",
836 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=40867.21 [ms]\n",
837 | "ActivityStarted, get_enricher\n",
838 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=17.17 [ms]\n",
839 | "ActivityStarted, enrich_customer_data_no_agg\n",
840 | "ActivityStarted, enrich\n",
841 | "Target paths: ['/year=2015/month=7/']\n",
842 | "Looking for parquet files...\n",
843 | "Reading them into Pandas dataframe...\n",
844 | "Reading ISDWeather/year=2015/month=7/part-00004-tid-2198075741767757560-e3eb994e-d560-4dfc-941e-0aae74c8d9ed-96.c000.snappy.parquet under container isdweatherdatacontainer\n",
845 | "Done.\n",
846 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=43527.2 [ms]\n",
847 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=43582.72 [ms]\n",
848 | "ActivityStarted, get_enricher\n",
849 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=17.76 [ms]\n",
850 | "ActivityStarted, enrich_customer_data_no_agg\n",
851 | "ActivityStarted, enrich\n",
852 | "Target paths: ['/year=2015/month=8/']\n",
853 | "Looking for parquet files...\n",
854 | "Reading them into Pandas dataframe...\n",
855 | "Reading ISDWeather/year=2015/month=8/part-00007-tid-2198075741767757560-e3eb994e-d560-4dfc-941e-0aae74c8d9ed-99.c000.snappy.parquet under container isdweatherdatacontainer\n",
856 | "Done.\n",
857 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=43075.59 [ms]\n",
858 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=43126.87 [ms]\n",
859 | "ActivityStarted, get_enricher\n",
860 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=17.8 [ms]\n",
861 | "ActivityStarted, enrich_customer_data_no_agg\n",
862 | "ActivityStarted, enrich\n",
863 | "Target paths: ['/year=2015/month=9/']\n",
864 | "Looking for parquet files...\n",
865 | "Reading them into Pandas dataframe...\n",
866 | "Reading ISDWeather/year=2015/month=9/part-00009-tid-2198075741767757560-e3eb994e-d560-4dfc-941e-0aae74c8d9ed-101.c000.snappy.parquet under container isdweatherdatacontainer\n",
867 | "Done.\n",
868 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=41673.19 [ms]\n",
869 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=41722.83 [ms]\n",
870 | "ActivityStarted, get_enricher\n",
871 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=17.24 [ms]\n",
872 | "ActivityStarted, enrich_customer_data_no_agg\n",
873 | "ActivityStarted, enrich\n",
874 | "Target paths: ['/year=2015/month=10/']\n",
875 | "Looking for parquet files...\n",
876 | "Reading them into Pandas dataframe...\n",
877 | "Reading ISDWeather/year=2015/month=10/part-00002-tid-2198075741767757560-e3eb994e-d560-4dfc-941e-0aae74c8d9ed-94.c000.snappy.parquet under container isdweatherdatacontainer\n",
878 | "Done.\n",
879 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=42945.69 [ms]\n",
880 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=42996.68 [ms]\n",
881 | "ActivityStarted, get_enricher\n",
882 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=18.69 [ms]\n",
883 | "ActivityStarted, enrich_customer_data_no_agg\n",
884 | "ActivityStarted, enrich\n",
885 | "Target paths: ['/year=2015/month=11/']\n",
886 | "Looking for parquet files...\n",
887 | "Reading them into Pandas dataframe...\n",
888 | "Reading ISDWeather/year=2015/month=11/part-00003-tid-2198075741767757560-e3eb994e-d560-4dfc-941e-0aae74c8d9ed-95.c000.snappy.parquet under container isdweatherdatacontainer\n",
889 | "Done.\n",
890 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=43708.65 [ms]\n",
891 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=43754.12 [ms]\n",
892 | "ActivityStarted, get_enricher\n",
893 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=17.34 [ms]\n",
894 | "ActivityStarted, enrich_customer_data_no_agg\n",
895 | "ActivityStarted, enrich\n",
896 | "Target paths: ['/year=2015/month=12/']\n",
897 | "Looking for parquet files...\n",
898 | "Reading them into Pandas dataframe...\n",
899 | "Reading ISDWeather/year=2015/month=12/part-00000-tid-2198075741767757560-e3eb994e-d560-4dfc-941e-0aae74c8d9ed-92.c000.snappy.parquet under container isdweatherdatacontainer\n",
900 | "Done.\n",
901 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=60355.98 [ms]\n",
902 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=60447.26 [ms]\n",
903 | "ActivityStarted, get_enricher\n",
904 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=17.05 [ms]\n",
905 | "ActivityStarted, enrich_customer_data_no_agg\n",
906 | "ActivityStarted, enrich\n",
907 | "Target paths: ['/year=2016/month=1/']\n",
908 | "Looking for parquet files...\n",
909 | "Reading them into Pandas dataframe...\n",
910 | "Reading ISDWeather/year=2016/month=1/part-00005-tid-6700213360605767691-4491b75c-f137-489b-b5df-4204b9326fda-110.c000.snappy.parquet under container isdweatherdatacontainer\n",
911 | "Done.\n",
912 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=60960.86 [ms]\n",
913 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=61040.38 [ms]\n",
914 | "ActivityStarted, get_enricher\n",
915 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=17.98 [ms]\n",
916 | "ActivityStarted, enrich_customer_data_no_agg\n",
917 | "ActivityStarted, enrich\n",
918 | "Target paths: ['/year=2016/month=2/']\n",
919 | "Looking for parquet files...\n",
920 | "Reading them into Pandas dataframe...\n",
921 | "Reading ISDWeather/year=2016/month=2/part-00011-tid-6700213360605767691-4491b75c-f137-489b-b5df-4204b9326fda-116.c000.snappy.parquet under container isdweatherdatacontainer\n",
922 | "Done.\n",
923 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=43105.63 [ms]\n",
924 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=43189.44 [ms]\n",
925 | "ActivityStarted, get_enricher\n",
926 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=17.53 [ms]\n",
927 | "ActivityStarted, enrich_customer_data_no_agg\n",
928 | "ActivityStarted, enrich\n",
929 | "Target paths: ['/year=2016/month=3/']\n",
930 | "Looking for parquet files...\n",
931 | "Reading them into Pandas dataframe...\n",
932 | "Reading ISDWeather/year=2016/month=3/part-00004-tid-6700213360605767691-4491b75c-f137-489b-b5df-4204b9326fda-109.c000.snappy.parquet under container isdweatherdatacontainer\n",
933 | "Done.\n",
934 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=46416.04 [ms]\n",
935 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=46471.95 [ms]\n",
936 | "ActivityStarted, get_enricher\n",
937 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=17.41 [ms]\n",
938 | "ActivityStarted, enrich_customer_data_no_agg\n",
939 | "ActivityStarted, enrich\n",
940 | "Target paths: ['/year=2016/month=4/']\n",
941 | "Looking for parquet files...\n",
942 | "Reading them into Pandas dataframe...\n",
943 | "Reading ISDWeather/year=2016/month=4/part-00008-tid-6700213360605767691-4491b75c-f137-489b-b5df-4204b9326fda-113.c000.snappy.parquet under container isdweatherdatacontainer\n"
944 | ]
945 | },
946 | {
947 | "name": "stdout",
948 | "output_type": "stream",
949 | "text": [
950 | "Done.\n",
951 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=43887.97 [ms]\n",
952 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=43966.98 [ms]\n",
953 | "ActivityStarted, get_enricher\n",
954 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=17.72 [ms]\n",
955 | "ActivityStarted, enrich_customer_data_no_agg\n",
956 | "ActivityStarted, enrich\n",
957 | "Target paths: ['/year=2016/month=5/']\n",
958 | "Looking for parquet files...\n",
959 | "Reading them into Pandas dataframe...\n",
960 | "Reading ISDWeather/year=2016/month=5/part-00006-tid-6700213360605767691-4491b75c-f137-489b-b5df-4204b9326fda-111.c000.snappy.parquet under container isdweatherdatacontainer\n",
961 | "Done.\n",
962 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=44077.01 [ms]\n",
963 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=44163.85 [ms]\n",
964 | "ActivityStarted, get_enricher\n",
965 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=17.48 [ms]\n",
966 | "ActivityStarted, enrich_customer_data_no_agg\n",
967 | "ActivityStarted, enrich\n",
968 | "Target paths: ['/year=2016/month=6/']\n",
969 | "Looking for parquet files...\n",
970 | "Reading them into Pandas dataframe...\n",
971 | "Reading ISDWeather/year=2016/month=6/part-00010-tid-6700213360605767691-4491b75c-f137-489b-b5df-4204b9326fda-115.c000.snappy.parquet under container isdweatherdatacontainer\n",
972 | "Done.\n",
973 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=41326.38 [ms]\n",
974 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=41374.81 [ms]\n",
975 | "ActivityStarted, get_enricher\n",
976 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=17.47 [ms]\n",
977 | "ActivityStarted, enrich_customer_data_no_agg\n",
978 | "ActivityStarted, enrich\n",
979 | "Target paths: ['/year=2016/month=7/']\n",
980 | "Looking for parquet files...\n",
981 | "Reading them into Pandas dataframe...\n",
982 | "Reading ISDWeather/year=2016/month=7/part-00003-tid-6700213360605767691-4491b75c-f137-489b-b5df-4204b9326fda-108.c000.snappy.parquet under container isdweatherdatacontainer\n",
983 | "Done.\n",
984 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=52732.89 [ms]\n",
985 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=52784.83 [ms]\n",
986 | "ActivityStarted, get_enricher\n",
987 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=17.63 [ms]\n",
988 | "ActivityStarted, enrich_customer_data_no_agg\n",
989 | "ActivityStarted, enrich\n",
990 | "Target paths: ['/year=2016/month=8/']\n",
991 | "Looking for parquet files...\n",
992 | "Reading them into Pandas dataframe...\n",
993 | "Reading ISDWeather/year=2016/month=8/part-00001-tid-6700213360605767691-4491b75c-f137-489b-b5df-4204b9326fda-106.c000.snappy.parquet under container isdweatherdatacontainer\n",
994 | "Done.\n",
995 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=44712.9 [ms]\n",
996 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=44768.49 [ms]\n",
997 | "ActivityStarted, get_enricher\n",
998 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=21.75 [ms]\n",
999 | "ActivityStarted, enrich_customer_data_no_agg\n",
1000 | "ActivityStarted, enrich\n",
1001 | "Target paths: ['/year=2016/month=9/']\n",
1002 | "Looking for parquet files...\n",
1003 | "Reading them into Pandas dataframe...\n",
1004 | "Reading ISDWeather/year=2016/month=9/part-00007-tid-6700213360605767691-4491b75c-f137-489b-b5df-4204b9326fda-112.c000.snappy.parquet under container isdweatherdatacontainer\n",
1005 | "Done.\n",
1006 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=45721.92 [ms]\n",
1007 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=45792.78 [ms]\n",
1008 | "ActivityStarted, get_enricher\n",
1009 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=17.39 [ms]\n",
1010 | "ActivityStarted, enrich_customer_data_no_agg\n",
1011 | "ActivityStarted, enrich\n",
1012 | "Target paths: ['/year=2016/month=10/']\n",
1013 | "Looking for parquet files...\n",
1014 | "Reading them into Pandas dataframe...\n",
1015 | "Reading ISDWeather/year=2016/month=10/part-00002-tid-6700213360605767691-4491b75c-f137-489b-b5df-4204b9326fda-107.c000.snappy.parquet under container isdweatherdatacontainer\n",
1016 | "Done.\n",
1017 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=48494.7 [ms]\n",
1018 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=48549.09 [ms]\n",
1019 | "ActivityStarted, get_enricher\n",
1020 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=16.91 [ms]\n",
1021 | "ActivityStarted, enrich_customer_data_no_agg\n",
1022 | "ActivityStarted, enrich\n",
1023 | "Target paths: ['/year=2016/month=11/']\n",
1024 | "Looking for parquet files...\n",
1025 | "Reading them into Pandas dataframe...\n",
1026 | "Reading ISDWeather/year=2016/month=11/part-00009-tid-6700213360605767691-4491b75c-f137-489b-b5df-4204b9326fda-114.c000.snappy.parquet under container isdweatherdatacontainer\n",
1027 | "Done.\n",
1028 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=102567.37 [ms]\n",
1029 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=102611.05 [ms]\n",
1030 | "ActivityStarted, get_enricher\n",
1031 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=17.52 [ms]\n",
1032 | "ActivityStarted, enrich_customer_data_no_agg\n",
1033 | "ActivityStarted, enrich\n",
1034 | "Target paths: ['/year=2016/month=12/']\n",
1035 | "Looking for parquet files...\n",
1036 | "Reading them into Pandas dataframe...\n",
1037 | "Reading ISDWeather/year=2016/month=12/part-00000-tid-6700213360605767691-4491b75c-f137-489b-b5df-4204b9326fda-105.c000.snappy.parquet under container isdweatherdatacontainer\n",
1038 | "Done.\n",
1039 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=122545.74 [ms]\n",
1040 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=122593.63 [ms]\n",
1041 | "ActivityStarted, get_enricher\n",
1042 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=17.22 [ms]\n",
1043 | "ActivityStarted, enrich_customer_data_no_agg\n",
1044 | "ActivityStarted, enrich\n",
1045 | "Target paths: ['/year=2017/month=1/']\n",
1046 | "Looking for parquet files...\n",
1047 | "Reading them into Pandas dataframe...\n",
1048 | "Reading ISDWeather/year=2017/month=1/part-00001-tid-1321158002197267978-8e3eb092-4b7a-42de-97ee-e23297ed8955-119.c000.snappy.parquet under container isdweatherdatacontainer\n",
1049 | "Done.\n",
1050 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=47248.72 [ms]\n",
1051 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=47308.75 [ms]\n",
1052 | "ActivityStarted, get_enricher\n",
1053 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=17.4 [ms]\n",
1054 | "ActivityStarted, enrich_customer_data_no_agg\n",
1055 | "ActivityStarted, enrich\n",
1056 | "Target paths: ['/year=2017/month=2/']\n",
1057 | "Looking for parquet files...\n",
1058 | "Reading them into Pandas dataframe...\n",
1059 | "Reading ISDWeather/year=2017/month=2/part-00011-tid-1321158002197267978-8e3eb092-4b7a-42de-97ee-e23297ed8955-129.c000.snappy.parquet under container isdweatherdatacontainer\n",
1060 | "Done.\n",
1061 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=44875.39 [ms]\n",
1062 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=44932.98 [ms]\n",
1063 | "ActivityStarted, get_enricher\n",
1064 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=12.19 [ms]\n",
1065 | "ActivityStarted, enrich_customer_data_no_agg\n",
1066 | "ActivityStarted, enrich\n",
1067 | "Target paths: ['/year=2017/month=3/']\n",
1068 | "Looking for parquet files...\n",
1069 | "Reading them into Pandas dataframe...\n",
1070 | "Reading ISDWeather/year=2017/month=3/part-00000-tid-1321158002197267978-8e3eb092-4b7a-42de-97ee-e23297ed8955-118.c000.snappy.parquet under container isdweatherdatacontainer\n",
1071 | "Done.\n",
1072 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=43882.2 [ms]\n",
1073 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=43949.11 [ms]\n",
1074 | "ActivityStarted, get_enricher\n",
1075 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=17.48 [ms]\n",
1076 | "ActivityStarted, enrich_customer_data_no_agg\n",
1077 | "ActivityStarted, enrich\n",
1078 | "Target paths: ['/year=2017/month=4/']\n",
1079 | "Looking for parquet files...\n",
1080 | "Reading them into Pandas dataframe...\n",
1081 | "Reading ISDWeather/year=2017/month=4/part-00007-tid-1321158002197267978-8e3eb092-4b7a-42de-97ee-e23297ed8955-125.c000.snappy.parquet under container isdweatherdatacontainer\n",
1082 | "Done.\n",
1083 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=53383.3 [ms]\n",
1084 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=53444.65 [ms]\n",
1085 | "ActivityStarted, get_enricher\n",
1086 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=17.69 [ms]\n",
1087 | "ActivityStarted, enrich_customer_data_no_agg\n",
1088 | "ActivityStarted, enrich\n",
1089 | "Target paths: ['/year=2017/month=5/']\n",
1090 | "Looking for parquet files...\n",
1091 | "Reading them into Pandas dataframe...\n",
1092 | "Reading ISDWeather/year=2017/month=5/part-00002-tid-1321158002197267978-8e3eb092-4b7a-42de-97ee-e23297ed8955-120.c000.snappy.parquet under container isdweatherdatacontainer\n"
1093 | ]
1094 | },
1095 | {
1096 | "name": "stdout",
1097 | "output_type": "stream",
1098 | "text": [
1099 | "Done.\n",
1100 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=45802.38 [ms]\n",
1101 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=45874.65 [ms]\n",
1102 | "ActivityStarted, get_enricher\n",
1103 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=17.22 [ms]\n",
1104 | "ActivityStarted, enrich_customer_data_no_agg\n",
1105 | "ActivityStarted, enrich\n",
1106 | "Target paths: ['/year=2017/month=6/']\n",
1107 | "Looking for parquet files...\n",
1108 | "Reading them into Pandas dataframe...\n",
1109 | "Reading ISDWeather/year=2017/month=6/part-00010-tid-1321158002197267978-8e3eb092-4b7a-42de-97ee-e23297ed8955-128.c000.snappy.parquet under container isdweatherdatacontainer\n",
1110 | "Done.\n",
1111 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=47608.01 [ms]\n",
1112 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=47667.03 [ms]\n",
1113 | "ActivityStarted, get_enricher\n",
1114 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=17.83 [ms]\n",
1115 | "ActivityStarted, enrich_customer_data_no_agg\n",
1116 | "ActivityStarted, enrich\n",
1117 | "Target paths: ['/year=2017/month=7/']\n",
1118 | "Looking for parquet files...\n",
1119 | "Reading them into Pandas dataframe...\n",
1120 | "Reading ISDWeather/year=2017/month=7/part-00006-tid-1321158002197267978-8e3eb092-4b7a-42de-97ee-e23297ed8955-124.c000.snappy.parquet under container isdweatherdatacontainer\n",
1121 | "Done.\n",
1122 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=54551.39 [ms]\n",
1123 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=54612.86 [ms]\n",
1124 | "ActivityStarted, get_enricher\n",
1125 | "ActivityCompleted: Activity=get_enricher, HowEnded=Success, Duration=19.39 [ms]\n",
1126 | "ActivityStarted, enrich_customer_data_no_agg\n",
1127 | "ActivityStarted, enrich\n",
1128 | "Target paths: ['/year=2017/month=8/']\n",
1129 | "Looking for parquet files...\n",
1130 | "Reading them into Pandas dataframe...\n",
1131 | "Reading ISDWeather/year=2017/month=8/part-00003-tid-1321158002197267978-8e3eb092-4b7a-42de-97ee-e23297ed8955-121.c000.snappy.parquet under container isdweatherdatacontainer\n",
1132 | "Done.\n",
1133 | "ActivityCompleted: Activity=enrich, HowEnded=Success, Duration=39574.97 [ms]\n",
1134 | "ActivityCompleted: Activity=enrich_customer_data_no_agg, HowEnded=Success, Duration=39606.91 [ms]\n",
1135 | "[2019-04-29 10:13:28.810135] End enriching...\n"
1136 | ]
1137 | }
1138 | ],
1139 | "source": [
1140 | "import os.path\n",
1141 | "import pandas as pd\n",
1142 | "import numpy as np\n",
1143 | "from azureml.opendatasets.accessories.location_data import LatLongColumn\n",
1144 | "from azureml.opendatasets.accessories.location_time_customer_data \\\n",
1145 | " import LocationTimeCustomerData\n",
1146 | "from azureml.opendatasets import NoaaIsdWeather\n",
1147 | "from azureml.opendatasets.environ import PandasEnv\n",
1148 | "\n",
1149 | "\n",
1150 | "if os.path.exists('./nyc_energy_enriched.csv'):\n",
1151 | " raise RuntimeError('nyc_energy_enriched.csv exists already.')\n",
1152 | "else:\n",
1153 | " print('[%s] Start enriching...' % datetime.now())\n",
1154 | " all = pd.DataFrame([])\n",
1155 | " report_joined = {}\n",
1156 | " i_date = start_date\n",
1157 | " for m in range(months):\n",
1158 | " j_date = i_date + relativedelta(months=1) - timedelta(milliseconds=1)\n",
1159 | "\n",
1160 | " # This is important to set monotonically increasing index for successful enrichemnt.\n",
1161 | " df1 = df[(df['new_datetime'] >= i_date) & (df['new_datetime'] <= j_date)].copy()\n",
1162 | " df1['idx'] = list(range(len(df1.index)))\n",
1163 | " df1 = df1.set_index('idx')\n",
1164 | "\n",
1165 | " energy = LocationTimeCustomerData(\n",
1166 | " df1,\n",
1167 | " LatLongColumn('lat', 'long'),\n",
1168 | " 'new_datetime')\n",
1169 | "\n",
1170 | " weather = NoaaIsdWeather(\n",
1171 | " cols=[\"temperature\", \"precipTime\", \"precipDepth\", \"snowDepth\"],\n",
1172 | " start_date=i_date,\n",
1173 | " end_date=j_date)\n",
1174 | "\n",
1175 | " weather_enricher = weather.get_enricher()\n",
1176 | " new_energy, processed_weather = weather_enricher.enrich_customer_data_no_agg(\n",
1177 | " customer_data_object=energy,\n",
1178 | " location_match_granularity=5, # higher for high join success rate, lower for performance.\n",
1179 | " time_round_granularity='day')\n",
1180 | " \n",
1181 | " # ---=== Begin of cusomtized aggregation ===---\n",
1182 | " \n",
1183 | " processed_weather.data['precipDepth'] = processed_weather.data['precipDepth'].apply(\n",
1184 | " lambda x: np.nan if x == 9999 else x)\n",
1185 | " processed_weather.data['precipTime'] = processed_weather.data['precipTime'].apply(\n",
1186 | " lambda x: np.nan if x == 99 else x)\n",
1187 | "\n",
1188 | " processed_weather.data['precipDepth/precipTime'] = \\\n",
1189 | " processed_weather.data[['precipDepth', 'precipTime']].apply(\n",
1190 | " lambda x: np.nan if (\n",
1191 | " pd.isna(x[0]) or pd.isna(x[1]) or x[1] == 0.0) else (x[0] / x[1]), axis=1)\n",
1192 | " \n",
1193 | " aggregations = {\n",
1194 | " \"temperature\": \"mean\",\n",
1195 | " \"snowDepth\": \"mean\",\n",
1196 | " \"precipDepth/precipTime\": \"mean\",\n",
1197 | " \"precipDepth\": \"max\",\n",
1198 | " \"precipTime\": \"max\"}\n",
1199 | " \n",
1200 | " public_rankgroup = processed_weather.id\n",
1201 | "\n",
1202 | " public_join_time = [\n",
1203 | " s for s in list(processed_weather.data.columns)\n",
1204 | " if s.startswith('ds_join_time')][0]\n",
1205 | "\n",
1206 | " customer_rankgroup = weather_enricher.location_selector.customer_rankgroup\n",
1207 | "\n",
1208 | " customer_join_time = [\n",
1209 | " s for s in list(new_energy.data.columns)\n",
1210 | " if s.startswith('customer_join_time')][0]\n",
1211 | "\n",
1212 | " weather_df_grouped = processed_weather.data.groupby(\n",
1213 | " by=[public_rankgroup, public_join_time]).agg(aggregations)\n",
1214 | " \n",
1215 | " joined_dataset = new_energy.data.merge(\n",
1216 | " weather_df_grouped,\n",
1217 | " left_on=[customer_rankgroup, customer_join_time],\n",
1218 | " right_on=[public_rankgroup, public_join_time],\n",
1219 | " how='left')\n",
1220 | "\n",
1221 | " final_df = joined_dataset[raw_columns + [\n",
1222 | " \"temperature\", \"precipTime\", \"precipDepth\", \"snowDepth\", \"precipDepth/precipTime\"]]\n",
1223 | "\n",
1224 | " report_joined[i_date] = final_df.describe()\n",
1225 | " \n",
1226 | " # ---=== End of customized aggregation ===---\n",
1227 | " \n",
1228 | " fn = './temp/nyc_energy_enriched_%s.csv' % i_date\n",
1229 | " final_df.to_csv(fn)\n",
1230 | "\n",
1231 | " all = pd.concat([all, final_df])\n",
1232 | " all.to_csv('./nyc_energy_enriched.csv')\n",
1233 | "\n",
1234 | " i_date += relativedelta(months=1)\n",
1235 | "\n",
1236 | " print('[%s] End enriching...' % datetime.now())"
1237 | ]
1238 | },
1239 | {
1240 | "cell_type": "markdown",
1241 | "metadata": {},
1242 | "source": [
1243 | "### The final result has been saved to ``\"./nyc_energy_enriched.csv\"``"
1244 | ]
1245 | },
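{
"cell_type": "markdown",
"metadata": {},
"source": [
"The enriched dataset was written incrementally inside the loop. As an optional sanity check (a sketch; it assumes the enrichment cell above completed and reuses the default index written by `to_csv`), reload the saved file:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Reload the enriched CSV from disk to confirm it was written correctly\n",
"enriched = pd.read_csv('./nyc_energy_enriched.csv', index_col=0, parse_dates=['new_datetime'])\n",
"enriched.head()"
]
},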
1246 | {
1247 | "cell_type": "code",
1248 | "execution_count": 9,
1249 | "metadata": {},
1250 | "outputs": [
1251 | {
1252 | "data": {
1253 | "text/html": [
1254 | "\n",
1255 | "\n",
1268 | "
\n",
1269 | " \n",
1270 | " \n",
1271 | " | \n",
1272 | " timeStamp | \n",
1273 | " demand | \n",
1274 | " lat | \n",
1275 | " long | \n",
1276 | " new_datetime | \n",
1277 | " temperature | \n",
1278 | " precipTime | \n",
1279 | " precipDepth | \n",
1280 | " snowDepth | \n",
1281 | " precipDepth/precipTime | \n",
1282 | "
\n",
1283 | " \n",
1284 | " \n",
1285 | " \n",
1286 | " 0 | \n",
1287 | " 2012-01-01 00:00:00 | \n",
1288 | " 4937.5 | \n",
1289 | " 40.701 | \n",
1290 | " -74.009 | \n",
1291 | " 2012-01-01 00:00:00 | \n",
1292 | " 7.665934 | \n",
1293 | " 24.0 | \n",
1294 | " 58.0 | \n",
1295 | " 0.0 | \n",
1296 | " 0.046384 | \n",
1297 | "
\n",
1298 | " \n",
1299 | " 1 | \n",
1300 | " 2012-01-01 01:00:00 | \n",
1301 | " 4752.1 | \n",
1302 | " 40.701 | \n",
1303 | " -74.009 | \n",
1304 | " 2012-01-01 01:00:00 | \n",
1305 | " 7.665934 | \n",
1306 | " 24.0 | \n",
1307 | " 58.0 | \n",
1308 | " 0.0 | \n",
1309 | " 0.046384 | \n",
1310 | "
\n",
1311 | " \n",
1312 | " 2 | \n",
1313 | " 2012-01-01 02:00:00 | \n",
1314 | " 4542.6 | \n",
1315 | " 40.701 | \n",
1316 | " -74.009 | \n",
1317 | " 2012-01-01 02:00:00 | \n",
1318 | " 7.665934 | \n",
1319 | " 24.0 | \n",
1320 | " 58.0 | \n",
1321 | " 0.0 | \n",
1322 | " 0.046384 | \n",
1323 | "
\n",
1324 | " \n",
1325 | " 3 | \n",
1326 | " 2012-01-01 03:00:00 | \n",
1327 | " 4357.7 | \n",
1328 | " 40.701 | \n",
1329 | " -74.009 | \n",
1330 | " 2012-01-01 03:00:00 | \n",
1331 | " 7.665934 | \n",
1332 | " 24.0 | \n",
1333 | " 58.0 | \n",
1334 | " 0.0 | \n",
1335 | " 0.046384 | \n",
1336 | "
\n",
1337 | " \n",
1338 | " 4 | \n",
1339 | " 2012-01-01 04:00:00 | \n",
1340 | " 4275.5 | \n",
1341 | " 40.701 | \n",
1342 | " -74.009 | \n",
1343 | " 2012-01-01 04:00:00 | \n",
1344 | " 7.665934 | \n",
1345 | " 24.0 | \n",
1346 | " 58.0 | \n",
1347 | " 0.0 | \n",
1348 | " 0.046384 | \n",
1349 | "
\n",
1350 | " \n",
1351 | "
\n",
1352 | "
"
1353 | ],
1354 | "text/plain": [
1355 | " timeStamp demand lat long new_datetime \\\n",
1356 | "0 2012-01-01 00:00:00 4937.5 40.701 -74.009 2012-01-01 00:00:00 \n",
1357 | "1 2012-01-01 01:00:00 4752.1 40.701 -74.009 2012-01-01 01:00:00 \n",
1358 | "2 2012-01-01 02:00:00 4542.6 40.701 -74.009 2012-01-01 02:00:00 \n",
1359 | "3 2012-01-01 03:00:00 4357.7 40.701 -74.009 2012-01-01 03:00:00 \n",
1360 | "4 2012-01-01 04:00:00 4275.5 40.701 -74.009 2012-01-01 04:00:00 \n",
1361 | "\n",
1362 | " temperature precipTime precipDepth snowDepth precipDepth/precipTime \n",
1363 | "0 7.665934 24.0 58.0 0.0 0.046384 \n",
1364 | "1 7.665934 24.0 58.0 0.0 0.046384 \n",
1365 | "2 7.665934 24.0 58.0 0.0 0.046384 \n",
1366 | "3 7.665934 24.0 58.0 0.0 0.046384 \n",
1367 | "4 7.665934 24.0 58.0 0.0 0.046384 "
1368 | ]
1369 | },
1370 | "execution_count": 9,
1371 | "metadata": {},
1372 | "output_type": "execute_result"
1373 | }
1374 | ],
1375 | "source": [
1376 | "all.head(5)"
1377 | ]
1378 | },
1379 | {
1380 | "cell_type": "markdown",
1381 | "metadata": {},
1382 | "source": [
1383 | "The join success rate is 100%"
1384 | ]
1385 | },
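{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick check (a sketch; `temperature` is used as the indicator column because it is only populated by the weather join), compute the share of rows that received weather values:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Fraction of rows with a joined weather value; 1.0 corresponds to a 100% join success rate\n",
"all['temperature'].notna().mean()"
]
},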
1386 | {
1387 | "cell_type": "code",
1388 | "execution_count": 10,
1389 | "metadata": {},
1390 | "outputs": [
1391 | {
1392 | "name": "stdout",
1393 | "output_type": "stream",
1394 | "text": [
1395 | "\n",
1396 | "Int64Index: 49205 entries, 0 to 270\n",
1397 | "Data columns (total 10 columns):\n",
1398 | "timeStamp 49205 non-null object\n",
1399 | "demand 49124 non-null float64\n",
1400 | "lat 49205 non-null float64\n",
1401 | "long 49205 non-null float64\n",
1402 | "new_datetime 49205 non-null datetime64[ns]\n",
1403 | "temperature 49205 non-null float64\n",
1404 | "precipTime 49205 non-null float64\n",
1405 | "precipDepth 49205 non-null float64\n",
1406 | "snowDepth 49205 non-null float64\n",
1407 | "precipDepth/precipTime 49205 non-null float64\n",
1408 | "dtypes: datetime64[ns](1), float64(8), object(1)\n",
1409 | "memory usage: 4.1+ MB\n"
1410 | ]
1411 | }
1412 | ],
1413 | "source": [
1414 | "all.info()"
1415 | ]
1416 | },
1417 | {
1418 | "cell_type": "code",
1419 | "execution_count": 11,
1420 | "metadata": {},
1421 | "outputs": [
1422 | {
1423 | "data": {
1424 | "text/html": [
1425 | "\n",
1426 | "\n",
1439 | "
\n",
1440 | " \n",
1441 | " \n",
1442 | " | \n",
1443 | " demand | \n",
1444 | " lat | \n",
1445 | " long | \n",
1446 | " temperature | \n",
1447 | " precipTime | \n",
1448 | " precipDepth | \n",
1449 | " snowDepth | \n",
1450 | " precipDepth/precipTime | \n",
1451 | "
\n",
1452 | " \n",
1453 | " \n",
1454 | " \n",
1455 | " count | \n",
1456 | " 49124.000000 | \n",
1457 | " 4.920500e+04 | \n",
1458 | " 49205.000 | \n",
1459 | " 49205.000000 | \n",
1460 | " 49205.0 | \n",
1461 | " 49205.000000 | \n",
1462 | " 49205.000000 | \n",
1463 | " 49205.000000 | \n",
1464 | "
\n",
1465 | " \n",
1466 | " mean | \n",
1467 | " 6067.447361 | \n",
1468 | " 4.070100e+01 | \n",
1469 | " -74.009 | \n",
1470 | " 13.372627 | \n",
1471 | " 24.0 | \n",
1472 | " 391.145534 | \n",
1473 | " 1.072569 | \n",
1474 | " 19.859086 | \n",
1475 | "
\n",
1476 | " \n",
1477 | " std | \n",
1478 | " 1285.607657 | \n",
1479 | " 7.105500e-15 | \n",
1480 | " 0.000 | \n",
1481 | " 9.640060 | \n",
1482 | " 0.0 | \n",
1483 | " 1042.909500 | \n",
1484 | " 4.329940 | \n",
1485 | " 61.989804 | \n",
1486 | "
\n",
1487 | " \n",
1488 | " min | \n",
1489 | " 2859.600000 | \n",
1490 | " 4.070100e+01 | \n",
1491 | " -74.009 | \n",
1492 | " -13.226429 | \n",
1493 | " 24.0 | \n",
1494 | " 0.000000 | \n",
1495 | " 0.000000 | \n",
1496 | " 0.000000 | \n",
1497 | "
\n",
1498 | " \n",
1499 | " 25% | \n",
1500 | " 5133.862250 | \n",
1501 | " 4.070100e+01 | \n",
1502 | " -74.009 | \n",
1503 | " 5.637931 | \n",
1504 | " 24.0 | \n",
1505 | " 0.000000 | \n",
1506 | " 0.000000 | \n",
1507 | " 0.000000 | \n",
1508 | "
\n",
1509 | " \n",
1510 | " 50% | \n",
1511 | " 6020.071000 | \n",
1512 | " 4.070100e+01 | \n",
1513 | " -74.009 | \n",
1514 | " 13.955882 | \n",
1515 | " 24.0 | \n",
1516 | " 10.000000 | \n",
1517 | " 0.000000 | \n",
1518 | " 0.065359 | \n",
1519 | "
\n",
1520 | " \n",
1521 | " 75% | \n",
1522 | " 6684.300000 | \n",
1523 | " 4.070100e+01 | \n",
1524 | " -74.009 | \n",
1525 | " 22.236709 | \n",
1526 | " 24.0 | \n",
1527 | " 135.000000 | \n",
1528 | " 0.000000 | \n",
1529 | " 2.711580 | \n",
1530 | "
\n",
1531 | " \n",
1532 | " max | \n",
1533 | " 11456.000000 | \n",
1534 | " 4.070100e+01 | \n",
1535 | " -74.009 | \n",
1536 | " 32.852857 | \n",
1537 | " 24.0 | \n",
1538 | " 7630.000000 | \n",
1539 | " 51.228571 | \n",
1540 | " 578.402778 | \n",
1541 | "
\n",
1542 | " \n",
1543 | "
\n",
1544 | "
"
1545 | ],
1546 | "text/plain": [
1547 | " demand lat long temperature precipTime \\\n",
1548 | "count 49124.000000 4.920500e+04 49205.000 49205.000000 49205.0 \n",
1549 | "mean 6067.447361 4.070100e+01 -74.009 13.372627 24.0 \n",
1550 | "std 1285.607657 7.105500e-15 0.000 9.640060 0.0 \n",
1551 | "min 2859.600000 4.070100e+01 -74.009 -13.226429 24.0 \n",
1552 | "25% 5133.862250 4.070100e+01 -74.009 5.637931 24.0 \n",
1553 | "50% 6020.071000 4.070100e+01 -74.009 13.955882 24.0 \n",
1554 | "75% 6684.300000 4.070100e+01 -74.009 22.236709 24.0 \n",
1555 | "max 11456.000000 4.070100e+01 -74.009 32.852857 24.0 \n",
1556 | "\n",
1557 | " precipDepth snowDepth precipDepth/precipTime \n",
1558 | "count 49205.000000 49205.000000 49205.000000 \n",
1559 | "mean 391.145534 1.072569 19.859086 \n",
1560 | "std 1042.909500 4.329940 61.989804 \n",
1561 | "min 0.000000 0.000000 0.000000 \n",
1562 | "25% 0.000000 0.000000 0.000000 \n",
1563 | "50% 10.000000 0.000000 0.065359 \n",
1564 | "75% 135.000000 0.000000 2.711580 \n",
1565 | "max 7630.000000 51.228571 578.402778 "
1566 | ]
1567 | },
1568 | "execution_count": 11,
1569 | "metadata": {},
1570 | "output_type": "execute_result"
1571 | }
1572 | ],
1573 | "source": [
1574 | "all.describe()"
1575 | ]
1576 | },
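{
"cell_type": "markdown",
"metadata": {},
"source": [
"`report_joined` holds the per-month `describe()` summaries collected inside the enrichment loop, keyed by each month's start date. Inspect individual months, for example October and November 2016:"
]
},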
1577 | {
1578 | "cell_type": "code",
1579 | "execution_count": 12,
1580 | "metadata": {},
1581 | "outputs": [
1582 | {
1583 | "data": {
1584 | "text/html": [
1585 | "\n",
1586 | "\n",
1599 | "
\n",
1600 | " \n",
1601 | " \n",
1602 | " | \n",
1603 | " demand | \n",
1604 | " lat | \n",
1605 | " long | \n",
1606 | " temperature | \n",
1607 | " precipTime | \n",
1608 | " precipDepth | \n",
1609 | " snowDepth | \n",
1610 | " precipDepth/precipTime | \n",
1611 | "
\n",
1612 | " \n",
1613 | " \n",
1614 | " \n",
1615 | " count | \n",
1616 | " 743.000000 | \n",
1617 | " 7.440000e+02 | \n",
1618 | " 744.000 | \n",
1619 | " 744.000000 | \n",
1620 | " 744.0 | \n",
1621 | " 744.000000 | \n",
1622 | " 744.0 | \n",
1623 | " 744.000000 | \n",
1624 | "
\n",
1625 | " \n",
1626 | " mean | \n",
1627 | " 5427.314386 | \n",
1628 | " 4.070100e+01 | \n",
1629 | " -74.009 | \n",
1630 | " 15.386461 | \n",
1631 | " 24.0 | \n",
1632 | " 81.774194 | \n",
1633 | " 0.0 | \n",
1634 | " 1.889804 | \n",
1635 | "
\n",
1636 | " \n",
1637 | " std | \n",
1638 | " 880.418908 | \n",
1639 | " 7.110207e-15 | \n",
1640 | " 0.000 | \n",
1641 | " 4.159070 | \n",
1642 | " 0.0 | \n",
1643 | " 138.531590 | \n",
1644 | " 0.0 | \n",
1645 | " 3.683908 | \n",
1646 | "
\n",
1647 | " \n",
1648 | " min | \n",
1649 | " 3946.383000 | \n",
1650 | " 4.070100e+01 | \n",
1651 | " -74.009 | \n",
1652 | " 7.381250 | \n",
1653 | " 24.0 | \n",
1654 | " 0.000000 | \n",
1655 | " 0.0 | \n",
1656 | " 0.000000 | \n",
1657 | "
\n",
1658 | " \n",
1659 | " 25% | \n",
1660 | " 4660.270500 | \n",
1661 | " 4.070100e+01 | \n",
1662 | " -74.009 | \n",
1663 | " 12.365179 | \n",
1664 | " 24.0 | \n",
1665 | " 0.000000 | \n",
1666 | " 0.0 | \n",
1667 | " 0.000000 | \n",
1668 | "
\n",
1669 | " \n",
1670 | " 50% | \n",
1671 | " 5433.483000 | \n",
1672 | " 4.070100e+01 | \n",
1673 | " -74.009 | \n",
1674 | " 15.759633 | \n",
1675 | " 24.0 | \n",
1676 | " 0.000000 | \n",
1677 | " 0.0 | \n",
1678 | " 0.000000 | \n",
1679 | "
\n",
1680 | " \n",
1681 | " 75% | \n",
1682 | " 6101.083000 | \n",
1683 | " 4.070100e+01 | \n",
1684 | " -74.009 | \n",
1685 | " 18.396000 | \n",
1686 | " 24.0 | \n",
1687 | " 140.000000 | \n",
1688 | " 0.0 | \n",
1689 | " 1.198718 | \n",
1690 | "
\n",
1691 | " \n",
1692 | " max | \n",
1693 | " 7853.300000 | \n",
1694 | " 4.070100e+01 | \n",
1695 | " -74.009 | \n",
1696 | " 24.327826 | \n",
1697 | " 24.0 | \n",
1698 | " 521.000000 | \n",
1699 | " 0.0 | \n",
1700 | " 13.738443 | \n",
1701 | "
\n",
1702 | " \n",
1703 | "
\n",
1704 | "
"
1705 | ],
1706 | "text/plain": [
1707 | " demand lat long temperature precipTime \\\n",
1708 | "count 743.000000 7.440000e+02 744.000 744.000000 744.0 \n",
1709 | "mean 5427.314386 4.070100e+01 -74.009 15.386461 24.0 \n",
1710 | "std 880.418908 7.110207e-15 0.000 4.159070 0.0 \n",
1711 | "min 3946.383000 4.070100e+01 -74.009 7.381250 24.0 \n",
1712 | "25% 4660.270500 4.070100e+01 -74.009 12.365179 24.0 \n",
1713 | "50% 5433.483000 4.070100e+01 -74.009 15.759633 24.0 \n",
1714 | "75% 6101.083000 4.070100e+01 -74.009 18.396000 24.0 \n",
1715 | "max 7853.300000 4.070100e+01 -74.009 24.327826 24.0 \n",
1716 | "\n",
1717 | " precipDepth snowDepth precipDepth/precipTime \n",
1718 | "count 744.000000 744.0 744.000000 \n",
1719 | "mean 81.774194 0.0 1.889804 \n",
1720 | "std 138.531590 0.0 3.683908 \n",
1721 | "min 0.000000 0.0 0.000000 \n",
1722 | "25% 0.000000 0.0 0.000000 \n",
1723 | "50% 0.000000 0.0 0.000000 \n",
1724 | "75% 140.000000 0.0 1.198718 \n",
1725 | "max 521.000000 0.0 13.738443 "
1726 | ]
1727 | },
1728 | "execution_count": 12,
1729 | "metadata": {},
1730 | "output_type": "execute_result"
1731 | }
1732 | ],
1733 | "source": [
1734 | "report_joined[datetime(2016, 10, 1, 0, 0)]"
1735 | ]
1736 | },
1737 | {
1738 | "cell_type": "code",
1739 | "execution_count": 13,
1740 | "metadata": {},
1741 | "outputs": [
1742 | {
1743 | "data": {
1744 | "text/html": [
1745 | "\n",
1746 | "\n",
1759 | "
\n",
1760 | " \n",
1761 | " \n",
1762 | " | \n",
1763 | " demand | \n",
1764 | " lat | \n",
1765 | " long | \n",
1766 | " temperature | \n",
1767 | " precipTime | \n",
1768 | " precipDepth | \n",
1769 | " snowDepth | \n",
1770 | " precipDepth/precipTime | \n",
1771 | "
\n",
1772 | " \n",
1773 | " \n",
1774 | " \n",
1775 | " count | \n",
1776 | " 720.000000 | \n",
1777 | " 7.200000e+02 | \n",
1778 | " 720.000 | \n",
1779 | " 720.000000 | \n",
1780 | " 720.0 | \n",
1781 | " 720.000000 | \n",
1782 | " 720.0 | \n",
1783 | " 720.000000 | \n",
1784 | "
\n",
1785 | " \n",
1786 | " mean | \n",
1787 | " 5371.189944 | \n",
1788 | " 4.070100e+01 | \n",
1789 | " -74.009 | \n",
1790 | " 10.440883 | \n",
1791 | " 24.0 | \n",
1792 | " 1787.366667 | \n",
1793 | " 0.0 | \n",
1794 | " 130.880351 | \n",
1795 | "
\n",
1796 | " \n",
1797 | " std | \n",
1798 | " 783.680586 | \n",
1799 | " 7.110367e-15 | \n",
1800 | " 0.000 | \n",
1801 | " 3.318841 | \n",
1802 | " 0.0 | \n",
1803 | " 1170.974488 | \n",
1804 | " 0.0 | \n",
1805 | " 122.276823 | \n",
1806 | "
\n",
1807 | " \n",
1808 | " min | \n",
1809 | " 3999.292000 | \n",
1810 | " 4.070100e+01 | \n",
1811 | " -74.009 | \n",
1812 | " 4.346117 | \n",
1813 | " 24.0 | \n",
1814 | " 0.000000 | \n",
1815 | " 0.0 | \n",
1816 | " 0.000000 | \n",
1817 | "
\n",
1818 | " \n",
1819 | " 25% | \n",
1820 | " 4622.829500 | \n",
1821 | " 4.070100e+01 | \n",
1822 | " -74.009 | \n",
1823 | " 7.377561 | \n",
1824 | " 24.0 | \n",
1825 | " 0.000000 | \n",
1826 | " 0.0 | \n",
1827 | " 0.000000 | \n",
1828 | "
\n",
1829 | " \n",
1830 | " 50% | \n",
1831 | " 5419.716500 | \n",
1832 | " 4.070100e+01 | \n",
1833 | " -74.009 | \n",
1834 | " 11.029123 | \n",
1835 | " 24.0 | \n",
1836 | " 2540.000000 | \n",
1837 | " 0.0 | \n",
1838 | " 78.905303 | \n",
1839 | "
\n",
1840 | " \n",
1841 | " 75% | \n",
1842 | " 6115.795750 | \n",
1843 | " 4.070100e+01 | \n",
1844 | " -74.009 | \n",
1845 | " 12.923214 | \n",
1846 | " 24.0 | \n",
1847 | " 2553.000000 | \n",
1848 | " 0.0 | \n",
1849 | " 232.143362 | \n",
1850 | "
\n",
1851 | " \n",
1852 | " max | \n",
1853 | " 6797.308000 | \n",
1854 | " 4.070100e+01 | \n",
1855 | " -74.009 | \n",
1856 | " 18.216071 | \n",
1857 | " 24.0 | \n",
1858 | " 2646.000000 | \n",
1859 | " 0.0 | \n",
1860 | " 404.605201 | \n",
1861 | "
\n",
1862 | " \n",
1863 | "
\n",
1864 | "
"
1865 | ],
1866 | "text/plain": [
1867 | " demand lat long temperature precipTime \\\n",
1868 | "count 720.000000 7.200000e+02 720.000 720.000000 720.0 \n",
1869 | "mean 5371.189944 4.070100e+01 -74.009 10.440883 24.0 \n",
1870 | "std 783.680586 7.110367e-15 0.000 3.318841 0.0 \n",
1871 | "min 3999.292000 4.070100e+01 -74.009 4.346117 24.0 \n",
1872 | "25% 4622.829500 4.070100e+01 -74.009 7.377561 24.0 \n",
1873 | "50% 5419.716500 4.070100e+01 -74.009 11.029123 24.0 \n",
1874 | "75% 6115.795750 4.070100e+01 -74.009 12.923214 24.0 \n",
1875 | "max 6797.308000 4.070100e+01 -74.009 18.216071 24.0 \n",
1876 | "\n",
1877 | " precipDepth snowDepth precipDepth/precipTime \n",
1878 | "count 720.000000 720.0 720.000000 \n",
1879 | "mean 1787.366667 0.0 130.880351 \n",
1880 | "std 1170.974488 0.0 122.276823 \n",
1881 | "min 0.000000 0.0 0.000000 \n",
1882 | "25% 0.000000 0.0 0.000000 \n",
1883 | "50% 2540.000000 0.0 78.905303 \n",
1884 | "75% 2553.000000 0.0 232.143362 \n",
1885 | "max 2646.000000 0.0 404.605201 "
1886 | ]
1887 | },
1888 | "execution_count": 13,
1889 | "metadata": {},
1890 | "output_type": "execute_result"
1891 | }
1892 | ],
1893 | "source": [
1894 | "report_joined[datetime(2016, 11, 1, 0, 0)]"
1895 | ]
1896 | },
1897 | {
1898 | "cell_type": "code",
1899 | "execution_count": 14,
1900 | "metadata": {},
1901 | "outputs": [],
1902 | "source": [
1903 | "# EOF"
1904 | ]
1905 | }
1906 | ],
1907 | "metadata": {
1908 | "kernelspec": {
1909 | "display_name": "Python 3",
1910 | "language": "python",
1911 | "name": "python3"
1912 | },
1913 | "language_info": {
1914 | "codemirror_mode": {
1915 | "name": "ipython",
1916 | "version": 3
1917 | },
1918 | "file_extension": ".py",
1919 | "mimetype": "text/x-python",
1920 | "name": "python",
1921 | "nbconvert_exporter": "python",
1922 | "pygments_lexer": "ipython3",
1923 | "version": "3.6.7"
1924 | }
1925 | },
1926 | "nbformat": 4,
1927 | "nbformat_minor": 2
1928 | }
1929 |
--------------------------------------------------------------------------------
/tutorials/taxi-automl/01-tutorial-opendatasets-automl.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Tutorial: Build a regression model with automated machine learning and Open Datasets"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "In this tutorial, you leverage the convenience of Azure Open Datasets along with the power of Azure Machine Learning service to create a regression model to predict NYC taxi fare prices. Easily download publicly available taxi, holiday and weather data, and configure an automated machine learning experiment using Azure Machine Learning service. This process accepts training data and configuration settings, and automatically iterates through combinations of different feature normalization/standardization methods, models, and hyperparameter settings to arrive at the best model.\n",
15 | "\n",
16 | "In this tutorial you learn the following tasks:\n",
17 | "\n",
18 | "* Configure an Azure Machine Learning service workspace\n",
19 | "* Set up a local Python environment\n",
20 | "* Access, transform, and join data using Azure Open Datasets\n",
21 | "* Train an automated machine learning regression model\n",
22 | "* Calculate model accuracy"
23 | ]
24 | },
25 | {
26 | "cell_type": "markdown",
27 | "metadata": {},
28 | "source": [
29 | "## Prerequisites"
30 | ]
31 | },
32 | {
33 | "cell_type": "markdown",
34 | "metadata": {},
35 | "source": [
36 | "This tutorial requires the following prerequisites.\n",
37 | "\n",
38 | "* An Azure Machine Learning service workspace\n",
39 | "* A Python 3.6 environment "
40 | ]
41 | },
42 | {
43 | "cell_type": "markdown",
44 | "metadata": {},
45 | "source": [
46 | "### Create a workspace"
47 | ]
48 | },
49 | {
50 | "cell_type": "markdown",
51 | "metadata": {},
52 | "source": [
53 | "Follow the [instructions](https://docs.microsoft.com/azure/machine-learning/service/setup-create-workspace#portal) to create a workspace through the Azure portal, if you don't already have one. After creation, make note of your workspace name, resource group name, and subscription id."
54 | ]
55 | },
56 | {
57 | "cell_type": "markdown",
58 | "metadata": {},
59 | "source": [
60 | "### Create a Python environment"
61 | ]
62 | },
63 | {
64 | "cell_type": "markdown",
65 | "metadata": {},
66 | "source": [
67 | "This example uses an Anaconda environment with Jupyter notebooks, but you can run this code in any 3.6.x environment and with any text editor or IDE. Use the following steps to create a new development environment.\n",
68 | "\n",
69 | "1. If you don't already have it, [download](https://www.anaconda.com/distribution/) and install Anaconda, and choose **Python 3.7 version**.\n",
70 | "1. Open an Anaconda prompt and create a new environment. It will take several minutes to create the environment while components and packages are downloaded.\n",
71 | "```\n",
72 | "conda create -n tutorialenv python=3.6.5\n",
73 | "```\n",
74 | "1. Activate the environment.\n",
75 | "```\n",
76 | "conda activate tutorialenv\n",
77 | "```\n",
78 | "1. Enable environment-specific ipython kernels.\n",
79 | "```\n",
80 | "conda install notebook ipykernel\n",
81 | "```\n",
82 | "1. Create the kernel.\n",
83 | "```\n",
84 | "ipython kernel install --user\n",
85 | "```\n",
86 | "1. Install the packages you need for this tutorial. These packages are large and will take 5-10 minutes to install.\n",
87 | "```\n",
88 | "pip install azureml-sdk[automl] azureml-opendatasets\n",
89 | "```\n",
90 | "1. Start a notebook kernel from your environment.\n",
91 | "```\n",
92 | "jupyter notebook\n",
93 | "```\n",
94 | "\n",
95 | "After you complete these steps, clone the [repo](https://github.com/Azure/OpenDatasetsNotebooks) and open the **tutorials/taxi-automl/01-tutorial-opendatasets-automl.ipynb** notebook to run it."
96 | ]
97 | },
98 | {
99 | "cell_type": "markdown",
100 | "metadata": {},
101 | "source": [
102 | "## Download and prepare data"
103 | ]
104 | },
105 | {
106 | "cell_type": "markdown",
107 | "metadata": {},
108 | "source": [
109 | "Import the necessary packages. The Open Datasets package contains a class representing each data source (`NycTlcGreen` for example) to easily filter date parameters before downloading."
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": null,
115 | "metadata": {},
116 | "outputs": [],
117 | "source": [
118 | "from azureml.opendatasets import NycTlcGreen\n",
119 | "import pandas as pd\n",
120 | "from datetime import datetime\n",
121 | "from dateutil.relativedelta import relativedelta"
122 | ]
123 | },
124 | {
125 | "cell_type": "markdown",
126 | "metadata": {},
127 | "source": [
128 | "Begin by creating a dataframe to hold the taxi data. When working in a non-Spark environment, Open Datasets only allows downloading one month of data at a time with certain classes to avoid `MemoryError` with large datasets. To download a year of taxi data, iteratively fetch one month at a time, and before appending it to `green_taxi_df` randomly sample 2000 records from each month to avoid bloating the dataframe. Then preview the data.\n",
129 | "\n",
130 | "Note: Open Datasets has mirroring classes for working in Spark environments where data size and memory aren't a concern."
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": null,
136 | "metadata": {},
137 | "outputs": [],
138 | "source": [
139 | "green_taxi_df = pd.DataFrame([])\n",
140 | "start = datetime.strptime(\"1/1/2016\",\"%m/%d/%Y\")\n",
141 | "end = datetime.strptime(\"1/31/2016\",\"%m/%d/%Y\")\n",
142 | "\n",
143 | "for sample_month in range(12):\n",
144 | " temp_df_green = NycTlcGreen(start + relativedelta(months=sample_month), end + relativedelta(months=sample_month)) \\\n",
145 | " .to_pandas_dataframe()\n",
146 | " green_taxi_df = green_taxi_df.append(temp_df_green.sample(2000))\n",
147 | " \n",
148 | "green_taxi_df.head(10)"
149 | ]
150 | },
151 | {
152 | "cell_type": "markdown",
153 | "metadata": {},
154 | "source": [
155 | "Now that the intial data is loaded, define a function to create various time-based features from the pickup datetime field. This will create new fields for the month number, day of month, day of week, and hour of day, and will allow the model to factor in time-based seasonality. The function also adds a static feature for the country code to join holiday data. Use the `apply()` function on the dataframe to iteratively apply the `build_time_features()` function to each row in the taxi data."
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": null,
161 | "metadata": {},
162 | "outputs": [],
163 | "source": [
164 | "def build_time_features(vector):\n",
165 | " pickup_datetime = vector[0]\n",
166 | " month_num = pickup_datetime.month\n",
167 | " day_of_month = pickup_datetime.day\n",
168 | " day_of_week = pickup_datetime.weekday()\n",
169 | " hour_of_day = pickup_datetime.hour\n",
170 | " country_code = \"US\"\n",
171 | " \n",
172 | " return pd.Series((month_num, day_of_month, day_of_week, hour_of_day, country_code))\n",
173 | "\n",
174 | "green_taxi_df[[\"month_num\", \"day_of_month\",\"day_of_week\", \"hour_of_day\", \"country_code\"]] = green_taxi_df[[\"lpepPickupDatetime\"]].apply(build_time_features, axis=1)\n",
175 | "green_taxi_df.head(10)"
176 | ]
177 | },
178 | {
179 | "cell_type": "markdown",
180 | "metadata": {},
181 | "source": [
182 | "Remove some of the columns that you won't need for modeling or additional feature building. Rename the time field for pickup time, and additionally convert the time to midnight using `pandas.Series.dt.normalize`. You do this to all time features so that the datetime component can be later used as a key when joining datasets together at a daily level of granularity."
183 | ]
184 | },
185 | {
186 | "cell_type": "code",
187 | "execution_count": null,
188 | "metadata": {
189 | "scrolled": false
190 | },
191 | "outputs": [],
192 | "source": [
193 | "columns_to_remove = [\"lpepDropoffDatetime\", \"puLocationId\", \"doLocationId\", \"extra\", \"mtaTax\",\n",
194 | " \"improvementSurcharge\", \"tollsAmount\", \"ehailFee\", \"tripType\", \"rateCodeID\", \n",
195 | " \"storeAndFwdFlag\", \"paymentType\", \"fareAmount\", \"tipAmount\"\n",
196 | " ]\n",
197 | "for col in columns_to_remove:\n",
198 | " green_taxi_df.pop(col)\n",
199 | " \n",
200 | "green_taxi_df = green_taxi_df.rename(columns={\"lpepPickupDatetime\": \"datetime\"})\n",
201 | "green_taxi_df[\"datetime\"] = green_taxi_df[\"datetime\"].dt.normalize()\n",
202 | "green_taxi_df.head(5)"
203 | ]
204 | },
205 | {
206 | "cell_type": "markdown",
207 | "metadata": {},
208 | "source": [
209 | "### Enrich with holiday data"
210 | ]
211 | },
212 | {
213 | "cell_type": "markdown",
214 | "metadata": {},
215 | "source": [
216 | "Now that you have taxi data downloaded and roughly prepared, add in holiday data as additional features. Holiday-specific features will assist model accuracy, as major holidays are times where taxi demand increases dramatically and supply becomes limited. The holiday dataset is relatively small, so fetch the full set by using the `PublicHolidays` class constructor with no parameters for filtering. Preview the data to check the format."
217 | ]
218 | },
219 | {
220 | "cell_type": "code",
221 | "execution_count": null,
222 | "metadata": {},
223 | "outputs": [],
224 | "source": [
225 | "from azureml.opendatasets import PublicHolidays\n",
226 | "# call default constructor to download full dataset\n",
227 | "holidays_df = PublicHolidays().to_pandas_dataframe()\n",
228 | "holidays_df.head(5)"
229 | ]
230 | },
231 | {
232 | "cell_type": "markdown",
233 | "metadata": {},
234 | "source": [
235 | "Rename the `countryRegionCode` and `date` columns to match the respective field names from the taxi data, and also normalize the time so it can be used as a key. Next, join the holiday data with the taxi data by performing a left-join using the Pandas `merge()` function. This will preserve all records from `green_taxi_df`, but add in holiday data where it exists for the corresponding `datetime` and `country_code`, which in this case is always `\"US\"`. Preview the data to verify that they were merged correctly."
236 | ]
237 | },
238 | {
239 | "cell_type": "code",
240 | "execution_count": null,
241 | "metadata": {
242 | "scrolled": false
243 | },
244 | "outputs": [],
245 | "source": [
246 | "holidays_df = holidays_df.rename(columns={\"countryRegionCode\": \"country_code\", \"date\": \"datetime\"})\n",
247 | "holidays_df[\"datetime\"] = holidays_df[\"datetime\"].dt.normalize()\n",
248 | "holidays_df.pop(\"countryOrRegion\")\n",
249 | "holidays_df.pop(\"holidayName\")\n",
250 | "\n",
251 | "taxi_holidays_df = pd.merge(green_taxi_df, holidays_df, how=\"left\", on=[\"datetime\", \"country_code\"])\n",
252 | "taxi_holidays_df.head(5)"
253 | ]
254 | },
255 | {
256 | "cell_type": "markdown",
257 | "metadata": {},
258 | "source": [
259 | "### Enrich with weather data"
260 | ]
261 | },
262 | {
263 | "cell_type": "markdown",
264 | "metadata": {},
265 | "source": [
266 | "Now you append NOAA surface weather data to the taxi and holiday data. Use a similar approach to fetch the weather data by downloading one month at a time iteratively. Additionally, specify the `cols` parameter with an array of strings to filter the columns you want to download. This is a very large dataset containing weather surface data from all over the world, so before appending each month, filter the lat/long fields to near NYC using the `query()` function on the dataframe. This will ensure the `weather_df` doesn't get too large."
267 | ]
268 | },
269 | {
270 | "cell_type": "code",
271 | "execution_count": null,
272 | "metadata": {},
273 | "outputs": [],
274 | "source": [
275 | "from azureml.opendatasets import NoaaIsdWeather\n",
276 | "\n",
277 | "weather_df = pd.DataFrame([])\n",
278 | "start = datetime.strptime(\"1/1/2016\",\"%m/%d/%Y\")\n",
279 | "end = datetime.strptime(\"1/31/2016\",\"%m/%d/%Y\")\n",
280 | "\n",
281 | "for sample_month in range(12):\n",
282 | " tmp_df = NoaaIsdWeather(cols=[\"temperature\", \"precipTime\", \"precipDepth\", \"snowDepth\"], start_date=start + relativedelta(months=sample_month), end_date=end + relativedelta(months=sample_month))\\\n",
283 | " .to_pandas_dataframe()\n",
284 | " print(\"--weather downloaded--\")\n",
285 | " \n",
286 | " # filter out coordinates not in NYC to conserve memory\n",
287 | " tmp_df = tmp_df.query(\"latitude>=40.53 and latitude<=40.88\")\n",
288 | " tmp_df = tmp_df.query(\"longitude>=-74.09 and longitude<=-73.72\")\n",
289 | " print(\"--filtered coordinates--\")\n",
290 | " weather_df = weather_df.append(tmp_df)\n",
291 | " \n",
292 | "weather_df.head(10)"
293 | ]
294 | },
295 | {
296 | "cell_type": "markdown",
297 | "metadata": {},
298 | "source": [
299 | "Again call `pandas.Series.dt.normalize` on the `datetime` field in the weather data so it matches the time key in `taxi_holidays_df`. Delete the unneeded columns, and filter out records where the temperature is `NaN`. \n",
300 | "\n",
301 | "Next group the weather data so that you have daily aggregated weather values. Define a dict `aggregations` to define how to aggregate each field at a daily level. For `snowDepth` and `temperature` take the mean and for `precipTime` and `precipDepth` take the daily maximum. Use the `groupby()` function along with the aggregations to group the data. Preview the data to ensure there is one record per day."
302 | ]
303 | },
304 | {
305 | "cell_type": "code",
306 | "execution_count": null,
307 | "metadata": {},
308 | "outputs": [],
309 | "source": [
310 | "weather_df[\"datetime\"] = weather_df[\"datetime\"].dt.normalize()\n",
311 | "weather_df.pop(\"usaf\")\n",
312 | "weather_df.pop(\"wban\")\n",
313 | "weather_df.pop(\"longitude\")\n",
314 | "weather_df.pop(\"latitude\")\n",
315 | "\n",
316 | "# filter out NaN\n",
317 | "weather_df = weather_df.query(\"temperature==temperature\")\n",
318 | "\n",
319 | "# group by datetime\n",
320 | "aggregations = {\"snowDepth\": \"mean\", \"precipTime\": \"max\", \"temperature\": \"mean\", \"precipDepth\": \"max\"}\n",
321 | "weather_df_grouped = weather_df.groupby(\"datetime\").agg(aggregations)\n",
322 | "weather_df_grouped.head(10)"
323 | ]
324 | },
325 | {
326 | "cell_type": "markdown",
327 | "metadata": {},
328 | "source": [
329 | "Note: The examples in this tutorial merge data using Pandas functions and custom aggregations, but the Open Datasets SDK has classes designed to easily merge and enrich data sets. See the [notebook](https://github.com/Azure/OpenDatasetsNotebooks/blob/master/tutorials/data-join/04-nyc-taxi-join-weather-in-pandas.ipynb) for code examples of these design patterns."
330 | ]
331 | },
332 | {
333 | "cell_type": "markdown",
334 | "metadata": {},
335 | "source": [
336 | "### Cleanse data "
337 | ]
338 | },
339 | {
340 | "cell_type": "markdown",
341 | "metadata": {},
342 | "source": [
343 | "Merge the taxi and holiday data you prepared with the new weather data. This time you only need the `datetime` key, and again perform a left-join of the data. Run the `describe()` function on the new dataframe to see summary statistics for each field."
344 | ]
345 | },
346 | {
347 | "cell_type": "code",
348 | "execution_count": null,
349 | "metadata": {
350 | "scrolled": true
351 | },
352 | "outputs": [],
353 | "source": [
354 | "taxi_holidays_weather_df = pd.merge(taxi_holidays_df, weather_df_grouped, how=\"left\", on=[\"datetime\"])\n",
355 | "taxi_holidays_weather_df.describe()"
356 | ]
357 | },
358 | {
359 | "cell_type": "markdown",
360 | "metadata": {},
361 | "source": [
362 | "From the summary statistics, you see that there are several fields that have outliers or values that will reduce model accuracy. First filter the lat/long fields to be within the same bounds you used for filtering weather data. The `tripDistance` field has some bad data, because the minimum value is negative. The `passengerCount` field has bad data as well, with the max value being 210 passengers. Lastly, the `totalAmount` field has negative values, which don't make sense in the context of our model.\n",
363 | "\n",
364 | "Filter out these anomolies using query functions, and then remove the last few columns unnecesary for training."
365 | ]
366 | },
367 | {
368 | "cell_type": "code",
369 | "execution_count": null,
370 | "metadata": {},
371 | "outputs": [],
372 | "source": [
373 | "final_df = taxi_holidays_weather_df.query(\"pickupLatitude>=40.53 and pickupLatitude<=40.88\")\n",
374 | "final_df = final_df.query(\"pickupLongitude>=-74.09 and pickupLongitude<=-73.72\")\n",
375 | "final_df = final_df.query(\"tripDistance>0 and tripDistance<75\")\n",
376 | "final_df = final_df.query(\"passengerCount>0 and passengerCount<100\")\n",
377 | "final_df = final_df.query(\"totalAmount>0\")\n",
378 | "\n",
379 | "columns_to_remove_for_training = [\"datetime\", \"pickupLongitude\", \"pickupLatitude\", \"dropoffLongitude\", \"dropoffLatitude\", \"country_code\"]\n",
380 | "for col in columns_to_remove_for_training:\n",
381 | " final_df.pop(col)"
382 | ]
383 | },
384 | {
385 | "cell_type": "markdown",
386 | "metadata": {},
387 | "source": [
388 | "Call `describe()` again on the data to ensure cleansing worked as expected. You now have a prepared and cleansed set of taxi, holiday, and weather data to use for machine learning model training."
389 | ]
390 | },
391 | {
392 | "cell_type": "code",
393 | "execution_count": null,
394 | "metadata": {},
395 | "outputs": [],
396 | "source": [
397 | "final_df.describe()"
398 | ]
399 | },
400 | {
401 | "cell_type": "markdown",
402 | "metadata": {},
403 | "source": [
404 | "## Train a model"
405 | ]
406 | },
407 | {
408 | "cell_type": "markdown",
409 | "metadata": {},
410 | "source": [
411 | "Now you use the prepared data to train an automated machine learning model. Start by splitting `final_df` into features (X values) and labels (y value), which for this model is the taxi fare cost."
412 | ]
413 | },
414 | {
415 | "cell_type": "code",
416 | "execution_count": null,
417 | "metadata": {},
418 | "outputs": [],
419 | "source": [
420 | "y_df = final_df.pop(\"totalAmount\")\n",
421 | "x_df = final_df"
422 | ]
423 | },
424 | {
425 | "cell_type": "markdown",
426 | "metadata": {},
427 | "source": [
428 | "Now you split the data into training and test sets by using the `train_test_split()` function in the `scikit-learn` library. The `test_size` parameter determines the percentage of data to allocate to testing. The `random_state` parameter sets a seed to the random number generator, so that your train-test splits are deterministic."
429 | ]
430 | },
431 | {
432 | "cell_type": "code",
433 | "execution_count": null,
434 | "metadata": {},
435 | "outputs": [],
436 | "source": [
437 | "from sklearn.model_selection import train_test_split\n",
438 | "\n",
439 | "X_train, X_test, y_train, y_test = train_test_split(x_df, y_df, test_size=0.2, random_state=222)"
440 | ]
441 | },
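442 | {
443 | "cell_type": "markdown",
444 | "metadata": {},
445 | "source": [
446 | "To verify the 80/20 split, you can inspect the resulting shapes (a quick check, not required for training):"
447 | ]
448 | },
449 | {
450 | "cell_type": "code",
451 | "execution_count": null,
452 | "metadata": {},
453 | "outputs": [],
454 | "source": [
455 | "# roughly 80% of the rows should land in the training set\n",
456 | "print(\"train:\", X_train.shape, \"test:\", X_test.shape)"
457 | ]
458 | },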
442 | {
443 | "cell_type": "markdown",
444 | "metadata": {},
445 | "source": [
446 | "### Load workspace and configure experiment"
447 | ]
448 | },
449 | {
450 | "cell_type": "markdown",
451 | "metadata": {},
452 | "source": [
453 | "Load your Azure Machine Learning service workspace using the `get()` function with your subscription and workspace information. Create an experiment within your workspace to store and monitor your model runs."
454 | ]
455 | },
456 | {
457 | "cell_type": "code",
458 | "execution_count": null,
459 | "metadata": {},
460 | "outputs": [],
461 | "source": [
462 | "from azureml.core.workspace import Workspace\n",
463 | "from azureml.core.experiment import Experiment\n",
464 | "\n",
465 | "workspace = Workspace.get(subscription_id=\"\", name=\"\", resource_group=\"\")\n",
466 | "experiment = Experiment(workspace, \"opendatasets-ml\")"
467 | ]
468 | },
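469 | {
470 | "cell_type": "markdown",
471 | "metadata": {},
472 | "source": [
473 | "Alternatively, if you've downloaded your workspace's `config.json` (for example, from the Azure portal), you can load the workspace with `Workspace.from_config()`. This is a sketch that assumes the file is in the current directory or one of its parents:"
474 | ]
475 | },
476 | {
477 | "cell_type": "code",
478 | "execution_count": null,
479 | "metadata": {},
480 | "outputs": [],
481 | "source": [
482 | "from azureml.core.workspace import Workspace\n",
483 | "\n",
484 | "# assumes a config.json saved from the Azure portal is on the search path\n",
485 | "workspace = Workspace.from_config()"
486 | ]
487 | },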
469 | {
470 | "cell_type": "markdown",
471 | "metadata": {},
472 | "source": [
473 | "Create a configuration object for the experiment using the `AutoMLConfig` class. You attach your training data, and additionally specify settings and parameters that control the training process. The parameters have the following purposes:\n",
474 | "\n",
475 | "* `task`: the type of experiment to run.\n",
476 | "* `X`: training features.\n",
477 | "* `y`: training labels.\n",
478 | "* `iterations`: number of iterations to run. Each iteration tries combinations of different feature normalization/standardization methods, and different models using multiple hyperparameter settings.\n",
479 | "* `primary_metric`: primary metric to optimize during model training. Best fit model will be chosen based on this metric.\n",
480 | "* `preprocess`: controls whether the experiment can preprocess the input data (handling missing data, converting text to numeric, etc.)\n",
481 | "* `n_cross_validations`: Number of cross-validation splits to perform when validation data is not specified."
482 | ]
483 | },
484 | {
485 | "cell_type": "code",
486 | "execution_count": null,
487 | "metadata": {},
488 | "outputs": [],
489 | "source": [
490 | "from azureml.train.automl import AutoMLConfig\n",
491 | "\n",
492 | "automl_config = AutoMLConfig(task=\"regression\", \n",
493 | " X=X_train.values, \n",
494 | " y=y_train.values.flatten(),\n",
495 | " iterations=20,\n",
496 | " primary_metric=\"spearman_correlation\",\n",
497 | " preprocess=True,\n",
498 | " n_cross_validations=5\n",
499 | " )"
500 | ]
501 | },
502 | {
503 | "cell_type": "markdown",
504 | "metadata": {},
505 | "source": [
506 | "### Submit experiment"
507 | ]
508 | },
509 | {
510 | "cell_type": "markdown",
511 | "metadata": {},
512 | "source": [
513 | "Submit the experiment for training. After submitting the experiment, the process iterates through different machine learning algorithms and hyperparameter settings, adhering to your defined constraints. It chooses the best-fit model by optimizing the defined accuracy metric. Pass the `automl_config` object to the experiment. Set the output to `True` to view progress during the experiment. \n",
514 | "\n",
515 | "After submitting the experiment you see live output for the training process. For each iteration, you see the model type and feature normalization/standardization method, the run duration, and the training accuracy. The field `BEST` tracks the best running training score based on your metric type."
516 | ]
517 | },
518 | {
519 | "cell_type": "code",
520 | "execution_count": null,
521 | "metadata": {},
522 | "outputs": [],
523 | "source": [
524 | "training_run = experiment.submit(automl_config, show_output=True)"
525 | ]
526 | },
527 | {
528 | "cell_type": "markdown",
529 | "metadata": {},
530 | "source": [
531 | "### Retrieve the fitted model"
532 | ]
533 | },
534 | {
535 | "cell_type": "markdown",
536 | "metadata": {},
537 | "source": [
538 | "At the end of all training iterations, the automated machine learning process creates an ensemble algorithm from all individual runs, either with bagging or stacking. Retrieve the fitted ensemble into the variable `fitted_model`, and the best individual run into the variable `best_run`."
539 | ]
540 | },
541 | {
542 | "cell_type": "code",
543 | "execution_count": null,
544 | "metadata": {},
545 | "outputs": [],
546 | "source": [
547 | "best_run, fitted_model = training_run.get_output()\n",
548 | "print(best_run)\n",
549 | "print(fitted_model)"
550 | ]
551 | },
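552 | {
553 | "cell_type": "markdown",
554 | "metadata": {},
555 | "source": [
556 | "You can also inspect the metrics that were logged for the best run (a sketch using the run object's standard `get_metrics()` API):"
557 | ]
558 | },
559 | {
560 | "cell_type": "code",
561 | "execution_count": null,
562 | "metadata": {},
563 | "outputs": [],
564 | "source": [
565 | "# print every metric recorded for the best individual run\n",
566 | "print(best_run.get_metrics())"
567 | ]
568 | },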
552 | {
553 | "cell_type": "markdown",
554 | "metadata": {},
555 | "source": [
556 | "## Test model accuracy"
557 | ]
558 | },
559 | {
560 | "cell_type": "markdown",
561 | "metadata": {},
562 | "source": [
563 | "Use the fitted ensemble model to run predictions on the test dataset to predict taxi fares. The function `predict()` uses the fitted model and predicts the values of y, taxi fare cost, for the `X_test` dataset."
564 | ]
565 | },
566 | {
567 | "cell_type": "code",
568 | "execution_count": null,
569 | "metadata": {},
570 | "outputs": [],
571 | "source": [
572 | "y_predict = fitted_model.predict(X_test.values)"
573 | ]
574 | },
575 | {
576 | "cell_type": "markdown",
577 | "metadata": {},
578 | "source": [
579 | "Calculate the root mean squared error of the results. Use the `y_test` dataframe, and convert it to a list `y_actual` to compare to the predicted values. The function `mean_squared_error` takes two arrays of values and calculates the average squared error between them. Taking the square root of the result gives an error in the same units as the y variable, cost. It indicates roughly how far the taxi fare predictions are from the actual fares, while heavily weighting large errors."
580 | ]
581 | },
582 | {
583 | "cell_type": "code",
584 | "execution_count": null,
585 | "metadata": {},
586 | "outputs": [],
587 | "source": [
588 | "from sklearn.metrics import mean_squared_error\n",
589 | "from math import sqrt\n",
590 | "\n",
591 | "y_actual = y_test.values.flatten().tolist()\n",
592 | "rmse = sqrt(mean_squared_error(y_actual, y_predict))\n",
593 | "rmse"
594 | ]
595 | },
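596 | {
597 | "cell_type": "markdown",
598 | "metadata": {},
599 | "source": [
600 | "If your environment has scikit-learn 0.22 or later, the square-root step can be folded into the metric call by passing `squared=False` (a sketch; the environment this notebook was written against may predate that option):"
601 | ]
602 | },
603 | {
604 | "cell_type": "code",
605 | "execution_count": null,
606 | "metadata": {},
607 | "outputs": [],
608 | "source": [
609 | "from sklearn.metrics import mean_squared_error\n",
610 | "\n",
611 | "# squared=False returns the root mean squared error directly (scikit-learn >= 0.22)\n",
612 | "mean_squared_error(y_actual, y_predict, squared=False)"
613 | ]
614 | },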
596 | {
597 | "cell_type": "markdown",
598 | "metadata": {},
599 | "source": [
600 | "Run the following code to calculate mean absolute percent error (MAPE) by using the full `y_actual` and `y_predict` datasets. This metric calculates an absolute difference between each predicted and actual value and sums all the differences. Then it expresses that sum as a percent of the total of the actual values."
601 | ]
602 | },
603 | {
604 | "cell_type": "code",
605 | "execution_count": null,
606 | "metadata": {},
607 | "outputs": [],
608 | "source": [
609 | "sum_actuals = sum_errors = 0\n",
610 | "\n",
611 | "for actual_val, predict_val in zip(y_actual, y_predict):\n",
612 | " abs_error = actual_val - predict_val\n",
613 | " if abs_error < 0:\n",
614 | " abs_error = abs_error * -1\n",
615 | "\n",
616 | " sum_errors = sum_errors + abs_error\n",
617 | " sum_actuals = sum_actuals + actual_val\n",
618 | "\n",
619 | "mean_abs_percent_error = sum_errors / sum_actuals\n",
620 | "print(\"Model MAPE:\")\n",
621 | "print(mean_abs_percent_error)\n",
622 | "print()\n",
623 | "print(\"Model Accuracy:\")\n",
624 | "print(1 - mean_abs_percent_error)"
625 | ]
626 | },
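627 | {
628 | "cell_type": "markdown",
629 | "metadata": {},
630 | "source": [
631 | "Equivalently, the same error can be computed in vectorized form; a minimal sketch assuming `numpy` is available:"
632 | ]
633 | },
634 | {
635 | "cell_type": "code",
636 | "execution_count": null,
637 | "metadata": {},
638 | "outputs": [],
639 | "source": [
640 | "import numpy as np\n",
641 | "\n",
642 | "# sum of absolute errors divided by the sum of actuals, matching the loop above\n",
643 | "abs_errors = np.abs(np.asarray(y_actual) - np.asarray(y_predict))\n",
644 | "print(\"Model MAPE:\", abs_errors.sum() / np.asarray(y_actual).sum())"
645 | ]
646 | },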
627 | {
628 | "cell_type": "markdown",
629 | "metadata": {},
630 | "source": [
631 | "Given that we used a fairly small sample of data relative to the full dataset (n=11748), model accuracy is fairly high at 85%, with RMSE at around +- $4.00 error in predicting taxi fare price. As a potential next step to improve accuracy, go back to the second cell of this notebook, and increase the sample size from 2,000 records per month, and run the entire experiment again to re-train the model with more data."
632 | ]
633 | },
634 | {
635 | "cell_type": "markdown",
636 | "metadata": {},
637 | "source": [
638 | "## Clean up resources"
639 | ]
640 | },
641 | {
642 | "cell_type": "markdown",
643 | "metadata": {},
644 | "source": [
645 | "If you don't plan to use the resources you created, delete them, so you don't incur any charges.\n",
646 | "\n",
647 | "1. In the Azure portal, select **Resource groups** on the far left.\n",
648 | "1. From the list, select the resource group you created.\n",
649 | "1. Select **Delete resource group**.\n",
650 | "1. Enter the resource group name. Then select **Delete**."
651 | ]
652 | },
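653 | {
654 | "cell_type": "markdown",
655 | "metadata": {},
656 | "source": [
657 | "Alternatively, assuming the Azure CLI is installed and you're signed in, you can delete the resource group from a notebook cell. The name below is a placeholder for the resource group you created:"
658 | ]
659 | },
660 | {
661 | "cell_type": "code",
662 | "execution_count": null,
663 | "metadata": {},
664 | "outputs": [],
665 | "source": [
666 | "# replace <my-resource-group> with your resource group name; --no-wait returns immediately\n",
667 | "!az group delete --name <my-resource-group> --yes --no-wait"
668 | ]
669 | },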
653 | {
654 | "cell_type": "markdown",
655 | "metadata": {},
656 | "source": [
657 | "## Next steps"
658 | ]
659 | },
660 | {
661 | "cell_type": "markdown",
662 | "metadata": {},
663 | "source": [
664 | "* See the Azure Open Datasets [notebooks](https://github.com/Azure/OpenDatasetsNotebooks) for more code examples.\n",
665 | "* Follow the [how-to](https://docs.microsoft.com/azure/machine-learning/service/how-to-configure-auto-train) for more information on automated machine learning in Azure Machine Learning service."
666 | ]
667 | }
668 | ],
669 | "metadata": {
670 | "kernelspec": {
671 | "display_name": "Python 3",
672 | "language": "python",
673 | "name": "python3"
674 | },
675 | "language_info": {
676 | "codemirror_mode": {
677 | "name": "ipython",
678 | "version": 3
679 | },
680 | "file_extension": ".py",
681 | "mimetype": "text/x-python",
682 | "name": "python",
683 | "nbconvert_exporter": "python",
684 | "pygments_lexer": "ipython3",
685 | "version": "3.6.5"
686 | }
687 | },
688 | "nbformat": 4,
689 | "nbformat_minor": 2
690 | }
691 |
--------------------------------------------------------------------------------