├── LICENSE
├── README.md
├── docs
    └── source
    │   └── images
    │       ├── deploy-model-results.png
    │       ├── deploy-node-config.png
    │       ├── flight-delays-pipeline-deploy.png
    │       ├── flight-delays-pipeline.png
    │       ├── kfp-experiment-deploy.png
    │       ├── kfp-experiment.png
    │       └── object-storage-results.png
├── flight-delays-env.yaml
├── kfserving.md
├── notebooks
    ├── analyze_flight_delays.ipynb
    ├── deploy_model.ipynb
    ├── load_data.py
    ├── merge_data.ipynb
    ├── predict_flight_delays.ipynb
    ├── process_flight_data.ipynb
    └── process_weather_data.ipynb
└── pipelines
    ├── flight_delays.pipeline
    └── flight_delays_with_deployment.pipeline


/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Analyzing flight delay and weather data using Elyra, Kubeflow Pipelines and KFServing
  2 | 
  3 | This repository contains a set of Python scripts and Jupyter notebooks that analyze and predict flight delays. The datasets are hosted on the [IBM Developer Data Asset Exchange](https://ibm.biz/data-exchange).
  4 | 
  5 | We use [Elyra](https://github.com/elyra-ai/elyra) to create a pipeline that can be executed locally or using a [Kubeflow Pipelines](https://www.kubeflow.org/docs/pipelines/overview/pipelines-overview/) runtime. This pipeline:
  6 | 
  7 | * Loads the datasets
  8 | * Pre-processes the datasets
  9 | * Performs data merging and feature extraction
 10 | * Analyzes and visualizes the processed dataset
 11 | * Trains and evaluates machine learning models for predicting delayed flights, using features about flights as well as related weather features
 12 | * _Optionally_ deploys the trained model to Kubeflow Serving
 13 | 
 14 | ![Flight Delays Pipeline](docs/source/images/flight-delays-pipeline.png)
 15 | 
 16 | ### Configuring the local development environment
 17 | 
 18 | It's highly recommended to create a dedicated and consistent Python environment for running the notebooks in this repository:
 19 | 
 20 | 1. Install [Anaconda](https://docs.anaconda.com/anaconda/install/)
 21 |    or [Miniconda](https://docs.conda.io/en/latest/miniconda.html)
 22 | 1. Navigate to your local copy of this repository.
 23 | 1. Create an Anaconda environment from the `yaml` file in the repository:
 24 |     ```console
 25 |     $ conda env create -f flight-delays-env.yaml
 26 |     ```
 27 | 1. Activate the new environment:
 28 |     ```console
 29 |     $ conda activate flight-delays-env
 30 |     ```
 31 | 1. If running JupyterLab and Elyra for the first time, build the extensions:
 32 |     ```console
 33 |     $ jupyter lab build
 34 |     ```
 35 | 1. Launch JupyterLab:
 36 |     ```console
 37 |     $ jupyter lab
 38 |     ```
 39 | 
 40 | ### Configuring a Kubeflow Pipeline runtime
 41 | 
 42 | [Elyra's Notebook pipeline visual editor](https://elyra.readthedocs.io/en/latest/getting_started/overview.html#notebook-pipelines-visual-editor)
 43 | currently supports running these pipelines in a Kubeflow Pipeline runtime.  If required, these are
 44 | [the steps to install a local deployment of KFP](https://elyra.readthedocs.io/en/latest/recipes/deploying-kubeflow-locally-for-dev.html).
 45 | 
 46 | After installing your Kubeflow Pipeline runtime, use the command below (with proper updates) to configure the new
 47 | KFP runtime with Elyra.
 48 | 
 49 | ```bash
 50 | elyra-metadata install runtimes --replace=true \
 51 |        --schema_name=kfp \
 52 |        --name=kfp_runtime \
 53 |        --display_name="Kubeflow Pipeline Runtime" \
 54 |        --api_endpoint=http://[host]:[api port]/pipeline \
 55 |        --cos_endpoint=http://[host]:[cos port] \
 56 |        --cos_username=[cos username] \
 57 |        --cos_password=[cos password] \
 58 |        --cos_bucket=flights
 59 | ```
 60 | 
 61 | **Note:** The cloud object storage endpoint above assumes a local minio object storage but other cloud-based object storage services could be configured and used in this scenario.
 62 | 
 63 | If using the default minio storage - following the local Kubeflow installation instructions above - the arguments should be `--cos_endpoint=http://minio-service:9000`, `--cos_username=minio`, `--cos_password=minio123`. The api endpoint for local Kubeflow Pipelines would then be `--api_endpoint=http://127.0.0.1:31380/pipeline`.
 64 | 
 65 | **Don't forget to setup port-forwarding for the KFP ML Pipelines API service and Minio service as per the above instructions.**
 66 | 
 67 | ## Elyra Notebook pipelines
 68 | 
 69 | Elyra provides a visual editor for building Notebook-based AI pipelines, simplifying the conversion of 
 70 | multiple notebooks into batch jobs or workflows. By leveraging cloud-based resources to run their 
 71 | experiments faster, the data scientists, machine learning engineers, and AI developers are then more productive,
 72 | allowing them to spend their time using their technical skills.
 73 | 
 74 | ![Notebook pipeline](https://raw.githubusercontent.com/elyra-ai/community/master/resources/blog-announcement/elyra-pipelines.gif)
 75 | 
 76 | ### Running the Elyra pipeline
 77 | 
 78 | The Elyra pipeline `flight_delays.pipeline`, which is located in the `pipelines` directory, can be run by clicking
 79 | on the `play` button as seen on the image above. The `submit` dialog will request two inputs from the user: a name 
 80 | for the pipeline and a runtime to use while executing the pipeline.
 81 | 
 82 | The list of available runtimes comes from the registered Kubeflow Pipelines runtimes documented above and includes a `Run in-place locally` option for local execution.
 83 | 
 84 | #### Local execution
 85 | 
 86 | If running locally, the notebooks are executed and updated in-place. You can track the progress in the terminal screen where you ran `jupyter lab`. The downloaded and processed datasets will be available locally in `notebooks/data` in this case.
 87 | 
 88 | #### Kubeflow Pipelines execution
 89 | 
 90 | After submitting the pipeline to Kubeflow Pipelines, Elyra will show a dialog with a direct link to where the experiment is being executed on Kubeflow Piplines.
 91 | 
 92 | The user can access the pipelines, and respective experiment runs, via the `api_endpoint` of the Kubeflow Pipelines
 93 | runtime (e.g. `http://[host]:[port]/pipeline`)
 94 | 
 95 | ![Pipeline experiment run](docs/source/images/kfp-experiment.png)
 96 | 
 97 | The output from the executed experiments are then available in the associated `object storage`
 98 | and the executed notebooks are available as native `.ipynb` notebooks and also in `html` format
 99 | to facilitate the visualization and sharing of the results.
100 | 
101 | ![Pipeline experiment results in object storage](docs/source/images/object-storage-results.png)
102 | 
103 | 
104 | ### Running the Elyra pipeline with model deployment to Kubeflow Serving
105 | 
106 | Please follow the [instructions](kfserving.md) for running the pipeline `flight_delays_with_deployment.pipeline`, which adds a node at the end of the pipeline for deploying the model to [KFServing](https://www.kubeflow.org/docs/components/serving/kfserving/).
107 | 
108 | ### References
109 | 
110 | Find more project details on [Elyra's GitHub](https://github.com/elyra-ai/elyra) or watching the
111 | [Elyra demo](https://www.youtube.com/watch?v=Nj0yga6T4U8).


--------------------------------------------------------------------------------
/docs/source/images/deploy-model-results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CODAIT/flight-delay-notebooks/42df0a2ae028fa9c985c2f8fa45fd9569fed6662/docs/source/images/deploy-model-results.png


--------------------------------------------------------------------------------
/docs/source/images/deploy-node-config.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CODAIT/flight-delay-notebooks/42df0a2ae028fa9c985c2f8fa45fd9569fed6662/docs/source/images/deploy-node-config.png


--------------------------------------------------------------------------------
/docs/source/images/flight-delays-pipeline-deploy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CODAIT/flight-delay-notebooks/42df0a2ae028fa9c985c2f8fa45fd9569fed6662/docs/source/images/flight-delays-pipeline-deploy.png


--------------------------------------------------------------------------------
/docs/source/images/flight-delays-pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CODAIT/flight-delay-notebooks/42df0a2ae028fa9c985c2f8fa45fd9569fed6662/docs/source/images/flight-delays-pipeline.png


--------------------------------------------------------------------------------
/docs/source/images/kfp-experiment-deploy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CODAIT/flight-delay-notebooks/42df0a2ae028fa9c985c2f8fa45fd9569fed6662/docs/source/images/kfp-experiment-deploy.png


--------------------------------------------------------------------------------
/docs/source/images/kfp-experiment.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CODAIT/flight-delay-notebooks/42df0a2ae028fa9c985c2f8fa45fd9569fed6662/docs/source/images/kfp-experiment.png


--------------------------------------------------------------------------------
/docs/source/images/object-storage-results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CODAIT/flight-delay-notebooks/42df0a2ae028fa9c985c2f8fa45fd9569fed6662/docs/source/images/object-storage-results.png


--------------------------------------------------------------------------------
/flight-delays-env.yaml:
--------------------------------------------------------------------------------
  1 | name: flight-delays-env
  2 | channels:
  3 |   - defaults
  4 | dependencies:
  5 |   - appnope=0.1.0
  6 |   - argon2-cffi=20.1.0
  7 |   - async_generator=1.10
  8 |   - attrs=20.3.0
  9 |   - backcall=0.2.0
 10 |   - blas=1.0
 11 |   - bleach=3.2.1
 12 |   - brotlipy=0.7.0
 13 |   - ca-certificates=2020.10.14
 14 |   - certifi=2020.6.20
 15 |   - cffi=1.14.3
 16 |   - chardet=3.0.4
 17 |   - cryptography=3.1.1
 18 |   - cycler=0.10.0
 19 |   - decorator=4.4.2
 20 |   - defusedxml=0.6.0
 21 |   - entrypoints=0.3
 22 |   - freetype=2.10.4
 23 |   - idna=2.10
 24 |   - importlib-metadata=2.0.0
 25 |   - importlib_metadata=2.0.0
 26 |   - intel-openmp=2019.4
 27 |   - ipykernel=5.3.4
 28 |   - ipython=7.19.0
 29 |   - ipython_genutils=0.2.0
 30 |   - jedi=0.17.2
 31 |   - jinja2=2.11.2
 32 |   - joblib=0.17.0
 33 |   - jpeg=9b
 34 |   - json5=0.9.5
 35 |   - jsonschema=3.2.0
 36 |   - jupyter_client=6.1.7
 37 |   - jupyter_core=4.6.3
 38 |   - jupyterlab=2.2.6
 39 |   - jupyterlab_pygments=0.1.2
 40 |   - jupyterlab_server=1.2.0
 41 |   - kiwisolver=1.3.0
 42 |   - lcms2=2.11
 43 |   - libcxx=10.0.0
 44 |   - libedit=3.1.20191231
 45 |   - libffi=3.3
 46 |   - libgfortran=3.0.1
 47 |   - libpng=1.6.37
 48 |   - libsodium=1.0.18
 49 |   - libtiff=4.1.0
 50 |   - llvm-openmp=10.0.0
 51 |   - lz4-c=1.9.2
 52 |   - markupsafe=1.1.1
 53 |   - matplotlib=3.3.2
 54 |   - matplotlib-base=3.3.2
 55 |   - mistune=0.8.4
 56 |   - mkl=2019.4
 57 |   - mkl-service=2.3.0
 58 |   - mkl_fft=1.2.0
 59 |   - mkl_random=1.1.1
 60 |   - nbclient=0.5.1
 61 |   - nbformat=5.0.8
 62 |   - ncurses=6.2
 63 |   - nest-asyncio=1.4.2
 64 |   - notebook=6.1.4
 65 |   - numpy=1.19.2
 66 |   - numpy-base=1.19.2
 67 |   - olefile=0.46
 68 |   - openssl=1.1.1h
 69 |   - packaging=20.4
 70 |   - pandas=1.1.3
 71 |   - pandoc=2.11
 72 |   - pandocfilters=1.4.3
 73 |   - parso=0.7.0
 74 |   - pexpect=4.8.0
 75 |   - pickleshare=0.7.5
 76 |   - pillow=8.0.1
 77 |   - pip=20.2.4
 78 |   - prometheus_client=0.8.0
 79 |   - prompt-toolkit=3.0.8
 80 |   - ptyprocess=0.6.0
 81 |   - pycparser=2.20
 82 |   - pygments=2.7.2
 83 |   - pyopenssl=19.1.0
 84 |   - pyparsing=2.4.7
 85 |   - pyrsistent=0.17.3
 86 |   - pysocks=1.7.1
 87 |   - python=3.7.9
 88 |   - python-dateutil=2.8.1
 89 |   - pytz=2020.1
 90 |   - pyzmq=19.0.2
 91 |   - readline=8.0
 92 |   - requests=2.24.0
 93 |   - scikit-learn=0.23.2
 94 |   - scipy=1.5.2
 95 |   - seaborn=0.11.0
 96 |   - send2trash=1.5.0
 97 |   - setuptools=50.3.1
 98 |   - six=1.15.0
 99 |   - sqlite=3.33.0
100 |   - terminado=0.9.1
101 |   - testpath=0.4.4
102 |   - threadpoolctl=2.1.0
103 |   - tk=8.6.10
104 |   - tornado=6.0.4
105 |   - traitlets=5.0.5
106 |   - urllib3=1.25.11
107 |   - wcwidth=0.2.5
108 |   - webencodings=0.5.1
109 |   - wheel=0.35.1
110 |   - xz=5.2.5
111 |   - zeromq=4.3.3
112 |   - zipp=3.4.0
113 |   - zlib=1.2.11
114 |   - zstd=1.4.5
115 |   - pip:
116 |     - ansiwrap==0.8.4
117 |     - appdirs==1.4.4
118 |     - autopep8==1.5.4
119 |     - black==20.8b1
120 |     - bump2version==1.0.1
121 |     - bumpversion==0.6.0
122 |     - cachetools==4.1.1
123 |     - click==7.1.2
124 |     - cloudpickle==1.6.0
125 |     - colorama==0.4.4
126 |     - configparser==5.0.1
127 |     - coverage==5.3
128 |     - deprecated==1.2.10
129 |     - distlib==0.3.1
130 |     - docutils==0.16
131 |     - elyra==1.4.1
132 |     - filelock==3.0.12
133 |     - flake8==3.8.4
134 |     - gitdb==4.0.5
135 |     - gitpython==3.1.11
136 |     - google-api-core==1.23.0
137 |     - google-auth==1.23.0
138 |     - google-cloud-core==1.4.3
139 |     - google-cloud-storage==1.32.0
140 |     - google-crc32c==1.0.0
141 |     - google-resumable-media==1.1.0
142 |     - googleapis-common-protos==1.52.0
143 |     - jupyterlab-git==0.23.1
144 |     - keyring==21.5.0
145 |     - kfp==1.0.0
146 |     - kfp-notebook==0.14.0
147 |     - kfp-server-api==1.1.0a1
148 |     - kubernetes==11.0.0
149 |     - mccabe==0.6.1
150 |     - minio==6.0.0
151 |     - mypy-extensions==0.4.3
152 |     - nbconvert==5.6.1
153 |     - nbdime==2.1.0
154 |     - nbresuse==0.3.6
155 |     - oauthlib==3.1.0
156 |     - papermill==2.2.2
157 |     - pathspec==0.8.1
158 |     - pathtools==0.1.2
159 |     - pkginfo==1.6.1
160 |     - pluggy==1.0.0.dev0
161 |     - protobuf==3.14.0
162 |     - psutil==5.7.3
163 |     - py==1.9.0
164 |     - pyasn1==0.4.8
165 |     - pyasn1-modules==0.2.8
166 |     - pycodestyle==2.6.0
167 |     - pyflakes==2.2.0
168 |     - pyyaml==5.3.1
169 |     - readme-renderer==28.0
170 |     - regex==2020.11.13
171 |     - requests-oauthlib==1.3.0
172 |     - requests-toolbelt==0.9.1
173 |     - rfc3986==1.4.0
174 |     - rfc3986-validator==0.1.1
175 |     - rsa==4.6
176 |     - smmap==3.0.4
177 |     - strip-hints==0.1.9
178 |     - tabulate==0.8.7
179 |     - tenacity==6.2.0
180 |     - textwrap3==0.9.2
181 |     - toml==0.10.2
182 |     - tox==3.20.1
183 |     - tqdm==4.51.0
184 |     - twine==3.2.0
185 |     - typed-ast==1.4.1
186 |     - typing-extensions==3.7.4.3
187 |     - virtualenv==20.1.0
188 |     - watchdog==0.10.3
189 |     - websocket-client==0.57.0
190 |     - wrapt==1.12.1
191 | prefix: /Users/nick/miniconda3/envs/flight-delays-env
192 | 


--------------------------------------------------------------------------------
/kfserving.md:
--------------------------------------------------------------------------------
  1 | # Model deployment using KFServing
  2 | 
  3 | The `pipelines` folder contains a pipeline - `flight_delays_with_deployment.pipeline` - that encompasses deploying the trained flight prediction model as a service running in [KFServing](https://www.kubeflow.org/docs/components/serving/kfserving/).
  4 | 
  5 | ![Deployment pipeline](docs/source/images/flight-delays-pipeline-deploy.png)
  6 | 
  7 | In order to run this version of the pipeline, you will need to setup KFServing.
  8 | 
  9 | **Note** this example uses the built-in `minio` object storage service within Kubeflow Pipelines as the storage location for deploying a model to KFServing. Hence, KFP is required unless you manually setup up minio, or use S3.
 10 | 
 11 | Once KFServing is set up, you can run the pipeline locally or using the KFP runtime, in the same way as the pipeline that excludes the model deployment step.
 12 | 
 13 | ### Configuring a local KFServing runtime
 14 | 
 15 | Follow these steps to configure your KFServing runtime:
 16 | 
 17 | #### Install KFServing
 18 | 
 19 | Install KFServing locally on an existing Kubernetes installation, using [these instructions](https://github.com/kubeflow/kfserving/tree/cd53eb10fc6cf52edb9e6623238ed9aa9fe5af72#install-kfserving-in-5-minutes-on-your-local-machine). You may optionally also have Kubeflow Pipelines installed (see the [main README instructions](../README.md#configuring-a-local-kubeflow-pipeline-runtime)).
 20 | 
 21 | #### Set up access to object storage
 22 | 
 23 | Once installed and running, you will need to set up a `Secret` and `ServiceAccount` to allow KFServing to access the object storage bucket for the model (refer to [these instructions](https://github.com/kubeflow/kfserving/tree/master/docs/samples/s3#create-s3-secret-and-attach-to-service-account)).
 24 | 
 25 | **Note** we use the `kubeflow` namespace, since the model deployment node within a KFP runtime is not able to create resources in another namespace.
 26 | 
 27 | Run the following command on the command line:
 28 | 
 29 | ```console
 30 | cat <<EOF | kubectl apply -f -
 31 | apiVersion: v1
 32 | stringData:
 33 |   AWS_ACCESS_KEY_ID: minio
 34 |   AWS_SECRET_ACCESS_KEY: minio123
 35 | kind: Secret
 36 | type: Opaque
 37 | metadata:
 38 |   annotations:
 39 |     serving.kubeflow.org/s3-endpoint: minio-service.kubeflow:9000
 40 |     serving.kubeflow.org/s3-usehttps: "0"
 41 |   name: kfserving-secret
 42 |   namespace: kubeflow
 43 | ---
 44 | apiVersion: v1
 45 | kind: ServiceAccount
 46 | metadata:
 47 |   name: kfserving-sa
 48 |   namespace: kubeflow
 49 | secrets:
 50 | - name: kfserving-secret
 51 | EOF
 52 | ```
 53 | 
 54 | #### Use a custom image for model server
 55 | 
 56 | By default, KFServing's model server for scikit-learn models - the `sklearnserver` - uses an image that runs sklearn version `0.20`. So, we need to use a custom image that runs sklearn `0.23`.
 57 | 
 58 | To do this, we follow the instructions [here](https://github.com/kubeflow/kfserving/tree/master/docs/samples/sklearn#run-sklearn-inferenceservice-with-your-own-image). Once KFServing is running, edit the `configmap`:
 59 | ```console
 60 | kubectl edit cm -n kfserving-system inferenceservice-config
 61 | ```
 62 | 
 63 | This will open the default editor with the configmap file. Change the entry for `sklearn` under `predictors` to:
 64 | 
 65 | ```
 66 | "sklearn": {
 67 |   "image": "docker.io/mlnick/sklearnserver",
 68 |   "defaultImageVersion": "latest"
 69 | },
 70 | ```
 71 | 
 72 | Save and exit the editor.
 73 | 
 74 | #### Update the pipeline-runner role
 75 | 
 76 | The Kubeflow pipeline-runner service account role must be updated so that it can create KFServing `inferenceservices`.
 77 | 
 78 | Edit the `role`:
 79 | 
 80 | ```console
 81 | kubectl edit role -n kubeflow pipeline-runner
 82 | ```
 83 | 
 84 | This will open the default editor with the role file. Add the following entry at the end of the file (after the last `apiGroups` entry):
 85 | 
 86 | ```
 87 | - apiGroups:
 88 |   - serving.kubeflow.org
 89 |   resources:
 90 |   - inferenceservices
 91 |   verbs:
 92 |   - '*'
 93 | ```
 94 | 
 95 | Save and exit the editor.
 96 | 
 97 | ### Run the pipeline
 98 | 
 99 | You can now run the `flight_delays_with_deployment.pipeline` either locally or with the KFP runtime.
100 | 
101 | #### Running locally
102 | 
103 | Ensure that you set up port-forwarding for the Istio ingress gateway so that your deployment notebook can reach the deployed inference service. Run:
104 | 
105 | ```console
106 | kubectl port-forward -n istio-system svc/istio-ingressgateway 8080:80
107 | ```
108 | 
109 | #### Running in Kubeflow Pipelines
110 | 
111 | Sending requests to the inference service uses a different url and request structure, depending on whether you're using the ingress gateway or sending the request within the cluster (which will be the case if executing on a KFP runtime). In this case, set the Elyra node environment variable `MODEL_MODE=kfp`, in the node `Properties` dialog:
112 | 
113 | ![Deployment node config](docs/source/images/deploy-node-config.png)
114 | 
115 | Once completed, you should be able to view the pipeline in the Kubeflow Pipelines UI:
116 | 
117 | ![Pipeline experiment run](docs/source/images/kfp-experiment-deploy.png)
118 | 
119 | You can also see the deployment notebook results by downloading the `html` file from object storage.
120 | 
121 | ![Deployment results](docs/source/images/deploy-model-results.png)


--------------------------------------------------------------------------------
/notebooks/analyze_flight_delays.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "papermill": {
  7 |      "duration": 0.04815,
  8 |      "end_time": "2020-11-18T10:42:47.562722",
  9 |      "exception": false,
 10 |      "start_time": "2020-11-18T10:42:47.514572",
 11 |      "status": "completed"
 12 |     },
 13 |     "tags": []
 14 |    },
 15 |    "source": [
 16 |     "# Analyzing Airline Delay Data\n",
 17 |     "\n",
 18 |     "In this notebook, we show some illustrative analysis of our flight delay dataset, exploring whether certain features in the dataset have any impact on flights being delayed.\n",
 19 |     "\n",
 20 |     "**Note** the full flight delay dataset is very large (over 80GB uncompressed), so we are working with a smaller sample dataset. Hence any results may not be a true reflection of the results on the full dataset.\n",
 21 |     "\n",
 22 |     "#### Import required modules\n",
 23 |     "\n",
 24 |     "Import and configure the required modules."
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "code",
 29 |    "execution_count": null,
 30 |    "metadata": {
 31 |     "papermill": {
 32 |      "duration": 3.604206,
 33 |      "end_time": "2020-11-18T10:42:51.232409",
 34 |      "exception": false,
 35 |      "start_time": "2020-11-18T10:42:47.628203",
 36 |      "status": "completed"
 37 |     },
 38 |     "tags": []
 39 |    },
 40 |    "outputs": [],
 41 |    "source": [
 42 |     "!pip install seaborn > /dev/null 2>&1"
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "code",
 47 |    "execution_count": null,
 48 |    "metadata": {
 49 |     "papermill": {
 50 |      "duration": 2.313344,
 51 |      "end_time": "2020-11-18T10:42:53.586078",
 52 |      "exception": false,
 53 |      "start_time": "2020-11-18T10:42:51.272734",
 54 |      "status": "completed"
 55 |     },
 56 |     "tags": []
 57 |    },
 58 |    "outputs": [],
 59 |    "source": [
 60 |     "# Define required imports\n",
 61 |     "import pandas as pd\n",
 62 |     "import numpy as np\n",
 63 |     "import seaborn as sns\n",
 64 |     "import matplotlib.pyplot as plt\n",
 65 |     "sns.set_theme(style='darkgrid', palette='deep')\n",
 66 |     "# These set pandas max column and row display in the notebook\n",
 67 |     "pd.set_option('display.max_columns', 50)\n",
 68 |     "pd.set_option('display.max_rows', 50)"
 69 |    ]
 70 |   },
 71 |   {
 72 |    "cell_type": "markdown",
 73 |    "metadata": {
 74 |     "papermill": {
 75 |      "duration": 0.042908,
 76 |      "end_time": "2020-11-18T10:42:53.674690",
 77 |      "exception": false,
 78 |      "start_time": "2020-11-18T10:42:53.631782",
 79 |      "status": "completed"
 80 |     },
 81 |     "tags": []
 82 |    },
 83 |    "source": [
 84 |     "### Read the data\n",
 85 |     "\n",
 86 |     "We start by reading in the merged flight delay and weather data"
 87 |    ]
 88 |   },
 89 |   {
 90 |    "cell_type": "code",
 91 |    "execution_count": null,
 92 |    "metadata": {
 93 |     "papermill": {
 94 |      "duration": 0.138328,
 95 |      "end_time": "2020-11-18T10:42:53.857510",
 96 |      "exception": false,
 97 |      "start_time": "2020-11-18T10:42:53.719182",
 98 |      "status": "completed"
 99 |     },
100 |     "tags": []
101 |    },
102 |    "outputs": [],
103 |    "source": [
104 |     "flight_path = 'data/jfk_flight_weather_features.csv'\n",
105 |     "flight_data = pd.read_csv(flight_path, parse_dates=['flight_date'])\n",
106 |     "flight_data.head()"
107 |    ]
108 |   },
109 |   {
110 |    "cell_type": "markdown",
111 |    "metadata": {
112 |     "papermill": {
113 |      "duration": 0.024233,
114 |      "end_time": "2020-11-18T10:42:53.906192",
115 |      "exception": false,
116 |      "start_time": "2020-11-18T10:42:53.881959",
117 |      "status": "completed"
118 |     },
119 |     "tags": []
120 |    },
121 |    "source": [
122 |     "### Analyze the data\n",
123 |     "\n",
124 |     "Now we will analyze the data to see if we can gain insight into flight delays.\n",
125 |     "\n",
126 |     "Let's start by looking at the overall proportion of flights that are delayed."
127 |    ]
128 |   },
129 |   {
130 |    "cell_type": "code",
131 |    "execution_count": null,
132 |    "metadata": {
133 |     "papermill": {
134 |      "duration": 0.21334,
135 |      "end_time": "2020-11-18T10:42:54.143379",
136 |      "exception": false,
137 |      "start_time": "2020-11-18T10:42:53.930039",
138 |      "status": "completed"
139 |     },
140 |     "tags": []
141 |    },
142 |    "outputs": [],
143 |    "source": [
144 |     "vc = flight_data['delayed'].value_counts()\n",
145 |     "perc = vc / sum(vc)\n",
146 |     "print('On-time: {:.2f}%'.format(perc[0] * 100))\n",
147 |     "print('Delayed: {:.2f}%'.format(perc[1] * 100))\n",
148 |     "plt.figure(figsize=(8, 6))\n",
149 |     "chart = sns.countplot(data=flight_data, x='delayed')\n",
150 |     "chart.set_xticklabels(['On-time', 'Delayed'])\n",
151 |     "chart.set_xlabel('Flight status')\n",
152 |     "plt.show()"
153 |    ]
154 |   },
155 |   {
156 |    "cell_type": "markdown",
157 |    "metadata": {
158 |     "papermill": {
159 |      "duration": 0.030623,
160 |      "end_time": "2020-11-18T10:42:54.195654",
161 |      "exception": false,
162 |      "start_time": "2020-11-18T10:42:54.165031",
163 |      "status": "completed"
164 |     },
165 |     "tags": []
166 |    },
167 |    "source": [
168 |     "We see 80% of flights are on-time. Still, a fairly high proportion of 20% of flights are delayed - recall delayed here means more than 15 minutes late!"
169 |    ]
170 |   },
171 |   {
172 |    "cell_type": "markdown",
173 |    "metadata": {
174 |     "papermill": {
175 |      "duration": 0.04472,
176 |      "end_time": "2020-11-18T10:42:54.270571",
177 |      "exception": false,
178 |      "start_time": "2020-11-18T10:42:54.225851",
179 |      "status": "completed"
180 |     },
181 |     "tags": []
182 |    },
183 |    "source": [
184 |     "#### Analyze and visualize flight delay durations\n",
185 |     "Next, we will plot the flight delay (in minutes) over time."
186 |    ]
187 |   },
188 |   {
189 |    "cell_type": "code",
190 |    "execution_count": null,
191 |    "metadata": {
192 |     "papermill": {
193 |      "duration": 0.686545,
194 |      "end_time": "2020-11-18T10:42:55.005326",
195 |      "exception": false,
196 |      "start_time": "2020-11-18T10:42:54.318781",
197 |      "status": "completed"
198 |     },
199 |     "tags": []
200 |    },
201 |    "outputs": [],
202 |    "source": [
203 |     "plt.figure(figsize=(16, 6))\n",
204 |     "chart = sns.scatterplot(x='flight_date', y='dep_delay', data=flight_data)\n",
205 |     "plt.show()"
206 |    ]
207 |   },
208 |   {
209 |    "cell_type": "markdown",
210 |    "metadata": {
211 |     "papermill": {
212 |      "duration": 0.036333,
213 |      "end_time": "2020-11-18T10:42:55.080494",
214 |      "exception": false,
215 |      "start_time": "2020-11-18T10:42:55.044161",
216 |      "status": "completed"
217 |     },
218 |     "tags": []
219 |    },
220 |    "source": [
221 |     "There doesn't appear to be any obvious relationship. It is worth noting that most flight delay lengths are very low (clustered around zero), with a relatively small number of very large values (i.e. _outliers_). This may tend to skew analysis based on, for example, analyzing the _average_ flight delay duration. This also consistent with our proportion analysis above.\n",
222 |     "\n",
223 |     "Let's look at whether flight delays are impacted by the day of the week:"
224 |    ]
225 |   },
226 |   {
227 |    "cell_type": "code",
228 |    "execution_count": null,
229 |    "metadata": {
230 |     "papermill": {
231 |      "duration": 1.367749,
232 |      "end_time": "2020-11-18T10:42:56.495574",
233 |      "exception": false,
234 |      "start_time": "2020-11-18T10:42:55.127825",
235 |      "status": "completed"
236 |     },
237 |     "tags": []
238 |    },
239 |    "outputs": [],
240 |    "source": [
241 |     "plt.figure(figsize=(16, 6))\n",
242 |     "days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']\n",
243 |     "chart = sns.barplot(x='day_of_week', y='dep_delay', data=flight_data)\n",
244 |     "chart.set_xticklabels(days)\n",
245 |     "chart.set_xlabel('Day of Week')\n",
246 |     "chart.set_ylabel('Departure Delay (min)')\n",
247 |     "chart.set_title('Distribution of departure delay by day of week')\n",
248 |     "plt.show()"
249 |    ]
250 |   },
251 |   {
252 |    "cell_type": "markdown",
253 |    "metadata": {
254 |     "papermill": {
255 |      "duration": 0.035542,
256 |      "end_time": "2020-11-18T10:42:56.574443",
257 |      "exception": false,
258 |      "start_time": "2020-11-18T10:42:56.538901",
259 |      "status": "completed"
260 |     },
261 |     "tags": []
262 |    },
263 |    "source": [
264 |     "This chart shows the average and confidence interval (standard deviation) for flight delays, grouped by day of week. It appears from the chart that Monday, Friday and Sunday are the worst days to fly, with respect to the average flight delay. Perhaps this is due to a larger volume of flights on those days? We can in fact check this by plotting the total number of flights per weekday in the dataset."
265 |    ]
266 |   },
267 |   {
268 |    "cell_type": "code",
269 |    "execution_count": null,
270 |    "metadata": {
271 |     "papermill": {
272 |      "duration": 0.382649,
273 |      "end_time": "2020-11-18T10:42:57.008464",
274 |      "exception": false,
275 |      "start_time": "2020-11-18T10:42:56.625815",
276 |      "status": "completed"
277 |     },
278 |     "tags": []
279 |    },
280 |    "outputs": [],
281 |    "source": [
282 |     "plt.figure(figsize=(16, 6))\n",
283 |     "chart = sns.countplot(x='day_of_week', data=flight_data)\n",
284 |     "chart.set_xticklabels(days)\n",
285 |     "chart.set_xlabel('Day of Week')\n",
286 |     "chart.set_ylabel('Number of flights')\n",
287 |     "chart.set_title('Flights by day of week')\n",
288 |     "plt.show()"
289 |    ]
290 |   },
291 |   {
292 |    "cell_type": "markdown",
293 |    "metadata": {
294 |     "papermill": {
295 |      "duration": 0.031438,
296 |      "end_time": "2020-11-18T10:42:57.070642",
297 |      "exception": false,
298 |      "start_time": "2020-11-18T10:42:57.039204",
299 |      "status": "completed"
300 |     },
301 |     "tags": []
302 |    },
303 |    "source": [
304 |     "There doesn't appear to be an obvious correlation between volume of flights and which days experience larger flight delays.\n",
305 |     "\n",
306 |     "**Note** however, that we are not taking into account volumes of arriving flights in this analysis, which may have an impact!\n",
307 |     "\n",
308 |     "Recall that the flight delay data appeared to have many outlier values. This means the distribution of flight delays is very skewed. We should take a look at a view that takes this into account:"
309 |    ]
310 |   },
311 |   {
312 |    "cell_type": "code",
313 |    "execution_count": null,
314 |    "metadata": {
315 |     "papermill": {
316 |      "duration": 0.582665,
317 |      "end_time": "2020-11-18T10:42:57.683888",
318 |      "exception": false,
319 |      "start_time": "2020-11-18T10:42:57.101223",
320 |      "status": "completed"
321 |     },
322 |     "tags": []
323 |    },
324 |    "outputs": [],
325 |    "source": [
326 |     "plt.figure(figsize=(16, 6))\n",
327 |     "days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']\n",
328 |     "chart = sns.boxenplot(x='day_of_week', y='dep_delay', data=flight_data)\n",
329 |     "chart.set_xticklabels(days)\n",
330 |     "chart.set_xlabel('Day of Week')\n",
331 |     "chart.set_ylabel('Departure Delay (min)')\n",
332 |     "chart.set_title('Distribution of departure delay by day of week')\n",
333 |     "plt.show()"
334 |    ]
335 |   },
336 |   {
337 |    "cell_type": "markdown",
338 |    "metadata": {
339 |     "papermill": {
340 |      "duration": 0.059778,
341 |      "end_time": "2020-11-18T10:42:57.800830",
342 |      "exception": false,
343 |      "start_time": "2020-11-18T10:42:57.741052",
344 |      "status": "completed"
345 |     },
346 |     "tags": []
347 |    },
348 |    "source": [
349 |     "The above chart shows a more detailed distribution of the flight delay for each weekday. This shows that Monday and Friday definitey have some extremely large outlier values that play a role in their higher average flight delays.\n",
350 |     "\n",
351 |     "We can also see that Friday and Sunday have \"wider\" and \"higher\" blocks at moderately higher flight delay levels. This contributes to the higher average flight delays and tells us that outliers alone are not fully to blame for the higher average delays on these days.\n",
352 |     "\n",
353 |     "It is usually wise to dig a bit deeper when visualizing skewed or imbalanced datasets. "
354 |    ]
355 |   },
356 |   {
357 |    "cell_type": "markdown",
358 |    "metadata": {
359 |     "papermill": {
360 |      "duration": 0.04439,
361 |      "end_time": "2020-11-18T10:42:57.890880",
362 |      "exception": false,
363 |      "start_time": "2020-11-18T10:42:57.846490",
364 |      "status": "completed"
365 |     },
366 |     "tags": []
367 |    },
368 |    "source": [
369 |     "#### Analyze and visualize flight delay proportions for flight features\n",
370 |     "Next, we will analyze the proportion of flights that are delayed, for given sets of features in our dataset related to the **flight** itself. Since we wish to build a classifier, this analysis can help us to understand which features may be indicative of greater probability of a flight delay and which features have little impact.\n",
371 |     "\n",
372 |     "First, we will define a convenience function to create our stacked proportion charts:"
373 |    ]
374 |   },
375 |   {
376 |    "cell_type": "code",
377 |    "execution_count": null,
378 |    "metadata": {
379 |     "papermill": {
380 |      "duration": 0.072936,
381 |      "end_time": "2020-11-18T10:42:58.043088",
382 |      "exception": false,
383 |      "start_time": "2020-11-18T10:42:57.970152",
384 |      "status": "completed"
385 |     },
386 |     "tags": []
387 |    },
388 |    "outputs": [],
389 |    "source": [
390 |     "def plot_stacked_by_col(col, x_label, rotation=0, horizontalalignment='center', xticks=None):\n",
391 |     "    grouped = flight_data['delayed'].groupby(flight_data[col]).value_counts()\n",
392 |     "    g = grouped.groupby(level=0).apply(lambda x: 100 * x / float(x.sum()))\n",
393 |     "\n",
394 |     "    chart = g.unstack().plot(kind='bar', stacked=True, figsize=(16, 6))\n",
395 |     "    chart.set_xticklabels(\n",
396 |     "        xticks if xticks else chart.get_xticklabels(),\n",
397 |     "        rotation=rotation, \n",
398 |     "        horizontalalignment=horizontalalignment,\n",
399 |     "        fontweight='light',\n",
400 |     "        fontsize='medium'\n",
401 |     "    )\n",
402 |     "    chart.set_xlabel(x_label)\n",
403 |     "    chart.set_ylabel('Proportion delayed')\n",
404 |     "    chart.set_title('Proportion of flights delayed, by {}'.format(x_label))\n",
405 |     "    plt.show()"
406 |    ]
407 |   },
408 |   {
409 |    "cell_type": "markdown",
410 |    "metadata": {
411 |     "papermill": {
412 |      "duration": 0.0663,
413 |      "end_time": "2020-11-18T10:42:58.160817",
414 |      "exception": false,
415 |      "start_time": "2020-11-18T10:42:58.094517",
416 |      "status": "completed"
417 |     },
418 |     "tags": []
419 |    },
420 |    "source": [
421 |     "Let's start by analyzing proportion of flights delayed by weekday, continuing the theme of our analsis above."
422 |    ]
423 |   },
424 |   {
425 |    "cell_type": "code",
426 |    "execution_count": null,
427 |    "metadata": {
428 |     "papermill": {
429 |      "duration": 0.716892,
430 |      "end_time": "2020-11-18T10:42:58.944322",
431 |      "exception": false,
432 |      "start_time": "2020-11-18T10:42:58.227430",
433 |      "status": "completed"
434 |     },
435 |     "tags": []
436 |    },
437 |    "outputs": [],
438 |    "source": [
439 |     "plot_stacked_by_col('day_of_week', 'Day of Week', xticks=days)"
440 |    ]
441 |   },
442 |   {
443 |    "cell_type": "markdown",
444 |    "metadata": {
445 |     "papermill": {
446 |      "duration": 0.041598,
447 |      "end_time": "2020-11-18T10:42:59.039796",
448 |      "exception": false,
449 |      "start_time": "2020-11-18T10:42:58.998198",
450 |      "status": "completed"
451 |     },
452 |     "tags": []
453 |    },
454 |    "source": [
455 |     "This chart roughly matches the delay duration charts above, with Monday, Friday and Sunday having the highest proportion of delayed flights, while Tuesday and Wednesday are the \"best\" days. This indicates that day of the week may be at least somewhat useful for predicting flight delays.\n",
456 |     "\n",
457 |     "Next, we plot the proportions by departure time (where departure times are grouped into hourly buckets, with the exception of a larger bucket for \"early morning flights\")."
458 |    ]
459 |   },
460 |   {
461 |    "cell_type": "code",
462 |    "execution_count": null,
463 |    "metadata": {
464 |     "papermill": {
465 |      "duration": 1.05987,
466 |      "end_time": "2020-11-18T10:43:00.138704",
467 |      "exception": false,
468 |      "start_time": "2020-11-18T10:42:59.078834",
469 |      "status": "completed"
470 |     },
471 |     "tags": []
472 |    },
473 |    "outputs": [],
474 |    "source": [
475 |     "plot_stacked_by_col('dep_time_bin', 'Departure Time Bucket', rotation=45, horizontalalignment='right')"
476 |    ]
477 |   },
478 |   {
479 |    "cell_type": "markdown",
480 |    "metadata": {
481 |     "papermill": {
482 |      "duration": 0.042265,
483 |      "end_time": "2020-11-18T10:43:00.211480",
484 |      "exception": false,
485 |      "start_time": "2020-11-18T10:43:00.169215",
486 |      "status": "completed"
487 |     },
488 |     "tags": []
489 |    },
490 |    "source": [
491 |     "It seems clear that flights later in the day have a generally higher chance of being delayed, relative to flights in the morning (and especially early morning). Perhaps this is related to flight volumes - are flight volumes lower in the early morning?\n",
492 |     "\n",
493 |     "Again, we can check this by plotting the number of flights per departure time bucket."
494 |    ]
495 |   },
496 |   {
497 |    "cell_type": "code",
498 |    "execution_count": null,
499 |    "metadata": {
500 |     "papermill": {
501 |      "duration": 0.552196,
502 |      "end_time": "2020-11-18T10:43:00.795416",
503 |      "exception": false,
504 |      "start_time": "2020-11-18T10:43:00.243220",
505 |      "status": "completed"
506 |     },
507 |     "tags": []
508 |    },
509 |    "outputs": [],
510 |    "source": [
511 |     "plt.figure(figsize=(16, 6))\n",
512 |     "chart = sns.countplot(\n",
513 |     "    x='dep_time_bin',\n",
514 |     "    data=flight_data,\n",
515 |     "    order=flight_data.groupby(flight_data['dep_time_bin']).groups.keys())\n",
516 |     "chart.set_xticklabels(\n",
517 |     "    chart.get_xticklabels(),\n",
518 |     "    rotation=45, \n",
519 |     "    horizontalalignment='right',\n",
520 |     "    fontweight='light',\n",
521 |     "    fontsize='medium'\n",
522 |     ")\n",
523 |     "chart.set_xlabel('Departure Time Bucket')\n",
524 |     "chart.set_ylabel('Count')\n",
525 |     "chart.set_title('Flights by Departure Time Bucket')\n",
526 |     "plt.show()"
527 |    ]
528 |   },
529 |   {
530 |    "cell_type": "markdown",
531 |    "metadata": {
532 |     "papermill": {
533 |      "duration": 0.048324,
534 |      "end_time": "2020-11-18T10:43:00.878107",
535 |      "exception": false,
536 |      "start_time": "2020-11-18T10:43:00.829783",
537 |      "status": "completed"
538 |     },
539 |     "tags": []
540 |    },
541 |    "source": [
542 |     "While there are definitely relatively fewer very early flights, there are more flights in the early morning, and these are less likely to be delayed than afternoon flights, depsite flight volumes being similar between the two groups. Also, there are relatively lower volumes of late night flights, while these are relatively more likely to be delayed. So, flight volumes don't seem to play much of a role.\n",
543 |     "\n",
544 |     "**Note** however, that we are not taking into account volumes of arriving flights in this analysis, which may have an impact!\n",
545 |     "\n",
546 |     "Next, let's see if a particular airline's flights are more likely to be delayed."
547 |    ]
548 |   },
549 |   {
550 |    "cell_type": "code",
551 |    "execution_count": null,
552 |    "metadata": {
553 |     "papermill": {
554 |      "duration": 0.663351,
555 |      "end_time": "2020-11-18T10:43:01.579032",
556 |      "exception": false,
557 |      "start_time": "2020-11-18T10:43:00.915681",
558 |      "status": "completed"
559 |     },
560 |     "tags": []
561 |    },
562 |    "outputs": [],
563 |    "source": [
564 |     "plot_stacked_by_col('airline_name', 'Airline', rotation=45, horizontalalignment='right')"
565 |    ]
566 |   },
567 |   {
568 |    "cell_type": "markdown",
569 |    "metadata": {
570 |     "papermill": {
571 |      "duration": 0.075645,
572 |      "end_time": "2020-11-18T10:43:01.736584",
573 |      "exception": false,
574 |      "start_time": "2020-11-18T10:43:01.660939",
575 |      "status": "completed"
576 |     },
577 |     "tags": []
578 |    },
579 |    "source": [
580 |     "It seems like the airline does have some impact on delay proportion (note volumes for some smaller airlines may be quite low due to sampling). How about flight destination?"
581 |    ]
582 |   },
583 |   {
584 |    "cell_type": "code",
585 |    "execution_count": null,
586 |    "metadata": {
587 |     "papermill": {
588 |      "duration": 4.987105,
589 |      "end_time": "2020-11-18T10:43:06.793705",
590 |      "exception": false,
591 |      "start_time": "2020-11-18T10:43:01.806600",
592 |      "status": "completed"
593 |     },
594 |     "tags": []
595 |    },
596 |    "outputs": [],
597 |    "source": [
598 |     "plot_stacked_by_col('dest', 'Destination Airport', rotation=65, horizontalalignment='right')"
599 |    ]
600 |   },
601 |   {
602 |    "cell_type": "markdown",
603 |    "metadata": {
604 |     "papermill": {
605 |      "duration": 0.054271,
606 |      "end_time": "2020-11-18T10:43:06.893444",
607 |      "exception": false,
608 |      "start_time": "2020-11-18T10:43:06.839173",
609 |      "status": "completed"
610 |     },
611 |     "tags": []
612 |    },
613 |    "source": [
614 |     "Again, it appears there is a relationship between proportion of flights delayed and the flight destination (the same caveats with respect to sampled data as mentioned above, would apply here).\n",
615 |     "\n",
616 |     "Finally, what about flight distance?"
617 |    ]
618 |   },
619 |   {
620 |    "cell_type": "code",
621 |    "execution_count": null,
622 |    "metadata": {
623 |     "papermill": {
624 |      "duration": 0.705268,
625 |      "end_time": "2020-11-18T10:43:07.647698",
626 |      "exception": false,
627 |      "start_time": "2020-11-18T10:43:06.942430",
628 |      "status": "completed"
629 |     },
630 |     "tags": []
631 |    },
632 |    "outputs": [],
633 |    "source": [
634 |     "plot_stacked_by_col('distance_bin', 'Distance Bin', rotation=0, horizontalalignment='center')"
635 |    ]
636 |   },
637 |   {
638 |    "cell_type": "markdown",
639 |    "metadata": {
640 |     "papermill": {
641 |      "duration": 0.043425,
642 |      "end_time": "2020-11-18T10:43:07.737420",
643 |      "exception": false,
644 |      "start_time": "2020-11-18T10:43:07.693995",
645 |      "status": "completed"
646 |     },
647 |     "tags": []
648 |    },
649 |    "source": [
650 |     "It seems like there may be some relationship, though it's not particlarly clear - shorter flights and longer flights tend to have roughly similar proportions."
651 |    ]
652 |   },
653 |   {
654 |    "cell_type": "markdown",
655 |    "metadata": {
656 |     "papermill": {
657 |      "duration": 0.039239,
658 |      "end_time": "2020-11-18T10:43:07.828689",
659 |      "exception": false,
660 |      "start_time": "2020-11-18T10:43:07.789450",
661 |      "status": "completed"
662 |     },
663 |     "tags": []
664 |    },
665 |    "source": [
666 |     "#### Analyze and visualize flight delay proportions for weather features\n",
667 |     "Now, we will analyze the proportion of flights that are delayed, for given sets of **weather** features in our dataset."
668 |    ]
669 |   },
670 |   {
671 |    "cell_type": "code",
672 |    "execution_count": null,
673 |    "metadata": {
674 |     "papermill": {
675 |      "duration": 0.852436,
676 |      "end_time": "2020-11-18T10:43:08.721519",
677 |      "exception": false,
678 |      "start_time": "2020-11-18T10:43:07.869083",
679 |      "status": "completed"
680 |     },
681 |     "tags": []
682 |    },
683 |    "outputs": [],
684 |    "source": [
685 |     "# create sub-plots for a few weather conditions\n",
686 |     "\n",
687 |     "ax = plt.subplot(221)\n",
688 |     "grouped = flight_data['delayed'].groupby(flight_data['drizzle']).value_counts()\n",
689 |     "g = grouped.groupby(level=0).apply(lambda x: 100 * x / float(x.sum()))\n",
690 |     "chart = g.unstack().plot(kind='bar', stacked=True, figsize=(18, 10), ax=ax)\n",
691 |     "chart.set_xticklabels(['No', 'Yes'], rotation=0)\n",
692 |     "chart.set_xlabel('Drizzle')\n",
693 |     "chart.set_ylabel('Proportion delayed')\n",
694 |     "\n",
695 |     "ax = plt.subplot(222)\n",
696 |     "grouped = flight_data['delayed'].groupby(flight_data['mist']).value_counts()\n",
697 |     "g = grouped.groupby(level=0).apply(lambda x: 100 * x / float(x.sum()))\n",
698 |     "chart = g.unstack().plot(kind='bar', stacked=True, figsize=(18, 10), ax=ax)\n",
699 |     "chart.set_xticklabels(['No', 'Yes'], rotation=0)\n",
700 |     "chart.set_xlabel('Mist')\n",
701 |     "chart.set_ylabel('Proportion delayed')\n",
702 |     "\n",
703 |     "ax = plt.subplot(223)\n",
704 |     "grouped = flight_data['delayed'].groupby(flight_data['snow']).value_counts()\n",
705 |     "g = grouped.groupby(level=0).apply(lambda x: 100 * x / float(x.sum()))\n",
706 |     "chart = g.unstack().plot(kind='bar', stacked=True, figsize=(18, 10), ax=ax)\n",
707 |     "chart.set_xticklabels(['No', 'Yes'], rotation=0)\n",
708 |     "chart.set_xlabel('Snow')\n",
709 |     "chart.set_ylabel('Proportion delayed')\n",
710 |     "\n",
711 |     "ax = plt.subplot(224)\n",
712 |     "grouped = flight_data['delayed'].groupby(flight_data['thunderstorm']).value_counts()\n",
713 |     "g = grouped.groupby(level=0).apply(lambda x: 100 * x / float(x.sum()))\n",
714 |     "chart = g.unstack().plot(kind='bar', stacked=True, figsize=(18, 10), ax=ax)\n",
715 |     "chart.set_xticklabels(['No', 'Yes'], rotation=0)\n",
716 |     "chart.set_xlabel('Thunderstorm')\n",
717 |     "chart.set_ylabel('Proportion delayed')\n",
718 |     "\n",
719 |     "plt.show()"
720 |    ]
721 |   },
722 |   {
723 |    "cell_type": "markdown",
724 |    "metadata": {
725 |     "papermill": {
726 |      "duration": 0.04203,
727 |      "end_time": "2020-11-18T10:43:08.797469",
728 |      "exception": false,
729 |      "start_time": "2020-11-18T10:43:08.755439",
730 |      "status": "completed"
731 |     },
732 |     "tags": []
733 |    },
734 |    "source": [
735 |     "From these charts, it appears that the presence of \"drizzle\" does not impact on whether a flight is likely to be delayed - as we might expect. However, if there is snow or a thunderstorm, for example, it appears flight delays are much more likely.\n",
736 |     "\n",
737 |     "We have touched on only a little of the analysis of weather features that could be performed. For example, one could explore more of the weather conditions similarly to the cell above; or investigate the potential relationship between features such as `visibility`, `wind_speed` and `precip` to both proportions of flights delayed as well as duration of flight delays. "
738 |    ]
739 |   },
740 |   {
741 |    "cell_type": "markdown",
742 |    "metadata": {
743 |     "papermill": {
744 |      "duration": 0.051397,
745 |      "end_time": "2020-11-18T10:43:08.896930",
746 |      "exception": false,
747 |      "start_time": "2020-11-18T10:43:08.845533",
748 |      "status": "completed"
749 |     },
750 |     "tags": []
751 |    },
752 |    "source": [
753 |     "<a id=\"authors\"></a> \n",
754 |     "### Authors"
755 |    ]
756 |   },
757 |   {
758 |    "cell_type": "markdown",
759 |    "metadata": {
760 |     "papermill": {
761 |      "duration": 0.073045,
762 |      "end_time": "2020-11-18T10:43:09.014783",
763 |      "exception": false,
764 |      "start_time": "2020-11-18T10:43:08.941738",
765 |      "status": "completed"
766 |     },
767 |     "tags": []
768 |    },
769 |    "source": [
770 |     "This notebook was created by the [Center for Open-Source Data & AI Technologies](http://codait.org).\n",
771 |     "\n",
772 |     "Copyright © 2019 IBM. This notebook and its source code are released under the terms of the MIT License."
773 |    ]
774 |   }
775 |  ],
776 |  "metadata": {
777 |   "kernelspec": {
778 |    "display_name": "Python 3",
779 |    "language": "python",
780 |    "name": "python3"
781 |   },
782 |   "language_info": {
783 |    "codemirror_mode": {
784 |     "name": "ipython",
785 |     "version": 3
786 |    },
787 |    "file_extension": ".py",
788 |    "mimetype": "text/x-python",
789 |    "name": "python",
790 |    "nbconvert_exporter": "python",
791 |    "pygments_lexer": "ipython3",
792 |    "version": "3.7.9"
793 |   },
794 |   "papermill": {
795 |    "duration": 23.694453,
796 |    "end_time": "2020-11-18T10:43:09.616322",
797 |    "environment_variables": {},
798 |    "exception": null,
799 |    "input_path": "/Users/nick/workspace/python/flight-delay-notebooks/notebooks/analyze_flight_delays.ipynb",
800 |    "output_path": "/Users/nick/workspace/python/flight-delay-notebooks/notebooks/analyze_flight_delays.ipynb",
801 |    "parameters": {},
802 |    "start_time": "2020-11-18T10:42:45.921869",
803 |    "version": "2.1.1"
804 |   },
805 |   "toc-autonumbering": false,
806 |   "toc-showcode": false,
807 |   "toc-showmarkdowntxt": false,
808 |   "toc-showtags": false
809 |  },
810 |  "nbformat": 4,
811 |  "nbformat_minor": 4
812 | }
813 | 


--------------------------------------------------------------------------------
/notebooks/deploy_model.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "papermill": {
  7 |      "duration": 0.04375,
  8 |      "end_time": "2020-11-18T16:57:46.111418",
  9 |      "exception": false,
 10 |      "start_time": "2020-11-18T16:57:46.067668",
 11 |      "status": "completed"
 12 |     },
 13 |     "tags": []
 14 |    },
 15 |    "source": [
 16 |     "# Deploying the Flight Delay Model\n",
 17 |     "\n",
 18 |     "In this notebook, we deploy the model we trained to predict flight delays, using [Kubeflow Serving](https://www.kubeflow.org/docs/components/serving/kfserving/).\n",
 19 |     "\n",
 20 |     "**Note** this notebook requires access to a KFServing installation. See the [KFServing instructions](../kfserving.md) for details. If running the pipeline on the Kubeflow Pipelines runtime, also see the [readme instructions](../README.md) for the link to install KFP.\n",
 21 |     "\n",
 22 |     "#### Import required modules\n",
 23 |     "\n",
 24 |     "Import and configure the required modules."
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "code",
 29 |    "execution_count": null,
 30 |    "metadata": {
 31 |     "papermill": {
 32 |      "duration": 17.423869,
 33 |      "end_time": "2020-11-18T16:58:03.571681",
 34 |      "exception": false,
 35 |      "start_time": "2020-11-18T16:57:46.147812",
 36 |      "status": "completed"
 37 |     },
 38 |     "tags": []
 39 |    },
 40 |    "outputs": [],
 41 |    "source": [
 42 |     "! pip install -q kfserving"
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "code",
 47 |    "execution_count": null,
 48 |    "metadata": {
 49 |     "papermill": {
 50 |      "duration": 0.811648,
 51 |      "end_time": "2020-11-18T16:58:04.400939",
 52 |      "exception": false,
 53 |      "start_time": "2020-11-18T16:58:03.589291",
 54 |      "status": "completed"
 55 |     },
 56 |     "tags": []
 57 |    },
 58 |    "outputs": [],
 59 |    "source": [
 60 |     "import os\n",
 61 |     "import numpy as np\n",
 62 |     "import requests\n",
 63 |     "# minio is part of kfserving \n",
 64 |     "from minio import Minio\n",
 65 |     "from minio.error import NoSuchBucket"
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "markdown",
 70 |    "metadata": {
 71 |     "papermill": {
 72 |      "duration": 0.042401,
 73 |      "end_time": "2020-11-18T16:58:04.478066",
 74 |      "exception": false,
 75 |      "start_time": "2020-11-18T16:58:04.435665",
 76 |      "status": "completed"
 77 |     },
 78 |     "tags": []
 79 |    },
 80 |    "source": [
 81 |     "### Upload the model to object storage\n",
 82 |     "\n",
 83 |     "Our notebook has access to the trained model file, which was exported by the previous pipeline phase. _However_, when using a Kubeflow Pipelines runtime, it is not possible to programatically access the object storage bucket. It also makes execution mechanics different between local and KFP execution mode.\n",
 84 |     "\n",
 85 |     "So, here we will use a dedicated bucket for models in object storage, and upload it from the notebook execution environment. We will then deploy the KFServing inference service using that object storage location."
 86 |    ]
 87 |   },
 88 |   {
 89 |    "cell_type": "code",
 90 |    "execution_count": null,
 91 |    "metadata": {
 92 |     "papermill": {
 93 |      "duration": 5.189059,
 94 |      "end_time": "2020-11-18T16:58:09.715558",
 95 |      "exception": false,
 96 |      "start_time": "2020-11-18T16:58:04.526499",
 97 |      "status": "completed"
 98 |     },
 99 |     "tags": []
100 |    },
101 |    "outputs": [],
102 |    "source": [
103 |     "# set up the minio client to access object storage buckets\n",
104 |     "os_url = os.environ.get('OS_URL', 'minio-service:9000')\n",
105 |     "access_key = os.environ.get('ACCESS_KEY_ID', 'minio')\n",
106 |     "secret_key = os.environ.get('SECRET_ACCESS_KEY', 'minio123')\n",
107 |     "\n",
108 |     "mc = Minio(os_url,\n",
109 |     "           access_key=access_key,\n",
110 |     "           secret_key=secret_key,\n",
111 |     "           secure=False)\n",
112 |     "\n",
113 |     "print('Current buckets:')\n",
114 |     "for b in mc.list_buckets():\n",
115 |     "    print('  ' + b.name)"
116 |    ]
117 |   },
118 |   {
119 |    "cell_type": "code",
120 |    "execution_count": null,
121 |    "metadata": {
122 |     "papermill": {
123 |      "duration": 0.158499,
124 |      "end_time": "2020-11-18T16:58:09.903405",
125 |      "exception": false,
126 |      "start_time": "2020-11-18T16:58:09.744906",
127 |      "status": "completed"
128 |     },
129 |     "tags": []
130 |    },
131 |    "outputs": [],
132 |    "source": [
133 |     "# create a bucket to upload the model file to\n",
134 |     "# Note: if the model file already exists we delete it\n",
135 |     "model_bucket = os.environ.get('MODEL_BUCKET', 'models')\n",
136 |     "model_dir = os.environ.get('MODEL_DIR', 'models')\n",
137 |     "model_file = 'model.joblib'\n",
138 |     "model_path = '{}/{}'.format(model_dir, model_file)\n",
139 |     "\n",
140 |     "try:\n",
141 |     "    # delete model file if if exists \n",
142 |     "    mc.remove_object(model_bucket, model_file)\n",
143 |     "except NoSuchBucket:\n",
144 |     "    # the bucket doesn't exist - create it\n",
145 |     "    print('Creating bucket [{}]'.format(model_bucket))\n",
146 |     "    mc.make_bucket(model_bucket)"
147 |    ]
148 |   },
149 |   {
150 |    "cell_type": "code",
151 |    "execution_count": null,
152 |    "metadata": {
153 |     "papermill": {
154 |      "duration": 0.148869,
155 |      "end_time": "2020-11-18T16:58:10.075811",
156 |      "exception": false,
157 |      "start_time": "2020-11-18T16:58:09.926942",
158 |      "status": "completed"
159 |     },
160 |     "tags": []
161 |    },
162 |    "outputs": [],
163 |    "source": [
164 |     "# upload the model file\n",
165 |     "file_stat = os.stat(model_path)\n",
166 |     "with open(model_path, 'rb') as data:\n",
167 |     "    mc.put_object(model_bucket, model_file, data, file_stat.st_size)"
168 |    ]
169 |   },
170 |   {
171 |    "cell_type": "code",
172 |    "execution_count": null,
173 |    "metadata": {
174 |     "papermill": {
175 |      "duration": 0.083705,
176 |      "end_time": "2020-11-18T16:58:10.193249",
177 |      "exception": false,
178 |      "start_time": "2020-11-18T16:58:10.109544",
179 |      "status": "completed"
180 |     },
181 |     "tags": []
182 |    },
183 |    "outputs": [],
184 |    "source": [
185 |     "# check whether the model file is there\n",
186 |     "for o in mc.list_objects(model_bucket, prefix=model_file):\n",
187 |     "    print(o)"
188 |    ]
189 |   },
190 |   {
191 |    "cell_type": "markdown",
192 |    "metadata": {
193 |     "papermill": {
194 |      "duration": 0.052238,
195 |      "end_time": "2020-11-18T16:58:10.293275",
196 |      "exception": false,
197 |      "start_time": "2020-11-18T16:58:10.241037",
198 |      "status": "completed"
199 |     },
200 |     "tags": []
201 |    },
202 |    "source": [
203 |     "### Create the inference service\n",
204 |     "\n",
205 |     "Next, we use the KFServing Python client to create the inference service.\n",
206 |     "\n",
207 |     "**Note** the prerequisites (see the [KF Serving instructions](../kfserving.md)):\n",
208 |     "1. A service account and related secret for the object storage service\n",
209 |     "1. Specify the custom `sklearnserver` Docker image\n",
210 |     "1. Patch the KFP `pipeline-runner` service account role to allow creating a KFServing `inferenceservice`"
211 |    ]
212 |   },
213 |   {
214 |    "cell_type": "code",
215 |    "execution_count": null,
216 |    "metadata": {
217 |     "papermill": {
218 |      "duration": 5.594827,
219 |      "end_time": "2020-11-18T16:58:15.938195",
220 |      "exception": false,
221 |      "start_time": "2020-11-18T16:58:10.343368",
222 |      "status": "completed"
223 |     },
224 |     "tags": []
225 |    },
226 |    "outputs": [],
227 |    "source": [
228 |     "from kubernetes import client\n",
229 |     "\n",
230 |     "from kfserving import KFServingClient\n",
231 |     "from kfserving import constants\n",
232 |     "from kfserving import utils\n",
233 |     "from kfserving import V1alpha2EndpointSpec\n",
234 |     "from kfserving import V1alpha2PredictorSpec\n",
235 |     "from kfserving import V1alpha2SKLearnSpec\n",
236 |     "from kfserving import V1alpha2InferenceServiceSpec\n",
237 |     "from kfserving import V1alpha2InferenceService\n",
238 |     "from kubernetes.client import V1ResourceRequirements"
239 |    ]
240 |   },
241 |   {
242 |    "cell_type": "code",
243 |    "execution_count": null,
244 |    "metadata": {
245 |     "papermill": {
246 |      "duration": 0.107594,
247 |      "end_time": "2020-11-18T16:58:16.085369",
248 |      "exception": false,
249 |      "start_time": "2020-11-18T16:58:15.977775",
250 |      "status": "completed"
251 |     },
252 |     "tags": []
253 |    },
254 |    "outputs": [],
255 |    "source": [
256 |     "KFServing = KFServingClient()"
257 |    ]
258 |   },
259 |   {
260 |    "cell_type": "code",
261 |    "execution_count": null,
262 |    "metadata": {
263 |     "papermill": {
264 |      "duration": 0.063151,
265 |      "end_time": "2020-11-18T16:58:16.189803",
266 |      "exception": false,
267 |      "start_time": "2020-11-18T16:58:16.126652",
268 |      "status": "completed"
269 |     },
270 |     "tags": []
271 |    },
272 |    "outputs": [],
273 |    "source": [
274 |     "# we need to use the 'kubeflow' namespace so that the KFP runner can create the inference service\n",
275 |     "namespace = 'kubeflow'\n",
276 |     "# this is the service account created for S3 access credentials\n",
277 |     "service_acc = 'kfserving-sa'\n",
278 |     "model_storage_uri = 's3://{}'.format(model_bucket)\n",
279 |     "model_name = 'flight-model'"
280 |    ]
281 |   },
282 |   {
283 |    "cell_type": "code",
284 |    "execution_count": null,
285 |    "metadata": {
286 |     "papermill": {
287 |      "duration": 2.893991,
288 |      "end_time": "2020-11-18T16:58:19.129355",
289 |      "exception": false,
290 |      "start_time": "2020-11-18T16:58:16.235364",
291 |      "status": "completed"
292 |     },
293 |     "tags": []
294 |    },
295 |    "outputs": [],
296 |    "source": [
297 |     "api_version = constants.KFSERVING_GROUP + '/' + constants.KFSERVING_VERSION\n",
298 |     "default_endpoint_spec = V1alpha2EndpointSpec(\n",
299 |     "    predictor=V1alpha2PredictorSpec(\n",
300 |     "        sklearn=V1alpha2SKLearnSpec(\n",
301 |     "            storage_uri=model_storage_uri,\n",
302 |     "            resources=V1ResourceRequirements(\n",
303 |     "                requests={'cpu':'100m','memory':'1Gi'},\n",
304 |     "                limits={'cpu':'100m', 'memory':'1Gi'}\n",
305 |     "            )\n",
306 |     "        ),\n",
307 |     "        service_account_name=service_acc\n",
308 |     "    )\n",
309 |     ")\n",
310 |     "    \n",
311 |     "isvc = V1alpha2InferenceService(api_version=api_version,\n",
312 |     "                                kind=constants.KFSERVING_KIND,\n",
313 |     "                                metadata=client.V1ObjectMeta(\n",
314 |     "                                    name=model_name, namespace=namespace),\n",
315 |     "                                spec=V1alpha2InferenceServiceSpec(default=default_endpoint_spec))\n",
316 |     "KFServing.create(isvc)"
317 |    ]
318 |   },
319 |   {
320 |    "cell_type": "code",
321 |    "execution_count": null,
322 |    "metadata": {
323 |     "papermill": {
324 |      "duration": 119.962162,
325 |      "end_time": "2020-11-18T17:00:19.130621",
326 |      "exception": false,
327 |      "start_time": "2020-11-18T16:58:19.168459",
328 |      "status": "completed"
329 |     },
330 |     "tags": []
331 |    },
332 |    "outputs": [],
333 |    "source": [
334 |     "# Wait for the inference service to be ready\n",
335 |     "KFServing.get(model_name, namespace=namespace, watch=True, timeout_seconds=120)"
336 |    ]
337 |   },
338 |   {
339 |    "cell_type": "markdown",
340 |    "metadata": {
341 |     "papermill": {
342 |      "duration": 0.044726,
343 |      "end_time": "2020-11-18T17:00:19.224273",
344 |      "exception": false,
345 |      "start_time": "2020-11-18T17:00:19.179547",
346 |      "status": "completed"
347 |     },
348 |     "tags": []
349 |    },
350 |    "source": [
351 |     "### Test the inference service\n",
352 |     "\n",
353 |     "Once the inference service is running and available, we can send some test data to the service.\n",
354 |     "\n",
355 |     "**Note** that when deployed into KFP, we need to use the cluster-local url for the model. When executing locally, we assume that port-forwarding is enabled to allow access to the ingress gateway."
356 |    ]
357 |   },
358 |   {
359 |    "cell_type": "code",
360 |    "execution_count": null,
361 |    "metadata": {
362 |     "papermill": {
363 |      "duration": 0.084982,
364 |      "end_time": "2020-11-18T17:00:19.360922",
365 |      "exception": false,
366 |      "start_time": "2020-11-18T17:00:19.275940",
367 |      "status": "completed"
368 |     },
369 |     "tags": []
370 |    },
371 |    "outputs": [],
372 |    "source": [
373 |     "service = KFServing.get(model_name, namespace=namespace)"
374 |    ]
375 |   },
376 |   {
377 |    "cell_type": "code",
378 |    "execution_count": null,
379 |    "metadata": {
380 |     "papermill": {
381 |      "duration": 0.078281,
382 |      "end_time": "2020-11-18T17:00:19.493930",
383 |      "exception": false,
384 |      "start_time": "2020-11-18T17:00:19.415649",
385 |      "status": "completed"
386 |     },
387 |     "tags": []
388 |    },
389 |    "outputs": [],
390 |    "source": [
391 |     "# load the 10 example rows from our test data, and display a few rows\n",
392 |     "examples = np.load('data/test_rows.npy')\n",
393 |     "examples[:3]"
394 |    ]
395 |   },
396 |   {
397 |    "cell_type": "code",
398 |    "execution_count": null,
399 |    "metadata": {
400 |     "papermill": {
401 |      "duration": 1.394202,
402 |      "end_time": "2020-11-18T17:00:20.943882",
403 |      "exception": true,
404 |      "start_time": "2020-11-18T17:00:19.549680",
405 |      "status": "failed"
406 |     },
407 |     "tags": []
408 |    },
409 |    "outputs": [],
410 |    "source": [
411 |     "model_mode = os.environ.get('MODEL_MODE', 'local')\n",
412 |     "model_data = {\"instances\": examples.tolist()}\n",
413 |     "if model_mode == 'local':\n",
414 |     "    # executing locally, use the ingress gateway (we assume port-forwarding) \n",
415 |     "    url = f'http://localhost:8080/v1/models/{model_name}:predict'\n",
416 |     "    service_hostname = '{}.{}.example.com'.format(model_name, namespace)\n",
417 |     "    headers = {'Host': service_hostname}\n",
418 |     "    resp = requests.post(url=url, json=model_data, headers=headers)\n",
419 |     "else:\n",
420 |     "    # we are executing in KFP, use the cluster-local address\n",
421 |     "    url = service['status']['address']['url']\n",
422 |     "    resp = requests.post(url=url, json=model_data)\n",
423 |     "\n",
424 |     "resp.json()"
425 |    ]
426 |   },
427 |   {
428 |    "cell_type": "markdown",
429 |    "metadata": {
430 |     "papermill": {
431 |      "duration": null,
432 |      "end_time": null,
433 |      "exception": null,
434 |      "start_time": null,
435 |      "status": "pending"
436 |     },
437 |     "tags": []
438 |    },
439 |    "source": [
440 |     "### Delete the model service\n",
441 |     "\n",
442 |     "Once we are done, we clean up the service."
443 |    ]
444 |   },
445 |   {
446 |    "cell_type": "code",
447 |    "execution_count": null,
448 |    "metadata": {
449 |     "papermill": {
450 |      "duration": null,
451 |      "end_time": null,
452 |      "exception": null,
453 |      "start_time": null,
454 |      "status": "pending"
455 |     },
456 |     "tags": []
457 |    },
458 |    "outputs": [],
459 |    "source": [
460 |     "KFServing.delete(model_name, namespace=namespace)"
461 |    ]
462 |   },
463 |   {
464 |    "cell_type": "markdown",
465 |    "metadata": {
466 |     "papermill": {
467 |      "duration": null,
468 |      "end_time": null,
469 |      "exception": null,
470 |      "start_time": null,
471 |      "status": "pending"
472 |     },
473 |     "tags": []
474 |    },
475 |    "source": [
476 |     "### Authors\n",
477 |     "This notebook was created by the [Center for Open-Source Data & AI Technologies](http://codait.org).\n",
478 |     "\n",
479 |     "Copyright © 2019 IBM. This notebook and its source code are released under the terms of the MIT License."
480 |    ]
481 |   }
482 |  ],
483 |  "metadata": {
484 |   "kernelspec": {
485 |    "display_name": "Python 3",
486 |    "language": "python",
487 |    "name": "python3"
488 |   },
489 |   "language_info": {
490 |    "codemirror_mode": {
491 |     "name": "ipython",
492 |     "version": 3
493 |    },
494 |    "file_extension": ".py",
495 |    "mimetype": "text/x-python",
496 |    "name": "python",
497 |    "nbconvert_exporter": "python",
498 |    "pygments_lexer": "ipython3",
499 |    "version": "3.7.9"
500 |   },
501 |   "papermill": {
502 |    "duration": 158.620592,
503 |    "end_time": "2020-11-18T17:00:22.439093",
504 |    "environment_variables": {},
505 |    "exception": true,
506 |    "input_path": "/Users/nick/workspace/python/flight-delay-notebooks/notebooks/deploy_model.ipynb",
507 |    "output_path": "/Users/nick/workspace/python/flight-delay-notebooks/notebooks/deploy_model.ipynb",
508 |    "parameters": {},
509 |    "start_time": "2020-11-18T16:57:43.818501",
510 |    "version": "2.1.1"
511 |   },
512 |   "toc-autonumbering": false,
513 |   "toc-showcode": false,
514 |   "toc-showmarkdowntxt": false,
515 |   "toc-showtags": false
516 |  },
517 |  "nbformat": 4,
518 |  "nbformat_minor": 4
519 | }
520 | 


--------------------------------------------------------------------------------
/notebooks/load_data.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2020 IBM Corporation
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | # http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | import os
17 | from pathlib import Path
18 | import requests
19 | import sys
20 | import tarfile
21 | from urllib.parse import urlparse
22 | 
23 | 
24 | def download_from_public_url(url):
25 | 
26 |     data_dir_name = 'data'
27 | 
28 |     print('Downloading data file {} ...'.format(url))
29 |     r = requests.get(url)
30 |     if r.status_code != 200:
31 |         raise RuntimeError('Could not fetch {}: HTTP status code {}'.format(url, r.status_code))
32 |     else:
33 |         # extract data set file name from URL 
34 |         data_file_name = Path((urlparse(url).path)).name
35 |         # create the directory where the downloaded file will be stored
36 |         data_dir = Path(data_dir_name)    
37 |         data_dir.mkdir(parents=True, exist_ok=True)
38 |         downloaded_data_file = data_dir / data_file_name
39 | 
40 |         print('Saving downloaded file "{}" as ...'.format(data_file_name))
41 |         with open(downloaded_data_file, 'wb') as downloaded_file:
42 |             downloaded_file.write(r.content)
43 |     
44 |         if r.headers['content-type'] in ['application/x-tar', 'application/x-gzip']:
45 |             print('Extracting downloaded file in directory "{}" ...'.format(data_dir))
46 |             with tarfile.open(downloaded_data_file, 'r') as tar:
47 |                 tar.extractall(data_dir)
48 |             print('Removing downloaded file ...')
49 |             downloaded_data_file.unlink()
50 | 
51 | if __name__ == "__main__": 
52 |     
53 |     # This script downloads a compressed data set archive from a public location
54 |     # e.g. http://server/path/to/archive and extracts it.
55 |     # The archive location can be specified using the DATASET_URL environment variable
56 |     # DATASET_URL=http://server/path/to/archive.
57 | 
58 |     # initialize download URL from environment variable
59 |     dataset_url = os.environ.get('DATASET_URL')
60 |     
61 |     # No data set URL was provided.
62 |     if dataset_url is None:
63 |         raise RuntimeError('Cannot run script. A data set URL must be provided as input.')
64 |  
65 |     # Try to process the URL
66 |     download_from_public_url(dataset_url)


--------------------------------------------------------------------------------
/notebooks/merge_data.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "papermill": {
  7 |      "duration": 0.025086,
  8 |      "end_time": "2020-11-18T10:42:42.222104",
  9 |      "exception": false,
 10 |      "start_time": "2020-11-18T10:42:42.197018",
 11 |      "status": "completed"
 12 |     },
 13 |     "tags": []
 14 |    },
 15 |    "source": [
 16 |     "# Merging Airline Delay and Weather Datasets\n",
 17 |     "\n",
 18 |     "In this notebook, we merge together two data sources in order to create richer features for our flight delay prediction classification problem.\n",
 19 |     "* selecting the columns we wish to keep for later analysis\n",
 20 |     "* converting and cleaning data where required\n",
 21 |     "* handling missing values\n",
 22 |     "\n",
 23 |     "#### Import required modules\n",
 24 |     "\n",
 25 |     "Import and configure the required modules."
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "code",
 30 |    "execution_count": null,
 31 |    "metadata": {
 32 |     "papermill": {
 33 |      "duration": 0.039234,
 34 |      "end_time": "2020-11-18T10:42:42.285141",
 35 |      "exception": false,
 36 |      "start_time": "2020-11-18T10:42:42.245907",
 37 |      "status": "completed"
 38 |     },
 39 |     "tags": []
 40 |    },
 41 |    "outputs": [],
 42 |    "source": [
 43 |     "# !pip install pandas scikit-learn > /dev/null 2>&1"
 44 |    ]
 45 |   },
 46 |   {
 47 |    "cell_type": "code",
 48 |    "execution_count": null,
 49 |    "metadata": {
 50 |     "papermill": {
 51 |      "duration": 0.83582,
 52 |      "end_time": "2020-11-18T10:42:43.144936",
 53 |      "exception": false,
 54 |      "start_time": "2020-11-18T10:42:42.309116",
 55 |      "status": "completed"
 56 |     },
 57 |     "tags": []
 58 |    },
 59 |    "outputs": [],
 60 |    "source": [
 61 |     "# Define required imports\n",
 62 |     "import pandas as pd\n",
 63 |     "# These set pandas max column and row display in the notebook\n",
 64 |     "pd.set_option('display.max_columns', 50)\n",
 65 |     "pd.set_option('display.max_rows', 50)"
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "markdown",
 70 |    "metadata": {
 71 |     "papermill": {
 72 |      "duration": 0.011064,
 73 |      "end_time": "2020-11-18T10:42:43.173214",
 74 |      "exception": false,
 75 |      "start_time": "2020-11-18T10:42:43.162150",
 76 |      "status": "completed"
 77 |     },
 78 |     "tags": []
 79 |    },
 80 |    "source": [
 81 |     "### Read datasets\n",
 82 |     "\n",
 83 |     "We start by reading in the processed flight delay and weather datasets"
 84 |    ]
 85 |   },
 86 |   {
 87 |    "cell_type": "code",
 88 |    "execution_count": null,
 89 |    "metadata": {
 90 |     "papermill": {
 91 |      "duration": 0.083716,
 92 |      "end_time": "2020-11-18T10:42:43.270535",
 93 |      "exception": false,
 94 |      "start_time": "2020-11-18T10:42:43.186819",
 95 |      "status": "completed"
 96 |     },
 97 |     "tags": []
 98 |    },
 99 |    "outputs": [],
100 |    "source": [
101 |     "flight_path = 'data/jfk_flight_features.csv'\n",
102 |     "flight_data = pd.read_csv(flight_path, parse_dates=['flight_date'])\n",
103 |     "flight_data.head()"
104 |    ]
105 |   },
106 |   {
107 |    "cell_type": "code",
108 |    "execution_count": null,
109 |    "metadata": {
110 |     "papermill": {
111 |      "duration": 0.253188,
112 |      "end_time": "2020-11-18T10:42:43.545450",
113 |      "exception": false,
114 |      "start_time": "2020-11-18T10:42:43.292262",
115 |      "status": "completed"
116 |     },
117 |     "tags": []
118 |    },
119 |    "outputs": [],
120 |    "source": [
121 |     "weather_path = 'data/jfk_weather_features.csv'\n",
122 |     "weather_data = pd.read_csv(weather_path, parse_dates=['DATE'])\n",
123 |     "weather_data.head()"
124 |    ]
125 |   },
126 |   {
127 |    "cell_type": "markdown",
128 |    "metadata": {
129 |     "papermill": {
130 |      "duration": 0.012112,
131 |      "end_time": "2020-11-18T10:42:43.568539",
132 |      "exception": false,
133 |      "start_time": "2020-11-18T10:42:43.556427",
134 |      "status": "completed"
135 |     },
136 |     "tags": []
137 |    },
138 |    "source": [
139 |     "### Merge datasets\n",
140 |     "\n",
141 |     "The next step is to merge or join the two datasets, such that for each flight record in the flight delay dataset, we have information about the weather conditions present for that flight. \n",
142 |     "\n",
143 |     "**Note** we have to be careful not to effectively \"leak\" information. Recall that our weather observations come from automated weather station reports that are generated on the 51st minute of each hour. We must ensure that the weather report used for flight delay prediction is one covering weather conditions present _before_ the flight departure, otherwise we would be giving our model a glimpse in the the future!\n",
144 |     "\n",
145 |     "This makes joining the datasets a little tricky. One simple approach is to join the record for a given flight day and hour, with the weather reading for the same day but the _previous hour_. We can do this by extracting 2 \"join keys\" from each dataset: the first for the `date` and the second for the `hour` of the record. If we set the `hour` join key for the flight to the hour _before_ the actual hour of the flight scheduled departure, then we ensure the corresponding weather report comes from the hour before the flight would depart."
146 |    ]
147 |   },
148 |   {
149 |    "cell_type": "code",
150 |    "execution_count": null,
151 |    "metadata": {
152 |     "papermill": {
153 |      "duration": 0.106487,
154 |      "end_time": "2020-11-18T10:42:43.699995",
155 |      "exception": false,
156 |      "start_time": "2020-11-18T10:42:43.593508",
157 |      "status": "completed"
158 |     },
159 |     "tags": []
160 |    },
161 |    "outputs": [],
162 |    "source": [
163 |     "flight_data.loc[:, 'hour_key'] = pd.to_datetime(flight_data['sched_dep_time'], format='%H%M', errors='ignore').dt.hour - 1\n",
164 |     "flight_data.loc[:, 'date_key'] = flight_data['flight_date'].dt.date\n",
165 |     "flight_data.head()"
166 |    ]
167 |   },
168 |   {
169 |    "cell_type": "code",
170 |    "execution_count": null,
171 |    "metadata": {
172 |     "papermill": {
173 |      "duration": 0.117894,
174 |      "end_time": "2020-11-18T10:42:43.842588",
175 |      "exception": false,
176 |      "start_time": "2020-11-18T10:42:43.724694",
177 |      "status": "completed"
178 |     },
179 |     "tags": []
180 |    },
181 |    "outputs": [],
182 |    "source": [
183 |     "weather_data.loc[:, 'date_key'] = weather_data['DATE'].dt.date\n",
184 |     "weather_data.loc[:, 'hour_key'] = weather_data['DATE'].dt.hour\n",
185 |     "weather_data.head()"
186 |    ]
187 |   },
188 |   {
189 |    "cell_type": "markdown",
190 |    "metadata": {
191 |     "papermill": {
192 |      "duration": 0.013882,
193 |      "end_time": "2020-11-18T10:42:43.873685",
194 |      "exception": false,
195 |      "start_time": "2020-11-18T10:42:43.859803",
196 |      "status": "completed"
197 |     },
198 |     "tags": []
199 |    },
200 |    "source": [
201 |     "Next, we join the datasets together based on the \"join keys\" we have created:"
202 |    ]
203 |   },
204 |   {
205 |    "cell_type": "code",
206 |    "execution_count": null,
207 |    "metadata": {
208 |     "papermill": {
209 |      "duration": 0.104916,
210 |      "end_time": "2020-11-18T10:42:43.991593",
211 |      "exception": false,
212 |      "start_time": "2020-11-18T10:42:43.886677",
213 |      "status": "completed"
214 |     },
215 |     "tags": []
216 |    },
217 |    "outputs": [],
218 |    "source": [
219 |     "flight_weather_data = flight_data.merge(weather_data, how='inner', on=['date_key', 'hour_key'])\n",
220 |     "flight_weather_data.head()"
221 |    ]
222 |   },
223 |   {
224 |    "cell_type": "markdown",
225 |    "metadata": {
226 |     "papermill": {
227 |      "duration": 0.016437,
228 |      "end_time": "2020-11-18T10:42:44.022539",
229 |      "exception": false,
230 |      "start_time": "2020-11-18T10:42:44.006102",
231 |      "status": "completed"
232 |     },
233 |     "tags": []
234 |    },
235 |    "source": [
236 |     "For the first record in our flight dataset, we can see that the flight departs at 15:25. The corresponding weather report is timestamped at 14:51.\n",
237 |     "\n",
238 |     "**Note** all we guarantee here is that the weather report is _within_ 1 hour before the flight departure, not _precisely 1 hour before_. "
239 |    ]
240 |   },
241 |   {
242 |    "cell_type": "markdown",
243 |    "metadata": {
244 |     "papermill": {
245 |      "duration": 0.015937,
246 |      "end_time": "2020-11-18T10:42:44.052584",
247 |      "exception": false,
248 |      "start_time": "2020-11-18T10:42:44.036647",
249 |      "status": "completed"
250 |     },
251 |     "tags": []
252 |    },
253 |    "source": [
254 |     "### Save the Merged Data\n",
255 |     "\n",
256 |     "Finally, we save the merged dataset for use by downstream tasks."
257 |    ]
258 |   },
259 |   {
260 |    "cell_type": "code",
261 |    "execution_count": null,
262 |    "metadata": {
263 |     "papermill": {
264 |      "duration": 0.367411,
265 |      "end_time": "2020-11-18T10:42:44.439419",
266 |      "exception": false,
267 |      "start_time": "2020-11-18T10:42:44.072008",
268 |      "status": "completed"
269 |     },
270 |     "tags": []
271 |    },
272 |    "outputs": [],
273 |    "source": [
274 |     "flight_weather_data.to_csv('data/jfk_flight_weather_features.csv', index=False, float_format='%g')"
275 |    ]
276 |   },
277 |   {
278 |    "cell_type": "markdown",
279 |    "metadata": {
280 |     "papermill": {
281 |      "duration": 0.013914,
282 |      "end_time": "2020-11-18T10:42:44.470581",
283 |      "exception": false,
284 |      "start_time": "2020-11-18T10:42:44.456667",
285 |      "status": "completed"
286 |     },
287 |     "tags": []
288 |    },
289 |    "source": [
290 |     "<a id=\"authors\"></a> \n",
291 |     "### Authors"
292 |    ]
293 |   },
294 |   {
295 |    "cell_type": "markdown",
296 |    "metadata": {
297 |     "papermill": {
298 |      "duration": 0.029833,
299 |      "end_time": "2020-11-18T10:42:44.522443",
300 |      "exception": false,
301 |      "start_time": "2020-11-18T10:42:44.492610",
302 |      "status": "completed"
303 |     },
304 |     "tags": []
305 |    },
306 |    "source": [
307 |     "This notebook was created by the [Center for Open-Source Data & AI Technologies](http://codait.org).\n",
308 |     "\n",
309 |     "Copyright © 2019 IBM. This notebook and its source code are released under the terms of the MIT License."
310 |    ]
311 |   }
312 |  ],
313 |  "metadata": {
314 |   "kernelspec": {
315 |    "display_name": "Python 3",
316 |    "language": "python",
317 |    "name": "python3"
318 |   },
319 |   "language_info": {
320 |    "codemirror_mode": {
321 |     "name": "ipython",
322 |     "version": 3
323 |    },
324 |    "file_extension": ".py",
325 |    "mimetype": "text/x-python",
326 |    "name": "python",
327 |    "nbconvert_exporter": "python",
328 |    "pygments_lexer": "ipython3",
329 |    "version": "3.7.9"
330 |   },
331 |   "papermill": {
332 |    "duration": 5.066455,
333 |    "end_time": "2020-11-18T10:42:45.841585",
334 |    "environment_variables": {},
335 |    "exception": null,
336 |    "input_path": "/Users/nick/workspace/python/flight-delay-notebooks/notebooks/merge_data.ipynb",
337 |    "output_path": "/Users/nick/workspace/python/flight-delay-notebooks/notebooks/merge_data.ipynb",
338 |    "parameters": {},
339 |    "start_time": "2020-11-18T10:42:40.775130",
340 |    "version": "2.1.1"
341 |   },
342 |   "toc-autonumbering": false,
343 |   "toc-showcode": false,
344 |   "toc-showmarkdowntxt": false,
345 |   "toc-showtags": false
346 |  },
347 |  "nbformat": 4,
348 |  "nbformat_minor": 4
349 | }
350 | 


--------------------------------------------------------------------------------
/notebooks/predict_flight_delays.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "papermill": {
  7 |      "duration": 0.026827,
  8 |      "end_time": "2020-11-18T10:43:10.843474",
  9 |      "exception": false,
 10 |      "start_time": "2020-11-18T10:43:10.816647",
 11 |      "status": "completed"
 12 |     },
 13 |     "tags": []
 14 |    },
 15 |    "source": [
 16 |     "# Predicting Flight Delays\n",
 17 |     "\n",
 18 |     "In this notebook, we use the combined flight delay and weather data we have created to create and evaluate models to predict flight delays.\n",
 19 |     "\n",
 20 |     "**Note** the full flight delay dataset is very large (over 80GB uncompressed), so we are working with a smaller sample dataset. Hence our results may not be a true reflection of the results on the full dataset.\n",
 21 |     "\n",
 22 |     "#### Import required modules\n",
 23 |     "\n",
 24 |     "Import and configure the required modules."
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "code",
 29 |    "execution_count": null,
 30 |    "metadata": {
 31 |     "papermill": {
 32 |      "duration": 3.113441,
 33 |      "end_time": "2020-11-18T10:43:13.981561",
 34 |      "exception": false,
 35 |      "start_time": "2020-11-18T10:43:10.868120",
 36 |      "status": "completed"
 37 |     },
 38 |     "tags": []
 39 |    },
 40 |    "outputs": [],
 41 |    "source": [
 42 |     "!pip install seaborn scikit-learn > /dev/null 2>&1"
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "code",
 47 |    "execution_count": null,
 48 |    "metadata": {
 49 |     "papermill": {
 50 |      "duration": 1.755517,
 51 |      "end_time": "2020-11-18T10:43:15.772084",
 52 |      "exception": false,
 53 |      "start_time": "2020-11-18T10:43:14.016567",
 54 |      "status": "completed"
 55 |     },
 56 |     "tags": []
 57 |    },
 58 |    "outputs": [],
 59 |    "source": [
 60 |     "# Define required imports\n",
 61 |     "import json\n",
 62 |     "import pandas as pd\n",
 63 |     "import numpy as np\n",
 64 |     "import seaborn as sns\n",
 65 |     "import matplotlib.pyplot as plt\n",
 66 |     "sns.set_theme(style='darkgrid', palette='deep')\n",
 67 |     "# These set pandas max column and row display in the notebook\n",
 68 |     "pd.set_option('display.max_columns', 50)\n",
 69 |     "pd.set_option('display.max_rows', 50)"
 70 |    ]
 71 |   },
 72 |   {
 73 |    "cell_type": "code",
 74 |    "execution_count": null,
 75 |    "metadata": {
 76 |     "papermill": {
 77 |      "duration": 0.053132,
 78 |      "end_time": "2020-11-18T10:43:15.854310",
 79 |      "exception": false,
 80 |      "start_time": "2020-11-18T10:43:15.801178",
 81 |      "status": "completed"
 82 |     },
 83 |     "tags": []
 84 |    },
 85 |    "outputs": [],
 86 |    "source": [
 87 |     "MODEL_EXPORT_FOLDER = 'models'\n",
 88 |     "from pathlib import Path\n",
 89 |     "export_path = Path(MODEL_EXPORT_FOLDER)\n",
 90 |     "export_path.mkdir(parents=True, exist_ok=True)"
 91 |    ]
 92 |   },
 93 |   {
 94 |    "cell_type": "markdown",
 95 |    "metadata": {
 96 |     "papermill": {
 97 |      "duration": 0.027723,
 98 |      "end_time": "2020-11-18T10:43:15.907403",
 99 |      "exception": false,
100 |      "start_time": "2020-11-18T10:43:15.879680",
101 |      "status": "completed"
102 |     },
103 |     "tags": []
104 |    },
105 |    "source": [
106 |     "### Read the data\n",
107 |     "\n",
108 |     "We start by reading in the merged flight delay and weather data"
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "code",
113 |    "execution_count": null,
114 |    "metadata": {
115 |     "papermill": {
116 |      "duration": 0.128993,
117 |      "end_time": "2020-11-18T10:43:16.065704",
118 |      "exception": false,
119 |      "start_time": "2020-11-18T10:43:15.936711",
120 |      "status": "completed"
121 |     },
122 |     "tags": []
123 |    },
124 |    "outputs": [],
125 |    "source": [
126 |     "flight_path = 'data/jfk_flight_weather_features.csv'\n",
127 |     "flight_data = pd.read_csv(flight_path, parse_dates=['flight_date'])\n",
128 |     "flight_data.head()"
129 |    ]
130 |   },
131 |   {
132 |    "cell_type": "code",
133 |    "execution_count": null,
134 |    "metadata": {
135 |     "papermill": {
136 |      "duration": 0.054146,
137 |      "end_time": "2020-11-18T10:43:16.154043",
138 |      "exception": false,
139 |      "start_time": "2020-11-18T10:43:16.099897",
140 |      "status": "completed"
141 |     },
142 |     "tags": []
143 |    },
144 |    "outputs": [],
145 |    "source": [
146 |     "flight_data['dest'].value_counts().tail(10)"
147 |    ]
148 |   },
149 |   {
150 |    "cell_type": "code",
151 |    "execution_count": null,
152 |    "metadata": {
153 |     "papermill": {
154 |      "duration": 0.097234,
155 |      "end_time": "2020-11-18T10:43:16.283027",
156 |      "exception": false,
157 |      "start_time": "2020-11-18T10:43:16.185793",
158 |      "status": "completed"
159 |     },
160 |     "tags": []
161 |    },
162 |    "outputs": [],
163 |    "source": [
164 |     "flight_data['dest'].value_counts().tail(10)\n",
165 |     "dest_to_drop = ['MKE', 'HYA', 'ALB', 'PSP', 'BDL', 'TUS', 'DAB', 'BHM']\n",
166 |     "flight_data[flight_data['dest'].isin(dest_to_drop)]\n",
167 |     "flight_data.drop(flight_data[flight_data['dest'].isin(dest_to_drop)].index, inplace=True)\n",
168 |     "flight_data"
169 |    ]
170 |   },
171 |   {
172 |    "cell_type": "markdown",
173 |    "metadata": {
174 |     "papermill": {
175 |      "duration": 0.039101,
176 |      "end_time": "2020-11-18T10:43:16.360160",
177 |      "exception": false,
178 |      "start_time": "2020-11-18T10:43:16.321059",
179 |      "status": "completed"
180 |     },
181 |     "tags": []
182 |    },
183 |    "source": [
184 |     "### Create train / test data split\n",
185 |     "\n",
186 |     "The first step in building our models is to split the dataset into training and test sets. We use a portion of the data for training, and another portion of data for our test sets.\n",
187 |     "\n",
188 |     "If we instead trained a model on the full dataset, the model would learn to be very good at making predictions on that particular dataset, essentially just copying the answers it knows. However, when presented with data the model has not seen , it would perform poorly since it has not learned how to generalize its answers.\n",
189 |     "\n",
190 |     "By training on a portion of the dataset and testing the model's performance on another portion of the dataset (which data the model has not seen in training), we try to avoid our models \"over-fitting\" the dataset and make them better at prediction when given unseen, future data. This process of splitting the dataset and evaluating a model's performance on \"held-out\" datasets is commonly known as _cross-validation_.\n",
191 |     "\n",
192 |     "By default here we use 80% of the data for the training set and 20% for the test set.\n",
193 |     "\n",
194 |     "**Note** for simplicity here we perform a random split. Technically, we have some time-dependent information leakage, since for earlier records, the model can use data from the future in training. In reality, a model at that point in time would not have information about the future available for training. For a better evaluation of the model performance on fully unseen, new data, the test set should be generated from _future_ data occurring after the time window in the training set."
195 |    ]
196 |   },
197 |   {
198 |    "cell_type": "code",
199 |    "execution_count": null,
200 |    "metadata": {
201 |     "papermill": {
202 |      "duration": 0.448639,
203 |      "end_time": "2020-11-18T10:43:16.841145",
204 |      "exception": false,
205 |      "start_time": "2020-11-18T10:43:16.392506",
206 |      "status": "completed"
207 |     },
208 |     "tags": []
209 |    },
210 |    "outputs": [],
211 |    "source": [
212 |     "from sklearn.model_selection import train_test_split\n",
213 |     "\n",
214 |     "# Split the dataset into 80% training and 20% test sets, stratified by the 'delayed' field\n",
215 |     "df_train, df_test = train_test_split(\n",
216 |     "    flight_data, train_size=0.8, random_state=24, stratify=flight_data[['delayed']])"
217 |    ]
218 |   },
219 |   {
220 |    "cell_type": "code",
221 |    "execution_count": null,
222 |    "metadata": {
223 |     "papermill": {
224 |      "duration": 0.034849,
225 |      "end_time": "2020-11-18T10:43:16.901670",
226 |      "exception": false,
227 |      "start_time": "2020-11-18T10:43:16.866821",
228 |      "status": "completed"
229 |     },
230 |     "tags": []
231 |    },
232 |    "outputs": [],
233 |    "source": [
234 |     "# specify the target variable\n",
235 |     "y_train = df_train['delayed'].values\n",
236 |     "y_test = df_test['delayed'].values\n",
237 |     "print('Training set: {} rows'.format(len(df_train)))\n",
238 |     "print('Test set: {} rows'.format(len(df_test)))"
239 |    ]
240 |   },
241 |   {
242 |    "cell_type": "markdown",
243 |    "metadata": {
244 |     "papermill": {
245 |      "duration": 0.025412,
246 |      "end_time": "2020-11-18T10:43:16.950294",
247 |      "exception": false,
248 |      "start_time": "2020-11-18T10:43:16.924882",
249 |      "status": "completed"
250 |     },
251 |     "tags": []
252 |    },
253 |    "source": [
254 |     "### Encode categorical variables\n",
255 |     "\n",
256 |     "Next, we want to encode the various _categorical_ features we have - such as the flight departure time bucket, airline and airport ids, and so on - into numerical representations. We do this by assigning integer ids to each unique feature value. This is known as ordinal encoding.\n",
257 |     "\n",
258 |     "Note that certain models (e.g. linear models) will interpret these numerical values as having an ordinal structure. However, for our demonstration purposes we will use tree-based models, which can handle these types of integer ids directly. \n",
259 |     "\n",
260 |     "For linear models, we would prefer to use one-hot encoding for categorical features."
261 |    ]
262 |   },
263 |   {
264 |    "cell_type": "code",
265 |    "execution_count": null,
266 |    "metadata": {
267 |     "papermill": {
268 |      "duration": 0.072311,
269 |      "end_time": "2020-11-18T10:43:17.051886",
270 |      "exception": false,
271 |      "start_time": "2020-11-18T10:43:16.979575",
272 |      "status": "completed"
273 |     },
274 |     "tags": []
275 |    },
276 |    "outputs": [],
277 |    "source": [
278 |     "from sklearn.preprocessing import OrdinalEncoder\n",
279 |     "\n",
280 |     "# specify columns for raw categorical features\n",
281 |     "cat_columns = [\n",
282 |     "    'month',\n",
283 |     "    'day_of_month',\n",
284 |     "    'day_of_week',\n",
285 |     "    'airline_name',\n",
286 |     "    'dest',\n",
287 |     "    'dep_time_bin',\n",
288 |     "    'distance_bin'\n",
289 |     "]\n",
290 |     "\n",
291 |     "# extract categorical data columns for training set\n",
292 |     "df_train_cat = df_train[cat_columns]\n",
293 |     "# extract categorical data columns for test set\n",
294 |     "df_test_cat = df_test[cat_columns]\n",
295 |     "\n",
296 |     "ord_enc = OrdinalEncoder()\n",
297 |     "# fit and encode training features\n",
298 |     "X_train_cat = ord_enc.fit_transform(df_train_cat)\n",
299 |     "# encode test features\n",
300 |     "X_test_cat = ord_enc.transform(df_test_cat)\n",
301 |     "\n",
302 |     "print('Training set categorical features: {} rows, {} features' .format(X_train_cat.shape[0], X_train_cat.shape[1]))\n",
303 |     "print('Test set categorical features: {} rows, {} features' .format(X_test_cat.shape[0], X_test_cat.shape[1]))"
304 |    ]
305 |   },
306 |   {
307 |    "cell_type": "markdown",
308 |    "metadata": {
309 |     "papermill": {
310 |      "duration": 0.044356,
311 |      "end_time": "2020-11-18T10:43:17.144311",
312 |      "exception": false,
313 |      "start_time": "2020-11-18T10:43:17.099955",
314 |      "status": "completed"
315 |     },
316 |     "tags": []
317 |    },
318 |    "source": [
319 |     "### Encode numerical variables\n",
320 |     "\n",
321 |     "The next step is to encode numerical features. Depending on the models used, it can be very important to scale / normalize numerical features - such as `wind_speed` or `precip`. Again, linear models and neural networks are a good example of this. In our case we will use tree-based models, which again do not require feature scaling, hence we can use these numerical features directly without pre-processing. \n",
322 |     "\n",
323 |     "**Note** that the weather type features are also categorical. However, we have already encoded these as binary values in our pre-processing step, hence we can now treat these features as numerical."
324 |    ]
325 |   },
326 |   {
327 |    "cell_type": "code",
328 |    "execution_count": null,
329 |    "metadata": {
330 |     "papermill": {
331 |      "duration": 0.048752,
332 |      "end_time": "2020-11-18T10:43:17.225546",
333 |      "exception": false,
334 |      "start_time": "2020-11-18T10:43:17.176794",
335 |      "status": "completed"
336 |     },
337 |     "tags": []
338 |    },
339 |    "outputs": [],
340 |    "source": [
341 |     "num_columns = [\n",
342 |     "    'visibility',\n",
343 |     "    'wind_speed',\n",
344 |     "    'wind_gust_speed',\n",
345 |     "    'precip',\n",
346 |     "    'rain',\n",
347 |     "    'ice_pellets',\n",
348 |     "    'mist',\n",
349 |     "    'snow',\n",
350 |     "    'drizzle',\n",
351 |     "    'haze',\n",
352 |     "    'fog',\n",
353 |     "    'thunderstorm',\n",
354 |     "    'smoke',\n",
355 |     "    'unknown_precipitation'\n",
356 |     "]\n",
357 |     "\n",
358 |     "# extract numerical data columns for training set\n",
359 |     "X_train_num = df_train[num_columns].values\n",
360 |     "# extract numerical data columns for validation set\n",
361 |     "X_test_num = df_test[num_columns].values\n",
362 |     "\n",
363 |     "print('Training set numerical features: {} rows, {} features' .format(X_train_num.shape[0], X_train_num.shape[1]))\n",
364 |     "print('Test set numerical features: {} rows, {} features' .format(X_test_num.shape[0], X_test_num.shape[1]))"
365 |    ]
366 |   },
367 |   {
368 |    "cell_type": "markdown",
369 |    "metadata": {
370 |     "papermill": {
371 |      "duration": 0.098643,
372 |      "end_time": "2020-11-18T10:43:17.377832",
373 |      "exception": false,
374 |      "start_time": "2020-11-18T10:43:17.279189",
375 |      "status": "completed"
376 |     },
377 |     "tags": []
378 |    },
379 |    "source": [
380 |     "#### Combine categorical and numerical features\n",
381 |     "\n",
382 |     "We can now combine the two sets of features by concatenating them (\"horizontally stacking\"):"
383 |    ]
384 |   },
385 |   {
386 |    "cell_type": "code",
387 |    "execution_count": null,
388 |    "metadata": {
389 |     "papermill": {
390 |      "duration": 0.04915,
391 |      "end_time": "2020-11-18T10:43:17.480635",
392 |      "exception": false,
393 |      "start_time": "2020-11-18T10:43:17.431485",
394 |      "status": "completed"
395 |     },
396 |     "tags": []
397 |    },
398 |    "outputs": [],
399 |    "source": [
400 |     "X_train = np.hstack((X_train_cat, X_train_num))\n",
401 |     "X_test = np.hstack((X_test_cat, X_test_num))\n",
402 |     "print('Training set all features: {} rows, {} features' .format(X_train.shape[0], X_train.shape[1]))\n",
403 |     "print('Test set all features: {} rows, {} features' .format(X_test.shape[0], X_test.shape[1]))"
404 |    ]
405 |   },
406 |   {
407 |    "cell_type": "markdown",
408 |    "metadata": {
409 |     "papermill": {
410 |      "duration": 0.026992,
411 |      "end_time": "2020-11-18T10:43:17.537965",
412 |      "exception": false,
413 |      "start_time": "2020-11-18T10:43:17.510973",
414 |      "status": "completed"
415 |     },
416 |     "tags": []
417 |    },
418 |    "source": [
419 |     "### Train and evaluate models\n",
420 |     "\n",
421 |     "Now that we have pre-processed all our features into numerical representations, we can pass them to our machine learning models.\n",
422 |     "\n",
423 |     "For simplicity, we will evalute 3 tree-based models: a single decision tree; a random forest and a gradient-boosting tree (both of these are \"ensemble\" models made up of many smaller sub-models, typicaly themselves single decision trees).\n",
424 |     "\n",
425 |     "Tree ensemble models are very flexible and powerful, and typically perform well \"out the box\" in particular on tabular datasets such as we have here. As we have seen, they also require less feature pre-processing and engineering in general than, for example, linear models."
426 |    ]
427 |   },
428 |   {
429 |    "cell_type": "code",
430 |    "execution_count": null,
431 |    "metadata": {
432 |     "papermill": {
433 |      "duration": 0.16584,
434 |      "end_time": "2020-11-18T10:43:17.729971",
435 |      "exception": false,
436 |      "start_time": "2020-11-18T10:43:17.564131",
437 |      "status": "completed"
438 |     },
439 |     "tags": []
440 |    },
441 |    "outputs": [],
442 |    "source": [
443 |     "from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n",
444 |     "from sklearn.tree import DecisionTreeClassifier\n",
445 |     "from sklearn.model_selection import cross_val_score, cross_validate\n",
446 |     "                 \n",
447 |     "dt = DecisionTreeClassifier()\n",
448 |     "rf = RandomForestClassifier()\n",
449 |     "gb = GradientBoostingClassifier()"
450 |    ]
451 |   },
452 |   {
453 |    "cell_type": "markdown",
454 |    "metadata": {
455 |     "papermill": {
456 |      "duration": 0.044233,
457 |      "end_time": "2020-11-18T10:43:17.806791",
458 |      "exception": false,
459 |      "start_time": "2020-11-18T10:43:17.762558",
460 |      "status": "completed"
461 |     },
462 |     "tags": []
463 |    },
464 |    "source": [
465 |     "We have split out dataset into a training and test set. However, the test set itself should never be directly used in model training, but only to perform a final model evaluation. This gives an estimate on how the model might perform in the \"real world\".\n",
466 |     "\n",
467 |     "We would still like to perform model selection, which means we need to evaluate our models using the training set in some way. To avoid over-fitting on the training set, as well as to give a good estimate on how the model may perform on our test set, we will use K-fold cross-validation on our training set.\n",
468 |     "\n",
469 |     "This splits the dataset into `k` (in our case `5`) non-overlapping subsets (`folds`). In turn, the model is trained on 4 of these (80% of training data) and evaluated on 1 (20% of training data). This is repeated `k` times and the evaluation scores are averaged across each of the `k` runs. This averaged metric typically gives a fairly good indication of how the model performs on unseen data.\n",
470 |     "\n",
471 |     "`scikit-learn` provides us this functionality, built-in and easy to use!\n",
472 |     "\n",
473 |     "**Note** As we see in the analysis notebook, we are dealing with some degree of class imbalance - on-time flights are far more prevelant compared to delayed flights (80% / 20% split). So, we need to be cautious when evaluting the performance of such models. For example, if we use `accuracy` as a metric, then a simple rule that classifies all flights as `on-time` would achieve 80% accuracy, which sounds very good! However, the model is completely unable to actually predict whether a flight will be delayed, so is useless for any real-world application.\n",
474 |     "\n",
475 |     "A common metric used for binary classification is the area under the ROC curve (`roc_auc`). However, this metric can sometimes provide an unclear picture for imbalanced classes.\n",
476 |     "\n",
477 |     "There are a few metrics that try to alleviate this problem for binary classification problems. We will be using `F1 score` as our metric for selecting the model to use, since it can handle the class imbalance problem. _Note_ that the selection of metric also depends on the particular use case."
478 |    ]
479 |   },
480 |   {
481 |    "cell_type": "code",
482 |    "execution_count": null,
483 |    "metadata": {
484 |     "papermill": {
485 |      "duration": 8.689184,
486 |      "end_time": "2020-11-18T10:43:26.538189",
487 |      "exception": false,
488 |      "start_time": "2020-11-18T10:43:17.849005",
489 |      "status": "completed"
490 |     },
491 |     "tags": []
492 |    },
493 |    "outputs": [],
494 |    "source": [
495 |     "metric = 'f1'\n",
496 |     "scores = cross_val_score(dt, X_train, y_train, cv=5, scoring=metric)\n",
497 |     "dt_score = np.mean(scores)\n",
498 |     "\n",
499 |     "scores = cross_val_score(rf, X_train, y_train, cv=5, scoring=metric)\n",
500 |     "rf_score = np.mean(scores)\n",
501 |     "\n",
502 |     "scores = cross_val_score(gb, X_train, y_train, cv=5, scoring=metric)\n",
503 |     "gb_score = np.mean(scores)"
504 |    ]
505 |   },
506 |   {
507 |    "cell_type": "code",
508 |    "execution_count": null,
509 |    "metadata": {
510 |     "papermill": {
511 |      "duration": 0.286596,
512 |      "end_time": "2020-11-18T10:43:26.858782",
513 |      "exception": false,
514 |      "start_time": "2020-11-18T10:43:26.572186",
515 |      "status": "completed"
516 |     },
517 |     "tags": []
518 |    },
519 |    "outputs": [],
520 |    "source": [
521 |     "cv_scores = [dt_score, rf_score, gb_score]\n",
522 |     "plt.figure(figsize=(16, 6))\n",
523 |     "sns.barplot(x=['DecisionTreeClassifier', 'RandomForestClassifier', 'GradientBoostingClassifier'], y=cv_scores)\n",
524 |     "plt.show()\n",
525 |     "\n",
526 |     "print('Average {} for DecisionTreeClassifier: {}'.format(metric, dt_score))\n",
527 |     "print('Average {} for RandomForestClassifier: {}'.format(metric, rf_score))\n",
528 |     "print('Average {} for GradientBoostingClassifier: {}'.format(metric, gb_score))"
529 |    ]
530 |   },
531 |   {
532 |    "cell_type": "markdown",
533 |    "metadata": {
534 |     "papermill": {
535 |      "duration": 0.050985,
536 |      "end_time": "2020-11-18T10:43:26.952118",
537 |      "exception": false,
538 |      "start_time": "2020-11-18T10:43:26.901133",
539 |      "status": "completed"
540 |     },
541 |     "tags": []
542 |    },
543 |    "source": [
544 |     "Based on this, we will select the `DecisionTreeClassifier`.\n",
545 |     "\n",
546 |     "**Note** based on the `auc_roc` metric, we would have selected the `GradientBoostingClassifier` - try it out in the cells above to see and then compare the model performance later on.\n",
547 |     "\n",
548 |     "We can also evaluate the impact of adding our weather features on model performance:"
549 |    ]
550 |   },
551 |   {
552 |    "cell_type": "code",
553 |    "execution_count": null,
554 |    "metadata": {
555 |     "papermill": {
556 |      "duration": 0.490133,
557 |      "end_time": "2020-11-18T10:43:27.499197",
558 |      "exception": false,
559 |      "start_time": "2020-11-18T10:43:27.009064",
560 |      "status": "completed"
561 |     },
562 |     "tags": []
563 |    },
564 |    "outputs": [],
565 |    "source": [
566 |     "scores = cross_val_score(dt, X_train_cat, y_train, cv=5, scoring=metric)\n",
567 |     "cat_score = np.mean(scores)\n",
568 |     "\n",
569 |     "scores = cross_val_score(dt, X_train_num, y_train, cv=5, scoring=metric)\n",
570 |     "num_score = np.mean(scores)\n",
571 |     "\n",
572 |     "scores = cross_val_score(dt, X_train, y_train, cv=5, scoring=metric)\n",
573 |     "all_score = np.mean(scores)"
574 |    ]
575 |   },
576 |   {
577 |    "cell_type": "code",
578 |    "execution_count": null,
579 |    "metadata": {
580 |     "papermill": {
581 |      "duration": 0.270508,
582 |      "end_time": "2020-11-18T10:43:27.797814",
583 |      "exception": false,
584 |      "start_time": "2020-11-18T10:43:27.527306",
585 |      "status": "completed"
586 |     },
587 |     "tags": []
588 |    },
589 |    "outputs": [],
590 |    "source": [
591 |     "cv_scores = [cat_score, num_score, all_score]\n",
592 |     "plt.figure(figsize=(16, 6))\n",
593 |     "sns.barplot(x=['Flight features', 'Weather features', 'Flight + Weather features'], y=cv_scores)\n",
594 |     "plt.show()\n",
595 |     "\n",
596 |     "print('Average {} for only flight delay features: {}'.format(metric, cat_score))\n",
597 |     "print('Average {} for only weather features: {}'.format(metric, num_score))\n",
598 |     "print('Average {} for all features: {}'.format(metric, all_score))"
599 |    ]
600 |   },
601 |   {
602 |    "cell_type": "markdown",
603 |    "metadata": {
604 |     "papermill": {
605 |      "duration": 0.03945,
606 |      "end_time": "2020-11-18T10:43:27.867100",
607 |      "exception": false,
608 |      "start_time": "2020-11-18T10:43:27.827650",
609 |      "status": "completed"
610 |     },
611 |     "tags": []
612 |    },
613 |    "source": [
614 |     "We see that using only weather features does little better than random guessing, while adding weather features to the flight features increases our metric by around `0.01`. This is not a very large amount, but it does indicate that information about weather helps a little with predictions. In some applications, even small increases in model performance can be significant.\n",
615 |     "\n",
616 |     "Finally, we re-train the model on the full training dataset and perform a final classification evaluation on the test set."
617 |    ]
618 |   },
619 |   {
620 |    "cell_type": "code",
621 |    "execution_count": null,
622 |    "metadata": {
623 |     "papermill": {
624 |      "duration": 0.091014,
625 |      "end_time": "2020-11-18T10:43:27.992675",
626 |      "exception": false,
627 |      "start_time": "2020-11-18T10:43:27.901661",
628 |      "status": "completed"
629 |     },
630 |     "tags": []
631 |    },
632 |    "outputs": [],
633 |    "source": [
634 |     "from sklearn.metrics import confusion_matrix, roc_auc_score, f1_score, classification_report\n",
635 |     "from sklearn.metrics import plot_roc_curve, plot_confusion_matrix, plot_precision_recall_curve\n",
636 |     "\n",
637 |     "# fit on full data\n",
638 |     "dt.fit(X_train, y_train)\n",
639 |     "y_prob = dt.predict_proba(X_test)[:, 1]\n",
640 |     "y_pred = dt.predict(X_test)\n",
641 |     "\n",
642 |     "f1_test = f1_score(y_test, y_prob)\n",
643 |     "roc_auc_test = roc_auc_score(y_test, y_prob)\n",
644 |     "print('Final {} for test set: {}'.format(metric, f1_test))"
645 |    ]
646 |   },
647 |   {
648 |    "cell_type": "markdown",
649 |    "metadata": {
650 |     "papermill": {
651 |      "duration": 0.044603,
652 |      "end_time": "2020-11-18T10:43:28.076712",
653 |      "exception": false,
654 |      "start_time": "2020-11-18T10:43:28.032109",
655 |      "status": "completed"
656 |     },
657 |     "tags": []
658 |    },
659 |    "source": [
660 |     "We export the trained model and a few example rows from the test dataset, for potential use by downstream stages."
661 |    ]
662 |   },
663 |   {
664 |    "cell_type": "code",
665 |    "execution_count": null,
666 |    "metadata": {
667 |     "papermill": {
668 |      "duration": 0.048674,
669 |      "end_time": "2020-11-18T10:43:28.166514",
670 |      "exception": false,
671 |      "start_time": "2020-11-18T10:43:28.117840",
672 |      "status": "completed"
673 |     },
674 |     "tags": []
675 |    },
676 |    "outputs": [],
677 |    "source": [
678 |     "# save the model file for downstream tasks\n",
679 |     "from joblib import dump\n",
680 |     "dump(dt, '{}/model.joblib'.format(MODEL_EXPORT_FOLDER))\n",
681 |     "\n",
682 |     "# also save a few example rows\n",
683 |     "np.save('data/test_rows.npy', X_test[:10])"
684 |    ]
685 |   },
686 |   {
687 |    "cell_type": "code",
688 |    "execution_count": null,
689 |    "metadata": {
690 |     "papermill": {
691 |      "duration": 0.054397,
692 |      "end_time": "2020-11-18T10:43:28.262088",
693 |      "exception": false,
694 |      "start_time": "2020-11-18T10:43:28.207691",
695 |      "status": "completed"
696 |     },
697 |     "tags": []
698 |    },
699 |    "outputs": [],
700 |    "source": [
701 |     "# export metrics for KFP\n",
702 |     "metrics = {\n",
703 |     "    'metrics': [\n",
704 |     "        {\n",
705 |     "            'name': 'f1_score',\n",
706 |     "            'numberValue':  f1_test,\n",
707 |     "            'format': 'RAW'\n",
708 |     "        },\n",
709 |     "        {\n",
710 |     "            'name': 'roc_auc_score',\n",
711 |     "            'numberValue':  roc_auc_test,\n",
712 |     "            'format': 'RAW'       \n",
713 |     "        }\n",
714 |     "    ]\n",
715 |     "  }\n",
716 |     "\n",
717 |     "with open('mlpipeline-metrics.json', 'w') as f:\n",
718 |     "    json.dump(metrics, f)"
719 |    ]
720 |   },
721 |   {
722 |    "cell_type": "code",
723 |    "execution_count": null,
724 |    "metadata": {
725 |     "papermill": {
726 |      "duration": 0.503831,
727 |      "end_time": "2020-11-18T10:43:28.793550",
728 |      "exception": false,
729 |      "start_time": "2020-11-18T10:43:28.289719",
730 |      "status": "completed"
731 |     },
732 |     "tags": []
733 |    },
734 |    "outputs": [],
735 |    "source": [
736 |     "fig = plt.figure(figsize=(16, 6))\n",
737 |     "plt.subplot(121)\n",
738 |     "plot_roc_curve(dt, X_test, y_test, ax=fig.gca())\n",
739 |     "plt.subplot(122)\n",
740 |     "plot_precision_recall_curve(dt, X_test, y_test, ax=fig.gca())\n",
741 |     "plt.show()"
742 |    ]
743 |   },
744 |   {
745 |    "cell_type": "code",
746 |    "execution_count": null,
747 |    "metadata": {
748 |     "papermill": {
749 |      "duration": 0.047271,
750 |      "end_time": "2020-11-18T10:43:28.871750",
751 |      "exception": false,
752 |      "start_time": "2020-11-18T10:43:28.824479",
753 |      "status": "completed"
754 |     },
755 |     "tags": []
756 |    },
757 |    "outputs": [],
758 |    "source": [
759 |     "print(classification_report(y_test, y_pred, target_names=['On-time', 'Delayed']))"
760 |    ]
761 |   },
762 |   {
763 |    "cell_type": "code",
764 |    "execution_count": null,
765 |    "metadata": {
766 |     "papermill": {
767 |      "duration": 0.426407,
768 |      "end_time": "2020-11-18T10:43:29.332485",
769 |      "exception": false,
770 |      "start_time": "2020-11-18T10:43:28.906078",
771 |      "status": "completed"
772 |     },
773 |     "tags": []
774 |    },
775 |    "outputs": [],
776 |    "source": [
777 |     "cm = confusion_matrix(y_test, y_pred)\n",
778 |     "class_labels = ['On-time', 'Delayed']\n",
779 |     "labels = ['{0:0.0f}'.format(value) for value in\n",
780 |     "                cm.flatten()]\n",
781 |     "labels = np.asarray(labels).reshape(2,2)\n",
782 |     "fig = plt.figure(figsize=(12, 8))\n",
783 |     "chart = sns.heatmap(\n",
784 |     "    cm, annot=labels, fmt='', cmap='Blues',\n",
785 |     "    xticklabels=class_labels, yticklabels=class_labels)\n",
786 |     "chart.set_xlabel('Predicted label')\n",
787 |     "chart.set_ylabel('True label')\n",
788 |     "chart.set_title('Confusion Matrix')\n",
789 |     "plt.show()"
790 |    ]
791 |   },
792 |   {
793 |    "cell_type": "code",
794 |    "execution_count": null,
795 |    "metadata": {
796 |     "papermill": {
797 |      "duration": 0.094854,
798 |      "end_time": "2020-11-18T10:43:29.491225",
799 |      "exception": false,
800 |      "start_time": "2020-11-18T10:43:29.396371",
801 |      "status": "completed"
802 |     },
803 |     "tags": []
804 |    },
805 |    "outputs": [],
806 |    "source": [
807 |     "# export confusion matrix for KFP\n",
808 |     "cm_data = []\n",
809 |     "for target_index, target_row in enumerate(cm):\n",
810 |     "    for predicted_index, count in enumerate(target_row):\n",
811 |     "        cm_data.append((class_labels[target_index], class_labels[predicted_index], count))\n",
812 |     "        \n",
813 |     "ui_metadata = {\n",
814 |     "    'outputs' : [{\n",
815 |     "        'type': 'confusion_matrix',\n",
816 |     "        'format': 'csv',\n",
817 |     "        'schema': [\n",
818 |     "            {'name': 'target', 'type': 'CATEGORY'},\n",
819 |     "            {'name': 'predicted', 'type': 'CATEGORY'},\n",
820 |     "            {'name': 'count', 'type': 'NUMBER'},\n",
821 |     "        ],\n",
822 |     "        'source': pd.DataFrame(cm_data).to_csv(header=False, index=False),\n",
823 |     "        'storage': 'inline',\n",
824 |     "        'labels': ['Delayed', 'On-time'],\n",
825 |     "    }]\n",
826 |     "}\n",
827 |     "\n",
828 |     "with open('mlpipeline-ui-metadata.json', 'w') as f:\n",
829 |     "    json.dump(ui_metadata, f)"
830 |    ]
831 |   },
832 |   {
833 |    "cell_type": "markdown",
834 |    "metadata": {
835 |     "papermill": {
836 |      "duration": 0.077766,
837 |      "end_time": "2020-11-18T10:43:29.636822",
838 |      "exception": false,
839 |      "start_time": "2020-11-18T10:43:29.559056",
840 |      "status": "completed"
841 |     },
842 |     "tags": []
843 |    },
844 |    "source": [
845 |     "If we investigate the various classification charts and reports, we can see that our problem of classifying whether a flight will be delayed is a tricky one.\n",
846 |     "\n",
847 |     "As one might expect, the model predicts most `on-time` flights as `on-time` (80%). However, it struggles to correctly predict `delayed` flights, instead classifying them as `on-time`. In fact it only correctly predicts delays 28% of the time! (this is the `recall` figure for `Delayed` in the classification report table). When it predicts a delayed flight, it is correct only 25% of the time (this is the `precision` field).\n",
848 |     "\n",
849 |     "Overall, we would say that our model is doing a mediocre job of predicting flight delays - we either need to do a lot more model tuning and hyper-parameter selection, or use more data and better features.\n",
850 |     "\n",
851 |     "Perhaps you can try to find ways to improve the performance!\n",
852 |     "\n",
853 |     "Finally, we can generate a list of \"feature importances\" to see what the model is focusing on for making predictions:"
854 |    ]
855 |   },
856 |   {
857 |    "cell_type": "code",
858 |    "execution_count": null,
859 |    "metadata": {
860 |     "papermill": {
861 |      "duration": 0.679681,
862 |      "end_time": "2020-11-18T10:43:30.395646",
863 |      "exception": false,
864 |      "start_time": "2020-11-18T10:43:29.715965",
865 |      "status": "completed"
866 |     },
867 |     "tags": []
868 |    },
869 |    "outputs": [],
870 |    "source": [
871 |     "feat_names = list(df_train_cat.columns.values) + list(df_train[num_columns].columns.values)\n",
872 |     "feat_nb = dt.feature_importances_\n",
873 |     "plt.figure(figsize=(16, 8))\n",
874 |     "chart = sns.barplot(x=feat_names, y=feat_nb, palette='Blues')\n",
875 |     "chart.set_xticklabels(\n",
876 |     "    chart.get_xticklabels(), \n",
877 |     "    rotation=45, \n",
878 |     "    horizontalalignment='right',\n",
879 |     "    fontweight='light',\n",
880 |     "    fontsize='large'\n",
881 |     ")\n",
882 |     "plt.show()"
883 |    ]
884 |   },
885 |   {
886 |    "cell_type": "markdown",
887 |    "metadata": {
888 |     "papermill": {
889 |      "duration": 0.046187,
890 |      "end_time": "2020-11-18T10:43:30.490059",
891 |      "exception": false,
892 |      "start_time": "2020-11-18T10:43:30.443872",
893 |      "status": "completed"
894 |     },
895 |     "tags": []
896 |    },
897 |    "source": [
898 |     "Of the flight features, the time-based features as well as departure time and destination seem to be most important. For weather features, wind speed and visibility seem to be dominant in importance."
899 |    ]
900 |   },
901 |   {
902 |    "cell_type": "markdown",
903 |    "metadata": {
904 |     "papermill": {
905 |      "duration": 0.048653,
906 |      "end_time": "2020-11-18T10:43:30.584400",
907 |      "exception": false,
908 |      "start_time": "2020-11-18T10:43:30.535747",
909 |      "status": "completed"
910 |     },
911 |     "tags": []
912 |    },
913 |    "source": [
914 |     "### Authors\n",
915 |     "This notebook was created by the [Center for Open-Source Data & AI Technologies](http://codait.org).\n",
916 |     "\n",
917 |     "Copyright © 2019 IBM. This notebook and its source code are released under the terms of the MIT License."
918 |    ]
919 |   }
920 |  ],
921 |  "metadata": {
922 |   "kernelspec": {
923 |    "display_name": "Python 3",
924 |    "language": "python",
925 |    "name": "python3"
926 |   },
927 |   "language_info": {
928 |    "codemirror_mode": {
929 |     "name": "ipython",
930 |     "version": 3
931 |    },
932 |    "file_extension": ".py",
933 |    "mimetype": "text/x-python",
934 |    "name": "python",
935 |    "nbconvert_exporter": "python",
936 |    "pygments_lexer": "ipython3",
937 |    "version": "3.7.9"
938 |   },
939 |   "papermill": {
940 |    "duration": 22.125901,
941 |    "end_time": "2020-11-18T10:43:31.907543",
942 |    "environment_variables": {},
943 |    "exception": null,
944 |    "input_path": "/Users/nick/workspace/python/flight-delay-notebooks/notebooks/predict_flight_delays.ipynb",
945 |    "output_path": "/Users/nick/workspace/python/flight-delay-notebooks/notebooks/predict_flight_delays.ipynb",
946 |    "parameters": {},
947 |    "start_time": "2020-11-18T10:43:09.781642",
948 |    "version": "2.1.1"
949 |   },
950 |   "toc-autonumbering": false,
951 |   "toc-showcode": false,
952 |   "toc-showmarkdowntxt": false,
953 |   "toc-showtags": false
954 |  },
955 |  "nbformat": 4,
956 |  "nbformat_minor": 4
957 | }
958 | 


--------------------------------------------------------------------------------
/notebooks/process_flight_data.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "papermill": {
  7 |      "duration": 0.012298,
  8 |      "end_time": "2020-11-18T10:41:24.439638",
  9 |      "exception": false,
 10 |      "start_time": "2020-11-18T10:41:24.427340",
 11 |      "status": "completed"
 12 |     },
 13 |     "tags": []
 14 |    },
 15 |    "source": [
 16 |     "# Processing the Airline Reporting Carrier On-Time Performance Dataset\n",
 17 |     "\n",
 18 |     "This notebook relates to the Airline Reporting Carrier On-Time Performance Dataset. The dataset  contains information on approximately 200 million domestic US flights reported to the United States Bureau of Transportation Statistics, from 1987 - 2020. This dataset is freely available from the IBM Developer [Data Asset Exchange](https://developer.ibm.com/exchanges/data/all/airline/).\n",
 19 |     "\n",
 20 |     "**Note** the full dataset is very large (over 80GB uncompressed), so here we work with a smaller sample dataset containing a total of 2 million rows.\n",
 21 |     "\n",
 22 |     "In this notebook, we process the raw dataset by:\n",
 23 |     "* selecting the columns we wish to keep for later analysis\n",
 24 |     "* converting and cleaning data where required\n",
 25 |     "* handling missing values\n",
 26 |     "\n",
 27 |     "#### Import required modules\n",
 28 |     "\n",
 29 |     "Import and configure the required modules."
 30 |    ]
 31 |   },
 32 |   {
 33 |    "cell_type": "code",
 34 |    "execution_count": null,
 35 |    "metadata": {
 36 |     "papermill": {
 37 |      "duration": 0.020919,
 38 |      "end_time": "2020-11-18T10:41:24.474707",
 39 |      "exception": false,
 40 |      "start_time": "2020-11-18T10:41:24.453788",
 41 |      "status": "completed"
 42 |     },
 43 |     "tags": []
 44 |    },
 45 |    "outputs": [],
 46 |    "source": [
 47 |     "# !pip install pandas > /dev/null 2>&1"
 48 |    ]
 49 |   },
 50 |   {
 51 |    "cell_type": "code",
 52 |    "execution_count": null,
 53 |    "metadata": {
 54 |     "papermill": {
 55 |      "duration": 0.938723,
 56 |      "end_time": "2020-11-18T10:41:25.426277",
 57 |      "exception": false,
 58 |      "start_time": "2020-11-18T10:41:24.487554",
 59 |      "status": "completed"
 60 |     },
 61 |     "tags": []
 62 |    },
 63 |    "outputs": [],
 64 |    "source": [
 65 |     "# Define required imports\n",
 66 |     "import pandas as pd\n",
 67 |     "# These set pandas max column and row display in the notebook\n",
 68 |     "pd.set_option('display.max_columns', 50)\n",
 69 |     "pd.set_option('display.max_rows', 50)"
 70 |    ]
 71 |   },
 72 |   {
 73 |    "cell_type": "markdown",
 74 |    "metadata": {
 75 |     "papermill": {
 76 |      "duration": 0.023472,
 77 |      "end_time": "2020-11-18T10:41:25.467588",
 78 |      "exception": false,
 79 |      "start_time": "2020-11-18T10:41:25.444116",
 80 |      "status": "completed"
 81 |     },
 82 |     "tags": []
 83 |    },
 84 |    "source": [
 85 |     "### Read the Raw Data\n",
 86 |     "\n",
 87 |     "We start by reading in the raw dataset and displaying the first few rows of the dataframe."
 88 |    ]
 89 |   },
 90 |   {
 91 |    "cell_type": "code",
 92 |    "execution_count": null,
 93 |    "metadata": {
 94 |     "papermill": {
 95 |      "duration": 51.607155,
 96 |      "end_time": "2020-11-18T10:42:17.099275",
 97 |      "exception": false,
 98 |      "start_time": "2020-11-18T10:41:25.492120",
 99 |      "status": "completed"
100 |     },
101 |     "tags": []
102 |    },
103 |    "outputs": [],
104 |    "source": [
105 |     "data_path = 'data/airline_2m.csv'\n",
106 |     "raw_data = pd.read_csv(data_path, encoding = \"ISO-8859-1\", parse_dates=['FlightDate'],\n",
107 |     "                 dtype={'Div1Airport': str, 'Div1TailNum': str, 'Div2Airport': str, 'Div2TailNum': str})\n",
108 |     "raw_data.head()"
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "markdown",
113 |    "metadata": {
114 |     "papermill": {
115 |      "duration": 0.02727,
116 |      "end_time": "2020-11-18T10:42:17.155366",
117 |      "exception": false,
118 |      "start_time": "2020-11-18T10:42:17.128096",
119 |      "status": "completed"
120 |     },
121 |     "tags": []
122 |    },
123 |    "source": [
124 |     "### Clean the Data\n",
125 |     "\n",
126 |     "Fortunately, the airline delay dataset is relatively clean already! The fields we wish to use already represent variables such as unique codes for the airline, origin and destination. There are also fields representing binned variables for departure time slot and flight distance.\n",
127 |     "\n",
128 |     "We will select a subset of the data relating to years 2010-2017, with origin airport `JFK`, to match our weather data. For simplicity, we will focus on delayed flights and ignore flight cancellations. We will ignore arrival delays, hence we will be focusing on predicting \"departure delays\" _from JFK_ to other destinations, using the field `DepDel15`. This is a binary value indicating whether the flight was delayed by more than 15 minutes (deemed to be `delayed`) or not (deemed to be `on time`)."
129 |    ]
130 |   },
131 |   {
132 |    "cell_type": "code",
133 |    "execution_count": null,
134 |    "metadata": {
135 |     "papermill": {
136 |      "duration": 6.760619,
137 |      "end_time": "2020-11-18T10:42:23.945236",
138 |      "exception": false,
139 |      "start_time": "2020-11-18T10:42:17.184617",
140 |      "status": "completed"
141 |     },
142 |     "tags": []
143 |    },
144 |    "outputs": [],
145 |    "source": [
146 |     "# Select the data sub-set for years 2010-2017 for flights originating from JFK\n",
147 |     "jfk_flights = raw_data.copy()\n",
148 |     "jfk_flights = jfk_flights[(jfk_flights['Origin'] == 'JFK') & (jfk_flights['Year'].isin(range(2010, 2018))) & (jfk_flights['Cancelled'] == 0)]\n",
149 |     "jfk_flights.head()"
150 |    ]
151 |   },
152 |   {
153 |    "cell_type": "markdown",
154 |    "metadata": {
155 |     "papermill": {
156 |      "duration": 0.020117,
157 |      "end_time": "2020-11-18T10:42:23.994662",
158 |      "exception": false,
159 |      "start_time": "2020-11-18T10:42:23.974545",
160 |      "status": "completed"
161 |     },
162 |     "tags": []
163 |    },
164 |    "source": [
165 |     "We create a mapping of airline id to a more readable airline name (see the [airline ID looukp table](https://www.transtats.bts.gov/Download_Lookup.asp?Lookup=L_AIRLINE_ID))."
166 |    ]
167 |   },
168 |   {
169 |    "cell_type": "code",
170 |    "execution_count": null,
171 |    "metadata": {
172 |     "papermill": {
173 |      "duration": 0.039415,
174 |      "end_time": "2020-11-18T10:42:24.057478",
175 |      "exception": false,
176 |      "start_time": "2020-11-18T10:42:24.018063",
177 |      "status": "completed"
178 |     },
179 |     "tags": []
180 |    },
181 |    "outputs": [],
182 |    "source": [
183 |     "airline_codes = {\n",
184 |     "    20409: 'JetBlue',\n",
185 |     "    19790: 'Delta',\n",
186 |     "    19805: 'American Airlines',\n",
187 |     "    20398: 'Envoy Air',\n",
188 |     "    19977: 'United',\n",
189 |     "    21171: 'Virgin America',\n",
190 |     "    20363: 'Endeavor Air',\n",
191 |     "    20417: 'Comair',\n",
192 |     "    20355: 'US Airways',\n",
193 |     "    20366: 'ExpressJet',\n",
194 |     "    19690: 'Hawaiian Airlines',\n",
195 |     "    19930: 'Alaska Airlines',\n",
196 |     "    20378: 'Mesa Airlines'\n",
197 |     "}"
198 |    ]
199 |   },
200 |   {
201 |    "cell_type": "code",
202 |    "execution_count": null,
203 |    "metadata": {
204 |     "papermill": {
205 |      "duration": 0.075155,
206 |      "end_time": "2020-11-18T10:42:24.154423",
207 |      "exception": false,
208 |      "start_time": "2020-11-18T10:42:24.079268",
209 |      "status": "completed"
210 |     },
211 |     "tags": []
212 |    },
213 |    "outputs": [],
214 |    "source": [
215 |     "jfk_flights.loc[:, 'airline_name'] = jfk_flights['DOT_ID_Reporting_Airline'].map(lambda x: airline_codes[x])"
216 |    ]
217 |   },
218 |   {
219 |    "cell_type": "markdown",
220 |    "metadata": {
221 |     "papermill": {
222 |      "duration": 0.015748,
223 |      "end_time": "2020-11-18T10:42:24.188117",
224 |      "exception": false,
225 |      "start_time": "2020-11-18T10:42:24.172369",
226 |      "status": "completed"
227 |     },
228 |     "tags": []
229 |    },
230 |    "source": [
231 |     "Next, we select the set of columns to keep for downstream analysis and rename the columns to `snake_case` for consistency with our processed weather dataset."
232 |    ]
233 |   },
234 |   {
235 |    "cell_type": "code",
236 |    "execution_count": null,
237 |    "metadata": {
238 |     "papermill": {
239 |      "duration": 0.061216,
240 |      "end_time": "2020-11-18T10:42:24.265686",
241 |      "exception": false,
242 |      "start_time": "2020-11-18T10:42:24.204470",
243 |      "status": "completed"
244 |     },
245 |     "tags": []
246 |    },
247 |    "outputs": [],
248 |    "source": [
249 |     "cols_to_keep = ['FlightDate', 'Month', 'DayofMonth', 'DayOfWeek', 'DOT_ID_Reporting_Airline', 'airline_name', 'Origin', 'Dest', 'CRSDepTime', 'DepTimeBlk', 'DistanceGroup', 'DepDel15', 'DepDelay']\n",
250 |     "jfk_flights = jfk_flights[cols_to_keep]\n",
251 |     "col_names = {\n",
252 |     "    'FlightDate': 'flight_date',\n",
253 |     "    'Month': 'month',\n",
254 |     "    'DayofMonth': 'day_of_month',\n",
255 |     "    'DayOfWeek': 'day_of_week',\n",
256 |     "    'DOT_ID_Reporting_Airline': 'airline_id',\n",
257 |     "    'Origin': 'origin',\n",
258 |     "    'Dest': 'dest',\n",
259 |     "    'CRSDepTime': 'sched_dep_time',\n",
260 |     "    'DepTimeBlk': 'dep_time_bin',\n",
261 |     "    'DistanceGroup': 'distance_bin',\n",
262 |     "    'DepDel15': 'delayed',\n",
263 |     "    'DepDelay': 'dep_delay'\n",
264 |     "}\n",
265 |     "jfk_flights_renamed = jfk_flights.rename(columns=col_names)\n",
266 |     "jfk_flights_renamed.head()"
267 |    ]
268 |   },
269 |   {
270 |    "cell_type": "code",
271 |    "execution_count": null,
272 |    "metadata": {
273 |     "papermill": {
274 |      "duration": 0.043619,
275 |      "end_time": "2020-11-18T10:42:24.324809",
276 |      "exception": false,
277 |      "start_time": "2020-11-18T10:42:24.281190",
278 |      "status": "completed"
279 |     },
280 |     "tags": []
281 |    },
282 |    "outputs": [],
283 |    "source": [
284 |     "# Log some general information about the dataset\n",
285 |     "print('# of columns: ' + str(jfk_flights_renamed.shape[1])) \n",
286 |     "print('# of observations: ' + str(jfk_flights_renamed.shape[0]))\n",
287 |     "print('Start date: ' + str(jfk_flights_renamed['flight_date'].min()))\n",
288 |     "print('End date: ' + str(jfk_flights_renamed['flight_date'].max()))"
289 |    ]
290 |   },
291 |   {
292 |    "cell_type": "markdown",
293 |    "metadata": {
294 |     "papermill": {
295 |      "duration": 0.020485,
296 |      "end_time": "2020-11-18T10:42:24.374384",
297 |      "exception": false,
298 |      "start_time": "2020-11-18T10:42:24.353899",
299 |      "status": "completed"
300 |     },
301 |     "tags": []
302 |    },
303 |    "source": [
304 |     "### Save the Processed Data\n",
305 |     "\n",
306 |     "Finally, we save the processed dataset for use by downstream tasks."
307 |    ]
308 |   },
309 |   {
310 |    "cell_type": "code",
311 |    "execution_count": null,
312 |    "metadata": {
313 |     "papermill": {
314 |      "duration": 0.245405,
315 |      "end_time": "2020-11-18T10:42:24.638852",
316 |      "exception": false,
317 |      "start_time": "2020-11-18T10:42:24.393447",
318 |      "status": "completed"
319 |     },
320 |     "tags": []
321 |    },
322 |    "outputs": [],
323 |    "source": [
324 |     "jfk_flights_renamed.to_csv('data/jfk_flight_features.csv', index=False, float_format='%g')"
325 |    ]
326 |   },
327 |   {
328 |    "cell_type": "markdown",
329 |    "metadata": {
330 |     "papermill": {
331 |      "duration": 0.027191,
332 |      "end_time": "2020-11-18T10:42:24.695646",
333 |      "exception": false,
334 |      "start_time": "2020-11-18T10:42:24.668455",
335 |      "status": "completed"
336 |     },
337 |     "tags": []
338 |    },
339 |    "source": [
340 |     " ### Authors\n",
341 |     " \n",
342 |     " This notebook was created by the [Center for Open-Source Data & AI Technologies](http://codait.org).\n",
343 |     "\n",
344 |     "Copyright © 2020 IBM. This notebook and its source code are released under the terms of the MIT License."
345 |    ]
346 |   }
347 |  ],
348 |  "metadata": {
349 |   "kernelspec": {
350 |    "display_name": "Python 3",
351 |    "language": "python",
352 |    "name": "python3"
353 |   },
354 |   "language_info": {
355 |    "codemirror_mode": {
356 |     "name": "ipython",
357 |     "version": 3
358 |    },
359 |    "file_extension": ".py",
360 |    "mimetype": "text/x-python",
361 |    "name": "python",
362 |    "nbconvert_exporter": "python",
363 |    "pygments_lexer": "ipython3",
364 |    "version": "3.7.9"
365 |   },
366 |   "papermill": {
367 |    "duration": 62.871732,
368 |    "end_time": "2020-11-18T10:42:25.559608",
369 |    "environment_variables": {},
370 |    "exception": null,
371 |    "input_path": "/Users/nick/workspace/python/flight-delay-notebooks/notebooks/process_flight_data.ipynb",
372 |    "output_path": "/Users/nick/workspace/python/flight-delay-notebooks/notebooks/process_flight_data.ipynb",
373 |    "parameters": {},
374 |    "start_time": "2020-11-18T10:41:22.687876",
375 |    "version": "2.1.1"
376 |   },
377 |   "toc-autonumbering": false,
378 |   "toc-showcode": false,
379 |   "toc-showmarkdowntxt": false,
380 |   "toc-showtags": false
381 |  },
382 |  "nbformat": 4,
383 |  "nbformat_minor": 4
384 | }
385 | 


--------------------------------------------------------------------------------
/notebooks/process_weather_data.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "papermill": {
  7 |      "duration": 0.057903,
  8 |      "end_time": "2020-11-18T10:42:27.967529",
  9 |      "exception": false,
 10 |      "start_time": "2020-11-18T10:42:27.909626",
 11 |      "status": "completed"
 12 |     },
 13 |     "tags": []
 14 |    },
 15 |    "source": [
 16 |     "# Processing NOAA Weather Data of JFK Airport (New York)\n",
 17 |     "\n",
 18 |     "This notebook relates to the NOAA Weather Dataset - JFK Airport (New York). The dataset contains 114,546 hourly observations of 12 local climatological variables (such as temperature, wind speed and precipitation) collected at JFK airport. This dataset is freely available from the IBM Developer [Data Asset Exchange](https://developer.ibm.com/exchanges/data/all/jfk-weather-data/).\n",
 19 |     "\n",
 20 |     "In this notebook, we process the raw dataset by:\n",
 21 |     "* selecting the columns we wish to keep for later downstream tasks\n",
 22 |     "* converting and cleaning data where required\n",
 23 |     "* filling missing values\n",
 24 |     "* extracting categorical weather features\n",
 25 |     "\n",
 26 |     "#### Import required modules\n",
 27 |     "\n",
 28 |     "Import and configure the required modules."
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "code",
 33 |    "execution_count": null,
 34 |    "metadata": {
 35 |     "papermill": {
 36 |      "duration": 0.072604,
 37 |      "end_time": "2020-11-18T10:42:28.096347",
 38 |      "exception": false,
 39 |      "start_time": "2020-11-18T10:42:28.023743",
 40 |      "status": "completed"
 41 |     },
 42 |     "tags": []
 43 |    },
 44 |    "outputs": [],
 45 |    "source": [
 46 |     "# !pip install pandas > /dev/null 2>&1"
 47 |    ]
 48 |   },
 49 |   {
 50 |    "cell_type": "code",
 51 |    "execution_count": null,
 52 |    "metadata": {
 53 |     "papermill": {
 54 |      "duration": 1.236463,
 55 |      "end_time": "2020-11-18T10:42:29.391378",
 56 |      "exception": false,
 57 |      "start_time": "2020-11-18T10:42:28.154915",
 58 |      "status": "completed"
 59 |     },
 60 |     "tags": []
 61 |    },
 62 |    "outputs": [],
 63 |    "source": [
 64 |     "# Define required imports\n",
 65 |     "import pandas as pd\n",
 66 |     "import numpy as np\n",
 67 |     "import re\n",
 68 |     "# These set pandas max column and row display in the notebook\n",
 69 |     "pd.set_option('display.max_columns', 50)\n",
 70 |     "pd.set_option('display.max_rows', 50)"
 71 |    ]
 72 |   },
 73 |   {
 74 |    "cell_type": "markdown",
 75 |    "metadata": {
 76 |     "papermill": {
 77 |      "duration": 0.053432,
 78 |      "end_time": "2020-11-18T10:42:29.492354",
 79 |      "exception": false,
 80 |      "start_time": "2020-11-18T10:42:29.438922",
 81 |      "status": "completed"
 82 |     },
 83 |     "tags": []
 84 |    },
 85 |    "source": [
 86 |     "### Read the Raw Data\n",
 87 |     "\n",
 88 |     "We start by reading in the raw dataset, displaying the first few rows of the dataframe, and taking a look at the columns and column types present."
 89 |    ]
 90 |   },
 91 |   {
 92 |    "cell_type": "code",
 93 |    "execution_count": null,
 94 |    "metadata": {
 95 |     "papermill": {
 96 |      "duration": 2.248616,
 97 |      "end_time": "2020-11-18T10:42:31.818019",
 98 |      "exception": false,
 99 |      "start_time": "2020-11-18T10:42:29.569403",
100 |      "status": "completed"
101 |     },
102 |     "tags": []
103 |    },
104 |    "outputs": [],
105 |    "source": [
106 |     "raw_data = pd.read_csv('data/noaa-weather-data-jfk-airport/jfk_weather.csv', parse_dates=['DATE'])\n",
107 |     "raw_data.head()"
108 |    ]
109 |   },
110 |   {
111 |    "cell_type": "markdown",
112 |    "metadata": {
113 |     "papermill": {
114 |      "duration": 0.047672,
115 |      "end_time": "2020-11-18T10:42:31.916277",
116 |      "exception": false,
117 |      "start_time": "2020-11-18T10:42:31.868605",
118 |      "status": "completed"
119 |     },
120 |     "tags": []
121 |    },
122 |    "source": [
123 |     "### Clean the Data\n",
124 |     "\n",
125 |     "As you can see above, there are a lot of fields which are non-numerical - usually these will be fields that contain text or categorical data, e.g. `HOURLYPRSENTWEATHERTYPE`.\n",
126 |     "\n",
127 |     "There are also fields, such as `HOURLYVISIBILITY`, that we may expect to be numerical, but are instead `object` type. This often indicates that there may be missing (or `null`) values, or some other unusual readings that we may have to deal with (since otherwise the field would have been fully parsed as a numerical data type).\n",
128 |     "\n",
129 |     "In addition, some fields relate to hourly observations, while others relate to daily or monthly intervals. For purposes of later analysis, we will restrict the dataset to a certain subset  of fields that relate to hourly observations.\n",
130 |     "\n",
131 |     "In this section, we refer to the [NOAA Local Climatological Data Documentation](https://data.noaa.gov/dataset/dataset/u-s-local-climatological-data-lcd/resource/ee7381ea-647a-434f-8cfa-81202b9b4c05) to describe the fields and meaning of various values."
132 |    ]
133 |   },
134 |   {
135 |    "cell_type": "markdown",
136 |    "metadata": {
137 |     "papermill": {
138 |      "duration": 0.036204,
139 |      "end_time": "2020-11-18T10:42:31.987330",
140 |      "exception": false,
141 |      "start_time": "2020-11-18T10:42:31.951126",
142 |      "status": "completed"
143 |     },
144 |     "tags": []
145 |    },
146 |    "source": [
147 |     "#### Select data subset\n",
148 |     "\n",
149 |     "First, we select only the subset of data of interest. We will keep data for years 2010 - 2017 related to routine hourly weather station reports. We will also restrict our dataset to only a subest of column types that we expect to be pertinent for downstream tasks."
150 |    ]
151 |   },
152 |   {
153 |    "cell_type": "code",
154 |    "execution_count": null,
155 |    "metadata": {
156 |     "papermill": {
157 |      "duration": 0.213779,
158 |      "end_time": "2020-11-18T10:42:32.244195",
159 |      "exception": false,
160 |      "start_time": "2020-11-18T10:42:32.030416",
161 |      "status": "completed"
162 |     },
163 |     "tags": []
164 |    },
165 |    "outputs": [],
166 |    "source": [
167 |     "# Choose what columns to import from raw data\n",
168 |     "column_subset = [\n",
169 |     "    'DATE',\n",
170 |     "    'HOURLYVISIBILITY',\n",
171 |     "    'HOURLYPRSENTWEATHERTYPE',\n",
172 |     "    'HOURLYWindSpeed',\n",
173 |     "    'HOURLYWindGustSpeed',\n",
174 |     "    'HOURLYPrecip'\n",
175 |     "]\n",
176 |     "\n",
177 |     "# Select the data sub-set for years 2010-2017 & report type FM-15 (routine hourly weather reports)\n",
178 |     "data_subset = raw_data[(raw_data['DATE'].dt.year.isin(range(2010, 2018))) & (raw_data['REPORTTPYE'] == 'FM-15')]\n",
179 |     "# Filter dataset to relevant columns\n",
180 |     "weather_data = data_subset.loc[:, column_subset]\n",
181 |     "# Set date index\n",
182 |     "weather_data = weather_data.set_index(pd.DatetimeIndex(weather_data['DATE']))\n",
183 |     "weather_data.drop(['DATE'], axis=1, inplace=True)\n",
184 |     "weather_data.replace(to_replace='*', value=np.nan, inplace=True)\n",
185 |     "weather_data.head()"
186 |    ]
187 |   },
188 |   {
189 |    "cell_type": "code",
190 |    "execution_count": null,
191 |    "metadata": {
192 |     "papermill": {
193 |      "duration": 0.052686,
194 |      "end_time": "2020-11-18T10:42:32.344763",
195 |      "exception": false,
196 |      "start_time": "2020-11-18T10:42:32.292077",
197 |      "status": "completed"
198 |     },
199 |     "tags": []
200 |    },
201 |    "outputs": [],
202 |    "source": [
203 |     "weather_data.dtypes"
204 |    ]
205 |   },
206 |   {
207 |    "cell_type": "markdown",
208 |    "metadata": {
209 |     "papermill": {
210 |      "duration": 0.038565,
211 |      "end_time": "2020-11-18T10:42:32.425397",
212 |      "exception": false,
213 |      "start_time": "2020-11-18T10:42:32.386832",
214 |      "status": "completed"
215 |     },
216 |     "tags": []
217 |    },
218 |    "source": [
219 |     "#### Clean up precipitation column\n",
220 |     "\n",
221 |     "From the dataframe preview above, we can see that the column `HOURLYPrecip` - which is the hourly measure of precipitation levels - contains both `NaN` and `T` values. `T` specifies *trace amounts of precipitation*, while `NaN` means *not a number*, and is used to denote missing values.\n",
222 |     "\n",
223 |     "We can also inspect the unique values present for the field."
224 |    ]
225 |   },
226 |   {
227 |    "cell_type": "code",
228 |    "execution_count": null,
229 |    "metadata": {
230 |     "papermill": {
231 |      "duration": 0.046581,
232 |      "end_time": "2020-11-18T10:42:32.509475",
233 |      "exception": false,
234 |      "start_time": "2020-11-18T10:42:32.462894",
235 |      "status": "completed"
236 |     },
237 |     "tags": []
238 |    },
239 |    "outputs": [],
240 |    "source": [
241 |     "weather_data['HOURLYPrecip'].unique()"
242 |    ]
243 |   },
244 |   {
245 |    "cell_type": "markdown",
246 |    "metadata": {
247 |     "papermill": {
248 |      "duration": 0.036444,
249 |      "end_time": "2020-11-18T10:42:32.573140",
250 |      "exception": false,
251 |      "start_time": "2020-11-18T10:42:32.536696",
252 |      "status": "completed"
253 |     },
254 |     "tags": []
255 |    },
256 |    "source": [
257 |     "We can see that some values end with an `s` (indicating snow), while there is a strange value `0.020.01s` which appears to be an error of some sort. To deal with `T` values, we will set the observation to be `0`. We will also replace the erroneous value `0.020.01s` with `NaN`.\n",
258 |     "\n",
259 |     "Finally, we will replace all `NaN` entries with `0`, i.e. we assume no precipitation was present."
260 |    ]
261 |   },
262 |   {
263 |    "cell_type": "code",
264 |    "execution_count": null,
265 |    "metadata": {
266 |     "papermill": {
267 |      "duration": 0.047662,
268 |      "end_time": "2020-11-18T10:42:32.657348",
269 |      "exception": false,
270 |      "start_time": "2020-11-18T10:42:32.609686",
271 |      "status": "completed"
272 |     },
273 |     "tags": []
274 |    },
275 |    "outputs": [],
276 |    "source": [
277 |     "# Fix precipitation data\n",
278 |     "weather_data['HOURLYPrecip'].replace(to_replace='T', value='0.00', inplace=True)\n",
279 |     "weather_data['HOURLYPrecip'].replace('0.020.01s', np.nan, inplace=True)\n",
280 |     "weather_data.fillna(value={'HOURLYPrecip': 0}, inplace=True)"
281 |    ]
282 |   },
283 |   {
284 |    "cell_type": "markdown",
285 |    "metadata": {
286 |     "papermill": {
287 |      "duration": 0.0353,
288 |      "end_time": "2020-11-18T10:42:32.720877",
289 |      "exception": false,
290 |      "start_time": "2020-11-18T10:42:32.685577",
291 |      "status": "completed"
292 |     },
293 |     "tags": []
294 |    },
295 |    "source": [
296 |     "#### Inspect visibility column\n",
297 |     "\n",
298 |     "As we have done for precipitation, we can also inspect the unique values present for the column `HOURLYVISIBILITY` - which is the hourly measure of visibility. Below, we see that some values are `nan`, while some end with an `V`. "
299 |    ]
300 |   },
301 |   {
302 |    "cell_type": "code",
303 |    "execution_count": null,
304 |    "metadata": {
305 |     "papermill": {
306 |      "duration": 0.047845,
307 |      "end_time": "2020-11-18T10:42:32.796248",
308 |      "exception": false,
309 |      "start_time": "2020-11-18T10:42:32.748403",
310 |      "status": "completed"
311 |     },
312 |     "tags": []
313 |    },
314 |    "outputs": [],
315 |    "source": [
316 |     "weather_data['HOURLYVISIBILITY'].unique()"
317 |    ]
318 |   },
319 |   {
320 |    "cell_type": "markdown",
321 |    "metadata": {
322 |     "papermill": {
323 |      "duration": 0.033404,
324 |      "end_time": "2020-11-18T10:42:32.875565",
325 |      "exception": false,
326 |      "start_time": "2020-11-18T10:42:32.842161",
327 |      "status": "completed"
328 |     },
329 |     "tags": []
330 |    },
331 |    "source": [
332 |     "#### Convert columns to numerical types\n",
333 |     "\n",
334 |     "Next, we will convert string columns that refer to numerical values to numerical types. For columns such as `HOURLYPrecip` and `HOURLYVISIBILITY`, we first also drop the non-numerical parts of the value (e.g .the `s` and `V` characters)."
335 |    ]
336 |   },
337 |   {
338 |    "cell_type": "code",
339 |    "execution_count": null,
340 |    "metadata": {
341 |     "papermill": {
342 |      "duration": 0.612421,
343 |      "end_time": "2020-11-18T10:42:33.517136",
344 |      "exception": false,
345 |      "start_time": "2020-11-18T10:42:32.904715",
346 |      "status": "completed"
347 |     },
348 |     "tags": []
349 |    },
350 |    "outputs": [],
351 |    "source": [
352 |     "# Set of columns to convert\n",
353 |     "messy_columns = ['HOURLYVISIBILITY', 'HOURLYPrecip', 'HOURLYWindSpeed', 'HOURLYWindGustSpeed']\n",
354 |     "\n",
355 |     "# Convert columns to float32 datatype\n",
356 |     "for i in messy_columns:\n",
357 |     "    weather_data[i] = weather_data[i].apply(lambda x: re.sub('[^0-9,.-]', '', x) if type(x) == str else x).replace('', np.nan).astype(('float32'))"
358 |    ]
359 |   },
360 |   {
361 |    "cell_type": "markdown",
362 |    "metadata": {
363 |     "papermill": {
364 |      "duration": 0.02799,
365 |      "end_time": "2020-11-18T10:42:33.571211",
366 |      "exception": false,
367 |      "start_time": "2020-11-18T10:42:33.543221",
368 |      "status": "completed"
369 |     },
370 |     "tags": []
371 |    },
372 |    "source": [
373 |     "We can now see that all fields have numerical data type."
374 |    ]
375 |   },
376 |   {
377 |    "cell_type": "code",
378 |    "execution_count": null,
379 |    "metadata": {
380 |     "papermill": {
381 |      "duration": 0.152779,
382 |      "end_time": "2020-11-18T10:42:33.793868",
383 |      "exception": false,
384 |      "start_time": "2020-11-18T10:42:33.641089",
385 |      "status": "completed"
386 |     },
387 |     "tags": []
388 |    },
389 |    "outputs": [],
390 |    "source": [
391 |     "print(weather_data.info())\n",
392 |     "# Generate the summary statistics for each column\n",
393 |     "weather_data.describe()"
394 |    ]
395 |   },
396 |   {
397 |    "cell_type": "markdown",
398 |    "metadata": {
399 |     "papermill": {
400 |      "duration": 0.047737,
401 |      "end_time": "2020-11-18T10:42:33.890040",
402 |      "exception": false,
403 |      "start_time": "2020-11-18T10:42:33.842303",
404 |      "status": "completed"
405 |     },
406 |     "tags": []
407 |    },
408 |    "source": [
409 |     "For wind gusts, rather than have `NaN` entries (which represent no gusts), we will represent the gust speed column as \"excess speed\" over the `HOURLYWindSpeed` values.weather_data.head()"
410 |    ]
411 |   },
412 |   {
413 |    "cell_type": "code",
414 |    "execution_count": null,
415 |    "metadata": {
416 |     "papermill": {
417 |      "duration": 0.276195,
418 |      "end_time": "2020-11-18T10:42:34.197609",
419 |      "exception": false,
420 |      "start_time": "2020-11-18T10:42:33.921414",
421 |      "status": "completed"
422 |     },
423 |     "tags": []
424 |    },
425 |    "outputs": [],
426 |    "source": [
427 |     "weather_data.loc[:, 'HOURLYWindGustSpeed'] = np.vectorize(lambda x, y: 0.0 if np.isnan(y) else y - x)(\n",
428 |     "    weather_data['HOURLYWindSpeed'], weather_data['HOURLYWindGustSpeed'])\n",
429 |     "weather_data.head()"
430 |    ]
431 |   },
432 |   {
433 |    "cell_type": "markdown",
434 |    "metadata": {
435 |     "papermill": {
436 |      "duration": 0.034999,
437 |      "end_time": "2020-11-18T10:42:34.262944",
438 |      "exception": false,
439 |      "start_time": "2020-11-18T10:42:34.227945",
440 |      "status": "completed"
441 |     },
442 |     "tags": []
443 |    },
444 |    "source": [
445 |     "#### Check date index\n",
446 |     "\n",
447 |     "Next, we check if there are any duplicates with respect to our `DATE` index and check furthermore that our dates are in the correct order (that is, strictly increasing)."
448 |    ]
449 |   },
450 |   {
451 |    "cell_type": "code",
452 |    "execution_count": null,
453 |    "metadata": {
454 |     "papermill": {
455 |      "duration": 0.054207,
456 |      "end_time": "2020-11-18T10:42:34.353374",
457 |      "exception": false,
458 |      "start_time": "2020-11-18T10:42:34.299167",
459 |      "status": "completed"
460 |     },
461 |     "tags": []
462 |    },
463 |    "outputs": [],
464 |    "source": [
465 |     "cond = len(weather_data[weather_data.index.duplicated()].sort_index())\n",
466 |     "print('Date index contains no duplicate entries: {}'.format(cond == 0))\n",
467 |     "print('Date index is strictly increasing: {}'.format(weather_data.index.is_monotonic_increasing))"
468 |    ]
469 |   },
470 |   {
471 |    "cell_type": "markdown",
472 |    "metadata": {
473 |     "papermill": {
474 |      "duration": 0.032104,
475 |      "end_time": "2020-11-18T10:42:34.417847",
476 |      "exception": false,
477 |      "start_time": "2020-11-18T10:42:34.385743",
478 |      "status": "completed"
479 |     },
480 |     "tags": []
481 |    },
482 |    "source": [
483 |     "### Categorical Feature Extraction\n",
484 |     "\n",
485 |     "The final pre-processing step we will perform will be to handle the `HOURLYPRSENTWEATHERTYPE` column to correctly encode the weather features. This column indicates the presence of specific weather types for the given reading. For example, `-RA:02 BR:1 |RA:61 |RA:61` refers to 3 types of reading:\n",
486 |     "1. `AU` codes for automated weather readings\n",
487 |     "2. `AW` codes for a different type of automated weather reading\n",
488 |     "3. `MW` codes for manually-augmented weather readings\n",
489 |     "\n",
490 |     "This example reading happens to contain all 3 types, separated by a `|` character. The `AU` code is thus `-RA:02 BR:1`. If we refer to the data documentation linked above, we can see this indicates the presence of `RA:02 - Rain` and `BR:1 - Mist`.\n",
491 |     "\n",
492 |     "These _present weather types_ are categorical variables. **Note** that multiple categories of weather can be present. In order to process this column, we will:\n",
493 |     "* only use the `AU` codes for simplicity\n",
494 |     "* convert the codes to more readable category names\n",
495 |     "* extract the weather type categories into individual binary columns representing the presence (`1`) or absence (`0`) of that category. This is like \"one-hot encoding\" but for multi-category variables\n",
496 |     "\n",
497 |     "We start with creating a mapping from codes to category names"
498 |    ]
499 |   },
500 |   {
501 |    "cell_type": "code",
502 |    "execution_count": null,
503 |    "metadata": {
504 |     "papermill": {
505 |      "duration": 0.05096,
506 |      "end_time": "2020-11-18T10:42:34.509497",
507 |      "exception": false,
508 |      "start_time": "2020-11-18T10:42:34.458537",
509 |      "status": "completed"
510 |     },
511 |     "tags": []
512 |    },
513 |    "outputs": [],
514 |    "source": [
515 |     "# start with raw types taken from the LCD Dataset Documentation\n",
516 |     "# we convert the raw weather type names to snake_case\n",
517 |     "raw_types = '''DZ:01 - drizzle\n",
518 |     "RA:02 - rain\n",
519 |     "SN:03 - snow\n",
520 |     "SG:04 - snow_grains\n",
521 |     "IC:05 - ice_crystals\n",
522 |     "PL:06 - ice_pellets\n",
523 |     "GR:07 - hail\n",
524 |     "GS:08 - small_hail_snow_pellets\n",
525 |     "UP:09 - unknown_precipitation\n",
526 |     "BR:1 - mist\n",
527 |     "FG:2 - fog\n",
528 |     "FU:3 - smoke\n",
529 |     "VA:4 - volcanic_ash\n",
530 |     "DU:5 - widespread_dust\n",
531 |     "SA:6 - sand\n",
532 |     "HZ:7 - haze\n",
533 |     "PY:8 - spray\n",
534 |     "PO:1 - well_developed_dust\n",
535 |     "SQ:2 - squalls\n",
536 |     "FC:3 - funnel_cloud_waterspout_tornado\n",
537 |     "SS:4 - sandstorm\n",
538 |     "DS:5 - duststorm'''.split('\\n')\n",
539 |     "\n",
540 |     "raw_types = [t.split(' - ') for t in raw_types]\n",
541 |     "weather_types = {t[0]: t[1] for t in raw_types}\n",
542 |     "# Add in a code that is inconsistently represented in the documentation\n",
543 |     "weather_types['TS:7'] = 'thunderstorm'\n",
544 |     "weather_types"
545 |    ]
546 |   },
547 |   {
548 |    "cell_type": "markdown",
549 |    "metadata": {
550 |     "papermill": {
551 |      "duration": 0.052291,
552 |      "end_time": "2020-11-18T10:42:34.593200",
553 |      "exception": false,
554 |      "start_time": "2020-11-18T10:42:34.540909",
555 |      "status": "completed"
556 |     },
557 |     "tags": []
558 |    },
559 |    "source": [
560 |     "There are still a few edge cases that do not fall within the weather type mapping we have created. For the purposes of simplification, we will ignore these, since we have captured the main weather types in our mapping. So, we create a function to convert codes to category names, handling any errors."
561 |    ]
562 |   },
563 |   {
564 |    "cell_type": "code",
565 |    "execution_count": null,
566 |    "metadata": {
567 |     "papermill": {
568 |      "duration": 0.074268,
569 |      "end_time": "2020-11-18T10:42:34.731431",
570 |      "exception": false,
571 |      "start_time": "2020-11-18T10:42:34.657163",
572 |      "status": "completed"
573 |     },
574 |     "tags": []
575 |    },
576 |    "outputs": [],
577 |    "source": [
578 |     "def get_type(k):\n",
579 |     "    if k in weather_types:\n",
580 |     "        return weather_types[k]\n",
581 |     "    else:\n",
582 |     "        return ''\n",
583 |     "    \n",
584 |     "def extract_weather_type(x):\n",
585 |     "    wt = x.split('|')[0].split() if isinstance(x, str) else []\n",
586 |     "    wt = [get_type(w.lstrip('-').lstrip('+')) for w in wt]\n",
587 |     "    return wt"
588 |    ]
589 |   },
590 |   {
591 |    "cell_type": "markdown",
592 |    "metadata": {
593 |     "papermill": {
594 |      "duration": 0.068491,
595 |      "end_time": "2020-11-18T10:42:34.864111",
596 |      "exception": false,
597 |      "start_time": "2020-11-18T10:42:34.795620",
598 |      "status": "completed"
599 |     },
600 |     "tags": []
601 |    },
602 |    "source": [
603 |     "Let's test our function out:"
604 |    ]
605 |   },
606 |   {
607 |    "cell_type": "code",
608 |    "execution_count": null,
609 |    "metadata": {
610 |     "papermill": {
611 |      "duration": 0.224131,
612 |      "end_time": "2020-11-18T10:42:35.139554",
613 |      "exception": false,
614 |      "start_time": "2020-11-18T10:42:34.915423",
615 |      "status": "completed"
616 |     },
617 |     "tags": []
618 |    },
619 |    "outputs": [],
620 |    "source": [
621 |     "print(weather_data['HOURLYPRSENTWEATHERTYPE'].head(5))\n",
622 |     "print()\n",
623 |     "print(weather_data['HOURLYPRSENTWEATHERTYPE'].apply(extract_weather_type).head(5))"
624 |    ]
625 |   },
626 |   {
627 |    "cell_type": "markdown",
628 |    "metadata": {
629 |     "papermill": {
630 |      "duration": 0.054716,
631 |      "end_time": "2020-11-18T10:42:35.260798",
632 |      "exception": false,
633 |      "start_time": "2020-11-18T10:42:35.206082",
634 |      "status": "completed"
635 |     },
636 |     "tags": []
637 |    },
638 |    "source": [
639 |     "That seems to be working. Next, we binarize the present weather categories in each cell:"
640 |    ]
641 |   },
642 |   {
643 |    "cell_type": "code",
644 |    "execution_count": null,
645 |    "metadata": {
646 |     "papermill": {
647 |      "duration": 0.820289,
648 |      "end_time": "2020-11-18T10:42:36.111270",
649 |      "exception": false,
650 |      "start_time": "2020-11-18T10:42:35.290981",
651 |      "status": "completed"
652 |     },
653 |     "tags": []
654 |    },
655 |    "outputs": [],
656 |    "source": [
657 |     "from collections import Counter\n",
658 |     "counts = weather_data['HOURLYPRSENTWEATHERTYPE'].apply(extract_weather_type).apply(Counter)\n",
659 |     "counts = pd.DataFrame.from_records(counts).fillna(value=0).drop(columns = [''])\n",
660 |     "counts"
661 |    ]
662 |   },
663 |   {
664 |    "cell_type": "markdown",
665 |    "metadata": {
666 |     "papermill": {
667 |      "duration": 0.053868,
668 |      "end_time": "2020-11-18T10:42:36.209831",
669 |      "exception": false,
670 |      "start_time": "2020-11-18T10:42:36.155963",
671 |      "status": "completed"
672 |     },
673 |     "tags": []
674 |    },
675 |    "source": [
676 |     "Finally, we combine the extra columns we've created with our original dataframe:"
677 |    ]
678 |   },
679 |   {
680 |    "cell_type": "code",
681 |    "execution_count": null,
682 |    "metadata": {
683 |     "papermill": {
684 |      "duration": 0.115044,
685 |      "end_time": "2020-11-18T10:42:36.363452",
686 |      "exception": false,
687 |      "start_time": "2020-11-18T10:42:36.248408",
688 |      "status": "completed"
689 |     },
690 |     "tags": []
691 |    },
692 |    "outputs": [],
693 |    "source": [
694 |     "cleaned_data = pd.concat([weather_data, counts.set_index(weather_data.index)], axis=1)\n",
695 |     "cleaned_data"
696 |    ]
697 |   },
698 |   {
699 |    "cell_type": "markdown",
700 |    "metadata": {
701 |     "papermill": {
702 |      "duration": 0.053257,
703 |      "end_time": "2020-11-18T10:42:36.454488",
704 |      "exception": false,
705 |      "start_time": "2020-11-18T10:42:36.401231",
706 |      "status": "completed"
707 |     },
708 |     "tags": []
709 |    },
710 |    "source": [
711 |     "#### Rename columns\n",
712 |     "\n",
713 |     "Before saving the dataset, we will rename the columns for readability."
714 |    ]
715 |   },
716 |   {
717 |    "cell_type": "code",
718 |    "execution_count": null,
719 |    "metadata": {
720 |     "papermill": {
721 |      "duration": 0.065562,
722 |      "end_time": "2020-11-18T10:42:36.566875",
723 |      "exception": false,
724 |      "start_time": "2020-11-18T10:42:36.501313",
725 |      "status": "completed"
726 |     },
727 |     "tags": []
728 |    },
729 |    "outputs": [],
730 |    "source": [
731 |     "cleaned_data.columns"
732 |    ]
733 |   },
734 |   {
735 |    "cell_type": "code",
736 |    "execution_count": null,
737 |    "metadata": {
738 |     "papermill": {
739 |      "duration": 0.079686,
740 |      "end_time": "2020-11-18T10:42:36.723515",
741 |      "exception": false,
742 |      "start_time": "2020-11-18T10:42:36.643829",
743 |      "status": "completed"
744 |     },
745 |     "tags": []
746 |    },
747 |    "outputs": [],
748 |    "source": [
749 |     "# define some new column names for consistency\n",
750 |     "columns_name_map = {\n",
751 |     "    'HOURLYVISIBILITY': 'visibility',\n",
752 |     "    'HOURLYPRSENTWEATHERTYPE': 'weather_type_raw',\n",
753 |     "    'HOURLYWindSpeed': 'wind_speed',\n",
754 |     "    'HOURLYWindGustSpeed': 'wind_gust_speed',\n",
755 |     "    'HOURLYPrecip': 'precip',\n",
756 |     "}\n",
757 |     "\n",
758 |     "cleaned_data_renamed = cleaned_data.rename(columns=columns_name_map)"
759 |    ]
760 |   },
761 |   {
762 |    "cell_type": "code",
763 |    "execution_count": null,
764 |    "metadata": {
765 |     "papermill": {
766 |      "duration": 0.143285,
767 |      "end_time": "2020-11-18T10:42:36.925259",
768 |      "exception": false,
769 |      "start_time": "2020-11-18T10:42:36.781974",
770 |      "status": "completed"
771 |     },
772 |     "tags": []
773 |    },
774 |    "outputs": [],
775 |    "source": [
776 |     "print(cleaned_data_renamed.info())\n",
777 |     "print()\n",
778 |     "cleaned_data_renamed.head()"
779 |    ]
780 |   },
781 |   {
782 |    "cell_type": "code",
783 |    "execution_count": null,
784 |    "metadata": {
785 |     "papermill": {
786 |      "duration": 0.161798,
787 |      "end_time": "2020-11-18T10:42:37.164530",
788 |      "exception": false,
789 |      "start_time": "2020-11-18T10:42:37.002732",
790 |      "status": "completed"
791 |     },
792 |     "tags": []
793 |    },
794 |    "outputs": [],
795 |    "source": [
796 |     "# Log some general information about the dataset\n",
797 |     "print('# of columns: ' + str(cleaned_data_renamed.shape[1])) \n",
798 |     "print('# of observations: ' + str(cleaned_data_renamed.shape[0]))\n",
799 |     "print('Start date: ' + str(cleaned_data_renamed.index[0]))\n",
800 |     "print('End date: ' + str(cleaned_data_renamed.index[-1]))"
801 |    ]
802 |   },
803 |   {
804 |    "cell_type": "markdown",
805 |    "metadata": {
806 |     "papermill": {
807 |      "duration": 0.109368,
808 |      "end_time": "2020-11-18T10:42:37.386139",
809 |      "exception": false,
810 |      "start_time": "2020-11-18T10:42:37.276771",
811 |      "status": "completed"
812 |     },
813 |     "tags": []
814 |    },
815 |    "source": [
816 |     "### Save the Processed Data\n",
817 |     "\n",
818 |     "Finally, we save the processed dataset for use by downstream tasks."
819 |    ]
820 |   },
821 |   {
822 |    "cell_type": "code",
823 |    "execution_count": null,
824 |    "metadata": {
825 |     "papermill": {
826 |      "duration": 2.452826,
827 |      "end_time": "2020-11-18T10:42:39.912468",
828 |      "exception": false,
829 |      "start_time": "2020-11-18T10:42:37.459642",
830 |      "status": "completed"
831 |     },
832 |     "tags": []
833 |    },
834 |    "outputs": [],
835 |    "source": [
836 |     "cleaned_data_renamed.to_csv('data/jfk_weather_features.csv', float_format='%g')"
837 |    ]
838 |   },
839 |   {
840 |    "cell_type": "markdown",
841 |    "metadata": {
842 |     "papermill": {
843 |      "duration": 0.049907,
844 |      "end_time": "2020-11-18T10:42:40.004559",
845 |      "exception": false,
846 |      "start_time": "2020-11-18T10:42:39.954652",
847 |      "status": "completed"
848 |     },
849 |     "tags": []
850 |    },
851 |    "source": [
852 |     " ### Authors\n",
853 |     " \n",
854 |     " This notebook was created by the [Center for Open-Source Data & AI Technologies](http://codait.org).\n",
855 |     "\n",
856 |     "Copyright © 2020 IBM. This notebook and its source code are released under the terms of the MIT License."
857 |    ]
858 |   }
859 |  ],
860 |  "metadata": {
861 |   "kernelspec": {
862 |    "display_name": "Python 3",
863 |    "language": "python",
864 |    "name": "python3"
865 |   },
866 |   "language_info": {
867 |    "codemirror_mode": {
868 |     "name": "ipython",
869 |     "version": 3
870 |    },
871 |    "file_extension": ".py",
872 |    "mimetype": "text/x-python",
873 |    "name": "python",
874 |    "nbconvert_exporter": "python",
875 |    "pygments_lexer": "ipython3",
876 |    "version": "3.7.9"
877 |   },
878 |   "papermill": {
879 |    "duration": 14.833694,
880 |    "end_time": "2020-11-18T10:42:40.587559",
881 |    "environment_variables": {},
882 |    "exception": null,
883 |    "input_path": "/Users/nick/workspace/python/flight-delay-notebooks/notebooks/process_weather_data.ipynb",
884 |    "output_path": "/Users/nick/workspace/python/flight-delay-notebooks/notebooks/process_weather_data.ipynb",
885 |    "parameters": {},
886 |    "start_time": "2020-11-18T10:42:25.753865",
887 |    "version": "2.1.1"
888 |   }
889 |  },
890 |  "nbformat": 4,
891 |  "nbformat_minor": 4
892 | }
893 | 


--------------------------------------------------------------------------------
/pipelines/flight_delays.pipeline:
--------------------------------------------------------------------------------
  1 | {
  2 |   "doc_type": "pipeline",
  3 |   "version": "3.0",
  4 |   "json_schema": "http://api.dataplatform.ibm.com/schemas/common-pipeline/pipeline-flow/pipeline-flow-v3-schema.json",
  5 |   "id": "d8fa85b7-04fd-467d-a45d-e127e1eccfe8",
  6 |   "primary_pipeline": "07d7f720-4cde-4c57-a4ee-e99c68f307b1",
  7 |   "pipelines": [
  8 |     {
  9 |       "id": "07d7f720-4cde-4c57-a4ee-e99c68f307b1",
 10 |       "nodes": [
 11 |         {
 12 |           "id": "2f3f6243-82ef-43ee-af09-9888a5dfbc30",
 13 |           "type": "execution_node",
 14 |           "op": "execute-python-node",
 15 |           "app_data": {
 16 |             "filename": "../notebooks/load_data.py",
 17 |             "runtime_image": "amancevice/pandas:1.1.1",
 18 |             "env_vars": [
 19 |               "DATASET_URL=https://dax-cdn.cdn.appdomain.cloud/dax-airline/1.0.1/airline_2m.tar.gz"
 20 |             ],
 21 |             "include_subdirectories": false,
 22 |             "invalidNodeError": null,
 23 |             "outputs": [
 24 |               "data/airline_2m.csv"
 25 |             ],
 26 |             "ui_data": {
 27 |               "label": "load_data",
 28 |               "image": "data:image/svg+xml;utf8,%3Csvg%20xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg%22%20width%3D%2216%22%20viewBox%3D%220%200%2022%2022%22%3E%0A%20%20%3Cg%20class%3D%22jp-icon-brand0%20jp-icon-selectable%22%20fill%3D%22%230D47A1%22%3E%0A%20%20%20%20%3Cpath%20d%3D%22M11.1%206.9V5.8H6.9c0-.5%200-1.3.2-1.6.4-.7.8-1.1%201.7-1.4%201.7-.3%202.5-.3%203.9-.1%201%20.1%201.9.9%201.9%201.9v4.2c0%20.5-.9%201.6-2%201.6H8.8c-1.5%200-2.4%201.4-2.4%202.8v2.2H4.7C3.5%2015.1%203%2014%203%2013.1V9c-.1-1%20.6-2%201.8-2%201.5-.1%206.3-.1%206.3-.1z%22%2F%3E%0A%20%20%20%20%3Cpath%20d%3D%22M10.9%2015.1v1.1h4.2c0%20.5%200%201.3-.2%201.6-.4.7-.8%201.1-1.7%201.4-1.7.3-2.5.3-3.9.1-1-.1-1.9-.9-1.9-1.9v-4.2c0-.5.9-1.6%202-1.6h3.8c1.5%200%202.4-1.4%202.4-2.8V6.6h1.7C18.5%206.9%2019%208%2019%208.9V13c0%201-.7%202.1-1.9%202.1h-6.2z%22%2F%3E%0A%20%20%3C%2Fg%3E%0A%3C%2Fsvg%3E%0A",
 29 |               "x_pos": 71.16775512695312,
 30 |               "y_pos": 91.4539566040039,
 31 |               "description": "Python Script"
 32 |             }
 33 |           },
 34 |           "inputs": [
 35 |             {
 36 |               "id": "inPort",
 37 |               "app_data": {
 38 |                 "ui_data": {
 39 |                   "cardinality": {
 40 |                     "min": 0,
 41 |                     "max": 1
 42 |                   },
 43 |                   "label": "Input Port"
 44 |                 }
 45 |               }
 46 |             }
 47 |           ],
 48 |           "outputs": [
 49 |             {
 50 |               "id": "outPort",
 51 |               "app_data": {
 52 |                 "ui_data": {
 53 |                   "cardinality": {
 54 |                     "min": 0,
 55 |                     "max": -1
 56 |                   },
 57 |                   "label": "Output Port"
 58 |                 }
 59 |               }
 60 |             }
 61 |           ]
 62 |         },
 63 |         {
 64 |           "id": "b614bf55-c127-413a-935a-175d7afa7f0e",
 65 |           "type": "execution_node",
 66 |           "op": "execute-python-node",
 67 |           "app_data": {
 68 |             "filename": "../notebooks/load_data.py",
 69 |             "runtime_image": "amancevice/pandas:1.1.1",
 70 |             "env_vars": [
 71 |               "DATASET_URL=https://dax-cdn.cdn.appdomain.cloud/dax-noaa-weather-data-jfk-airport/1.1.4/noaa-weather-data-jfk-airport.tar.gz"
 72 |             ],
 73 |             "include_subdirectories": false,
 74 |             "invalidNodeError": null,
 75 |             "outputs": [
 76 |               "data/noaa-weather-data-jfk-airport/jfk_weather.csv"
 77 |             ],
 78 |             "ui_data": {
 79 |               "label": "load_data",
 80 |               "image": "data:image/svg+xml;utf8,%3Csvg%20xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg%22%20width%3D%2216%22%20viewBox%3D%220%200%2022%2022%22%3E%0A%20%20%3Cg%20class%3D%22jp-icon-brand0%20jp-icon-selectable%22%20fill%3D%22%230D47A1%22%3E%0A%20%20%20%20%3Cpath%20d%3D%22M11.1%206.9V5.8H6.9c0-.5%200-1.3.2-1.6.4-.7.8-1.1%201.7-1.4%201.7-.3%202.5-.3%203.9-.1%201%20.1%201.9.9%201.9%201.9v4.2c0%20.5-.9%201.6-2%201.6H8.8c-1.5%200-2.4%201.4-2.4%202.8v2.2H4.7C3.5%2015.1%203%2014%203%2013.1V9c-.1-1%20.6-2%201.8-2%201.5-.1%206.3-.1%206.3-.1z%22%2F%3E%0A%20%20%20%20%3Cpath%20d%3D%22M10.9%2015.1v1.1h4.2c0%20.5%200%201.3-.2%201.6-.4.7-.8%201.1-1.7%201.4-1.7.3-2.5.3-3.9.1-1-.1-1.9-.9-1.9-1.9v-4.2c0-.5.9-1.6%202-1.6h3.8c1.5%200%202.4-1.4%202.4-2.8V6.6h1.7C18.5%206.9%2019%208%2019%208.9V13c0%201-.7%202.1-1.9%202.1h-6.2z%22%2F%3E%0A%20%20%3C%2Fg%3E%0A%3C%2Fsvg%3E%0A",
 81 |               "x_pos": 65,
 82 |               "y_pos": 241,
 83 |               "description": "Python Script"
 84 |             }
 85 |           },
 86 |           "inputs": [
 87 |             {
 88 |               "id": "inPort",
 89 |               "app_data": {
 90 |                 "ui_data": {
 91 |                   "cardinality": {
 92 |                     "min": 0,
 93 |                     "max": 1
 94 |                   },
 95 |                   "label": "Input Port"
 96 |                 }
 97 |               }
 98 |             }
 99 |           ],
100 |           "outputs": [
101 |             {
102 |               "id": "outPort",
103 |               "app_data": {
104 |                 "ui_data": {
105 |                   "cardinality": {
106 |                     "min": 0,
107 |                     "max": -1
108 |                   },
109 |                   "label": "Output Port"
110 |                 }
111 |               }
112 |             }
113 |           ]
114 |         },
115 |         {
116 |           "id": "67d35156-a7ba-4339-9975-d5eac9be5b1e",
117 |           "type": "execution_node",
118 |           "op": "execute-notebook-node",
119 |           "app_data": {
120 |             "filename": "../notebooks/process_flight_data.ipynb",
121 |             "runtime_image": "amancevice/pandas:1.1.1",
122 |             "env_vars": [],
123 |             "include_subdirectories": false,
124 |             "invalidNodeError": null,
125 |             "outputs": [
126 |               "data/jfk_flight_features.csv"
127 |             ],
128 |             "ui_data": {
129 |               "label": "process_flight_data",
130 |               "image": "data:image/svg+xml;utf8,%3Csvg%20xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg%22%20width%3D%2216%22%20viewBox%3D%220%200%2022%2022%22%3E%0A%20%20%3Cg%20class%3D%22jp-icon-warn0%20jp-icon-selectable%22%20fill%3D%22%23EF6C00%22%3E%0A%20%20%20%20%3Cpath%20d%3D%22M18.7%203.3v15.4H3.3V3.3h15.4m1.5-1.5H1.8v18.3h18.3l.1-18.3z%22%2F%3E%0A%20%20%20%20%3Cpath%20d%3D%22M16.5%2016.5l-5.4-4.3-5.6%204.3v-11h11z%22%2F%3E%0A%20%20%3C%2Fg%3E%0A%3C%2Fsvg%3E%0A",
131 |               "x_pos": 344,
132 |               "y_pos": 97,
133 |               "description": "Notebook file"
134 |             }
135 |           },
136 |           "inputs": [
137 |             {
138 |               "id": "inPort",
139 |               "app_data": {
140 |                 "ui_data": {
141 |                   "cardinality": {
142 |                     "min": 0,
143 |                     "max": 1
144 |                   },
145 |                   "label": "Input Port"
146 |                 }
147 |               },
148 |               "links": [
149 |                 {
150 |                   "id": "eff34dc4-552d-49d1-86ab-ad774c09ed9c",
151 |                   "node_id_ref": "2f3f6243-82ef-43ee-af09-9888a5dfbc30",
152 |                   "port_id_ref": "outPort"
153 |                 }
154 |               ]
155 |             }
156 |           ],
157 |           "outputs": [
158 |             {
159 |               "id": "outPort",
160 |               "app_data": {
161 |                 "ui_data": {
162 |                   "cardinality": {
163 |                     "min": 0,
164 |                     "max": -1
165 |                   },
166 |                   "label": "Output Port"
167 |                 }
168 |               }
169 |             }
170 |           ]
171 |         },
172 |         {
173 |           "id": "71257647-2fc5-4db8-95ef-5813bc386f95",
174 |           "type": "execution_node",
175 |           "op": "execute-notebook-node",
176 |           "app_data": {
177 |             "filename": "../notebooks/process_weather_data.ipynb",
178 |             "runtime_image": "amancevice/pandas:1.1.1",
179 |             "env_vars": [],
180 |             "include_subdirectories": false,
181 |             "invalidNodeError": null,
182 |             "outputs": [
183 |               "data/jfk_weather_features.csv"
184 |             ],
185 |             "ui_data": {
186 |               "label": "process_weather_data",
187 |               "image": "data:image/svg+xml;utf8,%3Csvg%20xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg%22%20width%3D%2216%22%20viewBox%3D%220%200%2022%2022%22%3E%0A%20%20%3Cg%20class%3D%22jp-icon-warn0%20jp-icon-selectable%22%20fill%3D%22%23EF6C00%22%3E%0A%20%20%20%20%3Cpath%20d%3D%22M18.7%203.3v15.4H3.3V3.3h15.4m1.5-1.5H1.8v18.3h18.3l.1-18.3z%22%2F%3E%0A%20%20%20%20%3Cpath%20d%3D%22M16.5%2016.5l-5.4-4.3-5.6%204.3v-11h11z%22%2F%3E%0A%20%20%3C%2Fg%3E%0A%3C%2Fsvg%3E%0A",
188 |               "x_pos": 342,
189 |               "y_pos": 240,
190 |               "description": "Notebook file"
191 |             }
192 |           },
193 |           "inputs": [
194 |             {
195 |               "id": "inPort",
196 |               "app_data": {
197 |                 "ui_data": {
198 |                   "cardinality": {
199 |                     "min": 0,
200 |                     "max": 1
201 |                   },
202 |                   "label": "Input Port"
203 |                 }
204 |               },
205 |               "links": [
206 |                 {
207 |                   "id": "0a85bfc7-3bf3-4885-9026-2bd9fa30b729",
208 |                   "node_id_ref": "b614bf55-c127-413a-935a-175d7afa7f0e",
209 |                   "port_id_ref": "outPort"
210 |                 }
211 |               ]
212 |             }
213 |           ],
214 |           "outputs": [
215 |             {
216 |               "id": "outPort",
217 |               "app_data": {
218 |                 "ui_data": {
219 |                   "cardinality": {
220 |                     "min": 0,
221 |                     "max": -1
222 |                   },
223 |                   "label": "Output Port"
224 |                 }
225 |               }
226 |             }
227 |           ]
228 |         },
229 |         {
230 |           "id": "d70363d7-8c8f-45d2-b539-746fd2a4e14b",
231 |           "type": "execution_node",
232 |           "op": "execute-notebook-node",
233 |           "app_data": {
234 |             "filename": "../notebooks/merge_data.ipynb",
235 |             "runtime_image": "amancevice/pandas:1.1.1",
236 |             "env_vars": [],
237 |             "include_subdirectories": false,
238 |             "invalidNodeError": null,
239 |             "outputs": [
240 |               "data/jfk_flight_weather_features.csv"
241 |             ],
242 |             "ui_data": {
243 |               "label": "merge_data",
244 |               "image": "data:image/svg+xml;utf8,%3Csvg%20xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg%22%20width%3D%2216%22%20viewBox%3D%220%200%2022%2022%22%3E%0A%20%20%3Cg%20class%3D%22jp-icon-warn0%20jp-icon-selectable%22%20fill%3D%22%23EF6C00%22%3E%0A%20%20%20%20%3Cpath%20d%3D%22M18.7%203.3v15.4H3.3V3.3h15.4m1.5-1.5H1.8v18.3h18.3l.1-18.3z%22%2F%3E%0A%20%20%20%20%3Cpath%20d%3D%22M16.5%2016.5l-5.4-4.3-5.6%204.3v-11h11z%22%2F%3E%0A%20%20%3C%2Fg%3E%0A%3C%2Fsvg%3E%0A",
245 |               "x_pos": 623,
246 |               "y_pos": 166,
247 |               "description": "Notebook file"
248 |             }
249 |           },
250 |           "inputs": [
251 |             {
252 |               "id": "inPort",
253 |               "app_data": {
254 |                 "ui_data": {
255 |                   "cardinality": {
256 |                     "min": 0,
257 |                     "max": 2
258 |                   },
259 |                   "label": "Input Port"
260 |                 }
261 |               },
262 |               "links": [
263 |                 {
264 |                   "id": "8d964ba9-5fd5-455d-83ef-0d64ecd05c56",
265 |                   "node_id_ref": "67d35156-a7ba-4339-9975-d5eac9be5b1e",
266 |                   "port_id_ref": "outPort"
267 |                 },
268 |                 {
269 |                   "id": "abfb2d60-15ea-4b54-ba87-5eb82aca5204",
270 |                   "node_id_ref": "71257647-2fc5-4db8-95ef-5813bc386f95",
271 |                   "port_id_ref": "outPort"
272 |                 }
273 |               ]
274 |             }
275 |           ],
276 |           "outputs": [
277 |             {
278 |               "id": "outPort",
279 |               "app_data": {
280 |                 "ui_data": {
281 |                   "cardinality": {
282 |                     "min": 0,
283 |                     "max": -1
284 |                   },
285 |                   "label": "Output Port"
286 |                 }
287 |               }
288 |             }
289 |           ]
290 |         },
291 |         {
292 |           "id": "1d8baea2-b4a8-4a64-b14b-c0ef0c234f01",
293 |           "type": "execution_node",
294 |           "op": "execute-notebook-node",
295 |           "app_data": {
296 |             "filename": "../notebooks/analyze_flight_delays.ipynb",
297 |             "runtime_image": "amancevice/pandas:1.1.1",
298 |             "env_vars": [],
299 |             "include_subdirectories": false,
300 |             "invalidNodeError": null,
301 |             "ui_data": {
302 |               "label": "analyze_flight_delays",
303 |               "image": "data:image/svg+xml;utf8,%3Csvg%20xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg%22%20width%3D%2216%22%20viewBox%3D%220%200%2022%2022%22%3E%0A%20%20%3Cg%20class%3D%22jp-icon-warn0%20jp-icon-selectable%22%20fill%3D%22%23EF6C00%22%3E%0A%20%20%20%20%3Cpath%20d%3D%22M18.7%203.3v15.4H3.3V3.3h15.4m1.5-1.5H1.8v18.3h18.3l.1-18.3z%22%2F%3E%0A%20%20%20%20%3Cpath%20d%3D%22M16.5%2016.5l-5.4-4.3-5.6%204.3v-11h11z%22%2F%3E%0A%20%20%3C%2Fg%3E%0A%3C%2Fsvg%3E%0A",
304 |               "x_pos": 943,
305 |               "y_pos": 94,
306 |               "description": "Notebook file"
307 |             }
308 |           },
309 |           "inputs": [
310 |             {
311 |               "id": "inPort",
312 |               "app_data": {
313 |                 "ui_data": {
314 |                   "cardinality": {
315 |                     "min": 0,
316 |                     "max": 1
317 |                   },
318 |                   "label": "Input Port"
319 |                 }
320 |               },
321 |               "links": [
322 |                 {
323 |                   "id": "809a9b07-21ec-4d30-b607-f0fab979790b",
324 |                   "node_id_ref": "d70363d7-8c8f-45d2-b539-746fd2a4e14b",
325 |                   "port_id_ref": "outPort"
326 |                 }
327 |               ]
328 |             }
329 |           ],
330 |           "outputs": [
331 |             {
332 |               "id": "outPort",
333 |               "app_data": {
334 |                 "ui_data": {
335 |                   "cardinality": {
336 |                     "min": 0,
337 |                     "max": -1
338 |                   },
339 |                   "label": "Output Port"
340 |                 }
341 |               }
342 |             }
343 |           ]
344 |         },
345 |         {
346 |           "id": "c40e2c40-3f4c-4dbb-8e78-3f20f0de116f",
347 |           "type": "execution_node",
348 |           "op": "execute-notebook-node",
349 |           "app_data": {
350 |             "filename": "../notebooks/predict_flight_delays.ipynb",
351 |             "runtime_image": "amancevice/pandas:1.1.1",
352 |             "env_vars": [],
353 |             "include_subdirectories": false,
354 |             "invalidNodeError": null,
355 |             "outputs": [],
356 |             "ui_data": {
357 |               "label": "predict_flight_delays",
358 |               "image": "data:image/svg+xml;utf8,%3Csvg%20xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg%22%20width%3D%2216%22%20viewBox%3D%220%200%2022%2022%22%3E%0A%20%20%3Cg%20class%3D%22jp-icon-warn0%20jp-icon-selectable%22%20fill%3D%22%23EF6C00%22%3E%0A%20%20%20%20%3Cpath%20d%3D%22M18.7%203.3v15.4H3.3V3.3h15.4m1.5-1.5H1.8v18.3h18.3l.1-18.3z%22%2F%3E%0A%20%20%20%20%3Cpath%20d%3D%22M16.5%2016.5l-5.4-4.3-5.6%204.3v-11h11z%22%2F%3E%0A%20%20%3C%2Fg%3E%0A%3C%2Fsvg%3E%0A",
359 |               "x_pos": 944.013916015625,
360 |               "y_pos": 251.00418090820312,
361 |               "description": "Notebook file"
362 |             }
363 |           },
364 |           "inputs": [
365 |             {
366 |               "id": "inPort",
367 |               "app_data": {
368 |                 "ui_data": {
369 |                   "cardinality": {
370 |                     "min": 0,
371 |                     "max": 1
372 |                   },
373 |                   "label": "Input Port"
374 |                 }
375 |               },
376 |               "links": [
377 |                 {
378 |                   "id": "17ab35cb-a52f-4c37-984e-e33bf7b67d36",
379 |                   "node_id_ref": "d70363d7-8c8f-45d2-b539-746fd2a4e14b",
380 |                   "port_id_ref": "outPort"
381 |                 }
382 |               ]
383 |             }
384 |           ],
385 |           "outputs": [
386 |             {
387 |               "id": "outPort",
388 |               "app_data": {
389 |                 "ui_data": {
390 |                   "cardinality": {
391 |                     "min": 0,
392 |                     "max": -1
393 |                   },
394 |                   "label": "Output Port"
395 |                 }
396 |               }
397 |             }
398 |           ]
399 |         }
400 |       ],
401 |       "app_data": {
402 |         "ui_data": {
403 |           "comments": [
404 |             {
405 |               "id": "e5c417b4-9be7-4244-8597-21e0c0e00a70",
406 |               "x_pos": 28,
407 |               "y_pos": 17,
408 |               "width": 175,
409 |               "height": 42,
410 |               "class_name": "d3-comment-rect",
411 |               "content": "Load flight delay data",
412 |               "associated_id_refs": [
413 |                 {
414 |                   "node_ref": "2f3f6243-82ef-43ee-af09-9888a5dfbc30"
415 |                 }
416 |               ]
417 |             },
418 |             {
419 |               "id": "c1e71c34-9ae7-4a85-9251-91c554034cc2",
420 |               "x_pos": 30,
421 |               "y_pos": 338,
422 |               "width": 175,
423 |               "height": 42,
424 |               "class_name": "d3-comment-rect",
425 |               "content": "Load JFK weather data",
426 |               "associated_id_refs": [
427 |                 {
428 |                   "node_ref": "b614bf55-c127-413a-935a-175d7afa7f0e"
429 |                 }
430 |               ]
431 |             },
432 |             {
433 |               "id": "9cd374ba-b6ee-47a3-b963-4f164621d78b",
434 |               "x_pos": 292,
435 |               "y_pos": 15,
436 |               "width": 175,
437 |               "height": 42,
438 |               "class_name": "d3-comment-rect",
439 |               "content": "Clean up & pre-process flight delay data",
440 |               "associated_id_refs": [
441 |                 {
442 |                   "node_ref": "67d35156-a7ba-4339-9975-d5eac9be5b1e"
443 |                 }
444 |               ]
445 |             },
446 |             {
447 |               "id": "44d53e47-c149-4b69-ad6e-259dcd8f8b9f",
448 |               "x_pos": 308,
449 |               "y_pos": 334,
450 |               "width": 175,
451 |               "height": 42,
452 |               "class_name": "d3-comment-rect",
453 |               "content": "Clean up & pre-process weather data",
454 |               "associated_id_refs": [
455 |                 {
456 |                   "node_ref": "71257647-2fc5-4db8-95ef-5813bc386f95"
457 |                 }
458 |               ]
459 |             },
460 |             {
461 |               "id": "b12e0c12-9aa1-4c77-b6d5-0f02f7c64807",
462 |               "x_pos": 579,
463 |               "y_pos": 29,
464 |               "width": 243,
465 |               "height": 64,
466 |               "class_name": "d3-comment-rect",
467 |               "content": "Combine flight delay & weather dataset for downstream analytics & prediction tasks",
468 |               "associated_id_refs": [
469 |                 {
470 |                   "node_ref": "d70363d7-8c8f-45d2-b539-746fd2a4e14b"
471 |                 }
472 |               ]
473 |             },
474 |             {
475 |               "id": "5d7aa386-549a-4c76-8f58-7ead3658fd7f",
476 |               "x_pos": 908,
477 |               "y_pos": 20,
478 |               "width": 175,
479 |               "height": 42,
480 |               "class_name": "d3-comment-rect",
481 |               "content": "Analyze & visualize flight delay & weather data",
482 |               "associated_id_refs": [
483 |                 {
484 |                   "node_ref": "1d8baea2-b4a8-4a64-b14b-c0ef0c234f01"
485 |                 }
486 |               ]
487 |             },
488 |             {
489 |               "id": "bf560da3-5a43-4671-84ea-d98d562e1ec3",
490 |               "x_pos": 860,
491 |               "y_pos": 357,
492 |               "width": 230,
493 |               "height": 52,
494 |               "class_name": "d3-comment-rect",
495 |               "content": "Train & evaluate machine learning models to predict flight delays",
496 |               "associated_id_refs": [
497 |                 {
498 |                   "node_ref": "c40e2c40-3f4c-4dbb-8e78-3f20f0de116f"
499 |                 }
500 |               ]
501 |             }
502 |           ]
503 |         },
504 |         "version": 3
505 |       },
506 |       "runtime_ref": ""
507 |     }
508 |   ],
509 |   "schemas": []
510 | }


--------------------------------------------------------------------------------
/pipelines/flight_delays_with_deployment.pipeline:
--------------------------------------------------------------------------------
  1 | {
  2 |   "doc_type": "pipeline",
  3 |   "version": "3.0",
  4 |   "json_schema": "http://api.dataplatform.ibm.com/schemas/common-pipeline/pipeline-flow/pipeline-flow-v3-schema.json",
  5 |   "id": "d8fa85b7-04fd-467d-a45d-e127e1eccfe8",
  6 |   "primary_pipeline": "07d7f720-4cde-4c57-a4ee-e99c68f307b1",
  7 |   "pipelines": [
  8 |     {
  9 |       "id": "07d7f720-4cde-4c57-a4ee-e99c68f307b1",
 10 |       "nodes": [
 11 |         {
 12 |           "id": "2f3f6243-82ef-43ee-af09-9888a5dfbc30",
 13 |           "type": "execution_node",
 14 |           "op": "execute-python-node",
 15 |           "app_data": {
 16 |             "filename": "../notebooks/load_data.py",
 17 |             "runtime_image": "amancevice/pandas:1.1.1",
 18 |             "env_vars": [
 19 |               "DATASET_URL=https://dax-cdn.cdn.appdomain.cloud/dax-airline/1.0.1/airline_2m.tar.gz"
 20 |             ],
 21 |             "include_subdirectories": false,
 22 |             "invalidNodeError": null,
 23 |             "outputs": [
 24 |               "data/airline_2m.csv"
 25 |             ],
 26 |             "ui_data": {
 27 |               "label": "load_data",
 28 |               "image": "data:image/svg+xml;utf8,%3Csvg%20xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg%22%20width%3D%2216%22%20viewBox%3D%220%200%2022%2022%22%3E%0A%20%20%3Cg%20class%3D%22jp-icon-brand0%20jp-icon-selectable%22%20fill%3D%22%230D47A1%22%3E%0A%20%20%20%20%3Cpath%20d%3D%22M11.1%206.9V5.8H6.9c0-.5%200-1.3.2-1.6.4-.7.8-1.1%201.7-1.4%201.7-.3%202.5-.3%203.9-.1%201%20.1%201.9.9%201.9%201.9v4.2c0%20.5-.9%201.6-2%201.6H8.8c-1.5%200-2.4%201.4-2.4%202.8v2.2H4.7C3.5%2015.1%203%2014%203%2013.1V9c-.1-1%20.6-2%201.8-2%201.5-.1%206.3-.1%206.3-.1z%22%2F%3E%0A%20%20%20%20%3Cpath%20d%3D%22M10.9%2015.1v1.1h4.2c0%20.5%200%201.3-.2%201.6-.4.7-.8%201.1-1.7%201.4-1.7.3-2.5.3-3.9.1-1-.1-1.9-.9-1.9-1.9v-4.2c0-.5.9-1.6%202-1.6h3.8c1.5%200%202.4-1.4%202.4-2.8V6.6h1.7C18.5%206.9%2019%208%2019%208.9V13c0%201-.7%202.1-1.9%202.1h-6.2z%22%2F%3E%0A%20%20%3C%2Fg%3E%0A%3C%2Fsvg%3E%0A",
 29 |               "x_pos": 72.16775512695312,
 30 |               "y_pos": 91.4539566040039,
 31 |               "description": "Python Script"
 32 |             }
 33 |           },
 34 |           "inputs": [
 35 |             {
 36 |               "id": "inPort",
 37 |               "app_data": {
 38 |                 "ui_data": {
 39 |                   "cardinality": {
 40 |                     "min": 0,
 41 |                     "max": 1
 42 |                   },
 43 |                   "label": "Input Port"
 44 |                 }
 45 |               }
 46 |             }
 47 |           ],
 48 |           "outputs": [
 49 |             {
 50 |               "id": "outPort",
 51 |               "app_data": {
 52 |                 "ui_data": {
 53 |                   "cardinality": {
 54 |                     "min": 0,
 55 |                     "max": -1
 56 |                   },
 57 |                   "label": "Output Port"
 58 |                 }
 59 |               }
 60 |             }
 61 |           ]
 62 |         },
 63 |         {
 64 |           "id": "b614bf55-c127-413a-935a-175d7afa7f0e",
 65 |           "type": "execution_node",
 66 |           "op": "execute-python-node",
 67 |           "app_data": {
 68 |             "filename": "../notebooks/load_data.py",
 69 |             "runtime_image": "amancevice/pandas:1.1.1",
 70 |             "env_vars": [
 71 |               "DATASET_URL=https://dax-cdn.cdn.appdomain.cloud/dax-noaa-weather-data-jfk-airport/1.1.4/noaa-weather-data-jfk-airport.tar.gz"
 72 |             ],
 73 |             "include_subdirectories": false,
 74 |             "invalidNodeError": null,
 75 |             "outputs": [
 76 |               "data/noaa-weather-data-jfk-airport/jfk_weather.csv"
 77 |             ],
 78 |             "ui_data": {
 79 |               "label": "load_data",
 80 |               "image": "data:image/svg+xml;utf8,%3Csvg%20xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg%22%20width%3D%2216%22%20viewBox%3D%220%200%2022%2022%22%3E%0A%20%20%3Cg%20class%3D%22jp-icon-brand0%20jp-icon-selectable%22%20fill%3D%22%230D47A1%22%3E%0A%20%20%20%20%3Cpath%20d%3D%22M11.1%206.9V5.8H6.9c0-.5%200-1.3.2-1.6.4-.7.8-1.1%201.7-1.4%201.7-.3%202.5-.3%203.9-.1%201%20.1%201.9.9%201.9%201.9v4.2c0%20.5-.9%201.6-2%201.6H8.8c-1.5%200-2.4%201.4-2.4%202.8v2.2H4.7C3.5%2015.1%203%2014%203%2013.1V9c-.1-1%20.6-2%201.8-2%201.5-.1%206.3-.1%206.3-.1z%22%2F%3E%0A%20%20%20%20%3Cpath%20d%3D%22M10.9%2015.1v1.1h4.2c0%20.5%200%201.3-.2%201.6-.4.7-.8%201.1-1.7%201.4-1.7.3-2.5.3-3.9.1-1-.1-1.9-.9-1.9-1.9v-4.2c0-.5.9-1.6%202-1.6h3.8c1.5%200%202.4-1.4%202.4-2.8V6.6h1.7C18.5%206.9%2019%208%2019%208.9V13c0%201-.7%202.1-1.9%202.1h-6.2z%22%2F%3E%0A%20%20%3C%2Fg%3E%0A%3C%2Fsvg%3E%0A",
 81 |               "x_pos": 65,
 82 |               "y_pos": 241,
 83 |               "description": "Python Script"
 84 |             }
 85 |           },
 86 |           "inputs": [
 87 |             {
 88 |               "id": "inPort",
 89 |               "app_data": {
 90 |                 "ui_data": {
 91 |                   "cardinality": {
 92 |                     "min": 0,
 93 |                     "max": 1
 94 |                   },
 95 |                   "label": "Input Port"
 96 |                 }
 97 |               }
 98 |             }
 99 |           ],
100 |           "outputs": [
101 |             {
102 |               "id": "outPort",
103 |               "app_data": {
104 |                 "ui_data": {
105 |                   "cardinality": {
106 |                     "min": 0,
107 |                     "max": -1
108 |                   },
109 |                   "label": "Output Port"
110 |                 }
111 |               }
112 |             }
113 |           ]
114 |         },
115 |         {
116 |           "id": "67d35156-a7ba-4339-9975-d5eac9be5b1e",
117 |           "type": "execution_node",
118 |           "op": "execute-notebook-node",
119 |           "app_data": {
120 |             "filename": "../notebooks/process_flight_data.ipynb",
121 |             "runtime_image": "amancevice/pandas:1.1.1",
122 |             "env_vars": [],
123 |             "include_subdirectories": false,
124 |             "invalidNodeError": null,
125 |             "outputs": [
126 |               "data/jfk_flight_features.csv"
127 |             ],
128 |             "ui_data": {
129 |               "label": "process_flight_data",
130 |               "image": "data:image/svg+xml;utf8,%3Csvg%20xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg%22%20width%3D%2216%22%20viewBox%3D%220%200%2022%2022%22%3E%0A%20%20%3Cg%20class%3D%22jp-icon-warn0%20jp-icon-selectable%22%20fill%3D%22%23EF6C00%22%3E%0A%20%20%20%20%3Cpath%20d%3D%22M18.7%203.3v15.4H3.3V3.3h15.4m1.5-1.5H1.8v18.3h18.3l.1-18.3z%22%2F%3E%0A%20%20%20%20%3Cpath%20d%3D%22M16.5%2016.5l-5.4-4.3-5.6%204.3v-11h11z%22%2F%3E%0A%20%20%3C%2Fg%3E%0A%3C%2Fsvg%3E%0A",
131 |               "x_pos": 343,
132 |               "y_pos": 97,
133 |               "description": "Notebook file"
134 |             }
135 |           },
136 |           "inputs": [
137 |             {
138 |               "id": "inPort",
139 |               "app_data": {
140 |                 "ui_data": {
141 |                   "cardinality": {
142 |                     "min": 0,
143 |                     "max": 1
144 |                   },
145 |                   "label": "Input Port"
146 |                 }
147 |               },
148 |               "links": [
149 |                 {
150 |                   "id": "eff34dc4-552d-49d1-86ab-ad774c09ed9c",
151 |                   "node_id_ref": "2f3f6243-82ef-43ee-af09-9888a5dfbc30",
152 |                   "port_id_ref": "outPort"
153 |                 }
154 |               ]
155 |             }
156 |           ],
157 |           "outputs": [
158 |             {
159 |               "id": "outPort",
160 |               "app_data": {
161 |                 "ui_data": {
162 |                   "cardinality": {
163 |                     "min": 0,
164 |                     "max": -1
165 |                   },
166 |                   "label": "Output Port"
167 |                 }
168 |               }
169 |             }
170 |           ]
171 |         },
172 |         {
173 |           "id": "71257647-2fc5-4db8-95ef-5813bc386f95",
174 |           "type": "execution_node",
175 |           "op": "execute-notebook-node",
176 |           "app_data": {
177 |             "filename": "../notebooks/process_weather_data.ipynb",
178 |             "runtime_image": "amancevice/pandas:1.1.1",
179 |             "env_vars": [],
180 |             "include_subdirectories": false,
181 |             "invalidNodeError": null,
182 |             "outputs": [
183 |               "data/jfk_weather_features.csv"
184 |             ],
185 |             "ui_data": {
186 |               "label": "process_weather_data",
187 |               "image": "data:image/svg+xml;utf8,%3Csvg%20xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg%22%20width%3D%2216%22%20viewBox%3D%220%200%2022%2022%22%3E%0A%20%20%3Cg%20class%3D%22jp-icon-warn0%20jp-icon-selectable%22%20fill%3D%22%23EF6C00%22%3E%0A%20%20%20%20%3Cpath%20d%3D%22M18.7%203.3v15.4H3.3V3.3h15.4m1.5-1.5H1.8v18.3h18.3l.1-18.3z%22%2F%3E%0A%20%20%20%20%3Cpath%20d%3D%22M16.5%2016.5l-5.4-4.3-5.6%204.3v-11h11z%22%2F%3E%0A%20%20%3C%2Fg%3E%0A%3C%2Fsvg%3E%0A",
188 |               "x_pos": 340,
189 |               "y_pos": 240,
190 |               "description": "Notebook file"
191 |             }
192 |           },
193 |           "inputs": [
194 |             {
195 |               "id": "inPort",
196 |               "app_data": {
197 |                 "ui_data": {
198 |                   "cardinality": {
199 |                     "min": 0,
200 |                     "max": 1
201 |                   },
202 |                   "label": "Input Port"
203 |                 }
204 |               },
205 |               "links": [
206 |                 {
207 |                   "id": "0a85bfc7-3bf3-4885-9026-2bd9fa30b729",
208 |                   "node_id_ref": "b614bf55-c127-413a-935a-175d7afa7f0e",
209 |                   "port_id_ref": "outPort"
210 |                 }
211 |               ]
212 |             }
213 |           ],
214 |           "outputs": [
215 |             {
216 |               "id": "outPort",
217 |               "app_data": {
218 |                 "ui_data": {
219 |                   "cardinality": {
220 |                     "min": 0,
221 |                     "max": -1
222 |                   },
223 |                   "label": "Output Port"
224 |                 }
225 |               }
226 |             }
227 |           ]
228 |         },
229 |         {
230 |           "id": "d70363d7-8c8f-45d2-b539-746fd2a4e14b",
231 |           "type": "execution_node",
232 |           "op": "execute-notebook-node",
233 |           "app_data": {
234 |             "filename": "../notebooks/merge_data.ipynb",
235 |             "runtime_image": "amancevice/pandas:1.1.1",
236 |             "env_vars": [],
237 |             "include_subdirectories": false,
238 |             "invalidNodeError": null,
239 |             "outputs": [
240 |               "data/jfk_flight_weather_features.csv"
241 |             ],
242 |             "ui_data": {
243 |               "label": "merge_data",
244 |               "image": "data:image/svg+xml;utf8,%3Csvg%20xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg%22%20width%3D%2216%22%20viewBox%3D%220%200%2022%2022%22%3E%0A%20%20%3Cg%20class%3D%22jp-icon-warn0%20jp-icon-selectable%22%20fill%3D%22%23EF6C00%22%3E%0A%20%20%20%20%3Cpath%20d%3D%22M18.7%203.3v15.4H3.3V3.3h15.4m1.5-1.5H1.8v18.3h18.3l.1-18.3z%22%2F%3E%0A%20%20%20%20%3Cpath%20d%3D%22M16.5%2016.5l-5.4-4.3-5.6%204.3v-11h11z%22%2F%3E%0A%20%20%3C%2Fg%3E%0A%3C%2Fsvg%3E%0A",
245 |               "x_pos": 623,
246 |               "y_pos": 166,
247 |               "description": "Notebook file"
248 |             }
249 |           },
250 |           "inputs": [
251 |             {
252 |               "id": "inPort",
253 |               "app_data": {
254 |                 "ui_data": {
255 |                   "cardinality": {
256 |                     "min": 0,
257 |                     "max": 2
258 |                   },
259 |                   "label": "Input Port"
260 |                 }
261 |               },
262 |               "links": [
263 |                 {
264 |                   "id": "8d964ba9-5fd5-455d-83ef-0d64ecd05c56",
265 |                   "node_id_ref": "67d35156-a7ba-4339-9975-d5eac9be5b1e",
266 |                   "port_id_ref": "outPort"
267 |                 },
268 |                 {
269 |                   "id": "abfb2d60-15ea-4b54-ba87-5eb82aca5204",
270 |                   "node_id_ref": "71257647-2fc5-4db8-95ef-5813bc386f95",
271 |                   "port_id_ref": "outPort"
272 |                 }
273 |               ]
274 |             }
275 |           ],
276 |           "outputs": [
277 |             {
278 |               "id": "outPort",
279 |               "app_data": {
280 |                 "ui_data": {
281 |                   "cardinality": {
282 |                     "min": 0,
283 |                     "max": -1
284 |                   },
285 |                   "label": "Output Port"
286 |                 }
287 |               }
288 |             }
289 |           ]
290 |         },
291 |         {
292 |           "id": "1d8baea2-b4a8-4a64-b14b-c0ef0c234f01",
293 |           "type": "execution_node",
294 |           "op": "execute-notebook-node",
295 |           "app_data": {
296 |             "filename": "../notebooks/analyze_flight_delays.ipynb",
297 |             "runtime_image": "amancevice/pandas:1.1.1",
298 |             "env_vars": [],
299 |             "include_subdirectories": false,
300 |             "invalidNodeError": null,
301 |             "ui_data": {
302 |               "label": "analyze_flight_delays",
303 |               "image": "data:image/svg+xml;utf8,%3Csvg%20xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg%22%20width%3D%2216%22%20viewBox%3D%220%200%2022%2022%22%3E%0A%20%20%3Cg%20class%3D%22jp-icon-warn0%20jp-icon-selectable%22%20fill%3D%22%23EF6C00%22%3E%0A%20%20%20%20%3Cpath%20d%3D%22M18.7%203.3v15.4H3.3V3.3h15.4m1.5-1.5H1.8v18.3h18.3l.1-18.3z%22%2F%3E%0A%20%20%20%20%3Cpath%20d%3D%22M16.5%2016.5l-5.4-4.3-5.6%204.3v-11h11z%22%2F%3E%0A%20%20%3C%2Fg%3E%0A%3C%2Fsvg%3E%0A",
304 |               "x_pos": 943,
305 |               "y_pos": 94,
306 |               "description": "Notebook file"
307 |             }
308 |           },
309 |           "inputs": [
310 |             {
311 |               "id": "inPort",
312 |               "app_data": {
313 |                 "ui_data": {
314 |                   "cardinality": {
315 |                     "min": 0,
316 |                     "max": 1
317 |                   },
318 |                   "label": "Input Port"
319 |                 }
320 |               },
321 |               "links": [
322 |                 {
323 |                   "id": "809a9b07-21ec-4d30-b607-f0fab979790b",
324 |                   "node_id_ref": "d70363d7-8c8f-45d2-b539-746fd2a4e14b",
325 |                   "port_id_ref": "outPort"
326 |                 }
327 |               ]
328 |             }
329 |           ],
330 |           "outputs": [
331 |             {
332 |               "id": "outPort",
333 |               "app_data": {
334 |                 "ui_data": {
335 |                   "cardinality": {
336 |                     "min": 0,
337 |                     "max": -1
338 |                   },
339 |                   "label": "Output Port"
340 |                 }
341 |               }
342 |             }
343 |           ]
344 |         },
345 |         {
346 |           "id": "c40e2c40-3f4c-4dbb-8e78-3f20f0de116f",
347 |           "type": "execution_node",
348 |           "op": "execute-notebook-node",
349 |           "app_data": {
350 |             "filename": "../notebooks/predict_flight_delays.ipynb",
351 |             "runtime_image": "amancevice/pandas:1.1.1",
352 |             "env_vars": [],
353 |             "include_subdirectories": false,
354 |             "invalidNodeError": null,
355 |             "outputs": [
356 |               "models/model.joblib",
357 |               "data/test_rows.npy"
358 |             ],
359 |             "ui_data": {
360 |               "label": "predict_flight_delays",
361 |               "image": "data:image/svg+xml;utf8,%3Csvg%20xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg%22%20width%3D%2216%22%20viewBox%3D%220%200%2022%2022%22%3E%0A%20%20%3Cg%20class%3D%22jp-icon-warn0%20jp-icon-selectable%22%20fill%3D%22%23EF6C00%22%3E%0A%20%20%20%20%3Cpath%20d%3D%22M18.7%203.3v15.4H3.3V3.3h15.4m1.5-1.5H1.8v18.3h18.3l.1-18.3z%22%2F%3E%0A%20%20%20%20%3Cpath%20d%3D%22M16.5%2016.5l-5.4-4.3-5.6%204.3v-11h11z%22%2F%3E%0A%20%20%3C%2Fg%3E%0A%3C%2Fsvg%3E%0A",
362 |               "x_pos": 942.013916015625,
363 |               "y_pos": 251.00418090820312,
364 |               "description": "Notebook file"
365 |             }
366 |           },
367 |           "inputs": [
368 |             {
369 |               "id": "inPort",
370 |               "app_data": {
371 |                 "ui_data": {
372 |                   "cardinality": {
373 |                     "min": 0,
374 |                     "max": 1
375 |                   },
376 |                   "label": "Input Port"
377 |                 }
378 |               },
379 |               "links": [
380 |                 {
381 |                   "id": "17ab35cb-a52f-4c37-984e-e33bf7b67d36",
382 |                   "node_id_ref": "d70363d7-8c8f-45d2-b539-746fd2a4e14b",
383 |                   "port_id_ref": "outPort"
384 |                 }
385 |               ]
386 |             }
387 |           ],
388 |           "outputs": [
389 |             {
390 |               "id": "outPort",
391 |               "app_data": {
392 |                 "ui_data": {
393 |                   "cardinality": {
394 |                     "min": 0,
395 |                     "max": -1
396 |                   },
397 |                   "label": "Output Port"
398 |                 }
399 |               }
400 |             }
401 |           ]
402 |         },
403 |         {
404 |           "id": "7a4f1f66-4930-4fa4-b5da-293801b3cea6",
405 |           "type": "execution_node",
406 |           "op": "execute-notebook-node",
407 |           "app_data": {
408 |             "filename": "../notebooks/deploy_model.ipynb",
409 |             "runtime_image": "amancevice/pandas:1.1.1",
410 |             "env_vars": [
411 |               "OS_URL=minio-service:9000",
412 |               "ACCESS_KEY_ID=minio",
413 |               "SECRET_ACCESS_KEY=minio123",
414 |               "MODEL_BUCKET=models",
415 |               "MODEL_DIR=models",
416 |               "MODEL_MODE=local"
417 |             ],
418 |             "include_subdirectories": false,
419 |             "invalidNodeError": null,
420 |             "ui_data": {
421 |               "label": "deploy_model",
422 |               "image": "data:image/svg+xml;utf8,%3Csvg%20xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg%22%20width%3D%2216%22%20viewBox%3D%220%200%2022%2022%22%3E%0A%20%20%3Cg%20class%3D%22jp-icon-warn0%20jp-icon-selectable%22%20fill%3D%22%23EF6C00%22%3E%0A%20%20%20%20%3Cpath%20d%3D%22M18.7%203.3v15.4H3.3V3.3h15.4m1.5-1.5H1.8v18.3h18.3l.1-18.3z%22%2F%3E%0A%20%20%20%20%3Cpath%20d%3D%22M16.5%2016.5l-5.4-4.3-5.6%204.3v-11h11z%22%2F%3E%0A%20%20%3C%2Fg%3E%0A%3C%2Fsvg%3E%0A",
423 |               "x_pos": 1227,
424 |               "y_pos": 171,
425 |               "description": "Notebook file"
426 |             }
427 |           },
428 |           "inputs": [
429 |             {
430 |               "id": "inPort",
431 |               "app_data": {
432 |                 "ui_data": {
433 |                   "cardinality": {
434 |                     "min": 0,
435 |                     "max": -1
436 |                   },
437 |                   "label": "Input Port"
438 |                 }
439 |               },
440 |               "links": [
441 |                 {
442 |                   "id": "207b171f-3de3-44fd-a4e1-accbb5fcc14b",
443 |                   "node_id_ref": "c40e2c40-3f4c-4dbb-8e78-3f20f0de116f",
444 |                   "port_id_ref": "outPort"
445 |                 }
446 |               ]
447 |             }
448 |           ],
449 |           "outputs": [
450 |             {
451 |               "id": "outPort",
452 |               "app_data": {
453 |                 "ui_data": {
454 |                   "cardinality": {
455 |                     "min": 0,
456 |                     "max": -1
457 |                   },
458 |                   "label": "Output Port"
459 |                 }
460 |               }
461 |             }
462 |           ]
463 |         }
464 |       ],
465 |       "app_data": {
466 |         "ui_data": {
467 |           "comments": [
468 |             {
469 |               "id": "e5c417b4-9be7-4244-8597-21e0c0e00a70",
470 |               "x_pos": 28,
471 |               "y_pos": 17,
472 |               "width": 175,
473 |               "height": 42,
474 |               "class_name": "d3-comment-rect",
475 |               "content": "Load flight delay data",
476 |               "associated_id_refs": [
477 |                 {
478 |                   "node_ref": "2f3f6243-82ef-43ee-af09-9888a5dfbc30"
479 |                 }
480 |               ]
481 |             },
482 |             {
483 |               "id": "c1e71c34-9ae7-4a85-9251-91c554034cc2",
484 |               "x_pos": 30,
485 |               "y_pos": 338,
486 |               "width": 175,
487 |               "height": 42,
488 |               "class_name": "d3-comment-rect",
489 |               "content": "Load JFK weather data",
490 |               "associated_id_refs": [
491 |                 {
492 |                   "node_ref": "b614bf55-c127-413a-935a-175d7afa7f0e"
493 |                 }
494 |               ]
495 |             },
496 |             {
497 |               "id": "9cd374ba-b6ee-47a3-b963-4f164621d78b",
498 |               "x_pos": 292,
499 |               "y_pos": 15,
500 |               "width": 175,
501 |               "height": 42,
502 |               "class_name": "d3-comment-rect",
503 |               "content": "Clean up & pre-process flight delay data",
504 |               "associated_id_refs": [
505 |                 {
506 |                   "node_ref": "67d35156-a7ba-4339-9975-d5eac9be5b1e"
507 |                 }
508 |               ]
509 |             },
510 |             {
511 |               "id": "44d53e47-c149-4b69-ad6e-259dcd8f8b9f",
512 |               "x_pos": 308,
513 |               "y_pos": 334,
514 |               "width": 175,
515 |               "height": 42,
516 |               "class_name": "d3-comment-rect",
517 |               "content": "Clean up & pre-process weather data",
518 |               "associated_id_refs": [
519 |                 {
520 |                   "node_ref": "71257647-2fc5-4db8-95ef-5813bc386f95"
521 |                 }
522 |               ]
523 |             },
524 |             {
525 |               "id": "b12e0c12-9aa1-4c77-b6d5-0f02f7c64807",
526 |               "x_pos": 579,
527 |               "y_pos": 29,
528 |               "width": 243,
529 |               "height": 64,
530 |               "class_name": "d3-comment-rect",
531 |               "content": "Combine flight delay & weather dataset for downstream analytics & prediction tasks",
532 |               "associated_id_refs": [
533 |                 {
534 |                   "node_ref": "d70363d7-8c8f-45d2-b539-746fd2a4e14b"
535 |                 }
536 |               ]
537 |             },
538 |             {
539 |               "id": "5d7aa386-549a-4c76-8f58-7ead3658fd7f",
540 |               "x_pos": 908,
541 |               "y_pos": 20,
542 |               "width": 175,
543 |               "height": 42,
544 |               "class_name": "d3-comment-rect",
545 |               "content": "Analyze & visualize flight delay & weather data",
546 |               "associated_id_refs": [
547 |                 {
548 |                   "node_ref": "1d8baea2-b4a8-4a64-b14b-c0ef0c234f01"
549 |                 }
550 |               ]
551 |             },
552 |             {
553 |               "id": "bf560da3-5a43-4671-84ea-d98d562e1ec3",
554 |               "x_pos": 860,
555 |               "y_pos": 357,
556 |               "width": 230,
557 |               "height": 52,
558 |               "class_name": "d3-comment-rect",
559 |               "content": "Train & evaluate machine learning models to predict flight delays",
560 |               "associated_id_refs": [
561 |                 {
562 |                   "node_ref": "c40e2c40-3f4c-4dbb-8e78-3f20f0de116f"
563 |                 }
564 |               ]
565 |             },
566 |             {
567 |               "id": "ab7b35f6-20e6-4d63-b8e5-02a41fffa18f",
568 |               "x_pos": 1163,
569 |               "y_pos": 61,
570 |               "width": 175,
571 |               "height": 42,
572 |               "class_name": "d3-comment-rect",
573 |               "content": "Deploy the trained model to Kubeflow Serving",
574 |               "associated_id_refs": [
575 |                 {
576 |                   "node_ref": "7a4f1f66-4930-4fa4-b5da-293801b3cea6"
577 |                 }
578 |               ]
579 |             }
580 |           ]
581 |         },
582 |         "version": 3
583 |       },
584 |       "runtime_ref": ""
585 |     }
586 |   ],
587 |   "schemas": []
588 | }


--------------------------------------------------------------------------------