├── LICENSE ├── README.md ├── docs └── source │ └── images │ ├── deploy-model-results.png │ ├── deploy-node-config.png │ ├── flight-delays-pipeline-deploy.png │ ├── flight-delays-pipeline.png │ ├── kfp-experiment-deploy.png │ ├── kfp-experiment.png │ └── object-storage-results.png ├── flight-delays-env.yaml ├── kfserving.md ├── notebooks ├── analyze_flight_delays.ipynb ├── deploy_model.ipynb ├── load_data.py ├── merge_data.ipynb ├── predict_flight_delays.ipynb ├── process_flight_data.ipynb └── process_weather_data.ipynb └── pipelines ├── flight_delays.pipeline └── flight_delays_with_deployment.pipeline /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Analyzing flight delay and weather data using Elyra, Kubeflow Pipelines and KFServing 2 | 3 | This repository contains a set of Python scripts and Jupyter notebooks that analyze and predict flight delays. The datasets are hosted on the [IBM Developer Data Asset Exchange](https://ibm.biz/data-exchange). 
4 | 5 | We use [Elyra](https://github.com/elyra-ai/elyra) to create a pipeline that can be executed locally or using a [Kubeflow Pipelines](https://www.kubeflow.org/docs/pipelines/overview/pipelines-overview/) runtime. This pipeline: 6 | 7 | * Loads the datasets 8 | * Pre-processes the datasets 9 | * Performs data merging and feature extraction 10 | * Analyzes and visualizes the processed dataset 11 | * Trains and evaluates machine learning models for predicting delayed flights, using features about flights as well as related weather features 12 | * _Optionally_ deploys the trained model to Kubeflow Serving 13 | 14 | ![Flight Delays Pipeline](docs/source/images/flight-delays-pipeline.png) 15 | 16 | ### Configuring the local development environment 17 | 18 | It's highly recommended to create a dedicated and consistent Python environment for running the notebooks in this repository: 19 | 20 | 1. Install [Anaconda](https://docs.anaconda.com/anaconda/install/) 21 | or [Miniconda](https://docs.conda.io/en/latest/miniconda.html) 22 | 1. Navigate to your local copy of this repository. 23 | 1. Create an Anaconda environment from the `yaml` file in the repository: 24 | ```console 25 | $ conda env create -f flight-delays-env.yaml 26 | ``` 27 | 1. Activate the new environment: 28 | ```console 29 | $ conda activate flight-delays-env 30 | ``` 31 | 1. If running JupyterLab and Elyra for the first time, build the extensions: 32 | ```console 33 | $ jupyter lab build 34 | ``` 35 | 1. Launch JupyterLab: 36 | ```console 37 | $ jupyter lab 38 | ``` 39 | 40 | ### Configuring a Kubeflow Pipeline runtime 41 | 42 | [Elyra's Notebook pipeline visual editor](https://elyra.readthedocs.io/en/latest/getting_started/overview.html#notebook-pipelines-visual-editor) 43 | currently supports running these pipelines in a Kubeflow Pipeline runtime. If required, these are 44 | [the steps to install a local deployment of KFP](https://elyra.readthedocs.io/en/latest/recipes/deploying-kubeflow-locally-for-dev.html). 45 | 46 | After installing your Kubeflow Pipeline runtime, use the command below (with proper updates) to configure the new 47 | KFP runtime with Elyra. 48 | 49 | ```bash 50 | elyra-metadata install runtimes --replace=true \ 51 | --schema_name=kfp \ 52 | --name=kfp_runtime \ 53 | --display_name="Kubeflow Pipeline Runtime" \ 54 | --api_endpoint=http://[host]:[api port]/pipeline \ 55 | --cos_endpoint=http://[host]:[cos port] \ 56 | --cos_username=[cos username] \ 57 | --cos_password=[cos password] \ 58 | --cos_bucket=flights 59 | ``` 60 | 61 | **Note:** The cloud object storage endpoint above assumes a local minio object storage, but other cloud-based object storage services could be configured and used in this scenario. 62 | 63 | If using the default minio storage - following the local Kubeflow installation instructions above - the arguments should be `--cos_endpoint=http://minio-service:9000`, `--cos_username=minio`, `--cos_password=minio123`. The API endpoint for local Kubeflow Pipelines would then be `--api_endpoint=http://127.0.0.1:31380/pipeline`. 64 | 65 | **Don't forget to set up port-forwarding for the KFP ML Pipelines API service and Minio service as per the above instructions.** 66 | 67 | ## Elyra Notebook pipelines 68 | 69 | Elyra provides a visual editor for building Notebook-based AI pipelines, simplifying the conversion of 70 | multiple notebooks into batch jobs or workflows.
By leveraging cloud-based resources to run their 71 | experiments faster, data scientists, machine learning engineers, and AI developers become more productive, 72 | freeing them to spend more of their time applying their technical skills. 73 | 74 | ![Notebook pipeline](https://raw.githubusercontent.com/elyra-ai/community/master/resources/blog-announcement/elyra-pipelines.gif) 75 | 76 | ### Running the Elyra pipeline 77 | 78 | The Elyra pipeline `flight_delays.pipeline`, which is located in the `pipelines` directory, can be run by clicking 79 | on the `play` button as seen in the image above. The `submit` dialog will request two inputs from the user: a name 80 | for the pipeline and a runtime to use while executing the pipeline. 81 | 82 | The list of available runtimes comes from the registered Kubeflow Pipelines runtimes documented above and includes a `Run in-place locally` option for local execution. 83 | 84 | #### Local execution 85 | 86 | If running locally, the notebooks are executed and updated in place. You can track the progress in the terminal screen where you ran `jupyter lab`. The downloaded and processed datasets will be available locally in `notebooks/data` in this case. 87 | 88 | #### Kubeflow Pipelines execution 89 | 90 | After submitting the pipeline to Kubeflow Pipelines, Elyra will show a dialog with a direct link to where the experiment is being executed on Kubeflow Pipelines. 91 | 92 | The user can access the pipelines, and the respective experiment runs, via the `api_endpoint` of the Kubeflow Pipelines 93 | runtime (e.g. `http://[host]:[port]/pipeline`). 94 | 95 | ![Pipeline experiment run](docs/source/images/kfp-experiment.png) 96 | 97 | The outputs from the executed experiments are then available in the associated `object storage`, 98 | and the executed notebooks are available both as native `.ipynb` notebooks and in `html` format 99 | to facilitate the visualization and sharing of the results. 100 | 101 | ![Pipeline experiment results in object storage](docs/source/images/object-storage-results.png) 102 | 103 | 104 | ### Running the Elyra pipeline with model deployment to Kubeflow Serving 105 | 106 | Please follow the [instructions](kfserving.md) for running the pipeline `flight_delays_with_deployment.pipeline`, which adds a node at the end of the pipeline for deploying the model to [KFServing](https://www.kubeflow.org/docs/components/serving/kfserving/). 107 | 108 | ### References 109 | 110 | Find more project details on [Elyra's GitHub](https://github.com/elyra-ai/elyra) or by watching the 111 | [Elyra demo](https://www.youtube.com/watch?v=Nj0yga6T4U8).
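 112 | 
### Example: retrieving pipeline outputs from object storage

The executed notebooks and `html` reports described under *Kubeflow Pipelines execution* above can also be fetched programmatically. Below is a minimal sketch using the `minio` Python client (already listed in `flight-delays-env.yaml`). It assumes the default local minio credentials and the `flights` bucket configured earlier, with the minio service port-forwarded to `localhost`; the endpoint and the object name are illustrative and will depend on your setup and on the names Elyra assigns to each pipeline run.

```python
from minio import Minio

# Assumes the default local minio service, reachable on localhost via port-forwarding,
# with the same credentials configured for the Elyra KFP runtime above.
client = Minio(
    "127.0.0.1:9000",
    access_key="minio",
    secret_key="minio123",
    secure=False,
)

bucket = "flights"  # the --cos_bucket configured for the Elyra KFP runtime

# List everything uploaded by pipeline runs (executed notebooks, html reports, data files).
for obj in client.list_objects(bucket, recursive=True):
    print(obj.object_name, obj.size)

# Download one of the executed notebooks for local inspection.
# The object name below is illustrative; use a name printed by the loop above.
client.fget_object(bucket, "analyze_flight_delays.ipynb", "analyze_flight_delays.ipynb")
```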
-------------------------------------------------------------------------------- /docs/source/images/deploy-model-results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODAIT/flight-delay-notebooks/42df0a2ae028fa9c985c2f8fa45fd9569fed6662/docs/source/images/deploy-model-results.png -------------------------------------------------------------------------------- /docs/source/images/deploy-node-config.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODAIT/flight-delay-notebooks/42df0a2ae028fa9c985c2f8fa45fd9569fed6662/docs/source/images/deploy-node-config.png -------------------------------------------------------------------------------- /docs/source/images/flight-delays-pipeline-deploy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODAIT/flight-delay-notebooks/42df0a2ae028fa9c985c2f8fa45fd9569fed6662/docs/source/images/flight-delays-pipeline-deploy.png -------------------------------------------------------------------------------- /docs/source/images/flight-delays-pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODAIT/flight-delay-notebooks/42df0a2ae028fa9c985c2f8fa45fd9569fed6662/docs/source/images/flight-delays-pipeline.png -------------------------------------------------------------------------------- /docs/source/images/kfp-experiment-deploy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODAIT/flight-delay-notebooks/42df0a2ae028fa9c985c2f8fa45fd9569fed6662/docs/source/images/kfp-experiment-deploy.png -------------------------------------------------------------------------------- /docs/source/images/kfp-experiment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODAIT/flight-delay-notebooks/42df0a2ae028fa9c985c2f8fa45fd9569fed6662/docs/source/images/kfp-experiment.png -------------------------------------------------------------------------------- /docs/source/images/object-storage-results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODAIT/flight-delay-notebooks/42df0a2ae028fa9c985c2f8fa45fd9569fed6662/docs/source/images/object-storage-results.png -------------------------------------------------------------------------------- /flight-delays-env.yaml: -------------------------------------------------------------------------------- 1 | name: flight-delays-env 2 | channels: 3 | - defaults 4 | dependencies: 5 | - appnope=0.1.0 6 | - argon2-cffi=20.1.0 7 | - async_generator=1.10 8 | - attrs=20.3.0 9 | - backcall=0.2.0 10 | - blas=1.0 11 | - bleach=3.2.1 12 | - brotlipy=0.7.0 13 | - ca-certificates=2020.10.14 14 | - certifi=2020.6.20 15 | - cffi=1.14.3 16 | - chardet=3.0.4 17 | - cryptography=3.1.1 18 | - cycler=0.10.0 19 | - decorator=4.4.2 20 | - defusedxml=0.6.0 21 | - entrypoints=0.3 22 | - freetype=2.10.4 23 | - idna=2.10 24 | - importlib-metadata=2.0.0 25 | - importlib_metadata=2.0.0 26 | - intel-openmp=2019.4 27 | - ipykernel=5.3.4 28 | - ipython=7.19.0 29 | - ipython_genutils=0.2.0 30 | - jedi=0.17.2 31 | - jinja2=2.11.2 32 | - joblib=0.17.0 33 | - jpeg=9b 34 | - json5=0.9.5 35 | - jsonschema=3.2.0 36 | - jupyter_client=6.1.7 37 | - jupyter_core=4.6.3 38 | - 
jupyterlab=2.2.6 39 | - jupyterlab_pygments=0.1.2 40 | - jupyterlab_server=1.2.0 41 | - kiwisolver=1.3.0 42 | - lcms2=2.11 43 | - libcxx=10.0.0 44 | - libedit=3.1.20191231 45 | - libffi=3.3 46 | - libgfortran=3.0.1 47 | - libpng=1.6.37 48 | - libsodium=1.0.18 49 | - libtiff=4.1.0 50 | - llvm-openmp=10.0.0 51 | - lz4-c=1.9.2 52 | - markupsafe=1.1.1 53 | - matplotlib=3.3.2 54 | - matplotlib-base=3.3.2 55 | - mistune=0.8.4 56 | - mkl=2019.4 57 | - mkl-service=2.3.0 58 | - mkl_fft=1.2.0 59 | - mkl_random=1.1.1 60 | - nbclient=0.5.1 61 | - nbformat=5.0.8 62 | - ncurses=6.2 63 | - nest-asyncio=1.4.2 64 | - notebook=6.1.4 65 | - numpy=1.19.2 66 | - numpy-base=1.19.2 67 | - olefile=0.46 68 | - openssl=1.1.1h 69 | - packaging=20.4 70 | - pandas=1.1.3 71 | - pandoc=2.11 72 | - pandocfilters=1.4.3 73 | - parso=0.7.0 74 | - pexpect=4.8.0 75 | - pickleshare=0.7.5 76 | - pillow=8.0.1 77 | - pip=20.2.4 78 | - prometheus_client=0.8.0 79 | - prompt-toolkit=3.0.8 80 | - ptyprocess=0.6.0 81 | - pycparser=2.20 82 | - pygments=2.7.2 83 | - pyopenssl=19.1.0 84 | - pyparsing=2.4.7 85 | - pyrsistent=0.17.3 86 | - pysocks=1.7.1 87 | - python=3.7.9 88 | - python-dateutil=2.8.1 89 | - pytz=2020.1 90 | - pyzmq=19.0.2 91 | - readline=8.0 92 | - requests=2.24.0 93 | - scikit-learn=0.23.2 94 | - scipy=1.5.2 95 | - seaborn=0.11.0 96 | - send2trash=1.5.0 97 | - setuptools=50.3.1 98 | - six=1.15.0 99 | - sqlite=3.33.0 100 | - terminado=0.9.1 101 | - testpath=0.4.4 102 | - threadpoolctl=2.1.0 103 | - tk=8.6.10 104 | - tornado=6.0.4 105 | - traitlets=5.0.5 106 | - urllib3=1.25.11 107 | - wcwidth=0.2.5 108 | - webencodings=0.5.1 109 | - wheel=0.35.1 110 | - xz=5.2.5 111 | - zeromq=4.3.3 112 | - zipp=3.4.0 113 | - zlib=1.2.11 114 | - zstd=1.4.5 115 | - pip: 116 | - ansiwrap==0.8.4 117 | - appdirs==1.4.4 118 | - autopep8==1.5.4 119 | - black==20.8b1 120 | - bump2version==1.0.1 121 | - bumpversion==0.6.0 122 | - cachetools==4.1.1 123 | - click==7.1.2 124 | - cloudpickle==1.6.0 125 | - colorama==0.4.4 126 | - configparser==5.0.1 127 | - coverage==5.3 128 | - deprecated==1.2.10 129 | - distlib==0.3.1 130 | - docutils==0.16 131 | - elyra==1.4.1 132 | - filelock==3.0.12 133 | - flake8==3.8.4 134 | - gitdb==4.0.5 135 | - gitpython==3.1.11 136 | - google-api-core==1.23.0 137 | - google-auth==1.23.0 138 | - google-cloud-core==1.4.3 139 | - google-cloud-storage==1.32.0 140 | - google-crc32c==1.0.0 141 | - google-resumable-media==1.1.0 142 | - googleapis-common-protos==1.52.0 143 | - jupyterlab-git==0.23.1 144 | - keyring==21.5.0 145 | - kfp==1.0.0 146 | - kfp-notebook==0.14.0 147 | - kfp-server-api==1.1.0a1 148 | - kubernetes==11.0.0 149 | - mccabe==0.6.1 150 | - minio==6.0.0 151 | - mypy-extensions==0.4.3 152 | - nbconvert==5.6.1 153 | - nbdime==2.1.0 154 | - nbresuse==0.3.6 155 | - oauthlib==3.1.0 156 | - papermill==2.2.2 157 | - pathspec==0.8.1 158 | - pathtools==0.1.2 159 | - pkginfo==1.6.1 160 | - pluggy==1.0.0.dev0 161 | - protobuf==3.14.0 162 | - psutil==5.7.3 163 | - py==1.9.0 164 | - pyasn1==0.4.8 165 | - pyasn1-modules==0.2.8 166 | - pycodestyle==2.6.0 167 | - pyflakes==2.2.0 168 | - pyyaml==5.3.1 169 | - readme-renderer==28.0 170 | - regex==2020.11.13 171 | - requests-oauthlib==1.3.0 172 | - requests-toolbelt==0.9.1 173 | - rfc3986==1.4.0 174 | - rfc3986-validator==0.1.1 175 | - rsa==4.6 176 | - smmap==3.0.4 177 | - strip-hints==0.1.9 178 | - tabulate==0.8.7 179 | - tenacity==6.2.0 180 | - textwrap3==0.9.2 181 | - toml==0.10.2 182 | - tox==3.20.1 183 | - tqdm==4.51.0 184 | - twine==3.2.0 185 | - typed-ast==1.4.1 186 | - 
typing-extensions==3.7.4.3 187 | - virtualenv==20.1.0 188 | - watchdog==0.10.3 189 | - websocket-client==0.57.0 190 | - wrapt==1.12.1 191 | prefix: /Users/nick/miniconda3/envs/flight-delays-env 192 | -------------------------------------------------------------------------------- /kfserving.md: -------------------------------------------------------------------------------- 1 | # Model deployment using KFServing 2 | 3 | The `pipelines` folder contains a pipeline - `flight_delays_with_deployment.pipeline` - that includes deploying the trained flight prediction model as a service running in [KFServing](https://www.kubeflow.org/docs/components/serving/kfserving/). 4 | 5 | ![Deployment pipeline](docs/source/images/flight-delays-pipeline-deploy.png) 6 | 7 | In order to run this version of the pipeline, you will need to set up KFServing. 8 | 9 | **Note** this example uses the built-in `minio` object storage service within Kubeflow Pipelines as the storage location for deploying a model to KFServing. Hence, KFP is required unless you manually set up minio or use S3. 10 | 11 | Once KFServing is set up, you can run the pipeline locally or using the KFP runtime, in the same way as the pipeline that excludes the model deployment step. 12 | 13 | ### Configuring a local KFServing runtime 14 | 15 | Follow these steps to configure your KFServing runtime: 16 | 17 | #### Install KFServing 18 | 19 | Install KFServing locally on an existing Kubernetes installation, using [these instructions](https://github.com/kubeflow/kfserving/tree/cd53eb10fc6cf52edb9e6623238ed9aa9fe5af72#install-kfserving-in-5-minutes-on-your-local-machine). You may optionally also have Kubeflow Pipelines installed (see the [main README instructions](../README.md#configuring-a-kubeflow-pipeline-runtime)). 20 | 21 | #### Set up access to object storage 22 | 23 | Once installed and running, you will need to set up a `Secret` and `ServiceAccount` to allow KFServing to access the object storage bucket for the model (refer to [these instructions](https://github.com/kubeflow/kfserving/tree/master/docs/samples/s3#create-s3-secret-and-attach-to-service-account)). 24 | 25 | **Note** we use the `kubeflow` namespace, since the model deployment node within a KFP runtime is not able to create resources in another namespace.
26 | 27 | Run the following command on the command line: 28 | 29 | ```console 30 | cat < /dev/null 2>&1" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": { 49 | "papermill": { 50 | "duration": 2.313344, 51 | "end_time": "2020-11-18T10:42:53.586078", 52 | "exception": false, 53 | "start_time": "2020-11-18T10:42:51.272734", 54 | "status": "completed" 55 | }, 56 | "tags": [] 57 | }, 58 | "outputs": [], 59 | "source": [ 60 | "# Define required imports\n", 61 | "import pandas as pd\n", 62 | "import numpy as np\n", 63 | "import seaborn as sns\n", 64 | "import matplotlib.pyplot as plt\n", 65 | "sns.set_theme(style='darkgrid', palette='deep')\n", 66 | "# These set pandas max column and row display in the notebook\n", 67 | "pd.set_option('display.max_columns', 50)\n", 68 | "pd.set_option('display.max_rows', 50)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": { 74 | "papermill": { 75 | "duration": 0.042908, 76 | "end_time": "2020-11-18T10:42:53.674690", 77 | "exception": false, 78 | "start_time": "2020-11-18T10:42:53.631782", 79 | "status": "completed" 80 | }, 81 | "tags": [] 82 | }, 83 | "source": [ 84 | "### Read the data\n", 85 | "\n", 86 | "We start by reading in the merged flight delay and weather data" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": { 93 | "papermill": { 94 | "duration": 0.138328, 95 | "end_time": "2020-11-18T10:42:53.857510", 96 | "exception": false, 97 | "start_time": "2020-11-18T10:42:53.719182", 98 | "status": "completed" 99 | }, 100 | "tags": [] 101 | }, 102 | "outputs": [], 103 | "source": [ 104 | "flight_path = 'data/jfk_flight_weather_features.csv'\n", 105 | "flight_data = pd.read_csv(flight_path, parse_dates=['flight_date'])\n", 106 | "flight_data.head()" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": { 112 | "papermill": { 113 | "duration": 0.024233, 114 | "end_time": "2020-11-18T10:42:53.906192", 115 | "exception": false, 116 | "start_time": "2020-11-18T10:42:53.881959", 117 | "status": "completed" 118 | }, 119 | "tags": [] 120 | }, 121 | "source": [ 122 | "### Analyze the data\n", 123 | "\n", 124 | "Now we will analyze the data to see if we can gain insight into flight delays.\n", 125 | "\n", 126 | "Let's start by looking at the overall proportion of flights that are delayed." 
127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": { 133 | "papermill": { 134 | "duration": 0.21334, 135 | "end_time": "2020-11-18T10:42:54.143379", 136 | "exception": false, 137 | "start_time": "2020-11-18T10:42:53.930039", 138 | "status": "completed" 139 | }, 140 | "tags": [] 141 | }, 142 | "outputs": [], 143 | "source": [ 144 | "vc = flight_data['delayed'].value_counts()\n", 145 | "perc = vc / sum(vc)\n", 146 | "print('On-time: {:.2f}%'.format(perc[0] * 100))\n", 147 | "print('Delayed: {:.2f}%'.format(perc[1] * 100))\n", 148 | "plt.figure(figsize=(8, 6))\n", 149 | "chart = sns.countplot(data=flight_data, x='delayed')\n", 150 | "chart.set_xticklabels(['On-time', 'Delayed'])\n", 151 | "chart.set_xlabel('Flight status')\n", 152 | "plt.show()" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": { 158 | "papermill": { 159 | "duration": 0.030623, 160 | "end_time": "2020-11-18T10:42:54.195654", 161 | "exception": false, 162 | "start_time": "2020-11-18T10:42:54.165031", 163 | "status": "completed" 164 | }, 165 | "tags": [] 166 | }, 167 | "source": [ 168 | "We see 80% of flights are on-time. Still, a fairly high proportion of 20% of flights are delayed - recall delayed here means more than 15 minutes late!" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": { 174 | "papermill": { 175 | "duration": 0.04472, 176 | "end_time": "2020-11-18T10:42:54.270571", 177 | "exception": false, 178 | "start_time": "2020-11-18T10:42:54.225851", 179 | "status": "completed" 180 | }, 181 | "tags": [] 182 | }, 183 | "source": [ 184 | "#### Analyze and visualize flight delay durations\n", 185 | "Next, we will plot the flight delay (in minutes) over time." 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "metadata": { 192 | "papermill": { 193 | "duration": 0.686545, 194 | "end_time": "2020-11-18T10:42:55.005326", 195 | "exception": false, 196 | "start_time": "2020-11-18T10:42:54.318781", 197 | "status": "completed" 198 | }, 199 | "tags": [] 200 | }, 201 | "outputs": [], 202 | "source": [ 203 | "plt.figure(figsize=(16, 6))\n", 204 | "chart = sns.scatterplot(x='flight_date', y='dep_delay', data=flight_data)\n", 205 | "plt.show()" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": { 211 | "papermill": { 212 | "duration": 0.036333, 213 | "end_time": "2020-11-18T10:42:55.080494", 214 | "exception": false, 215 | "start_time": "2020-11-18T10:42:55.044161", 216 | "status": "completed" 217 | }, 218 | "tags": [] 219 | }, 220 | "source": [ 221 | "There doesn't appear to be any obvious relationship. It is worth noting that most flight delay lengths are very low (clustered around zero), with a relatively small number of very large values (i.e. _outliers_). This may tend to skew analysis based on, for example, analyzing the _average_ flight delay duration. 
This also consistent with our proportion analysis above.\n", 222 | "\n", 223 | "Let's look at whether flight delays are impacted by the day of the week:" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": { 230 | "papermill": { 231 | "duration": 1.367749, 232 | "end_time": "2020-11-18T10:42:56.495574", 233 | "exception": false, 234 | "start_time": "2020-11-18T10:42:55.127825", 235 | "status": "completed" 236 | }, 237 | "tags": [] 238 | }, 239 | "outputs": [], 240 | "source": [ 241 | "plt.figure(figsize=(16, 6))\n", 242 | "days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']\n", 243 | "chart = sns.barplot(x='day_of_week', y='dep_delay', data=flight_data)\n", 244 | "chart.set_xticklabels(days)\n", 245 | "chart.set_xlabel('Day of Week')\n", 246 | "chart.set_ylabel('Departure Delay (min)')\n", 247 | "chart.set_title('Distribution of departure delay by day of week')\n", 248 | "plt.show()" 249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": { 254 | "papermill": { 255 | "duration": 0.035542, 256 | "end_time": "2020-11-18T10:42:56.574443", 257 | "exception": false, 258 | "start_time": "2020-11-18T10:42:56.538901", 259 | "status": "completed" 260 | }, 261 | "tags": [] 262 | }, 263 | "source": [ 264 | "This chart shows the average and confidence interval (standard deviation) for flight delays, grouped by day of week. It appears from the chart that Monday, Friday and Sunday are the worst days to fly, with respect to the average flight delay. Perhaps this is due to a larger volume of flights on those days? We can in fact check this by plotting the total number of flights per weekday in the dataset." 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "metadata": { 271 | "papermill": { 272 | "duration": 0.382649, 273 | "end_time": "2020-11-18T10:42:57.008464", 274 | "exception": false, 275 | "start_time": "2020-11-18T10:42:56.625815", 276 | "status": "completed" 277 | }, 278 | "tags": [] 279 | }, 280 | "outputs": [], 281 | "source": [ 282 | "plt.figure(figsize=(16, 6))\n", 283 | "chart = sns.countplot(x='day_of_week', data=flight_data)\n", 284 | "chart.set_xticklabels(days)\n", 285 | "chart.set_xlabel('Day of Week')\n", 286 | "chart.set_ylabel('Number of flights')\n", 287 | "chart.set_title('Flights by day of week')\n", 288 | "plt.show()" 289 | ] 290 | }, 291 | { 292 | "cell_type": "markdown", 293 | "metadata": { 294 | "papermill": { 295 | "duration": 0.031438, 296 | "end_time": "2020-11-18T10:42:57.070642", 297 | "exception": false, 298 | "start_time": "2020-11-18T10:42:57.039204", 299 | "status": "completed" 300 | }, 301 | "tags": [] 302 | }, 303 | "source": [ 304 | "There doesn't appear to be an obvious correlation between volume of flights and which days experience larger flight delays.\n", 305 | "\n", 306 | "**Note** however, that we are not taking into account volumes of arriving flights in this analysis, which may have an impact!\n", 307 | "\n", 308 | "Recall that the flight delay data appeared to have many outlier values. This means the distribution of flight delays is very skewed. 
We should take a look at a view that takes this into account:" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": null, 314 | "metadata": { 315 | "papermill": { 316 | "duration": 0.582665, 317 | "end_time": "2020-11-18T10:42:57.683888", 318 | "exception": false, 319 | "start_time": "2020-11-18T10:42:57.101223", 320 | "status": "completed" 321 | }, 322 | "tags": [] 323 | }, 324 | "outputs": [], 325 | "source": [ 326 | "plt.figure(figsize=(16, 6))\n", 327 | "days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']\n", 328 | "chart = sns.boxenplot(x='day_of_week', y='dep_delay', data=flight_data)\n", 329 | "chart.set_xticklabels(days)\n", 330 | "chart.set_xlabel('Day of Week')\n", 331 | "chart.set_ylabel('Departure Delay (min)')\n", 332 | "chart.set_title('Distribution of departure delay by day of week')\n", 333 | "plt.show()" 334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "metadata": { 339 | "papermill": { 340 | "duration": 0.059778, 341 | "end_time": "2020-11-18T10:42:57.800830", 342 | "exception": false, 343 | "start_time": "2020-11-18T10:42:57.741052", 344 | "status": "completed" 345 | }, 346 | "tags": [] 347 | }, 348 | "source": [ 349 | "The above chart shows a more detailed distribution of the flight delay for each weekday. This shows that Monday and Friday definitey have some extremely large outlier values that play a role in their higher average flight delays.\n", 350 | "\n", 351 | "We can also see that Friday and Sunday have \"wider\" and \"higher\" blocks at moderately higher flight delay levels. This contributes to the higher average flight delays and tells us that outliers alone are not fully to blame for the higher average delays on these days.\n", 352 | "\n", 353 | "It is usually wise to dig a bit deeper when visualizing skewed or imbalanced datasets. " 354 | ] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "metadata": { 359 | "papermill": { 360 | "duration": 0.04439, 361 | "end_time": "2020-11-18T10:42:57.890880", 362 | "exception": false, 363 | "start_time": "2020-11-18T10:42:57.846490", 364 | "status": "completed" 365 | }, 366 | "tags": [] 367 | }, 368 | "source": [ 369 | "#### Analyze and visualize flight delay proportions for flight features\n", 370 | "Next, we will analyze the proportion of flights that are delayed, for given sets of features in our dataset related to the **flight** itself. 
Since we wish to build a classifier, this analysis can help us to understand which features may be indicative of greater probability of a flight delay and which features have little impact.\n", 371 | "\n", 372 | "First, we will define a convenience function to create our stacked proportion charts:" 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": null, 378 | "metadata": { 379 | "papermill": { 380 | "duration": 0.072936, 381 | "end_time": "2020-11-18T10:42:58.043088", 382 | "exception": false, 383 | "start_time": "2020-11-18T10:42:57.970152", 384 | "status": "completed" 385 | }, 386 | "tags": [] 387 | }, 388 | "outputs": [], 389 | "source": [ 390 | "def plot_stacked_by_col(col, x_label, rotation=0, horizontalalignment='center', xticks=None):\n", 391 | " grouped = flight_data['delayed'].groupby(flight_data[col]).value_counts()\n", 392 | " g = grouped.groupby(level=0).apply(lambda x: 100 * x / float(x.sum()))\n", 393 | "\n", 394 | " chart = g.unstack().plot(kind='bar', stacked=True, figsize=(16, 6))\n", 395 | " chart.set_xticklabels(\n", 396 | " xticks if xticks else chart.get_xticklabels(),\n", 397 | " rotation=rotation, \n", 398 | " horizontalalignment=horizontalalignment,\n", 399 | " fontweight='light',\n", 400 | " fontsize='medium'\n", 401 | " )\n", 402 | " chart.set_xlabel(x_label)\n", 403 | " chart.set_ylabel('Proportion delayed')\n", 404 | " chart.set_title('Proportion of flights delayed, by {}'.format(x_label))\n", 405 | " plt.show()" 406 | ] 407 | }, 408 | { 409 | "cell_type": "markdown", 410 | "metadata": { 411 | "papermill": { 412 | "duration": 0.0663, 413 | "end_time": "2020-11-18T10:42:58.160817", 414 | "exception": false, 415 | "start_time": "2020-11-18T10:42:58.094517", 416 | "status": "completed" 417 | }, 418 | "tags": [] 419 | }, 420 | "source": [ 421 | "Let's start by analyzing proportion of flights delayed by weekday, continuing the theme of our analsis above." 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": null, 427 | "metadata": { 428 | "papermill": { 429 | "duration": 0.716892, 430 | "end_time": "2020-11-18T10:42:58.944322", 431 | "exception": false, 432 | "start_time": "2020-11-18T10:42:58.227430", 433 | "status": "completed" 434 | }, 435 | "tags": [] 436 | }, 437 | "outputs": [], 438 | "source": [ 439 | "plot_stacked_by_col('day_of_week', 'Day of Week', xticks=days)" 440 | ] 441 | }, 442 | { 443 | "cell_type": "markdown", 444 | "metadata": { 445 | "papermill": { 446 | "duration": 0.041598, 447 | "end_time": "2020-11-18T10:42:59.039796", 448 | "exception": false, 449 | "start_time": "2020-11-18T10:42:58.998198", 450 | "status": "completed" 451 | }, 452 | "tags": [] 453 | }, 454 | "source": [ 455 | "This chart roughly matches the delay duration charts above, with Monday, Friday and Sunday having the highest proportion of delayed flights, while Tuesday and Wednesday are the \"best\" days. This indicates that day of the week may be at least somewhat useful for predicting flight delays.\n", 456 | "\n", 457 | "Next, we plot the proportions by departure time (where departure times are grouped into hourly buckets, with the exception of a larger bucket for \"early morning flights\")." 
458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "execution_count": null, 463 | "metadata": { 464 | "papermill": { 465 | "duration": 1.05987, 466 | "end_time": "2020-11-18T10:43:00.138704", 467 | "exception": false, 468 | "start_time": "2020-11-18T10:42:59.078834", 469 | "status": "completed" 470 | }, 471 | "tags": [] 472 | }, 473 | "outputs": [], 474 | "source": [ 475 | "plot_stacked_by_col('dep_time_bin', 'Departure Time Bucket', rotation=45, horizontalalignment='right')" 476 | ] 477 | }, 478 | { 479 | "cell_type": "markdown", 480 | "metadata": { 481 | "papermill": { 482 | "duration": 0.042265, 483 | "end_time": "2020-11-18T10:43:00.211480", 484 | "exception": false, 485 | "start_time": "2020-11-18T10:43:00.169215", 486 | "status": "completed" 487 | }, 488 | "tags": [] 489 | }, 490 | "source": [ 491 | "It seems clear that flights later in the day have a generally higher chance of being delayed, relative to flights in the morning (and especially early morning). Perhaps this is related to flight volumes - are flight volumes lower in the early morning?\n", 492 | "\n", 493 | "Again, we can check this by plotting the number of flights per departure time bucket." 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": null, 499 | "metadata": { 500 | "papermill": { 501 | "duration": 0.552196, 502 | "end_time": "2020-11-18T10:43:00.795416", 503 | "exception": false, 504 | "start_time": "2020-11-18T10:43:00.243220", 505 | "status": "completed" 506 | }, 507 | "tags": [] 508 | }, 509 | "outputs": [], 510 | "source": [ 511 | "plt.figure(figsize=(16, 6))\n", 512 | "chart = sns.countplot(\n", 513 | " x='dep_time_bin',\n", 514 | " data=flight_data,\n", 515 | " order=flight_data.groupby(flight_data['dep_time_bin']).groups.keys())\n", 516 | "chart.set_xticklabels(\n", 517 | " chart.get_xticklabels(),\n", 518 | " rotation=45, \n", 519 | " horizontalalignment='right',\n", 520 | " fontweight='light',\n", 521 | " fontsize='medium'\n", 522 | ")\n", 523 | "chart.set_xlabel('Departure Time Bucket')\n", 524 | "chart.set_ylabel('Count')\n", 525 | "chart.set_title('Flights by Departure Time Bucket')\n", 526 | "plt.show()" 527 | ] 528 | }, 529 | { 530 | "cell_type": "markdown", 531 | "metadata": { 532 | "papermill": { 533 | "duration": 0.048324, 534 | "end_time": "2020-11-18T10:43:00.878107", 535 | "exception": false, 536 | "start_time": "2020-11-18T10:43:00.829783", 537 | "status": "completed" 538 | }, 539 | "tags": [] 540 | }, 541 | "source": [ 542 | "While there are definitely relatively fewer very early flights, there are more flights in the early morning, and these are less likely to be delayed than afternoon flights, depsite flight volumes being similar between the two groups. Also, there are relatively lower volumes of late night flights, while these are relatively more likely to be delayed. So, flight volumes don't seem to play much of a role.\n", 543 | "\n", 544 | "**Note** however, that we are not taking into account volumes of arriving flights in this analysis, which may have an impact!\n", 545 | "\n", 546 | "Next, let's see if a particular airline's flights are more likely to be delayed." 
547 | ] 548 | }, 549 | { 550 | "cell_type": "code", 551 | "execution_count": null, 552 | "metadata": { 553 | "papermill": { 554 | "duration": 0.663351, 555 | "end_time": "2020-11-18T10:43:01.579032", 556 | "exception": false, 557 | "start_time": "2020-11-18T10:43:00.915681", 558 | "status": "completed" 559 | }, 560 | "tags": [] 561 | }, 562 | "outputs": [], 563 | "source": [ 564 | "plot_stacked_by_col('airline_name', 'Airline', rotation=45, horizontalalignment='right')" 565 | ] 566 | }, 567 | { 568 | "cell_type": "markdown", 569 | "metadata": { 570 | "papermill": { 571 | "duration": 0.075645, 572 | "end_time": "2020-11-18T10:43:01.736584", 573 | "exception": false, 574 | "start_time": "2020-11-18T10:43:01.660939", 575 | "status": "completed" 576 | }, 577 | "tags": [] 578 | }, 579 | "source": [ 580 | "It seems like the airline does have some impact on delay proportion (note volumes for some smaller airlines may be quite low due to sampling). How about flight destination?" 581 | ] 582 | }, 583 | { 584 | "cell_type": "code", 585 | "execution_count": null, 586 | "metadata": { 587 | "papermill": { 588 | "duration": 4.987105, 589 | "end_time": "2020-11-18T10:43:06.793705", 590 | "exception": false, 591 | "start_time": "2020-11-18T10:43:01.806600", 592 | "status": "completed" 593 | }, 594 | "tags": [] 595 | }, 596 | "outputs": [], 597 | "source": [ 598 | "plot_stacked_by_col('dest', 'Destination Airport', rotation=65, horizontalalignment='right')" 599 | ] 600 | }, 601 | { 602 | "cell_type": "markdown", 603 | "metadata": { 604 | "papermill": { 605 | "duration": 0.054271, 606 | "end_time": "2020-11-18T10:43:06.893444", 607 | "exception": false, 608 | "start_time": "2020-11-18T10:43:06.839173", 609 | "status": "completed" 610 | }, 611 | "tags": [] 612 | }, 613 | "source": [ 614 | "Again, it appears there is a relationship between proportion of flights delayed and the flight destination (the same caveats with respect to sampled data as mentioned above, would apply here).\n", 615 | "\n", 616 | "Finally, what about flight distance?" 617 | ] 618 | }, 619 | { 620 | "cell_type": "code", 621 | "execution_count": null, 622 | "metadata": { 623 | "papermill": { 624 | "duration": 0.705268, 625 | "end_time": "2020-11-18T10:43:07.647698", 626 | "exception": false, 627 | "start_time": "2020-11-18T10:43:06.942430", 628 | "status": "completed" 629 | }, 630 | "tags": [] 631 | }, 632 | "outputs": [], 633 | "source": [ 634 | "plot_stacked_by_col('distance_bin', 'Distance Bin', rotation=0, horizontalalignment='center')" 635 | ] 636 | }, 637 | { 638 | "cell_type": "markdown", 639 | "metadata": { 640 | "papermill": { 641 | "duration": 0.043425, 642 | "end_time": "2020-11-18T10:43:07.737420", 643 | "exception": false, 644 | "start_time": "2020-11-18T10:43:07.693995", 645 | "status": "completed" 646 | }, 647 | "tags": [] 648 | }, 649 | "source": [ 650 | "It seems like there may be some relationship, though it's not particlarly clear - shorter flights and longer flights tend to have roughly similar proportions." 
651 | ] 652 | }, 653 | { 654 | "cell_type": "markdown", 655 | "metadata": { 656 | "papermill": { 657 | "duration": 0.039239, 658 | "end_time": "2020-11-18T10:43:07.828689", 659 | "exception": false, 660 | "start_time": "2020-11-18T10:43:07.789450", 661 | "status": "completed" 662 | }, 663 | "tags": [] 664 | }, 665 | "source": [ 666 | "#### Analyze and visualize flight delay proportions for weather features\n", 667 | "Now, we will analyze the proportion of flights that are delayed, for given sets of **weather** features in our dataset." 668 | ] 669 | }, 670 | { 671 | "cell_type": "code", 672 | "execution_count": null, 673 | "metadata": { 674 | "papermill": { 675 | "duration": 0.852436, 676 | "end_time": "2020-11-18T10:43:08.721519", 677 | "exception": false, 678 | "start_time": "2020-11-18T10:43:07.869083", 679 | "status": "completed" 680 | }, 681 | "tags": [] 682 | }, 683 | "outputs": [], 684 | "source": [ 685 | "# create sub-plots for a few weather conditions\n", 686 | "\n", 687 | "ax = plt.subplot(221)\n", 688 | "grouped = flight_data['delayed'].groupby(flight_data['drizzle']).value_counts()\n", 689 | "g = grouped.groupby(level=0).apply(lambda x: 100 * x / float(x.sum()))\n", 690 | "chart = g.unstack().plot(kind='bar', stacked=True, figsize=(18, 10), ax=ax)\n", 691 | "chart.set_xticklabels(['No', 'Yes'], rotation=0)\n", 692 | "chart.set_xlabel('Drizzle')\n", 693 | "chart.set_ylabel('Proportion delayed')\n", 694 | "\n", 695 | "ax = plt.subplot(222)\n", 696 | "grouped = flight_data['delayed'].groupby(flight_data['mist']).value_counts()\n", 697 | "g = grouped.groupby(level=0).apply(lambda x: 100 * x / float(x.sum()))\n", 698 | "chart = g.unstack().plot(kind='bar', stacked=True, figsize=(18, 10), ax=ax)\n", 699 | "chart.set_xticklabels(['No', 'Yes'], rotation=0)\n", 700 | "chart.set_xlabel('Mist')\n", 701 | "chart.set_ylabel('Proportion delayed')\n", 702 | "\n", 703 | "ax = plt.subplot(223)\n", 704 | "grouped = flight_data['delayed'].groupby(flight_data['snow']).value_counts()\n", 705 | "g = grouped.groupby(level=0).apply(lambda x: 100 * x / float(x.sum()))\n", 706 | "chart = g.unstack().plot(kind='bar', stacked=True, figsize=(18, 10), ax=ax)\n", 707 | "chart.set_xticklabels(['No', 'Yes'], rotation=0)\n", 708 | "chart.set_xlabel('Snow')\n", 709 | "chart.set_ylabel('Proportion delayed')\n", 710 | "\n", 711 | "ax = plt.subplot(224)\n", 712 | "grouped = flight_data['delayed'].groupby(flight_data['thunderstorm']).value_counts()\n", 713 | "g = grouped.groupby(level=0).apply(lambda x: 100 * x / float(x.sum()))\n", 714 | "chart = g.unstack().plot(kind='bar', stacked=True, figsize=(18, 10), ax=ax)\n", 715 | "chart.set_xticklabels(['No', 'Yes'], rotation=0)\n", 716 | "chart.set_xlabel('Thunderstorm')\n", 717 | "chart.set_ylabel('Proportion delayed')\n", 718 | "\n", 719 | "plt.show()" 720 | ] 721 | }, 722 | { 723 | "cell_type": "markdown", 724 | "metadata": { 725 | "papermill": { 726 | "duration": 0.04203, 727 | "end_time": "2020-11-18T10:43:08.797469", 728 | "exception": false, 729 | "start_time": "2020-11-18T10:43:08.755439", 730 | "status": "completed" 731 | }, 732 | "tags": [] 733 | }, 734 | "source": [ 735 | "From these charts, it appears that the presence of \"drizzle\" does not impact on whether a flight is likely to be delayed - as we might expect. However, if there is snow or a thunderstorm, for example, it appears flight delays are much more likely.\n", 736 | "\n", 737 | "We have touched on only a little of the analysis of weather features that could be performed. 
For example, one could explore more of the weather conditions similarly to the cell above; or investigate the potential relationship between features such as `visibility`, `wind_speed` and `precip` to both proportions of flights delayed as well as duration of flight delays. " 738 | ] 739 | }, 740 | { 741 | "cell_type": "markdown", 742 | "metadata": { 743 | "papermill": { 744 | "duration": 0.051397, 745 | "end_time": "2020-11-18T10:43:08.896930", 746 | "exception": false, 747 | "start_time": "2020-11-18T10:43:08.845533", 748 | "status": "completed" 749 | }, 750 | "tags": [] 751 | }, 752 | "source": [ 753 | " \n", 754 | "### Authors" 755 | ] 756 | }, 757 | { 758 | "cell_type": "markdown", 759 | "metadata": { 760 | "papermill": { 761 | "duration": 0.073045, 762 | "end_time": "2020-11-18T10:43:09.014783", 763 | "exception": false, 764 | "start_time": "2020-11-18T10:43:08.941738", 765 | "status": "completed" 766 | }, 767 | "tags": [] 768 | }, 769 | "source": [ 770 | "This notebook was created by the [Center for Open-Source Data & AI Technologies](http://codait.org).\n", 771 | "\n", 772 | "Copyright © 2019 IBM. This notebook and its source code are released under the terms of the MIT License." 773 | ] 774 | } 775 | ], 776 | "metadata": { 777 | "kernelspec": { 778 | "display_name": "Python 3", 779 | "language": "python", 780 | "name": "python3" 781 | }, 782 | "language_info": { 783 | "codemirror_mode": { 784 | "name": "ipython", 785 | "version": 3 786 | }, 787 | "file_extension": ".py", 788 | "mimetype": "text/x-python", 789 | "name": "python", 790 | "nbconvert_exporter": "python", 791 | "pygments_lexer": "ipython3", 792 | "version": "3.7.9" 793 | }, 794 | "papermill": { 795 | "duration": 23.694453, 796 | "end_time": "2020-11-18T10:43:09.616322", 797 | "environment_variables": {}, 798 | "exception": null, 799 | "input_path": "/Users/nick/workspace/python/flight-delay-notebooks/notebooks/analyze_flight_delays.ipynb", 800 | "output_path": "/Users/nick/workspace/python/flight-delay-notebooks/notebooks/analyze_flight_delays.ipynb", 801 | "parameters": {}, 802 | "start_time": "2020-11-18T10:42:45.921869", 803 | "version": "2.1.1" 804 | }, 805 | "toc-autonumbering": false, 806 | "toc-showcode": false, 807 | "toc-showmarkdowntxt": false, 808 | "toc-showtags": false 809 | }, 810 | "nbformat": 4, 811 | "nbformat_minor": 4 812 | } 813 | -------------------------------------------------------------------------------- /notebooks/deploy_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "papermill": { 7 | "duration": 0.04375, 8 | "end_time": "2020-11-18T16:57:46.111418", 9 | "exception": false, 10 | "start_time": "2020-11-18T16:57:46.067668", 11 | "status": "completed" 12 | }, 13 | "tags": [] 14 | }, 15 | "source": [ 16 | "# Deploying the Flight Delay Model\n", 17 | "\n", 18 | "In this notebook, we deploy the model we trained to predict flight delays, using [Kubeflow Serving](https://www.kubeflow.org/docs/components/serving/kfserving/).\n", 19 | "\n", 20 | "**Note** this notebook requires access to a KFServing installation. See the [KFServing instructions](../kfserving.md) for details. If running the pipeline on the Kubeflow Pipelines runtime, also see the [readme instructions](../README.md) for the link to install KFP.\n", 21 | "\n", 22 | "#### Import required modules\n", 23 | "\n", 24 | "Import and configure the required modules." 
25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": { 31 | "papermill": { 32 | "duration": 17.423869, 33 | "end_time": "2020-11-18T16:58:03.571681", 34 | "exception": false, 35 | "start_time": "2020-11-18T16:57:46.147812", 36 | "status": "completed" 37 | }, 38 | "tags": [] 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "! pip install -q kfserving" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": { 49 | "papermill": { 50 | "duration": 0.811648, 51 | "end_time": "2020-11-18T16:58:04.400939", 52 | "exception": false, 53 | "start_time": "2020-11-18T16:58:03.589291", 54 | "status": "completed" 55 | }, 56 | "tags": [] 57 | }, 58 | "outputs": [], 59 | "source": [ 60 | "import os\n", 61 | "import numpy as np\n", 62 | "import requests\n", 63 | "# minio is installed as part of kfserving\n", 64 | "from minio import Minio\n", 65 | "from minio.error import NoSuchBucket" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": { 71 | "papermill": { 72 | "duration": 0.042401, 73 | "end_time": "2020-11-18T16:58:04.478066", 74 | "exception": false, 75 | "start_time": "2020-11-18T16:58:04.435665", 76 | "status": "completed" 77 | }, 78 | "tags": [] 79 | }, 80 | "source": [ 81 | "### Upload the model to object storage\n", 82 | "\n", 83 | "Our notebook has access to the trained model file, which was exported by the previous pipeline phase. _However_, when using a Kubeflow Pipelines runtime, it is not possible to programmatically access the object storage bucket. It would also make the execution mechanics differ between local and KFP execution modes.\n", 84 | "\n", 85 | "So, here we will use a dedicated bucket for models in object storage, and upload it from the notebook execution environment. We will then deploy the KFServing inference service using that object storage location.
86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": { 92 | "papermill": { 93 | "duration": 5.189059, 94 | "end_time": "2020-11-18T16:58:09.715558", 95 | "exception": false, 96 | "start_time": "2020-11-18T16:58:04.526499", 97 | "status": "completed" 98 | }, 99 | "tags": [] 100 | }, 101 | "outputs": [], 102 | "source": [ 103 | "# set up the minio client to access object storage buckets\n", 104 | "os_url = os.environ.get('OS_URL', 'minio-service:9000')\n", 105 | "access_key = os.environ.get('ACCESS_KEY_ID', 'minio')\n", 106 | "secret_key = os.environ.get('SECRET_ACCESS_KEY', 'minio123')\n", 107 | "\n", 108 | "mc = Minio(os_url,\n", 109 | " access_key=access_key,\n", 110 | " secret_key=secret_key,\n", 111 | " secure=False)\n", 112 | "\n", 113 | "print('Current buckets:')\n", 114 | "for b in mc.list_buckets():\n", 115 | " print(' ' + b.name)" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": { 122 | "papermill": { 123 | "duration": 0.158499, 124 | "end_time": "2020-11-18T16:58:09.903405", 125 | "exception": false, 126 | "start_time": "2020-11-18T16:58:09.744906", 127 | "status": "completed" 128 | }, 129 | "tags": [] 130 | }, 131 | "outputs": [], 132 | "source": [ 133 | "# create a bucket to upload the model file to\n", 134 | "# Note: if the model file already exists we delete it\n", 135 | "model_bucket = os.environ.get('MODEL_BUCKET', 'models')\n", 136 | "model_dir = os.environ.get('MODEL_DIR', 'models')\n", 137 | "model_file = 'model.joblib'\n", 138 | "model_path = '{}/{}'.format(model_dir, model_file)\n", 139 | "\n", 140 | "try:\n", 141 | " # delete model file if if exists \n", 142 | " mc.remove_object(model_bucket, model_file)\n", 143 | "except NoSuchBucket:\n", 144 | " # the bucket doesn't exist - create it\n", 145 | " print('Creating bucket [{}]'.format(model_bucket))\n", 146 | " mc.make_bucket(model_bucket)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": { 153 | "papermill": { 154 | "duration": 0.148869, 155 | "end_time": "2020-11-18T16:58:10.075811", 156 | "exception": false, 157 | "start_time": "2020-11-18T16:58:09.926942", 158 | "status": "completed" 159 | }, 160 | "tags": [] 161 | }, 162 | "outputs": [], 163 | "source": [ 164 | "# upload the model file\n", 165 | "file_stat = os.stat(model_path)\n", 166 | "with open(model_path, 'rb') as data:\n", 167 | " mc.put_object(model_bucket, model_file, data, file_stat.st_size)" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": { 174 | "papermill": { 175 | "duration": 0.083705, 176 | "end_time": "2020-11-18T16:58:10.193249", 177 | "exception": false, 178 | "start_time": "2020-11-18T16:58:10.109544", 179 | "status": "completed" 180 | }, 181 | "tags": [] 182 | }, 183 | "outputs": [], 184 | "source": [ 185 | "# check whether the model file is there\n", 186 | "for o in mc.list_objects(model_bucket, prefix=model_file):\n", 187 | " print(o)" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": { 193 | "papermill": { 194 | "duration": 0.052238, 195 | "end_time": "2020-11-18T16:58:10.293275", 196 | "exception": false, 197 | "start_time": "2020-11-18T16:58:10.241037", 198 | "status": "completed" 199 | }, 200 | "tags": [] 201 | }, 202 | "source": [ 203 | "### Create the inference service\n", 204 | "\n", 205 | "Next, we use the KFServing Python client to create the inference service.\n", 206 | "\n", 207 | "**Note** the prerequisites (see 
the [KF Serving instructions](../kfserving.md)):\n", 208 | "1. A service account and related secret for the object storage service\n", 209 | "1. Specify the custom `sklearnserver` Docker image\n", 210 | "1. Patch the KFP `pipeline-runner` service account role to allow creating a KFServing `inferenceservice`" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": { 217 | "papermill": { 218 | "duration": 5.594827, 219 | "end_time": "2020-11-18T16:58:15.938195", 220 | "exception": false, 221 | "start_time": "2020-11-18T16:58:10.343368", 222 | "status": "completed" 223 | }, 224 | "tags": [] 225 | }, 226 | "outputs": [], 227 | "source": [ 228 | "from kubernetes import client\n", 229 | "\n", 230 | "from kfserving import KFServingClient\n", 231 | "from kfserving import constants\n", 232 | "from kfserving import utils\n", 233 | "from kfserving import V1alpha2EndpointSpec\n", 234 | "from kfserving import V1alpha2PredictorSpec\n", 235 | "from kfserving import V1alpha2SKLearnSpec\n", 236 | "from kfserving import V1alpha2InferenceServiceSpec\n", 237 | "from kfserving import V1alpha2InferenceService\n", 238 | "from kubernetes.client import V1ResourceRequirements" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "metadata": { 245 | "papermill": { 246 | "duration": 0.107594, 247 | "end_time": "2020-11-18T16:58:16.085369", 248 | "exception": false, 249 | "start_time": "2020-11-18T16:58:15.977775", 250 | "status": "completed" 251 | }, 252 | "tags": [] 253 | }, 254 | "outputs": [], 255 | "source": [ 256 | "KFServing = KFServingClient()" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "metadata": { 263 | "papermill": { 264 | "duration": 0.063151, 265 | "end_time": "2020-11-18T16:58:16.189803", 266 | "exception": false, 267 | "start_time": "2020-11-18T16:58:16.126652", 268 | "status": "completed" 269 | }, 270 | "tags": [] 271 | }, 272 | "outputs": [], 273 | "source": [ 274 | "# we need to use the 'kubeflow' namespace so that the KFP runner can create the inference service\n", 275 | "namespace = 'kubeflow'\n", 276 | "# this is the service account created for S3 access credentials\n", 277 | "service_acc = 'kfserving-sa'\n", 278 | "model_storage_uri = 's3://{}'.format(model_bucket)\n", 279 | "model_name = 'flight-model'" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "metadata": { 286 | "papermill": { 287 | "duration": 2.893991, 288 | "end_time": "2020-11-18T16:58:19.129355", 289 | "exception": false, 290 | "start_time": "2020-11-18T16:58:16.235364", 291 | "status": "completed" 292 | }, 293 | "tags": [] 294 | }, 295 | "outputs": [], 296 | "source": [ 297 | "api_version = constants.KFSERVING_GROUP + '/' + constants.KFSERVING_VERSION\n", 298 | "default_endpoint_spec = V1alpha2EndpointSpec(\n", 299 | " predictor=V1alpha2PredictorSpec(\n", 300 | " sklearn=V1alpha2SKLearnSpec(\n", 301 | " storage_uri=model_storage_uri,\n", 302 | " resources=V1ResourceRequirements(\n", 303 | " requests={'cpu':'100m','memory':'1Gi'},\n", 304 | " limits={'cpu':'100m', 'memory':'1Gi'}\n", 305 | " )\n", 306 | " ),\n", 307 | " service_account_name=service_acc\n", 308 | " )\n", 309 | ")\n", 310 | " \n", 311 | "isvc = V1alpha2InferenceService(api_version=api_version,\n", 312 | " kind=constants.KFSERVING_KIND,\n", 313 | " metadata=client.V1ObjectMeta(\n", 314 | " name=model_name, namespace=namespace),\n", 315 | " 
spec=V1alpha2InferenceServiceSpec(default=default_endpoint_spec))\n", 316 | "KFServing.create(isvc)" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "metadata": { 323 | "papermill": { 324 | "duration": 119.962162, 325 | "end_time": "2020-11-18T17:00:19.130621", 326 | "exception": false, 327 | "start_time": "2020-11-18T16:58:19.168459", 328 | "status": "completed" 329 | }, 330 | "tags": [] 331 | }, 332 | "outputs": [], 333 | "source": [ 334 | "# Wait for the inference service to be ready\n", 335 | "KFServing.get(model_name, namespace=namespace, watch=True, timeout_seconds=120)" 336 | ] 337 | }, 338 | { 339 | "cell_type": "markdown", 340 | "metadata": { 341 | "papermill": { 342 | "duration": 0.044726, 343 | "end_time": "2020-11-18T17:00:19.224273", 344 | "exception": false, 345 | "start_time": "2020-11-18T17:00:19.179547", 346 | "status": "completed" 347 | }, 348 | "tags": [] 349 | }, 350 | "source": [ 351 | "### Test the inference service\n", 352 | "\n", 353 | "Once the inference service is running and available, we can send some test data to the service.\n", 354 | "\n", 355 | "**Note** that when deployed into KFP, we need to use the cluster-local url for the model. When executing locally, we assume that port-forwarding is enabled to allow access to the ingress gateway." 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "metadata": { 362 | "papermill": { 363 | "duration": 0.084982, 364 | "end_time": "2020-11-18T17:00:19.360922", 365 | "exception": false, 366 | "start_time": "2020-11-18T17:00:19.275940", 367 | "status": "completed" 368 | }, 369 | "tags": [] 370 | }, 371 | "outputs": [], 372 | "source": [ 373 | "service = KFServing.get(model_name, namespace=namespace)" 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": null, 379 | "metadata": { 380 | "papermill": { 381 | "duration": 0.078281, 382 | "end_time": "2020-11-18T17:00:19.493930", 383 | "exception": false, 384 | "start_time": "2020-11-18T17:00:19.415649", 385 | "status": "completed" 386 | }, 387 | "tags": [] 388 | }, 389 | "outputs": [], 390 | "source": [ 391 | "# load the 10 example rows from our test data, and display a few rows\n", 392 | "examples = np.load('data/test_rows.npy')\n", 393 | "examples[:3]" 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": null, 399 | "metadata": { 400 | "papermill": { 401 | "duration": 1.394202, 402 | "end_time": "2020-11-18T17:00:20.943882", 403 | "exception": true, 404 | "start_time": "2020-11-18T17:00:19.549680", 405 | "status": "failed" 406 | }, 407 | "tags": [] 408 | }, 409 | "outputs": [], 410 | "source": [ 411 | "model_mode = os.environ.get('MODEL_MODE', 'local')\n", 412 | "model_data = {\"instances\": examples.tolist()}\n", 413 | "if model_mode == 'local':\n", 414 | " # executing locally, use the ingress gateway (we assume port-forwarding) \n", 415 | " url = f'http://localhost:8080/v1/models/{model_name}:predict'\n", 416 | " service_hostname = '{}.{}.example.com'.format(model_name, namespace)\n", 417 | " headers = {'Host': service_hostname}\n", 418 | " resp = requests.post(url=url, json=model_data, headers=headers)\n", 419 | "else:\n", 420 | " # we are executing in KFP, use the cluster-local address\n", 421 | " url = service['status']['address']['url']\n", 422 | " resp = requests.post(url=url, json=model_data)\n", 423 | "\n", 424 | "resp.json()" 425 | ] 426 | }, 427 | { 428 | "cell_type": "markdown", 429 | "metadata": { 430 | "papermill": { 431 | "duration": 
null, 432 | "end_time": null, 433 | "exception": null, 434 | "start_time": null, 435 | "status": "pending" 436 | }, 437 | "tags": [] 438 | }, 439 | "source": [ 440 | "### Delete the model service\n", 441 | "\n", 442 | "Once we are done, we clean up the service." 443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "execution_count": null, 448 | "metadata": { 449 | "papermill": { 450 | "duration": null, 451 | "end_time": null, 452 | "exception": null, 453 | "start_time": null, 454 | "status": "pending" 455 | }, 456 | "tags": [] 457 | }, 458 | "outputs": [], 459 | "source": [ 460 | "KFServing.delete(model_name, namespace=namespace)" 461 | ] 462 | }, 463 | { 464 | "cell_type": "markdown", 465 | "metadata": { 466 | "papermill": { 467 | "duration": null, 468 | "end_time": null, 469 | "exception": null, 470 | "start_time": null, 471 | "status": "pending" 472 | }, 473 | "tags": [] 474 | }, 475 | "source": [ 476 | "### Authors\n", 477 | "This notebook was created by the [Center for Open-Source Data & AI Technologies](http://codait.org).\n", 478 | "\n", 479 | "Copyright © 2019 IBM. This notebook and its source code are released under the terms of the MIT License." 480 | ] 481 | } 482 | ], 483 | "metadata": { 484 | "kernelspec": { 485 | "display_name": "Python 3", 486 | "language": "python", 487 | "name": "python3" 488 | }, 489 | "language_info": { 490 | "codemirror_mode": { 491 | "name": "ipython", 492 | "version": 3 493 | }, 494 | "file_extension": ".py", 495 | "mimetype": "text/x-python", 496 | "name": "python", 497 | "nbconvert_exporter": "python", 498 | "pygments_lexer": "ipython3", 499 | "version": "3.7.9" 500 | }, 501 | "papermill": { 502 | "duration": 158.620592, 503 | "end_time": "2020-11-18T17:00:22.439093", 504 | "environment_variables": {}, 505 | "exception": true, 506 | "input_path": "/Users/nick/workspace/python/flight-delay-notebooks/notebooks/deploy_model.ipynb", 507 | "output_path": "/Users/nick/workspace/python/flight-delay-notebooks/notebooks/deploy_model.ipynb", 508 | "parameters": {}, 509 | "start_time": "2020-11-18T16:57:43.818501", 510 | "version": "2.1.1" 511 | }, 512 | "toc-autonumbering": false, 513 | "toc-showcode": false, 514 | "toc-showmarkdowntxt": false, 515 | "toc-showtags": false 516 | }, 517 | "nbformat": 4, 518 | "nbformat_minor": 4 519 | } 520 | -------------------------------------------------------------------------------- /notebooks/load_data.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 IBM Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | import os 17 | from pathlib import Path 18 | import requests 19 | import sys 20 | import tarfile 21 | from urllib.parse import urlparse 22 | 23 | 24 | def download_from_public_url(url): 25 | 26 | data_dir_name = 'data' 27 | 28 | print('Downloading data file {} ...'.format(url)) 29 | r = requests.get(url) 30 | if r.status_code != 200: 31 | raise RuntimeError('Could not fetch {}: HTTP status code {}'.format(url, r.status_code)) 32 | else: 33 | # extract data set file name from URL 34 | data_file_name = Path((urlparse(url).path)).name 35 | # create the directory where the downloaded file will be stored 36 | data_dir = Path(data_dir_name) 37 | data_dir.mkdir(parents=True, exist_ok=True) 38 | downloaded_data_file = data_dir / data_file_name 39 | 40 | print('Saving downloaded file "{}" as ...'.format(data_file_name)) 41 | with open(downloaded_data_file, 'wb') as downloaded_file: 42 | downloaded_file.write(r.content) 43 | 44 | if r.headers['content-type'] in ['application/x-tar', 'application/x-gzip']: 45 | print('Extracting downloaded file in directory "{}" ...'.format(data_dir)) 46 | with tarfile.open(downloaded_data_file, 'r') as tar: 47 | tar.extractall(data_dir) 48 | print('Removing downloaded file ...') 49 | downloaded_data_file.unlink() 50 | 51 | if __name__ == "__main__": 52 | 53 | # This script downloads a compressed data set archive from a public location 54 | # e.g. http://server/path/to/archive and extracts it. 55 | # The archive location can be specified using the DATASET_URL environment variable 56 | # DATASET_URL=http://server/path/to/archive. 57 | 58 | # initialize download URL from environment variable 59 | dataset_url = os.environ.get('DATASET_URL') 60 | 61 | # No data set URL was provided. 62 | if dataset_url is None: 63 | raise RuntimeError('Cannot run script. A data set URL must be provided as input.') 64 | 65 | # Try to process the URL 66 | download_from_public_url(dataset_url) -------------------------------------------------------------------------------- /notebooks/merge_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "papermill": { 7 | "duration": 0.025086, 8 | "end_time": "2020-11-18T10:42:42.222104", 9 | "exception": false, 10 | "start_time": "2020-11-18T10:42:42.197018", 11 | "status": "completed" 12 | }, 13 | "tags": [] 14 | }, 15 | "source": [ 16 | "# Merging Airline Delay and Weather Datasets\n", 17 | "\n", 18 | "In this notebook, we merge together two data sources in order to create richer features for our flight delay prediction classification problem.\n", 19 | "* selecting the columns we wish to keep for later analysis\n", 20 | "* converting and cleaning data where required\n", 21 | "* handling missing values\n", 22 | "\n", 23 | "#### Import required modules\n", 24 | "\n", 25 | "Import and configure the required modules." 
26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": { 32 | "papermill": { 33 | "duration": 0.039234, 34 | "end_time": "2020-11-18T10:42:42.285141", 35 | "exception": false, 36 | "start_time": "2020-11-18T10:42:42.245907", 37 | "status": "completed" 38 | }, 39 | "tags": [] 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "# !pip install pandas scikit-learn > /dev/null 2>&1" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": { 50 | "papermill": { 51 | "duration": 0.83582, 52 | "end_time": "2020-11-18T10:42:43.144936", 53 | "exception": false, 54 | "start_time": "2020-11-18T10:42:42.309116", 55 | "status": "completed" 56 | }, 57 | "tags": [] 58 | }, 59 | "outputs": [], 60 | "source": [ 61 | "# Define required imports\n", 62 | "import pandas as pd\n", 63 | "# These set pandas max column and row display in the notebook\n", 64 | "pd.set_option('display.max_columns', 50)\n", 65 | "pd.set_option('display.max_rows', 50)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": { 71 | "papermill": { 72 | "duration": 0.011064, 73 | "end_time": "2020-11-18T10:42:43.173214", 74 | "exception": false, 75 | "start_time": "2020-11-18T10:42:43.162150", 76 | "status": "completed" 77 | }, 78 | "tags": [] 79 | }, 80 | "source": [ 81 | "### Read datasets\n", 82 | "\n", 83 | "We start by reading in the processed flight delay and weather datasets" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": { 90 | "papermill": { 91 | "duration": 0.083716, 92 | "end_time": "2020-11-18T10:42:43.270535", 93 | "exception": false, 94 | "start_time": "2020-11-18T10:42:43.186819", 95 | "status": "completed" 96 | }, 97 | "tags": [] 98 | }, 99 | "outputs": [], 100 | "source": [ 101 | "flight_path = 'data/jfk_flight_features.csv'\n", 102 | "flight_data = pd.read_csv(flight_path, parse_dates=['flight_date'])\n", 103 | "flight_data.head()" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": { 110 | "papermill": { 111 | "duration": 0.253188, 112 | "end_time": "2020-11-18T10:42:43.545450", 113 | "exception": false, 114 | "start_time": "2020-11-18T10:42:43.292262", 115 | "status": "completed" 116 | }, 117 | "tags": [] 118 | }, 119 | "outputs": [], 120 | "source": [ 121 | "weather_path = 'data/jfk_weather_features.csv'\n", 122 | "weather_data = pd.read_csv(weather_path, parse_dates=['DATE'])\n", 123 | "weather_data.head()" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": { 129 | "papermill": { 130 | "duration": 0.012112, 131 | "end_time": "2020-11-18T10:42:43.568539", 132 | "exception": false, 133 | "start_time": "2020-11-18T10:42:43.556427", 134 | "status": "completed" 135 | }, 136 | "tags": [] 137 | }, 138 | "source": [ 139 | "### Merge datasets\n", 140 | "\n", 141 | "The next step is to merge or join the two datasets, such that for each flight record in the flight delay dataset, we have information about the weather conditions present for that flight. \n", 142 | "\n", 143 | "**Note** we have to be careful not to effectively \"leak\" information. Recall that our weather observations come from automated weather station reports that are generated on the 51st minute of each hour. 
We must ensure that the weather report used for flight delay prediction is one covering weather conditions present _before_ the flight departure, otherwise we would be giving our model a glimpse in the the future!\n", 144 | "\n", 145 | "This makes joining the datasets a little tricky. One simple approach is to join the record for a given flight day and hour, with the weather reading for the same day but the _previous hour_. We can do this by extracting 2 \"join keys\" from each dataset: the first for the `date` and the second for the `hour` of the record. If we set the `hour` join key for the flight to the hour _before_ the actual hour of the flight scheduled departure, then we ensure the corresponding weather report comes from the hour before the flight would depart." 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": { 152 | "papermill": { 153 | "duration": 0.106487, 154 | "end_time": "2020-11-18T10:42:43.699995", 155 | "exception": false, 156 | "start_time": "2020-11-18T10:42:43.593508", 157 | "status": "completed" 158 | }, 159 | "tags": [] 160 | }, 161 | "outputs": [], 162 | "source": [ 163 | "flight_data.loc[:, 'hour_key'] = pd.to_datetime(flight_data['sched_dep_time'], format='%H%M', errors='ignore').dt.hour - 1\n", 164 | "flight_data.loc[:, 'date_key'] = flight_data['flight_date'].dt.date\n", 165 | "flight_data.head()" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": { 172 | "papermill": { 173 | "duration": 0.117894, 174 | "end_time": "2020-11-18T10:42:43.842588", 175 | "exception": false, 176 | "start_time": "2020-11-18T10:42:43.724694", 177 | "status": "completed" 178 | }, 179 | "tags": [] 180 | }, 181 | "outputs": [], 182 | "source": [ 183 | "weather_data.loc[:, 'date_key'] = weather_data['DATE'].dt.date\n", 184 | "weather_data.loc[:, 'hour_key'] = weather_data['DATE'].dt.hour\n", 185 | "weather_data.head()" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": { 191 | "papermill": { 192 | "duration": 0.013882, 193 | "end_time": "2020-11-18T10:42:43.873685", 194 | "exception": false, 195 | "start_time": "2020-11-18T10:42:43.859803", 196 | "status": "completed" 197 | }, 198 | "tags": [] 199 | }, 200 | "source": [ 201 | "Next, we join the datasets together based on the \"join keys\" we have created:" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": { 208 | "papermill": { 209 | "duration": 0.104916, 210 | "end_time": "2020-11-18T10:42:43.991593", 211 | "exception": false, 212 | "start_time": "2020-11-18T10:42:43.886677", 213 | "status": "completed" 214 | }, 215 | "tags": [] 216 | }, 217 | "outputs": [], 218 | "source": [ 219 | "flight_weather_data = flight_data.merge(weather_data, how='inner', on=['date_key', 'hour_key'])\n", 220 | "flight_weather_data.head()" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": { 226 | "papermill": { 227 | "duration": 0.016437, 228 | "end_time": "2020-11-18T10:42:44.022539", 229 | "exception": false, 230 | "start_time": "2020-11-18T10:42:44.006102", 231 | "status": "completed" 232 | }, 233 | "tags": [] 234 | }, 235 | "source": [ 236 | "For the first record in our flight dataset, we can see that the flight departs at 15:25. The corresponding weather report is timestamped at 14:51.\n", 237 | "\n", 238 | "**Note** all we guarantee here is that the weather report is _within_ 1 hour before the flight departure, not _precisely 1 hour before_. 
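A more precise alternative, sketched below but not executed in this notebook, is `pandas.merge_asof`, which pairs each flight with the most recent weather report strictly before its scheduled departure. It assumes `sched_dep_time` holds HHMM departure times (as parsed elsewhere in this notebook) and uses the column names already present in `flight_data` and `weather_data`:

```python
# Sketch only (not run here): an as-of join that matches each flight with the
# latest weather report strictly before its scheduled departure time.
dep_time = pd.to_datetime(flight_data['sched_dep_time'], format='%H%M')
dep_ts = (flight_data['flight_date']
          + pd.to_timedelta(dep_time.dt.hour, unit='h')
          + pd.to_timedelta(dep_time.dt.minute, unit='m'))

# merge_asof requires both frames to be sorted on their join keys
flights_sorted = flight_data.assign(dep_ts=dep_ts).sort_values('dep_ts')
weather_sorted = weather_data.sort_values('DATE')

# direction='backward' with allow_exact_matches=False picks the most recent
# report *before* departure, never one from the same instant or later
flight_weather_asof = pd.merge_asof(
    flights_sorted, weather_sorted,
    left_on='dep_ts', right_on='DATE',
    direction='backward', allow_exact_matches=False)
```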
" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": { 244 | "papermill": { 245 | "duration": 0.015937, 246 | "end_time": "2020-11-18T10:42:44.052584", 247 | "exception": false, 248 | "start_time": "2020-11-18T10:42:44.036647", 249 | "status": "completed" 250 | }, 251 | "tags": [] 252 | }, 253 | "source": [ 254 | "### Save the Merged Data\n", 255 | "\n", 256 | "Finally, we save the merged dataset for use by downstream tasks." 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "metadata": { 263 | "papermill": { 264 | "duration": 0.367411, 265 | "end_time": "2020-11-18T10:42:44.439419", 266 | "exception": false, 267 | "start_time": "2020-11-18T10:42:44.072008", 268 | "status": "completed" 269 | }, 270 | "tags": [] 271 | }, 272 | "outputs": [], 273 | "source": [ 274 | "flight_weather_data.to_csv('data/jfk_flight_weather_features.csv', index=False, float_format='%g')" 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "metadata": { 280 | "papermill": { 281 | "duration": 0.013914, 282 | "end_time": "2020-11-18T10:42:44.470581", 283 | "exception": false, 284 | "start_time": "2020-11-18T10:42:44.456667", 285 | "status": "completed" 286 | }, 287 | "tags": [] 288 | }, 289 | "source": [ 290 | " \n", 291 | "### Authors" 292 | ] 293 | }, 294 | { 295 | "cell_type": "markdown", 296 | "metadata": { 297 | "papermill": { 298 | "duration": 0.029833, 299 | "end_time": "2020-11-18T10:42:44.522443", 300 | "exception": false, 301 | "start_time": "2020-11-18T10:42:44.492610", 302 | "status": "completed" 303 | }, 304 | "tags": [] 305 | }, 306 | "source": [ 307 | "This notebook was created by the [Center for Open-Source Data & AI Technologies](http://codait.org).\n", 308 | "\n", 309 | "Copyright © 2019 IBM. This notebook and its source code are released under the terms of the MIT License." 
310 | ] 311 | } 312 | ], 313 | "metadata": { 314 | "kernelspec": { 315 | "display_name": "Python 3", 316 | "language": "python", 317 | "name": "python3" 318 | }, 319 | "language_info": { 320 | "codemirror_mode": { 321 | "name": "ipython", 322 | "version": 3 323 | }, 324 | "file_extension": ".py", 325 | "mimetype": "text/x-python", 326 | "name": "python", 327 | "nbconvert_exporter": "python", 328 | "pygments_lexer": "ipython3", 329 | "version": "3.7.9" 330 | }, 331 | "papermill": { 332 | "duration": 5.066455, 333 | "end_time": "2020-11-18T10:42:45.841585", 334 | "environment_variables": {}, 335 | "exception": null, 336 | "input_path": "/Users/nick/workspace/python/flight-delay-notebooks/notebooks/merge_data.ipynb", 337 | "output_path": "/Users/nick/workspace/python/flight-delay-notebooks/notebooks/merge_data.ipynb", 338 | "parameters": {}, 339 | "start_time": "2020-11-18T10:42:40.775130", 340 | "version": "2.1.1" 341 | }, 342 | "toc-autonumbering": false, 343 | "toc-showcode": false, 344 | "toc-showmarkdowntxt": false, 345 | "toc-showtags": false 346 | }, 347 | "nbformat": 4, 348 | "nbformat_minor": 4 349 | } 350 | -------------------------------------------------------------------------------- /notebooks/predict_flight_delays.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "papermill": { 7 | "duration": 0.026827, 8 | "end_time": "2020-11-18T10:43:10.843474", 9 | "exception": false, 10 | "start_time": "2020-11-18T10:43:10.816647", 11 | "status": "completed" 12 | }, 13 | "tags": [] 14 | }, 15 | "source": [ 16 | "# Predicting Flight Delays\n", 17 | "\n", 18 | "In this notebook, we use the combined flight delay and weather data we have created to create and evaluate models to predict flight delays.\n", 19 | "\n", 20 | "**Note** the full flight delay dataset is very large (over 80GB uncompressed), so we are working with a smaller sample dataset. Hence our results may not be a true reflection of the results on the full dataset.\n", 21 | "\n", 22 | "#### Import required modules\n", 23 | "\n", 24 | "Import and configure the required modules." 
25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": { 31 | "papermill": { 32 | "duration": 3.113441, 33 | "end_time": "2020-11-18T10:43:13.981561", 34 | "exception": false, 35 | "start_time": "2020-11-18T10:43:10.868120", 36 | "status": "completed" 37 | }, 38 | "tags": [] 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "!pip install seaborn scikit-learn > /dev/null 2>&1" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": { 49 | "papermill": { 50 | "duration": 1.755517, 51 | "end_time": "2020-11-18T10:43:15.772084", 52 | "exception": false, 53 | "start_time": "2020-11-18T10:43:14.016567", 54 | "status": "completed" 55 | }, 56 | "tags": [] 57 | }, 58 | "outputs": [], 59 | "source": [ 60 | "# Define required imports\n", 61 | "import json\n", 62 | "import pandas as pd\n", 63 | "import numpy as np\n", 64 | "import seaborn as sns\n", 65 | "import matplotlib.pyplot as plt\n", 66 | "sns.set_theme(style='darkgrid', palette='deep')\n", 67 | "# These set pandas max column and row display in the notebook\n", 68 | "pd.set_option('display.max_columns', 50)\n", 69 | "pd.set_option('display.max_rows', 50)" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": { 76 | "papermill": { 77 | "duration": 0.053132, 78 | "end_time": "2020-11-18T10:43:15.854310", 79 | "exception": false, 80 | "start_time": "2020-11-18T10:43:15.801178", 81 | "status": "completed" 82 | }, 83 | "tags": [] 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "MODEL_EXPORT_FOLDER = 'models'\n", 88 | "from pathlib import Path\n", 89 | "export_path = Path(MODEL_EXPORT_FOLDER)\n", 90 | "export_path.mkdir(parents=True, exist_ok=True)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": { 96 | "papermill": { 97 | "duration": 0.027723, 98 | "end_time": "2020-11-18T10:43:15.907403", 99 | "exception": false, 100 | "start_time": "2020-11-18T10:43:15.879680", 101 | "status": "completed" 102 | }, 103 | "tags": [] 104 | }, 105 | "source": [ 106 | "### Read the data\n", 107 | "\n", 108 | "We start by reading in the merged flight delay and weather data" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": { 115 | "papermill": { 116 | "duration": 0.128993, 117 | "end_time": "2020-11-18T10:43:16.065704", 118 | "exception": false, 119 | "start_time": "2020-11-18T10:43:15.936711", 120 | "status": "completed" 121 | }, 122 | "tags": [] 123 | }, 124 | "outputs": [], 125 | "source": [ 126 | "flight_path = 'data/jfk_flight_weather_features.csv'\n", 127 | "flight_data = pd.read_csv(flight_path, parse_dates=['flight_date'])\n", 128 | "flight_data.head()" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": { 135 | "papermill": { 136 | "duration": 0.054146, 137 | "end_time": "2020-11-18T10:43:16.154043", 138 | "exception": false, 139 | "start_time": "2020-11-18T10:43:16.099897", 140 | "status": "completed" 141 | }, 142 | "tags": [] 143 | }, 144 | "outputs": [], 145 | "source": [ 146 | "flight_data['dest'].value_counts().tail(10)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": { 153 | "papermill": { 154 | "duration": 0.097234, 155 | "end_time": "2020-11-18T10:43:16.283027", 156 | "exception": false, 157 | "start_time": "2020-11-18T10:43:16.185793", 158 | "status": "completed" 159 | }, 160 | "tags": [] 161 | }, 162 | "outputs": [], 163 | "source": [ 164 | 
"flight_data['dest'].value_counts().tail(10)\n", 165 | "dest_to_drop = ['MKE', 'HYA', 'ALB', 'PSP', 'BDL', 'TUS', 'DAB', 'BHM']\n", 166 | "flight_data[flight_data['dest'].isin(dest_to_drop)]\n", 167 | "flight_data.drop(flight_data[flight_data['dest'].isin(dest_to_drop)].index, inplace=True)\n", 168 | "flight_data" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": { 174 | "papermill": { 175 | "duration": 0.039101, 176 | "end_time": "2020-11-18T10:43:16.360160", 177 | "exception": false, 178 | "start_time": "2020-11-18T10:43:16.321059", 179 | "status": "completed" 180 | }, 181 | "tags": [] 182 | }, 183 | "source": [ 184 | "### Create train / test data split\n", 185 | "\n", 186 | "The first step in building our models is to split the dataset into training and test sets. We use a portion of the data for training, and another portion of data for our test sets.\n", 187 | "\n", 188 | "If we instead trained a model on the full dataset, the model would learn to be very good at making predictions on that particular dataset, essentially just copying the answers it knows. However, when presented with data the model has not seen , it would perform poorly since it has not learned how to generalize its answers.\n", 189 | "\n", 190 | "By training on a portion of the dataset and testing the model's performance on another portion of the dataset (which data the model has not seen in training), we try to avoid our models \"over-fitting\" the dataset and make them better at prediction when given unseen, future data. This process of splitting the dataset and evaluating a model's performance on \"held-out\" datasets is commonly known as _cross-validation_.\n", 191 | "\n", 192 | "By default here we use 80% of the data for the training set and 20% for the test set.\n", 193 | "\n", 194 | "**Note** for simplicity here we perform a random split. Technically, we have some time-dependent information leakage, since for earlier records, the model can use data from the future in training. In reality, a model at that point in time would not have information about the future available for training. For a better evaluation of the model performance on fully unseen, new data, the test set should be generated from _future_ data occurring after the time window in the training set." 
195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": { 201 | "papermill": { 202 | "duration": 0.448639, 203 | "end_time": "2020-11-18T10:43:16.841145", 204 | "exception": false, 205 | "start_time": "2020-11-18T10:43:16.392506", 206 | "status": "completed" 207 | }, 208 | "tags": [] 209 | }, 210 | "outputs": [], 211 | "source": [ 212 | "from sklearn.model_selection import train_test_split\n", 213 | "\n", 214 | "# Split the dataset into 80% training and 20% test sets, stratified by the 'delayed' field\n", 215 | "df_train, df_test = train_test_split(\n", 216 | " flight_data, train_size=0.8, random_state=24, stratify=flight_data[['delayed']])" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "metadata": { 223 | "papermill": { 224 | "duration": 0.034849, 225 | "end_time": "2020-11-18T10:43:16.901670", 226 | "exception": false, 227 | "start_time": "2020-11-18T10:43:16.866821", 228 | "status": "completed" 229 | }, 230 | "tags": [] 231 | }, 232 | "outputs": [], 233 | "source": [ 234 | "# specify the target variable\n", 235 | "y_train = df_train['delayed'].values\n", 236 | "y_test = df_test['delayed'].values\n", 237 | "print('Training set: {} rows'.format(len(df_train)))\n", 238 | "print('Test set: {} rows'.format(len(df_test)))" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": { 244 | "papermill": { 245 | "duration": 0.025412, 246 | "end_time": "2020-11-18T10:43:16.950294", 247 | "exception": false, 248 | "start_time": "2020-11-18T10:43:16.924882", 249 | "status": "completed" 250 | }, 251 | "tags": [] 252 | }, 253 | "source": [ 254 | "### Encode categorical variables\n", 255 | "\n", 256 | "Next, we want to encode the various _categorical_ features we have - such as the flight departure time bucket, airline and airport ids, and so on - into numerical representations. We do this by assigning integer ids to each unique feature value. This is known as ordinal encoding.\n", 257 | "\n", 258 | "Note that certain models (e.g. linear models) will interpret these numerical values as having an ordinal structure. However, for our demonstration purposes we will use tree-based models, which can handle these types of integer ids directly. \n", 259 | "\n", 260 | "For linear models, we would prefer to use one-hot encoding for categorical features." 
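For reference, a minimal sketch of that one-hot alternative (not used in this notebook) could look as follows, assuming the same `df_train_cat` / `df_test_cat` frames that the next cell builds from `cat_columns`:

```python
# Sketch only: one-hot encoding of the same categorical columns, as a linear
# model would prefer. Assumes df_train_cat / df_test_cat from the next cell.
from sklearn.preprocessing import OneHotEncoder

oh_enc = OneHotEncoder(handle_unknown='ignore', sparse=False)
X_train_oh = oh_enc.fit_transform(df_train_cat)  # fit on training data only
X_test_oh = oh_enc.transform(df_test_cat)        # reuse the fitted encoder on the test set

print('One-hot training features:', X_train_oh.shape)
```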
261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "metadata": { 267 | "papermill": { 268 | "duration": 0.072311, 269 | "end_time": "2020-11-18T10:43:17.051886", 270 | "exception": false, 271 | "start_time": "2020-11-18T10:43:16.979575", 272 | "status": "completed" 273 | }, 274 | "tags": [] 275 | }, 276 | "outputs": [], 277 | "source": [ 278 | "from sklearn.preprocessing import OrdinalEncoder\n", 279 | "\n", 280 | "# specify columns for raw categorical features\n", 281 | "cat_columns = [\n", 282 | " 'month',\n", 283 | " 'day_of_month',\n", 284 | " 'day_of_week',\n", 285 | " 'airline_name',\n", 286 | " 'dest',\n", 287 | " 'dep_time_bin',\n", 288 | " 'distance_bin'\n", 289 | "]\n", 290 | "\n", 291 | "# extract categorical data columns for training set\n", 292 | "df_train_cat = df_train[cat_columns]\n", 293 | "# extract categorical data columns for test set\n", 294 | "df_test_cat = df_test[cat_columns]\n", 295 | "\n", 296 | "ord_enc = OrdinalEncoder()\n", 297 | "# fit and encode training features\n", 298 | "X_train_cat = ord_enc.fit_transform(df_train_cat)\n", 299 | "# encode test features\n", 300 | "X_test_cat = ord_enc.transform(df_test_cat)\n", 301 | "\n", 302 | "print('Training set categorical features: {} rows, {} features' .format(X_train_cat.shape[0], X_train_cat.shape[1]))\n", 303 | "print('Test set categorical features: {} rows, {} features' .format(X_test_cat.shape[0], X_test_cat.shape[1]))" 304 | ] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "metadata": { 309 | "papermill": { 310 | "duration": 0.044356, 311 | "end_time": "2020-11-18T10:43:17.144311", 312 | "exception": false, 313 | "start_time": "2020-11-18T10:43:17.099955", 314 | "status": "completed" 315 | }, 316 | "tags": [] 317 | }, 318 | "source": [ 319 | "### Encode numerical variables\n", 320 | "\n", 321 | "The next step is to encode numerical features. Depending on the models used, it can be very important to scale / normalize numerical features - such as `wind_speed` or `precip`. Again, linear models and neural networks are a good example of this. In our case we will use tree-based models, which again do not require feature scaling, hence we can use these numerical features directly without pre-processing. \n", 322 | "\n", 323 | "**Note** that the weather type features are also categorical. However, we have already encoded these as binary values in our pre-processing step, hence we can now treat these features as numerical." 
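If a linear model or neural network were used instead, the numerical columns would typically be standardized first. A minimal sketch (not executed here), assuming the `X_train_num` / `X_test_num` arrays built in the next cell:

```python
# Sketch only: the scaling step a linear model or neural network would need.
# Tree-based models are insensitive to feature scale, so it is skipped here.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_num_scaled = scaler.fit_transform(X_train_num)  # learn mean/std from training data only
X_test_num_scaled = scaler.transform(X_test_num)        # apply the same transform to the test set
```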
324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": null, 329 | "metadata": { 330 | "papermill": { 331 | "duration": 0.048752, 332 | "end_time": "2020-11-18T10:43:17.225546", 333 | "exception": false, 334 | "start_time": "2020-11-18T10:43:17.176794", 335 | "status": "completed" 336 | }, 337 | "tags": [] 338 | }, 339 | "outputs": [], 340 | "source": [ 341 | "num_columns = [\n", 342 | " 'visibility',\n", 343 | " 'wind_speed',\n", 344 | " 'wind_gust_speed',\n", 345 | " 'precip',\n", 346 | " 'rain',\n", 347 | " 'ice_pellets',\n", 348 | " 'mist',\n", 349 | " 'snow',\n", 350 | " 'drizzle',\n", 351 | " 'haze',\n", 352 | " 'fog',\n", 353 | " 'thunderstorm',\n", 354 | " 'smoke',\n", 355 | " 'unknown_precipitation'\n", 356 | "]\n", 357 | "\n", 358 | "# extract numerical data columns for training set\n", 359 | "X_train_num = df_train[num_columns].values\n", 360 | "# extract numerical data columns for validation set\n", 361 | "X_test_num = df_test[num_columns].values\n", 362 | "\n", 363 | "print('Training set numerical features: {} rows, {} features' .format(X_train_num.shape[0], X_train_num.shape[1]))\n", 364 | "print('Test set numerical features: {} rows, {} features' .format(X_test_num.shape[0], X_test_num.shape[1]))" 365 | ] 366 | }, 367 | { 368 | "cell_type": "markdown", 369 | "metadata": { 370 | "papermill": { 371 | "duration": 0.098643, 372 | "end_time": "2020-11-18T10:43:17.377832", 373 | "exception": false, 374 | "start_time": "2020-11-18T10:43:17.279189", 375 | "status": "completed" 376 | }, 377 | "tags": [] 378 | }, 379 | "source": [ 380 | "#### Combine categorical and numerical features\n", 381 | "\n", 382 | "We can now combine the two sets of features by concatenating them (\"horizontally stacking\"):" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": null, 388 | "metadata": { 389 | "papermill": { 390 | "duration": 0.04915, 391 | "end_time": "2020-11-18T10:43:17.480635", 392 | "exception": false, 393 | "start_time": "2020-11-18T10:43:17.431485", 394 | "status": "completed" 395 | }, 396 | "tags": [] 397 | }, 398 | "outputs": [], 399 | "source": [ 400 | "X_train = np.hstack((X_train_cat, X_train_num))\n", 401 | "X_test = np.hstack((X_test_cat, X_test_num))\n", 402 | "print('Training set all features: {} rows, {} features' .format(X_train.shape[0], X_train.shape[1]))\n", 403 | "print('Test set all features: {} rows, {} features' .format(X_test.shape[0], X_test.shape[1]))" 404 | ] 405 | }, 406 | { 407 | "cell_type": "markdown", 408 | "metadata": { 409 | "papermill": { 410 | "duration": 0.026992, 411 | "end_time": "2020-11-18T10:43:17.537965", 412 | "exception": false, 413 | "start_time": "2020-11-18T10:43:17.510973", 414 | "status": "completed" 415 | }, 416 | "tags": [] 417 | }, 418 | "source": [ 419 | "### Train and evaluate models\n", 420 | "\n", 421 | "Now that we have pre-processed all our features into numerical representations, we can pass them to our machine learning models.\n", 422 | "\n", 423 | "For simplicity, we will evalute 3 tree-based models: a single decision tree; a random forest and a gradient-boosting tree (both of these are \"ensemble\" models made up of many smaller sub-models, typicaly themselves single decision trees).\n", 424 | "\n", 425 | "Tree ensemble models are very flexible and powerful, and typically perform well \"out the box\" in particular on tabular datasets such as we have here. As we have seen, they also require less feature pre-processing and engineering in general than, for example, linear models." 
426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": null, 431 | "metadata": { 432 | "papermill": { 433 | "duration": 0.16584, 434 | "end_time": "2020-11-18T10:43:17.729971", 435 | "exception": false, 436 | "start_time": "2020-11-18T10:43:17.564131", 437 | "status": "completed" 438 | }, 439 | "tags": [] 440 | }, 441 | "outputs": [], 442 | "source": [ 443 | "from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n", 444 | "from sklearn.tree import DecisionTreeClassifier\n", 445 | "from sklearn.model_selection import cross_val_score, cross_validate\n", 446 | " \n", 447 | "dt = DecisionTreeClassifier()\n", 448 | "rf = RandomForestClassifier()\n", 449 | "gb = GradientBoostingClassifier()" 450 | ] 451 | }, 452 | { 453 | "cell_type": "markdown", 454 | "metadata": { 455 | "papermill": { 456 | "duration": 0.044233, 457 | "end_time": "2020-11-18T10:43:17.806791", 458 | "exception": false, 459 | "start_time": "2020-11-18T10:43:17.762558", 460 | "status": "completed" 461 | }, 462 | "tags": [] 463 | }, 464 | "source": [ 465 | "We have split out dataset into a training and test set. However, the test set itself should never be directly used in model training, but only to perform a final model evaluation. This gives an estimate on how the model might perform in the \"real world\".\n", 466 | "\n", 467 | "We would still like to perform model selection, which means we need to evaluate our models using the training set in some way. To avoid over-fitting on the training set, as well as to give a good estimate on how the model may perform on our test set, we will use K-fold cross-validation on our training set.\n", 468 | "\n", 469 | "This splits the dataset into `k` (in our case `5`) non-overlapping subsets (`folds`). In turn, the model is trained on 4 of these (80% of training data) and evaluated on 1 (20% of training data). This is repeated `k` times and the evaluation scores are averaged across each of the `k` runs. This averaged metric typically gives a fairly good indication of how the model performs on unseen data.\n", 470 | "\n", 471 | "`scikit-learn` provides us this functionality, built-in and easy to use!\n", 472 | "\n", 473 | "**Note** As we see in the analysis notebook, we are dealing with some degree of class imbalance - on-time flights are far more prevelant compared to delayed flights (80% / 20% split). So, we need to be cautious when evaluting the performance of such models. For example, if we use `accuracy` as a metric, then a simple rule that classifies all flights as `on-time` would achieve 80% accuracy, which sounds very good! However, the model is completely unable to actually predict whether a flight will be delayed, so is useless for any real-world application.\n", 474 | "\n", 475 | "A common metric used for binary classification is the area under the ROC curve (`roc_auc`). However, this metric can sometimes provide an unclear picture for imbalanced classes.\n", 476 | "\n", 477 | "There are a few metrics that try to alleviate this problem for binary classification problems. We will be using `F1 score` as our metric for selecting the model to use, since it can handle the class imbalance problem. _Note_ that the selection of metric also depends on the particular use case." 
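To make that comparison concrete, both metrics can be collected in a single pass with `cross_validate` (already imported above). This is only a sketch using the estimators and training arrays defined earlier, not part of the model selection below:

```python
# Sketch only: score each candidate model on both 'f1' and 'roc_auc' in one
# cross-validation pass, to see how the choice of metric affects the ranking.
for name, model in [('DecisionTree', dt), ('RandomForest', rf), ('GradientBoosting', gb)]:
    cv_results = cross_validate(model, X_train, y_train, cv=5, scoring=['f1', 'roc_auc'])
    print('{}: f1={:.3f}, roc_auc={:.3f}'.format(
        name, np.mean(cv_results['test_f1']), np.mean(cv_results['test_roc_auc'])))
```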
478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": null, 483 | "metadata": { 484 | "papermill": { 485 | "duration": 8.689184, 486 | "end_time": "2020-11-18T10:43:26.538189", 487 | "exception": false, 488 | "start_time": "2020-11-18T10:43:17.849005", 489 | "status": "completed" 490 | }, 491 | "tags": [] 492 | }, 493 | "outputs": [], 494 | "source": [ 495 | "metric = 'f1'\n", 496 | "scores = cross_val_score(dt, X_train, y_train, cv=5, scoring=metric)\n", 497 | "dt_score = np.mean(scores)\n", 498 | "\n", 499 | "scores = cross_val_score(rf, X_train, y_train, cv=5, scoring=metric)\n", 500 | "rf_score = np.mean(scores)\n", 501 | "\n", 502 | "scores = cross_val_score(gb, X_train, y_train, cv=5, scoring=metric)\n", 503 | "gb_score = np.mean(scores)" 504 | ] 505 | }, 506 | { 507 | "cell_type": "code", 508 | "execution_count": null, 509 | "metadata": { 510 | "papermill": { 511 | "duration": 0.286596, 512 | "end_time": "2020-11-18T10:43:26.858782", 513 | "exception": false, 514 | "start_time": "2020-11-18T10:43:26.572186", 515 | "status": "completed" 516 | }, 517 | "tags": [] 518 | }, 519 | "outputs": [], 520 | "source": [ 521 | "cv_scores = [dt_score, rf_score, gb_score]\n", 522 | "plt.figure(figsize=(16, 6))\n", 523 | "sns.barplot(x=['DecisionTreeClassifier', 'RandomForestClassifier', 'GradientBoostingClassifier'], y=cv_scores)\n", 524 | "plt.show()\n", 525 | "\n", 526 | "print('Average {} for DecisionTreeClassifier: {}'.format(metric, dt_score))\n", 527 | "print('Average {} for RandomForestClassifier: {}'.format(metric, rf_score))\n", 528 | "print('Average {} for GradientBoostingClassifier: {}'.format(metric, gb_score))" 529 | ] 530 | }, 531 | { 532 | "cell_type": "markdown", 533 | "metadata": { 534 | "papermill": { 535 | "duration": 0.050985, 536 | "end_time": "2020-11-18T10:43:26.952118", 537 | "exception": false, 538 | "start_time": "2020-11-18T10:43:26.901133", 539 | "status": "completed" 540 | }, 541 | "tags": [] 542 | }, 543 | "source": [ 544 | "Based on this, we will select the `DecisionTreeClassifier`.\n", 545 | "\n", 546 | "**Note** based on the `auc_roc` metric, we would have selected the `GradientBoostingClassifier` - try it out in the cells above to see and then compare the model performance later on.\n", 547 | "\n", 548 | "We can also evaluate the impact of adding our weather features on model performance:" 549 | ] 550 | }, 551 | { 552 | "cell_type": "code", 553 | "execution_count": null, 554 | "metadata": { 555 | "papermill": { 556 | "duration": 0.490133, 557 | "end_time": "2020-11-18T10:43:27.499197", 558 | "exception": false, 559 | "start_time": "2020-11-18T10:43:27.009064", 560 | "status": "completed" 561 | }, 562 | "tags": [] 563 | }, 564 | "outputs": [], 565 | "source": [ 566 | "scores = cross_val_score(dt, X_train_cat, y_train, cv=5, scoring=metric)\n", 567 | "cat_score = np.mean(scores)\n", 568 | "\n", 569 | "scores = cross_val_score(dt, X_train_num, y_train, cv=5, scoring=metric)\n", 570 | "num_score = np.mean(scores)\n", 571 | "\n", 572 | "scores = cross_val_score(dt, X_train, y_train, cv=5, scoring=metric)\n", 573 | "all_score = np.mean(scores)" 574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": null, 579 | "metadata": { 580 | "papermill": { 581 | "duration": 0.270508, 582 | "end_time": "2020-11-18T10:43:27.797814", 583 | "exception": false, 584 | "start_time": "2020-11-18T10:43:27.527306", 585 | "status": "completed" 586 | }, 587 | "tags": [] 588 | }, 589 | "outputs": [], 590 | "source": [ 591 | "cv_scores = [cat_score, 
num_score, all_score]\n", 592 | "plt.figure(figsize=(16, 6))\n", 593 | "sns.barplot(x=['Flight features', 'Weather features', 'Flight + Weather features'], y=cv_scores)\n", 594 | "plt.show()\n", 595 | "\n", 596 | "print('Average {} for only flight delay features: {}'.format(metric, cat_score))\n", 597 | "print('Average {} for only weather features: {}'.format(metric, num_score))\n", 598 | "print('Average {} for all features: {}'.format(metric, all_score))" 599 | ] 600 | }, 601 | { 602 | "cell_type": "markdown", 603 | "metadata": { 604 | "papermill": { 605 | "duration": 0.03945, 606 | "end_time": "2020-11-18T10:43:27.867100", 607 | "exception": false, 608 | "start_time": "2020-11-18T10:43:27.827650", 609 | "status": "completed" 610 | }, 611 | "tags": [] 612 | }, 613 | "source": [ 614 | "We see that using only weather features does little better than random guessing, while adding weather features to the flight features increases our metric by around `0.01`. This is not a very large amount, but it does indicate that information about weather helps a little with predictions. In some applications, even small increases in model performance can be significant.\n", 615 | "\n", 616 | "Finally, we re-train the model on the full training dataset and perform a final classification evaluation on the test set." 617 | ] 618 | }, 619 | { 620 | "cell_type": "code", 621 | "execution_count": null, 622 | "metadata": { 623 | "papermill": { 624 | "duration": 0.091014, 625 | "end_time": "2020-11-18T10:43:27.992675", 626 | "exception": false, 627 | "start_time": "2020-11-18T10:43:27.901661", 628 | "status": "completed" 629 | }, 630 | "tags": [] 631 | }, 632 | "outputs": [], 633 | "source": [ 634 | "from sklearn.metrics import confusion_matrix, roc_auc_score, f1_score, classification_report\n", 635 | "from sklearn.metrics import plot_roc_curve, plot_confusion_matrix, plot_precision_recall_curve\n", 636 | "\n", 637 | "# fit on full data\n", 638 | "dt.fit(X_train, y_train)\n", 639 | "y_prob = dt.predict_proba(X_test)[:, 1]\n", 640 | "y_pred = dt.predict(X_test)\n", 641 | "\n", 642 | "f1_test = f1_score(y_test, y_prob)\n", 643 | "roc_auc_test = roc_auc_score(y_test, y_prob)\n", 644 | "print('Final {} for test set: {}'.format(metric, f1_test))" 645 | ] 646 | }, 647 | { 648 | "cell_type": "markdown", 649 | "metadata": { 650 | "papermill": { 651 | "duration": 0.044603, 652 | "end_time": "2020-11-18T10:43:28.076712", 653 | "exception": false, 654 | "start_time": "2020-11-18T10:43:28.032109", 655 | "status": "completed" 656 | }, 657 | "tags": [] 658 | }, 659 | "source": [ 660 | "We export the trained model and a few example rows from the test dataset, for potential use by downstream stages." 
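Once the next cell has written `models/model.joblib`, a quick sanity check (sketched here, not part of the pipeline) is to load the artifact back and confirm it reproduces the in-memory model's predictions:

```python
# Sketch only: verify the exported artifact round-trips correctly.
# Run after the next cell has written models/model.joblib.
from joblib import load

reloaded = load('{}/model.joblib'.format(MODEL_EXPORT_FOLDER))
assert (reloaded.predict(X_test[:10]) == dt.predict(X_test[:10])).all()
print('Exported model reproduces the in-memory predictions.')
```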
661 | ] 662 | }, 663 | { 664 | "cell_type": "code", 665 | "execution_count": null, 666 | "metadata": { 667 | "papermill": { 668 | "duration": 0.048674, 669 | "end_time": "2020-11-18T10:43:28.166514", 670 | "exception": false, 671 | "start_time": "2020-11-18T10:43:28.117840", 672 | "status": "completed" 673 | }, 674 | "tags": [] 675 | }, 676 | "outputs": [], 677 | "source": [ 678 | "# save the model file for downstream tasks\n", 679 | "from joblib import dump\n", 680 | "dump(dt, '{}/model.joblib'.format(MODEL_EXPORT_FOLDER))\n", 681 | "\n", 682 | "# also save a few example rows\n", 683 | "np.save('data/test_rows.npy', X_test[:10])" 684 | ] 685 | }, 686 | { 687 | "cell_type": "code", 688 | "execution_count": null, 689 | "metadata": { 690 | "papermill": { 691 | "duration": 0.054397, 692 | "end_time": "2020-11-18T10:43:28.262088", 693 | "exception": false, 694 | "start_time": "2020-11-18T10:43:28.207691", 695 | "status": "completed" 696 | }, 697 | "tags": [] 698 | }, 699 | "outputs": [], 700 | "source": [ 701 | "# export metrics for KFP\n", 702 | "metrics = {\n", 703 | " 'metrics': [\n", 704 | " {\n", 705 | " 'name': 'f1_score',\n", 706 | " 'numberValue': f1_test,\n", 707 | " 'format': 'RAW'\n", 708 | " },\n", 709 | " {\n", 710 | " 'name': 'roc_auc_score',\n", 711 | " 'numberValue': roc_auc_test,\n", 712 | " 'format': 'RAW' \n", 713 | " }\n", 714 | " ]\n", 715 | " }\n", 716 | "\n", 717 | "with open('mlpipeline-metrics.json', 'w') as f:\n", 718 | " json.dump(metrics, f)" 719 | ] 720 | }, 721 | { 722 | "cell_type": "code", 723 | "execution_count": null, 724 | "metadata": { 725 | "papermill": { 726 | "duration": 0.503831, 727 | "end_time": "2020-11-18T10:43:28.793550", 728 | "exception": false, 729 | "start_time": "2020-11-18T10:43:28.289719", 730 | "status": "completed" 731 | }, 732 | "tags": [] 733 | }, 734 | "outputs": [], 735 | "source": [ 736 | "fig = plt.figure(figsize=(16, 6))\n", 737 | "plt.subplot(121)\n", 738 | "plot_roc_curve(dt, X_test, y_test, ax=fig.gca())\n", 739 | "plt.subplot(122)\n", 740 | "plot_precision_recall_curve(dt, X_test, y_test, ax=fig.gca())\n", 741 | "plt.show()" 742 | ] 743 | }, 744 | { 745 | "cell_type": "code", 746 | "execution_count": null, 747 | "metadata": { 748 | "papermill": { 749 | "duration": 0.047271, 750 | "end_time": "2020-11-18T10:43:28.871750", 751 | "exception": false, 752 | "start_time": "2020-11-18T10:43:28.824479", 753 | "status": "completed" 754 | }, 755 | "tags": [] 756 | }, 757 | "outputs": [], 758 | "source": [ 759 | "print(classification_report(y_test, y_pred, target_names=['On-time', 'Delayed']))" 760 | ] 761 | }, 762 | { 763 | "cell_type": "code", 764 | "execution_count": null, 765 | "metadata": { 766 | "papermill": { 767 | "duration": 0.426407, 768 | "end_time": "2020-11-18T10:43:29.332485", 769 | "exception": false, 770 | "start_time": "2020-11-18T10:43:28.906078", 771 | "status": "completed" 772 | }, 773 | "tags": [] 774 | }, 775 | "outputs": [], 776 | "source": [ 777 | "cm = confusion_matrix(y_test, y_pred)\n", 778 | "class_labels = ['On-time', 'Delayed']\n", 779 | "labels = ['{0:0.0f}'.format(value) for value in\n", 780 | " cm.flatten()]\n", 781 | "labels = np.asarray(labels).reshape(2,2)\n", 782 | "fig = plt.figure(figsize=(12, 8))\n", 783 | "chart = sns.heatmap(\n", 784 | " cm, annot=labels, fmt='', cmap='Blues',\n", 785 | " xticklabels=class_labels, yticklabels=class_labels)\n", 786 | "chart.set_xlabel('Predicted label')\n", 787 | "chart.set_ylabel('True label')\n", 788 | "chart.set_title('Confusion Matrix')\n", 789 | "plt.show()" 790 
| ] 791 | }, 792 | { 793 | "cell_type": "code", 794 | "execution_count": null, 795 | "metadata": { 796 | "papermill": { 797 | "duration": 0.094854, 798 | "end_time": "2020-11-18T10:43:29.491225", 799 | "exception": false, 800 | "start_time": "2020-11-18T10:43:29.396371", 801 | "status": "completed" 802 | }, 803 | "tags": [] 804 | }, 805 | "outputs": [], 806 | "source": [ 807 | "# export confusion matrix for KFP\n", 808 | "cm_data = []\n", 809 | "for target_index, target_row in enumerate(cm):\n", 810 | " for predicted_index, count in enumerate(target_row):\n", 811 | " cm_data.append((class_labels[target_index], class_labels[predicted_index], count))\n", 812 | " \n", 813 | "ui_metadata = {\n", 814 | " 'outputs' : [{\n", 815 | " 'type': 'confusion_matrix',\n", 816 | " 'format': 'csv',\n", 817 | " 'schema': [\n", 818 | " {'name': 'target', 'type': 'CATEGORY'},\n", 819 | " {'name': 'predicted', 'type': 'CATEGORY'},\n", 820 | " {'name': 'count', 'type': 'NUMBER'},\n", 821 | " ],\n", 822 | " 'source': pd.DataFrame(cm_data).to_csv(header=False, index=False),\n", 823 | " 'storage': 'inline',\n", 824 | " 'labels': ['Delayed', 'On-time'],\n", 825 | " }]\n", 826 | "}\n", 827 | "\n", 828 | "with open('mlpipeline-ui-metadata.json', 'w') as f:\n", 829 | " json.dump(ui_metadata, f)" 830 | ] 831 | }, 832 | { 833 | "cell_type": "markdown", 834 | "metadata": { 835 | "papermill": { 836 | "duration": 0.077766, 837 | "end_time": "2020-11-18T10:43:29.636822", 838 | "exception": false, 839 | "start_time": "2020-11-18T10:43:29.559056", 840 | "status": "completed" 841 | }, 842 | "tags": [] 843 | }, 844 | "source": [ 845 | "If we investigate the various classification charts and reports, we can see that our problem of classifying whether a flight will be delayed is a tricky one.\n", 846 | "\n", 847 | "As one might expect, the model predicts most `on-time` flights as `on-time` (80%). However, it struggles to correctly predict `delayed` flights, instead classifying them as `on-time`. In fact it only correctly predicts delays 28% of the time! (this is the `recall` figure for `Delayed` in the classification report table). 
When it predicts a delayed flight, it is correct only 25% of the time (this is the `precision` field).\n", 848 | "\n", 849 | "Overall, we would say that our model is doing a mediocre job of predicting flight delays - we either need to do a lot more model tuning and hyper-parameter selection, or use more data and better features.\n", 850 | "\n", 851 | "Perhaps you can try to find ways to improve the performance!\n", 852 | "\n", 853 | "Finally, we can generate a list of \"feature importances\" to see what the model is focusing on for making predictions:" 854 | ] 855 | }, 856 | { 857 | "cell_type": "code", 858 | "execution_count": null, 859 | "metadata": { 860 | "papermill": { 861 | "duration": 0.679681, 862 | "end_time": "2020-11-18T10:43:30.395646", 863 | "exception": false, 864 | "start_time": "2020-11-18T10:43:29.715965", 865 | "status": "completed" 866 | }, 867 | "tags": [] 868 | }, 869 | "outputs": [], 870 | "source": [ 871 | "feat_names = list(df_train_cat.columns.values) + list(df_train[num_columns].columns.values)\n", 872 | "feat_nb = dt.feature_importances_\n", 873 | "plt.figure(figsize=(16, 8))\n", 874 | "chart = sns.barplot(x=feat_names, y=feat_nb, palette='Blues')\n", 875 | "chart.set_xticklabels(\n", 876 | " chart.get_xticklabels(), \n", 877 | " rotation=45, \n", 878 | " horizontalalignment='right',\n", 879 | " fontweight='light',\n", 880 | " fontsize='large'\n", 881 | ")\n", 882 | "plt.show()" 883 | ] 884 | }, 885 | { 886 | "cell_type": "markdown", 887 | "metadata": { 888 | "papermill": { 889 | "duration": 0.046187, 890 | "end_time": "2020-11-18T10:43:30.490059", 891 | "exception": false, 892 | "start_time": "2020-11-18T10:43:30.443872", 893 | "status": "completed" 894 | }, 895 | "tags": [] 896 | }, 897 | "source": [ 898 | "Of the flight features, the time-based features as well as departure time and destination seem to be most important. For weather features, wind speed and visibility seem to be dominant in importance." 899 | ] 900 | }, 901 | { 902 | "cell_type": "markdown", 903 | "metadata": { 904 | "papermill": { 905 | "duration": 0.048653, 906 | "end_time": "2020-11-18T10:43:30.584400", 907 | "exception": false, 908 | "start_time": "2020-11-18T10:43:30.535747", 909 | "status": "completed" 910 | }, 911 | "tags": [] 912 | }, 913 | "source": [ 914 | "### Authors\n", 915 | "This notebook was created by the [Center for Open-Source Data & AI Technologies](http://codait.org).\n", 916 | "\n", 917 | "Copyright © 2019 IBM. This notebook and its source code are released under the terms of the MIT License." 
918 | ] 919 | } 920 | ], 921 | "metadata": { 922 | "kernelspec": { 923 | "display_name": "Python 3", 924 | "language": "python", 925 | "name": "python3" 926 | }, 927 | "language_info": { 928 | "codemirror_mode": { 929 | "name": "ipython", 930 | "version": 3 931 | }, 932 | "file_extension": ".py", 933 | "mimetype": "text/x-python", 934 | "name": "python", 935 | "nbconvert_exporter": "python", 936 | "pygments_lexer": "ipython3", 937 | "version": "3.7.9" 938 | }, 939 | "papermill": { 940 | "duration": 22.125901, 941 | "end_time": "2020-11-18T10:43:31.907543", 942 | "environment_variables": {}, 943 | "exception": null, 944 | "input_path": "/Users/nick/workspace/python/flight-delay-notebooks/notebooks/predict_flight_delays.ipynb", 945 | "output_path": "/Users/nick/workspace/python/flight-delay-notebooks/notebooks/predict_flight_delays.ipynb", 946 | "parameters": {}, 947 | "start_time": "2020-11-18T10:43:09.781642", 948 | "version": "2.1.1" 949 | }, 950 | "toc-autonumbering": false, 951 | "toc-showcode": false, 952 | "toc-showmarkdowntxt": false, 953 | "toc-showtags": false 954 | }, 955 | "nbformat": 4, 956 | "nbformat_minor": 4 957 | } 958 | -------------------------------------------------------------------------------- /notebooks/process_flight_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "papermill": { 7 | "duration": 0.012298, 8 | "end_time": "2020-11-18T10:41:24.439638", 9 | "exception": false, 10 | "start_time": "2020-11-18T10:41:24.427340", 11 | "status": "completed" 12 | }, 13 | "tags": [] 14 | }, 15 | "source": [ 16 | "# Processing the Airline Reporting Carrier On-Time Performance Dataset\n", 17 | "\n", 18 | "This notebook relates to the Airline Reporting Carrier On-Time Performance Dataset. The dataset contains information on approximately 200 million domestic US flights reported to the United States Bureau of Transportation Statistics, from 1987 - 2020. This dataset is freely available from the IBM Developer [Data Asset Exchange](https://developer.ibm.com/exchanges/data/all/airline/).\n", 19 | "\n", 20 | "**Note** the full dataset is very large (over 80GB uncompressed), so here we work with a smaller sample dataset containing a total of 2 million rows.\n", 21 | "\n", 22 | "In this notebook, we process the raw dataset by:\n", 23 | "* selecting the columns we wish to keep for later analysis\n", 24 | "* converting and cleaning data where required\n", 25 | "* handling missing values\n", 26 | "\n", 27 | "#### Import required modules\n", 28 | "\n", 29 | "Import and configure the required modules." 
30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": { 36 | "papermill": { 37 | "duration": 0.020919, 38 | "end_time": "2020-11-18T10:41:24.474707", 39 | "exception": false, 40 | "start_time": "2020-11-18T10:41:24.453788", 41 | "status": "completed" 42 | }, 43 | "tags": [] 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "# !pip install pandas > /dev/null 2>&1" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": { 54 | "papermill": { 55 | "duration": 0.938723, 56 | "end_time": "2020-11-18T10:41:25.426277", 57 | "exception": false, 58 | "start_time": "2020-11-18T10:41:24.487554", 59 | "status": "completed" 60 | }, 61 | "tags": [] 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "# Define required imports\n", 66 | "import pandas as pd\n", 67 | "# These set pandas max column and row display in the notebook\n", 68 | "pd.set_option('display.max_columns', 50)\n", 69 | "pd.set_option('display.max_rows', 50)" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": { 75 | "papermill": { 76 | "duration": 0.023472, 77 | "end_time": "2020-11-18T10:41:25.467588", 78 | "exception": false, 79 | "start_time": "2020-11-18T10:41:25.444116", 80 | "status": "completed" 81 | }, 82 | "tags": [] 83 | }, 84 | "source": [ 85 | "### Read the Raw Data\n", 86 | "\n", 87 | "We start by reading in the raw dataset and displaying the first few rows of the dataframe." 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": { 94 | "papermill": { 95 | "duration": 51.607155, 96 | "end_time": "2020-11-18T10:42:17.099275", 97 | "exception": false, 98 | "start_time": "2020-11-18T10:41:25.492120", 99 | "status": "completed" 100 | }, 101 | "tags": [] 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "data_path = 'data/airline_2m.csv'\n", 106 | "raw_data = pd.read_csv(data_path, encoding = \"ISO-8859-1\", parse_dates=['FlightDate'],\n", 107 | " dtype={'Div1Airport': str, 'Div1TailNum': str, 'Div2Airport': str, 'Div2TailNum': str})\n", 108 | "raw_data.head()" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": { 114 | "papermill": { 115 | "duration": 0.02727, 116 | "end_time": "2020-11-18T10:42:17.155366", 117 | "exception": false, 118 | "start_time": "2020-11-18T10:42:17.128096", 119 | "status": "completed" 120 | }, 121 | "tags": [] 122 | }, 123 | "source": [ 124 | "### Clean the Data\n", 125 | "\n", 126 | "Fortunately, the airline delay dataset is relatively clean already! The fields we wish to use already represent variables such as unique codes for the airline, origin and destination. There are also fields representing binned variables for departure time slot and flight distance.\n", 127 | "\n", 128 | "We will select a subset of the data relating to years 2010-2017, with origin airport `JFK`, to match our weather data. For simplicity, we will focus on delayed flights and ignore flight cancellations. We will ignore arrival delays, hence we will be focusing on predicting \"departure delays\" _from JFK_ to other destinations, using the field `DepDel15`. This is a binary value indicating whether the flight was delayed by more than 15 minutes (deemed to be `delayed`) or not (deemed to be `on time`)." 
129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": { 135 | "papermill": { 136 | "duration": 6.760619, 137 | "end_time": "2020-11-18T10:42:23.945236", 138 | "exception": false, 139 | "start_time": "2020-11-18T10:42:17.184617", 140 | "status": "completed" 141 | }, 142 | "tags": [] 143 | }, 144 | "outputs": [], 145 | "source": [ 146 | "# Select the data sub-set for years 2010-2017 for flights originating from JFK\n", 147 | "jfk_flights = raw_data.copy()\n", 148 | "jfk_flights = jfk_flights[(jfk_flights['Origin'] == 'JFK') & (jfk_flights['Year'].isin(range(2010, 2018))) & (jfk_flights['Cancelled'] == 0)]\n", 149 | "jfk_flights.head()" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": { 155 | "papermill": { 156 | "duration": 0.020117, 157 | "end_time": "2020-11-18T10:42:23.994662", 158 | "exception": false, 159 | "start_time": "2020-11-18T10:42:23.974545", 160 | "status": "completed" 161 | }, 162 | "tags": [] 163 | }, 164 | "source": [ 165 | "We create a mapping of airline ID to a more readable airline name (see the [airline ID lookup table](https://www.transtats.bts.gov/Download_Lookup.asp?Lookup=L_AIRLINE_ID))." 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": { 172 | "papermill": { 173 | "duration": 0.039415, 174 | "end_time": "2020-11-18T10:42:24.057478", 175 | "exception": false, 176 | "start_time": "2020-11-18T10:42:24.018063", 177 | "status": "completed" 178 | }, 179 | "tags": [] 180 | }, 181 | "outputs": [], 182 | "source": [ 183 | "airline_codes = {\n", 184 | " 20409: 'JetBlue',\n", 185 | " 19790: 'Delta',\n", 186 | " 19805: 'American Airlines',\n", 187 | " 20398: 'Envoy Air',\n", 188 | " 19977: 'United',\n", 189 | " 21171: 'Virgin America',\n", 190 | " 20363: 'Endeavor Air',\n", 191 | " 20417: 'Comair',\n", 192 | " 20355: 'US Airways',\n", 193 | " 20366: 'ExpressJet',\n", 194 | " 19690: 'Hawaiian Airlines',\n", 195 | " 19930: 'Alaska Airlines',\n", 196 | " 20378: 'Mesa Airlines'\n", 197 | "}" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": { 204 | "papermill": { 205 | "duration": 0.075155, 206 | "end_time": "2020-11-18T10:42:24.154423", 207 | "exception": false, 208 | "start_time": "2020-11-18T10:42:24.079268", 209 | "status": "completed" 210 | }, 211 | "tags": [] 212 | }, 213 | "outputs": [], 214 | "source": [ 215 | "jfk_flights.loc[:, 'airline_name'] = jfk_flights['DOT_ID_Reporting_Airline'].map(lambda x: airline_codes[x])" 216 | ] 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "metadata": { 221 | "papermill": { 222 | "duration": 0.015748, 223 | "end_time": "2020-11-18T10:42:24.188117", 224 | "exception": false, 225 | "start_time": "2020-11-18T10:42:24.172369", 226 | "status": "completed" 227 | }, 228 | "tags": [] 229 | }, 230 | "source": [ 231 | "Next, we select the set of columns to keep for downstream analysis and rename the columns to `snake_case` for consistency with our processed weather dataset."
232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "metadata": { 238 | "papermill": { 239 | "duration": 0.061216, 240 | "end_time": "2020-11-18T10:42:24.265686", 241 | "exception": false, 242 | "start_time": "2020-11-18T10:42:24.204470", 243 | "status": "completed" 244 | }, 245 | "tags": [] 246 | }, 247 | "outputs": [], 248 | "source": [ 249 | "cols_to_keep = ['FlightDate', 'Month', 'DayofMonth', 'DayOfWeek', 'DOT_ID_Reporting_Airline', 'airline_name', 'Origin', 'Dest', 'CRSDepTime', 'DepTimeBlk', 'DistanceGroup', 'DepDel15', 'DepDelay']\n", 250 | "jfk_flights = jfk_flights[cols_to_keep]\n", 251 | "col_names = {\n", 252 | " 'FlightDate': 'flight_date',\n", 253 | " 'Month': 'month',\n", 254 | " 'DayofMonth': 'day_of_month',\n", 255 | " 'DayOfWeek': 'day_of_week',\n", 256 | " 'DOT_ID_Reporting_Airline': 'airline_id',\n", 257 | " 'Origin': 'origin',\n", 258 | " 'Dest': 'dest',\n", 259 | " 'CRSDepTime': 'sched_dep_time',\n", 260 | " 'DepTimeBlk': 'dep_time_bin',\n", 261 | " 'DistanceGroup': 'distance_bin',\n", 262 | " 'DepDel15': 'delayed',\n", 263 | " 'DepDelay': 'dep_delay'\n", 264 | "}\n", 265 | "jfk_flights_renamed = jfk_flights.rename(columns=col_names)\n", 266 | "jfk_flights_renamed.head()" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "metadata": { 273 | "papermill": { 274 | "duration": 0.043619, 275 | "end_time": "2020-11-18T10:42:24.324809", 276 | "exception": false, 277 | "start_time": "2020-11-18T10:42:24.281190", 278 | "status": "completed" 279 | }, 280 | "tags": [] 281 | }, 282 | "outputs": [], 283 | "source": [ 284 | "# Log some general information about the dataset\n", 285 | "print('# of columns: ' + str(jfk_flights_renamed.shape[1])) \n", 286 | "print('# of observations: ' + str(jfk_flights_renamed.shape[0]))\n", 287 | "print('Start date: ' + str(jfk_flights_renamed['flight_date'].min()))\n", 288 | "print('End date: ' + str(jfk_flights_renamed['flight_date'].max()))" 289 | ] 290 | }, 291 | { 292 | "cell_type": "markdown", 293 | "metadata": { 294 | "papermill": { 295 | "duration": 0.020485, 296 | "end_time": "2020-11-18T10:42:24.374384", 297 | "exception": false, 298 | "start_time": "2020-11-18T10:42:24.353899", 299 | "status": "completed" 300 | }, 301 | "tags": [] 302 | }, 303 | "source": [ 304 | "### Save the Processed Data\n", 305 | "\n", 306 | "Finally, we save the processed dataset for use by downstream tasks." 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": null, 312 | "metadata": { 313 | "papermill": { 314 | "duration": 0.245405, 315 | "end_time": "2020-11-18T10:42:24.638852", 316 | "exception": false, 317 | "start_time": "2020-11-18T10:42:24.393447", 318 | "status": "completed" 319 | }, 320 | "tags": [] 321 | }, 322 | "outputs": [], 323 | "source": [ 324 | "jfk_flights_renamed.to_csv('data/jfk_flight_features.csv', index=False, float_format='%g')" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": { 330 | "papermill": { 331 | "duration": 0.027191, 332 | "end_time": "2020-11-18T10:42:24.695646", 333 | "exception": false, 334 | "start_time": "2020-11-18T10:42:24.668455", 335 | "status": "completed" 336 | }, 337 | "tags": [] 338 | }, 339 | "source": [ 340 | " ### Authors\n", 341 | " \n", 342 | " This notebook was created by the [Center for Open-Source Data & AI Technologies](http://codait.org).\n", 343 | "\n", 344 | "Copyright © 2020 IBM. This notebook and its source code are released under the terms of the MIT License." 
345 | ] 346 | } 347 | ], 348 | "metadata": { 349 | "kernelspec": { 350 | "display_name": "Python 3", 351 | "language": "python", 352 | "name": "python3" 353 | }, 354 | "language_info": { 355 | "codemirror_mode": { 356 | "name": "ipython", 357 | "version": 3 358 | }, 359 | "file_extension": ".py", 360 | "mimetype": "text/x-python", 361 | "name": "python", 362 | "nbconvert_exporter": "python", 363 | "pygments_lexer": "ipython3", 364 | "version": "3.7.9" 365 | }, 366 | "papermill": { 367 | "duration": 62.871732, 368 | "end_time": "2020-11-18T10:42:25.559608", 369 | "environment_variables": {}, 370 | "exception": null, 371 | "input_path": "/Users/nick/workspace/python/flight-delay-notebooks/notebooks/process_flight_data.ipynb", 372 | "output_path": "/Users/nick/workspace/python/flight-delay-notebooks/notebooks/process_flight_data.ipynb", 373 | "parameters": {}, 374 | "start_time": "2020-11-18T10:41:22.687876", 375 | "version": "2.1.1" 376 | }, 377 | "toc-autonumbering": false, 378 | "toc-showcode": false, 379 | "toc-showmarkdowntxt": false, 380 | "toc-showtags": false 381 | }, 382 | "nbformat": 4, 383 | "nbformat_minor": 4 384 | } 385 | -------------------------------------------------------------------------------- /notebooks/process_weather_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "papermill": { 7 | "duration": 0.057903, 8 | "end_time": "2020-11-18T10:42:27.967529", 9 | "exception": false, 10 | "start_time": "2020-11-18T10:42:27.909626", 11 | "status": "completed" 12 | }, 13 | "tags": [] 14 | }, 15 | "source": [ 16 | "# Processing NOAA Weather Data of JFK Airport (New York)\n", 17 | "\n", 18 | "This notebook relates to the NOAA Weather Dataset - JFK Airport (New York). The dataset contains 114,546 hourly observations of 12 local climatological variables (such as temperature, wind speed and precipitation) collected at JFK airport. This dataset is freely available from the IBM Developer [Data Asset Exchange](https://developer.ibm.com/exchanges/data/all/jfk-weather-data/).\n", 19 | "\n", 20 | "In this notebook, we process the raw dataset by:\n", 21 | "* selecting the columns we wish to keep for later downstream tasks\n", 22 | "* converting and cleaning data where required\n", 23 | "* filling missing values\n", 24 | "* extracting categorical weather features\n", 25 | "\n", 26 | "#### Import required modules\n", 27 | "\n", 28 | "Import and configure the required modules." 
29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": { 35 | "papermill": { 36 | "duration": 0.072604, 37 | "end_time": "2020-11-18T10:42:28.096347", 38 | "exception": false, 39 | "start_time": "2020-11-18T10:42:28.023743", 40 | "status": "completed" 41 | }, 42 | "tags": [] 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "# !pip install pandas > /dev/null 2>&1" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": { 53 | "papermill": { 54 | "duration": 1.236463, 55 | "end_time": "2020-11-18T10:42:29.391378", 56 | "exception": false, 57 | "start_time": "2020-11-18T10:42:28.154915", 58 | "status": "completed" 59 | }, 60 | "tags": [] 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "# Define required imports\n", 65 | "import pandas as pd\n", 66 | "import numpy as np\n", 67 | "import re\n", 68 | "# These set pandas max column and row display in the notebook\n", 69 | "pd.set_option('display.max_columns', 50)\n", 70 | "pd.set_option('display.max_rows', 50)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": { 76 | "papermill": { 77 | "duration": 0.053432, 78 | "end_time": "2020-11-18T10:42:29.492354", 79 | "exception": false, 80 | "start_time": "2020-11-18T10:42:29.438922", 81 | "status": "completed" 82 | }, 83 | "tags": [] 84 | }, 85 | "source": [ 86 | "### Read the Raw Data\n", 87 | "\n", 88 | "We start by reading in the raw dataset, displaying the first few rows of the dataframe, and taking a look at the columns and column types present." 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": { 95 | "papermill": { 96 | "duration": 2.248616, 97 | "end_time": "2020-11-18T10:42:31.818019", 98 | "exception": false, 99 | "start_time": "2020-11-18T10:42:29.569403", 100 | "status": "completed" 101 | }, 102 | "tags": [] 103 | }, 104 | "outputs": [], 105 | "source": [ 106 | "raw_data = pd.read_csv('data/noaa-weather-data-jfk-airport/jfk_weather.csv', parse_dates=['DATE'])\n", 107 | "raw_data.head()" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": { 113 | "papermill": { 114 | "duration": 0.047672, 115 | "end_time": "2020-11-18T10:42:31.916277", 116 | "exception": false, 117 | "start_time": "2020-11-18T10:42:31.868605", 118 | "status": "completed" 119 | }, 120 | "tags": [] 121 | }, 122 | "source": [ 123 | "### Clean the Data\n", 124 | "\n", 125 | "As you can see above, there are a lot of fields which are non-numerical - usually these will be fields that contain text or categorical data, e.g. `HOURLYPRSENTWEATHERTYPE`.\n", 126 | "\n", 127 | "There are also fields, such as `HOURLYVISIBILITY`, that we may expect to be numerical, but are instead `object` type. This often indicates that there may be missing (or `null`) values, or some other unusual readings that we may have to deal with (since otherwise the field would have been fully parsed as a numerical data type).\n", 128 | "\n", 129 | "In addition, some fields relate to hourly observations, while others relate to daily or monthly intervals. For purposes of later analysis, we will restrict the dataset to a certain subset of fields that relate to hourly observations.\n", 130 | "\n", 131 | "In this section, we refer to the [NOAA Local Climatological Data Documentation](https://data.noaa.gov/dataset/dataset/u-s-local-climatological-data-lcd/resource/ee7381ea-647a-434f-8cfa-81202b9b4c05) to describe the fields and meaning of various values." 
132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": { 137 | "papermill": { 138 | "duration": 0.036204, 139 | "end_time": "2020-11-18T10:42:31.987330", 140 | "exception": false, 141 | "start_time": "2020-11-18T10:42:31.951126", 142 | "status": "completed" 143 | }, 144 | "tags": [] 145 | }, 146 | "source": [ 147 | "#### Select data subset\n", 148 | "\n", 149 | "First, we select only the subset of data of interest. We will keep data for years 2010 - 2017 related to routine hourly weather station reports. We will also restrict our dataset to only a subset of column types that we expect to be pertinent for downstream tasks." 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": { 156 | "papermill": { 157 | "duration": 0.213779, 158 | "end_time": "2020-11-18T10:42:32.244195", 159 | "exception": false, 160 | "start_time": "2020-11-18T10:42:32.030416", 161 | "status": "completed" 162 | }, 163 | "tags": [] 164 | }, 165 | "outputs": [], 166 | "source": [ 167 | "# Choose what columns to import from raw data\n", 168 | "column_subset = [\n", 169 | " 'DATE',\n", 170 | " 'HOURLYVISIBILITY',\n", 171 | " 'HOURLYPRSENTWEATHERTYPE',\n", 172 | " 'HOURLYWindSpeed',\n", 173 | " 'HOURLYWindGustSpeed',\n", 174 | " 'HOURLYPrecip'\n", 175 | "]\n", 176 | "\n", 177 | "# Select the data sub-set for years 2010-2017 & report type FM-15 (routine hourly weather reports)\n", 178 | "data_subset = raw_data[(raw_data['DATE'].dt.year.isin(range(2010, 2018))) & (raw_data['REPORTTPYE'] == 'FM-15')]\n", 179 | "# Filter dataset to relevant columns\n", 180 | "weather_data = data_subset.loc[:, column_subset]\n", 181 | "# Set date index\n", 182 | "weather_data = weather_data.set_index(pd.DatetimeIndex(weather_data['DATE']))\n", 183 | "weather_data.drop(['DATE'], axis=1, inplace=True)\n", 184 | "weather_data.replace(to_replace='*', value=np.nan, inplace=True)\n", 185 | "weather_data.head()" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "metadata": { 192 | "papermill": { 193 | "duration": 0.052686, 194 | "end_time": "2020-11-18T10:42:32.344763", 195 | "exception": false, 196 | "start_time": "2020-11-18T10:42:32.292077", 197 | "status": "completed" 198 | }, 199 | "tags": [] 200 | }, 201 | "outputs": [], 202 | "source": [ 203 | "weather_data.dtypes" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": { 209 | "papermill": { 210 | "duration": 0.038565, 211 | "end_time": "2020-11-18T10:42:32.425397", 212 | "exception": false, 213 | "start_time": "2020-11-18T10:42:32.386832", 214 | "status": "completed" 215 | }, 216 | "tags": [] 217 | }, 218 | "source": [ 219 | "#### Clean up precipitation column\n", 220 | "\n", 221 | "From the dataframe preview above, we can see that the column `HOURLYPrecip` - which is the hourly measure of precipitation levels - contains both `NaN` and `T` values. `T` specifies *trace amounts of precipitation*, while `NaN` means *not a number*, and is used to denote missing values.\n", 222 | "\n", 223 | "We can also inspect the unique values present for the field."
224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": { 230 | "papermill": { 231 | "duration": 0.046581, 232 | "end_time": "2020-11-18T10:42:32.509475", 233 | "exception": false, 234 | "start_time": "2020-11-18T10:42:32.462894", 235 | "status": "completed" 236 | }, 237 | "tags": [] 238 | }, 239 | "outputs": [], 240 | "source": [ 241 | "weather_data['HOURLYPrecip'].unique()" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": { 247 | "papermill": { 248 | "duration": 0.036444, 249 | "end_time": "2020-11-18T10:42:32.573140", 250 | "exception": false, 251 | "start_time": "2020-11-18T10:42:32.536696", 252 | "status": "completed" 253 | }, 254 | "tags": [] 255 | }, 256 | "source": [ 257 | "We can see that some values end with an `s` (indicating snow), while there is a strange value `0.020.01s` which appears to be an error of some sort. To deal with `T` values, we will set the observation to be `0`. We will also replace the erroneous value `0.020.01s` with `NaN`.\n", 258 | "\n", 259 | "Finally, we will replace all `NaN` entries with `0`, i.e. we assume no precipitation was present." 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": null, 265 | "metadata": { 266 | "papermill": { 267 | "duration": 0.047662, 268 | "end_time": "2020-11-18T10:42:32.657348", 269 | "exception": false, 270 | "start_time": "2020-11-18T10:42:32.609686", 271 | "status": "completed" 272 | }, 273 | "tags": [] 274 | }, 275 | "outputs": [], 276 | "source": [ 277 | "# Fix precipitation data\n", 278 | "weather_data['HOURLYPrecip'].replace(to_replace='T', value='0.00', inplace=True)\n", 279 | "weather_data['HOURLYPrecip'].replace('0.020.01s', np.nan, inplace=True)\n", 280 | "weather_data.fillna(value={'HOURLYPrecip': 0}, inplace=True)" 281 | ] 282 | }, 283 | { 284 | "cell_type": "markdown", 285 | "metadata": { 286 | "papermill": { 287 | "duration": 0.0353, 288 | "end_time": "2020-11-18T10:42:32.720877", 289 | "exception": false, 290 | "start_time": "2020-11-18T10:42:32.685577", 291 | "status": "completed" 292 | }, 293 | "tags": [] 294 | }, 295 | "source": [ 296 | "#### Inspect visibility column\n", 297 | "\n", 298 | "As we have done for precipitation, we can also inspect the unique values present for the column `HOURLYVISIBILITY` - which is the hourly measure of visibility. Below, we see that some values are `nan`, while some end with a `V`. " 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": null, 304 | "metadata": { 305 | "papermill": { 306 | "duration": 0.047845, 307 | "end_time": "2020-11-18T10:42:32.796248", 308 | "exception": false, 309 | "start_time": "2020-11-18T10:42:32.748403", 310 | "status": "completed" 311 | }, 312 | "tags": [] 313 | }, 314 | "outputs": [], 315 | "source": [ 316 | "weather_data['HOURLYVISIBILITY'].unique()" 317 | ] 318 | }, 319 | { 320 | "cell_type": "markdown", 321 | "metadata": { 322 | "papermill": { 323 | "duration": 0.033404, 324 | "end_time": "2020-11-18T10:42:32.875565", 325 | "exception": false, 326 | "start_time": "2020-11-18T10:42:32.842161", 327 | "status": "completed" 328 | }, 329 | "tags": [] 330 | }, 331 | "source": [ 332 | "#### Convert columns to numerical types\n", 333 | "\n", 334 | "Next, we will convert string columns that refer to numerical values to numerical types. For columns such as `HOURLYPrecip` and `HOURLYVISIBILITY`, we first also drop the non-numerical parts of the value (e.g. the `s` and `V` characters)."
335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": null, 340 | "metadata": { 341 | "papermill": { 342 | "duration": 0.612421, 343 | "end_time": "2020-11-18T10:42:33.517136", 344 | "exception": false, 345 | "start_time": "2020-11-18T10:42:32.904715", 346 | "status": "completed" 347 | }, 348 | "tags": [] 349 | }, 350 | "outputs": [], 351 | "source": [ 352 | "# Set of columns to convert\n", 353 | "messy_columns = ['HOURLYVISIBILITY', 'HOURLYPrecip', 'HOURLYWindSpeed', 'HOURLYWindGustSpeed']\n", 354 | "\n", 355 | "# Convert columns to float32 datatype\n", 356 | "for i in messy_columns:\n", 357 | " weather_data[i] = weather_data[i].apply(lambda x: re.sub('[^0-9,.-]', '', x) if type(x) == str else x).replace('', np.nan).astype('float32')" 358 | ] 359 | }, 360 | { 361 | "cell_type": "markdown", 362 | "metadata": { 363 | "papermill": { 364 | "duration": 0.02799, 365 | "end_time": "2020-11-18T10:42:33.571211", 366 | "exception": false, 367 | "start_time": "2020-11-18T10:42:33.543221", 368 | "status": "completed" 369 | }, 370 | "tags": [] 371 | }, 372 | "source": [ 373 | "We can now see that all fields have numerical data type." 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": null, 379 | "metadata": { 380 | "papermill": { 381 | "duration": 0.152779, 382 | "end_time": "2020-11-18T10:42:33.793868", 383 | "exception": false, 384 | "start_time": "2020-11-18T10:42:33.641089", 385 | "status": "completed" 386 | }, 387 | "tags": [] 388 | }, 389 | "outputs": [], 390 | "source": [ 391 | "print(weather_data.info())\n", 392 | "# Generate the summary statistics for each column\n", 393 | "weather_data.describe()" 394 | ] 395 | }, 396 | { 397 | "cell_type": "markdown", 398 | "metadata": { 399 | "papermill": { 400 | "duration": 0.047737, 401 | "end_time": "2020-11-18T10:42:33.890040", 402 | "exception": false, 403 | "start_time": "2020-11-18T10:42:33.842303", 404 | "status": "completed" 405 | }, 406 | "tags": [] 407 | }, 408 | "source": [ 409 | "For wind gusts, rather than have `NaN` entries (which represent no gusts), we will represent the gust speed column as \"excess speed\" over the `HOURLYWindSpeed` values." 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": null, 415 | "metadata": { 416 | "papermill": { 417 | "duration": 0.276195, 418 | "end_time": "2020-11-18T10:42:34.197609", 419 | "exception": false, 420 | "start_time": "2020-11-18T10:42:33.921414", 421 | "status": "completed" 422 | }, 423 | "tags": [] 424 | }, 425 | "outputs": [], 426 | "source": [ 427 | "weather_data.loc[:, 'HOURLYWindGustSpeed'] = np.vectorize(lambda x, y: 0.0 if np.isnan(y) else y - x)(\n", 428 | " weather_data['HOURLYWindSpeed'], weather_data['HOURLYWindGustSpeed'])\n", 429 | "weather_data.head()" 430 | ] 431 | }, 432 | { 433 | "cell_type": "markdown", 434 | "metadata": { 435 | "papermill": { 436 | "duration": 0.034999, 437 | "end_time": "2020-11-18T10:42:34.262944", 438 | "exception": false, 439 | "start_time": "2020-11-18T10:42:34.227945", 440 | "status": "completed" 441 | }, 442 | "tags": [] 443 | }, 444 | "source": [ 445 | "#### Check date index\n", 446 | "\n", 447 | "Next, we check if there are any duplicates with respect to our `DATE` index and furthermore check that our dates are in the correct order (that is, strictly increasing)."
448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": null, 453 | "metadata": { 454 | "papermill": { 455 | "duration": 0.054207, 456 | "end_time": "2020-11-18T10:42:34.353374", 457 | "exception": false, 458 | "start_time": "2020-11-18T10:42:34.299167", 459 | "status": "completed" 460 | }, 461 | "tags": [] 462 | }, 463 | "outputs": [], 464 | "source": [ 465 | "cond = len(weather_data[weather_data.index.duplicated()].sort_index())\n", 466 | "print('Date index contains no duplicate entries: {}'.format(cond == 0))\n", 467 | "print('Date index is strictly increasing: {}'.format(weather_data.index.is_monotonic_increasing))" 468 | ] 469 | }, 470 | { 471 | "cell_type": "markdown", 472 | "metadata": { 473 | "papermill": { 474 | "duration": 0.032104, 475 | "end_time": "2020-11-18T10:42:34.417847", 476 | "exception": false, 477 | "start_time": "2020-11-18T10:42:34.385743", 478 | "status": "completed" 479 | }, 480 | "tags": [] 481 | }, 482 | "source": [ 483 | "### Categorical Feature Extraction\n", 484 | "\n", 485 | "The final pre-processing step we will perform will be to handle the `HOURLYPRSENTWEATHERTYPE` column to correctly encode the weather features. This column indicates the presence of specific weather types for the given reading. For example, `-RA:02 BR:1 |RA:61 |RA:61` refers to 3 types of reading:\n", 486 | "1. `AU` codes for automated weather readings\n", 487 | "2. `AW` codes for a different type of automated weather reading\n", 488 | "3. `MW` codes for manually-augmented weather readings\n", 489 | "\n", 490 | "This example reading happens to contain all 3 types, separated by a `|` character. The `AU` code is thus `-RA:02 BR:1`. If we refer to the data documentation linked above, we can see this indicates the presence of `RA:02 - Rain` and `BR:1 - Mist`.\n", 491 | "\n", 492 | "These _present weather types_ are categorical variables. **Note** that multiple categories of weather can be present. In order to process this column, we will:\n", 493 | "* only use the `AU` codes for simplicity\n", 494 | "* convert the codes to more readable category names\n", 495 | "* extract the weather type categories into individual binary columns representing the presence (`1`) or absence (`0`) of that category. 
This is like \"one-hot encoding\" but for multi-category variables\n", 496 | "\n", 497 | "We start with creating a mapping from codes to category names" 498 | ] 499 | }, 500 | { 501 | "cell_type": "code", 502 | "execution_count": null, 503 | "metadata": { 504 | "papermill": { 505 | "duration": 0.05096, 506 | "end_time": "2020-11-18T10:42:34.509497", 507 | "exception": false, 508 | "start_time": "2020-11-18T10:42:34.458537", 509 | "status": "completed" 510 | }, 511 | "tags": [] 512 | }, 513 | "outputs": [], 514 | "source": [ 515 | "# start with raw types taken from the LCD Dataset Documentation\n", 516 | "# we convert the raw weather type names to snake_case\n", 517 | "raw_types = '''DZ:01 - drizzle\n", 518 | "RA:02 - rain\n", 519 | "SN:03 - snow\n", 520 | "SG:04 - snow_grains\n", 521 | "IC:05 - ice_crystals\n", 522 | "PL:06 - ice_pellets\n", 523 | "GR:07 - hail\n", 524 | "GS:08 - small_hail_snow_pellets\n", 525 | "UP:09 - unknown_precipitation\n", 526 | "BR:1 - mist\n", 527 | "FG:2 - fog\n", 528 | "FU:3 - smoke\n", 529 | "VA:4 - volcanic_ash\n", 530 | "DU:5 - widespread_dust\n", 531 | "SA:6 - sand\n", 532 | "HZ:7 - haze\n", 533 | "PY:8 - spray\n", 534 | "PO:1 - well_developed_dust\n", 535 | "SQ:2 - squalls\n", 536 | "FC:3 - funnel_cloud_waterspout_tornado\n", 537 | "SS:4 - sandstorm\n", 538 | "DS:5 - duststorm'''.split('\\n')\n", 539 | "\n", 540 | "raw_types = [t.split(' - ') for t in raw_types]\n", 541 | "weather_types = {t[0]: t[1] for t in raw_types}\n", 542 | "# Add in a code that is inconsistently represented in the documentation\n", 543 | "weather_types['TS:7'] = 'thunderstorm'\n", 544 | "weather_types" 545 | ] 546 | }, 547 | { 548 | "cell_type": "markdown", 549 | "metadata": { 550 | "papermill": { 551 | "duration": 0.052291, 552 | "end_time": "2020-11-18T10:42:34.593200", 553 | "exception": false, 554 | "start_time": "2020-11-18T10:42:34.540909", 555 | "status": "completed" 556 | }, 557 | "tags": [] 558 | }, 559 | "source": [ 560 | "There are still a few edge cases that do not fall within the weather type mapping we have created. For the purposes of simplification, we will ignore these, since we have captured the main weather types in our mapping. So, we create a function to convert codes to category names, handling any errors." 
561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": null, 566 | "metadata": { 567 | "papermill": { 568 | "duration": 0.074268, 569 | "end_time": "2020-11-18T10:42:34.731431", 570 | "exception": false, 571 | "start_time": "2020-11-18T10:42:34.657163", 572 | "status": "completed" 573 | }, 574 | "tags": [] 575 | }, 576 | "outputs": [], 577 | "source": [ 578 | "def get_type(k):\n", 579 | " if k in weather_types:\n", 580 | " return weather_types[k]\n", 581 | " else:\n", 582 | " return ''\n", 583 | " \n", 584 | "def extract_weather_type(x):\n", 585 | " wt = x.split('|')[0].split() if isinstance(x, str) else []\n", 586 | " wt = [get_type(w.lstrip('-').lstrip('+')) for w in wt]\n", 587 | " return wt" 588 | ] 589 | }, 590 | { 591 | "cell_type": "markdown", 592 | "metadata": { 593 | "papermill": { 594 | "duration": 0.068491, 595 | "end_time": "2020-11-18T10:42:34.864111", 596 | "exception": false, 597 | "start_time": "2020-11-18T10:42:34.795620", 598 | "status": "completed" 599 | }, 600 | "tags": [] 601 | }, 602 | "source": [ 603 | "Let's test our function out:" 604 | ] 605 | }, 606 | { 607 | "cell_type": "code", 608 | "execution_count": null, 609 | "metadata": { 610 | "papermill": { 611 | "duration": 0.224131, 612 | "end_time": "2020-11-18T10:42:35.139554", 613 | "exception": false, 614 | "start_time": "2020-11-18T10:42:34.915423", 615 | "status": "completed" 616 | }, 617 | "tags": [] 618 | }, 619 | "outputs": [], 620 | "source": [ 621 | "print(weather_data['HOURLYPRSENTWEATHERTYPE'].head(5))\n", 622 | "print()\n", 623 | "print(weather_data['HOURLYPRSENTWEATHERTYPE'].apply(extract_weather_type).head(5))" 624 | ] 625 | }, 626 | { 627 | "cell_type": "markdown", 628 | "metadata": { 629 | "papermill": { 630 | "duration": 0.054716, 631 | "end_time": "2020-11-18T10:42:35.260798", 632 | "exception": false, 633 | "start_time": "2020-11-18T10:42:35.206082", 634 | "status": "completed" 635 | }, 636 | "tags": [] 637 | }, 638 | "source": [ 639 | "That seems to be working. 
Next, we binarize the present weather categories in each cell:" 640 | ] 641 | }, 642 | { 643 | "cell_type": "code", 644 | "execution_count": null, 645 | "metadata": { 646 | "papermill": { 647 | "duration": 0.820289, 648 | "end_time": "2020-11-18T10:42:36.111270", 649 | "exception": false, 650 | "start_time": "2020-11-18T10:42:35.290981", 651 | "status": "completed" 652 | }, 653 | "tags": [] 654 | }, 655 | "outputs": [], 656 | "source": [ 657 | "from collections import Counter\n", 658 | "counts = weather_data['HOURLYPRSENTWEATHERTYPE'].apply(extract_weather_type).apply(Counter)\n", 659 | "counts = pd.DataFrame.from_records(counts).fillna(value=0).drop(columns = [''])\n", 660 | "counts" 661 | ] 662 | }, 663 | { 664 | "cell_type": "markdown", 665 | "metadata": { 666 | "papermill": { 667 | "duration": 0.053868, 668 | "end_time": "2020-11-18T10:42:36.209831", 669 | "exception": false, 670 | "start_time": "2020-11-18T10:42:36.155963", 671 | "status": "completed" 672 | }, 673 | "tags": [] 674 | }, 675 | "source": [ 676 | "Finally, we combine the extra columns we've created with our original dataframe:" 677 | ] 678 | }, 679 | { 680 | "cell_type": "code", 681 | "execution_count": null, 682 | "metadata": { 683 | "papermill": { 684 | "duration": 0.115044, 685 | "end_time": "2020-11-18T10:42:36.363452", 686 | "exception": false, 687 | "start_time": "2020-11-18T10:42:36.248408", 688 | "status": "completed" 689 | }, 690 | "tags": [] 691 | }, 692 | "outputs": [], 693 | "source": [ 694 | "cleaned_data = pd.concat([weather_data, counts.set_index(weather_data.index)], axis=1)\n", 695 | "cleaned_data" 696 | ] 697 | }, 698 | { 699 | "cell_type": "markdown", 700 | "metadata": { 701 | "papermill": { 702 | "duration": 0.053257, 703 | "end_time": "2020-11-18T10:42:36.454488", 704 | "exception": false, 705 | "start_time": "2020-11-18T10:42:36.401231", 706 | "status": "completed" 707 | }, 708 | "tags": [] 709 | }, 710 | "source": [ 711 | "#### Rename columns\n", 712 | "\n", 713 | "Before saving the dataset, we will rename the columns for readability." 
714 | ] 715 | }, 716 | { 717 | "cell_type": "code", 718 | "execution_count": null, 719 | "metadata": { 720 | "papermill": { 721 | "duration": 0.065562, 722 | "end_time": "2020-11-18T10:42:36.566875", 723 | "exception": false, 724 | "start_time": "2020-11-18T10:42:36.501313", 725 | "status": "completed" 726 | }, 727 | "tags": [] 728 | }, 729 | "outputs": [], 730 | "source": [ 731 | "cleaned_data.columns" 732 | ] 733 | }, 734 | { 735 | "cell_type": "code", 736 | "execution_count": null, 737 | "metadata": { 738 | "papermill": { 739 | "duration": 0.079686, 740 | "end_time": "2020-11-18T10:42:36.723515", 741 | "exception": false, 742 | "start_time": "2020-11-18T10:42:36.643829", 743 | "status": "completed" 744 | }, 745 | "tags": [] 746 | }, 747 | "outputs": [], 748 | "source": [ 749 | "# define some new column names for consistency\n", 750 | "columns_name_map = {\n", 751 | " 'HOURLYVISIBILITY': 'visibility',\n", 752 | " 'HOURLYPRSENTWEATHERTYPE': 'weather_type_raw',\n", 753 | " 'HOURLYWindSpeed': 'wind_speed',\n", 754 | " 'HOURLYWindGustSpeed': 'wind_gust_speed',\n", 755 | " 'HOURLYPrecip': 'precip',\n", 756 | "}\n", 757 | "\n", 758 | "cleaned_data_renamed = cleaned_data.rename(columns=columns_name_map)" 759 | ] 760 | }, 761 | { 762 | "cell_type": "code", 763 | "execution_count": null, 764 | "metadata": { 765 | "papermill": { 766 | "duration": 0.143285, 767 | "end_time": "2020-11-18T10:42:36.925259", 768 | "exception": false, 769 | "start_time": "2020-11-18T10:42:36.781974", 770 | "status": "completed" 771 | }, 772 | "tags": [] 773 | }, 774 | "outputs": [], 775 | "source": [ 776 | "print(cleaned_data_renamed.info())\n", 777 | "print()\n", 778 | "cleaned_data_renamed.head()" 779 | ] 780 | }, 781 | { 782 | "cell_type": "code", 783 | "execution_count": null, 784 | "metadata": { 785 | "papermill": { 786 | "duration": 0.161798, 787 | "end_time": "2020-11-18T10:42:37.164530", 788 | "exception": false, 789 | "start_time": "2020-11-18T10:42:37.002732", 790 | "status": "completed" 791 | }, 792 | "tags": [] 793 | }, 794 | "outputs": [], 795 | "source": [ 796 | "# Log some general information about the dataset\n", 797 | "print('# of columns: ' + str(cleaned_data_renamed.shape[1])) \n", 798 | "print('# of observations: ' + str(cleaned_data_renamed.shape[0]))\n", 799 | "print('Start date: ' + str(cleaned_data_renamed.index[0]))\n", 800 | "print('End date: ' + str(cleaned_data_renamed.index[-1]))" 801 | ] 802 | }, 803 | { 804 | "cell_type": "markdown", 805 | "metadata": { 806 | "papermill": { 807 | "duration": 0.109368, 808 | "end_time": "2020-11-18T10:42:37.386139", 809 | "exception": false, 810 | "start_time": "2020-11-18T10:42:37.276771", 811 | "status": "completed" 812 | }, 813 | "tags": [] 814 | }, 815 | "source": [ 816 | "### Save the Processed Data\n", 817 | "\n", 818 | "Finally, we save the processed dataset for use by downstream tasks." 
819 | ] 820 | }, 821 | { 822 | "cell_type": "code", 823 | "execution_count": null, 824 | "metadata": { 825 | "papermill": { 826 | "duration": 2.452826, 827 | "end_time": "2020-11-18T10:42:39.912468", 828 | "exception": false, 829 | "start_time": "2020-11-18T10:42:37.459642", 830 | "status": "completed" 831 | }, 832 | "tags": [] 833 | }, 834 | "outputs": [], 835 | "source": [ 836 | "cleaned_data_renamed.to_csv('data/jfk_weather_features.csv', float_format='%g')" 837 | ] 838 | }, 839 | { 840 | "cell_type": "markdown", 841 | "metadata": { 842 | "papermill": { 843 | "duration": 0.049907, 844 | "end_time": "2020-11-18T10:42:40.004559", 845 | "exception": false, 846 | "start_time": "2020-11-18T10:42:39.954652", 847 | "status": "completed" 848 | }, 849 | "tags": [] 850 | }, 851 | "source": [ 852 | " ### Authors\n", 853 | " \n", 854 | " This notebook was created by the [Center for Open-Source Data & AI Technologies](http://codait.org).\n", 855 | "\n", 856 | "Copyright © 2020 IBM. This notebook and its source code are released under the terms of the MIT License." 857 | ] 858 | } 859 | ], 860 | "metadata": { 861 | "kernelspec": { 862 | "display_name": "Python 3", 863 | "language": "python", 864 | "name": "python3" 865 | }, 866 | "language_info": { 867 | "codemirror_mode": { 868 | "name": "ipython", 869 | "version": 3 870 | }, 871 | "file_extension": ".py", 872 | "mimetype": "text/x-python", 873 | "name": "python", 874 | "nbconvert_exporter": "python", 875 | "pygments_lexer": "ipython3", 876 | "version": "3.7.9" 877 | }, 878 | "papermill": { 879 | "duration": 14.833694, 880 | "end_time": "2020-11-18T10:42:40.587559", 881 | "environment_variables": {}, 882 | "exception": null, 883 | "input_path": "/Users/nick/workspace/python/flight-delay-notebooks/notebooks/process_weather_data.ipynb", 884 | "output_path": "/Users/nick/workspace/python/flight-delay-notebooks/notebooks/process_weather_data.ipynb", 885 | "parameters": {}, 886 | "start_time": "2020-11-18T10:42:25.753865", 887 | "version": "2.1.1" 888 | } 889 | }, 890 | "nbformat": 4, 891 | "nbformat_minor": 4 892 | } 893 | -------------------------------------------------------------------------------- /pipelines/flight_delays.pipeline: -------------------------------------------------------------------------------- 1 | { 2 | "doc_type": "pipeline", 3 | "version": "3.0", 4 | "json_schema": "http://api.dataplatform.ibm.com/schemas/common-pipeline/pipeline-flow/pipeline-flow-v3-schema.json", 5 | "id": "d8fa85b7-04fd-467d-a45d-e127e1eccfe8", 6 | "primary_pipeline": "07d7f720-4cde-4c57-a4ee-e99c68f307b1", 7 | "pipelines": [ 8 | { 9 | "id": "07d7f720-4cde-4c57-a4ee-e99c68f307b1", 10 | "nodes": [ 11 | { 12 | "id": "2f3f6243-82ef-43ee-af09-9888a5dfbc30", 13 | "type": "execution_node", 14 | "op": "execute-python-node", 15 | "app_data": { 16 | "filename": "../notebooks/load_data.py", 17 | "runtime_image": "amancevice/pandas:1.1.1", 18 | "env_vars": [ 19 | "DATASET_URL=https://dax-cdn.cdn.appdomain.cloud/dax-airline/1.0.1/airline_2m.tar.gz" 20 | ], 21 | "include_subdirectories": false, 22 | "invalidNodeError": null, 23 | "outputs": [ 24 | "data/airline_2m.csv" 25 | ], 26 | "ui_data": { 27 | "label": "load_data", 28 | "image": 
"data:image/svg+xml;utf8,%3Csvg%20xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg%22%20width%3D%2216%22%20viewBox%3D%220%200%2022%2022%22%3E%0A%20%20%3Cg%20class%3D%22jp-icon-brand0%20jp-icon-selectable%22%20fill%3D%22%230D47A1%22%3E%0A%20%20%20%20%3Cpath%20d%3D%22M11.1%206.9V5.8H6.9c0-.5%200-1.3.2-1.6.4-.7.8-1.1%201.7-1.4%201.7-.3%202.5-.3%203.9-.1%201%20.1%201.9.9%201.9%201.9v4.2c0%20.5-.9%201.6-2%201.6H8.8c-1.5%200-2.4%201.4-2.4%202.8v2.2H4.7C3.5%2015.1%203%2014%203%2013.1V9c-.1-1%20.6-2%201.8-2%201.5-.1%206.3-.1%206.3-.1z%22%2F%3E%0A%20%20%20%20%3Cpath%20d%3D%22M10.9%2015.1v1.1h4.2c0%20.5%200%201.3-.2%201.6-.4.7-.8%201.1-1.7%201.4-1.7.3-2.5.3-3.9.1-1-.1-1.9-.9-1.9-1.9v-4.2c0-.5.9-1.6%202-1.6h3.8c1.5%200%202.4-1.4%202.4-2.8V6.6h1.7C18.5%206.9%2019%208%2019%208.9V13c0%201-.7%202.1-1.9%202.1h-6.2z%22%2F%3E%0A%20%20%3C%2Fg%3E%0A%3C%2Fsvg%3E%0A", 29 | "x_pos": 71.16775512695312, 30 | "y_pos": 91.4539566040039, 31 | "description": "Python Script" 32 | } 33 | }, 34 | "inputs": [ 35 | { 36 | "id": "inPort", 37 | "app_data": { 38 | "ui_data": { 39 | "cardinality": { 40 | "min": 0, 41 | "max": 1 42 | }, 43 | "label": "Input Port" 44 | } 45 | } 46 | } 47 | ], 48 | "outputs": [ 49 | { 50 | "id": "outPort", 51 | "app_data": { 52 | "ui_data": { 53 | "cardinality": { 54 | "min": 0, 55 | "max": -1 56 | }, 57 | "label": "Output Port" 58 | } 59 | } 60 | } 61 | ] 62 | }, 63 | { 64 | "id": "b614bf55-c127-413a-935a-175d7afa7f0e", 65 | "type": "execution_node", 66 | "op": "execute-python-node", 67 | "app_data": { 68 | "filename": "../notebooks/load_data.py", 69 | "runtime_image": "amancevice/pandas:1.1.1", 70 | "env_vars": [ 71 | "DATASET_URL=https://dax-cdn.cdn.appdomain.cloud/dax-noaa-weather-data-jfk-airport/1.1.4/noaa-weather-data-jfk-airport.tar.gz" 72 | ], 73 | "include_subdirectories": false, 74 | "invalidNodeError": null, 75 | "outputs": [ 76 | "data/noaa-weather-data-jfk-airport/jfk_weather.csv" 77 | ], 78 | "ui_data": { 79 | "label": "load_data", 80 | "image": "data:image/svg+xml;utf8,%3Csvg%20xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg%22%20width%3D%2216%22%20viewBox%3D%220%200%2022%2022%22%3E%0A%20%20%3Cg%20class%3D%22jp-icon-brand0%20jp-icon-selectable%22%20fill%3D%22%230D47A1%22%3E%0A%20%20%20%20%3Cpath%20d%3D%22M11.1%206.9V5.8H6.9c0-.5%200-1.3.2-1.6.4-.7.8-1.1%201.7-1.4%201.7-.3%202.5-.3%203.9-.1%201%20.1%201.9.9%201.9%201.9v4.2c0%20.5-.9%201.6-2%201.6H8.8c-1.5%200-2.4%201.4-2.4%202.8v2.2H4.7C3.5%2015.1%203%2014%203%2013.1V9c-.1-1%20.6-2%201.8-2%201.5-.1%206.3-.1%206.3-.1z%22%2F%3E%0A%20%20%20%20%3Cpath%20d%3D%22M10.9%2015.1v1.1h4.2c0%20.5%200%201.3-.2%201.6-.4.7-.8%201.1-1.7%201.4-1.7.3-2.5.3-3.9.1-1-.1-1.9-.9-1.9-1.9v-4.2c0-.5.9-1.6%202-1.6h3.8c1.5%200%202.4-1.4%202.4-2.8V6.6h1.7C18.5%206.9%2019%208%2019%208.9V13c0%201-.7%202.1-1.9%202.1h-6.2z%22%2F%3E%0A%20%20%3C%2Fg%3E%0A%3C%2Fsvg%3E%0A", 81 | "x_pos": 65, 82 | "y_pos": 241, 83 | "description": "Python Script" 84 | } 85 | }, 86 | "inputs": [ 87 | { 88 | "id": "inPort", 89 | "app_data": { 90 | "ui_data": { 91 | "cardinality": { 92 | "min": 0, 93 | "max": 1 94 | }, 95 | "label": "Input Port" 96 | } 97 | } 98 | } 99 | ], 100 | "outputs": [ 101 | { 102 | "id": "outPort", 103 | "app_data": { 104 | "ui_data": { 105 | "cardinality": { 106 | "min": 0, 107 | "max": -1 108 | }, 109 | "label": "Output Port" 110 | } 111 | } 112 | } 113 | ] 114 | }, 115 | { 116 | "id": "67d35156-a7ba-4339-9975-d5eac9be5b1e", 117 | "type": "execution_node", 118 | "op": "execute-notebook-node", 119 | "app_data": { 120 | "filename": 
"../notebooks/process_flight_data.ipynb", 121 | "runtime_image": "amancevice/pandas:1.1.1", 122 | "env_vars": [], 123 | "include_subdirectories": false, 124 | "invalidNodeError": null, 125 | "outputs": [ 126 | "data/jfk_flight_features.csv" 127 | ], 128 | "ui_data": { 129 | "label": "process_flight_data", 130 | "image": "data:image/svg+xml;utf8,%3Csvg%20xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg%22%20width%3D%2216%22%20viewBox%3D%220%200%2022%2022%22%3E%0A%20%20%3Cg%20class%3D%22jp-icon-warn0%20jp-icon-selectable%22%20fill%3D%22%23EF6C00%22%3E%0A%20%20%20%20%3Cpath%20d%3D%22M18.7%203.3v15.4H3.3V3.3h15.4m1.5-1.5H1.8v18.3h18.3l.1-18.3z%22%2F%3E%0A%20%20%20%20%3Cpath%20d%3D%22M16.5%2016.5l-5.4-4.3-5.6%204.3v-11h11z%22%2F%3E%0A%20%20%3C%2Fg%3E%0A%3C%2Fsvg%3E%0A", 131 | "x_pos": 344, 132 | "y_pos": 97, 133 | "description": "Notebook file" 134 | } 135 | }, 136 | "inputs": [ 137 | { 138 | "id": "inPort", 139 | "app_data": { 140 | "ui_data": { 141 | "cardinality": { 142 | "min": 0, 143 | "max": 1 144 | }, 145 | "label": "Input Port" 146 | } 147 | }, 148 | "links": [ 149 | { 150 | "id": "eff34dc4-552d-49d1-86ab-ad774c09ed9c", 151 | "node_id_ref": "2f3f6243-82ef-43ee-af09-9888a5dfbc30", 152 | "port_id_ref": "outPort" 153 | } 154 | ] 155 | } 156 | ], 157 | "outputs": [ 158 | { 159 | "id": "outPort", 160 | "app_data": { 161 | "ui_data": { 162 | "cardinality": { 163 | "min": 0, 164 | "max": -1 165 | }, 166 | "label": "Output Port" 167 | } 168 | } 169 | } 170 | ] 171 | }, 172 | { 173 | "id": "71257647-2fc5-4db8-95ef-5813bc386f95", 174 | "type": "execution_node", 175 | "op": "execute-notebook-node", 176 | "app_data": { 177 | "filename": "../notebooks/process_weather_data.ipynb", 178 | "runtime_image": "amancevice/pandas:1.1.1", 179 | "env_vars": [], 180 | "include_subdirectories": false, 181 | "invalidNodeError": null, 182 | "outputs": [ 183 | "data/jfk_weather_features.csv" 184 | ], 185 | "ui_data": { 186 | "label": "process_weather_data", 187 | "image": "data:image/svg+xml;utf8,%3Csvg%20xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg%22%20width%3D%2216%22%20viewBox%3D%220%200%2022%2022%22%3E%0A%20%20%3Cg%20class%3D%22jp-icon-warn0%20jp-icon-selectable%22%20fill%3D%22%23EF6C00%22%3E%0A%20%20%20%20%3Cpath%20d%3D%22M18.7%203.3v15.4H3.3V3.3h15.4m1.5-1.5H1.8v18.3h18.3l.1-18.3z%22%2F%3E%0A%20%20%20%20%3Cpath%20d%3D%22M16.5%2016.5l-5.4-4.3-5.6%204.3v-11h11z%22%2F%3E%0A%20%20%3C%2Fg%3E%0A%3C%2Fsvg%3E%0A", 188 | "x_pos": 342, 189 | "y_pos": 240, 190 | "description": "Notebook file" 191 | } 192 | }, 193 | "inputs": [ 194 | { 195 | "id": "inPort", 196 | "app_data": { 197 | "ui_data": { 198 | "cardinality": { 199 | "min": 0, 200 | "max": 1 201 | }, 202 | "label": "Input Port" 203 | } 204 | }, 205 | "links": [ 206 | { 207 | "id": "0a85bfc7-3bf3-4885-9026-2bd9fa30b729", 208 | "node_id_ref": "b614bf55-c127-413a-935a-175d7afa7f0e", 209 | "port_id_ref": "outPort" 210 | } 211 | ] 212 | } 213 | ], 214 | "outputs": [ 215 | { 216 | "id": "outPort", 217 | "app_data": { 218 | "ui_data": { 219 | "cardinality": { 220 | "min": 0, 221 | "max": -1 222 | }, 223 | "label": "Output Port" 224 | } 225 | } 226 | } 227 | ] 228 | }, 229 | { 230 | "id": "d70363d7-8c8f-45d2-b539-746fd2a4e14b", 231 | "type": "execution_node", 232 | "op": "execute-notebook-node", 233 | "app_data": { 234 | "filename": "../notebooks/merge_data.ipynb", 235 | "runtime_image": "amancevice/pandas:1.1.1", 236 | "env_vars": [], 237 | "include_subdirectories": false, 238 | "invalidNodeError": null, 239 | "outputs": [ 240 | "data/jfk_flight_weather_features.csv" 
241 | ], 242 | "ui_data": { 243 | "label": "merge_data", 244 | "image": "data:image/svg+xml;utf8,%3Csvg%20xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg%22%20width%3D%2216%22%20viewBox%3D%220%200%2022%2022%22%3E%0A%20%20%3Cg%20class%3D%22jp-icon-warn0%20jp-icon-selectable%22%20fill%3D%22%23EF6C00%22%3E%0A%20%20%20%20%3Cpath%20d%3D%22M18.7%203.3v15.4H3.3V3.3h15.4m1.5-1.5H1.8v18.3h18.3l.1-18.3z%22%2F%3E%0A%20%20%20%20%3Cpath%20d%3D%22M16.5%2016.5l-5.4-4.3-5.6%204.3v-11h11z%22%2F%3E%0A%20%20%3C%2Fg%3E%0A%3C%2Fsvg%3E%0A", 245 | "x_pos": 623, 246 | "y_pos": 166, 247 | "description": "Notebook file" 248 | } 249 | }, 250 | "inputs": [ 251 | { 252 | "id": "inPort", 253 | "app_data": { 254 | "ui_data": { 255 | "cardinality": { 256 | "min": 0, 257 | "max": 2 258 | }, 259 | "label": "Input Port" 260 | } 261 | }, 262 | "links": [ 263 | { 264 | "id": "8d964ba9-5fd5-455d-83ef-0d64ecd05c56", 265 | "node_id_ref": "67d35156-a7ba-4339-9975-d5eac9be5b1e", 266 | "port_id_ref": "outPort" 267 | }, 268 | { 269 | "id": "abfb2d60-15ea-4b54-ba87-5eb82aca5204", 270 | "node_id_ref": "71257647-2fc5-4db8-95ef-5813bc386f95", 271 | "port_id_ref": "outPort" 272 | } 273 | ] 274 | } 275 | ], 276 | "outputs": [ 277 | { 278 | "id": "outPort", 279 | "app_data": { 280 | "ui_data": { 281 | "cardinality": { 282 | "min": 0, 283 | "max": -1 284 | }, 285 | "label": "Output Port" 286 | } 287 | } 288 | } 289 | ] 290 | }, 291 | { 292 | "id": "1d8baea2-b4a8-4a64-b14b-c0ef0c234f01", 293 | "type": "execution_node", 294 | "op": "execute-notebook-node", 295 | "app_data": { 296 | "filename": "../notebooks/analyze_flight_delays.ipynb", 297 | "runtime_image": "amancevice/pandas:1.1.1", 298 | "env_vars": [], 299 | "include_subdirectories": false, 300 | "invalidNodeError": null, 301 | "ui_data": { 302 | "label": "analyze_flight_delays", 303 | "image": "data:image/svg+xml;utf8,%3Csvg%20xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg%22%20width%3D%2216%22%20viewBox%3D%220%200%2022%2022%22%3E%0A%20%20%3Cg%20class%3D%22jp-icon-warn0%20jp-icon-selectable%22%20fill%3D%22%23EF6C00%22%3E%0A%20%20%20%20%3Cpath%20d%3D%22M18.7%203.3v15.4H3.3V3.3h15.4m1.5-1.5H1.8v18.3h18.3l.1-18.3z%22%2F%3E%0A%20%20%20%20%3Cpath%20d%3D%22M16.5%2016.5l-5.4-4.3-5.6%204.3v-11h11z%22%2F%3E%0A%20%20%3C%2Fg%3E%0A%3C%2Fsvg%3E%0A", 304 | "x_pos": 943, 305 | "y_pos": 94, 306 | "description": "Notebook file" 307 | } 308 | }, 309 | "inputs": [ 310 | { 311 | "id": "inPort", 312 | "app_data": { 313 | "ui_data": { 314 | "cardinality": { 315 | "min": 0, 316 | "max": 1 317 | }, 318 | "label": "Input Port" 319 | } 320 | }, 321 | "links": [ 322 | { 323 | "id": "809a9b07-21ec-4d30-b607-f0fab979790b", 324 | "node_id_ref": "d70363d7-8c8f-45d2-b539-746fd2a4e14b", 325 | "port_id_ref": "outPort" 326 | } 327 | ] 328 | } 329 | ], 330 | "outputs": [ 331 | { 332 | "id": "outPort", 333 | "app_data": { 334 | "ui_data": { 335 | "cardinality": { 336 | "min": 0, 337 | "max": -1 338 | }, 339 | "label": "Output Port" 340 | } 341 | } 342 | } 343 | ] 344 | }, 345 | { 346 | "id": "c40e2c40-3f4c-4dbb-8e78-3f20f0de116f", 347 | "type": "execution_node", 348 | "op": "execute-notebook-node", 349 | "app_data": { 350 | "filename": "../notebooks/predict_flight_delays.ipynb", 351 | "runtime_image": "amancevice/pandas:1.1.1", 352 | "env_vars": [], 353 | "include_subdirectories": false, 354 | "invalidNodeError": null, 355 | "outputs": [], 356 | "ui_data": { 357 | "label": "predict_flight_delays", 358 | "image": 
"data:image/svg+xml;utf8,%3Csvg%20xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg%22%20width%3D%2216%22%20viewBox%3D%220%200%2022%2022%22%3E%0A%20%20%3Cg%20class%3D%22jp-icon-warn0%20jp-icon-selectable%22%20fill%3D%22%23EF6C00%22%3E%0A%20%20%20%20%3Cpath%20d%3D%22M18.7%203.3v15.4H3.3V3.3h15.4m1.5-1.5H1.8v18.3h18.3l.1-18.3z%22%2F%3E%0A%20%20%20%20%3Cpath%20d%3D%22M16.5%2016.5l-5.4-4.3-5.6%204.3v-11h11z%22%2F%3E%0A%20%20%3C%2Fg%3E%0A%3C%2Fsvg%3E%0A", 359 | "x_pos": 944.013916015625, 360 | "y_pos": 251.00418090820312, 361 | "description": "Notebook file" 362 | } 363 | }, 364 | "inputs": [ 365 | { 366 | "id": "inPort", 367 | "app_data": { 368 | "ui_data": { 369 | "cardinality": { 370 | "min": 0, 371 | "max": 1 372 | }, 373 | "label": "Input Port" 374 | } 375 | }, 376 | "links": [ 377 | { 378 | "id": "17ab35cb-a52f-4c37-984e-e33bf7b67d36", 379 | "node_id_ref": "d70363d7-8c8f-45d2-b539-746fd2a4e14b", 380 | "port_id_ref": "outPort" 381 | } 382 | ] 383 | } 384 | ], 385 | "outputs": [ 386 | { 387 | "id": "outPort", 388 | "app_data": { 389 | "ui_data": { 390 | "cardinality": { 391 | "min": 0, 392 | "max": -1 393 | }, 394 | "label": "Output Port" 395 | } 396 | } 397 | } 398 | ] 399 | } 400 | ], 401 | "app_data": { 402 | "ui_data": { 403 | "comments": [ 404 | { 405 | "id": "e5c417b4-9be7-4244-8597-21e0c0e00a70", 406 | "x_pos": 28, 407 | "y_pos": 17, 408 | "width": 175, 409 | "height": 42, 410 | "class_name": "d3-comment-rect", 411 | "content": "Load flight delay data", 412 | "associated_id_refs": [ 413 | { 414 | "node_ref": "2f3f6243-82ef-43ee-af09-9888a5dfbc30" 415 | } 416 | ] 417 | }, 418 | { 419 | "id": "c1e71c34-9ae7-4a85-9251-91c554034cc2", 420 | "x_pos": 30, 421 | "y_pos": 338, 422 | "width": 175, 423 | "height": 42, 424 | "class_name": "d3-comment-rect", 425 | "content": "Load JFK weather data", 426 | "associated_id_refs": [ 427 | { 428 | "node_ref": "b614bf55-c127-413a-935a-175d7afa7f0e" 429 | } 430 | ] 431 | }, 432 | { 433 | "id": "9cd374ba-b6ee-47a3-b963-4f164621d78b", 434 | "x_pos": 292, 435 | "y_pos": 15, 436 | "width": 175, 437 | "height": 42, 438 | "class_name": "d3-comment-rect", 439 | "content": "Clean up & pre-process flight delay data", 440 | "associated_id_refs": [ 441 | { 442 | "node_ref": "67d35156-a7ba-4339-9975-d5eac9be5b1e" 443 | } 444 | ] 445 | }, 446 | { 447 | "id": "44d53e47-c149-4b69-ad6e-259dcd8f8b9f", 448 | "x_pos": 308, 449 | "y_pos": 334, 450 | "width": 175, 451 | "height": 42, 452 | "class_name": "d3-comment-rect", 453 | "content": "Clean up & pre-process weather data", 454 | "associated_id_refs": [ 455 | { 456 | "node_ref": "71257647-2fc5-4db8-95ef-5813bc386f95" 457 | } 458 | ] 459 | }, 460 | { 461 | "id": "b12e0c12-9aa1-4c77-b6d5-0f02f7c64807", 462 | "x_pos": 579, 463 | "y_pos": 29, 464 | "width": 243, 465 | "height": 64, 466 | "class_name": "d3-comment-rect", 467 | "content": "Combine flight delay & weather dataset for downstream analytics & prediction tasks", 468 | "associated_id_refs": [ 469 | { 470 | "node_ref": "d70363d7-8c8f-45d2-b539-746fd2a4e14b" 471 | } 472 | ] 473 | }, 474 | { 475 | "id": "5d7aa386-549a-4c76-8f58-7ead3658fd7f", 476 | "x_pos": 908, 477 | "y_pos": 20, 478 | "width": 175, 479 | "height": 42, 480 | "class_name": "d3-comment-rect", 481 | "content": "Analyze & visualize flight delay & weather data", 482 | "associated_id_refs": [ 483 | { 484 | "node_ref": "1d8baea2-b4a8-4a64-b14b-c0ef0c234f01" 485 | } 486 | ] 487 | }, 488 | { 489 | "id": "bf560da3-5a43-4671-84ea-d98d562e1ec3", 490 | "x_pos": 860, 491 | "y_pos": 357, 492 | "width": 230, 493 | 
"height": 52, 494 | "class_name": "d3-comment-rect", 495 | "content": "Train & evaluate machine learning models to predict flight delays", 496 | "associated_id_refs": [ 497 | { 498 | "node_ref": "c40e2c40-3f4c-4dbb-8e78-3f20f0de116f" 499 | } 500 | ] 501 | } 502 | ] 503 | }, 504 | "version": 3 505 | }, 506 | "runtime_ref": "" 507 | } 508 | ], 509 | "schemas": [] 510 | } -------------------------------------------------------------------------------- /pipelines/flight_delays_with_deployment.pipeline: -------------------------------------------------------------------------------- 1 | { 2 | "doc_type": "pipeline", 3 | "version": "3.0", 4 | "json_schema": "http://api.dataplatform.ibm.com/schemas/common-pipeline/pipeline-flow/pipeline-flow-v3-schema.json", 5 | "id": "d8fa85b7-04fd-467d-a45d-e127e1eccfe8", 6 | "primary_pipeline": "07d7f720-4cde-4c57-a4ee-e99c68f307b1", 7 | "pipelines": [ 8 | { 9 | "id": "07d7f720-4cde-4c57-a4ee-e99c68f307b1", 10 | "nodes": [ 11 | { 12 | "id": "2f3f6243-82ef-43ee-af09-9888a5dfbc30", 13 | "type": "execution_node", 14 | "op": "execute-python-node", 15 | "app_data": { 16 | "filename": "../notebooks/load_data.py", 17 | "runtime_image": "amancevice/pandas:1.1.1", 18 | "env_vars": [ 19 | "DATASET_URL=https://dax-cdn.cdn.appdomain.cloud/dax-airline/1.0.1/airline_2m.tar.gz" 20 | ], 21 | "include_subdirectories": false, 22 | "invalidNodeError": null, 23 | "outputs": [ 24 | "data/airline_2m.csv" 25 | ], 26 | "ui_data": { 27 | "label": "load_data", 28 | "image": "data:image/svg+xml;utf8,%3Csvg%20xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg%22%20width%3D%2216%22%20viewBox%3D%220%200%2022%2022%22%3E%0A%20%20%3Cg%20class%3D%22jp-icon-brand0%20jp-icon-selectable%22%20fill%3D%22%230D47A1%22%3E%0A%20%20%20%20%3Cpath%20d%3D%22M11.1%206.9V5.8H6.9c0-.5%200-1.3.2-1.6.4-.7.8-1.1%201.7-1.4%201.7-.3%202.5-.3%203.9-.1%201%20.1%201.9.9%201.9%201.9v4.2c0%20.5-.9%201.6-2%201.6H8.8c-1.5%200-2.4%201.4-2.4%202.8v2.2H4.7C3.5%2015.1%203%2014%203%2013.1V9c-.1-1%20.6-2%201.8-2%201.5-.1%206.3-.1%206.3-.1z%22%2F%3E%0A%20%20%20%20%3Cpath%20d%3D%22M10.9%2015.1v1.1h4.2c0%20.5%200%201.3-.2%201.6-.4.7-.8%201.1-1.7%201.4-1.7.3-2.5.3-3.9.1-1-.1-1.9-.9-1.9-1.9v-4.2c0-.5.9-1.6%202-1.6h3.8c1.5%200%202.4-1.4%202.4-2.8V6.6h1.7C18.5%206.9%2019%208%2019%208.9V13c0%201-.7%202.1-1.9%202.1h-6.2z%22%2F%3E%0A%20%20%3C%2Fg%3E%0A%3C%2Fsvg%3E%0A", 29 | "x_pos": 72.16775512695312, 30 | "y_pos": 91.4539566040039, 31 | "description": "Python Script" 32 | } 33 | }, 34 | "inputs": [ 35 | { 36 | "id": "inPort", 37 | "app_data": { 38 | "ui_data": { 39 | "cardinality": { 40 | "min": 0, 41 | "max": 1 42 | }, 43 | "label": "Input Port" 44 | } 45 | } 46 | } 47 | ], 48 | "outputs": [ 49 | { 50 | "id": "outPort", 51 | "app_data": { 52 | "ui_data": { 53 | "cardinality": { 54 | "min": 0, 55 | "max": -1 56 | }, 57 | "label": "Output Port" 58 | } 59 | } 60 | } 61 | ] 62 | }, 63 | { 64 | "id": "b614bf55-c127-413a-935a-175d7afa7f0e", 65 | "type": "execution_node", 66 | "op": "execute-python-node", 67 | "app_data": { 68 | "filename": "../notebooks/load_data.py", 69 | "runtime_image": "amancevice/pandas:1.1.1", 70 | "env_vars": [ 71 | "DATASET_URL=https://dax-cdn.cdn.appdomain.cloud/dax-noaa-weather-data-jfk-airport/1.1.4/noaa-weather-data-jfk-airport.tar.gz" 72 | ], 73 | "include_subdirectories": false, 74 | "invalidNodeError": null, 75 | "outputs": [ 76 | "data/noaa-weather-data-jfk-airport/jfk_weather.csv" 77 | ], 78 | "ui_data": { 79 | "label": "load_data", 80 | "image": 
"data:image/svg+xml;utf8,%3Csvg%20xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg%22%20width%3D%2216%22%20viewBox%3D%220%200%2022%2022%22%3E%0A%20%20%3Cg%20class%3D%22jp-icon-brand0%20jp-icon-selectable%22%20fill%3D%22%230D47A1%22%3E%0A%20%20%20%20%3Cpath%20d%3D%22M11.1%206.9V5.8H6.9c0-.5%200-1.3.2-1.6.4-.7.8-1.1%201.7-1.4%201.7-.3%202.5-.3%203.9-.1%201%20.1%201.9.9%201.9%201.9v4.2c0%20.5-.9%201.6-2%201.6H8.8c-1.5%200-2.4%201.4-2.4%202.8v2.2H4.7C3.5%2015.1%203%2014%203%2013.1V9c-.1-1%20.6-2%201.8-2%201.5-.1%206.3-.1%206.3-.1z%22%2F%3E%0A%20%20%20%20%3Cpath%20d%3D%22M10.9%2015.1v1.1h4.2c0%20.5%200%201.3-.2%201.6-.4.7-.8%201.1-1.7%201.4-1.7.3-2.5.3-3.9.1-1-.1-1.9-.9-1.9-1.9v-4.2c0-.5.9-1.6%202-1.6h3.8c1.5%200%202.4-1.4%202.4-2.8V6.6h1.7C18.5%206.9%2019%208%2019%208.9V13c0%201-.7%202.1-1.9%202.1h-6.2z%22%2F%3E%0A%20%20%3C%2Fg%3E%0A%3C%2Fsvg%3E%0A", 81 | "x_pos": 65, 82 | "y_pos": 241, 83 | "description": "Python Script" 84 | } 85 | }, 86 | "inputs": [ 87 | { 88 | "id": "inPort", 89 | "app_data": { 90 | "ui_data": { 91 | "cardinality": { 92 | "min": 0, 93 | "max": 1 94 | }, 95 | "label": "Input Port" 96 | } 97 | } 98 | } 99 | ], 100 | "outputs": [ 101 | { 102 | "id": "outPort", 103 | "app_data": { 104 | "ui_data": { 105 | "cardinality": { 106 | "min": 0, 107 | "max": -1 108 | }, 109 | "label": "Output Port" 110 | } 111 | } 112 | } 113 | ] 114 | }, 115 | { 116 | "id": "67d35156-a7ba-4339-9975-d5eac9be5b1e", 117 | "type": "execution_node", 118 | "op": "execute-notebook-node", 119 | "app_data": { 120 | "filename": "../notebooks/process_flight_data.ipynb", 121 | "runtime_image": "amancevice/pandas:1.1.1", 122 | "env_vars": [], 123 | "include_subdirectories": false, 124 | "invalidNodeError": null, 125 | "outputs": [ 126 | "data/jfk_flight_features.csv" 127 | ], 128 | "ui_data": { 129 | "label": "process_flight_data", 130 | "image": "data:image/svg+xml;utf8,%3Csvg%20xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg%22%20width%3D%2216%22%20viewBox%3D%220%200%2022%2022%22%3E%0A%20%20%3Cg%20class%3D%22jp-icon-warn0%20jp-icon-selectable%22%20fill%3D%22%23EF6C00%22%3E%0A%20%20%20%20%3Cpath%20d%3D%22M18.7%203.3v15.4H3.3V3.3h15.4m1.5-1.5H1.8v18.3h18.3l.1-18.3z%22%2F%3E%0A%20%20%20%20%3Cpath%20d%3D%22M16.5%2016.5l-5.4-4.3-5.6%204.3v-11h11z%22%2F%3E%0A%20%20%3C%2Fg%3E%0A%3C%2Fsvg%3E%0A", 131 | "x_pos": 343, 132 | "y_pos": 97, 133 | "description": "Notebook file" 134 | } 135 | }, 136 | "inputs": [ 137 | { 138 | "id": "inPort", 139 | "app_data": { 140 | "ui_data": { 141 | "cardinality": { 142 | "min": 0, 143 | "max": 1 144 | }, 145 | "label": "Input Port" 146 | } 147 | }, 148 | "links": [ 149 | { 150 | "id": "eff34dc4-552d-49d1-86ab-ad774c09ed9c", 151 | "node_id_ref": "2f3f6243-82ef-43ee-af09-9888a5dfbc30", 152 | "port_id_ref": "outPort" 153 | } 154 | ] 155 | } 156 | ], 157 | "outputs": [ 158 | { 159 | "id": "outPort", 160 | "app_data": { 161 | "ui_data": { 162 | "cardinality": { 163 | "min": 0, 164 | "max": -1 165 | }, 166 | "label": "Output Port" 167 | } 168 | } 169 | } 170 | ] 171 | }, 172 | { 173 | "id": "71257647-2fc5-4db8-95ef-5813bc386f95", 174 | "type": "execution_node", 175 | "op": "execute-notebook-node", 176 | "app_data": { 177 | "filename": "../notebooks/process_weather_data.ipynb", 178 | "runtime_image": "amancevice/pandas:1.1.1", 179 | "env_vars": [], 180 | "include_subdirectories": false, 181 | "invalidNodeError": null, 182 | "outputs": [ 183 | "data/jfk_weather_features.csv" 184 | ], 185 | "ui_data": { 186 | "label": "process_weather_data", 187 | "image": 
"data:image/svg+xml;utf8,%3Csvg%20xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg%22%20width%3D%2216%22%20viewBox%3D%220%200%2022%2022%22%3E%0A%20%20%3Cg%20class%3D%22jp-icon-warn0%20jp-icon-selectable%22%20fill%3D%22%23EF6C00%22%3E%0A%20%20%20%20%3Cpath%20d%3D%22M18.7%203.3v15.4H3.3V3.3h15.4m1.5-1.5H1.8v18.3h18.3l.1-18.3z%22%2F%3E%0A%20%20%20%20%3Cpath%20d%3D%22M16.5%2016.5l-5.4-4.3-5.6%204.3v-11h11z%22%2F%3E%0A%20%20%3C%2Fg%3E%0A%3C%2Fsvg%3E%0A", 188 | "x_pos": 340, 189 | "y_pos": 240, 190 | "description": "Notebook file" 191 | } 192 | }, 193 | "inputs": [ 194 | { 195 | "id": "inPort", 196 | "app_data": { 197 | "ui_data": { 198 | "cardinality": { 199 | "min": 0, 200 | "max": 1 201 | }, 202 | "label": "Input Port" 203 | } 204 | }, 205 | "links": [ 206 | { 207 | "id": "0a85bfc7-3bf3-4885-9026-2bd9fa30b729", 208 | "node_id_ref": "b614bf55-c127-413a-935a-175d7afa7f0e", 209 | "port_id_ref": "outPort" 210 | } 211 | ] 212 | } 213 | ], 214 | "outputs": [ 215 | { 216 | "id": "outPort", 217 | "app_data": { 218 | "ui_data": { 219 | "cardinality": { 220 | "min": 0, 221 | "max": -1 222 | }, 223 | "label": "Output Port" 224 | } 225 | } 226 | } 227 | ] 228 | }, 229 | { 230 | "id": "d70363d7-8c8f-45d2-b539-746fd2a4e14b", 231 | "type": "execution_node", 232 | "op": "execute-notebook-node", 233 | "app_data": { 234 | "filename": "../notebooks/merge_data.ipynb", 235 | "runtime_image": "amancevice/pandas:1.1.1", 236 | "env_vars": [], 237 | "include_subdirectories": false, 238 | "invalidNodeError": null, 239 | "outputs": [ 240 | "data/jfk_flight_weather_features.csv" 241 | ], 242 | "ui_data": { 243 | "label": "merge_data", 244 | "image": "data:image/svg+xml;utf8,%3Csvg%20xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg%22%20width%3D%2216%22%20viewBox%3D%220%200%2022%2022%22%3E%0A%20%20%3Cg%20class%3D%22jp-icon-warn0%20jp-icon-selectable%22%20fill%3D%22%23EF6C00%22%3E%0A%20%20%20%20%3Cpath%20d%3D%22M18.7%203.3v15.4H3.3V3.3h15.4m1.5-1.5H1.8v18.3h18.3l.1-18.3z%22%2F%3E%0A%20%20%20%20%3Cpath%20d%3D%22M16.5%2016.5l-5.4-4.3-5.6%204.3v-11h11z%22%2F%3E%0A%20%20%3C%2Fg%3E%0A%3C%2Fsvg%3E%0A", 245 | "x_pos": 623, 246 | "y_pos": 166, 247 | "description": "Notebook file" 248 | } 249 | }, 250 | "inputs": [ 251 | { 252 | "id": "inPort", 253 | "app_data": { 254 | "ui_data": { 255 | "cardinality": { 256 | "min": 0, 257 | "max": 2 258 | }, 259 | "label": "Input Port" 260 | } 261 | }, 262 | "links": [ 263 | { 264 | "id": "8d964ba9-5fd5-455d-83ef-0d64ecd05c56", 265 | "node_id_ref": "67d35156-a7ba-4339-9975-d5eac9be5b1e", 266 | "port_id_ref": "outPort" 267 | }, 268 | { 269 | "id": "abfb2d60-15ea-4b54-ba87-5eb82aca5204", 270 | "node_id_ref": "71257647-2fc5-4db8-95ef-5813bc386f95", 271 | "port_id_ref": "outPort" 272 | } 273 | ] 274 | } 275 | ], 276 | "outputs": [ 277 | { 278 | "id": "outPort", 279 | "app_data": { 280 | "ui_data": { 281 | "cardinality": { 282 | "min": 0, 283 | "max": -1 284 | }, 285 | "label": "Output Port" 286 | } 287 | } 288 | } 289 | ] 290 | }, 291 | { 292 | "id": "1d8baea2-b4a8-4a64-b14b-c0ef0c234f01", 293 | "type": "execution_node", 294 | "op": "execute-notebook-node", 295 | "app_data": { 296 | "filename": "../notebooks/analyze_flight_delays.ipynb", 297 | "runtime_image": "amancevice/pandas:1.1.1", 298 | "env_vars": [], 299 | "include_subdirectories": false, 300 | "invalidNodeError": null, 301 | "ui_data": { 302 | "label": "analyze_flight_delays", 303 | "image": 
"data:image/svg+xml;utf8,%3Csvg%20xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg%22%20width%3D%2216%22%20viewBox%3D%220%200%2022%2022%22%3E%0A%20%20%3Cg%20class%3D%22jp-icon-warn0%20jp-icon-selectable%22%20fill%3D%22%23EF6C00%22%3E%0A%20%20%20%20%3Cpath%20d%3D%22M18.7%203.3v15.4H3.3V3.3h15.4m1.5-1.5H1.8v18.3h18.3l.1-18.3z%22%2F%3E%0A%20%20%20%20%3Cpath%20d%3D%22M16.5%2016.5l-5.4-4.3-5.6%204.3v-11h11z%22%2F%3E%0A%20%20%3C%2Fg%3E%0A%3C%2Fsvg%3E%0A", 304 | "x_pos": 943, 305 | "y_pos": 94, 306 | "description": "Notebook file" 307 | } 308 | }, 309 | "inputs": [ 310 | { 311 | "id": "inPort", 312 | "app_data": { 313 | "ui_data": { 314 | "cardinality": { 315 | "min": 0, 316 | "max": 1 317 | }, 318 | "label": "Input Port" 319 | } 320 | }, 321 | "links": [ 322 | { 323 | "id": "809a9b07-21ec-4d30-b607-f0fab979790b", 324 | "node_id_ref": "d70363d7-8c8f-45d2-b539-746fd2a4e14b", 325 | "port_id_ref": "outPort" 326 | } 327 | ] 328 | } 329 | ], 330 | "outputs": [ 331 | { 332 | "id": "outPort", 333 | "app_data": { 334 | "ui_data": { 335 | "cardinality": { 336 | "min": 0, 337 | "max": -1 338 | }, 339 | "label": "Output Port" 340 | } 341 | } 342 | } 343 | ] 344 | }, 345 | { 346 | "id": "c40e2c40-3f4c-4dbb-8e78-3f20f0de116f", 347 | "type": "execution_node", 348 | "op": "execute-notebook-node", 349 | "app_data": { 350 | "filename": "../notebooks/predict_flight_delays.ipynb", 351 | "runtime_image": "amancevice/pandas:1.1.1", 352 | "env_vars": [], 353 | "include_subdirectories": false, 354 | "invalidNodeError": null, 355 | "outputs": [ 356 | "models/model.joblib", 357 | "data/test_rows.npy" 358 | ], 359 | "ui_data": { 360 | "label": "predict_flight_delays", 361 | "image": "data:image/svg+xml;utf8,%3Csvg%20xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg%22%20width%3D%2216%22%20viewBox%3D%220%200%2022%2022%22%3E%0A%20%20%3Cg%20class%3D%22jp-icon-warn0%20jp-icon-selectable%22%20fill%3D%22%23EF6C00%22%3E%0A%20%20%20%20%3Cpath%20d%3D%22M18.7%203.3v15.4H3.3V3.3h15.4m1.5-1.5H1.8v18.3h18.3l.1-18.3z%22%2F%3E%0A%20%20%20%20%3Cpath%20d%3D%22M16.5%2016.5l-5.4-4.3-5.6%204.3v-11h11z%22%2F%3E%0A%20%20%3C%2Fg%3E%0A%3C%2Fsvg%3E%0A", 362 | "x_pos": 942.013916015625, 363 | "y_pos": 251.00418090820312, 364 | "description": "Notebook file" 365 | } 366 | }, 367 | "inputs": [ 368 | { 369 | "id": "inPort", 370 | "app_data": { 371 | "ui_data": { 372 | "cardinality": { 373 | "min": 0, 374 | "max": 1 375 | }, 376 | "label": "Input Port" 377 | } 378 | }, 379 | "links": [ 380 | { 381 | "id": "17ab35cb-a52f-4c37-984e-e33bf7b67d36", 382 | "node_id_ref": "d70363d7-8c8f-45d2-b539-746fd2a4e14b", 383 | "port_id_ref": "outPort" 384 | } 385 | ] 386 | } 387 | ], 388 | "outputs": [ 389 | { 390 | "id": "outPort", 391 | "app_data": { 392 | "ui_data": { 393 | "cardinality": { 394 | "min": 0, 395 | "max": -1 396 | }, 397 | "label": "Output Port" 398 | } 399 | } 400 | } 401 | ] 402 | }, 403 | { 404 | "id": "7a4f1f66-4930-4fa4-b5da-293801b3cea6", 405 | "type": "execution_node", 406 | "op": "execute-notebook-node", 407 | "app_data": { 408 | "filename": "../notebooks/deploy_model.ipynb", 409 | "runtime_image": "amancevice/pandas:1.1.1", 410 | "env_vars": [ 411 | "OS_URL=minio-service:9000", 412 | "ACCESS_KEY_ID=minio", 413 | "SECRET_ACCESS_KEY=minio123", 414 | "MODEL_BUCKET=models", 415 | "MODEL_DIR=models", 416 | "MODEL_MODE=local" 417 | ], 418 | "include_subdirectories": false, 419 | "invalidNodeError": null, 420 | "ui_data": { 421 | "label": "deploy_model", 422 | "image": 
"data:image/svg+xml;utf8,%3Csvg%20xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg%22%20width%3D%2216%22%20viewBox%3D%220%200%2022%2022%22%3E%0A%20%20%3Cg%20class%3D%22jp-icon-warn0%20jp-icon-selectable%22%20fill%3D%22%23EF6C00%22%3E%0A%20%20%20%20%3Cpath%20d%3D%22M18.7%203.3v15.4H3.3V3.3h15.4m1.5-1.5H1.8v18.3h18.3l.1-18.3z%22%2F%3E%0A%20%20%20%20%3Cpath%20d%3D%22M16.5%2016.5l-5.4-4.3-5.6%204.3v-11h11z%22%2F%3E%0A%20%20%3C%2Fg%3E%0A%3C%2Fsvg%3E%0A", 423 | "x_pos": 1227, 424 | "y_pos": 171, 425 | "description": "Notebook file" 426 | } 427 | }, 428 | "inputs": [ 429 | { 430 | "id": "inPort", 431 | "app_data": { 432 | "ui_data": { 433 | "cardinality": { 434 | "min": 0, 435 | "max": -1 436 | }, 437 | "label": "Input Port" 438 | } 439 | }, 440 | "links": [ 441 | { 442 | "id": "207b171f-3de3-44fd-a4e1-accbb5fcc14b", 443 | "node_id_ref": "c40e2c40-3f4c-4dbb-8e78-3f20f0de116f", 444 | "port_id_ref": "outPort" 445 | } 446 | ] 447 | } 448 | ], 449 | "outputs": [ 450 | { 451 | "id": "outPort", 452 | "app_data": { 453 | "ui_data": { 454 | "cardinality": { 455 | "min": 0, 456 | "max": -1 457 | }, 458 | "label": "Output Port" 459 | } 460 | } 461 | } 462 | ] 463 | } 464 | ], 465 | "app_data": { 466 | "ui_data": { 467 | "comments": [ 468 | { 469 | "id": "e5c417b4-9be7-4244-8597-21e0c0e00a70", 470 | "x_pos": 28, 471 | "y_pos": 17, 472 | "width": 175, 473 | "height": 42, 474 | "class_name": "d3-comment-rect", 475 | "content": "Load flight delay data", 476 | "associated_id_refs": [ 477 | { 478 | "node_ref": "2f3f6243-82ef-43ee-af09-9888a5dfbc30" 479 | } 480 | ] 481 | }, 482 | { 483 | "id": "c1e71c34-9ae7-4a85-9251-91c554034cc2", 484 | "x_pos": 30, 485 | "y_pos": 338, 486 | "width": 175, 487 | "height": 42, 488 | "class_name": "d3-comment-rect", 489 | "content": "Load JFK weather data", 490 | "associated_id_refs": [ 491 | { 492 | "node_ref": "b614bf55-c127-413a-935a-175d7afa7f0e" 493 | } 494 | ] 495 | }, 496 | { 497 | "id": "9cd374ba-b6ee-47a3-b963-4f164621d78b", 498 | "x_pos": 292, 499 | "y_pos": 15, 500 | "width": 175, 501 | "height": 42, 502 | "class_name": "d3-comment-rect", 503 | "content": "Clean up & pre-process flight delay data", 504 | "associated_id_refs": [ 505 | { 506 | "node_ref": "67d35156-a7ba-4339-9975-d5eac9be5b1e" 507 | } 508 | ] 509 | }, 510 | { 511 | "id": "44d53e47-c149-4b69-ad6e-259dcd8f8b9f", 512 | "x_pos": 308, 513 | "y_pos": 334, 514 | "width": 175, 515 | "height": 42, 516 | "class_name": "d3-comment-rect", 517 | "content": "Clean up & pre-process weather data", 518 | "associated_id_refs": [ 519 | { 520 | "node_ref": "71257647-2fc5-4db8-95ef-5813bc386f95" 521 | } 522 | ] 523 | }, 524 | { 525 | "id": "b12e0c12-9aa1-4c77-b6d5-0f02f7c64807", 526 | "x_pos": 579, 527 | "y_pos": 29, 528 | "width": 243, 529 | "height": 64, 530 | "class_name": "d3-comment-rect", 531 | "content": "Combine flight delay & weather dataset for downstream analytics & prediction tasks", 532 | "associated_id_refs": [ 533 | { 534 | "node_ref": "d70363d7-8c8f-45d2-b539-746fd2a4e14b" 535 | } 536 | ] 537 | }, 538 | { 539 | "id": "5d7aa386-549a-4c76-8f58-7ead3658fd7f", 540 | "x_pos": 908, 541 | "y_pos": 20, 542 | "width": 175, 543 | "height": 42, 544 | "class_name": "d3-comment-rect", 545 | "content": "Analyze & visualize flight delay & weather data", 546 | "associated_id_refs": [ 547 | { 548 | "node_ref": "1d8baea2-b4a8-4a64-b14b-c0ef0c234f01" 549 | } 550 | ] 551 | }, 552 | { 553 | "id": "bf560da3-5a43-4671-84ea-d98d562e1ec3", 554 | "x_pos": 860, 555 | "y_pos": 357, 556 | "width": 230, 557 | "height": 52, 558 | 
"class_name": "d3-comment-rect", 559 | "content": "Train & evaluate machine learning models to predict flight delays", 560 | "associated_id_refs": [ 561 | { 562 | "node_ref": "c40e2c40-3f4c-4dbb-8e78-3f20f0de116f" 563 | } 564 | ] 565 | }, 566 | { 567 | "id": "ab7b35f6-20e6-4d63-b8e5-02a41fffa18f", 568 | "x_pos": 1163, 569 | "y_pos": 61, 570 | "width": 175, 571 | "height": 42, 572 | "class_name": "d3-comment-rect", 573 | "content": "Deploy the trained model to Kubeflow Serving", 574 | "associated_id_refs": [ 575 | { 576 | "node_ref": "7a4f1f66-4930-4fa4-b5da-293801b3cea6" 577 | } 578 | ] 579 | } 580 | ] 581 | }, 582 | "version": 3 583 | }, 584 | "runtime_ref": "" 585 | } 586 | ], 587 | "schemas": [] 588 | } --------------------------------------------------------------------------------