├── .gitignore ├── LICENSE ├── README.md ├── tutorials ├── cloud_fit_deploy │ ├── cloud-aws-lambda-deployment.ipynb │ ├── cloud-aws-sagemaker-deployment.ipynb │ └── cloud-aws-sagemaker-training.ipynb ├── image_prediction │ ├── beginner.ipynb │ ├── dataset.ipynb │ └── hpo.ipynb ├── multimodal │ ├── beginner_image_cls.ipynb │ ├── beginner_multimodal.ipynb │ ├── beginner_text.ipynb │ ├── customization.ipynb │ ├── multilingual_text.ipynb │ └── multimodal_text_tabular.ipynb ├── object_detection │ ├── beginner.ipynb │ └── dataset.ipynb ├── tabular_prediction │ ├── tabular-custom-metric.ipynb │ ├── tabular-custom-model-advanced.ipynb │ ├── tabular-custom-model.ipynb │ ├── tabular-faq.ipynb │ ├── tabular-feature-engineering.ipynb │ ├── tabular-gpu.ipynb │ ├── tabular-indepth.ipynb │ ├── tabular-interpretability.ipynb │ ├── tabular-kaggle.ipynb │ ├── tabular-multilabel.ipynb │ ├── tabular-multimodal-text-others.ipynb │ ├── tabular-multimodal.ipynb │ └── tabular-quickstart.ipynb ├── text_prediction │ ├── beginner.ipynb │ ├── customization.ipynb │ └── multilingual_text.ipynb └── timeseries │ ├── forecasting-faq.ipynb │ └── forecasting-quickstart.ipynb └── welcome-notebook.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # autogluon-tutorials 2 | Staged builds of awslabs/autogluon tutorials 3 | -------------------------------------------------------------------------------- /tutorials/cloud_fit_deploy/cloud-aws-lambda-deployment.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "f054a4ed", 6 | "metadata": {}, 7 | "source": [ 8 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gidler/autogluon-tutorials/blob/main/tutorials/cloud_fit_deploy/cloud-aws-lambda-deployment.ipynb)\n", 9 | "[![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/gidler/autogluon-tutorials/blob/main/tutorials/cloud_fit_deploy/cloud-aws-lambda-deployment.ipynb)\n" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "id": "2457902e", 16 | "outputs": [], 17 | "metadata": {}, 18 | "source": [ 19 | "# Uncomment the code below and run this cell if AutoGluon is not yet installed in the kernel.\n", 20 | "# !pip install autogluon==0.5.0 # These tutorials are based on AutoGluon v0.5.0 and might not work with different versions." 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "id": "1e4be297", 26 | "metadata": {}, 27 | "source": [ 28 | "# Deploying AutoGluon models with serverless templates\n", 29 | "\n", 30 | "\n", 31 | "\n", 32 | "After learning how to train a model using AWS SageMaker [tutorials/cloud_fit_deploy/cloud-aws-sagemaker-training.ipynb](https://github.com/gidler/autogluon-tutorials/blob/main/tutorials/cloud_fit_deploy/cloud-aws-sagemaker-training.ipynb), in this section we will learn how to deploy \n", 33 | "trained models using AWS Lambda.\n", 34 | "\n", 35 | "## Reducing the model size to minimize AWS Lambda startup times\n", 36 | "\n", 37 | "When the Lambda service receives a request to run a function via the Lambda API, the service first prepares an execution environment. During this step, the service \n", 38 | "downloads the code for the function, which is stored in Amazon Elastic Container Registry. It then creates an environment with the memory, runtime, and configuration \n", 39 | "specified. Once complete, Lambda runs any initialization code outside of the event handler before finally running the handler code. The steps of setting up the \n", 40 | "environment and the code are frequently referred to as a \"cold start\".\n", 41 | "\n", 42 | "After the execution completes, the execution environment is frozen. To improve resource management and performance, the Lambda service retains the execution environment \n", 43 | "for a non-deterministic period of time. During this time, if another request arrives for the same function, the service may reuse the environment. This second request \n", 44 | "typically finishes more quickly, since the execution environment already exists and it\u2019s not necessary to download the code and run the initialization code. \n", 45 | "This is called a \"warm start\".\n", 46 | "\n", 47 | "Because AutoGluon containers are larger than a typical Lambda container, it might take some time (60+ seconds) to perform the steps required for a \"cold start\". \n", 48 | "This could be a limiting factor when used with latency-sensitive applications. 
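One way to gauge how much of a cold start is spent inside AutoGluon is to time the model-loading step that the handler performs (a minimal sketch, assuming a model trained earlier has been copied into a local `ag_models` directory):\n", "\n", "```{.python}\n", "import time\n", "\n", "from autogluon.tabular import TabularPredictor\n", "\n", "start = time.time()\n", "predictor = TabularPredictor.load('ag_models')  # path is an assumption for this sketch\n", "predictor.persist_models(models='all')  # keep models in memory, as the serving script below does\n", "print(f'Model load took {time.time() - start:.1f}s')\n", "```\n", "\n", "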
To reduce start-up times with AWS Lambda, it is important to reduce the model size to a minimum. \n", 49 | "This can be done by applying deployment-optimized presets as described in section \"Faster presets or hyperparameters\" of [tutorials/tabular_prediction/tabular-indepth.ipynb](https://github.com/gidler/autogluon-tutorials/blob/main/tutorials/tabular_prediction/tabular-indepth.ipynb):" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "id": "dcf7482f", 55 | "metadata": {}, 56 | "source": [ 57 | "```{.python}\n", 58 | "presets = ['good_quality_faster_inference_only_refit', 'optimize_for_deployment']\n", 59 | "```\n" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "id": "48423de5", 65 | "metadata": {}, 66 | "source": [ 67 | "If the cold start latency cannot be tolerated, it is recommended to reserve concurrent capacity as described in this article:\n", 68 | "[Managing Lambda reserved concurrency](https://docs.aws.amazon.com/lambda/latest/dg/configuration-concurrency.html).\n", 69 | "\n", 70 | "More details on Lambda performance optimizations can be found in the following article: \n", 71 | "[Operating Lambda: Performance optimization](https://aws.amazon.com/blogs/compute/operating-lambda-performance-optimization-part-1/)\n", 72 | "\n", 73 | "## Creating a base project\n", 74 | "\n", 75 | "To start the project, please follow the setup steps of the tutorial: \n", 76 | "[Deploying machine learning models with serverless templates](https://aws.amazon.com/blogs/compute/deploying-machine-learning-models-with-serverless-templates/).\n", 77 | "\n", 78 | "To deploy AutoGluon, the following adjustments are required:\n", 79 | "\n", 80 | "- the trained model is expected to be in the `ag_models` directory\n", 81 | "\n", 82 | "- a `Dockerfile` to package AutoGluon runtimes and model files\n", 83 | "\n", 84 | "- a modified serving script (`app/app.py`) that uses AutoGluon\n", 85 | "\n", 86 | "When building a Docker container, its size can be reduced using the following optimizations: \n", 87 | "\n", 88 | "- use CPU versions of `pytorch`; if the models to be deployed don't use `pytorch`, then don't install it.\n", 89 | "\n", 90 | "- install only the AutoGluon sub-modules required for inference - specifically, `autogluon.tabular[all]` will install only the tabular models \n", 91 | "without the `text` and `vision` modules and their extra dependencies. 
This selection can be further narrowed down to a combination of \n", 92 | "the following options: `lightgbm`, `catboost`, `xgboost`, `fastai` and `skex`.\n", 93 | "\n", 94 | "The following `Dockerfile` can be used as a starting point:" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "id": "11094996", 100 | "metadata": {}, 101 | "source": [ 102 | "```\n", 103 | "FROM public.ecr.aws/lambda/python:3.8\n", 104 | "\n", 105 | "RUN yum install libgomp git -y \\\n", 106 | " && yum clean all -y && rm -rf /var/cache/yum\n", 107 | "\n", 108 | "ARG TORCH_VER=1.9.1+cpu\n", 109 | "ARG TORCH_VISION_VER=0.10.1+cpu\n", 110 | "ARG NUMPY_VER=1.19.5\n", 111 | "RUN python3.8 -m pip --no-cache-dir install --upgrade --trusted-host pypi.org --trusted-host files.pythonhosted.org pip \\\n", 112 | " && python3.8 -m pip --no-cache-dir install --upgrade wheel setuptools \\\n", 113 | " && python3.8 -m pip uninstall -y dataclasses \\\n", 114 | " && python3.8 -m pip --no-cache-dir install --upgrade torch==\"${TORCH_VER}\" torchvision==\"${TORCH_VISION_VER}\" -f https://download.pytorch.org/whl/torch_stable.html \\\n", 115 | " && python3.8 -m pip --no-cache-dir install --upgrade numpy==${NUMPY_VER} \\\n", 116 | " && python3.8 -m pip --no-cache-dir install --upgrade \"autogluon.tabular[all]\"\n", 117 | "\n", 118 | "COPY app.py ./\n", 119 | "COPY ag_models /opt/ml/model/\n", 120 | "\n", 121 | "CMD [\"app.lambda_handler\"]\n", 122 | "```\n" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "id": "2c0448c6", 128 | "metadata": {}, 129 | "source": [ 130 | "Lambda serving script (`app/app.py`):" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "id": "3f7a9037", 136 | "metadata": {}, 137 | "source": [ 138 | "```{.python}\n", 139 | "import pandas as pd\n", 140 | "from autogluon.tabular import TabularPredictor\n", 141 | "\n", 142 | "model = TabularPredictor.load('/opt/ml/model')\n", 143 | "model.persist_models(models='all')\n", 144 | "\n", 145 | "\n", 146 | "# Lambda handler code\n", 147 | "def lambda_handler(event, context):\n", 148 | " df = pd.read_json(event['body'])\n", 149 | " pred_probs = model.predict_proba(df)\n", 150 | " return {\n", 151 | " 'statusCode': 200,\n", 152 | " 'body': pred_probs.to_json()\n", 153 | " }\n", 154 | "```\n" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "id": "a43bacf1", 160 | "metadata": {}, 161 | "source": [ 162 | "Once the necessary modifications to the project are done, proceed with the steps described in the \"Deploying the application to Lambda\" section of the \n", 163 | "[tutorial](https://aws.amazon.com/blogs/compute/deploying-machine-learning-models-with-serverless-templates/).\n", 164 | "\n", 165 | "## Conclusion\n", 166 | "\n", 167 | "In this tutorial we explored how to deploy AutoGluon models as a serverless application. 
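As a quick sanity check before deploying, the handler can also be exercised locally (a hedged sketch: it assumes the serving script above is importable as `app`, that the trained model directory exists at `/opt/ml/model`, and that the column names are placeholders for your own training schema):\n", "\n", "```{.python}\n", "import pandas as pd\n", "\n", "from app import lambda_handler  # the serving script shown above\n", "\n", "sample = pd.DataFrame({'feature_1': [1.0], 'feature_2': ['a']})  # placeholder columns\n", "event = {'body': sample.to_json()}\n", "response = lambda_handler(event, None)\n", "print(response['statusCode'], response['body'])\n", "```\n", "\n", "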
To explore more, refer to the following documentation:\n", 168 | "\n", 169 | "- [Deploying machine learning models with serverless templates](https://aws.amazon.com/blogs/compute/deploying-machine-learning-models-with-serverless-templates/)\n", 170 | "\n", 171 | "- [Operating Lambda: Performance optimization](https://aws.amazon.com/blogs/compute/operating-lambda-performance-optimization-part-1/)\n", 172 | "\n", 173 | "- [Managing Lambda reserved concurrency](https://docs.aws.amazon.com/lambda/latest/dg/configuration-concurrency.html)\n", 174 | "\n", 175 | "- [AWS Serverless Application Model (AWS SAM)](https://github.com/aws/serverless-application-model)" 176 | ] 177 | } 178 | ], 179 | "metadata": { 180 | "language_info": { 181 | "name": "python" 182 | } 183 | }, 184 | "nbformat": 4, 185 | "nbformat_minor": 5 186 | } -------------------------------------------------------------------------------- /tutorials/cloud_fit_deploy/cloud-aws-sagemaker-training.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "d23218d1", 6 | "metadata": {}, 7 | "source": [ 8 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gidler/autogluon-tutorials/blob/main/tutorials/cloud_fit_deploy/cloud-aws-sagemaker-training.ipynb)\n", 9 | "[![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/gidler/autogluon-tutorials/blob/main/tutorials/cloud_fit_deploy/cloud-aws-sagemaker-training.ipynb)\n" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "id": "4fc5427e", 16 | "outputs": [], 17 | "metadata": {}, 18 | "source": [ 19 | "# Uncomment the code below and run this cell if AutoGluon is not yet installed in the kernel.\n", 20 | "# !pip install autogluon==0.5.0 # These tutorials are based on AutoGluon v0.5.0 and might not work with different versions." 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "id": "083d2099", 26 | "metadata": {}, 27 | "source": [ 28 | "# Cloud Training with AWS SageMaker\n", 29 | "\n", 30 | "\n", 31 | "\n", 32 | "To help with AutoGluon model training, AWS developed a set of training and inference [deep learning containers](https://github.com/aws/deep-learning-containers/blob/master/available_images.md#autogluon-training-containers). \n", 33 | "The containers can be used to train models with CPU and GPU instances and deployed as a SageMaker endpoint or used as a batch transform job.\n", 34 | "\n", 35 | "The full end-to-end example is available in the [amazon-sagemaker-examples](https://github.com/aws/amazon-sagemaker-examples/tree/master/advanced_functionality/autogluon-tabular-containers) repository.\n", 36 | "\n", 37 | "## Pre-requisites\n", 38 | "Before starting, ensure that the latest version of the SageMaker Python API is installed (`pip install --upgrade sagemaker`). 
\n", 39 | "This is required to ensure the information about newly released containers is available.\n", 40 | "\n", 41 | "## Training Scripts\n", 42 | "\n", 43 | "To start using the containers, a user training script and the [wrapper classes](https://github.com/aws/amazon-sagemaker-examples/blob/master/advanced_functionality/autogluon-tabular-containers/ag_model.py) are required.\n", 44 | "When authoring a training/inference [scripts](https://github.com/aws/amazon-sagemaker-examples/blob/master/advanced_functionality/autogluon-tabular-containers/scripts/), \n", 45 | "please refer to SageMaker [documentation](https://sagemaker.readthedocs.io/en/stable/overview.html#prepare-a-training-script).\n", 46 | "\n", 47 | "Here is one of the possible training scripts, which takes AutoGluon parameters as a YAML config and outputs predictions, models leaderboard and feature importance:" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "id": "3ee38690", 53 | "metadata": {}, 54 | "source": [ 55 | "```{.python}\n", 56 | "import argparse\n", 57 | "import os\n", 58 | "from pprint import pprint\n", 59 | "\n", 60 | "import yaml\n", 61 | "from autogluon.tabular import TabularDataset, TabularPredictor\n", 62 | "\n", 63 | "\n", 64 | "def get_input_path(path):\n", 65 | " file = os.listdir(path)[0]\n", 66 | " if len(os.listdir(path)) > 1:\n", 67 | " print(f\"WARN: more than one file is found in {channel} directory\")\n", 68 | " print(f\"Using {file}\")\n", 69 | " filename = f\"{path}/{file}\"\n", 70 | " return filename\n", 71 | "\n", 72 | "\n", 73 | "def get_env_if_present(name):\n", 74 | " result = None\n", 75 | " if name in os.environ:\n", 76 | " result = os.environ[name]\n", 77 | " return result\n", 78 | "\n", 79 | "\n", 80 | "if __name__ == \"__main__\":\n", 81 | " # Disable Autotune\n", 82 | " os.environ[\"MXNET_CUDNN_AUTOTUNE_DEFAULT\"] = \"0\"\n", 83 | "\n", 84 | " # ------------------------------------------------------------ Arguments parsing\n", 85 | " print(\"Starting AG\")\n", 86 | " parser = argparse.ArgumentParser()\n", 87 | "\n", 88 | " # Data, model, and output directories\n", 89 | " parser.add_argument(\n", 90 | " \"--output-data-dir\", type=str, default=get_env_if_present(\"SM_OUTPUT_DATA_DIR\")\n", 91 | " )\n", 92 | " parser.add_argument(\"--model-dir\", type=str, default=get_env_if_present(\"SM_MODEL_DIR\"))\n", 93 | " parser.add_argument(\"--n_gpus\", type=str, default=get_env_if_present(\"SM_NUM_GPUS\"))\n", 94 | " parser.add_argument(\"--training_dir\", type=str, default=get_env_if_present(\"SM_CHANNEL_TRAIN\"))\n", 95 | " parser.add_argument(\n", 96 | " \"--test_dir\", type=str, required=False, default=get_env_if_present(\"SM_CHANNEL_TEST\")\n", 97 | " )\n", 98 | " parser.add_argument(\"--ag_config\", type=str, default=get_env_if_present(\"SM_CHANNEL_CONFIG\"))\n", 99 | "\n", 100 | " args, _ = parser.parse_known_args()\n", 101 | "\n", 102 | " print(f\"Args: {args}\")\n", 103 | "\n", 104 | " # See SageMaker-specific environment variables: https://sagemaker.readthedocs.io/en/stable/overview.html#prepare-a-training-script\n", 105 | " os.makedirs(args.output_data_dir, mode=0o777, exist_ok=True)\n", 106 | "\n", 107 | " config_file = get_input_path(args.ag_config)\n", 108 | " with open(config_file) as f:\n", 109 | " config = yaml.safe_load(f) # AutoGluon-specific config\n", 110 | "\n", 111 | " if args.n_gpus:\n", 112 | " config[\"num_gpus\"] = int(args.n_gpus)\n", 113 | "\n", 114 | " print(\"Running training job with the config:\")\n", 115 | " pprint(config)\n", 116 | "\n", 117 | " # 
---------------------------------------------------------------- Training\n", 118 | "\n", 119 | " train_file = get_input_path(args.training_dir)\n", 120 | " train_data = TabularDataset(train_file)\n", 121 | "\n", 122 | " ag_predictor_args = config[\"ag_predictor_args\"]\n", 123 | " ag_predictor_args[\"path\"] = args.model_dir\n", 124 | " ag_fit_args = config[\"ag_fit_args\"]\n", 125 | "\n", 126 | " predictor = TabularPredictor(**ag_predictor_args).fit(train_data, **ag_fit_args)\n", 127 | "\n", 128 | " # --------------------------------------------------------------- Inference\n", 129 | "\n", 130 | " if args.test_dir:\n", 131 | " test_file = get_input_path(args.test_dir)\n", 132 | " test_data = TabularDataset(test_file)\n", 133 | "\n", 134 | " # Predictions\n", 135 | " y_pred_proba = predictor.predict_proba(test_data)\n", 136 | " if config.get(\"output_prediction_format\", \"csv\") == \"parquet\":\n", 137 | " y_pred_proba.to_parquet(f\"{args.output_data_dir}/predictions.parquet\")\n", 138 | " else:\n", 139 | " y_pred_proba.to_csv(f\"{args.output_data_dir}/predictions.csv\")\n", 140 | "\n", 141 | " # Leaderboard\n", 142 | " if config.get(\"leaderboard\", False):\n", 143 | " lb = predictor.leaderboard(test_data, silent=False)\n", 144 | " lb.to_csv(f\"{args.output_data_dir}/leaderboard.csv\")\n", 145 | "\n", 146 | " # Feature importance\n", 147 | " if config.get(\"feature_importance\", False):\n", 148 | " feature_importance = predictor.feature_importance(test_data)\n", 149 | " feature_importance.to_csv(f\"{args.output_data_dir}/feature_importance.csv\")\n", 150 | " else:\n", 151 | " if config.get(\"leaderboard\", False):\n", 152 | " lb = predictor.leaderboard(silent=False)\n", 153 | " lb.to_csv(f\"{args.output_data_dir}/leaderboard.csv\")\n", 154 | "```\n" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "id": "2983390c", 160 | "metadata": {}, 161 | "source": [ 162 | "YAML config:" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "id": "026aa10e", 168 | "metadata": {}, 169 | "source": [ 170 | "```yaml\n", 171 | "# AutoGluon Predictor constructor arguments\n", 172 | "# - see https://github.com/awslabs/autogluon/blob/ef3a5312dc2eaa0c6afde042d671860ac42cbafb/tabular/src/autogluon/tabular/predictor/predictor.py#L51-L159\n", 173 | "ag_predictor_args:\n", 174 | " eval_metric: roc_auc\n", 175 | " label: class\n", 176 | "\n", 177 | "# AutoGluon Predictor.fit arguments\n", 178 | "# - see https://github.com/awslabs/autogluon/blob/ef3a5312dc2eaa0c6afde042d671860ac42cbafb/tabular/src/autogluon/tabular/predictor/predictor.py#L280-L651\n", 179 | "ag_fit_args:\n", 180 | " presets: \"medium_quality_faster_train\"\n", 181 | " num_bag_folds: 2\n", 182 | " num_bag_sets: 1\n", 183 | " num_stack_levels: 0\n", 184 | "\n", 185 | "output_prediction_format: csv # predictions output format: csv or parquet\n", 186 | "feature_importance: true # calculate and save feature importance if true\n", 187 | "leaderboard: true # save leaderboard output if true\n", 188 | "```\n" 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "id": "f08748b8", 194 | "metadata": {}, 195 | "source": [ 196 | "## Training\n", 197 | "\n", 198 | "To train AutoGluon model, set up a SageMaker session:" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "id": "c8f8c3f6", 204 | "metadata": {}, 205 | "source": [ 206 | "```{.python}\n", 207 | "import sagemaker\n", 208 | "\n", 209 | "# Helper wrappers referred earlier\n", 210 | "from ag_model import (\n", 211 | " AutoGluonTraining,\n", 212 | " 
AutoGluonInferenceModel,\n", 213 | " AutoGluonTabularPredictor,\n", 214 | ")\n", 215 | "from sagemaker import utils\n", 216 | "\n", 217 | "role = sagemaker.get_execution_role()\n", 218 | "sagemaker_session = sagemaker.session.Session()\n", 219 | "region = sagemaker_session._region_name\n", 220 | "\n", 221 | "bucket = sagemaker_session.default_bucket()\n", 222 | "s3_prefix = f\"autogluon_sm/{utils.sagemaker_timestamp()}\"\n", 223 | "output_path = f\"s3://{bucket}/{s3_prefix}/output/\"\n", 224 | "```\n" 225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "id": "02ef9047", 230 | "metadata": {}, 231 | "source": [ 232 | "Create a training task:" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "id": "ff27fcf5", 238 | "metadata": {}, 239 | "source": [ 240 | "```{.python}\n", 241 | "ag = AutoGluonTraining(\n", 242 | " role=role,\n", 243 | " entry_point=\"scripts/tabular_train.py\",\n", 244 | " region=region,\n", 245 | " instance_count=1,\n", 246 | " instance_type=\"ml.m5.2xlarge\",\n", 247 | " framework_version=\"0.4\",\n", 248 | " py_version=\"py38\",\n", 249 | " base_job_name=\"autogluon-tabular-train\",\n", 250 | ")\n", 251 | "```\n" 252 | ] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "id": "2667a26b", 257 | "metadata": {}, 258 | "source": [ 259 | "Upload the required inputs via the SageMaker session (in this case a training set, a test set, and the training YAML config) and start the training job:" 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "id": "cf994ed8", 265 | "metadata": {}, 266 | "source": [ 267 | "```{.python}\n", 268 | "s3_prefix = f\"autogluon_sm/{utils.sagemaker_timestamp()}\"\n", 269 | "train_input = ag.sagemaker_session.upload_data(\n", 270 | " path=os.path.join(\"data\", \"train.csv\"), key_prefix=s3_prefix\n", 271 | ")\n", 272 | "eval_input = ag.sagemaker_session.upload_data(\n", 273 | " path=os.path.join(\"data\", \"test.csv\"), key_prefix=s3_prefix\n", 274 | ")\n", 275 | "config_input = ag.sagemaker_session.upload_data(\n", 276 | " path=os.path.join(\"config\", \"config-med.yaml\"), key_prefix=s3_prefix\n", 277 | ")\n", 278 | "\n", 279 | "job_name = utils.unique_name_from_base(\"test-autogluon-image\")\n", 280 | "ag.fit(\n", 281 | " {\"config\": config_input, \"train\": train_input, \"test\": eval_input},\n", 282 | " job_name=job_name,\n", 283 | ")\n", 284 | "```\n" 285 | ] 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "id": "705a25a8", 290 | "metadata": {}, 291 | "source": [ 292 | "Once the models are trained, they will be available in the S3 location specified in the `ag.model_data` field. The model is fully portable and can be downloaded locally\n", 293 | "if needed.\n", 294 | "\n", 295 | "## Conclusion\n", 296 | "\n", 297 | "In this tutorial we explored how to train AutoGluon models using SageMaker. Learn how to deploy the trained models using \n", 298 | "AWS SageMaker - [tutorials/cloud_fit_deploy/cloud-aws-sagemaker-deployment.ipynb](https://github.com/gidler/autogluon-tutorials/blob/main/tutorials/cloud_fit_deploy/cloud-aws-sagemaker-deployment.ipynb) or AWS Lambda - [tutorials/cloud_fit_deploy/cloud-aws-lambda-deployment.ipynb](https://github.com/gidler/autogluon-tutorials/blob/main/tutorials/cloud_fit_deploy/cloud-aws-lambda-deployment.ipynb)."
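, "\n", "The artifact behind `ag.model_data` is a regular `model.tar.gz` tarball, so it can also be pulled down and opened locally (a hedged sketch, not part of the official example; the exact archive layout may vary):\n", "\n", "```{.python}\n", "import tarfile\n", "\n", "from sagemaker.s3 import S3Downloader\n", "\n", "S3Downloader.download(ag.model_data, 'ag_models/')\n", "with tarfile.open('ag_models/model.tar.gz') as tar:\n", "    tar.extractall(path='ag_models/')\n", "\n", "from autogluon.tabular import TabularPredictor\n", "predictor = TabularPredictor.load('ag_models/')\n", "```\n"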
299 | ] 300 | } 301 | ], 302 | "metadata": { 303 | "language_info": { 304 | "name": "python" 305 | } 306 | }, 307 | "nbformat": 4, 308 | "nbformat_minor": 5 309 | } -------------------------------------------------------------------------------- /tutorials/image_prediction/beginner.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "2ce0291f", 6 | "metadata": {}, 7 | "source": [ 8 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gidler/autogluon-tutorials/blob/main/tutorials/image_prediction/beginner.ipynb)\n", 9 | "[![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/gidler/autogluon-tutorials/blob/main/tutorials/image_prediction/beginner.ipynb)\n" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "id": "a59198fd", 16 | "outputs": [], 17 | "metadata": {}, 18 | "source": [ 19 | "# Uncomment the code below and run this cell if AutoGluon is not yet installed in the kernel.\n", 20 | "# !pip install autogluon==0.5.0 # These tutorials are based on AutoGluon v0.5.0 and might not work with different versions." 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "id": "eee381da", 26 | "metadata": {}, 27 | "source": [ 28 | "# Image Prediction - Quick Start\n", 29 | "\n", 30 | "\n", 31 | "\n", 32 | "In this quick start, we'll use the task of image classification to illustrate how to use AutoGluon\u2019s APIs. This tutorial demonstrates how to load images and corresponding labels into AutoGluon and use this data to obtain a neural network that can classify new images. This is different from traditional machine learning where we need to manually define the neural network and then specify the hyperparameters in the training process. Instead, with just a single call to AutoGluon's [fit](https://auto.gluon.ai/stable/api/autogluon.predictor.html#autogluon.vision.ImagePredictor.fit) function, AutoGluon automatically trains many models with different hyperparameter configurations and returns the model that achieved the highest level of accuracy." 
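, "\n", "To make the idea concrete, here is a toy illustration of what such a search loop does (illustrative stub code only, not AutoGluon's actual implementation):\n", "\n", "```{.python}\n", "def train(config):\n", "    return {'config': config}  # stand-in for fitting a network\n", "\n", "def evaluate(model):\n", "    return 1.0 - abs(model['config']['lr'] - 1e-3)  # pretend lr=1e-3 is best\n", "\n", "candidate_configs = [{'lr': lr} for lr in (1e-2, 1e-3, 1e-4)]\n", "best_model = max((train(c) for c in candidate_configs), key=evaluate)\n", "print(best_model)\n", "```\n"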
33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "id": "cae85519", 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "import autogluon.core as ag\n", 43 | "from autogluon.vision import ImagePredictor, ImageDataset" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "id": "a78d26d9", 49 | "metadata": {}, 50 | "source": [ 51 | "## Create Image Dataset\n", 52 | "\n", 53 | "For demonstration purposes, we use a subset of the [Shopee-IET dataset](https://www.kaggle.com/c/shopee-iet-machine-learning-competition/data) from Kaggle.\n", 54 | "Each image in this data depicts a clothing item and the corresponding label specifies its clothing category.\n", 55 | "Our subset of the data contains the following possible labels: `BabyPants`, `BabyShirt`, `womencasualshoes`, `womenchiffontop`.\n", 56 | "\n", 57 | "We can load the dataset by downloading the data from a URL automatically:" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "id": "0e10fc4b", 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "train_dataset, _, test_dataset = ImageDataset.from_folders('https://autogluon.s3.amazonaws.com/datasets/shopee-iet.zip')\n", 68 | "print(train_dataset)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "id": "91dd8e9d", 74 | "metadata": {}, 75 | "source": [ 76 | "## Use AutoGluon to Fit Models\n", 77 | "\n", 78 | "Now, we fit a classifier using AutoGluon as follows:" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "id": "e45e2f7d", 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "predictor = ImagePredictor()\n", 89 | "# since the original dataset does not provide a validation split, the `fit` function splits it randomly with a 90/10 ratio\n", 90 | "predictor.fit(train_dataset, hyperparameters={'epochs': 2}) # you can trust the default config; we reduce the number of epochs to save some build time" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "id": "4cbdf5bb", 96 | "metadata": {}, 97 | "source": [ 98 | "Within `fit`, the dataset is automatically split into training and validation sets.\n", 99 | "The model with the best hyperparameter configuration is selected based on its performance on the validation set.\n", 100 | "The best model is finally retrained on our entire dataset (i.e., merging training+validation) using the best configuration.\n", 101 | "\n", 102 | "The best Top-1 accuracy achieved on the validation set is as follows:" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "id": "f01cb18d", 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "fit_result = predictor.fit_summary()\n", 113 | "print('Top-1 train acc: %.3f, val acc: %.3f' %(fit_result['train_acc'], fit_result['valid_acc']))" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "id": "ae935cfd", 119 | "metadata": {}, 120 | "source": [ 121 | "## Predict on a New Image\n", 122 | "\n", 123 | "Given an example image, we can easily use the final model to `predict` the label (and the conditional class-probability denoted as `score`):" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "id": "ef67f524", 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "image_path = test_dataset.iloc[0]['image']\n", 134 | "result = predictor.predict(image_path)\n", 135 | "print(result)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "id": "4821f0de", 141 | "metadata": {}, 142 | "source": [ 
143 | "If probabilities of all categories are needed, you can call `predict_proba`:" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "id": "a3ff9304", 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "proba = predictor.predict_proba(image_path)\n", 154 | "print(proba)" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "id": "742fc3d6", 160 | "metadata": {}, 161 | "source": [ 162 | "You can also feed in multiple images all together, let's use images in test dataset as an example:" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "id": "1368a54e", 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "bulk_result = predictor.predict(test_dataset)\n", 173 | "print(bulk_result)" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "id": "51c82792", 179 | "metadata": {}, 180 | "source": [ 181 | "An extra column will be included in bulk prediction, to indicate the corresponding image for the row. There will be (# image) rows in the result, each row includes `class`, `score`, `id` and `image` for prediction class, prediction confidence, class id, and image path respectively.\n", 182 | "\n", 183 | "\n", 184 | "## Generate image features with a classifier\n", 185 | "\n", 186 | "Extracting representation from the whole image learned by a model is also very useful. We provide `predict_feature` function to allow predictor to return the N-dimensional image feature where `N` depends on the model(usually a 512 to 2048 length vector)" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "id": "ee6647bc", 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "image_path = test_dataset.iloc[0]['image']\n", 197 | "feature = predictor.predict_feature(image_path)\n", 198 | "print(feature)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "id": "42936eb6", 204 | "metadata": {}, 205 | "source": [ 206 | "## Evaluate on Test Dataset\n", 207 | "\n", 208 | "You can evaluate the classifier on a test dataset rather than retrieving the predictions.\n", 209 | "\n", 210 | "The validation and test top-1 accuracy are:" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "id": "79bd2932", 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "test_acc = predictor.evaluate(test_dataset)\n", 221 | "print('Top-1 test acc: %.3f' % test_acc['top1'])" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "id": "32d41e3c", 227 | "metadata": {}, 228 | "source": [ 229 | "## Save and load classifiers\n", 230 | "\n", 231 | "You can directly save the instances of classifiers:" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "id": "e7881a2a", 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [ 241 | "filename = 'predictor.ag'\n", 242 | "predictor.save(filename)\n", 243 | "predictor_loaded = ImagePredictor.load(filename)\n", 244 | "# use predictor_loaded as usual\n", 245 | "result = predictor_loaded.predict(image_path)\n", 246 | "print(result)" 247 | ] 248 | } 249 | ], 250 | "metadata": { 251 | "language_info": { 252 | "name": "python" 253 | } 254 | }, 255 | "nbformat": 4, 256 | "nbformat_minor": 5 257 | } -------------------------------------------------------------------------------- /tutorials/image_prediction/dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | 
"cell_type": "markdown", 5 | "id": "658e0718", 6 | "metadata": {}, 7 | "source": [ 8 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gidler/autogluon-tutorials/blob/main/tutorials/image_prediction/dataset.ipynb)\n", 9 | "[![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/gidler/autogluon-tutorials/blob/main/tutorials/image_prediction/dataset.ipynb)\n" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "id": "c9ff3605", 16 | "outputs": [], 17 | "metadata": {}, 18 | "source": [ 19 | "# Uncomment the code below and run this cell if AutoGluon is not yet installed in the kernel.\n", 20 | "# !pip install autogluon==0.5.0 # These tutorials are based on AutoGluon v0.5.0 and might not work with different versions." 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "id": "27fadc77", 26 | "metadata": {}, 27 | "source": [ 28 | "# Image Prediction - Properly load any image dataset as ImageDataset\n", 29 | "\n", 30 | "\n", 31 | "\n", 32 | "Preparing the dataset for ImagePredictor is not difficult at all, however, we'd like to introduce the\n", 33 | "recommended ways to initialize the dataset, so you will have a smoother experience using `autogluon.vision.ImagePredictor`.\n", 34 | "\n", 35 | "There are generally three ways to load a dataset for ImagePredictor:\n", 36 | "\n", 37 | "- Load a csv file or construct your own pandas `DataFrame` with `image` and `label` columns\n", 38 | "\n", 39 | "- Load a image folder directly with `ImageDataset`\n", 40 | "\n", 41 | "- Convert a list of images into a dataset directly with `ImageDataset`\n", 42 | "\n", 43 | "We will go through these four methods one by one. First of all, let's import AutoGluon:" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "id": "40724477", 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "%matplotlib inline\n", 54 | "import autogluon.core as ag\n", 55 | "from autogluon.vision import ImageDataset\n", 56 | "import pandas as pd" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "id": "004d9902", 62 | "metadata": {}, 63 | "source": [ 64 | "## Load a csv file or construct a DataFrame object\n", 65 | "\n", 66 | "We use a csv file from PetFinder competition as an example. You may use any tabular data as long as you can\n", 67 | "create `image`(absolute or relative paths to images) and `label`(category for each image) columns." 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "id": "d05e89be", 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "csv_file = ag.utils.download('https://autogluon.s3-us-west-2.amazonaws.com/datasets/petfinder_example.csv')\n", 78 | "df = pd.read_csv(csv_file)\n", 79 | "df.head()" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "id": "08352897", 85 | "metadata": {}, 86 | "source": [ 87 | "If the image paths are not relative to the current working directory, you may use the helper function to prepend a prefix for each image. 
Using absolute paths can reduce the chance of an OSError happening during file access:" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "id": "eefcedbe", 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "df = ImageDataset.from_csv(csv_file, root='/home/ubuntu')\n", 98 | "df.head()" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "id": "0000fd63", 104 | "metadata": {}, 105 | "source": [ 106 | "Or you can perform the correction by yourself:" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "id": "14820983", 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "import os\n", 117 | "df['image'] = df['image'].apply(lambda x: os.path.join('/home/ubuntu', x))\n", 118 | "df.head()" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "id": "01b855d7", 124 | "metadata": {}, 125 | "source": [ 126 | "Otherwise you may use the `DataFrame` as-is; `ImagePredictor` will apply an automatic conversion during `fit` to ensure other metadata is available for training. You can have multiple columns in the `DataFrame`; `ImagePredictor` only cares about the `image` and `label` columns during training.\n", 127 | "\n", 128 | "## Load an image directory\n", 129 | "\n", 130 | "It's pretty common that you only have a folder of images organized by category name. Recursively looping through images is tedious. You can use `ImageDataset.from_folders` or `ImageDataset.from_folder` to avoid implementing a recursive search.\n", 131 | "\n", 132 | "The difference between `from_folders` and `from_folder` is the targeted folder structure.\n", 133 | "If you have a folder with splits, e.g., `train`, `test`, like:\n", 134 | "\n", 135 | "- root/train/car/0001.jpg\n", 136 | "- root/train/car/xxxa.jpg\n", 137 | "- root/val/bus/123.png\n", 138 | "- root/test/bus/023.jpg\n", 139 | "\n", 140 | "Then you can load the splits with `from_folders`:" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "id": "a2842d50", 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "train_data, _, test_data = ImageDataset.from_folders('https://autogluon.s3.amazonaws.com/datasets/shopee-iet.zip', train='train', test='test')\n", 151 | "print('train #', len(train_data), 'test #', len(test_data))\n", 152 | "train_data.head()" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "id": "af8d31b0", 158 | "metadata": {}, 159 | "source": [ 160 | "If you have a folder without `train` or `test` root folders, like:\n", 161 | "\n", 162 | "- root/car/0001.jpg\n", 163 | "- root/car/xxxa.jpg\n", 164 | "- root/bus/123.png\n", 165 | "- root/bus/023.jpg\n", 166 | "\n", 167 | "Then you can load the whole folder with `from_folder` and split it yourself:" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "id": "e952d2f7", 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "# use the train from shopee-iet as new root\n", 178 | "root = os.path.join(os.path.dirname(train_data.iloc[0]['image']), '..')\n", 179 | "all_data = ImageDataset.from_folder(root)\n", 180 | "all_data.head()" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "id": "56f07432", 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "# you can manually split the dataset or use `random_split`\n", 191 | "train, val, test = all_data.random_split(val_size=0.1, test_size=0.1)\n", 192 | "print('train #:', len(train), 'test #:', len(test))" 193 | ] 
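}, { "cell_type": "markdown", "id": "5e6f0a1b", "metadata": {}, "source": [ "The three splits should partition `all_data` according to the requested fractions (roughly 80/10/10 here). A quick consistency check (a hedged addition, not part of the original tutorial):\n", "\n", "```{.python}\n", "assert len(train) + len(val) + len(test) == len(all_data)\n", "print(len(train) / len(all_data), len(val) / len(all_data), len(test) / len(all_data))\n", "```\n" ] 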
}, 195 | { 196 | "cell_type": "markdown", 197 | "id": "c44ef34d", 198 | "metadata": {}, 199 | "source": [ 200 | "## Convert a list of images to a dataset\n", 201 | "\n", 202 | "You can create a dataset from a list of images with a function that determines the label of each image. We use the Oxford-IIIT Pet Dataset mini pack as an example, where images are scattered in the `images` directory but follow a unique pattern: filenames of cats start with a capital letter; otherwise they are dogs. So we can use a function to distinguish and assign a label to each image:" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "id": "db1f58b8", 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "pets = ag.utils.download('https://autogluon.s3-us-west-2.amazonaws.com/datasets/oxford-iiit-pet-mini.zip')\n", 213 | "pets = ag.utils.unzip(pets)\n", 214 | "image_list = [x for x in os.listdir(os.path.join(pets, 'images')) if x.endswith('jpg')]\n", 215 | "def label_fn(x):\n", 216 | " return 'cat' if os.path.basename(x)[0].isupper() else 'dog'\n", 217 | "new_data = ImageDataset.from_name_func(image_list, label_fn, root=os.path.join(os.getcwd(), pets, 'images'))\n", 218 | "new_data" 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "id": "e9fe3844", 224 | "metadata": {}, 225 | "source": [ 226 | "## Visualize images\n", 227 | "\n", 228 | "You can use `show_images` to visualize the images, as well as the corresponding labels:" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "id": "12274048", 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "new_data.show_images()" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "id": "ba5730c7", 244 | "metadata": {}, 245 | "source": [ 246 | "For raw DataFrame objects, you can convert them to `ImageDataset` first to use `show_images`.\n", 247 | "\n", 248 | "Congratulations, you can now proceed to [tutorials/image_prediction/beginner.ipynb](https://github.com/gidler/autogluon-tutorials/blob/main/tutorials/image_prediction/beginner.ipynb) to start training the `ImagePredictor`." 249 | ] 250 | } 251 | ], 252 | "metadata": { 253 | "language_info": { 254 | "name": "python" 255 | } 256 | }, 257 | "nbformat": 4, 258 | "nbformat_minor": 5 259 | } -------------------------------------------------------------------------------- /tutorials/image_prediction/hpo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "d43f5cfe", 6 | "metadata": {}, 7 | "source": [ 8 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gidler/autogluon-tutorials/blob/main/tutorials/image_prediction/hpo.ipynb)\n", 9 | "[![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/gidler/autogluon-tutorials/blob/main/tutorials/image_prediction/hpo.ipynb)\n" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "id": "06aa95d2", 16 | "outputs": [], 17 | "metadata": {}, 18 | "source": [ 19 | "# Uncomment the code below and run this cell if AutoGluon is not yet installed in the kernel.\n", 20 | "# !pip install autogluon==0.5.0 # These tutorials are based on AutoGluon v0.5.0 and might not work with different versions." 
21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "id": "567037b7", 26 | "metadata": {}, 27 | "source": [ 28 | "# Image Prediction - Search Space and Hyperparameter Optimization (HPO)\n", 29 | "\n", 30 | "\n", 31 | "\n", 32 | "While the [tutorials/image_prediction/beginner.ipynb](https://github.com/gidler/autogluon-tutorials/blob/main/tutorials/image_prediction/beginner.ipynb) introduced basic usage of AutoGluon's `fit`, `evaluate`, and `predict` with default configurations, this tutorial dives into the various options that you can specify for more advanced control over the fitting process.\n", 33 | "\n", 34 | "These options include:\n", 35 | "\n", 36 | "- Defining the search space of various hyperparameter values for the training of neural networks\n", 37 | "- Specifying how to search through your chosen hyperparameter space\n", 38 | "- Specifying how to schedule jobs to train a network under a particular hyperparameter configuration\n", 39 | "\n", 40 | "The advanced functionalities of AutoGluon enable you to use your external knowledge about your particular prediction problem and computing resources to guide the training process. If properly used, you may be able to achieve superior performance in less training time.\n", 41 | "\n", 42 | "**Tip**: If you are new to AutoGluon, review [tutorials/image_prediction/beginner.ipynb](https://github.com/gidler/autogluon-tutorials/blob/main/tutorials/image_prediction/beginner.ipynb) to learn the basics of the AutoGluon API.\n", 43 | "\n", 44 | "Since our task is to classify images, we will use AutoGluon to produce an [ImagePredictor](https://auto.gluon.ai/stable/api/autogluon.predictor.html#autogluon.vision.ImagePredictor):" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "id": "5a68146b", 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "import autogluon.core as ag\n", 55 | "from autogluon.vision import ImagePredictor, ImageDataset" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "id": "23f3f3f5", 61 | "metadata": {}, 62 | "source": [ 63 | "## Create AutoGluon Dataset\n", 64 | "\n", 65 | "Let's first create the dataset using the same subset of the `Shopee-IET` dataset as the [tutorials/image_prediction/beginner.ipynb](https://github.com/gidler/autogluon-tutorials/blob/main/tutorials/image_prediction/beginner.ipynb) tutorial.\n", 66 | "Recall that there's no validation split in the original data, so a 90/10 train/validation split is automatically performed when calling `fit` with `train_data`." 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "id": "ad2accd5", 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "train_data, _, test_data = ImageDataset.from_folders('https://autogluon.s3.amazonaws.com/datasets/shopee-iet.zip')" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "id": "aa6e6b2c", 82 | "metadata": {}, 83 | "source": [ 84 | "## Specify which Networks to Try\n", 85 | "\n", 86 | "We start by specifying the pretrained neural network candidates.\n", 87 | "Given such a list, AutoGluon tries to train different networks from this list to identify the best-performing candidate.\n", 88 | "This is an example of a `autogluon.core.space.Categorical` search space, in which there are a limited number of values to choose from."
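, "\n", "Besides `Categorical`, `autogluon.core` also exposes continuous and integer-valued spaces (a brief sketch, assuming the same top-level aliases as `ag.Categorical` below; the ranges are arbitrary examples):\n", "\n", "```{.python}\n", "import autogluon.core as ag\n", "\n", "lr_space = ag.Real(1e-4, 1e-2, log=True)  # sample learning rates on a log scale\n", "wd_space = ag.Int(0, 2)                   # integer-valued search space\n", "```\n"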
89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "id": "110efdc6", 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "model = ag.Categorical('resnet18_v1b', 'mobilenetv3_small')\n", 99 | "\n", 100 | "# you may choose from the 70+ models available in the model zoo provided by GluonCV:\n", 101 | "model_list = ImagePredictor.list_models()" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "id": "6d6c8fb2", 107 | "metadata": {}, 108 | "source": [ 109 | "## Specify the training hyper-parameters\n", 110 | "\n", 111 | "Similarly, we can manually specify many crucial hyper-parameters, with specific values or search spaces (`autogluon.core.space`)." 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "id": "5f8c72c9", 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "batch_size = 8\n", 122 | "lr = ag.Categorical(1e-2, 1e-3)" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "id": "428a2561", 128 | "metadata": {}, 129 | "source": [ 130 | "## Search Algorithms\n", 131 | "\n", 132 | "In AutoGluon, `autogluon.core.searcher` supports different search strategies for both hyperparameter optimization and architecture search.\n", 133 | "Beyond simply specifying the space of hyperparameter configurations to search over, you can also tell AutoGluon what strategy it should employ to actually search through this space.\n", 134 | "This process of finding good hyperparameters from a given search space is commonly referred to as *hyperparameter optimization* (HPO) or *hyperparameter tuning*.\n", 135 | "`autogluon.core.scheduler` orchestrates how individual training jobs are scheduled.\n", 136 | "We currently support random search.\n", 137 | "\n", 138 | "### Random Search\n", 139 | "\n", 140 | "Here is an example of random search using `autogluon.core.searcher.LocalRandomSearcher`." 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "id": "58f521b8", 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "hyperparameters = {'model': model, 'batch_size': batch_size, 'lr': lr, 'epochs': 2}\n", 151 | "predictor = ImagePredictor()\n", 152 | "predictor.fit(train_data, time_limit=60*10, hyperparameters=hyperparameters,\n", 153 | " hyperparameter_tune_kwargs={'num_trials': 2})\n", 154 | "print('Top-1 val acc: %.3f' % predictor.fit_summary()['valid_acc'])" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "id": "257a4b45", 160 | "metadata": {}, 161 | "source": [ 162 | "Load the test dataset and evaluate:" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "id": "f1a6381c", 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "results = predictor.evaluate(test_data)\n", 173 | "print('Test acc on hold-out data:', results)" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "id": "8299e969", 179 | "metadata": {}, 180 | "source": [ 181 | "Note that `num_trials=2` above is only used to speed up the tutorial. In normal\n", 182 | "practice, it is common to only use `time_limit` and drop `num_trials`."
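In normal practice the fit call might therefore look like the following minimal sketch (an illustration assuming the `train_data` and `hyperparameters` objects defined earlier in this tutorial; the 10-minute budget is an arbitrary choice, not a recommendation):

```python
# A minimal sketch: rely solely on the time budget and omit num_trials,
# letting AutoGluon decide how many trials fit within the time limit.
predictor = ImagePredictor()
predictor.fit(train_data, time_limit=60*10, hyperparameters=hyperparameters)
```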
183 | ] 184 | } 185 | ], 186 | "metadata": { 187 | "language_info": { 188 | "name": "python" 189 | } 190 | }, 191 | "nbformat": 4, 192 | "nbformat_minor": 5 193 | } -------------------------------------------------------------------------------- /tutorials/multimodal/beginner_image_cls.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "278aca48", 6 | "metadata": {}, 7 | "source": [ 8 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gidler/autogluon-tutorials/blob/main/tutorials/multimodal/beginner_image_cls.ipynb)\n", 9 | "[![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/gidler/autogluon-tutorials/blob/main/tutorials/multimodal/beginner_image_cls.ipynb)\n" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "id": "af4e26ae", 16 | "outputs": [], 17 | "metadata": {}, 18 | "source": [ 19 | "# Uncomment the code below and run this cell if AutoGluon is not yet installed in the kernel.\n", 20 | "# !pip install autogluon==0.5.0 # These tutorials are based on AutoGluon v0.5.0 and might not work with different versions." 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "id": "f1ea9618", 26 | "metadata": {}, 27 | "source": [ 28 | "# AutoMM for Image Classification - Quick Start\n", 29 | "\n", 30 | "\n", 31 | "\n", 32 | "In this quick start, we'll use the task of image classification to illustrate how to use **AutoMMPredictor**. Once the data is prepared in [Pandas DataFrame](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html) format, a single call to `AutoMMPredictor.fit()` will take care of the model training for you.\n", 33 | "\n", 34 | "\n", 35 | "## Create Image Dataset\n", 36 | "\n", 37 | "For demonstration purposes, we use a subset of the [Shopee-IET dataset](https://www.kaggle.com/c/shopee-iet-machine-learning-competition/data) from Kaggle.\n", 38 | "Each image in this data depicts a clothing item and the corresponding label specifies its clothing category.\n", 39 | "Our subset of the data contains the following possible labels: `BabyPants`, `BabyShirt`, `womencasualshoes`, `womenchiffontop`.\n", 40 | "\n", 41 | "We can load the dataset automatically by downloading it from a URL:" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "id": "8115bf28", 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "import warnings\n", 52 | "warnings.filterwarnings('ignore')\n", 53 | "from autogluon.vision import ImageDataset\n", 54 | "train_dataset, _, test_dataset = ImageDataset.from_folders(\"https://autogluon.s3.amazonaws.com/datasets/shopee-iet.zip\")\n", 55 | "print(train_dataset)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "id": "ac280f3c", 61 | "metadata": {}, 62 | "source": [ 63 | "We can see there are 800 rows and 2 columns in this training dataframe. 
The 2 columns are **image** and **label**, and each row represents a different training sample.\n", 64 | "\n", 65 | "\n", 66 | "## Use AutoMM to Fit Models\n", 67 | "\n", 68 | "Now, we fit a classifier using AutoMM as follows:" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "id": "9fdf69e4", 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "from autogluon.multimodal import AutoMMPredictor\n", 79 | "predictor = AutoMMPredictor(label=\"label\", path=\"./automm_imgcls\")\n", 80 | "predictor.fit(\n", 81 | " train_data=train_dataset,\n", 82 | " time_limit=30, # seconds\n", 83 | ") # you can trust the default config, e.g., we use a `swin_base_patch4_window7_224` model" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "id": "4d654be6", 89 | "metadata": {}, 90 | "source": [ 91 | "**label** is the name of the column that contains the target variable to predict, e.g., it is \"label\" in our example. **path** indicates the directory where models and intermediate outputs should be saved. We set the training time limit to 30 seconds for demonstration purposes, but you can control the training time by setting configurations. To customize AutoMM, please refer to [tutorials/multimodal/customization.ipynb](https://github.com/gidler/autogluon-tutorials/blob/main/tutorials/multimodal/customization.ipynb).\n", 92 | "\n", 93 | "\n", 94 | "## Evaluate on Test Dataset\n", 95 | "\n", 96 | "You can evaluate the classifier on the test dataset to see how it performs; the test top-1 accuracy is:" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "id": "3bd75505", 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "scores = predictor.evaluate(test_dataset, metrics=[\"accuracy\"])\n", 107 | "print('Top-1 test acc: %.3f' % scores[\"accuracy\"])" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "id": "6819288d", 113 | "metadata": {}, 114 | "source": [ 115 | "## Predict on a New Image\n", 116 | "\n", 117 | "Given an example image, let's visualize it first:" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "id": "fe8ec5c9", 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "image_path = test_dataset.iloc[0]['image']\n", 128 | "from IPython.display import Image, display\n", 129 | "pil_img = Image(filename=image_path)\n", 130 | "display(pil_img)" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "id": "427ffda1", 136 | "metadata": {}, 137 | "source": [ 138 | "We can easily use the final model to `predict` the label:" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "id": "5546c8e8", 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "predictions = predictor.predict({'image': [image_path]})\n", 149 | "print(predictions)" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "id": "95228f30", 155 | "metadata": {}, 156 | "source": [ 157 | "If probabilities of all categories are needed, you can call `predict_proba`:" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "id": "aa4e9ee2", 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "proba = predictor.predict_proba({'image': [image_path]})\n", 168 | "print(proba)" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "id": "510f3a5c", 174 | "metadata": {}, 175 | "source": [ 176 | "## Extract Embeddings\n", 177 | "\n", 178 | "Extracting a representation of the whole 
image, as learned by the model, is also very useful. We provide the `extract_embedding` function to let the predictor return an N-dimensional image feature, where `N` depends on the model (usually a vector of length 512 to 2048)." 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "id": "b673aea2", 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "feature = predictor.extract_embedding({'image': [image_path]})\n", 189 | "print(feature[0].shape)" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "id": "7a57b965", 195 | "metadata": {}, 196 | "source": [ 197 | "## Save and Load\n", 198 | "\n", 199 | "The trained predictor is automatically saved at the end of `fit()`, and you can easily reload it." 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "id": "d67f508b", 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "loaded_predictor = AutoMMPredictor.load('automm_imgcls')\n", 210 | "load_proba = loaded_predictor.predict_proba({'image': [image_path]})\n", 211 | "print(load_proba)" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "id": "4c1bd765", 217 | "metadata": {}, 218 | "source": [ 219 | "We can see the predicted class probabilities are still the same as above, which means it is the same model!" 220 | ] 221 | } 222 | ], 223 | "metadata": { 224 | "language_info": { 225 | "name": "python" 226 | } 227 | }, 228 | "nbformat": 4, 229 | "nbformat_minor": 5 230 | } -------------------------------------------------------------------------------- /tutorials/multimodal/beginner_multimodal.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "6c5eae7b", 6 | "metadata": {}, 7 | "source": [ 8 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gidler/autogluon-tutorials/blob/main/tutorials/multimodal/beginner_multimodal.ipynb)\n", 9 | "[![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/gidler/autogluon-tutorials/blob/main/tutorials/multimodal/beginner_multimodal.ipynb)\n" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "id": "b5897bbd", 16 | "outputs": [], 17 | "metadata": {}, 18 | "source": [ 19 | "# Uncomment the code below and run this cell if AutoGluon is not yet installed in the kernel.\n", 20 | "# !pip install autogluon==0.5.0 # These tutorials are based on AutoGluon v0.5.0 and might not work with different versions." 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "id": "0bdb40c2", 26 | "metadata": {}, 27 | "source": [ 28 | "# AutoMM for Multimodal - Quick Start\n", 29 | "\n", 30 | "\n", 31 | "\n", 32 | "AutoMM is a deep learning \"model zoo\" of model zoos. It can automatically build deep learning models that are suitable for multimodal datasets. You will only need to convert the data into the multimodal dataframe format,\n", 33 | "and AutoMM can predict the values of one column conditioned on the features from the other columns, including images, text, and tabular data."
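To make the expected input concrete, here is a minimal sketch of such a multimodal dataframe (the column names and values below are invented for illustration and are not part of the PetFinder data used next):

```python
import pandas as pd

# A hypothetical multimodal dataframe: one row per sample, one column per modality.
# 'image' holds file paths, 'description' holds free text, 'age' is numeric,
# and 'label' is the target column you would pass as AutoMMPredictor(label='label').
df = pd.DataFrame({
    'image': ['imgs/0.jpg', 'imgs/1.jpg'],
    'description': ['A playful young dog.', 'A calm senior cat.'],
    'age': [2, 9],
    'label': [1, 0],
})
```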
34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "id": "aca89d8e", 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "import os\n", 44 | "import numpy as np\n", 45 | "import warnings\n", 46 | "warnings.filterwarnings('ignore')\n", 47 | "np.random.seed(123)" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "id": "132f405a", 53 | "metadata": {}, 54 | "source": [ 55 | "## Dataset\n", 56 | "\n", 57 | "For demonstration, we use a simplified and subsampled version of [PetFinder dataset](https://www.kaggle.com/c/petfinder-adoption-prediction). The task is to predict the animals' adoption rates based on their adoption profile information. In this simplified version, the adoption speed is grouped into two categories: 0 (slow) and 1 (fast).\n", 58 | "\n", 59 | "To get started, let's download and prepare the dataset." 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "id": "9f9ebc6b", 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "download_dir = './ag_automm_tutorial'\n", 70 | "zip_file = 'https://automl-mm-bench.s3.amazonaws.com/petfinder_for_tutorial.zip'\n", 71 | "from autogluon.core.utils.loaders import load_zip\n", 72 | "load_zip.unzip(zip_file, unzip_dir=download_dir)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "id": "41c5556f", 78 | "metadata": {}, 79 | "source": [ 80 | "Next, we will load the CSV files." 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "id": "3471f1a4", 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "import pandas as pd\n", 91 | "dataset_path = download_dir + '/petfinder_for_tutorial'\n", 92 | "train_data = pd.read_csv(f'{dataset_path}/train.csv', index_col=0)\n", 93 | "test_data = pd.read_csv(f'{dataset_path}/test.csv', index_col=0)\n", 94 | "label_col = 'AdoptionSpeed'" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "id": "5bc9b057", 100 | "metadata": {}, 101 | "source": [ 102 | "We need to expand the image paths to load them in training." 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "id": "8bed2c8d", 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "image_col = 'Images'\n", 113 | "train_data[image_col] = train_data[image_col].apply(lambda ele: ele.split(';')[0]) # Use the first image for a quick tutorial\n", 114 | "test_data[image_col] = test_data[image_col].apply(lambda ele: ele.split(';')[0])\n", 115 | "\n", 116 | "\n", 117 | "def path_expander(path, base_folder):\n", 118 | " path_l = path.split(';')\n", 119 | " return ';'.join([os.path.abspath(os.path.join(base_folder, path)) for path in path_l])\n", 120 | "\n", 121 | "train_data[image_col] = train_data[image_col].apply(lambda ele: path_expander(ele, base_folder=dataset_path))\n", 122 | "test_data[image_col] = test_data[image_col].apply(lambda ele: path_expander(ele, base_folder=dataset_path))\n", 123 | "\n", 124 | "train_data[image_col].iloc[0]" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "id": "f198163e", 130 | "metadata": {}, 131 | "source": [ 132 | "Each animal's adoption profile includes pictures, a text description, and various tabular features such as age, breed, name, color, and more. Let's look at an example row of data and display the text description and a picture." 
133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "id": "130619e5", 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "example_row = train_data.iloc[0]\n", 143 | "\n", 144 | "example_row" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "id": "6a02db02", 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "example_row['Description']" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "id": "f974c735", 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "example_image = example_row[image_col]\n", 165 | "\n", 166 | "from IPython.display import Image, display\n", 167 | "pil_img = Image(filename=example_image)\n", 168 | "display(pil_img)" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "id": "208288ff", 174 | "metadata": {}, 175 | "source": [ 176 | "## Training\n", 177 | "Now let's fit the predictor with the training data. Here we set a tight time budget for a quick demo." 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "id": "18d3e9d0", 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "from autogluon.multimodal import AutoMMPredictor\n", 188 | "predictor = AutoMMPredictor(label=label_col)\n", 189 | "predictor.fit(\n", 190 | " train_data=train_data,\n", 191 | " time_limit=120, # seconds\n", 192 | ")" 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "id": "c50f4056", 198 | "metadata": {}, 199 | "source": [ 200 | "Under the hood, AutoMM automatically infers the problem type (classification or regression), detects the data modalities, selects the related models from the multimodal model pools, and trains the selected models. If multiple backbones are available, AutoMM appends a late-fusion model (MLP or transformer) on top of them.\n", 201 | "\n", 202 | "\n", 203 | "## Evaluation\n", 204 | "Then we can evaluate the predictor on the test data." 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "id": "34b661cb", 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "scores = predictor.evaluate(test_data, metrics=[\"roc_auc\"])\n", 215 | "scores" 216 | ] 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "id": "bac94f35", 221 | "metadata": {}, 222 | "source": [ 223 | "## Prediction\n", 224 | "Given a multimodal dataframe without the label column, we can predict the labels." 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "id": "a1dc282a", 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "predictions = predictor.predict(test_data.drop(columns=label_col))\n", 235 | "predictions[:5]" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "id": "51883f2e", 241 | "metadata": {}, 242 | "source": [ 243 | "For classification tasks, we can get the probabilities of all classes." 
244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "id": "3efbc093", 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "probas = predictor.predict_proba(test_data.drop(columns=label_col))\n", 254 | "probas[:5]" 255 | ] 256 | }, 257 | { 258 | "cell_type": "markdown", 259 | "id": "713f3dc1", 260 | "metadata": {}, 261 | "source": [ 262 | "Note that calling `.predict_proba()` on a regression task will throw an exception.\n", 263 | "\n", 264 | "\n", 265 | "## Extract Embeddings\n", 266 | "\n", 267 | "Extracting embeddings can also be useful in many cases, where we want to convert each sample (per row in the dataframe) into an embedding vector." 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": null, 273 | "id": "dabeb991", 274 | "metadata": {}, 275 | "outputs": [], 276 | "source": [ 277 | "embeddings = predictor.extract_embedding(test_data.drop(columns=label_col))\n", 278 | "embeddings.shape" 279 | ] 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "id": "cf76afe4", 284 | "metadata": {}, 285 | "source": [ 286 | "## Save and Load\n", 287 | "It is also convenient to save a predictor and re-load it." 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": null, 293 | "id": "9891ae24", 294 | "metadata": {}, 295 | "outputs": [], 296 | "source": [ 297 | "predictor.save('my_saved_dir')\n", 298 | "loaded_predictor = AutoMMPredictor.load('my_saved_dir')\n", 299 | "scores2 = loaded_predictor.evaluate(test_data, metrics=[\"roc_auc\"])\n", 300 | "scores2" 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "id": "928439e3", 306 | "metadata": {}, 307 | "source": [ 308 | "To customize AutoMM, please refer to [tutorials/multimodal/customization.ipynb](https://github.com/gidler/autogluon-tutorials/blob/main/tutorials/multimodal/customization.ipynb)." 309 | ] 310 | } 311 | ], 312 | "metadata": { 313 | "language_info": { 314 | "name": "python" 315 | } 316 | }, 317 | "nbformat": 4, 318 | "nbformat_minor": 5 319 | } -------------------------------------------------------------------------------- /tutorials/multimodal/multilingual_text.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "46960c97", 6 | "metadata": {}, 7 | "source": [ 8 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gidler/autogluon-tutorials/blob/main/tutorials/multimodal/multilingual_text.ipynb)\n", 9 | "[![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/gidler/autogluon-tutorials/blob/main/tutorials/multimodal/multilingual_text.ipynb)\n" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "id": "b89ccbf2", 16 | "outputs": [], 17 | "metadata": {}, 18 | "source": [ 19 | "# Uncomment the code below and run this cell if AutoGluon is not yet installed in the kernel.\n", 20 | "# !pip install autogluon==0.5.0 # These tutorials are based on AutoGluon v0.5.0 and might not work with different versions." 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "id": "5c9699aa", 26 | "metadata": {}, 27 | "source": [ 28 | "# AutoMM for Text - Multilingual Problems\n", 29 | "\n", 30 | "\n", 31 | "\n", 32 | "People around the world speak many languages. 
According to [SIL International](https://en.wikipedia.org/wiki/SIL_International)'s [Ethnologue: Languages of the World](https://en.wikipedia.org/wiki/Ethnologue), \n", 33 | "there are more than **7,100** spoken and signed languages. In fact, web data nowadays are highly multilingual and lots of \n", 34 | "real-world problems involve text written in languages other than English.\n", 35 | "\n", 36 | "In this tutorial, we introduce how `AutoMMPredictor` can help you build multilingual models. For the purpose of demonstration, \n", 37 | "we use the [Cross-Lingual Amazon Product Review Sentiment](https://webis.de/data/webis-cls-10.html) dataset, which \n", 38 | "comprises about 800,000 Amazon product reviews in four languages: English, German, French, and Japanese. \n", 39 | "We will demonstrate how to use AutoGluon Text to build sentiment classification models on the German fold of this dataset in two ways:\n", 40 | "\n", 41 | "- Finetune the German BERT\n", 42 | "- Cross-lingual transfer from English to German\n", 43 | "\n", 44 | "## Load Dataset\n", 45 | "\n", 46 | "The [Cross-Lingual Amazon Product Review Sentiment](https://webis.de/data/webis-cls-10.html) dataset contains Amazon product reviews in four languages. \n", 47 | "Here, we load the English and German fold of the dataset. In the label column, `0` means negative sentiment and `1` means positive sentiment." 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "id": "8b464cd3", 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "!wget https://automl-mm-bench.s3.amazonaws.com/multilingual-datasets/amazon_review_sentiment_cross_lingual.zip -O amazon_review_sentiment_cross_lingual.zip\n", 58 | "!unzip -o amazon_review_sentiment_cross_lingual.zip -d ." 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "id": "cd439845", 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "import pandas as pd\n", 69 | "import warnings\n", 70 | "warnings.filterwarnings('ignore')\n", 71 | "\n", 72 | "train_de_df = pd.read_csv('amazon_review_sentiment_cross_lingual/de_train.tsv',\n", 73 | " sep='\\t', header=None, names=['label', 'text']) \\\n", 74 | " .sample(1000, random_state=123)\n", 75 | "train_de_df.reset_index(inplace=True, drop=True)\n", 76 | "\n", 77 | "test_de_df = pd.read_csv('amazon_review_sentiment_cross_lingual/de_test.tsv',\n", 78 | " sep='\\t', header=None, names=['label', 'text']) \\\n", 79 | " .sample(200, random_state=123)\n", 80 | "test_de_df.reset_index(inplace=True, drop=True)\n", 81 | "print(train_de_df)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "id": "51c80163", 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "train_en_df = pd.read_csv('amazon_review_sentiment_cross_lingual/en_train.tsv',\n", 92 | " sep='\\t',\n", 93 | " header=None,\n", 94 | " names=['label', 'text']) \\\n", 95 | " .sample(1000, random_state=123)\n", 96 | "train_en_df.reset_index(inplace=True, drop=True)\n", 97 | "\n", 98 | "test_en_df = pd.read_csv('amazon_review_sentiment_cross_lingual/en_test.tsv',\n", 99 | " sep='\\t',\n", 100 | " header=None,\n", 101 | " names=['label', 'text']) \\\n", 102 | " .sample(200, random_state=123)\n", 103 | "test_en_df.reset_index(inplace=True, drop=True)\n", 104 | "print(train_en_df)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "id": "143f98af", 110 | "metadata": {}, 111 | "source": [ 112 | "## Finetune the German BERT\n", 113 | "\n", 114 | "Our first approach is to finetune 
the [German BERT model](https://www.deepset.ai/german-bert) pretrained by deepset. \n", 115 | "Since `AutoMMPredictor` integrates with [Huggingface/Transformers](https://huggingface.co/docs/transformers/index) (as explained in [tutorials/text_prediction/customization.ipynb](https://github.com/gidler/autogluon-tutorials/blob/main/tutorials/text_prediction/customization.ipynb)), \n", 116 | "we directly load the German BERT model available in Huggingface/Transformers, using the key [bert-base-german-cased](https://huggingface.co/bert-base-german-cased). \n", 117 | "To simplify the experiment, we also just finetune for 4 epochs." 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "id": "8a4286b8", 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "from autogluon.multimodal import AutoMMPredictor\n", 128 | "\n", 129 | "predictor = AutoMMPredictor(label='label')\n", 130 | "predictor.fit(train_de_df,\n", 131 | " hyperparameters={\n", 132 | " 'model.hf_text.checkpoint_name': 'bert-base-german-cased',\n", 133 | " 'optimization.max_epochs': 4\n", 134 | " })" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "id": "d94b8ec8", 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "score = predictor.evaluate(test_de_df)\n", 145 | "print('Score on the German Testset:')\n", 146 | "print(score)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "id": "b557bef5", 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "score = predictor.evaluate(test_en_df)\n", 157 | "print('Score on the English Testset:')\n", 158 | "print(score)" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "id": "f74b356f", 164 | "metadata": {}, 165 | "source": [ 166 | "We find that the model achieves good performance on the German dataset but performs poorly on the English dataset. \n", 167 | "Next, we will show how to enable cross-lingual transfer so you can get a model that can magically work for **both German and English**.\n", 168 | "\n", 169 | "## Cross-lingual Transfer\n", 170 | "\n", 171 | "In real-world scenarios, it is quite common to have trained a model for English and want to extend it to support other languages such as German. \n", 172 | "This setting is also known as cross-lingual transfer. One way to solve the problem is to apply a machine translation model to translate the sentences from the \n", 173 | "other language (e.g., German) to English and apply the English model.\n", 174 | "However, as shown in [\"Unsupervised Cross-lingual Representation Learning at Scale\"](https://arxiv.org/pdf/1911.02116.pdf), \n", 175 | "there is a better and more cost-effective way to achieve cross-lingual transfer, enabled via large-scale multilingual pretraining.\n", 176 | "The authors showed that, thanks to large-scale pretraining, the backbone (called XLM-R) is able to conduct *zero-shot* cross-lingual transfer, \n", 177 | "meaning that you can directly apply a model trained on the English dataset to datasets in other languages. \n", 178 | "It also outperforms the \"TRANSLATE-TEST\" baseline, which translates data from other languages to English and applies the English model. \n", 179 | "\n", 180 | "In AutoGluon, you can just turn on `presets=\"multilingual\"` in AutoMMPredictor to load a backbone that is suitable for zero-shot transfer. 
\n", 181 | "Internally, we will automatically use state-of-the-art models like [DeBERTa-V3](https://arxiv.org/abs/2111.09543)." 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "id": "eeea41c0", 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "from autogluon.multimodal import AutoMMPredictor\n", 192 | "\n", 193 | "predictor = AutoMMPredictor(label='label')\n", 194 | "predictor.fit(train_en_df,\n", 195 | " presets='multilingual',\n", 196 | " hyperparameters={\n", 197 | " 'optimization.max_epochs': 4\n", 198 | " })" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "id": "03d1856d", 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [ 208 | "score_in_en = predictor.evaluate(test_en_df)\n", 209 | "print('Score in the English Testset:')\n", 210 | "print(score_in_en)" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "id": "0351d82d", 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "score_in_de = predictor.evaluate(test_de_df)\n", 221 | "print('Score in the German Testset:')\n", 222 | "print(score_in_de)" 223 | ] 224 | }, 225 | { 226 | "cell_type": "markdown", 227 | "id": "5b3d3f58", 228 | "metadata": {}, 229 | "source": [ 230 | "We can see that the model works for both German and English!\n", 231 | "\n", 232 | "Let's also inspect the model's performance on Japanese:" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "id": "3c11579a", 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "test_jp_df = pd.read_csv('amazon_review_sentiment_cross_lingual/jp_test.tsv',\n", 243 | " sep='\\t', header=None, names=['label', 'text']) \\\n", 244 | " .sample(200, random_state=123)\n", 245 | "test_jp_df.reset_index(inplace=True, drop=True)\n", 246 | "print(test_jp_df)" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "id": "731f7683", 253 | "metadata": {}, 254 | "outputs": [], 255 | "source": [ 256 | "print('Negative labe ratio of the Japanese Testset=', test_jp_df['label'].value_counts()[0] / len(test_jp_df))\n", 257 | "score_in_jp = predictor.evaluate(test_jp_df)\n", 258 | "print('Score in the Japanese Testset:')\n", 259 | "print(score_in_jp)" 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "id": "1a77bdb6", 265 | "metadata": {}, 266 | "source": [ 267 | "Amazingly, the model also works for Japanese!\n", 268 | "\n", 269 | "To customize AutoMM, please refer to [tutorials/multimodal/customization.ipynb](https://github.com/gidler/autogluon-tutorials/blob/main/tutorials/multimodal/customization.ipynb)." 
270 | ] 271 | } 272 | ], 273 | "metadata": { 274 | "language_info": { 275 | "name": "python" 276 | } 277 | }, 278 | "nbformat": 4, 279 | "nbformat_minor": 5 280 | } -------------------------------------------------------------------------------- /tutorials/multimodal/multimodal_text_tabular.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "d572005a", 6 | "metadata": {}, 7 | "source": [ 8 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gidler/autogluon-tutorials/blob/main/tutorials/multimodal/multimodal_text_tabular.ipynb)\n", 9 | "[![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/gidler/autogluon-tutorials/blob/main/tutorials/multimodal/multimodal_text_tabular.ipynb)\n" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "id": "436f8df0", 16 | "outputs": [], 17 | "metadata": {}, 18 | "source": [ 19 | "# Uncomment the code below and run this cell if AutoGluon is not yet installed in the kernel.\n", 20 | "# !pip install autogluon==0.5.0 # These tutorials are based on AutoGluon v0.5.0 and might not work with different versions." 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "id": "9c29f998", 26 | "metadata": {}, 27 | "source": [ 28 | "# AutoMM for Text + Tabular - Quick Start\n", 29 | "\n", 30 | "\n", 31 | "\n", 32 | "In many applications, text data may be mixed with numeric/categorical data. \n", 33 | "AutoGluon's `AutoMMPredictor` can train a single neural network that jointly operates on multiple feature types, \n", 34 | "including text, categorical, and numerical columns. The general idea is to embed the text, categorical and numeric fields \n", 35 | "separately and fuse these features across modalities. This tutorial demonstrates such an application." 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "id": "26af2ad7", 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "import numpy as np\n", 46 | "import pandas as pd\n", 47 | "import warnings\n", 48 | "import os\n", 49 | "\n", 50 | "warnings.filterwarnings('ignore')\n", 51 | "np.random.seed(123)" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "id": "842d5267", 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "!python3 -m pip install openpyxl" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "id": "eb6bd1e7", 67 | "metadata": {}, 68 | "source": [ 69 | "## Book Price Prediction Data\n", 70 | "\n", 71 | "For demonstration, we use the book price prediction dataset from the [MachineHack Book Price Prediction Hackathon](https://www.machinehack.com/hackathons/predict_the_price_of_books/overview). Our goal is to predict a book's price given various features like its author, the abstract, the book's rating, etc. 
72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "id": "1baf8615", 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "!mkdir -p price_of_books\n", 82 | "!wget https://automl-mm-bench.s3.amazonaws.com/machine_hack_competitions/predict_the_price_of_books/Data.zip -O price_of_books/Data.zip\n", 83 | "!cd price_of_books && unzip -o Data.zip\n", 84 | "!ls price_of_books/Participants_Data" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "id": "11e15914", 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "train_df = pd.read_excel(os.path.join('price_of_books', 'Participants_Data', 'Data_Train.xlsx'), engine='openpyxl')\n", 95 | "train_df.head()" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "id": "41f1f9f5", 101 | "metadata": {}, 102 | "source": [ 103 | "We do some basic preprocessing to convert `Reviews` and `Ratings` in the data table to numeric values, and we transform prices to a log-scale." 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "id": "122d76ae", 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "def preprocess(df):\n", 114 | " df = df.copy(deep=True)\n", 115 | " df.loc[:, 'Reviews'] = pd.to_numeric(df['Reviews'].apply(lambda ele: ele[:-len(' out of 5 stars')]))\n", 116 | " df.loc[:, 'Ratings'] = pd.to_numeric(df['Ratings'].apply(lambda ele: ele.replace(',', '')[:-len(' customer reviews')]))\n", 117 | " df.loc[:, 'Price'] = np.log(df['Price'] + 1)\n", 118 | " return df" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "id": "6246d2a9", 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "train_subsample_size = 1500 # subsample for faster demo, you can try setting to larger values\n", 129 | "test_subsample_size = 5\n", 130 | "train_df = preprocess(train_df)\n", 131 | "train_data = train_df.iloc[100:].sample(train_subsample_size, random_state=123)\n", 132 | "test_data = train_df.iloc[:100].sample(test_subsample_size, random_state=245)\n", 133 | "train_data.head()" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "id": "a5d9f56a", 139 | "metadata": {}, 140 | "source": [ 141 | "## Training\n", 142 | "\n", 143 | "We can simply create an AutoMMPredictor and call `predictor.fit()` to train a model that operates across all types of features. \n", 144 | "Internally, the neural network will be automatically generated based on the inferred data type of each feature column. \n", 145 | "To save time, we subsample the data and only train for three minutes." 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "id": "d926d711", 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "from autogluon.multimodal import AutoMMPredictor\n", 156 | "time_limit = 3 * 60 # set to larger value in your applications\n", 157 | "predictor = AutoMMPredictor(label='Price', path='automm_text_book_price_prediction')\n", 158 | "predictor.fit(train_data, time_limit=time_limit)" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "id": "dbb6b7a3", 164 | "metadata": {}, 165 | "source": [ 166 | "## Prediction\n", 167 | "\n", 168 | "We can easily obtain predictions and extract data embeddings using the AutoMMPredictor. 
169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "id": "d3cf2d8e", 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "predictions = predictor.predict(test_data)\n", 179 | "print('Predictions:')\n", 180 | "print('------------')\n", 181 | "print(np.exp(predictions) - 1)\n", 182 | "print()\n", 183 | "print('True Value:')\n", 184 | "print('------------')\n", 185 | "print(np.exp(test_data['Price']) - 1)\n" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "id": "c2aa9df3", 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "performance = predictor.evaluate(test_data)\n", 196 | "print(performance)" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "id": "8e5fc5bf", 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "embeddings = predictor.extract_embedding(test_data)\n", 207 | "embeddings.shape" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "id": "59c005b0", 213 | "metadata": {}, 214 | "source": [ 215 | "## What's happening inside?\n", 216 | "\n", 217 | "\n", 218 | "\n", 219 | "Internally, we use different networks to encode the text columns, categorical columns, and numerical columns. The features generated by individual networks are aggregated by a late-fusion aggregator. The aggregator can output either logits or score predictions. The architecture can be illustrated as follows:\n", 220 | "\n", 221 | "![Multimodal Network with Late Fusion](https://autogluon-text-data.s3.amazonaws.com/figures/fuse-late.png)\n", 222 | "\n", 223 | "\n", 224 | "\n", 225 | "\n", 226 | "Here, we use a pretrained NLP backbone to extract the text features and then use two other towers to extract features from the categorical and numerical columns.\n", 227 | "\n", 228 | "In addition, to deal with multiple text fields, we separate these fields with the `[SEP]` token and alternate 0s and 1s as the segment IDs, which is shown as follows:\n", 229 | "\n", 230 | "![Preprocessing](https://autogluon-text-data.s3.amazonaws.com/figures/preprocess.png)\n", 231 | "\n", 232 | "\n", 233 | "\n", 234 | "\n", 235 | "## How does this compare with TabularPredictor?\n", 236 | "\n", 237 | "Note that `TabularPredictor` can also handle data tables with text, numeric, and categorical columns, but it uses an ensemble of many types of models and may featurize text. `AutoMMPredictor` instead directly fuses multiple neural network models and handles \n", 238 | "raw text (while also being capable of handling additional numerical/categorical columns). We generally recommend `TabularPredictor` if your table contains mainly numeric/categorical columns and `AutoMMPredictor` if your table contains mainly text columns, \n", 239 | "but you may easily try both and we encourage this. In fact, `TabularPredictor.fit(..., hyperparameters='multimodal')` will train an AutoMMPredictor along with many other tabular models and ensemble them together. 
\n", 240 | "Refer to the tutorial \"[tutorials/tabular_prediction/tabular-multimodal-text-others.ipynb](https://github.com/gidler/autogluon-tutorials/blob/main/tutorials/tabular_prediction/tabular-multimodal-text-others.ipynb)\" for more details.\n", 241 | "\n", 242 | "## Other Examples\n", 243 | "\n", 244 | "You may go to https://github.com/awslabs/autogluon/tree/master/examples/automm to explore other AutoMMPredictor examples.\n", 245 | "\n", 246 | "## Customization\n", 247 | "To customize AutoMM, please refer to [tutorials/multimodal/customization.ipynb](https://github.com/gidler/autogluon-tutorials/blob/main/tutorials/multimodal/customization.ipynb)." 248 | ] 249 | } 250 | ], 251 | "metadata": { 252 | "language_info": { 253 | "name": "python" 254 | } 255 | }, 256 | "nbformat": 4, 257 | "nbformat_minor": 5 258 | } -------------------------------------------------------------------------------- /tutorials/object_detection/beginner.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "146edb65", 6 | "metadata": {}, 7 | "source": [ 8 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gidler/autogluon-tutorials/blob/main/tutorials/object_detection/beginner.ipynb)\n", 9 | "[![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/gidler/autogluon-tutorials/blob/main/tutorials/object_detection/beginner.ipynb)\n" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "id": "c45ce714", 16 | "outputs": [], 17 | "metadata": {}, 18 | "source": [ 19 | "# Uncomment the code below and run this cell if AutoGluon is not yet installed in the kernel.\n", 20 | "# !pip install autogluon==0.5.0 # These tutorials are based on AutoGluon v0.5.0 and might not work with different versions." 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "id": "90c28c0c", 26 | "metadata": {}, 27 | "source": [ 28 | "# Object Detection - Quick Start\n", 29 | "\n", 30 | "\n", 31 | "\n", 32 | "Object detection is the process of identifying and localizing objects in an image and is an important task in computer vision. Follow this tutorial to learn how to use AutoGluon for object detection.\n", 33 | "\n", 34 | "**Tip**: If you are new to AutoGluon, review [tutorials/image_prediction/beginner.ipynb](https://github.com/gidler/autogluon-tutorials/blob/main/tutorials/image_prediction/beginner.ipynb) first to learn the basics of the AutoGluon API.\n", 35 | "\n", 36 | "Our goal is to detect motorbike in images by [YOLOv3 model](https://pjreddie.com/media/files/papers/YOLOv3.pdf). A tiny dataset is collected from VOC dataset, which only contains the motorbike category. The model pretrained on the COCO dataset is used to fine-tune our small dataset. With the help of AutoGluon, we are able to try many models with different hyperparameters automatically, and return the best one as our final model.\n", 37 | "\n", 38 | "To start, import ObjectDetector:" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "id": "1bccb5e8", 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "from autogluon.vision import ObjectDetector" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "id": "561db3bb", 54 | "metadata": {}, 55 | "source": [ 56 | "## Tiny_motorbike Dataset\n", 57 | "We collect a toy dataset for detecting motorbikes in images. 
From the VOC dataset, images are randomly selected for training, validation, and testing: 120 images for training, 50 for validation, and 50 for testing. This tiny dataset follows the same format as VOC.\n", 58 | "\n", 59 | "Using the commands below, we can download this dataset, which is only 23M. The unzipped folder is named `tiny_motorbike`. In any case, the dataset helper can perform the download and extraction automatically and load the dataset in the expected detection format." 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "id": "4422d0ba", 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "url = 'https://autogluon.s3.amazonaws.com/datasets/tiny_motorbike.zip'\n", 70 | "dataset_train = ObjectDetector.Dataset.from_voc(url, splits='trainval')" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "id": "2977de62", 76 | "metadata": {}, 77 | "source": [ 78 | "## Fit Models by AutoGluon\n", 79 | "In this section, we demonstrate how to apply AutoGluon to fit our detection models. We use mobilenet as the backbone for the YOLOv3 model. Two different learning rates are used to fine-tune the network. The best model is the one that obtains the best performance on the validation dataset. You can also try using more networks and hyperparameters to create a larger search space.\n", 80 | "\n", 81 | "We `fit` a detector using AutoGluon as follows. In each experiment (one trial in our search space), we train the model for 5 epochs to keep the tutorial runtime short." 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "id": "10a642e2", 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "time_limit = 60*30 # at most 0.5 hour\n", 92 | "detector = ObjectDetector()\n", 93 | "hyperparameters = {'epochs': 5, 'batch_size': 8}\n", 94 | "hyperparameter_tune_kwargs = {'num_trials': 2}\n", 95 | "detector.fit(dataset_train, time_limit=time_limit, hyperparameters=hyperparameters, hyperparameter_tune_kwargs=hyperparameter_tune_kwargs)\n" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "id": "eec8b6ec", 101 | "metadata": {}, 102 | "source": [ 103 | "Note that `num_trials=2` above is only used to speed up the tutorial. In normal\n", 104 | "practice, it is common to only use `time_limit` and drop `num_trials`. Also note\n", 105 | "that hyperparameter tuning defaults to random search.\n", 106 | "\n", 107 | "After fitting, AutoGluon automatically returns the best model among all models in the search space. From the output, we know the best model is the one trained with the second learning rate. To see how well the returned model performed on the test dataset, call `detector.evaluate()`." 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "id": "481fc809", 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "dataset_test = ObjectDetector.Dataset.from_voc(url, splits='test')\n", 118 | "\n", 119 | "test_map = detector.evaluate(dataset_test)\n", 120 | "print(\"mAP on test dataset: {}\".format(test_map[1][-1]))" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "id": "ccf919d0", 126 | "metadata": {}, 127 | "source": [ 128 | "Below, we randomly select an image from the test dataset and show the predicted class, box, and probability over the original image, stored in the `predict_class`, `predict_rois`, and `predict_score` columns, respectively. 
You can interpret `predict_rois` as a dict of (`xmin`, `ymin`, `xmax`, `ymax`) coordinates relative to the original image size." 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "id": "2375dfcf", 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "image_path = dataset_test.iloc[0]['image']\n", 139 | "result = detector.predict(image_path)\n", 140 | "print(result)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "id": "cc3b9e47", 146 | "metadata": {}, 147 | "source": [ 148 | "Prediction with multiple images is permitted:" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "id": "d65f38ff", 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "bulk_result = detector.predict(dataset_test)\n", 159 | "print(bulk_result)" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "id": "b868e4f2", 165 | "metadata": {}, 166 | "source": [ 167 | "We can also save the trained model and use it later." 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "id": "06486ab7", 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "savefile = 'detector.ag'\n", 178 | "detector.save(savefile)\n", 179 | "new_detector = ObjectDetector.load(savefile)" 180 | ] 181 | } 182 | ], 183 | "metadata": { 184 | "language_info": { 185 | "name": "python" 186 | } 187 | }, 188 | "nbformat": 4, 189 | "nbformat_minor": 5 190 | } -------------------------------------------------------------------------------- /tutorials/object_detection/dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "d3fd1fcc", 6 | "metadata": {}, 7 | "source": [ 8 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gidler/autogluon-tutorials/blob/main/tutorials/object_detection/dataset.ipynb)\n", 9 | "[![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/gidler/autogluon-tutorials/blob/main/tutorials/object_detection/dataset.ipynb)\n" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "id": "bd31f124", 16 | "outputs": [], 17 | "metadata": {}, 18 | "source": [ 19 | "# Uncomment the code below and run this cell if AutoGluon is not yet installed in the kernel.\n", 20 | "# !pip install autogluon==0.5.0 # These tutorials are based on AutoGluon v0.5.0 and might not work with different versions." 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "id": "f82a83a6", 26 | "metadata": {}, 27 | "source": [ 28 | "# Object Detection - Prepare Dataset for Object Detector\n", 29 | "\n", 30 | "\n", 31 | "\n", 32 | "Preparing a dataset for object detection is slightly different from, and more difficult than, preparing one for image prediction.\n", 33 | "\n", 34 | "\n", 35 | "Our goal in this tutorial is to introduce the simplest methods to create or load an object detection dataset for `autogluon.vision.ObjectDetector`.\n", 36 | "\n", 37 | "There are generally two ways to load a dataset for ObjectDetector:\n", 38 | "\n", 39 | "- Load an existing object detection dataset, in VOC or COCO formats, downloaded or exported by other labeling tools.\n", 40 | "\n", 41 | "- Manually convert raw annotations from any format; knowing this, you will be able to deal with arbitrary dataset formats."
42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "id": "9db543c7", 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "%matplotlib inline\n", 52 | "import autogluon.core as ag\n", 53 | "from autogluon.vision import ObjectDetector" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "id": "5ee45d9b", 59 | "metadata": {}, 60 | "source": [ 61 | "## Load an existing object detection dataset\n", 62 | "Pascal VOC and MS COCO are the two most popular data formats for object detection. Most publicly available object detection datasets follow one of these two formats. In this tutorial we will not cover the details. You may view the original introduction for [VOC](http://host.robots.ox.ac.uk/pascal/VOC/) and [COCO](https://cocodataset.org/#home).\n", 63 | "\n", 64 | "To distinguish these two formats, you can either refer to the labeling tool or check the folder structure. Usually annotations in VOC format are individual `xml` files, while the COCO format uses a single `json` file to store all annotations." 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "id": "743d6bf8", 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "url = 'https://autogluon.s3.amazonaws.com/datasets/tiny_motorbike.zip'\n", 75 | "dataset_train = ObjectDetector.Dataset.from_voc(url, splits='trainval')\n", 76 | "# or load from coco format, skip as it's too big to download\n", 77 | "# dataset_train = ObjectDetector.Dataset.from_coco(annotation_json_file, root='/path/to/root')" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "id": "44a245db", 83 | "metadata": {}, 84 | "source": [ 85 | "## Manually convert any format to an AutoGluon object detector dataset\n", 86 | "\n", 87 | "We will walk you through creating a dataset manually to help you understand the underlying data; this does not mean you have to do so. We highly recommend using a dedicated labeling tool for object detection if you want to create a dataset of your own. Labeling bounding boxes is time-consuming, so a good UI/UX design significantly reduces the effort.\n", 88 | "\n", 89 | "In the following section, we will use a single image and add annotations manually for all three major objects."
90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "id": "b2148e11", 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "ag.utils.download('https://raw.githubusercontent.com/zhreshold/mxnet-ssd/master/data/demo/dog.jpg', path='dog.jpg')\n", 100 | "import matplotlib.image as mpimg\n", 101 | "import matplotlib.pyplot as plt\n", 102 | "img = mpimg.imread('dog.jpg')\n", 103 | "imgplot = plt.imshow(img)\n", 104 | "plt.grid()\n", 105 | "plt.show()" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "id": "997a9a01", 111 | "metadata": {}, 112 | "source": [ 113 | "With the grid on, we can roughly annotate this image like this:" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "id": "fa60733e", 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "import pandas as pd\n", 124 | "\n", 125 | "class NaiveDetectionGT:\n", 126 | " def __init__(self, image):\n", 127 | " self._objects = []\n", 128 | " self.image = image\n", 129 | " img = mpimg.imread(image)\n", 130 | " self.w = img.shape[1]\n", 131 | " self.h = img.shape[0]\n", 132 | "\n", 133 | " def add_object(self, name, xmin, ymin, xmax, ymax, difficult=0):\n", 134 | " self._objects.append({'image': self.image, 'class': name,\n", 135 | " 'xmin': xmin / self.w, 'ymin': ymin / self.h,\n", 136 | " 'xmax': xmax / self.w, 'ymax': ymax / self.h, 'difficult': difficult})\n", 137 | "\n", 138 | " @property\n", 139 | " def df(self):\n", 140 | " return pd.DataFrame(self._objects)\n", 141 | "\n", 142 | "gt = NaiveDetectionGT('dog.jpg')\n", 143 | "gt.add_object('dog', 140, 220, 300, 540)\n", 144 | "gt.add_object('bicycle', 120, 140, 580, 420)\n", 145 | "gt.add_object('car', 460, 70, 680, 170)\n", 146 | "df = gt.df\n", 147 | "df" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "id": "c0c8d408", 153 | "metadata": {}, 154 | "source": [ 155 | "The `df` is a valid dataset and can be used by the `ObjectDetector.fit` function. Internally it will be converted to an object detection dataset, or you can convert it manually." 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "id": "6a1633f2", 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "dataset = ObjectDetector.Dataset(df, classes=df['class'].unique().tolist())\n", 166 | "dataset.show_images(nsample=1, ncol=1)" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "id": "9483b403", 172 | "metadata": {}, 173 | "source": [ 174 | "Congratulations, you can now proceed to [tutorials/object_detection/beginner.ipynb](https://github.com/gidler/autogluon-tutorials/blob/main/tutorials/object_detection/beginner.ipynb) to start training the `ObjectDetector`."
175 | ] 176 | } 177 | ], 178 | "metadata": { 179 | "language_info": { 180 | "name": "python" 181 | } 182 | }, 183 | "nbformat": 4, 184 | "nbformat_minor": 5 185 | } -------------------------------------------------------------------------------- /tutorials/tabular_prediction/tabular-custom-metric.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "c0cc5007", 6 | "metadata": {}, 7 | "source": [ 8 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gidler/autogluon-tutorials/blob/main/tutorials/tabular_prediction/tabular-custom-metric.ipynb)\n", 9 | "[![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/gidler/autogluon-tutorials/blob/main/tutorials/tabular_prediction/tabular-custom-metric.ipynb)\n" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "id": "0bb5c53d", 16 | "outputs": [], 17 | "metadata": {}, 18 | "source": [ 19 | "# Uncomment the code below and run this cell if AutoGluon is not yet installed in the kernel.\n", 20 | "# !pip install autogluon==0.5.0 # These tutorials are based on AutoGluon v0.5.0 and might not work with different versions." 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "id": "a52590e0", 26 | "metadata": {}, 27 | "source": [ 28 | "# Adding a custom metric to AutoGluon\n", 29 | "\n", 30 | "\n", 31 | "\n", 32 | "**Tip**: If you are new to AutoGluon, review [tutorials/tabular_prediction/tabular-quickstart.ipynb](https://github.com/gidler/autogluon-tutorials/blob/main/tutorials/tabular_prediction/tabular-quickstart.ipynb) to learn the basics of the AutoGluon API.\n", 33 | "\n", 34 | "This tutorial describes how to add a custom evaluation metric to AutoGluon that is used to inform validation scores, model ensembling, hyperparameter tuning, and more.\n", 35 | "\n", 36 | "In this example, we show a variety of evaluation metrics and how to convert them to an AutoGluon Scorer, which can then be passed to AutoGluon models and predictors.\n", 37 | "\n", 38 | "First, we will randomly generate 10 ground truth labels and predictions, and show how to calculate metric scores from them." 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "id": "2d822514", 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "import numpy as np\n", 49 | "y_true = np.random.randint(low=0, high=2, size=10)\n", 50 | "y_pred = np.random.randint(low=0, high=2, size=10)\n", 51 | "\n", 52 | "print(f'y_true: {y_true}')\n", 53 | "print(f'y_pred: {y_pred}')" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "id": "5e8c94bf", 59 | "metadata": {}, 60 | "source": [ 61 | "## Ensuring Metric is Serializable\n", 62 | "You must define your custom metric in a separate python file that is imported for it to be serializable (able to be pickled).\n", 63 | "If this is not done, AutoGluon will crash during fit when trying to parallelize model training with Ray.\n", 64 | "In the below example, you would want to create a new python file such as `my_metrics.py` with `ag_accuracy_scorer` defined in it,\n", 65 | "and then use it via `from my_metrics import ag_accuracy_scorer`.\n", 66 | "\n", 67 | "If your metric is not serializable, you will get many errors similar to: `_pickle.PicklingError: Can't pickle`. 
Refer to https://github.com/awslabs/autogluon/issues/1637 for an example.\n", 68 | "\n", 69 | "The custom metrics in this tutorial are **not** serializable for ease of demonstration. If the `best_quality` preset were used, it would crash.\n", 70 | "\n", 71 | "## Custom Accuracy Metric\n", 72 | "We will start with calculating accuracy. A prediction is correct if the predicted value is the same as the true value; otherwise it is wrong." 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "id": "3496a87c", 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "import sklearn.metrics\n", 83 | "sklearn.metrics.accuracy_score(y_true, y_pred)" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "id": "9c8cc6c2", 89 | "metadata": {}, 90 | "source": [ 91 | "Now, let's convert this evaluation metric to an AutoGluon Scorer.\n", 92 | "\n", 93 | "We do this by calling `autogluon.core.metrics.make_scorer`." 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "id": "07d90122", 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "from autogluon.core.metrics import make_scorer\n", 104 | "ag_accuracy_scorer = make_scorer(name='accuracy',\n", 105 | " score_func=sklearn.metrics.accuracy_score,\n", 106 | " optimum=1,\n", 107 | " greater_is_better=True)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "id": "380cfe05", 113 | "metadata": {}, 114 | "source": [ 115 | "When creating the Scorer, we need to specify a name for the Scorer. This does not need to be any particular value, but is used when printing information about the Scorer during training.\n", 116 | "\n", 117 | "Next, we specify the `score_func`. This is the function we want to wrap; in this case, sklearn's `accuracy_score` function.\n", 118 | "\n", 119 | "We then need to specify the optimum value. This is necessary when calculating error as opposed to score. Error is calculated as `optimum - score`. It is also useful for identifying when a score is optimal and cannot be improved.\n", 120 | "\n", 121 | "Finally, we need to specify `greater_is_better`. In this case, `greater_is_better=True` because the best value returned is 1, and the worst value returned is less than 1 (0). It is very important to set this value correctly; otherwise, AutoGluon will try to optimize for the **worst** model instead of the best.\n", 122 | "\n", 123 | "Once created, the AutoGluon Scorer can be called in the same fashion as the original metric." 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "id": "bdf8f0ed", 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "ag_accuracy_scorer(y_true, y_pred)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "id": "3d386227", 139 | "metadata": {}, 140 | "source": [ 141 | "## Custom Mean Squared Error Metric\n", 142 | "\n", 143 | "Next, let's show examples of how to convert regression metrics into Scorers.\n", 144 | "\n", 145 | "First we generate random ground truth labels and their predictions; however, this time they are floats instead of integers."
146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "id": "1e8a600c", 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "y_true = np.random.rand(10)\n", 156 | "y_pred = np.random.rand(10)\n", 157 | "\n", 158 | "print(f'y_true: {y_true}')\n", 159 | "print(f'y_pred: {y_pred}')" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "id": "7074a97e", 165 | "metadata": {}, 166 | "source": [ 167 | "A common regression metric is Mean Squared Error:" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "id": "36e10978", 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "sklearn.metrics.mean_squared_error(y_true, y_pred)" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "id": "ed8ff603", 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "ag_mean_squared_error_scorer = make_scorer(name='mean_squared_error',\n", 188 | " score_func=sklearn.metrics.mean_squared_error,\n", 189 | " optimum=0,\n", 190 | " greater_is_better=False)" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "id": "b6dd3192", 196 | "metadata": {}, 197 | "source": [ 198 | "In this case, optimum is 0 because this is an error metric.\n", 199 | "\n", 200 | "Additionally, `greater_is_better=False` because sklearn reports error as positive values, and the lower the value is, the better.\n", 201 | "\n", 202 | "A very important point about AutoGluon Scorers is that internally, they will always report scores in `greater_is_better=True` form. This means if the original metric was `greater_is_better=False`, AutoGluon's Scorer will flip the value. Therefore, error will be represented as negative values.\n", 203 | "\n", 204 | "This is done to ensure consistency between different metrics." 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "id": "539f394e", 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "ag_mean_squared_error_scorer(y_true, y_pred)" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "id": "850f263f", 220 | "metadata": {}, 221 | "source": [ 222 | "We can also specify metrics outside of sklearn. For example, below is a minimal implementation of mean squared error:" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "id": "0d8738b9", 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [ 232 | "def mse_func(y_true: np.ndarray, y_pred: np.ndarray) -> float:\n", 233 | " return ((y_true - y_pred) ** 2).mean()\n", 234 | "\n", 235 | "mse_func(y_true, y_pred)" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "id": "576d2e08", 241 | "metadata": {}, 242 | "source": [ 243 | "All that is required is that the function take two arguments: `y_true`, and `y_pred` (or `y_pred_proba`), as numpy arrays, and return a float value.\n", 244 | "\n", 245 | "With the same code as before, we can create an AutoGluon Scorer." 
246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "id": "1b4d09ee", 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [ 255 | "ag_mean_squared_error_custom_scorer = make_scorer(name='mean_squared_error',\n", 256 | " score_func=mse_func,\n", 257 | " optimum=0,\n", 258 | " greater_is_better=False)\n", 259 | "ag_mean_squared_error_custom_scorer(y_true, y_pred)" 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "id": "807fc591", 265 | "metadata": {}, 266 | "source": [ 267 | "## Custom ROC AUC Metric\n", 268 | "\n", 269 | "Here we show an example of a thresholding metric, `roc_auc`. A thresholding metric cares about the relative ordering of predictions, but not their absolute values." 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "id": "6fba1f1e", 276 | "metadata": {}, 277 | "outputs": [], 278 | "source": [ 279 | "y_true = np.random.randint(low=0, high=2, size=10)\n", 280 | "y_pred_proba = np.random.rand(10)\n", 281 | "\n", 282 | "print(f'y_true: {y_true}')\n", 283 | "print(f'y_pred_proba: {y_pred_proba}')" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "id": "2f563400", 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [ 293 | "sklearn.metrics.roc_auc_score(y_true, y_pred_proba)" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "id": "b8c2c8c8", 299 | "metadata": {}, 300 | "source": [ 301 | "We will need to specify `needs_threshold=True` in order for downstream models to properly use the metric." 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": null, 307 | "id": "544dd97c", 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "# Score functions that need decision values\n", 312 | "ag_roc_auc_scorer = make_scorer(name='roc_auc',\n", 313 | " score_func=sklearn.metrics.roc_auc_score,\n", 314 | " optimum=1,\n", 315 | " greater_is_better=True,\n", 316 | " needs_threshold=True)\n", 317 | "ag_roc_auc_scorer(y_true, y_pred_proba)" 318 | ] 319 | }, 320 | { 321 | "cell_type": "markdown", 322 | "id": "5e8575b0", 323 | "metadata": {}, 324 | "source": [ 325 | "## Using Custom Metrics in TabularPredictor\n", 326 | "\n", 327 | "Now that we have created several custom Scorers, let's use them for training and evaluating models.\n", 328 | "\n", 329 | "For this tutorial, we will be using the Adult Income dataset." 
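, "\n", "\n", "One more wrapper worth noting before we load the data: metrics that consume predicted class probabilities directly (such as log loss) are wrapped the same way, but with `needs_proba=True` in place of `needs_threshold=True`. A minimal sketch, assuming `make_scorer` accepts `needs_proba` analogously to scikit-learn's wrapper:\n", "\n", "```{.python}\n", "ag_log_loss_scorer = make_scorer(name='log_loss',\n", "                                 score_func=sklearn.metrics.log_loss,\n", "                                 optimum=0,\n", "                                 greater_is_better=False,\n", "                                 needs_proba=True)\n", "```"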
330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": null, 335 | "id": "316d0633", 336 | "metadata": {}, 337 | "outputs": [], 338 | "source": [ 339 | "from autogluon.tabular import TabularDataset\n", 340 | "\n", 341 | "train_data = TabularDataset('https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv') # can be a local CSV file as well; returns a Pandas DataFrame\n", 342 | "test_data = TabularDataset('https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv') # another Pandas DataFrame\n", 343 | "label = 'class' # specifies which column we want to predict\n", 344 | "train_data = train_data.sample(n=1000, random_state=0) # subsample for faster demo\n", 345 | "\n", 346 | "train_data.head(5)" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": null, 352 | "id": "d21870c9", 353 | "metadata": {}, 354 | "outputs": [], 355 | "source": [ 356 | "from autogluon.tabular import TabularPredictor\n", 357 | "\n", 358 | "predictor = TabularPredictor(label=label).fit(train_data, hyperparameters='toy')\n", 359 | "\n", 360 | "predictor.leaderboard(test_data, silent=True)" 361 | ] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "id": "c8d2c530", 366 | "metadata": {}, 367 | "source": [ 368 | "We can pass our custom metrics into `predictor.leaderboard` via the `extra_metrics` argument:" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": null, 374 | "id": "bb9a8bd8", 375 | "metadata": {}, 376 | "outputs": [], 377 | "source": [ 378 | "predictor.leaderboard(test_data, extra_metrics=[ag_roc_auc_scorer, ag_accuracy_scorer], silent=True)" 379 | ] 380 | }, 381 | { 382 | "cell_type": "markdown", 383 | "id": "e03bbbeb", 384 | "metadata": {}, 385 | "source": [ 386 | "We can also pass our custom metric into the Predictor itself by specifying it during initialization via the `eval_metric` parameter:" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": null, 392 | "id": "951925e3", 393 | "metadata": {}, 394 | "outputs": [], 395 | "source": [ 396 | "predictor_custom = TabularPredictor(label=label, eval_metric=ag_roc_auc_scorer).fit(train_data, hyperparameters='toy')\n", 397 | "\n", 398 | "predictor_custom.leaderboard(test_data, silent=True)" 399 | ] 400 | }, 401 | { 402 | "cell_type": "markdown", 403 | "id": "b6fc8648", 404 | "metadata": {}, 405 | "source": [ 406 | "That's all it takes to create and use custom metrics in AutoGluon!\n", 407 | "\n", 408 | "If you create a custom metric, consider [submitting a PR](https://github.com/awslabs/autogluon/pulls) so that we can add it officially to AutoGluon!\n", 409 | "\n", 410 | "For a tutorial on implementing custom models in AutoGluon, refer to [tutorials/tabular_prediction/tabular-custom-model.ipynb](https://github.com/gidler/autogluon-tutorials/blob/main/tutorials/tabular_prediction/tabular-custom-model.ipynb).\n", 411 | "\n", 412 | "For more tutorials, refer to [tutorials/tabular_prediction/tabular-quickstart.ipynb](https://github.com/gidler/autogluon-tutorials/blob/main/tutorials/tabular_prediction/tabular-quickstart.ipynb) and [tutorials/tabular_prediction/tabular-indepth.ipynb](https://github.com/gidler/autogluon-tutorials/blob/main/tutorials/tabular_prediction/tabular-indepth.ipynb)."
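, "\n", "\n", "As a final reminder of the serializability requirement discussed earlier, a minimal sketch of the recommended layout (the module name `my_metrics.py` is just an example):\n", "\n", "```{.python}\n", "# my_metrics.py -- define Scorers in a separate, importable module so they can be pickled\n", "import sklearn.metrics\n", "from autogluon.core.metrics import make_scorer\n", "\n", "ag_accuracy_scorer = make_scorer(name='accuracy',\n", "                                 score_func=sklearn.metrics.accuracy_score,\n", "                                 optimum=1,\n", "                                 greater_is_better=True)\n", "```\n", "\n", "```{.python}\n", "# main script / notebook\n", "from my_metrics import ag_accuracy_scorer\n", "predictor = TabularPredictor(label=label, eval_metric=ag_accuracy_scorer).fit(train_data)\n", "```"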
413 | ] 414 | } 415 | ], 416 | "metadata": { 417 | "language_info": { 418 | "name": "python" 419 | } 420 | }, 421 | "nbformat": 4, 422 | "nbformat_minor": 5 423 | } -------------------------------------------------------------------------------- /tutorials/tabular_prediction/tabular-custom-model-advanced.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "6a1a23d9", 6 | "metadata": {}, 7 | "source": [ 8 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gidler/autogluon-tutorials/blob/main/tutorials/tabular_prediction/tabular-custom-model-advanced.ipynb)\n", 9 | "[![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/gidler/autogluon-tutorials/blob/main/tutorials/tabular_prediction/tabular-custom-model-advanced.ipynb)\n" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "id": "48d06c53", 16 | "outputs": [], 17 | "metadata": {}, 18 | "source": [ 19 | "# Uncomment the code below and run this cell if AutoGluon is not yet installed in the kernel.\n", 20 | "# !pip install autogluon==0.5.0 # These tutorials are based on AutoGluon v0.5.0 and might not work with different versions." 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "id": "a763f15d", 26 | "metadata": {}, 27 | "source": [ 28 | "# Adding a custom model to AutoGluon (Advanced)\n", 29 | "\n", 30 | "\n", 31 | "\n", 32 | "**Tip**: If you are new to AutoGluon, review [tutorials/tabular_prediction/tabular-quickstart.ipynb](https://github.com/gidler/autogluon-tutorials/blob/main/tutorials/tabular_prediction/tabular-quickstart.ipynb) to learn the basics of the AutoGluon API.\n", 33 | "\n", 34 | "In this tutorial we will cover advanced custom model options that go beyond the topics covered in [tutorials/tabular_prediction/tabular-custom-model.ipynb](https://github.com/gidler/autogluon-tutorials/blob/main/tutorials/tabular_prediction/tabular-custom-model.ipynb).\n", 35 | "\n", 36 | "It is assumed that you have fully read through [tutorials/tabular_prediction/tabular-custom-model.ipynb](https://github.com/gidler/autogluon-tutorials/blob/main/tutorials/tabular_prediction/tabular-custom-model.ipynb) prior to this tutorial.\n", 37 | "\n", 38 | "## Loading the data\n", 39 | "\n", 40 | "First we will load the data. For this tutorial we will use the adult income dataset because it has a mix of integer, float, and categorical features." 
41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "id": "8d6c26dd", 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "from autogluon.tabular import TabularDataset\n", 51 | "\n", 52 | "train_data = TabularDataset('https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv') # can be a local CSV file as well; returns a Pandas DataFrame\n", 53 | "test_data = TabularDataset('https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv') # another Pandas DataFrame\n", 54 | "label = 'class' # specifies which column we want to predict\n", 55 | "train_data = train_data.sample(n=1000, random_state=0) # subsample for faster demo\n", 56 | "\n", 57 | "train_data.head(5)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "id": "d872629e", 63 | "metadata": {}, 64 | "source": [ 65 | "## Force features to be passed to models without preprocessing / dropping\n", 66 | "\n", 67 | "One reason you would want to do this is if you have model logic that requires a particular column to always be present,\n", 68 | "regardless of its content. For example, suppose you are fine-tuning a pre-trained language model that expects\n", 69 | "a feature indicating the language of the text in a given row (which dictates how the text is preprocessed),\n", 70 | "but the training data only includes one language; without this adjustment,\n", 71 | "the language identifier feature would be dropped prior to fitting the model.\n", 72 | "\n", 73 | "### Force features to not be dropped in model-specific preprocessing\n", 74 | "\n", 75 | "To avoid dropping features in custom models due to having only 1 unique value,\n", 76 | "add the following `_get_default_auxiliary_params` method to your custom model class:" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "id": "912e29a2", 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "from autogluon.core.models import AbstractModel\n", 87 | "\n", 88 | "class DummyModel(AbstractModel):\n", 89 | " def _fit(self, X, **kwargs):\n", 90 | " print(f'Before {self.__class__.__name__} Preprocessing ({len(X.columns)} features):\\n\\t{list(X.columns)}')\n", 91 | " X = self.preprocess(X)\n", 92 | " print(f'After {self.__class__.__name__} Preprocessing ({len(X.columns)} features):\\n\\t{list(X.columns)}')\n", 93 | " print(X.head(5))\n", 94 | "\n", 95 | "class DummyModelKeepUnique(DummyModel):\n", 96 | " def _get_default_auxiliary_params(self) -> dict:\n", 97 | " default_auxiliary_params = super()._get_default_auxiliary_params()\n", 98 | " extra_auxiliary_params = dict(\n", 99 | " drop_unique=False, # Whether to drop features that have only 1 unique value, default is True\n", 100 | " )\n", 101 | " default_auxiliary_params.update(extra_auxiliary_params)\n", 102 | " return default_auxiliary_params" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "id": "29623ae3", 108 | "metadata": {}, 109 | "source": [ 110 | "### Force features to not be dropped in global preprocessing\n", 111 | "\n", 112 | "While the above fix for model-specific preprocessing works if the feature is still present after global preprocessing,\n", 113 | "it won't help if the feature was already dropped before getting to the model. 
For this, we need to\n", 114 | "create a new feature generator class\n", 115 | "which separates the preprocessing logic between normal features and user override features.\n", 116 | "\n", 117 | "Here is an example implementation:" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "id": "45399314", 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "# WARNING: To use this in practice, you must put this code in a separate Python file\n", 128 | "# from the main process and import it, or else it will not be serializable.\n", 129 | "from autogluon.features import BulkFeatureGenerator, AutoMLPipelineFeatureGenerator, IdentityFeatureGenerator\n", 130 | "\n", 131 | "\n", 132 | "class CustomFeatureGeneratorWithUserOverride(BulkFeatureGenerator):\n", 133 | " def __init__(self, automl_generator_kwargs: dict = None, **kwargs):\n", 134 | " generators = self._get_default_generators(automl_generator_kwargs=automl_generator_kwargs)\n", 135 | " super().__init__(generators=generators, **kwargs)\n", 136 | "\n", 137 | " def _get_default_generators(self, automl_generator_kwargs: dict = None):\n", 138 | " if automl_generator_kwargs is None:\n", 139 | " automl_generator_kwargs = dict()\n", 140 | "\n", 141 | " generators = [\n", 142 | " [\n", 143 | " # Preprocessing logic that handles normal features\n", 144 | " AutoMLPipelineFeatureGenerator(banned_feature_special_types=['user_override'], **automl_generator_kwargs),\n", 145 | "\n", 146 | " # Preprocessing logic that handles special features the user wishes to treat separately; here we simply skip preprocessing for these features.\n", 147 | " IdentityFeatureGenerator(infer_features_in_args=dict(required_special_types=['user_override'])),\n", 148 | " ],\n", 149 | " ]\n", 150 | " return generators" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "id": "4baa263a", 156 | "metadata": {}, 157 | "source": [ 158 | "The above code splits the preprocessing logic of a feature\n", 159 | "depending on whether it is tagged with the `'user_override'` special type in feature metadata.\n", 160 | "To tag three features `['age', 'native-country', 'dummy_feature']` in this way,\n", 161 | "you can do the following:" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "id": "1c376737", 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "# add a useless dummy feature to show that it is not dropped in preprocessing\n", 172 | "train_data['dummy_feature'] = 'dummy value'\n", 173 | "test_data['dummy_feature'] = 'dummy value'\n", 174 | "\n", 175 | "from autogluon.tabular import FeatureMetadata\n", 176 | "feature_metadata = FeatureMetadata.from_df(train_data)\n", 177 | "\n", 178 | "print('Before inserting overrides:')\n", 179 | "print(feature_metadata)\n", 180 | "\n", 181 | "feature_metadata = feature_metadata.add_special_types(\n", 182 | " {\n", 183 | " 'age': ['user_override'],\n", 184 | " 'native-country': ['user_override'],\n", 185 | " 'dummy_feature': ['user_override'],\n", 186 | " }\n", 187 | ")\n", 188 | "\n", 189 | "print('After inserting overrides:')\n", 190 | "print(feature_metadata)" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "id": "21adfd5c", 196 | "metadata": {}, 197 | "source": [ 198 | "Note that this is only one example implementation of a custom feature generator that has bifurcated preprocessing logic.\n", 199 | "Users can make their tagging and feature generator logic arbitrarily complex to fit their needs.\n", 200 | "In this example, we 
perform the standard preprocessing on non-tagged features, and for tagged features we pass\n", 201 | "them through an `IdentityFeatureGenerator`, which is a no-op that does not alter the features in any way.\n", 202 | "Instead of an `IdentityFeatureGenerator`, you could use any kind of feature generator to suit your needs.\n", 203 | "\n", 204 | "### Putting it all together" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "id": "19fd378a", 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "# Separate features and labels\n", 215 | "X = train_data.drop(columns=[label])\n", 216 | "y = train_data[label]\n", 217 | "X_test = test_data.drop(columns=[label])\n", 218 | "y_test = test_data[label]\n", 219 | "\n", 220 | "# preprocess the label column, as done in the prior custom model tutorial\n", 221 | "from autogluon.core.data import LabelCleaner\n", 222 | "from autogluon.core.utils import infer_problem_type\n", 223 | "# Construct a LabelCleaner to neatly convert labels to float/integers during model training/inference, can also use to inverse_transform back to original.\n", 224 | "problem_type = infer_problem_type(y=y) # Infer problem type (or else specify directly)\n", 225 | "label_cleaner = LabelCleaner.construct(problem_type=problem_type, y=y)\n", 226 | "y_preprocessed = label_cleaner.transform(y)\n", 227 | "y_test_preprocessed = label_cleaner.transform(y_test)\n", 228 | "\n", 229 | "# Make sure to specify your custom feature metadata to the feature generator\n", 230 | "my_custom_feature_generator = CustomFeatureGeneratorWithUserOverride(feature_metadata_in=feature_metadata)\n", 231 | "\n", 232 | "X_preprocessed = my_custom_feature_generator.fit_transform(X)\n", 233 | "X_test_preprocessed = my_custom_feature_generator.transform(X_test)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "id": "a17ef240", 239 | "metadata": {}, 240 | "source": [ 241 | "Notice how the `user_override` features were not preprocessed:" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "id": "1798f772", 248 | "metadata": {}, 249 | "outputs": [], 250 | "source": [ 251 | "print(list(X_preprocessed.columns))\n", 252 | "X_preprocessed.head(5)" 253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "id": "8ffda032", 258 | "metadata": {}, 259 | "source": [ 260 | "Now let's see what happens when we send this data to fit a dummy model:" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "id": "3ef687d5", 267 | "metadata": {}, 268 | "outputs": [], 269 | "source": [ 270 | "dummy_model = DummyModel()\n", 271 | "dummy_model.fit(X=X, y=y, feature_metadata=my_custom_feature_generator.feature_metadata)" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "id": "2b7769b0", 277 | "metadata": {}, 278 | "source": [ 279 | "Notice how the model dropped `dummy_feature` during the preprocess call. 
Now let's see what happens if we use `DummyModelKeepUnique`:" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "id": "ec52a65d", 286 | "metadata": {}, 287 | "outputs": [], 288 | "source": [ 289 | "dummy_model_keep_unique = DummyModelKeepUnique()\n", 290 | "dummy_model_keep_unique.fit(X=X, y=y, feature_metadata=my_custom_feature_generator.feature_metadata)" 291 | ] 292 | }, 293 | { 294 | "cell_type": "markdown", 295 | "id": "73b43614", 296 | "metadata": {}, 297 | "source": [ 298 | "Now `dummy_feature` is no longer dropped!\n", 299 | "\n", 300 | "The above code logic can be re-used for testing your own complex model implementations;\n", 301 | "simply replace `DummyModelKeepUnique` with your custom model and check that it keeps the features you want to use.\n", 302 | "\n", 303 | "### Keeping Features via TabularPredictor\n", 304 | "\n", 305 | "Now let's demonstrate how to do this via TabularPredictor in far fewer lines of code.\n", 306 | "Note that this code will raise an exception if run in this tutorial because the\n", 307 | "custom model and feature generator must exist in other files for them to be serializable.\n", 308 | "Therefore, we will not run the code in the tutorial.\n", 309 | "(It will also raise an exception because `DummyModel` isn't a real model.)" 310 | ] 311 | }, 312 | { 313 | "cell_type": "markdown", 314 | "id": "db6b0645", 315 | "metadata": {}, 316 | "source": [ 317 | "```\n", 318 | "from autogluon.tabular import TabularPredictor\n", 319 | "\n", 320 | "feature_generator = CustomFeatureGeneratorWithUserOverride()\n", 321 | "predictor = TabularPredictor(label=label)\n", 322 | "predictor.fit(\n", 323 | " train_data=train_data,\n", 324 | " feature_metadata=feature_metadata, # feature metadata with your overrides\n", 325 | " feature_generator=feature_generator, # your custom feature generator that handles the overrides\n", 326 | " hyperparameters={\n", 327 | " 'GBM': {}, # Can fit your custom model alongside default models\n", 328 | " DummyModel: {}, # Will drop dummy_feature\n", 329 | " DummyModelKeepUnique: {}, # Will not drop dummy_feature\n", 330 | " # DummyModel: {'ag_args_fit': {'drop_unique': False}}, # This is another way to get the same result as using DummyModelKeepUnique\n", 331 | " }\n", 332 | ")\n", 333 | "```\n" 334 | ] 335 | } 336 | ], 337 | "metadata": { 338 | "language_info": { 339 | "name": "python" 340 | } 341 | }, 342 | "nbformat": 4, 343 | "nbformat_minor": 5 344 | } -------------------------------------------------------------------------------- /tutorials/tabular_prediction/tabular-gpu.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "6591e275", 6 | "metadata": {}, 7 | "source": [ 8 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gidler/autogluon-tutorials/blob/main/tutorials/tabular_prediction/tabular-gpu.ipynb)\n", 9 | "[![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/gidler/autogluon-tutorials/blob/main/tutorials/tabular_prediction/tabular-gpu.ipynb)\n" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "id": "a8644a0c", 16 | "outputs": [], 17 | "metadata": {}, 18 | "source": [ 19 | "# Uncomment the code below and run this cell if AutoGluon is not yet installed in the kernel.\n", 20 | "# !pip install autogluon==0.5.0 # These tutorials are 
based on AutoGluon v0.5.0 and might not work with different versions." 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "id": "69daecc5", 26 | "metadata": {}, 27 | "source": [ 28 | "# Training models with GPU support\n", 29 | "\n", 30 | "\n", 31 | "\n", 32 | "Training with a GPU can significantly speed up base algorithms, and is a necessity for text and vision models, where training without a GPU is infeasibly slow.\n", 33 | "The CUDA toolkit is required for GPU training. Please refer to the [official documentation](https://docs.nvidia.com/cuda/) for installation instructions." 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "id": "9a9253fe", 39 | "metadata": {}, 40 | "source": [ 41 | "```{.python}\n", 42 | "predictor = TabularPredictor(label=label).fit(\n", 43 | " train_data,\n", 44 | " ag_args_fit={'num_gpus': 1}\n", 45 | ")\n", 46 | "```\n" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "id": "3d3f1282", 52 | "metadata": {}, 53 | "source": [ 54 | "To enable GPU acceleration on only specific models, the same parameter can be passed into model `hyperparameters`:" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "id": "d4e33090", 60 | "metadata": {}, 61 | "source": [ 62 | "```{.python}\n", 63 | "hyperparameters = {\n", 64 | " 'GBM': [\n", 65 | " {'ag_args_fit': {'num_gpus': 0}}, # Train with CPU\n", 66 | " {'ag_args_fit': {'num_gpus': 1}} # Train with GPU\n", 67 | " ]\n", 68 | "}\n", 69 | "predictor = TabularPredictor(label=label).fit(\n", 70 | " train_data, \n", 71 | " hyperparameters=hyperparameters, \n", 72 | ")\n", 73 | "```\n" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "id": "d1539319", 79 | "metadata": {}, 80 | "source": [ 81 | "## Multi-modal\n", 82 | "\n", 83 | "In the [tutorials/tabular_prediction/tabular-multimodal.ipynb](https://github.com/gidler/autogluon-tutorials/blob/main/tutorials/tabular_prediction/tabular-multimodal.ipynb) tutorial we presented how to train an ensemble that can utilize tabular, text, and image data. 
\n", 84 | "If available GPUs don't have enough VRAM to fit the default model, or it is needed to speedup testing, different backends can be used:\n", 85 | "\n", 86 | "Regular configuration is retrieved like this:" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "id": "16eebe91", 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "from autogluon.tabular.configs.hyperparameter_configs import get_hyperparameter_config\n", 97 | "hyperparameters = get_hyperparameter_config('multimodal')\n", 98 | "hyperparameters" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "id": "9637db71", 104 | "metadata": {}, 105 | "source": [ 106 | "### Text models\n", 107 | "\n", 108 | "Text model preset to use can be set via:" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "id": "a2de2351", 114 | "metadata": {}, 115 | "source": [ 116 | "```{.python}\n", 117 | "hyperparameters['AG_TEXT_NN'] = ['']\n", 118 | "```\n" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "id": "badf4310", 124 | "metadata": {}, 125 | "source": [ 126 | "Available text model presets:" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "id": "f13370a1", 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "from autogluon.text.text_prediction.presets import list_text_presets\n", 137 | "list_text_presets()" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "id": "6cc34ef1", 143 | "metadata": {}, 144 | "source": [ 145 | "### Vision models\n", 146 | "\n", 147 | "Text model preset to use can be set via:" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "id": "29bd48dc", 153 | "metadata": {}, 154 | "source": [ 155 | "```{.python}\n", 156 | "hyperparameters['AG_IMAGE_NN'] = {'model': ''}\n", 157 | "```\n" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "id": "73f79a32", 163 | "metadata": {}, 164 | "source": [ 165 | "The list of available text model presets is:" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "id": "908cbb69", 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "from autogluon.vision.predictor.predictor import _get_supported_models\n", 176 | "_get_supported_models()[:10] # there're more, we just show a few" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "id": "48c21c6a", 182 | "metadata": {}, 183 | "source": [ 184 | "## Enabling GPU for LightGBM\n", 185 | "\n", 186 | "The default installation of LightGBM does not support GPU training, however GPU support can be enabled via a special install. If `num_gpus` is set, the following warning will be displayed:" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "id": "ede4c666", 192 | "metadata": {}, 193 | "source": [ 194 | "```\n", 195 | "Warning: GPU mode might not be installed for LightGBM, GPU training raised an exception. Falling back to CPU training...Refer to LightGBM GPU documentation: https://github.com/Microsoft/LightGBM/tree/master/python-package#build-gpu-versionOne possible method is:\tpip uninstall lightgbm -y\tpip install lightgbm --install-option=--gpu\n", 196 | "```\n" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "id": "89039f06", 202 | "metadata": {}, 203 | "source": [ 204 | "If the suggested commands do not work, uninstall existing lightgbm `pip uninstall -y lightgbm` and install from sources following the instructions in the [official guide](https://lightgbm.readthedocs.io/en/latest/GPU-Tutorial.html). 
The\n", 205 | "optional [Install Python Interface](https://lightgbm.readthedocs.io/en/latest/GPU-Tutorial.html#install-python-interface-optional) section is also required to make it work with AutoGluon.\n", 206 | "\n", 207 | "## Troubleshooting" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "id": "0fc338ba", 213 | "metadata": {}, 214 | "source": [ 215 | "```\n", 216 | "OSError: libcudnn.so.X: cannot open shared object file: No such file or directory\n", 217 | "OSError: libcudart.so.XX.Y: cannot open shared object file: No such file or directory\n", 218 | "```\n" 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "id": "42fe7f45", 224 | "metadata": {}, 225 | "source": [ 226 | "This might happen when installed cuda is not matching MXNet library. To resolve it, get the cuda version installed in system: `nvcc --version` or `nvidia-smi`. Then install matching `mxnet-cuXXX` package (CUDA `11.0` -> `mxnet-cu110`, etc.)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "id": "16151814", 232 | "metadata": {}, 233 | "source": [ 234 | "```\n", 235 | "pip install 'mxnet-cu110<2.0.0'\n", 236 | "```\n" 237 | ] 238 | } 239 | ], 240 | "metadata": { 241 | "language_info": { 242 | "name": "python" 243 | } 244 | }, 245 | "nbformat": 4, 246 | "nbformat_minor": 5 247 | } -------------------------------------------------------------------------------- /tutorials/tabular_prediction/tabular-interpretability.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "0016db8e", 6 | "metadata": {}, 7 | "source": [ 8 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gidler/autogluon-tutorials/blob/main/tutorials/tabular_prediction/tabular-interpretability.ipynb)\n", 9 | "[![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/gidler/autogluon-tutorials/blob/main/tutorials/tabular_prediction/tabular-interpretability.ipynb)\n" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "id": "08d4fedf", 16 | "outputs": [], 17 | "metadata": {}, 18 | "source": [ 19 | "# Uncomment the code below and run this cell if AutoGluon is not yet installed in the kernel.\n", 20 | "# !pip install autogluon==0.5.0 # These tutorials are based on AutoGluon v0.5.0 and might not work with different versions." 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "id": "86b1564c", 26 | "metadata": {}, 27 | "source": [ 28 | "# Interpretable rule-based modeling\n", 29 | "\n", 30 | "\n", 31 | "\n", 32 | "*Note*: This addition was made through collaboration with [the Yu-Group](https://www.stat.berkeley.edu/~yugroup/) at UC Berkeley.\n", 33 | "\n", 34 | "**Tip**: Prior to reading this tutorial, it is recommended to have a basic understanding of the TabularPredictor API covered in [tutorials/tabular_prediction/tabular-quickstart.ipynb](https://github.com/gidler/autogluon-tutorials/blob/main/tutorials/tabular_prediction/tabular-quickstart.ipynb).\n", 35 | "\n", 36 | "In this tutorial, we will explain how to automatically use interpretable models powered by integration with [\ud83d\udd0d the imodels package](https://github.com/csinva/imodels). 
This allows for automatically learning rule-based models that are extremely concise and can be useful for (1) understanding data or (2) building a transparent predictive model.\n", 37 | "\n", 38 | "Begin by loading in data to predict. Note: interpretable rule-based modeling is currently only supported for binary classification." 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "id": "fedc8e5f", 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "from autogluon.tabular import TabularDataset, TabularPredictor\n", 49 | "train_data = TabularDataset('https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv')\n", 50 | "subsample_size = 500 # subsample subset of data for faster demo, try setting this to much larger values\n", 51 | "train_data = train_data.sample(n=subsample_size, random_state=0)\n", 52 | "train_data.head()" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "id": "cbf04923", 58 | "metadata": {}, 59 | "source": [ 60 | "Now, we create a predictor and fit it to the data. By specifying `presets='interpretable'`, we tell the predictor to fit only interpretable models." 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "id": "39e8d9a6", 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "predictor = TabularPredictor(label='class')\n", 71 | "predictor.fit(train_data, presets='interpretable')\n", 72 | "predictor.leaderboard()" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "id": "abdbc981", 78 | "metadata": {}, 79 | "source": [ 80 | "The rule-based models take slightly different forms (see below), but all try to optimize predictive performance using as few rules as possible. See the [imodels package](https://github.com/csinva/imodels) for more details.\n", 81 | "\n", 82 | " \n", 83 | "\n", 84 | "![](https://raw.githubusercontent.com/csinva/imodels/master/docs/img/model_table_rules.png)\n", 85 | "\n", 86 | "Specifically, the interpretable preset fits different hyperparameter configurations of 5 model types:\n", 87 | "1. Greedy CART decision tree - returns a tree learned via greedy optimization ([Description](https://scikit-learn.org/stable/modules/tree.html#tree), [Paper](https://www.taylorfrancis.com/books/mono/10.1201/9781315139470/classification-regression-trees-leo-breiman-jerome-friedman-richard-olshen-charles-stone))\n", 88 | "2. Hierarchical Shrinkage tree - returns a regularized version of a CART decision tree ([Description](https://csinva.io/imodels/shrinkage.html), [Paper](https://arxiv.org/abs/2202.00858))\n", 89 | "3. Fast interpretable greedy-tree sum - returns a *sum* of trees, which are greedily grown simultaneously ([Description](https://csinva.io/imodels/figs.html), [Paper](https://arxiv.org/abs/2202.00858))\n", 90 | "4. RuleFit - returns a set of weighted rules, which are learned by a sparse linear model on rules extracted from decision trees ([Description](https://christophm.github.io/interpretable-ml-book/rulefit.html), [Paper](https://arxiv.org/abs/0811.1679))\n", 91 | "5. Boosted rule set - returns a set of rules, which are learned sequentially via AdaBoost ([Description](https://scikit-learn.org/stable/modules/ensemble.html#adaboost), [Paper](https://www.sciencedirect.com/science/article/pii/S002200009791504X)) \n", 92 | "\n", 93 | "\n", 94 | "In addition to the usual functions in `TabularPredictor`, a predictor fitted with interpretable models has some additional functionality. For example, we can now inspect the complexity of the fitted models (i.e., 
how many rules they contain)." 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "id": "9c1a4e77", 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "predictor.interpretable_models_summary()" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "id": "171cb5cf", 110 | "metadata": {}, 111 | "source": [ 112 | "We can also explicitly inspect the rules of the best-performing model." 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "id": "a1bb54d7", 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "predictor.print_interpretable_rules() # can optionally specify a model name or complexity threshold" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "id": "ae88461d", 128 | "metadata": {}, 129 | "source": [ 130 | "In some cases, these rules are sufficient to accurately make predictions. In other cases, they may just be used to gain a better understanding of the data before proceeding with more black-box models." 131 | ] 132 | } 133 | ], 134 | "metadata": { 135 | "language_info": { 136 | "name": "python" 137 | } 138 | }, 139 | "nbformat": 4, 140 | "nbformat_minor": 5 141 | } -------------------------------------------------------------------------------- /tutorials/tabular_prediction/tabular-kaggle.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "70516618", 6 | "metadata": {}, 7 | "source": [ 8 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gidler/autogluon-tutorials/blob/main/tutorials/tabular_prediction/tabular-kaggle.ipynb)\n", 9 | "[![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/gidler/autogluon-tutorials/blob/main/tutorials/tabular_prediction/tabular-kaggle.ipynb)\n" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "id": "b107a6e2", 16 | "outputs": [], 17 | "metadata": {}, 18 | "source": [ 19 | "# Uncomment the code below and run this cell if AutoGluon is not yet installed in the kernel.\n", 20 | "# !pip install autogluon==0.5.0 # These tutorials are based on AutoGluon v0.5.0 and might not work with different versions." 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "id": "4a71b654", 26 | "metadata": {}, 27 | "source": [ 28 | "# How to use AutoGluon for Kaggle competitions\n", 29 | "\n", 30 | "\n", 31 | "\n", 32 | "This tutorial will teach you how to use AutoGluon to become a serious Kaggle competitor without writing lots of code.\n", 33 | "We first outline the general steps to use AutoGluon in Kaggle contests. Here, we assume the competition involves tabular data stored in one (or more) CSV files.\n", 34 | "\n", 35 | "1) Run the Bash command: `pip install kaggle`\n", 36 | "\n", 37 | "2) Navigate to: https://www.kaggle.com/account and create an account (if necessary).\n", 38 | "Then, click on \"Create New API Token\" and move the downloaded file to this location on your machine: `~/.kaggle/kaggle.json`. 
For troubleshooting, see [Kaggle API instructions](https://www.kaggle.com/docs/api).\n", 39 | "\n", 40 | "3) To download data programmatically: Execute this Bash command in your terminal:\n", 41 | "\n", 42 | "`kaggle competitions download -c [COMPETITION]`\n", 43 | "\n", 44 | "Here, [COMPETITION] should be replaced by the name of the competition you wish to enter.\n", 45 | "Alternatively, you can download the data manually: just navigate to the website of the Kaggle competition you wish to enter, click \"Download All\", and accept the competition's terms.\n", 46 | "\n", 47 | "4) If the competition's training data comprises multiple CSV files, use [pandas](https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html) to properly merge/join them into a single data table where rows = training examples, columns = features.\n", 48 | "\n", 49 | "5) Run AutoGluon's `fit()` on the resulting data table.\n", 50 | "\n", 51 | "6) Load the test dataset from the competition (again making the necessary merges/joins to ensure it is in the exact same format as the training data table), and then call AutoGluon's `predict()`. Subsequently use [pandas.read_csv](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html) to load the competition's `sample_submission.csv` file into a DataFrame, put the AutoGluon predictions in the right column of this DataFrame, and finally save it as a CSV file via [pandas.to_csv](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html). If the competition does not offer a sample submission file, you will need to create the submission file yourself by appropriately reformatting AutoGluon's test predictions.\n", 52 | "\n", 53 | "7) Submit your predictions via this Bash command:\n", 54 | "\n", 55 | "`kaggle competitions submit -c [COMPETITION] -f [FILE] -m [\"MESSAGE\"]`\n", 56 | "\n", 57 | "Here, [COMPETITION] again is the competition's name, [FILE] is the name of the CSV file you created with your predictions, and [\"MESSAGE\"] is a string message you want to record with this submitted entry. Alternatively, you can manually upload your file of predictions on the competition website.\n", 58 | "\n", 59 | "8) Finally, navigate to the competition's leaderboard page to see how well your submission performed!\n", 60 | "It may take some time for your submission to appear.\n", 61 | "\n", 62 | "\n", 63 | "\n", 64 | "Below, we demonstrate how to do steps (4)-(6) in Python for a specific Kaggle competition: [ieee-fraud-detection](https://www.kaggle.com/c/ieee-fraud-detection/).\n", 65 | "This means you'll need to run the above steps with `[COMPETITION]` replaced by `ieee-fraud-detection` in each command. Here, we assume you've already completed steps (1)-(3) and the data CSV files are available on your computer. 
To begin step (4), we first load the competition's training data into Python:" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "id": "9d98aefe", 71 | "metadata": {}, 72 | "source": [ 73 | "```\n", 74 | "import pandas as pd\n", 75 | "import numpy as np\n", 76 | "from autogluon.tabular import TabularPredictor\n", 77 | "\n", 78 | "directory = '~/IEEEfraud/' # directory where you have downloaded the data CSV files from the competition\n", 79 | "label = 'isFraud' # name of target variable to predict in this competition\n", 80 | "eval_metric = 'roc_auc' # Optional: specify that competition evaluation metric is AUC\n", 81 | "save_path = directory + 'AutoGluonModels/' # where to store trained models\n", 82 | "\n", 83 | "train_identity = pd.read_csv(directory+'train_identity.csv')\n", 84 | "train_transaction = pd.read_csv(directory+'train_transaction.csv')\n", 85 | "```\n" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "id": "13e13262", 91 | "metadata": {}, 92 | "source": [ 93 | "Since the training data for this competition comprises multiple CSV files, we first join them into a single large table (with rows = examples, columns = features) before applying AutoGluon:" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "id": "99c5f463", 99 | "metadata": {}, 100 | "source": [ 101 | "```\n", 102 | "train_data = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')\n", 103 | "```\n" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "id": "dd330817", 109 | "metadata": {}, 110 | "source": [ 111 | "Note that a left-join on the `TransactionID` key happened to be most appropriate for this Kaggle competition, but for others involving multiple training data files, you will likely need to use a different join strategy (always consider this very carefully). Now that all our training data resides within a single table, we can apply AutoGluon. Below, we specify the `presets` argument to maximize AutoGluon's predictive accuracy, which usually requires running `fit()` with longer time limits (the 3600s below should likely be increased in your run):" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "id": "125d0f9d", 117 | "metadata": {}, 118 | "source": [ 119 | "```\n", 120 | "predictor = TabularPredictor(label=label, eval_metric=eval_metric, path=save_path, verbosity=3).fit(\n", 121 | " train_data, presets='best_quality', time_limit=3600\n", 122 | ")\n", 123 | "\n", 124 | "results = predictor.fit_summary()\n", 125 | "```\n" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "id": "62fc472b", 131 | "metadata": {}, 132 | "source": [ 133 | "Now, we use the trained AutoGluon Predictor to make predictions on the competition's test data. It is imperative that multiple test data files are joined together in the exact same manner as the training data. Because this competition is evaluated based on the AUC (Area under the ROC curve) metric, we ask AutoGluon for predicted class-probabilities rather than class predictions. In general, when to use `predict` vs `predict_proba` will depend on the particular competition." 
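, "\n", "\n", "As a rule of thumb (a sketch, not competition-specific advice), submit hard class labels for accuracy-style metrics and probabilities for AUC/log-loss-style metrics:\n", "\n", "```\n", "y_pred = predictor.predict(test_data)             # hard class labels\n", "y_predproba = predictor.predict_proba(test_data)  # class probabilities\n", "```"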
134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "id": "2535562d", 139 | "metadata": {}, 140 | "source": [ 141 | "```\n", 142 | "test_identity = pd.read_csv(directory+'test_identity.csv')\n", 143 | "test_transaction = pd.read_csv(directory+'test_transaction.csv')\n", 144 | "test_data = pd.merge(test_transaction, test_identity, on='TransactionID', how='left') # same join as applied to the training files\n", 145 | "\n", 146 | "y_predproba = predictor.predict_proba(test_data)\n", 147 | "y_predproba.head(5) # some example predicted fraud-probabilities\n", 148 | "```\n" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "id": "86997f02", 154 | "metadata": {}, 155 | "source": [ 156 | "When submitting predicted probabilities for classification competitions, it is imperative that these correspond to the same class expected by Kaggle. For binary classification tasks, you can see which class AutoGluon's predicted probabilities correspond to via:" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "id": "80077733", 162 | "metadata": {}, 163 | "source": [ 164 | "```\n", 165 | "predictor.positive_class\n", 166 | "```\n" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "id": "882af6b8", 172 | "metadata": {}, 173 | "source": [ 174 | "For multiclass classification tasks, you can see which classes AutoGluon's predicted probabilities correspond to via:" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "id": "b63dd441", 180 | "metadata": {}, 181 | "source": [ 182 | "```\n", 183 | "predictor.class_labels # classes in this list correspond to columns of predict_proba() output\n", 184 | "```\n" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "id": "c8b8e4e3", 190 | "metadata": {}, 191 | "source": [ 192 | "Now, let's get prediction probabilities for the entire test dataset, returning only the positive-class probabilities, by specifying:" 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "id": "9d4f7157", 198 | "metadata": {}, 199 | "source": [ 200 | "```\n", 201 | "y_predproba = predictor.predict_proba(test_data, as_multiclass=False)\n", 202 | "```\n" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "id": "06a32cd9", 208 | "metadata": {}, 209 | "source": [ 210 | "Now that we have made a prediction for each row in the test dataset, we can submit these predictions to Kaggle. Most Kaggle competitions provide a sample submission file, in which you can simply overwrite the sample predictions with your own, as we do below:" 211 | ] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "id": "05b4f6d3", 216 | "metadata": {}, 217 | "source": [ 218 | "```\n", 219 | "submission = pd.read_csv(directory+'sample_submission.csv')\n", 220 | "submission['isFraud'] = y_predproba\n", 221 | "submission.head()\n", 222 | "submission.to_csv(directory+'my_submission.csv', index=False)\n", 223 | "```\n" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "id": "ae892f84", 229 | "metadata": {}, 230 | "source": [ 231 | "We have now completed steps (4)-(6) from the top of this tutorial. 
To submit your predictions to Kaggle, you can run the following command in your terminal (from the appropriate directory):\n", 232 | "\n", 233 | "`kaggle competitions submit -c ieee-fraud-detection -f sample_submission.csv -m \"my first submission\"`\n", 234 | "\n", 235 | "You can now play with different `fit()` arguments and feature-engineering techniques to try and maximize the rank of your submissions in the Kaggle Leaderboard!\n", 236 | "\n", 237 | "\n", 238 | "**Tips to maximize predictive performance:**\n", 239 | "\n", 240 | " - Be sure to specify the appropriate evaluation metric if one is specified on the competition website! If you are unsure which metric is best, then simply do not specify this argument when invoking `fit()`; AutoGluon should still produce high-quality models by automatically inferring which metric to use.\n", 241 | "\n", 242 | " - If the training examples are time-based and the competition test examples come from future data, we recommend you reserve the most recently-collected training examples as a separate validation dataset passed to `fit()`. Otherwise, you do not need to specify a validation set yourself and AutoGluon will automatically partition the competition training data into its own training/validation sets.\n", 243 | "\n", 244 | " - Beyond simply specifying `presets = 'best_quality'`, you may play with more advanced `fit()` arguments such as: `num_bag_folds`, `num_stack_levels`, `num_bag_sets`, `hyperparameter_tune_kwargs`, `hyperparameters`, `refit_full`. However we recommend spending most of your time on [feature-engineering](https://www.coursera.org/lecture/competitive-data-science/overview-1Nh5Q) and just specifying `presets = 'best_quality'` inside the call to `fit()`.\n", 245 | "\n", 246 | "\n", 247 | "**Troubleshooting:**\n", 248 | "\n", 249 | "- Check that you have the right user-permissions on your computer to access the data files downloaded from Kaggle.\n", 250 | "\n", 251 | "- For issues downloading Kaggle data or submitting predictions, check your Kaggle account setup and the [Kaggle FAQ](https://www.kaggle.com/general/14438)." 252 | ] 253 | } 254 | ], 255 | "metadata": { 256 | "language_info": { 257 | "name": "python" 258 | } 259 | }, 260 | "nbformat": 4, 261 | "nbformat_minor": 5 262 | } -------------------------------------------------------------------------------- /tutorials/tabular_prediction/tabular-multimodal-text-others.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "99f270a0", 6 | "metadata": {}, 7 | "source": [ 8 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gidler/autogluon-tutorials/blob/main/tutorials/tabular_prediction/tabular-multimodal-text-others.ipynb)\n", 9 | "[![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/gidler/autogluon-tutorials/blob/main/tutorials/tabular_prediction/tabular-multimodal-text-others.ipynb)\n" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "id": "32cd1dc7", 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "# Uncomment the code below and run this cell if AutoGluon is not yet installed in the kernel.\n", 20 | "# !pip install autogluon==0.5.0 # These tutorials are based on AutoGluon v0.5.0 and might not work with different versions." 
21 | ]
22 | },
23 | {
24 | "cell_type": "markdown",
25 | "id": "c64a38af",
26 | "metadata": {},
27 | "source": [
28 | "# Multimodal Data Tables: Combining BERT/Transformers and Classical Tabular Models\n",
29 | "\n",
30 | "\n",
31 | "\n",
32 | "\n",
33 | "**Tip**: If your data contains images, consider also checking out [tutorials/tabular_prediction/tabular-multimodal.ipynb](https://github.com/gidler/autogluon-tutorials/blob/main/tutorials/tabular_prediction/tabular-multimodal.ipynb), which handles images in addition to text and tabular features.\n",
34 | "\n",
35 | "Here we introduce how to use AutoGluon Tabular to deal with multimodal tabular data that contains text, numeric, and categorical columns. In AutoGluon, **raw text data** is treated as a first-class citizen of data tables. AutoGluon Tabular can help you train and combine a diverse set of models, including classical tabular models like LightGBM/RF/CatBoost as well as the multimodal network built on pretrained NLP models that is introduced in Section \"What's happening inside?\" of [tutorials/multimodal_text_tabular.ipynb](https://github.com/gidler/autogluon-tutorials/blob/main/tutorials/multimodal/multimodal_text_tabular.ipynb) (used by AutoGluon's `TextPredictor`)."
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": null,
41 | "id": "32cc0b3a",
42 | "metadata": {},
43 | "outputs": [],
44 | "source": [
45 | "%matplotlib inline\n",
46 | "import matplotlib.pyplot as plt\n",
47 | "import numpy as np\n",
48 | "import pandas as pd\n",
49 | "import pprint\n",
50 | "import random\n",
51 | "from autogluon.tabular import TabularPredictor\n",
52 | "\n",
53 | "np.random.seed(123)\n",
54 | "random.seed(123)"
55 | ]
56 | },
57 | {
58 | "cell_type": "markdown",
59 | "id": "7968688a",
60 | "metadata": {},
61 | "source": [
62 | "## Product Sentiment Analysis Dataset\n",
63 | "\n",
64 | "We consider the product sentiment analysis dataset from a [MachineHack hackathon](https://www.machinehack.com/hackathons/product_sentiment_classification_weekend_hackathon_19/leaderboard). The goal is to predict a user's sentiment towards a product given their review (raw text) and a categorical feature indicating the product's type (e.g., Tablet, Mobile, etc.). We have already split the original dataset into 90% for training and 10% for development/testing (if submitting your models to the hackathon, we recommend training them on 100% of the dataset)."
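,
"\n",
"For example, once the `train_df` and `dev_df` frames are loaded below, a final hackathon model could be fit on their concatenation (a sketch, not run in this tutorial):\n",
"\n",
"```\n",
"full_train_df = pd.concat([train_df, dev_df], ignore_index=True)  # train the final model on 100% of the labeled data\n",
"```\n"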
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": null,
70 | "id": "a36c2cbe",
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "!mkdir -p product_sentiment_machine_hack\n",
75 | "!wget https://autogluon-text-data.s3.amazonaws.com/multimodal_text/machine_hack_product_sentiment/train.csv -O product_sentiment_machine_hack/train.csv\n",
76 | "!wget https://autogluon-text-data.s3.amazonaws.com/multimodal_text/machine_hack_product_sentiment/dev.csv -O product_sentiment_machine_hack/dev.csv\n",
77 | "!wget https://autogluon-text-data.s3.amazonaws.com/multimodal_text/machine_hack_product_sentiment/test.csv -O product_sentiment_machine_hack/test.csv"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": null,
83 | "id": "ccb1b89a",
84 | "metadata": {},
85 | "outputs": [],
86 | "source": [
87 | "subsample_size = 2000 # for quick demo, try setting to larger values\n",
88 | "feature_columns = ['Product_Description', 'Product_Type']\n",
89 | "label = 'Sentiment'\n",
90 | "\n",
91 | "train_df = pd.read_csv('product_sentiment_machine_hack/train.csv', index_col=0).sample(subsample_size, random_state=123)\n",
92 | "dev_df = pd.read_csv('product_sentiment_machine_hack/dev.csv', index_col=0)\n",
93 | "test_df = pd.read_csv('product_sentiment_machine_hack/test.csv', index_col=0)\n",
94 | "\n",
95 | "train_df = train_df[feature_columns + [label]]\n",
96 | "dev_df = dev_df[feature_columns + [label]]\n",
97 | "test_df = test_df[feature_columns]\n",
98 | "print('Number of training samples:', len(train_df))\n",
99 | "print('Number of dev samples:', len(dev_df))\n",
100 | "print('Number of test samples:', len(test_df))"
101 | ]
102 | },
103 | {
104 | "cell_type": "markdown",
105 | "id": "29c5dbb3",
106 | "metadata": {},
107 | "source": [
108 | "There are two features in the dataset (the user's review of the product and the product's type) and four possible classes to predict."
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": null,
114 | "id": "8b718fc8",
115 | "metadata": {},
116 | "outputs": [],
117 | "source": [
118 | "train_df.head()"
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": null,
124 | "id": "f40e1337",
125 | "metadata": {},
126 | "outputs": [],
127 | "source": [
128 | "dev_df.head()"
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": null,
134 | "id": "f85e22c5",
135 | "metadata": {},
136 | "outputs": [],
137 | "source": [
138 | "test_df.head()"
139 | ]
140 | },
141 | {
142 | "cell_type": "markdown",
143 | "id": "5bcae9a6",
144 | "metadata": {},
145 | "source": [
146 | "## AutoGluon Tabular with Multimodal Support\n",
147 | "\n",
148 | "To utilize the `TextPredictor` model inside of `TabularPredictor`, we must specify `hyperparameters='multimodal'` in AutoGluon Tabular. Internally, this will train multiple tabular models as well as the `TextPredictor` model, and then combine them via either a weighted ensemble or stack ensemble, as explained in the [AutoGluon Tabular Paper](https://arxiv.org/pdf/2003.06505.pdf). If you do not specify `hyperparameters='multimodal'`, then AutoGluon Tabular will simply featurize text fields using N-grams and train only tabular models (which may work better if your text is mostly uncommon strings/vocabulary)."
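,
"\n",
"If you are curious which models the `'multimodal'` preset trains, you can inspect (and customize) the underlying configuration dictionary before passing it to `fit()`. A small sketch, assuming the AutoGluon v0.5 location of this helper:\n",
"\n",
"```\n",
"from autogluon.tabular.configs.hyperparameter_configs import get_hyperparameter_config\n",
"multimodal_config = get_hyperparameter_config('multimodal')  # dict mapping model keys to hyperparameter settings\n",
"print(multimodal_config.keys())\n",
"```\n"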
149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "id": "978e1ec0", 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "from autogluon.tabular import TabularPredictor\n", 159 | "predictor = TabularPredictor(label='Sentiment', path='ag_tabular_product_sentiment_multimodal')\n", 160 | "predictor.fit(train_df, hyperparameters='multimodal')" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "id": "14849d71", 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "predictor.leaderboard(dev_df)" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "id": "c2387098", 176 | "metadata": {}, 177 | "source": [ 178 | "## Improve the Performance with Stack Ensemble\n", 179 | "\n", 180 | "You can improve predictive performance by using stack ensembling. One way to turn it on is as follows:" 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "id": "0b34e348", 186 | "metadata": {}, 187 | "source": [ 188 | "```\n", 189 | "predictor.fit(train_df, hyperparameters='multimodal', num_bag_folds=5, num_stack_levels=1)\n", 190 | "```\n" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "id": "d79de7dd", 196 | "metadata": {}, 197 | "source": [ 198 | "or using:" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "id": "d59a4241", 204 | "metadata": {}, 205 | "source": [ 206 | "```\n", 207 | "predictor.fit(train_df, hyperparameters='multimodal', presets='best_quality')\n", 208 | "```\n" 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "id": "76dc3f4b", 214 | "metadata": {}, 215 | "source": [ 216 | "which will automatically select values for `num_stack_levels` (how many stacking layers) and `num_bag_folds` (how many folds to split data into during bagging).\n", 217 | "Stack ensembling can take much longer, so we won't run with this configuration here. You may explore more examples in https://github.com/awslabs/autogluon/tree/master/examples/text_prediction, which demonstrate how you can achieve top performance in competitions with a stack ensembling based solution." 218 | ] 219 | } 220 | ], 221 | "metadata": { 222 | "language_info": { 223 | "name": "python" 224 | } 225 | }, 226 | "nbformat": 4, 227 | "nbformat_minor": 5 228 | } 229 | -------------------------------------------------------------------------------- /tutorials/text_prediction/customization.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "c1dcc85c", 6 | "metadata": {}, 7 | "source": [ 8 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gidler/autogluon-tutorials/blob/main/tutorials/text_prediction/customization.ipynb)\n", 9 | "[![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/gidler/autogluon-tutorials/blob/main/tutorials/text_prediction/customization.ipynb)\n" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "id": "b106f1c8", 16 | "outputs": [], 17 | "metadata": {}, 18 | "source": [ 19 | "# Uncomment the code below and run this cell if AutoGluon is not yet installed in the kernel.\n", 20 | "# !pip install autogluon==0.5.0 # These tutorials are based on AutoGluon v0.5.0 and might not work with different versions." 
21 | ]
22 | },
23 | {
24 | "cell_type": "markdown",
25 | "id": "258f5664",
26 | "metadata": {},
27 | "source": [
28 | "# Text Prediction - Customization\n",
29 | "\n",
30 | "\n",
31 | "\n",
32 | "This tutorial introduces the presets of `TextPredictor` and how to customize hyperparameters."
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": null,
38 | "id": "61f1211e",
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "import numpy as np\n",
43 | "import warnings\n",
44 | "import autogluon as ag\n",
45 | "warnings.filterwarnings(\"ignore\")\n",
46 | "np.random.seed(123)"
47 | ]
48 | },
49 | {
50 | "cell_type": "markdown",
51 | "id": "aa38abc0",
52 | "metadata": {},
53 | "source": [
54 | "## Stanford Sentiment Treebank Data\n",
55 | "\n",
56 | "For demonstration, we use the Stanford Sentiment Treebank ([SST](https://nlp.stanford.edu/sentiment/)) dataset."
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": null,
62 | "id": "2be55dd8",
63 | "metadata": {},
64 | "outputs": [],
65 | "source": [
66 | "from autogluon.core import TabularDataset\n",
67 | "subsample_size = 1000 # subsample for faster demo, you may try a larger value\n",
68 | "train_data = TabularDataset(\"https://autogluon-text.s3-accelerate.amazonaws.com/glue/sst/train.parquet\")\n",
69 | "test_data = TabularDataset(\"https://autogluon-text.s3-accelerate.amazonaws.com/glue/sst/dev.parquet\")\n",
70 | "train_data = train_data.sample(n=subsample_size, random_state=0)\n",
71 | "train_data.head(10)"
72 | ]
73 | },
74 | {
75 | "cell_type": "markdown",
76 | "id": "c656baab",
77 | "metadata": {},
78 | "source": [
79 | "## Configure TextPredictor\n",
80 | "\n",
81 | "### Preset Configurations\n",
82 | "\n",
83 | "`TextPredictor` provides several simple preset configurations. Let's take a look at the available presets."
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": null,
89 | "id": "dc0de8c2",
90 | "metadata": {},
91 | "outputs": [],
92 | "source": [
93 | "from autogluon.text.text_prediction.presets import list_text_presets\n",
94 | "list_text_presets()"
95 | ]
96 | },
97 | {
98 | "cell_type": "markdown",
99 | "id": "44951e0c",
100 | "metadata": {},
101 | "source": [
102 | "You may be interested in the configuration differences behind the preset strings."
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": null,
108 | "id": "fa6d410e",
109 | "metadata": {},
110 | "outputs": [],
111 | "source": [
112 | "list_text_presets(verbose=True)"
113 | ]
114 | },
115 | {
116 | "cell_type": "markdown",
117 | "id": "c24f98ca",
118 | "metadata": {},
119 | "source": [
120 | "We can see that the main differences between the presets lie in their choices of Huggingface transformer checkpoints. Preset `default` has the same configuration as preset `high_quality`. The presets follow the rule of thumb that larger backbones tend to perform better, but at higher cost.\n",
121 | "\n",
122 | "Let's train a text predictor with preset `medium_quality_faster_train`."
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": null,
128 | "id": "667de396",
129 | "metadata": {},
130 | "outputs": [],
131 | "source": [
132 | "from autogluon.text import TextPredictor\n",
133 | "predictor = TextPredictor(eval_metric=\"acc\", label=\"label\")\n",
134 | "predictor.fit(\n",
135 | " train_data=train_data,\n",
136 | " presets=\"medium_quality_faster_train\",\n",
137 | " time_limit=60,\n",
138 | ")"
139 | ]
140 | },
141 | {
142 | "cell_type": "markdown",
143 | "id": "366bbe71",
144 | "metadata": {},
145 | "source": [
146 | "Below we report both `f1` and `acc` metrics for our predictions."
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": null,
152 | "id": "cb9da160",
153 | "metadata": {},
154 | "outputs": [],
155 | "source": [
156 | "predictor.evaluate(test_data, metrics=[\"f1\", \"acc\"])"
157 | ]
158 | },
159 | {
160 | "cell_type": "markdown",
161 | "id": "8a5f4977",
162 | "metadata": {},
163 | "source": [
164 | "The pre-registered configurations provide reasonable default hyperparameters. A common workflow is to first train a model with one of the presets and then tune some hyperparameters to see if the performance can be further improved.\n",
165 | "\n",
166 | "### Customize Hyperparameters\n",
167 | "\n",
168 | "Customizing hyperparameters is easy for `TextPredictor`. For example, you may want to try backbones beyond those in the presets. Since `TextPredictor` supports loading Huggingface transformers, you can choose any desired text backbone from the [Huggingface model zoo](https://huggingface.co/models), e.g., `prajjwal1/bert-tiny`."
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": null,
174 | "id": "b4a43df4",
175 | "metadata": {},
176 | "outputs": [],
177 | "source": [
178 | "from autogluon.text import TextPredictor\n",
179 | "predictor = TextPredictor(eval_metric=\"acc\", label=\"label\")\n",
180 | "predictor.fit(\n",
181 | " train_data=train_data,\n",
182 | " hyperparameters={\n",
183 | " \"model.hf_text.checkpoint_name\": \"prajjwal1/bert-tiny\",\n",
184 | " },\n",
185 | " time_limit=60,\n",
186 | ")"
187 | ]
188 | },
189 | {
190 | "cell_type": "code",
191 | "execution_count": null,
192 | "id": "897b6c40",
193 | "metadata": {},
194 | "outputs": [],
195 | "source": [
196 | "predictor.evaluate(test_data, metrics=[\"f1\", \"acc\"])"
197 | ]
198 | },
199 | {
200 | "cell_type": "markdown",
201 | "id": "c7c5ff97",
202 | "metadata": {},
203 | "source": [
204 | "`TextPredictor` also supports models that are not available in the [Huggingface model zoo](https://huggingface.co/models). To do this, you need to make sure that the model's definition follows Huggingface's AutoModel, AutoConfig, and AutoTokenizer conventions. Let's simulate a local model."
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": null,
210 | "id": "8a1c6258",
211 | "metadata": {},
212 | "outputs": [],
213 | "source": [
214 | "import os\n",
215 | "from transformers import AutoModel, AutoConfig, AutoTokenizer\n",
216 | "model_key = 'prajjwal1/bert-tiny'\n",
217 | "local_path = 'custom_local_bert_tiny'\n",
218 | "\n",
219 | "model = AutoModel.from_pretrained(model_key)\n",
220 | "config = AutoConfig.from_pretrained(model_key)\n",
221 | "tokenizer = AutoTokenizer.from_pretrained(model_key)\n",
222 | "\n",
223 | "model.save_pretrained(local_path)\n",
224 | "config.save_pretrained(local_path)\n",
225 | "tokenizer.save_pretrained(local_path)\n",
226 | "os.listdir(local_path)"
227 | ]
228 | },
229 | {
230 | "cell_type": "markdown",
231 | "id": "5116a0d0",
232 | "metadata": {},
233 | "source": [
234 | "Now we can use this local model in `TextPredictor`."
235 | ]
236 | },
237 | {
238 | "cell_type": "code",
239 | "execution_count": null,
240 | "id": "ce7b90f1",
241 | "metadata": {},
242 | "outputs": [],
243 | "source": [
244 | "from autogluon.text import TextPredictor\n",
245 | "predictor = TextPredictor(eval_metric=\"acc\", label=\"label\")\n",
246 | "predictor.fit(\n",
247 | " train_data=train_data,\n",
248 | " hyperparameters={\n",
249 | " \"model.hf_text.checkpoint_name\": \"custom_local_bert_tiny/\",\n",
250 | " },\n",
251 | " time_limit=60,\n",
252 | ")"
253 | ]
254 | },
255 | {
256 | "cell_type": "code",
257 | "execution_count": null,
258 | "id": "acbc2937",
259 | "metadata": {},
260 | "outputs": [],
261 | "source": [
262 | "predictor.evaluate(test_data, metrics=[\"f1\", \"acc\"])"
263 | ]
264 | }
265 | ],
266 | "metadata": {
267 | "language_info": {
268 | "name": "python"
269 | }
270 | },
271 | "nbformat": 4,
272 | "nbformat_minor": 5
273 | }
--------------------------------------------------------------------------------
/tutorials/text_prediction/multilingual_text.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "146e3623",
6 | "metadata": {},
7 | "source": [
8 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gidler/autogluon-tutorials/blob/main/tutorials/text_prediction/multilingual_text.ipynb)\n",
9 | "[![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/gidler/autogluon-tutorials/blob/main/tutorials/text_prediction/multilingual_text.ipynb)\n"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "id": "9c18a7e3",
16 | "outputs": [],
17 | "metadata": {},
18 | "source": [
19 | "# Uncomment the code below and run this cell if AutoGluon is not yet installed in the kernel.\n",
20 | "# !pip install autogluon==0.5.0 # These tutorials are based on AutoGluon v0.5.0 and might not work with different versions."
21 | ]
22 | },
23 | {
24 | "cell_type": "markdown",
25 | "id": "55de066d",
26 | "metadata": {},
27 | "source": [
28 | "# Text Prediction - Solving Multilingual Problems\n",
29 | "\n",
30 | "\n",
31 | "\n",
32 | "People around the world speak many languages. According to [SIL International](https://en.wikipedia.org/wiki/SIL_International)'s [Ethnologue: Languages of the World](https://en.wikipedia.org/wiki/Ethnologue), there are more than **7,100** spoken and signed languages. 
In fact, web data nowadays are highly multilingual and lots of real-world problems involve text written in languages other than English.\n", 33 | "\n", 34 | "In this tutorial, we introduce how AutoGluon Text can help you build multilingual models. For the purpose of demonstration, we use the [Cross-Lingual Amazon Product Review Sentiment](https://webis.de/data/webis-cls-10.html) dataset, which comprises about 800,000 Amazon product reviews in four languages: English, German, French, and Japanese. We will demonstrate how to use AutoGluon Text to build sentiment classification models on the German fold of this dataset in two ways:\n", 35 | "\n", 36 | "- Finetune the German BERT\n", 37 | "- Cross-lingual transfer from English to German\n", 38 | "\n", 39 | "## Load Dataset\n", 40 | "\n", 41 | "The [Cross-Lingual Amazon Product Review Sentiment](https://webis.de/data/webis-cls-10.html) dataset contains Amazon product reviews in four languages. Here, we load the English and German fold of the dataset. In the label column, `0` means negative sentiment and `1` means positive sentiment." 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "id": "50093cca", 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "!wget https://automl-mm-bench.s3.amazonaws.com/multilingual-datasets/amazon_review_sentiment_cross_lingual.zip -O amazon_review_sentiment_cross_lingual.zip\n", 52 | "!unzip -o amazon_review_sentiment_cross_lingual.zip -d ." 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "id": "d5fde1df", 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "import pandas as pd\n", 63 | "import warnings\n", 64 | "warnings.filterwarnings('ignore')\n", 65 | "\n", 66 | "train_de_df = pd.read_csv('amazon_review_sentiment_cross_lingual/de_train.tsv',\n", 67 | " sep='\\t', header=None, names=['label', 'text']) \\\n", 68 | " .sample(1000, random_state=123)\n", 69 | "train_de_df.reset_index(inplace=True, drop=True)\n", 70 | "\n", 71 | "test_de_df = pd.read_csv('amazon_review_sentiment_cross_lingual/de_test.tsv',\n", 72 | " sep='\\t', header=None, names=['label', 'text']) \\\n", 73 | " .sample(200, random_state=123)\n", 74 | "test_de_df.reset_index(inplace=True, drop=True)\n", 75 | "print(train_de_df)" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "id": "36822acd", 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "train_en_df = pd.read_csv('amazon_review_sentiment_cross_lingual/en_train.tsv',\n", 86 | " sep='\\t',\n", 87 | " header=None,\n", 88 | " names=['label', 'text']) \\\n", 89 | " .sample(1000, random_state=123)\n", 90 | "train_en_df.reset_index(inplace=True, drop=True)\n", 91 | "\n", 92 | "test_en_df = pd.read_csv('amazon_review_sentiment_cross_lingual/en_test.tsv',\n", 93 | " sep='\\t',\n", 94 | " header=None,\n", 95 | " names=['label', 'text']) \\\n", 96 | " .sample(200, random_state=123)\n", 97 | "test_en_df.reset_index(inplace=True, drop=True)\n", 98 | "print(train_en_df)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "id": "26f45655", 104 | "metadata": {}, 105 | "source": [ 106 | "## Finetune the German BERT\n", 107 | "\n", 108 | "Our first approach is to finetune the [German BERT model](https://www.deepset.ai/german-bert) pretrained by deepset. 
Since AutoGluon Text integrates with [Huggingface/Transformers](https://huggingface.co/docs/transformers/index) (as explained in [tutorials/text_prediction/customization.ipynb](https://github.com/gidler/autogluon-tutorials/blob/main/tutorials/text_prediction/customization.ipynb)), we directly load the German BERT model via Huggingface/Transformers, using the checkpoint key [bert-base-german-cased](https://huggingface.co/bert-base-german-cased). To simplify the experiment, we finetune for just 4 epochs."
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": null,
114 | "id": "f7b9dd21",
115 | "metadata": {},
116 | "outputs": [],
117 | "source": [
118 | "from autogluon.text import TextPredictor\n",
119 | "\n",
120 | "predictor = TextPredictor(label='label')\n",
121 | "predictor.fit(train_de_df,\n",
122 | " hyperparameters={\n",
123 | " 'model.hf_text.checkpoint_name': 'bert-base-german-cased',\n",
124 | " 'optimization.max_epochs': 4\n",
125 | " })"
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": null,
131 | "id": "3d05611b",
132 | "metadata": {},
133 | "outputs": [],
134 | "source": [
135 | "score = predictor.evaluate(test_de_df)\n",
136 | "print('Score on the German Testset:')\n",
137 | "print(score)"
138 | ]
139 | },
140 | {
141 | "cell_type": "code",
142 | "execution_count": null,
143 | "id": "5ddb7723",
144 | "metadata": {},
145 | "outputs": [],
146 | "source": [
147 | "score = predictor.evaluate(test_en_df)\n",
148 | "print('Score on the English Testset:')\n",
149 | "print(score)"
150 | ]
151 | },
152 | {
153 | "cell_type": "markdown",
154 | "id": "8b4b75e3",
155 | "metadata": {},
156 | "source": [
157 | "We can see that the model achieves good performance on the German dataset but performs poorly on the English dataset. Next, we will show how to enable cross-lingual transfer so you can get a model that can magically work for **both German and English**.\n",
158 | "\n",
159 | "## Cross-lingual Transfer\n",
160 | "\n",
161 | "In real-world scenarios, it is quite common to have trained a model for English and then want to extend the model to support other languages like German. This setting is also known as cross-lingual transfer. \n",
162 | "One way to solve the problem is to apply a machine translation model to translate the sentences from the other language (e.g., German) to English and apply the English model.\n",
163 | "However, as shown in [\"Unsupervised Cross-lingual Representation Learning at Scale\"](https://arxiv.org/pdf/1911.02116.pdf), there is a better and more cost-effective way to achieve cross-lingual transfer, enabled via large-scale multilingual pretraining.\n",
164 | "The authors showed that via large-scale pretraining, the backbone (called XLM-R) is able to conduct *zero-shot* cross-lingual transfer, meaning that you can directly apply the model trained on the English dataset to datasets in other languages. \n",
165 | "It also outperforms the baseline \"TRANSLATE-TEST\", in which data from other languages are translated to English so that the English model can be applied. \n",
166 | "\n",
167 | "In AutoGluon, you can just turn on `presets=\"multilingual\"` to load a backbone that is suitable for zero-shot transfer. \n",
168 | "Internally, we will automatically use state-of-the-art models like [DeBERTa-V3](https://arxiv.org/abs/2111.09543)."
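,
"\n",
"If you are curious which backbone the `multilingual` preset selects, you can reuse the preset-inspection helper from [tutorials/text_prediction/customization.ipynb](https://github.com/gidler/autogluon-tutorials/blob/main/tutorials/text_prediction/customization.ipynb) (a sketch; we assume the verbose listing is keyed by preset name):\n",
"\n",
"```\n",
"from autogluon.text.text_prediction.presets import list_text_presets\n",
"presets = list_text_presets(verbose=True)  # mapping from preset name to its configuration overrides\n",
"print(presets['multilingual'])\n",
"```\n"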
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": null,
174 | "id": "5ef1b406",
175 | "metadata": {},
176 | "outputs": [],
177 | "source": [
178 | "from autogluon.text import TextPredictor\n",
179 | "\n",
180 | "predictor = TextPredictor(label='label')\n",
181 | "predictor.fit(train_en_df,\n",
182 | " presets='multilingual',\n",
183 | " hyperparameters={\n",
184 | " 'optimization.max_epochs': 4\n",
185 | " })"
186 | ]
187 | },
188 | {
189 | "cell_type": "code",
190 | "execution_count": null,
191 | "id": "58832d70",
192 | "metadata": {},
193 | "outputs": [],
194 | "source": [
195 | "score_in_en = predictor.evaluate(test_en_df)\n",
196 | "print('Score in the English Testset:')\n",
197 | "print(score_in_en)"
198 | ]
199 | },
200 | {
201 | "cell_type": "code",
202 | "execution_count": null,
203 | "id": "4adc755b",
204 | "metadata": {},
205 | "outputs": [],
206 | "source": [
207 | "score_in_de = predictor.evaluate(test_de_df)\n",
208 | "print('Score in the German Testset:')\n",
209 | "print(score_in_de)"
210 | ]
211 | },
212 | {
213 | "cell_type": "markdown",
214 | "id": "52edb68d",
215 | "metadata": {},
216 | "source": [
217 | "We can see that the model works for both German and English!\n",
218 | "\n",
219 | "Let's also inspect the model's performance on Japanese:"
220 | ]
221 | },
222 | {
223 | "cell_type": "code",
224 | "execution_count": null,
225 | "id": "4bbbdf0c",
226 | "metadata": {},
227 | "outputs": [],
228 | "source": [
229 | "test_jp_df = pd.read_csv('amazon_review_sentiment_cross_lingual/jp_test.tsv',\n",
230 | " sep='\\t', header=None, names=['label', 'text']) \\\n",
231 | " .sample(200, random_state=123)\n",
232 | "test_jp_df.reset_index(inplace=True, drop=True)\n",
233 | "print(test_jp_df)"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": null,
239 | "id": "978662eb",
240 | "metadata": {},
241 | "outputs": [],
242 | "source": [
243 | "print('Negative label ratio of the Japanese test set =', test_jp_df['label'].value_counts()[0] / len(test_jp_df))\n",
244 | "score_in_jp = predictor.evaluate(test_jp_df)\n",
245 | "print('Score in the Japanese Testset:')\n",
246 | "print(score_in_jp)"
247 | ]
248 | },
249 | {
250 | "cell_type": "markdown",
251 | "id": "4a4b912a",
252 | "metadata": {},
253 | "source": [
254 | "Amazingly, the model also works for Japanese!"
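,
"\n",
"As a final sanity check, you can also feed the predictor a new example directly (a sketch; the review below is a made-up positive example meaning \"This product is wonderful\"):\n",
"\n",
"```\n",
"example = pd.DataFrame({'text': ['この製品は素晴らしいです']})\n",
"print(predictor.predict(example))  # expected to lean towards the positive class (1)\n",
"```\n"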
255 | ]
256 | }
257 | ],
258 | "metadata": {
259 | "language_info": {
260 | "name": "python"
261 | }
262 | },
263 | "nbformat": 4,
264 | "nbformat_minor": 5
265 | }
--------------------------------------------------------------------------------
/tutorials/timeseries/forecasting-faq.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "28169181",
6 | "metadata": {},
7 | "source": [
8 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gidler/autogluon-tutorials/blob/main/tutorials/timeseries/forecasting-faq.ipynb)\n",
9 | "[![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/gidler/autogluon-tutorials/blob/main/tutorials/timeseries/forecasting-faq.ipynb)\n"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "id": "61319df3",
16 | "outputs": [],
17 | "metadata": {},
18 | "source": [
19 | "# Uncomment the code below and run this cell if AutoGluon is not yet installed in the kernel.\n",
20 | "# !pip install autogluon==0.5.0 # These tutorials are based on AutoGluon v0.5.0 and might not work with different versions."
21 | ]
22 | },
23 | {
24 | "cell_type": "markdown",
25 | "id": "37e30e32",
26 | "metadata": {},
27 | "source": [
28 | "# FAQ - Time Series\n",
29 | "\n",
30 | "\n",
31 | "\n",
32 | "\n",
33 | "### Where can I find more information about the models/metrics?\n",
34 | "\n",
35 | "Metrics are implemented in the `autogluon.timeseries.evaluator` module. We also follow several of \n",
36 | "the conventions used by GluonTS in its evaluation.\n",
37 | "Please refer to\n",
38 | "the GluonTS [documentation](https://ts.gluon.ai/api/gluonts/gluonts.html) and \n",
39 | "[github](https://github.com/awslabs/gluon-ts) for further information.\n",
40 | "\n",
41 | "A detailed description of evaluation metrics is also available \n",
42 | "[here](https://docs.aws.amazon.com/forecast/latest/dg/metrics.html).\n",
43 | "\n",
44 | "### How can I get the most accurate forecast predictions?\n",
45 | "\n",
46 | "Generally, setting the `predictor.fit()` argument `presets=\"best_quality\"` will result in high accuracy. \n",
47 | "Alternative options include manually specifying hyperparameter search spaces for certain models and \n",
48 | "increasing the number of hyperparameter optimization trials.\n",
49 | "\n",
50 | "\n",
51 | "### Can I use GPUs for model training?\n",
52 | "\n",
53 | "Yes! Most of the models used by AutoGluon-Forecasting support GPU training, but training \n",
54 | "on a GPU is not required. To use one, make sure you have installed CUDA and the GPU version of MXNet.\n",
55 | "AutoGluon will try to automatically detect whether your machine has a GPU, and will train\n",
56 | "neural-network-based models on it. Multi-GPU training is not yet supported.\n",
57 | "\n",
58 | "\n",
59 | "### What machine is best for running AutoGluon-Forecasting?\n",
60 | "\n",
61 | "As an open-source library, AutoGluon-Forecasting can be run on any machine, including your laptop. \n",
62 | "Currently it is not necessary to use a GPU to train forecasting models, so CPU machines are fine, \n",
63 | "albeit slower for certain models. 
We recommend running on a machine with as much memory as possible \n", 64 | "and the best available GPU (for instance if using AWS EC2, we \n", 65 | "recommend [P3 instances](https://aws.amazon.com/ec2/instance-types/p3/)).\n", 66 | "\n", 67 | "\n", 68 | "### Issues not addressed here\n", 69 | "\n", 70 | "First search if your issue is addressed in the [tutorials](https://auto.gluon.ai/stable/tutorials/index.html), \n", 71 | "[examples](https://github.com/awslabs/autogluon/tree/master/examples/forecasting), \n", 72 | "[documentation](https://auto.gluon.ai/stable/api/autogluon.predictor.html), or [Github issues](https://github.com/awslabs/autogluon/issues) \n", 73 | "(search both Closed and Open issues). \n", 74 | "If it is not there, please open a [new Github Issue](https://github.com/awslabs/autogluon/issues/new) and \n", 75 | "clearly state your issue and clarify it relates to forecasting. \n", 76 | "\n", 77 | "If you have a bug, please include: your code (ideally set `verbosity=4` which will print out more details), the \n", 78 | "output printed during the code execution, and information about your operating system, Python version, and \n", 79 | "installed packages (output of `pip freeze`). \n", 80 | "Many user issues stem from incorrectly formatted data, so please describe your data as clearly as possible." 81 | ] 82 | } 83 | ], 84 | "metadata": { 85 | "language_info": { 86 | "name": "python" 87 | } 88 | }, 89 | "nbformat": 4, 90 | "nbformat_minor": 5 91 | } -------------------------------------------------------------------------------- /tutorials/timeseries/forecasting-quickstart.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "c15805ab", 6 | "metadata": {}, 7 | "source": [ 8 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gidler/autogluon-tutorials/blob/main/tutorials/timeseries/forecasting-quickstart.ipynb)\n", 9 | "[![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/gidler/autogluon-tutorials/blob/main/tutorials/timeseries/forecasting-quickstart.ipynb)\n" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "id": "3112933a", 16 | "outputs": [], 17 | "metadata": {}, 18 | "source": [ 19 | "# Uncomment the code below and run this cell if AutoGluon is not yet installed in the kernel.\n", 20 | "# !pip install autogluon==0.5.0 # These tutorials are based on AutoGluon v0.5.0 and might not work with different versions." 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "id": "b644b889", 26 | "metadata": {}, 27 | "source": [ 28 | "# Forecasting Time Series - Quick Start\n", 29 | "\n", 30 | "\n", 31 | "\n", 32 | "Via a simple `fit()` call, AutoGluon can train \n", 33 | "\n", 34 | "- simple forecasting models (e.g., ARIMA, ETS),\n", 35 | "- powerful neural network-based models (e.g., DeepAR, Transformer, MQ-CNN),\n", 36 | "- and fit greedy weighted ensembles built on these\n", 37 | "\n", 38 | "to produce multi-step ahead _probabilistic_ forecasts for univariate time series data. \n", 39 | "\n", 40 | "---\n", 41 | "**NOTE**\n", 42 | "\n", 43 | "`autogluon.timeseries` depends on Apache MXNet. 
Please install MXNet with:"
44 | ]
45 | },
46 | {
47 | "cell_type": "markdown",
48 | "id": "a31d4806",
49 | "metadata": {},
50 | "source": [
51 | "```shell\n",
52 | "python -m pip install \"mxnet>=1.9\"\n",
53 | "```\n"
54 | ]
55 | },
56 | {
57 | "cell_type": "markdown",
58 | "id": "249037f8",
59 | "metadata": {},
60 | "source": [
61 | "or, if you are using a GPU, install the MXNet build that matches your CUDA version (e.g., `mxnet-cu112` for CUDA 11.2). See the \n",
62 | "MXNet [documentation](https://mxnet.apache.org/versions/1.9.1/get_started?) for more info.\n",
63 | "\n",
64 | "---\n",
65 | "\n",
66 | "This tutorial demonstrates how to quickly start using AutoGluon to produce forecasts of COVID-19 cases in a \n",
67 | "country given [historical data from each country](https://www.kaggle.com/c/covid19-global-forecasting-week-4). \n",
68 | "\n",
69 | "`autogluon.timeseries` provides the `TimeSeriesPredictor` and `TimeSeriesDataFrame` classes for interacting \n",
70 | "with time series models. `TimeSeriesDataFrame` contains time series data. The `TimeSeriesPredictor` class \n",
71 | "provides the interface for fitting, tuning and selecting forecasting models."
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": null,
77 | "id": "9a9e15f4",
78 | "metadata": {},
79 | "outputs": [],
80 | "source": [
81 | "import pandas as pd\n",
82 | "from matplotlib import pyplot as plt\n",
83 | "\n",
84 | "from autogluon.timeseries import TimeSeriesPredictor, TimeSeriesDataFrame "
85 | ]
86 | },
87 | {
88 | "cell_type": "markdown",
89 | "id": "d37647fd",
90 | "metadata": {},
91 | "source": [
92 | "`TimeSeriesDataFrame` objects hold time series data, often with multiple \"items\" (time series) such as different\n",
93 | "products in demand forecasting. This setting is also sometimes referred to as a \"panel\" of time series.\n",
94 | "`TimeSeriesDataFrame` inherits from \n",
95 | "[Pandas DataFrame](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html), and \n",
96 | "the attributes and methods of `pandas.DataFrame`s are available in `TimeSeriesDataFrame`s.\n",
97 | "\n",
98 | "In our example, we work with COVID case data as of April 2020, where our goal is to forecast the number of confirmed COVID cases\n",
99 | "for each country in the data set.\n",
100 | "Below, we load time series data from an [AWS S3 bucket](https://aws.amazon.com/s3/) and prepare it for use in\n",
101 | "`autogluon.timeseries`. \n",
102 | "Note that we make sure the date field is parsed by pandas, and provide the columns containing\n",
103 | "the item (`id_column`) and timestamps (`timestamp_column`) to `TimeSeriesDataFrame`. \n",
104 | "We also plot the trajectories of COVID cases for two example countries:\n",
105 | "Germany and the UK."
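,
"\n",
"To make the expected \"long\" data format concrete, here is a minimal toy example with made-up values (a sketch):\n",
"\n",
"```\n",
"toy = pd.DataFrame({\n",
"    'name': ['A', 'A', 'B', 'B'],  # item id column: two short series\n",
"    'Date': pd.to_datetime(['2020-01-01', '2020-01-02'] * 2),  # timestamp column\n",
"    'ConfirmedCases': [1.0, 2.0, 5.0, 7.0],\n",
"})\n",
"TimeSeriesDataFrame.from_data_frame(toy, id_column='name', timestamp_column='Date')\n",
"```\n"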
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": null,
111 | "id": "dd685d7b",
112 | "metadata": {},
113 | "outputs": [],
114 | "source": [
115 | "df = pd.read_csv(\n",
116 | " \"https://autogluon.s3-us-west-2.amazonaws.com/datasets/CovidTimeSeries/train.csv\",\n",
117 | " parse_dates=[\"Date\"],\n",
118 | ")\n",
119 | "\n",
120 | "train_data = TimeSeriesDataFrame.from_data_frame(\n",
121 | " df,\n",
122 | " id_column=\"name\",\n",
123 | " timestamp_column=\"Date\",\n",
124 | ")\n",
125 | "\n",
126 | "plt.figure(figsize=(20, 3))\n",
127 | "for country in [\"United Kingdom_\", \"Germany_\"]:\n",
128 | " plt.plot(train_data.loc[country], label=country)\n",
129 | "plt.legend()"
130 | ]
131 | },
132 | {
133 | "cell_type": "markdown",
134 | "id": "27137f36",
135 | "metadata": {},
136 | "source": [
137 | "Note how `TimeSeriesDataFrame` objects organize the data with a `pandas.MultiIndex` where the first _level_ of the index \n",
138 | "corresponds to the item (here, country) and the second level contains the dates for which the values were observed.\n",
139 | "We can also use the `loc` accessor, as in pandas, to access individual country data."
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": null,
145 | "id": "47fa74ed",
146 | "metadata": {},
147 | "outputs": [],
148 | "source": [
149 | "train_data.head()"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": null,
155 | "id": "e8c21b30",
156 | "metadata": {},
157 | "outputs": [],
158 | "source": [
159 | "train_data.loc['Afghanistan_'].head()"
160 | ]
161 | },
162 | {
163 | "cell_type": "markdown",
164 | "id": "d2b6c4ce",
165 | "metadata": {},
166 | "source": [
167 | "The primary use case of `autogluon.timeseries` is time series forecasting. In our example, our goal is to train models on COVID case data \n",
168 | "that can forecast the future trajectory of cases given the past, for each country in the data set. \n",
169 | "By default, `autogluon.timeseries` supports multi-step ahead _probabilistic_ forecasting. That is, multiple time steps in the future \n",
170 | "can be forecast, provided that models are trained with the required number of steps (also known as the _forecast horizon_). \n",
171 | "Moreover, when trained models are used to predict the future, the library provides both `\"mean\"` \n",
172 | "forecasts (expected values of the time series in the future) and _quantiles_ of the forecast distribution.\n",
173 | "\n",
174 | "In order to train our forecasting models, we first split the data into training and test data sets. \n",
175 | "In forecasting, this is often done by excluding the last `prediction_length` steps of the data set during training and \n",
176 | "using only these steps to compute validation scores (also known as an \"out-of-time\" validation sample).\n",
177 | "We carry out this split via the `slice_by_timestep` method provided by `TimeSeriesDataFrame`, which takes python `slice` objects."
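,
"\n",
"For example, `slice(None, -5)` behaves just like `a[:-5]` on an ordinary Python sequence, keeping everything except the last five entries:\n",
"\n",
"```\n",
"a = list(range(10))\n",
"a[slice(None, -5)]  # [0, 1, 2, 3, 4] -- identical to a[:-5]\n",
"```\n"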
178 | ]
179 | },
180 | {
181 | "cell_type": "code",
182 | "execution_count": null,
183 | "id": "9bf7afc5",
184 | "metadata": {},
185 | "outputs": [],
186 | "source": [
187 | "prediction_length = 5\n",
188 | "\n",
189 | "test_data = train_data.copy() # the full data set\n",
190 | "\n",
191 | "# the data set with the last prediction_length time steps removed, i.e., akin to `a[:-5]`\n",
192 | "train_data = train_data.slice_by_timestep(slice(None, -prediction_length))"
193 | ]
194 | },
195 | {
196 | "cell_type": "markdown",
197 | "id": "b600aac9",
198 | "metadata": {},
199 | "source": [
200 | "Below, for a single country we plot the training and test data sets, showing how they overlap, and explicitly mark the forecast horizon of the\n",
201 | "test data set. The test scores will be computed on forecasts provided for this range."
202 | ]
203 | },
204 | {
205 | "cell_type": "code",
206 | "execution_count": null,
207 | "id": "82f8d7cc",
208 | "metadata": {},
209 | "outputs": [],
210 | "source": [
211 | "plt.figure(figsize=(20, 3))\n",
212 | "plt.plot(test_data.loc[\"Germany_\"], label=\"test\")\n",
213 | "plt.plot(train_data.loc[\"Germany_\"], label=\"train\")\n",
214 | "\n",
215 | "test_range = (\n",
216 | " test_data.loc[\"Germany_\"].index.max(),\n",
217 | " train_data.loc[\"Germany_\"].index.max(),\n",
218 | ")\n",
219 | "\n",
220 | "plt.fill_betweenx(\n",
221 | " y=(0, test_data.loc[\"Germany_\"][\"ConfirmedCases\"].max()),\n",
222 | " x1=test_range[0],\n",
223 | " x2=test_range[1],\n",
224 | " alpha=0.1,\n",
225 | " label=\"test forecast horizon\",\n",
226 | ")\n",
227 | "\n",
228 | "plt.legend()"
229 | ]
230 | },
231 | {
232 | "cell_type": "markdown",
233 | "id": "85ed3865",
234 | "metadata": {},
235 | "source": [
236 | "Below we instantiate a `TimeSeriesPredictor` object and instruct AutoGluon to fit models that can forecast up to \n",
237 | "5 time points into the future (`prediction_length`) and save them in the folder `./autogluon-covidforecast`.\n",
238 | "We also specify that AutoGluon should rank models according to mean absolute percentage error (MAPE) and that\n",
239 | "the target field to be forecasted is `\"ConfirmedCases\"`."
240 | ]
241 | },
242 | {
243 | "cell_type": "code",
244 | "execution_count": null,
245 | "id": "40ac8b7b",
246 | "metadata": {},
247 | "outputs": [],
248 | "source": [
249 | "predictor = TimeSeriesPredictor(\n",
250 | " path=\"autogluon-covidforecast\", \n",
251 | " target=\"ConfirmedCases\",\n",
252 | " prediction_length=prediction_length,\n",
253 | " eval_metric=\"MAPE\",\n",
254 | ")\n",
255 | "predictor.fit(\n",
256 | " train_data=train_data,\n",
257 | " presets=\"low_quality\",\n",
258 | ")"
259 | ]
260 | },
261 | {
262 | "cell_type": "markdown",
263 | "id": "c697b55d",
264 | "metadata": {},
265 | "source": [
266 | "In a short amount of time, AutoGluon fits four time series forecasting models on the training data.\n",
267 | "These are three neural network forecasters (DeepAR, MQCNN, and a simple feedforward neural network) and an exponential smoothing model with \n",
268 | "automatic parameter tuning (Auto-ETS).\n",
269 | "AutoGluon also constructs a weighted ensemble of these models capable of quantile forecasting.\n",
270 | "\n",
271 | "We can view the test performance of each model AutoGluon has trained via the `leaderboard()` method.\n",
272 | "We provide the test data set to the leaderboard function to see how well our fitted models are doing on the held-out time frame. 
\n", 273 | "In AutoGluon leaderboards, higher scores always correspond to better predictive performance. \n", 274 | "Therefore our MAPE scores are presented with a \"flipped\" sign, such that higher \"negative MAPE\"s correspond to better models." 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "id": "fd8f2ac6", 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "predictor.leaderboard(test_data, silent=True)" 285 | ] 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "id": "f72b8295", 290 | "metadata": {}, 291 | "source": [ 292 | "We can now use the `TimeSeriesPredictor` to look at actual forecasts. \n", 293 | "By default, AutoGluon will select the best performing model to forecast time series with. \n", 294 | "Let's use the predictor to compute forecasts, and plot forecasts for an example country." 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "id": "5cf97f34", 301 | "metadata": {}, 302 | "outputs": [], 303 | "source": [ 304 | "predictions = predictor.predict(train_data)" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "id": "492e1cc2", 311 | "metadata": {}, 312 | "outputs": [], 313 | "source": [ 314 | "plt.figure(figsize=(20, 3))\n", 315 | "\n", 316 | "ytrue = train_data.loc['France_'][\"ConfirmedCases\"]\n", 317 | "ypred = predictions.loc['France_']\n", 318 | "\n", 319 | "# prepend the last value of true range to predicted range for plotting continuity\n", 320 | "ypred.loc[ytrue.index[-1]] = [ytrue[-1]] * 10\n", 321 | "ypred = ypred.sort_index()\n", 322 | "\n", 323 | "ytrue_test = test_data.loc['France_'][\"ConfirmedCases\"][-5:]\n", 324 | "\n", 325 | "plt.plot(ytrue[-30:], label=\"Training Data\")\n", 326 | "plt.plot(ypred[\"mean\"], label=\"Mean Forecasts\")\n", 327 | "plt.plot(ytrue_test, label=\"Actual\")\n", 328 | "\n", 329 | "plt.fill_between(\n", 330 | " ypred.index, ypred[\"0.1\"], ypred[\"0.9\"], color=\"red\", alpha=0.1\n", 331 | ")\n", 332 | "plt.title(\"COVID Case Forecasts in France, compared to actual trajectory\")\n", 333 | "_ = plt.legend()" 334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "id": "a85d99d6", 339 | "metadata": {}, 340 | "source": [ 341 | "As we used a \"toy\" presets setting (`presets=\"low_quality\"`) our forecasts may appear to not be doing very well. In realistic scenarios, \n", 342 | "users can set `presets` to be one of: `\"best_quality\"`, `\"high_quality\"`, `\"good_quality\"`, `\"medium_quality\"`. \n", 343 | "Higher quality presets will generally produce superior forecasting accuracy but take longer to train and may produce less efficient models." 344 | ] 345 | } 346 | ], 347 | "metadata": { 348 | "language_info": { 349 | "name": "python" 350 | } 351 | }, 352 | "nbformat": 4, 353 | "nbformat_minor": 5 354 | } -------------------------------------------------------------------------------- /welcome-notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "9d6bb706-49b4-4544-b8cf-64ba886d6055", 6 | "metadata": {}, 7 | "source": [ 8 | "## AutoGluon Tutorial Notebooks\n", 9 | "AutoGluon maintains a set of [tutorial notebooks](https://github.com/gidler/autogluon-tutorials/tree/main/tutorials) that demonstrate Autogluon functionality, from basic usage to advanced features, providing an easy way to get started with AutoGluon." 
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "id": "c145bff2-0cbf-4e08-9e20-3a345541f38d",
15 | "metadata": {},
16 | "source": [
17 | "### Running Notebooks\n",
18 | "The [tutorial notebooks](https://github.com/gidler/autogluon-tutorials/tree/main/tutorials) can be run in several ways:\n",
19 | "\n",
20 | "- By cloning the [notebook repository](https://github.com/gidler/autogluon-tutorials) and running [Jupyter](https://docs.jupyter.org/en/latest/index.html) on your local machine\n",
21 | "\n",
22 | "- On [Google Colab](https://colab.research.google.com/) if you have a Google account [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gidler/autogluon-tutorials/blob/main/welcome-notebook.ipynb) \n",
23 | "\n",
24 | "- On [SageMaker Studio Lab](https://studiolab.sagemaker.aws/) with a free account [![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/gidler/autogluon-tutorials/blob/main/welcome-notebook.ipynb)"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "id": "447167dc-7c86-40d8-81da-6deba1e72817",
30 | "metadata": {},
31 | "source": [
32 | "The notebooks will allow you to immediately modify the code and start training state-of-the-art machine learning models."
33 | ]
34 | },
35 | {
36 | "cell_type": "markdown",
37 | "id": "34cf22fd-3001-4297-a1df-11647ef311aa",
38 | "metadata": {},
39 | "source": [
40 | "### Ready To Begin\n",
41 | "\n",
42 | "Check out a [Quickstart notebook](https://github.com/gidler/autogluon-tutorials/blob/main/tutorials/tabular_prediction/tabular-quickstart.ipynb) and start exploring AutoGluon."
43 | ]
44 | }
45 | ],
46 | "metadata": {
47 | "kernelspec": {
48 | "display_name": "Python 3 (ipykernel)",
49 | "language": "python",
50 | "name": "python3"
51 | },
52 | "language_info": {
53 | "codemirror_mode": {
54 | "name": "ipython",
55 | "version": 3
56 | },
57 | "file_extension": ".py",
58 | "mimetype": "text/x-python",
59 | "name": "python",
60 | "nbconvert_exporter": "python",
61 | "pygments_lexer": "ipython3",
62 | "version": "3.9.7"
63 | }
64 | },
65 | "nbformat": 4,
66 | "nbformat_minor": 5
67 | }
68 |
--------------------------------------------------------------------------------