├── CONTRIBUTING.md ├── LICENSE.txt ├── README.md ├── SECURITY.md └── demos ├── OracleAutoMLx_AnomalyDetection.html ├── OracleAutoMLx_AnomalyDetection.ipynb ├── OracleAutoMLx_Classification.html ├── OracleAutoMLx_Classification.ipynb ├── OracleAutoMLx_Classification_Text.html ├── OracleAutoMLx_Classification_Text.ipynb ├── OracleAutoMLx_ExecutionEngineSetup.html ├── OracleAutoMLx_ExecutionEngineSetup.ipynb ├── OracleAutoMLx_Fairness.html ├── OracleAutoMLx_Fairness.ipynb ├── OracleAutoMLx_Forecasting.html ├── OracleAutoMLx_Forecasting.ipynb ├── OracleAutoMLx_ImageClassification.html ├── OracleAutoMLx_ImageClassification.ipynb ├── OracleAutoMLx_Recommendation.html ├── OracleAutoMLx_Recommendation.ipynb ├── OracleAutoMLx_Regression.html ├── OracleAutoMLx_Regression.ipynb ├── OracleAutoMLx_train_model.html └── OracleAutoMLx_train_model.ipynb /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to this repository 2 | 3 | We welcome your contributions! There are multiple ways to contribute. 4 | 5 | ## Opening issues 6 | 7 | For bugs or enhancement requests, please file a GitHub issue unless it's 8 | security related. When filing a bug, remember that the better written the bug report is, 9 | the more likely it is to be fixed. If you think you've found a security 10 | vulnerability, do not raise a GitHub issue; instead, follow the instructions in our 11 | [security policy](./SECURITY.md). 12 | 13 | ## Contributing code 14 | 15 | We welcome your code contributions. Before submitting code via a pull request, 16 | you will need to have signed the [Oracle Contributor Agreement][OCA] (OCA) and 17 | your commits need to include the following line using the name and e-mail 18 | address you used to sign the OCA: 19 | 20 | ```text 21 | Signed-off-by: Your Name 22 | ``` 23 | 24 | This can be automatically added to pull requests by committing with `--signoff` 25 | or `-s`, e.g. 
26 | 27 | ```text 28 | git commit --signoff 29 | ``` 30 | 31 | Only pull requests from committers that can be verified as having signed the OCA 32 | can be accepted. 33 | 34 | ## Pull request process 35 | 36 | 1. Ensure there is an issue created to track and discuss the fix or enhancement 37 | you intend to submit. 38 | 1. Fork this repository. 39 | 1. Create a branch in your fork to implement the changes. We recommend using 40 | the issue number as part of your branch name, e.g. `1234-fixes`. 41 | 1. Ensure that any documentation is updated to reflect the changes required 42 | by your change. 43 | 1. Ensure that any samples are updated if the base image has been changed. 44 | 1. Submit the pull request. *Do not leave the pull request blank*. Explain exactly 45 | what your changes are meant to do and provide simple steps on how to validate 46 | your changes. Ensure that you reference the issue you created as well. 47 | 1. We will assign the pull request to 2-3 people for review before it is merged. 48 | 49 | ## Code of conduct 50 | 51 | Follow the [Golden Rule](https://en.wikipedia.org/wiki/Golden_Rule). If you'd 52 | like more specific guidelines, see the [Contributor Covenant Code of Conduct][COC]. 53 | 54 | [OCA]: https://oca.opensource.oracle.com 55 | [COC]: https://www.contributor-covenant.org/version/1/4/code-of-conduct/ 56 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2025 Oracle and/or its affiliates. 
2 | 3 | The Universal Permissive License (UPL), Version 1.0 4 | 5 | Subject to the condition set forth below, permission is hereby granted to any 6 | person obtaining a copy of this software, associated documentation and/or data 7 | (collectively the "Software"), free of charge and under any and all copyright 8 | rights in the Software, and any and all patent rights owned or freely 9 | licensable by each licensor hereunder covering either (i) the unmodified 10 | Software as contributed to or provided by such licensor, or (ii) the Larger 11 | Works (as defined below), to deal in both 12 | 13 | (a) the Software, and 14 | (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if 15 | one is included with the Software (each a "Larger Work" to which the Software 16 | is contributed by such licensors), 17 | 18 | without restriction, including without limitation the rights to copy, create 19 | derivative works of, display, perform, and distribute the Software and make, 20 | use, sell, offer for sale, import, export, have made, and have sold the 21 | Software and the Larger Work(s), and to sublicense the foregoing rights on 22 | either these or other terms. 23 | 24 | This license is subject to the following condition: 25 | The above copyright notice and either this complete permission notice or at 26 | a minimum a reference to the UPL must be included in all copies or 27 | substantial portions of the Software. 28 | 29 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 30 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 31 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 32 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 33 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 34 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 35 | SOFTWARE. 
36 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AutoMLx Demo Notebooks 2 | 3 | This repository contains demo notebooks (sample code) for the AutoMLx (automated machine learning and explainability) package from Oracle Labs. 4 | 5 | The notebooks are intended to show how to initialize, train and explain an AutoML model in a few lines of code. The notebooks also cover many of the advanced features available in the AutoMLx package. 6 | 7 | ## Installation 8 | 9 | Pre-executed copies of each of the demo notebooks are available as html files, which can be viewed without installing anything. 10 | 11 | To run the demo notebooks yourself: 12 | 1. Create a free trial account and a new project on the [OCI Data Science](https://apexapps.oracle.com/pls/apex/r/dbpm/livelabs/view-workshop?wid=673) service. 13 | 2. Install the [AutoMLx conda pack](https://docs.oracle.com/en-us/iaas/data-science/using/conda-automlx-fam.htm). 14 | 3. Import, open and run the demo notebooks. 15 | 16 | ## Documentation 17 | 18 | The demo notebooks in this repository serve as supplementary documentation for the AutoMLx package. The AutoMLx class documentation is available on [docs.oracle.com](https://docs.oracle.com/en-us/iaas/tools/automlx/latest/latest/). 
19 | 20 | ## Examples 21 | 22 | These demo notebooks cover the machine learning tasks supported by the AutoMLx package, as well as several additional features: 23 | - [Classification](./demos/OracleAutoMLx_Classification.ipynb) [(html)](./demos/OracleAutoMLx_Classification.html), 24 | - [Regression](./demos/OracleAutoMLx_Regression.ipynb) [(html)](./demos/OracleAutoMLx_Regression.html), 25 | - [Forecasting](./demos/OracleAutoMLx_Forecasting.ipynb) [(html)](./demos/OracleAutoMLx_Forecasting.html), 26 | - [Anomaly Detection](./demos/OracleAutoMLx_AnomalyDetection.ipynb) [(html)](./demos/OracleAutoMLx_AnomalyDetection.html), 27 | - [Text Classification](./demos/OracleAutoMLx_Classification_Text.ipynb) [(html)](./demos/OracleAutoMLx_Classification_Text.html), 28 | - [Image Classification](./demos/OracleAutoMLx_ImageClassification.ipynb) [(html)](./demos/OracleAutoMLx_ImageClassification.html), 29 | - [Recommendation](./demos/OracleAutoMLx_Recommendation.ipynb) [(html)](./demos/OracleAutoMLx_Recommendation.html), 30 | - [Fairness](./demos/OracleAutoMLx_Fairness.ipynb) [(html)](./demos/OracleAutoMLx_Fairness.html), 31 | - [Simple API](./demos/OracleAutoMLx_train_model.ipynb) [(html)](./demos/OracleAutoMLx_train_model.html), and 32 | - [Execution Engine Setup](./demos/OracleAutoMLx_ExecutionEngineSetup.ipynb) [(html)](./demos/OracleAutoMLx_ExecutionEngineSetup.html). 33 | 34 | ## Help 35 | 36 | Create a GitHub [issue](https://github.com/oracle-samples/automlx/issues). 37 | 38 | ## Contributing 39 | 40 | This project welcomes contributions from the community. Before submitting a pull request, please [review our contribution guide](./CONTRIBUTING.md). 41 | 42 | ## Security 43 | 44 | Please consult the [security guide](./SECURITY.md) for our responsible security vulnerability disclosure process. 45 | 46 | ## License 47 | 48 | Copyright (c) 2025 Oracle and/or its affiliates. 
49 | 50 | Released under the Universal Permissive License v1.0 as shown at [https://oss.oracle.com/licenses/upl/](https://oss.oracle.com/licenses/upl/). 51 | 52 | ## Third-Party Software 53 | 54 | Developers choosing to distribute a binary implementation of this project are responsible for obtaining and providing all required licenses and copyright notices for the third-party code used in order to ensure compliance with their respective open source licenses. 55 | 56 | ## Third-Party Datasets 57 | 58 | The AutoMLx demo notebooks download and use several third-party datasets to showcase AutoMLx functionality. 59 | 60 | | Dataset | License | Description | 61 | |--------------------|--------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 62 | | Airlines | No License | The dataset consists of a large number of records containing flight arrival and departure details for all commercial flights within the USA from October 1987 to April 2008. | 63 | | M4 Competition | [GPL 3.0](https://www.gnu.org/licenses/gpl-3.0.en.html) with [MOFC Terms](https://mofc.unic.ac.cy/terms-and-conditions/) | We select a series from the finance sector with weekly collection frequency (Finance W142) from the M4 forecasting competition. | 64 | | Census Income | [CC By 4.0](https://creativecommons.org/licenses/by/4.0/legalcode) | The Census Income dataset is used to predict whether income exceeds $50K/yr based on the 1994 US Census data. It is also frequently known as the "Adult" dataset. | 65 | | Credit Card Fraud | [ODC DbCL v1.0](https://opendatacommons.org/licenses/odbl/1-0/) | The dataset contains transactions made by credit cards in September 2013 by European cardholders. 
This dataset presents transactions that occurred in two days, where we have 492 frauds out of 284,807 transactions. | 66 | | California Housing | No License | This dataset was derived from the 1990 U.S. census, using one row per census block group. The target variable is the median house value for California districts. | 67 | | Newsgroup 20 | No License | The 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across 20 different newsgroups. | 68 | | PneumoniaMNIST | [CC By 4.0](https://creativecommons.org/licenses/by/4.0/legalcode) | The PneumoniaMNIST dataset consists of 5,856 pediatric chest X-ray images. | 69 | | MovieLens | [Custom](https://files.grouplens.org/datasets/movielens/ml-32m-README.html) | MovieLens 100K movie ratings: a stable benchmark dataset of 100,000 ratings from 1,000 users on 1,700 movies. | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Reporting security vulnerabilities 2 | 3 | Oracle values the independent security research community and believes that 4 | responsible disclosure of security vulnerabilities helps us ensure the security 5 | and privacy of all our users. 6 | 7 | Please do NOT raise a GitHub Issue to report a security vulnerability. If you 8 | believe you have found a security vulnerability, please submit a report to 9 | [secalert_us@oracle.com][1] preferably with a proof of concept. Please review 10 | some additional information on [how to report security vulnerabilities to Oracle][2]. 11 | We encourage people who contact Oracle Security to use email encryption using 12 | [our encryption key][3]. 13 | 14 | We ask that you do not use other channels or contact the project maintainers 15 | directly. 16 | 17 | Non-vulnerability related security issues, including ideas for new or improved 18 | security features, are welcome on GitHub Issues. 
19 | 20 | ## Security updates, alerts and bulletins 21 | 22 | Security updates will be released on a regular cadence. Many of our projects 23 | will typically release security fixes in conjunction with the 24 | Oracle Critical Patch Update program. Additional 25 | information, including past advisories, is available on our [security alerts][4] 26 | page. 27 | 28 | ## Security-related information 29 | 30 | We will provide security-related information such as a threat model, considerations 31 | for secure use, or any known security issues in our documentation. Please note 32 | that labs and sample code are intended to demonstrate a concept and may not be 33 | sufficiently hardened for production use. 34 | 35 | [1]: mailto:secalert_us@oracle.com 36 | [2]: https://www.oracle.com/corporate/security-practices/assurance/vulnerability/reporting.html 37 | [3]: https://www.oracle.com/security-alerts/encryptionkey.html 38 | [4]: https://www.oracle.com/security-alerts/ 39 | -------------------------------------------------------------------------------- /demos/OracleAutoMLx_ExecutionEngineSetup.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "93cb42a9", 6 | "metadata": { 7 | "lines_to_next_cell": 0 8 | }, 9 | "source": [ 10 | " ***\n", 11 | " # Using AutoMLx Execution engine\n", 12 | "by the Oracle AutoMLx Team\n", 13 | "\n", 14 | " ***" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "id": "79b7e8be", 20 | "metadata": { 21 | "lines_to_next_cell": 0 22 | }, 23 | "source": [ 24 | " Execution Engine Setup Notebook.\n", 25 | "\n", 26 | " Copyright © 2024, Oracle and/or its affiliates.\n", 27 | "\n", 28 | " Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "id": "ce38d50b", 34 | "metadata": {}, 35 | "source": [ 36 | " ## Overview of this Notebook\n", 37 | "\n", 38 | " In this notebook we will showcase the different options provided by the AutoMLx execution engine.\n", 39 | "\n", 40 | " ## Prerequisites\n", 41 | "\n", 42 | " - Experience level: intermediate to advanced (Python and Machine Learning)\n", 43 | " - Professional experience: Some industry experience" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "id": "420f2021", 49 | "metadata": { 50 | "lines_to_next_cell": 0 51 | }, 52 | "source": [ 53 | "## Table of Contents\n", 54 | "\n", 55 | "- Engine configuration\n", 56 | " - Imported modules\n", 57 | " - Default engine setup\n", 58 | " - Custom setup for all engine types\n", 59 | "- Ray Engine\n", 60 | " - Configuring the Ray Engine\n", 61 | " - Multi-node Ray cluster\n", 62 | " - Creating a Ray cluster through AutoMLx's utils\n", 63 | " - Creating a Ray cluster manually\n", 64 | " - Creating a Ray cluster through the launcher\n", 65 | " - Manual TLS authentication setup\n", 66 | " - Connecting AutoMLx backend to an existing Ray cluster \n", 67 | " - Stopping the Ray cluster \n", 68 | "- Multiprocessing Engine\n", 69 | "- Threading Engine\n", 70 | "- Differences between engines \n", 71 | "- References " 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "id": "ae1419da", 77 | "metadata": { 78 | "lines_to_next_cell": 0 79 | }, 80 | "source": [ 81 | " \n", 82 | "# Engine configuration\n", 83 | "The AutoMLx package is compatible with 
multiple distributed execution engines. This section will showcase how to start an AutoMLx engine instance with default and custom configurations.\n" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "id": "e6e6aabc", 89 | "metadata": { 90 | "lines_to_next_cell": 0 91 | }, 92 | "source": [ 93 | "\n", 94 | "## Imported modules\n", 95 | "Note that all of the engine-operation imports here are fully optional, as the engine will be initialized and shut down with default settings if these functions are not used." 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "id": "902d0b6a", 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "# To run an example classification task with AutoMLx\n", 106 | "from tempfile import TemporaryDirectory\n", 107 | "from pathlib import Path\n", 108 | "\n", 109 | "from sklearn.datasets import fetch_openml\n", 110 | "from sklearn.model_selection import train_test_split\n", 111 | "\n", 112 | "# Code for engine operations\n", 113 | "from automlx import Pipeline, init, shutdown\n", 114 | "from automlx._backend.utils import (\n", 115 | " TLSConfig,\n", 116 | " initialize_ray_head_node,\n", 117 | " initialize_ray_worker_node,\n", 118 | " stop_ray_cluster,\n", 119 | ")" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "id": "985b2386", 125 | "metadata": {}, 126 | "source": [ 127 | " \n", 128 | "## Default engine setup\n", 129 | "The AutoMLx package offers the function `init` to initialize the parallelization engine.\n", 130 | "This function can be used to manually set up the engine options.\n", 131 | "However, if not used, AutoMLx will automatically set up the engine with default options as soon as it is needed.\n", 132 | "At the end of an AutoMLx Pipeline, `shutdown` can be called to reset the parallelization engine. 
If not manually called, the engine will stay alive until the Python process finishes, or a SIGKILL is sent (this happens when the program is forcefully terminated).\n", 133 | "This means it is generally not needed to manually call `shutdown`.\n", 134 | "\n", 135 | "[Ray](https://docs.ray.io/en/latest/index.html) is the default execution backend for AutoMLx. It provides the compute layer for parallel processing.\n", 136 | "It is an open source project that makes it simple to scale any compute-intensive Python workload.\n", 137 | "\n", 138 | "In the case of the Ray engine, AutoMLx offers both multi-node and single-node (local cluster) options.\n", 139 | "By \"local cluster\" we mean a cluster of worker threads within the same compute node that asynchronously use the available CPU cores.\n", 140 | "On the other hand, by \"multi-node\" we mean a cluster of worker threads belonging to multiple separate nodes that can interact with a single head node via Ray to schedule tasks.\n", 141 | "By default, AutoMLx starts a local cluster Ray Engine that has access to all the CPUs and GPUs available in the current machine.\n" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "id": "c32cc567", 148 | "metadata": { 149 | "lines_to_next_cell": 0 150 | }, 151 | "outputs": [], 152 | "source": [ 153 | "# Example classification pipeline\n", 154 | "dataset = fetch_openml(name=\"adult\", version=1, as_frame=True)\n", 155 | "df, y = dataset.data, dataset.target\n", 156 | "\n", 157 | "# Several of the columns are incorrectly labeled as category type in the original dataset\n", 158 | "numeric_columns = [\"age\", \"capitalgain\", \"capitalloss\", \"hoursperweek\"]\n", 159 | "for col in df.columns:\n", 160 | " if col in numeric_columns:\n", 161 | " df[col] = df[col].astype(int)\n", 162 | "\n", 163 | "\n", 164 | "X_train, X_test, y_train, y_test = train_test_split(\n", 165 | " df, y.map({\">50K\": 1, \"<=50K\": 0}).astype(int), train_size=0.7, 
random_state=0\n", 166 | ")\n", 167 | "\n", 168 | "est1 = Pipeline(task=\"classification\")\n", 169 | "est1.fit(X_train, y_train)\n", 170 | "\n", 171 | "shutdown()" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "id": "31168c31", 177 | "metadata": {}, 178 | "source": [ 179 | " \n", 180 | "## Custom setup for all engine types\n", 181 | "The `engine` parameter of `init` can accept three values (`ray` which is the default, `local` for multiprocessing, and `threading`) to match the three execution engines offered by AutoMLx.\n", 182 | "\n", 183 | "The `init` function can also customize the resources used by the individual trials through the following parameters:\n", 184 | "- `n_jobs` : Specifies the number of parallel Jobs that AutoMLx can run concurrently\n", 185 | "- `model_n_jobs` : Specifies the number of cores used by every Job to train a model\n", 186 | "\n", 187 | "For example, if you have a node where you want to preserve half the CPU cores for some orthogonal tasks (e.g., an HTTPS server),\n", 188 | "you may limit AutoMLx to a certain number of cores: if we have 10 virtual CPUs (Intel Hyper-threading or AMD SMT enabled) but want to run at most 5 Jobs using 1 core each, we can set `n_jobs` to 5 and `model_n_jobs` to 1." 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "id": "dcb8d479", 195 | "metadata": { 196 | "lines_to_next_cell": 0 197 | }, 198 | "outputs": [], 199 | "source": [ 200 | "init(engine_opts={\"n_jobs\": 5, \"model_n_jobs\": 1})" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "id": "9835d701", 206 | "metadata": {}, 207 | "source": [ 208 | " \n", 209 | " # Ray Engine\n", 210 | "This section explains how to use AutoMLx to start a local cluster, as well as how to start a multi-node Ray cluster and connect the AutoMLx engine to it." 
211 | ] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "id": "16409de2", 216 | "metadata": { 217 | "lines_to_next_cell": 0 218 | }, 219 | "source": [ 220 | " \n", 221 | " ## Configuring the Ray Engine\n", 222 | "The cluster configuration can be customized through the `ray_setup` parameter.\n", 223 | "`ray_setup` is a dictionary that specifies all the options to be passed to [`ray.init`](https://docs.ray.io/en/latest/ray-core/api/doc/ray.init.html).\n", 224 | "For instance, it can be used to disable the dashboard, a useful web application showing a bird's eye view of what is happening in each worker.\n", 225 | "The dashboard is active by default, but since it requires resources and occupies an additional port, in some cases the user might want to disable it.\n", 226 | "In this example, we start a local Ray cluster limited to one CPU of the current machine, and with the dashboard disabled.\n", 227 | "Note that Ray does not constrain execution to the specified resources; it uses the resource specification only for scheduling. For instance, specifying\n", 228 | "5 CPUs to be used will not guarantee that only 5 CPUs are used; instead, it guarantees that Ray will limit the number of concurrent tasks to ensure that the sum of CPUs specified for all tasks does not exceed 5." 
229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "id": "3a600858", 235 | "metadata": { 236 | "lines_to_next_cell": 0 237 | }, 238 | "outputs": [], 239 | "source": [ 240 | "init(engine_opts={\"ray_setup\": {\"num_cpus\": 1, \"include_dashboard\": False}})" 241 | ] 242 | }, 243 | { 244 | "cell_type": "markdown", 245 | "id": "7887e2d4", 246 | "metadata": {}, 247 | "source": [ 248 | " \n", 249 | " ## Multi-node Ray cluster\n", 250 | "This section will show how to start a custom multi-node Ray cluster and configure it properly.\n", 251 | "\n", 252 | "Generally, a multi-node cluster will prove useful when more parallelization is needed than a single machine can provide.\n", 253 | "Moreover, several AutoMLx instances running on different machines will be able to connect to the same multi-node cluster and participate in the same\n", 254 | "scheduling of resources.\n", 255 | "\n", 256 | "A multi-node Ray cluster can be started in several ways, but we recommend using AutoMLx's utils to do so,\n", 257 | "as they wrap the Ray commands in a straightforward and intuitive manner, especially when it comes to setting up TLS.\n", 258 | "Ray needs a `head` node to which the several `worker` nodes can be connected.\n", 259 | "The head node needs to be set up first, and needs to be reachable from all of the worker nodes and vice versa.\n", 260 | "Moreover, Ray will open up several ports, some of which are customizable, while others are always chosen at random from the open ones. All the used ports 
The default ports can be found [here](https://docs.ray.io/en/latest/ray-core/configure.html#ports-configurations).\n", 262 | "After setting up the cluster, AutoMLx can connect to it through the `init` method from whichever node belongs to it (either head or workers).\n", 263 | "\n", 264 | "\"Multi-node\n", 265 | "\n", 266 | " \n", 267 | " ### Creating a Ray cluster through utils provided by AutoMLx\n", 268 | "AutoMLx provides utils that wrap the Ray cluster setup. In particular, it allows for the initialization of the `head` node and `worker` nodes.\n", 269 | "To begin, from a shell in the head node, run the following command to set up the Ray Engine with default settings and ports. Refer to documentation of the method to explore customization options." 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "id": "c96b1786", 276 | "metadata": {}, 277 | "outputs": [], 278 | "source": [ 279 | "temp_directory = TemporaryDirectory(dir=str(Path.cwd())).name\n", 280 | "head_ip = initialize_ray_head_node(temp_dir=temp_directory)" 281 | ] 282 | }, 283 | { 284 | "cell_type": "markdown", 285 | "id": "012f9b1d", 286 | "metadata": {}, 287 | "source": [ 288 | "The following action will be to run this method on each of the worker nodes:" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": null, 294 | "id": "b80fbdf2", 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [ 298 | "initialize_ray_worker_node(head_address=head_ip, temp_dir=temp_directory)" 299 | ] 300 | }, 301 | { 302 | "cell_type": "markdown", 303 | "id": "f0312314", 304 | "metadata": {}, 305 | "source": [ 306 | "This will initialize the worker node by connecting to the IP address and default port (6379, can be changed) to the head node.\n", 307 | "\n", 308 | "Ray also offers the capability to setup a TLS connection between its `gRPC` channels (https://docs.ray.io/en/latest/ray-core/configure.html#tls-authentication).\n", 309 | "gRPC is a Remote 
Procedure Call protocol used by Ray to allow the workers to communicate, and TLS (Transport Layer Security) encrypts the exchanged messages to prevent malicious agents from eavesdropping on the communication.\n", 310 | "Establishing end-to-end security of data is optional but offers secure communication between cluster nodes, often at a negligible runtime overhead. We provide utilities to quickly create a TLSConfig as follows:" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": null, 316 | "id": "bf4505bb", 317 | "metadata": {}, 318 | "outputs": [], 319 | "source": [ 320 | "node_certificate_location = \"placeholder.crt\"\n", 321 | "node_private_key_location = \"placeholder.key\"\n", 322 | "certification_authority_certificate_location = \"placeholder.crt\"\n", 323 | "tls_conf = TLSConfig(\n", 324 | " node_certificate_location,\n", 325 | " node_private_key_location,\n", 326 | " certification_authority_certificate_location,\n", 327 | ")" 328 | ] 329 | }, 330 | { 331 | "cell_type": "markdown", 332 | "id": "334924d0", 333 | "metadata": {}, 334 | "source": [ 335 | "A TLS connection requires several certificate and key files. The certificate issued by the certification authority will need to be in each of the nodes.\n", 336 | "Ray offers some [scripts that can be used as reference for generating these files](https://raw.githubusercontent.com/ray-project/ray/master/doc/source/cluster/kubernetes/configs/static-ray-cluster.tls.yaml).\n", 337 | "You can refer to the `gencert_head` and `gencert_worker` shell scripts. 
These are designed for Linux systems.\n", 338 | "Note that creating keys and certificates will usually require sudo rights.\n", 339 | "To finish your TLS cluster setup, run the previous methods with the TLSConfig as a parameter.\n", 340 | "On the head node:" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": null, 346 | "id": "923de00a", 347 | "metadata": {}, 348 | "outputs": [], 349 | "source": [ 350 | "head_ip = initialize_ray_head_node(tls_config=tls_conf, temp_dir=temp_directory)" 351 | ] 352 | }, 353 | { 354 | "cell_type": "markdown", 355 | "id": "222be822", 356 | "metadata": {}, 357 | "source": [ 358 | "On the worker nodes:" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "id": "c7ac10d1", 365 | "metadata": {}, 366 | "outputs": [], 367 | "source": [ 368 | "initialize_ray_worker_node(head_address=head_ip, tls_config=tls_conf, temp_dir=temp_directory)" 369 | ] 370 | }, 371 | { 372 | "cell_type": "markdown", 373 | "id": "7e56aefb", 374 | "metadata": { 375 | "lines_to_next_cell": 0 376 | }, 377 | "source": [ 378 | " \n", 379 | " ### Creating a Ray cluster manually\n", 380 | "A Ray cluster can also be started through the command-line interface using the following commands:\n", 381 | "- `ray start --head --port=<port>` : This command has to be run on the head node\n", 382 | "- `ray start --address=<head-node-address:port>` : This command has to be run on the worker nodes\n", 383 | "\n", 384 | "The Ray cluster can later be shut down by running `ray stop` on every node.\n", 385 | "\n", 386 | "More information on how to set up a cluster manually can be found [here](https://docs.ray.io/en/latest/cluster/vms/user-guides/launching-clusters/on-premises.html#manually-set-up-a-ray-cluster)\n" 387 | ] 388 | }, 389 | { 390 | "cell_type": "markdown", 391 | "id": "c0d5b8f8", 392 | "metadata": { 393 | "lines_to_next_cell": 0 394 | }, 395 | "source": [ 396 | " \n", 397 | " ### Creating a Ray cluster through the launcher\n", 398 
| "A Ray cluster can also be created in an automated way from any machine using a YAML config file.\n", 399 | "\n", 400 | "`ray up <config.yaml>` can be used to start the cluster and set up the head node and its workers.\n", 401 | "\n", 402 | "The cluster can then be torn down using `ray down <config.yaml>`.\n", 403 | "\n", 404 | "An example config file can be found [here](https://github.com/ray-project/ray/tree/eacc763c84d47c9c5b86b26a32fd62c685be84e6/python/ray/autoscaler/local/example-full.yaml).\n", 405 | "\n", 406 | "A more detailed explanation of the cluster setup can also be found in the [official Ray documentation website](https://docs.ray.io/en/latest/cluster/vms/user-guides/launching-clusters/on-premises.html#start-ray-with-the-ray-cluster-launcher)\n" 407 | ] 408 | }, 409 | { 410 | "cell_type": "markdown", 411 | "id": "b9fc01cc", 412 | "metadata": { 413 | "lines_to_next_cell": 0 414 | }, 415 | "source": [ 416 | " \n", 417 | " ### Manual TLS authentication setup\n", 418 | "Instead of using AutoMLx's utils for TLS setup, you can also set it up manually.\n", 419 | "TLS authentication must be configured before the Ray cluster is started, by specifying the following environment variables:\n", 420 | "- `RAY_USE_TLS`: Either 1 or 0 to use/not-use TLS. If this is set to 1 then all of the environment variables below must be set. 
Default: 0.\n", 421 | "- `RAY_TLS_SERVER_CERT`: Location of a certificate file (tls.crt), which is presented to other endpoints to achieve mutual authentication.\n", 422 | "- `RAY_TLS_SERVER_KEY`: Location of a private key file (tls.key) for the current node, which is the cryptographic means to prove to other endpoints that you are the authorized user of a given certificate.\n", 423 | "- `RAY_TLS_CA_CERT`: Location of a CA certificate file (ca.crt) present in all nodes, which allows TLS to decide whether an endpoint’s certificate has been signed by the correct authority.\n", 424 | "\n", 425 | "More information can be found [here](https://docs.ray.io/en/latest/ray-core/configure.html#tls-authentication)" 426 | ] 427 | }, 428 | { 429 | "cell_type": "markdown", 430 | "id": "936d7077", 431 | "metadata": {}, 432 | "source": [ 433 | " \n", 434 | "\n", 435 | "### Connecting AutoMLx backend to an existing Ray cluster\n", 436 | "\n", 437 | "Finally, regardless of how the cluster was created, you will be able to connect to it by initializing the engine with the `cluster_mode` option set to True:" 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": null, 443 | "id": "d027c7b0", 444 | "metadata": { 445 | "lines_to_next_cell": 0 446 | }, 447 | "outputs": [], 448 | "source": [ 449 | "init(engine_opts={\"cluster_mode\": True, \"ray_setup\": {\"_temp_dir\": temp_directory}})" 450 | ] 451 | }, 452 | { 453 | "cell_type": "markdown", 454 | "id": "72ecc3f1", 455 | "metadata": { 456 | "lines_to_next_cell": 0 457 | }, 458 | "source": [ 459 | "Ray will autodetect the running cluster and connect to it.\n", 460 | "Note that this can be done from any of the nodes in the cluster (not necessarily the head node) as all of the nodes will be able to schedule tasks\n", 461 | "across all of the available cluster resources." 
462 | ] 463 | }, 464 | { 465 | "cell_type": "markdown", 466 | "id": "0ce6823a", 467 | "metadata": { 468 | "lines_to_next_cell": 0 469 | }, 470 | "source": [ 471 | " \n", 472 | "\n", 473 | "### Stopping the Ray cluster" 474 | ] 475 | }, 476 | { 477 | "cell_type": "markdown", 478 | "id": "cecca488", 479 | "metadata": { 480 | "lines_to_next_cell": 0 481 | }, 482 | "source": [ 483 | "In the case of the Ray engine, the local Ray instance will be automatically stopped at the end of the Python execution.\n", 484 | "However, in the case of multi-node execution, each of the nodes will need to stop its Ray instance in order to fully tear down the cluster.\n", 485 | "To do so, you can run this method on each of the nodes:" 486 | ] 487 | }, 488 | { 489 | "cell_type": "code", 490 | "execution_count": null, 491 | "id": "7aaf74cd", 492 | "metadata": { 493 | "lines_to_next_cell": 0 494 | }, 495 | "outputs": [], 496 | "source": [ 497 | "stop_ray_cluster()" 498 | ] 499 | }, 500 | { 501 | "cell_type": "markdown", 502 | "id": "3014fc89", 503 | "metadata": { 504 | "lines_to_next_cell": 0 505 | }, 506 | "source": [ 507 | "Alternatively, run `ray stop --force` from your terminal, again on each of the nodes." 508 | ] 509 | }, 510 | { 511 | "cell_type": "markdown", 512 | "id": "2eda675f", 513 | "metadata": {}, 514 | "source": [ 515 | " \n", 516 | "## Multiprocessing Engine\n", 517 | "To use AutoMLx with Python multiprocessing, we need to pass the `engine=\"local\"` parameter to the `init` function."
518 | ] 519 | }, 520 | { 521 | "cell_type": "code", 522 | "execution_count": null, 523 | "id": "b23cbc38", 524 | "metadata": { 525 | "lines_to_next_cell": 0 526 | }, 527 | "outputs": [], 528 | "source": [ 529 | "init(engine=\"local\")" 530 | ] 531 | }, 532 | { 533 | "cell_type": "markdown", 534 | "id": "caf9fb0a", 535 | "metadata": {}, 536 | "source": [ 537 | " \n", 538 | "## Threading Engine\n", 539 | "To use AutoMLx with Python threading, we need to pass the `engine=\"threading\"` parameter to the `init` function." 540 | ] 541 | }, 542 | { 543 | "cell_type": "code", 544 | "execution_count": null, 545 | "id": "f4ed8577", 546 | "metadata": { 547 | "lines_to_next_cell": 0 548 | }, 549 | "outputs": [], 550 | "source": [ 551 | "init(engine=\"threading\")" 552 | ] 553 | }, 554 | { 555 | "cell_type": "markdown", 556 | "id": "4a995d7a", 557 | "metadata": { 558 | "lines_to_next_cell": 0 559 | }, 560 | "source": [ 561 | "\n", 562 | "## Differences between Execution Engines\n", 563 | "Be aware that engines other than Ray cannot achieve multi-node execution and are generally slower than Ray.\n", 564 | "On average, Ray will be faster than multiprocessing, which is in turn faster than threading.\n", 565 | "Note that the threading engine is constrained by Python's Global Interpreter Lock (GIL), and hence cannot effectively use more than one thread (that is, one CPU) for its compute. However, this backend might be suitable for prediction service deployments in production containers,\n", 566 | "as it does not require file writes to disk (such as those needed by Ray and multiprocessing to escape the GIL).\n", 567 | "Using the multiprocessing and threading engines might be suitable for single-node executions with low levels of parallelization (that is, a few concurrent tasks).\n", 568 | "Moreover, Ray is the only engine that supports distributed training.\n", 569 | "Thus, Ray is generally the recommended engine for AutoMLx."
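The GIL limitation described above can be demonstrated with nothing but the Python standard library (this sketch is independent of AutoMLx): running several CPU-bound tasks on a thread pool takes roughly as long as running them serially, because only one thread executes Python bytecode at a time.

```python
import time
from concurrent.futures import ThreadPoolExecutor

def cpu_bound(n):
    # Pure-Python arithmetic never releases the GIL.
    total = 0
    for i in range(n):
        total += i * i
    return total

N, TASKS = 200_000, 4

start = time.perf_counter()
serial = [cpu_bound(N) for _ in range(TASKS)]
serial_time = time.perf_counter() - start

start = time.perf_counter()
with ThreadPoolExecutor(max_workers=TASKS) as pool:
    threaded = list(pool.map(cpu_bound, [N] * TASKS))
threaded_time = time.perf_counter() - start

# Despite four worker threads, expect threaded_time to be close to
# serial_time rather than ~4x smaller.
print(f"serial: {serial_time:.3f}s  threaded: {threaded_time:.3f}s")
```

Process-based engines (multiprocessing and Ray) avoid this by running workers in separate interpreter processes, which is why they can make use of all of a node's CPUs.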
570 | ] 571 | }, 572 | { 573 | "cell_type": "markdown", 574 | "id": "b883b35a", 575 | "metadata": {}, 576 | "source": [ 577 | "\n", 578 | "## References\n", 579 | "* Oracle AutoML: http://www.vldb.org/pvldb/vol13/p3166-yakovlev.pdf\n", 580 | "* Ray configuration page: https://docs.ray.io/en/latest/ray-core/configure.html" 581 | ] 582 | } 583 | ], 584 | "metadata": { 585 | "jupytext": { 586 | "formats": "ipynb,md,py:percent" 587 | }, 588 | "kernelspec": { 589 | "display_name": "Python 3 (ipykernel)", 590 | "language": "python", 591 | "name": "python3" 592 | } 593 | }, 594 | "nbformat": 4, 595 | "nbformat_minor": 5 596 | } 597 | -------------------------------------------------------------------------------- /demos/OracleAutoMLx_train_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "88c154e0", 6 | "metadata": {}, 7 | "source": [ 8 | "***\n", 9 | "# Building and Evaluating a Machine Learning Model using AutoMLx\n", 10 | "

by the Oracle AutoMLx Team

\n", 11 | "\n", 12 | "***" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "id": "cccb4f86", 18 | "metadata": {}, 19 | "source": [ 20 | "AutoMLx Demo Notebook.\n", 21 | "\n", 22 | "Copyright © 2024, Oracle and/or its affiliates.\n", 23 | "\n", 24 | "Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "id": "b60f51c4", 30 | "metadata": {}, 31 | "source": [ 32 | "## Overview of this Notebook\n", 33 | "\n", 34 | "In this notebook, we will build and evaluate a machine learning model using Oracle AutoMLx. The first dataset is a binary classification dataset.\n", 35 | "We explore the various options provided by Oracle AutoMLx, allowing the user to specify different options in the training procedure. We then evaluate the model trained by AutoMLx.\n", 36 | "\n", 37 | "---\n", 38 | "## Prerequisites\n", 39 | "\n", 40 | " - Experience level: Novice (Python and Machine Learning)\n", 41 | " - Professional experience: Some industry experience\n", 42 | "---\n", 43 | "\n", 44 | "## Business Use\n", 45 | "\n", 46 | "Data analytics and modeling problems using Machine Learning (ML) are becoming popular and often rely on data science expertise to build accurate ML models. Such modeling tasks primarily involve the following steps:\n", 47 | "- Preprocessing the dataset (for example, cleaning, imputing, engineering features and normalization).\n", 48 | "- Picking an appropriate model for the given dataset and prediction task at hand.\n", 49 | "- Tuning the chosen model’s hyperparameters for the given dataset.\n", 50 | "\n", 51 | "All of these steps are significantly time consuming and heavily rely on data scientist expertise. Unfortunately, to make this problem harder, the best feature subset, model, and hyperparameter choice widely varies with the dataset and the prediction task. Hence, there is no one-size-fits-all solution to achieve reasonably good model performance. 
Using a simple Python API, AutoML can quickly jump-start the data science process with an accurately-tuned model and appropriate features for a given prediction task.\n", 52 | "\n", 53 | "## Table of Contents\n", 54 | "\n", 55 | "- Train a Model using AutoMLx\n", 56 | "- Inspect the Model's Quality\n", 57 | "- Make Predictions using the Model\n", 58 | "- Evaluate the Quality of a Model on a New Dataset\n", 59 | "- Save a model\n", 60 | "- Load a model\n", 61 | "- Train a Regression Model using AutoMLx\n", 62 | "- References" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "id": "073b3a38", 68 | "metadata": {}, 69 | "source": [ 70 | "\n", 71 | "## Train a Model using AutoMLx\n", 72 | "\n", 73 | "Here we show how easy it is to use the AutoMLx `train_model` API to quickly and automatically train a model for a\n", 74 | "machine learning problem. We pass the data, the name of the target to predict, and the task to the `train_model`\n", 75 | "function. This function will return the best, fully-trained model that AutoML could find for the given dataset.\n", 76 | "\n", 77 | "You can find the synthetic datasets used in this notebook at https://docs.oracle.com/en-us/iaas/tools/automlx/latest/data/\n", 78 | "\n", 79 | "The data argument can be a string, in which case it should be the path to a CSV file that contains your dataset.\n", 80 | "Alternatively, you can directly pass a pandas DataFrame.\n", 81 | "\n", 82 | "The task can be either `classification` or `regression`; alternatively, we can import and use `Task.CLASSIFICATION` and `Task.REGRESSION`."
83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 1, 88 | "id": "104d6600", 89 | "metadata": { 90 | "execution": { 91 | "iopub.execute_input": "2024-12-26T14:22:51.656164Z", 92 | "iopub.status.busy": "2024-12-26T14:22:51.655988Z", 93 | "iopub.status.idle": "2024-12-26T14:23:35.323486Z", 94 | "shell.execute_reply": "2024-12-26T14:23:35.322909Z" 95 | } 96 | }, 97 | "outputs": [ 98 | { 99 | "name": "stdout", 100 | "output_type": "stream", 101 | "text": [ 102 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Info] Number of positive: 116, number of negative: 396\n", 103 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000045 seconds.\n", 104 | "\u001b[36m(run pid=31671)\u001b[0m You can set `force_col_wise=true` to remove the overhead.\n", 105 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Info] Total Bins 55\n", 106 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Info] Number of data points in the train set: 512, number of used features: 4\n", 107 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Info] [binary:BoostFromScore]: pavg=0.773437 -> initscore=1.227824\n", 108 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Info] Start training from score 1.227824\n", 109 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 110 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 111 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 112 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 113 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 114 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: 
-inf\n", 115 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 116 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 117 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 118 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 119 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 120 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 121 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 122 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 123 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 124 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 125 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 126 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 127 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 128 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 129 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 130 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 131 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] 
[Warning] No further splits with positive gain, best gain: -inf\n", 132 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 133 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 134 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 135 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 136 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 137 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 138 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 139 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 140 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 141 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 142 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 143 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 144 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 145 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 146 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 147 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: 
-inf\n", 148 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 149 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 150 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 151 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 152 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 153 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 154 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 155 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 156 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 157 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 158 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 159 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 160 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 161 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 162 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 163 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 164 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] 
[Warning] No further splits with positive gain, best gain: -inf\n", 165 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 166 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 167 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 168 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 169 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 170 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 171 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 172 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 173 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 174 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 175 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 176 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 177 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 178 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 179 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 180 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: 
-inf\n", 181 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 182 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 183 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 184 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 185 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 186 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 187 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 188 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 189 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 190 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 191 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 192 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 193 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 194 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 195 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 196 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 197 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] 
[Warning] No further splits with positive gain, best gain: -inf\n", 198 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 199 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 200 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 201 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 202 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 203 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 204 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 205 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 206 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 207 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", 208 | "\u001b[36m(run pid=31671)\u001b[0m [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n" 209 | ] 210 | } 211 | ], 212 | "source": [ 213 | "from automlx import train_model\n", 214 | "\n", 215 | "model = train_model(\n", 216 | " data = \"classification_train.csv\", # path to dataset CSV file or a pandas DataFrame\n", 217 | " target_to_predict = \"income_group\", # name of the target column in the dataset\n", 218 | " task = 'classification', # type of problem you are interested in solving,\n", 219 | ")" 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "id": "c109147d", 225 | "metadata": {}, 226 | "source": [ 227 | "That's it! 
The model is fully trained and ready to be used to make predictions or to be deployed." 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "id": "8469a7b6", 233 | "metadata": {}, 234 | "source": [ 235 | "\n", 236 | "## Inspect the Model's Quality\n", 237 | "\n", 238 | "But how well can you expect your model to work? There are many ways to measure the quality of a machine learning\n", 239 | "model. AutoML automatically calculates two different types of scores for your model. The first is the model's\n", 240 | "training score; this tells you how well the model learned to predict the target on the data that was used for\n", 241 | "training the model. Generally, higher scores are better; however, sometimes a model may see patterns in your data\n", 242 | "that appeared by random chance. When this happens, your model typically won't perform well when deployed, because\n", 243 | "those same patterns aren't likely to appear in future data that the model encounters. For this reason, AutoML\n", 244 | "automatically reserves 20% of the training data as a stress test for your model. This data is not used to train the\n", 245 | "model; instead, it is used to estimate the future quality of your model on new data. Both scores can be accessed\n", 246 | "using `model.quality`." 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 2, 252 | "id": "f8c3e9ff", 253 | "metadata": { 254 | "execution": { 255 | "iopub.execute_input": "2024-12-26T14:23:35.326524Z", 256 | "iopub.status.busy": "2024-12-26T14:23:35.325613Z", 257 | "iopub.status.idle": "2024-12-26T14:23:35.334264Z", 258 | "shell.execute_reply": "2024-12-26T14:23:35.333812Z" 259 | } 260 | }, 261 | "outputs": [ 262 | { 263 | "data": { 264 | "text/html": [ 265 | "
\n", 266 | "\n", 279 | "\n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | "
neg_log_loss
Evaluated on 2024-12-26
Measured quality on training data-0.109292
Estimate of future quality-0.131209
\n", 301 | "
" 302 | ], 303 | "text/plain": [ 304 | " neg_log_loss\n", 305 | "Evaluated on 2024-12-26 \n", 306 | "Measured quality on training data -0.109292\n", 307 | "Estimate of future quality -0.131209" 308 | ] 309 | }, 310 | "execution_count": 2, 311 | "metadata": {}, 312 | "output_type": "execute_result" 313 | } 314 | ], 315 | "source": [ 316 | "model.quality" 317 | ] 318 | }, 319 | { 320 | "cell_type": "markdown", 321 | "id": "8c0659fd", 322 | "metadata": {}, 323 | "source": [ 324 | "The more similar the two scores are the better. If there is a large gap between them, it may mean that the model learned to rely on spurious correlations. However, if the model quality is still good on the stress test data, then the gap may not be a cause for concern." 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "id": "32e16d06", 330 | "metadata": {}, 331 | "source": [ 332 | "\n", 333 | "## Make Predictions using the Model\n", 334 | "\n", 335 | "We can now use this model to make predictions! The following method will return a new dataset that is identical to\n", 336 | "the provided dataset with an additional column that contains the model's predictions." 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": 3, 342 | "id": "00434d42", 343 | "metadata": { 344 | "execution": { 345 | "iopub.execute_input": "2024-12-26T14:23:35.336129Z", 346 | "iopub.status.busy": "2024-12-26T14:23:35.335687Z", 347 | "iopub.status.idle": "2024-12-26T14:23:35.359746Z", 348 | "shell.execute_reply": "2024-12-26T14:23:35.359287Z" 349 | } 350 | }, 351 | "outputs": [ 352 | { 353 | "data": { 354 | "text/html": [ 355 | "
\n", 356 | "\n", 369 | "\n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | "
ageeducationsexincome_groupprediction for income_group
042diplomafemale<=50k<=50k
157bachelorsmale<=50k<=50k
\n", 399 | "
" 400 | ], 401 | "text/plain": [ 402 | " age education sex income_group prediction for income_group\n", 403 | "0 42 diploma female <=50k <=50k\n", 404 | "1 57 bachelors male <=50k <=50k" 405 | ] 406 | }, 407 | "execution_count": 3, 408 | "metadata": {}, 409 | "output_type": "execute_result" 410 | } 411 | ], 412 | "source": [ 413 | "data_with_prediction = model.predict(\"classification_train.csv\")\n", 414 | "data_with_prediction.head(2)" 415 | ] 416 | }, 417 | { 418 | "cell_type": "markdown", 419 | "id": "9e2d1c54", 420 | "metadata": {}, 421 | "source": [ 422 | "We can also save the dataset with the predictions by passing desired path to a new CSV file the `output` parameter." 423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": 4, 428 | "id": "fe0815c1", 429 | "metadata": { 430 | "execution": { 431 | "iopub.execute_input": "2024-12-26T14:23:35.361684Z", 432 | "iopub.status.busy": "2024-12-26T14:23:35.361158Z", 433 | "iopub.status.idle": "2024-12-26T14:23:35.414873Z", 434 | "shell.execute_reply": "2024-12-26T14:23:35.414401Z" 435 | } 436 | }, 437 | "outputs": [ 438 | { 439 | "data": { 440 | "text/html": [ 441 | "
\n", 442 | "\n", 455 | "\n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | "
ageeducationsexincome_groupprediction for income_group
042diplomafemale<=50k<=50k
157bachelorsmale<=50k<=50k
\n", 485 | "
" 486 | ], 487 | "text/plain": [ 488 | " age education sex income_group prediction for income_group\n", 489 | "0 42 diploma female <=50k <=50k\n", 490 | "1 57 bachelors male <=50k <=50k" 491 | ] 492 | }, 493 | "execution_count": 4, 494 | "metadata": {}, 495 | "output_type": "execute_result" 496 | } 497 | ], 498 | "source": [ 499 | "data_with_prediction = model.predict(\"classification_train.csv\", output='data_with_prediction.csv')\n", 500 | "data_with_prediction.head(2)" 501 | ] 502 | }, 503 | { 504 | "cell_type": "markdown", 505 | "id": "741db142", 506 | "metadata": {}, 507 | "source": [ 508 | "\n", 509 | "## Evaluate the Quality of a Model on a New Dataset\n", 510 | "\n", 511 | "Inspecting `model.quality` (see above) is always a good idea to ensure that the model performed well when it was\n", 512 | "trained. However, it is never a bad idea to continue evaluating the model over time on new data as you collect it.\n", 513 | "This can be achieved using the `evaluate_model_quality` function. We just need to pass the model and the desired\n", 514 | "dataset to this function." 515 | ] 516 | }, 517 | { 518 | "cell_type": "code", 519 | "execution_count": 5, 520 | "id": "e37fca1b", 521 | "metadata": { 522 | "execution": { 523 | "iopub.execute_input": "2024-12-26T14:23:35.416628Z", 524 | "iopub.status.busy": "2024-12-26T14:23:35.416322Z", 525 | "iopub.status.idle": "2024-12-26T14:23:35.443846Z", 526 | "shell.execute_reply": "2024-12-26T14:23:35.443373Z" 527 | } 528 | }, 529 | "outputs": [ 530 | { 531 | "data": { 532 | "text/html": [ 533 | "
\n", 534 | "\n", 547 | "\n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | "
neg_log_loss
classification_test.csv-0.159003
\n", 561 | "
" 562 | ], 563 | "text/plain": [ 564 | " neg_log_loss\n", 565 | "classification_test.csv -0.159003" 566 | ] 567 | }, 568 | "execution_count": 5, 569 | "metadata": {}, 570 | "output_type": "execute_result" 571 | } 572 | ], 573 | "source": [ 574 | "from automlx import evaluate_model_quality\n", 575 | "\n", 576 | "score = evaluate_model_quality(model, \"classification_test.csv\")\n", 577 | "\n", 578 | "score" 579 | ] 580 | }, 581 | { 582 | "cell_type": "markdown", 583 | "id": "909befe4", 584 | "metadata": {}, 585 | "source": [ 586 | "You can always compare the new results to `model.quality` to see if the model is still performing as well as it\n", 587 | "was expected to when it was trained. If not, it may be time to call `train_model` again with your new data so that\n", 588 | "the model can learn any new trends that have appeared in your dataset." 589 | ] 590 | }, 591 | { 592 | "cell_type": "markdown", 593 | "id": "127a6d75", 594 | "metadata": {}, 595 | "source": [ 596 | "\n", 597 | "## Save a model\n", 598 | "\n", 599 | "Once we are satisfied with the results, we can save the model, using the `save` method, by passing a desired file\n", 600 | "path." 601 | ] 602 | }, 603 | { 604 | "cell_type": "code", 605 | "execution_count": 6, 606 | "id": "549287f8", 607 | "metadata": { 608 | "execution": { 609 | "iopub.execute_input": "2024-12-26T14:23:35.445745Z", 610 | "iopub.status.busy": "2024-12-26T14:23:35.445251Z", 611 | "iopub.status.idle": "2024-12-26T14:23:35.817393Z", 612 | "shell.execute_reply": "2024-12-26T14:23:35.816774Z" 613 | } 614 | }, 615 | "outputs": [], 616 | "source": [ 617 | "model.save('model.amlx')" 618 | ] 619 | }, 620 | { 621 | "cell_type": "markdown", 622 | "id": "e58e7368", 623 | "metadata": {}, 624 | "source": [ 625 | "\n", 626 | "## Load a model\n", 627 | "\n", 628 | "We can also load a saved model using the `load_model` function by providing the path to the model." 
629 | ] 630 | }, 631 | { 632 | "cell_type": "code", 633 | "execution_count": 7, 634 | "id": "866735bb", 635 | "metadata": { 636 | "execution": { 637 | "iopub.execute_input": "2024-12-26T14:23:35.820171Z", 638 | "iopub.status.busy": "2024-12-26T14:23:35.819618Z", 639 | "iopub.status.idle": "2024-12-26T14:23:36.306420Z", 640 | "shell.execute_reply": "2024-12-26T14:23:36.305857Z" 641 | } 642 | }, 643 | "outputs": [ 644 | { 645 | "data": { 646 | "text/html": [ 647 | "
\n", 648 | "\n", 661 | "\n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | "
neg_log_loss
Evaluated on 2024-12-26
Measured quality on training data-0.109292
Estimate of future quality-0.131209
\n", 683 | "
" 684 | ], 685 | "text/plain": [ 686 | " neg_log_loss\n", 687 | "Evaluated on 2024-12-26 \n", 688 | "Measured quality on training data -0.109292\n", 689 | "Estimate of future quality -0.131209" 690 | ] 691 | }, 692 | "execution_count": 7, 693 | "metadata": {}, 694 | "output_type": "execute_result" 695 | } 696 | ], 697 | "source": [ 698 | "from automlx import load_model\n", 699 | "\n", 700 | "loaded_model = load_model('model.amlx')\n", 701 | "\n", 702 | "loaded_model.quality" 703 | ] 704 | }, 705 | { 706 | "cell_type": "markdown", 707 | "id": "369ef6cc", 708 | "metadata": {}, 709 | "source": [ 710 | "\n", 711 | "## Train a Regression Model using AutoMLx\n", 712 | "\n", 713 | "Here we show how to train a model for a regression task. We also cover some optional arguments that can be used to\n", 714 | "further control how AutoML works.\n", 715 | " - metric: The name of the desired scoring metric. By default, this is `auto`, which means that an appropriate\n", 716 | " metric is chosen based on the dataset and task.\n", 717 | " - time_budget: The maximum time budget in seconds. By default, this is `None`, which means that AutoML should run\n", 718 | " until it is done (that is, until it cannot find any better models for your data). Note that AutoML will sometimes\n", 719 | " run for longer than your requested time budget. This is to ensure that we can always return a fully-trained model\n", 720 | " that is ready to be deployed.\n", 721 | " - test_data: Advanced users can pass in a custom dataset for stress testing the model. This will be used to estimate\n", 722 | " the quality of the final model on future data. If not provided, the test scores are estimated automatically by\n", 723 | " reserving 20% of the training data for evaluation of the final model." 
724 | ] 725 | }, 726 | { 727 | "cell_type": "code", 728 | "execution_count": 8, 729 | "id": "4cee7548", 730 | "metadata": { 731 | "execution": { 732 | "iopub.execute_input": "2024-12-26T14:23:36.308596Z", 733 | "iopub.status.busy": "2024-12-26T14:23:36.308065Z", 734 | "iopub.status.idle": "2024-12-26T14:23:57.995953Z", 735 | "shell.execute_reply": "2024-12-26T14:23:57.995245Z" 736 | } 737 | }, 738 | "outputs": [], 739 | "source": [ 740 | "from automlx import Task\n", 741 | "\n", 742 | "model_regression = train_model(\n", 743 | " data = \"regression_train.csv\",\n", 744 | " target_to_predict = \"income\",\n", 745 | " task = Task.REGRESSION,\n", 746 | " metric = 'auto',\n", 747 | " time_budget=30,\n", 748 | " test_data = \"regression_test.csv\"\n", 749 | ")" 750 | ] 751 | }, 752 | { 753 | "cell_type": "code", 754 | "execution_count": 9, 755 | "id": "a8c27319", 756 | "metadata": { 757 | "execution": { 758 | "iopub.execute_input": "2024-12-26T14:23:57.998741Z", 759 | "iopub.status.busy": "2024-12-26T14:23:57.998400Z", 760 | "iopub.status.idle": "2024-12-26T14:23:58.004098Z", 761 | "shell.execute_reply": "2024-12-26T14:23:58.003542Z" 762 | } 763 | }, 764 | "outputs": [ 765 | { 766 | "data": { 767 | "text/html": [ 768 | "
\n", 769 | "\n", 782 | "\n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | "
neg_mean_squared_error
Evaluated on 2024-12-26
Measured quality on training data-3.781031e+07
Estimate of future quality-3.829070e+07
\n", 804 | "
" 805 | ], 806 | "text/plain": [ 807 | " neg_mean_squared_error\n", 808 | "Evaluated on 2024-12-26 \n", 809 | "Measured quality on training data -3.781031e+07\n", 810 | "Estimate of future quality -3.829070e+07" 811 | ] 812 | }, 813 | "execution_count": 9, 814 | "metadata": {}, 815 | "output_type": "execute_result" 816 | } 817 | ], 818 | "source": [ 819 | "model_regression.quality" 820 | ] 821 | }, 822 | { 823 | "cell_type": "markdown", 824 | "id": "14624174", 825 | "metadata": {}, 826 | "source": [ 827 | "\n", 828 | "## References\n", 829 | "* Oracle AutoML http://www.vldb.org/pvldb/vol13/p3166-yakovlev.pdf" 830 | ] 831 | } 832 | ], 833 | "metadata": { 834 | "jupytext": { 835 | "formats": "ipynb,md,py:percent" 836 | }, 837 | "kernelspec": { 838 | "display_name": "Python 3 (ipykernel)", 839 | "language": "python", 840 | "name": "python3" 841 | }, 842 | "language_info": { 843 | "codemirror_mode": { 844 | "name": "ipython", 845 | "version": 3 846 | }, 847 | "file_extension": ".py", 848 | "mimetype": "text/x-python", 849 | "name": "python", 850 | "nbconvert_exporter": "python", 851 | "pygments_lexer": "ipython3", 852 | "version": "3.9.21" 853 | } 854 | }, 855 | "nbformat": 4, 856 | "nbformat_minor": 5 857 | } 858 | --------------------------------------------------------------------------------