# **Warning**
#
# This example uploads data. For that reason, this example connects to the
# test server at test.openml.org.
# This prevents the main server from becoming overloaded with example datasets, tasks,
# runs, and other submissions.
# Using this test server may affect the behavior and performance of the
# OpenML-Python API.
#
#

# %%
openml.config.start_using_configuration_for_example()

# %% [markdown]
# ## Train a machine learning model and evaluate it
# NOTE: We are using task 119 from the test server; its dataset can be viewed at
# https://test.openml.org/d/20.
30 |
31 | # %%
32 | task = openml.tasks.get_task(119)
33 |
34 | # Get the data
35 | dataset = task.get_dataset()
36 | X, y, categorical_indicator, attribute_names = dataset.get_data(
37 | target=dataset.default_target_attribute
38 | )
39 |
40 | # Get the holdout split from the task
41 | train_indices, test_indices = task.get_train_test_split_indices(fold=0, repeat=0)
42 | X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
43 | y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]
44 |
45 | knn_parameters = {
46 | "n_neighbors": 3,
47 | }
48 | clf = KNeighborsClassifier(**knn_parameters)
49 | clf.fit(X_train, y_train)
50 |
51 | # Get experiment results
52 | y_pred = clf.predict(X_test)
53 | y_pred_proba = clf.predict_proba(X_test)
54 |
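# %% [markdown]
# As a quick local sanity check (not required for the upload below), we can score the model on
# the holdout split. This sketch assumes ``sklearn.metrics.accuracy_score``, which ships with
# the scikit-learn dependency already used above.

# %%
from sklearn.metrics import accuracy_score

print(f"Local holdout accuracy: {accuracy_score(y_test, y_pred):.3f}")
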
# %% [markdown]
# ## Upload the machine learning experiments to OpenML
# First, create a flow and fill it with metadata about the machine learning model.

# %%
knn_flow = openml.flows.OpenMLFlow(
    # Metadata
    model=clf,  # or None, if you do not want to upload the model object.
    name="CustomKNeighborsClassifier",
    description="A custom KNeighborsClassifier flow for OpenML.",
    external_version=f"{sklearn.__version__}",
    language="English",
    tags=["openml_tutorial_knn"],
    dependencies=f"{sklearn.__version__}",
    # Hyperparameters
    parameters={k: str(v) for k, v in knn_parameters.items()},
    parameters_meta_info={
        "n_neighbors": {"description": "number of neighbors to use", "data_type": "int"}
    },
    # If you have a pipeline with subcomponents, such as preprocessing, add them here.
    components={},
)
knn_flow.publish()
print(f"knn_flow was published with the ID {knn_flow.flow_id}")

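# %% [markdown]
# When re-running this script, the flow may already exist on the server. A hedged sketch using
# ``openml.flows.flow_exists`` (assuming the name and external version match an earlier upload)
# to look up the existing ID instead of publishing a duplicate:

# %%
existing_id = openml.flows.flow_exists(knn_flow.name, knn_flow.external_version)
print(f"Flow ID if it already exists, otherwise False: {existing_id}")
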
# %% [markdown]
# Second, we create a run to store the results associated with the flow.

# %%

# Format the predictions for OpenML
predictions = []
for test_index, y_true_i, y_pred_i, y_pred_proba_i in zip(
    test_indices, y_test, y_pred, y_pred_proba
):
    predictions.append(
        openml.runs.functions.format_prediction(
            task=task,
            repeat=0,
            fold=0,
            index=test_index,
            prediction=y_pred_i,
            truth=y_true_i,
            proba=dict(zip(task.class_labels, y_pred_proba_i)),
        )
    )

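# %% [markdown]
# Each entry of ``predictions`` is one row in the format the OpenML server expects
# (repeat/fold bookkeeping, the class probabilities, the prediction, and the ground truth).
# Inspecting the first row makes the layout concrete:

# %%
print(predictions[0])
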
# %%
# Format the parameters for OpenML
oml_knn_parameters = [
    {"oml:name": k, "oml:value": v, "oml:component": knn_flow.flow_id}
    for k, v in knn_parameters.items()
]

knn_run = openml.runs.OpenMLRun(
    task_id=task.task_id,
    flow_id=knn_flow.flow_id,
    dataset_id=dataset.dataset_id,
    parameter_settings=oml_knn_parameters,
    data_content=predictions,
    tags=["openml_tutorial_knn"],
    description_text="Run generated by the tutorial.",
)
knn_run = knn_run.publish()
print(f"Run was uploaded to {knn_run.openml_url}")
print(f"The flow can be found at {knn_run.flow.openml_url}")

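# %% [markdown]
# The uploaded run can later be found again by filtering on the tag set above. A hedged sketch
# using ``openml.runs.list_runs`` (on the shared test server, the listing may also contain runs
# from other users carrying the same tag):

# %%
tagged_runs = openml.runs.list_runs(tag="openml_tutorial_knn", output_format="dataframe")
print(tagged_runs.head())
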
# %%
openml.config.stop_using_configuration_for_example()

--------------------------------------------------------------------------------
/openml/extensions/functions.py:
--------------------------------------------------------------------------------
# License: BSD 3-Clause
from __future__ import annotations

from typing import TYPE_CHECKING, Any

# Need to import the following by its full path because otherwise it won't be possible to
# access openml.extensions.extensions
import openml.extensions

# Avoid import cycles: https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles
if TYPE_CHECKING:
    from openml.flows import OpenMLFlow

    from . import Extension

SKLEARN_HINT = (
    "But it looks related to scikit-learn. "
    "Please install the OpenML scikit-learn extension (openml-sklearn) and try again. "
    "For more information, see "
    "https://github.com/openml/openml-sklearn?tab=readme-ov-file#installation"
)


def register_extension(extension: type[Extension]) -> None:
    """Register an extension.

    Registered extensions are considered by ``get_extension_by_flow`` and
    ``get_extension_by_model``, which are used by ``openml.flows`` and ``openml.runs``.

    Parameters
    ----------
    extension : Type[Extension]

    Returns
    -------
    None
    """
    openml.extensions.extensions.append(extension)

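# A hedged usage sketch (``MyExtension`` is hypothetical; a real extension must implement the
# ``Extension`` interface, e.g. the one shipped with the openml-sklearn package):
#
#     from openml.extensions import Extension, register_extension
#
#     class MyExtension(Extension):
#         ...  # implement can_handle_flow, can_handle_model, etc.
#
#     register_extension(MyExtension)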

def get_extension_by_flow(
    flow: OpenMLFlow,
    raise_if_no_extension: bool = False,  # noqa: FBT001, FBT002
) -> Extension | None:
    """Get an extension which can handle the given flow.

    Iterates all registered extensions and checks whether they can handle the presented flow.
    Raises an exception if more than one extension can handle the flow.

    Parameters
    ----------
    flow : OpenMLFlow

    raise_if_no_extension : bool (optional, default=False)
        Raise an exception if no registered extension can handle the presented flow.

    Returns
    -------
    Extension or None
    """
    candidates = []
    for extension_class in openml.extensions.extensions:
        if extension_class.can_handle_flow(flow):
            candidates.append(extension_class())
    if len(candidates) == 0:
        if raise_if_no_extension:
            install_instruction = ""
            if flow.name.startswith("sklearn"):
                install_instruction = SKLEARN_HINT
            raise ValueError(
                f"No extension registered which can handle flow: {flow.flow_id} ({flow.name}). "
                f"{install_instruction}"
            )

        return None

    if len(candidates) == 1:
        return candidates[0]

    raise ValueError(
        f"Multiple extensions registered which can handle flow: {flow}, but only one "
        f"is allowed ({candidates}).",
    )

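# A hedged usage sketch (the flow ID is illustrative):
#
#     flow = openml.flows.get_flow(8353)
#     extension = get_extension_by_flow(flow, raise_if_no_extension=True)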

def get_extension_by_model(
    model: Any,
    raise_if_no_extension: bool = False,  # noqa: FBT001, FBT002
) -> Extension | None:
    """Get an extension which can handle the given model.

    Iterates all registered extensions and checks whether they can handle the presented model.
    Raises an exception if more than one extension can handle the model.

    Parameters
    ----------
    model : Any

    raise_if_no_extension : bool (optional, default=False)
        Raise an exception if no registered extension can handle the presented model.

    Returns
    -------
    Extension or None
    """
    candidates = []
    for extension_class in openml.extensions.extensions:
        if extension_class.can_handle_model(model):
            candidates.append(extension_class())
    if len(candidates) == 0:
        if raise_if_no_extension:
            install_instruction = ""
            if type(model).__module__.startswith("sklearn"):
                install_instruction = SKLEARN_HINT
            raise ValueError(
                f"No extension registered which can handle model: {model}. {install_instruction}"
            )

        return None

    if len(candidates) == 1:
        return candidates[0]

    raise ValueError(
        f"Multiple extensions registered which can handle model: {model}, but only one "
        f"is allowed ({candidates}).",
    )

--------------------------------------------------------------------------------
/examples/_external_or_deprecated/2015_neurips_feurer_example.py:
--------------------------------------------------------------------------------
1 | """
2 | Feurer et al. (2015)
3 | ====================
4 |
5 | A tutorial on how to get the datasets used in the paper introducing *Auto-sklearn* by Feurer et al..
6 |
7 | Auto-sklearn website: https://automl.github.io/auto-sklearn/
8 |
9 | Publication
10 | ~~~~~~~~~~~
11 |
12 | | Efficient and Robust Automated Machine Learning
13 | | Matthias Feurer, Aaron Klein, Katharina Eggensperger, Jost Springenberg, Manuel Blum and Frank Hutter
14 | | In *Advances in Neural Information Processing Systems 28*, 2015
15 | | Available at https://papers.nips.cc/paper/5872-efficient-and-robust-automated-machine-learning.pdf
16 | """ # noqa F401
17 |
# License: BSD 3-Clause

import pandas as pd

import openml

####################################################################################################
# List of dataset IDs given in the supplementary material of Feurer et al.:
# https://papers.nips.cc/paper/5872-efficient-and-robust-automated-machine-learning-supplemental.zip
# fmt: off
dataset_ids = [
    3, 6, 12, 14, 16, 18, 21, 22, 23, 24, 26, 28, 30, 31, 32, 36, 38, 44, 46,
    57, 60, 179, 180, 181, 182, 184, 185, 273, 293, 300, 351, 354, 357, 389,
    390, 391, 392, 393, 395, 396, 398, 399, 401, 554, 679, 715, 718, 720, 722,
    723, 727, 728, 734, 735, 737, 740, 741, 743, 751, 752, 761, 772, 797, 799,
    803, 806, 807, 813, 816, 819, 821, 822, 823, 833, 837, 843, 845, 846, 847,
    849, 866, 871, 881, 897, 901, 903, 904, 910, 912, 913, 914, 917, 923, 930,
    934, 953, 958, 959, 962, 966, 971, 976, 977, 978, 979, 980, 991, 993, 995,
    1000, 1002, 1018, 1019, 1020, 1021, 1036, 1040, 1041, 1049, 1050, 1053,
    1056, 1067, 1068, 1069, 1111, 1112, 1114, 1116, 1119, 1120, 1128, 1130,
    1134, 1138, 1139, 1142, 1146, 1161, 1166,
]
# fmt: on

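####################################################################################################
# A quick, hedged sanity check (not part of the original script): ``openml.datasets.list_datasets``
# accepts a list of dataset IDs, so the metadata of the first few IDs above can be fetched
# without downloading any data.
sanity_datasets = openml.datasets.list_datasets(
    data_id=dataset_ids[:5], output_format="dataframe"
)
print(sanity_datasets[["did", "name"]])
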
####################################################################################################
# The dataset IDs could be used directly to load the datasets and split the data into training and
# test sets. However, to be reproducible, we will first obtain the respective tasks from OpenML,
# which define both the target feature and the train/test split.
#
# .. note::
#     Working directly on datasets and providing only dataset IDs in a paper is discouraged,
#     because it does not allow reproducibility (the splitting is unclear). Please use the
#     respective tasks rather than the datasets as the basis for a paper, and publish the task
#     IDs. This example is only given to showcase the use of OpenML-Python for a published paper
#     and as a warning on how not to do it. Please check the `OpenML documentation of tasks