├── openml ├── py.typed ├── __version__.py ├── evaluations │ ├── __init__.py │ └── evaluation.py ├── setups │ ├── __init__.py │ └── setup.py ├── flows │ └── __init__.py ├── extensions │ ├── __init__.py │ └── functions.py ├── runs │ └── __init__.py ├── datasets │ ├── __init__.py │ └── data_feature.py ├── tasks │ └── __init__.py ├── study │ └── __init__.py ├── exceptions.py ├── __init__.py └── base.py ├── tests ├── test_flows │ ├── __init__.py │ └── dummy_learn │ │ ├── __init__.py │ │ └── dummy_forest.py ├── test_runs │ ├── __init__.py │ └── test_trace.py ├── test_study │ └── __init__.py ├── test_utils │ ├── __init__.py │ └── test_utils.py ├── test_datasets │ └── __init__.py ├── test_extensions │ ├── __init__.py │ └── test_functions.py ├── test_openml │ ├── __init__.py │ ├── test_openml.py │ └── test_api_calls.py ├── test_evaluations │ ├── __init__.py │ └── test_evaluations_example.py ├── files │ ├── org │ │ └── openml │ │ │ └── test │ │ │ ├── datasets │ │ │ ├── 2 │ │ │ │ └── description.xml │ │ │ ├── 30 │ │ │ │ └── dataset_30.pq │ │ │ └── -1 │ │ │ │ ├── description.xml │ │ │ │ └── qualities.xml │ │ │ ├── setups │ │ │ └── 1 │ │ │ │ └── description.xml │ │ │ └── tasks │ │ │ ├── 1 │ │ │ └── task.xml │ │ │ ├── 3 │ │ │ └── task.xml │ │ │ └── 1882 │ │ │ └── task.xml │ ├── mock_responses │ │ ├── datasets │ │ │ ├── data_delete_successful.xml │ │ │ ├── data_delete_not_exist.xml │ │ │ ├── data_delete_not_owned.xml │ │ │ ├── data_delete_has_tasks.xml │ │ │ └── data_description_61.xml │ │ ├── flows │ │ │ ├── flow_delete_successful.xml │ │ │ ├── flow_delete_not_exist.xml │ │ │ ├── flow_delete_not_owned.xml │ │ │ ├── flow_delete_has_runs.xml │ │ │ └── flow_delete_is_subflow.xml │ │ ├── runs │ │ │ ├── run_delete_successful.xml │ │ │ ├── run_delete_not_exist.xml │ │ │ └── run_delete_not_owned.xml │ │ └── tasks │ │ │ ├── task_delete_successful.xml │ │ │ ├── task_delete_not_exist.xml │ │ │ ├── task_delete_not_owned.xml │ │ │ └── task_delete_has_runs.xml │ └── misc │ │ └── features_with_whitespaces.xml ├── __init__.py ├── test_setups │ └── __init__.py └── test_tasks │ ├── __init__.py │ ├── test_supervised_task.py │ ├── test_learning_curve_task.py │ ├── test_classification_task.py │ ├── test_task_methods.py │ ├── test_clustering_task.py │ ├── test_regression_task.py │ ├── test_split.py │ └── test_task.py ├── MANIFEST.in ├── docs ├── images │ └── openml_icon.png ├── stylesheets │ └── extra.css ├── contributing.md ├── details.md └── index.md ├── examples ├── _external_or_deprecated │ ├── README.md │ ├── plot_svm_hyperparameters_tutorial.py │ ├── flow_id_tutorial.py │ ├── 2015_neurips_feurer_example.py │ ├── run_setup_tutorial.py │ ├── 2018_ida_strang_example.py │ └── benchmark_with_optunahub.py ├── Basics │ ├── simple_tasks_tutorial.py │ ├── simple_datasets_tutorial.py │ ├── introduction_tutorial.py │ ├── simple_suites_tutorial.py │ └── simple_flows_and_runs_tutorial.py ├── introduction.py └── Advanced │ ├── configure_logging.py │ ├── suites_tutorial.py │ ├── study_tutorial.py │ ├── datasets_tutorial.py │ └── task_manual_iteration_tutorial.py ├── .github ├── dependabot.yml ├── workflows │ ├── dist.yaml │ ├── docs.yaml │ ├── release_docker.yaml │ └── test.yml ├── PULL_REQUEST_TEMPLATE.md └── ISSUE_TEMPLATE │ └── ISSUE_TEMPLATE.md ├── Makefile ├── docker ├── Dockerfile ├── startup.sh └── readme.md ├── .pre-commit-config.yaml ├── .gitignore ├── scripts └── gen_ref_pages.py ├── CITATION.cff ├── LICENSE ├── README.md └── mkdocs.yml /openml/py.typed: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_flows/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_runs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_study/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_extensions/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_openml/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_evaluations/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt 2 | -------------------------------------------------------------------------------- /tests/test_flows/dummy_learn/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/images/openml_icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openml/openml-python/HEAD/docs/images/openml_icon.png -------------------------------------------------------------------------------- /docs/stylesheets/extra.css: -------------------------------------------------------------------------------- 1 | .jp-InputArea-prompt, .jp-InputPrompt { 2 | display: none !important; 3 | } 4 | -------------------------------------------------------------------------------- /tests/files/org/openml/test/datasets/30/dataset_30.pq: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openml/openml-python/HEAD/tests/files/org/openml/test/datasets/30/dataset_30.pq -------------------------------------------------------------------------------- /tests/files/mock_responses/datasets/data_delete_successful.xml: -------------------------------------------------------------------------------- 1 | 2 | 40000 3 | 4 | -------------------------------------------------------------------------------- /tests/files/mock_responses/flows/flow_delete_successful.xml: -------------------------------------------------------------------------------- 1 | 2 | 33364 3 | 4 | 
-------------------------------------------------------------------------------- /tests/files/mock_responses/runs/run_delete_successful.xml: -------------------------------------------------------------------------------- 1 | 2 | 10591880 3 | 4 | -------------------------------------------------------------------------------- /tests/files/mock_responses/tasks/task_delete_successful.xml: -------------------------------------------------------------------------------- 1 | 2 | 361323 3 | 4 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | 3 | # Dummy to allow mock classes in the test files to have a version number for 4 | # their parent module 5 | __version__ = "0.1" 6 | -------------------------------------------------------------------------------- /tests/test_setups/__init__.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | 3 | # Dummy to allow mock classes in the test files to have a version number for 4 | # their parent module 5 | __version__ = "0.1" 6 | -------------------------------------------------------------------------------- /tests/files/mock_responses/runs/run_delete_not_exist.xml: -------------------------------------------------------------------------------- 1 | 2 | 392 3 | Run does not exist 4 | 5 | -------------------------------------------------------------------------------- /tests/files/mock_responses/flows/flow_delete_not_exist.xml: -------------------------------------------------------------------------------- 1 | 2 | 322 3 | flow does not exist 4 | 5 | -------------------------------------------------------------------------------- /tests/files/mock_responses/runs/run_delete_not_owned.xml: -------------------------------------------------------------------------------- 1 | 2 | 393 3 | Run is not owned by you 4 | 5 | -------------------------------------------------------------------------------- /tests/files/mock_responses/tasks/task_delete_not_exist.xml: -------------------------------------------------------------------------------- 1 | 2 | 452 3 | Task does not exist 4 | 5 | -------------------------------------------------------------------------------- /tests/files/mock_responses/datasets/data_delete_not_exist.xml: -------------------------------------------------------------------------------- 1 | 2 | 352 3 | Dataset does not exist 4 | 5 | -------------------------------------------------------------------------------- /tests/files/mock_responses/datasets/data_delete_not_owned.xml: -------------------------------------------------------------------------------- 1 | 2 | 353 3 | Dataset is not owned by you 4 | -------------------------------------------------------------------------------- /tests/files/mock_responses/flows/flow_delete_not_owned.xml: -------------------------------------------------------------------------------- 1 | 2 | 323 3 | flow is not owned by you 4 | 5 | -------------------------------------------------------------------------------- /tests/files/mock_responses/tasks/task_delete_not_owned.xml: -------------------------------------------------------------------------------- 1 | 2 | 453 3 | Task is not owned by you 4 | 5 | -------------------------------------------------------------------------------- /openml/__version__.py: -------------------------------------------------------------------------------- 1 | 
"""Version information.""" 2 | 3 | # License: BSD 3-Clause 4 | 5 | # The following line *must* be the last in the module, exactly as formatted: 6 | from __future__ import annotations 7 | 8 | __version__ = "0.16.0" 9 | -------------------------------------------------------------------------------- /tests/files/mock_responses/tasks/task_delete_has_runs.xml: -------------------------------------------------------------------------------- 1 | 2 | 454 3 | Task is executed in some runs. Delete these first 4 | 5 | -------------------------------------------------------------------------------- /tests/files/mock_responses/datasets/data_delete_has_tasks.xml: -------------------------------------------------------------------------------- 1 | 2 | 354 3 | Dataset is in use by other content. Can not be deleted 4 | 5 | -------------------------------------------------------------------------------- /tests/test_tasks/__init__.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | 3 | from .test_supervised_task import OpenMLSupervisedTaskTest 4 | from .test_task import OpenMLTaskTest 5 | 6 | __all__ = [ 7 | "OpenMLTaskTest", 8 | "OpenMLSupervisedTaskTest", 9 | ] 10 | -------------------------------------------------------------------------------- /tests/files/mock_responses/flows/flow_delete_has_runs.xml: -------------------------------------------------------------------------------- 1 | 2 | 324 3 | flow is in use by other content (runs). Can not be deleted 4 | {10716, 10707} () 5 | 6 | -------------------------------------------------------------------------------- /tests/files/mock_responses/flows/flow_delete_is_subflow.xml: -------------------------------------------------------------------------------- 1 | 2 | 328 3 | flow is in use by other content (it is a subflow). Can not be deleted 4 | {37661} 5 | 6 | -------------------------------------------------------------------------------- /examples/_external_or_deprecated/README.md: -------------------------------------------------------------------------------- 1 | # External or Deprecated Examples 2 | 3 | This directory contains examples that are either external or deprecated. They may not be maintained or updated 4 | regularly, and their functionality might not align with the latest version of the library. Moreover, 5 | they are not shown on the documentation website. 
-------------------------------------------------------------------------------- /openml/evaluations/__init__.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | 3 | from .evaluation import OpenMLEvaluation 4 | from .functions import list_evaluation_measures, list_evaluations, list_evaluations_setups 5 | 6 | __all__ = [ 7 | "OpenMLEvaluation", 8 | "list_evaluations", 9 | "list_evaluation_measures", 10 | "list_evaluations_setups", 11 | ] 12 | -------------------------------------------------------------------------------- /openml/setups/__init__.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | 3 | from .functions import get_setup, initialize_model, list_setups, setup_exists 4 | from .setup import OpenMLParameter, OpenMLSetup 5 | 6 | __all__ = [ 7 | "OpenMLSetup", 8 | "OpenMLParameter", 9 | "get_setup", 10 | "list_setups", 11 | "setup_exists", 12 | "initialize_model", 13 | ] 14 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | updates: 4 | # This will check for updates to github actions every day 5 | # https://docs.github.com/en/enterprise-server@3.4/code-security/dependabot/working-with-dependabot/keeping-your-actions-up-to-date-with-dependabot 6 | - package-ecosystem: "github-actions" 7 | directory: "/" 8 | schedule: 9 | interval: "daily" 10 | -------------------------------------------------------------------------------- /tests/test_flows/dummy_learn/dummy_forest.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | from __future__ import annotations 3 | 4 | 5 | class DummyRegressor: 6 | def fit(self, X, y): 7 | return self 8 | 9 | def predict(self, X): 10 | return X[:, 0] 11 | 12 | def get_params(self, deep=False): 13 | return {} 14 | 15 | def set_params(self, params): 16 | return self 17 | -------------------------------------------------------------------------------- /openml/flows/__init__.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | 3 | from .flow import OpenMLFlow 4 | from .functions import ( 5 | assert_flows_equal, 6 | delete_flow, 7 | flow_exists, 8 | get_flow, 9 | get_flow_id, 10 | list_flows, 11 | ) 12 | 13 | __all__ = [ 14 | "OpenMLFlow", 15 | "get_flow", 16 | "list_flows", 17 | "get_flow_id", 18 | "flow_exists", 19 | "assert_flows_equal", 20 | "delete_flow", 21 | ] 22 | -------------------------------------------------------------------------------- /openml/extensions/__init__.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | 3 | from typing import List, Type # noqa: F401 4 | 5 | from .extension_interface import Extension 6 | from .functions import get_extension_by_flow, get_extension_by_model, register_extension 7 | 8 | extensions = [] # type: List[Type[Extension]] 9 | 10 | 11 | __all__ = [ 12 | "Extension", 13 | "register_extension", 14 | "get_extension_by_model", 15 | "get_extension_by_flow", 16 | ] 17 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # simple makefile to simplify repetitive build env management tasks under posix 2 | 3 | PYTHON ?= python 4 | 
CYTHON ?= cython 5 | PYTEST ?= pytest 6 | CTAGS ?= ctags 7 | 8 | all: clean inplace test 9 | 10 | check: 11 | pre-commit run --all-files 12 | 13 | clean: 14 | $(PYTHON) setup.py clean 15 | rm -rf dist openml.egg-info 16 | 17 | in: inplace # just a shortcut 18 | inplace: 19 | $(PYTHON) setup.py build_ext -i 20 | 21 | test-code: in 22 | $(PYTEST) -s -v tests 23 | 24 | test-coverage: 25 | rm -rf coverage .coverage 26 | $(PYTEST) -s -v --cov=. tests 27 | 28 | test: test-code 29 | -------------------------------------------------------------------------------- /tests/files/org/openml/test/datasets/-1/description.xml: -------------------------------------------------------------------------------- 1 | 2 | -1 3 | dexter 4 | 1 5 | Test set of the dexter dataset as used 6 | in the AutoWEKA paper (Thornton et al. 2013) 7 | ARFF 8 | 9 | Public 10 | http://www.cs.ubc.ca/labs/beta/Projects/autoweka/datasets/dexter.zip 11 | class 12 | 13 | 14 | -------------------------------------------------------------------------------- /openml/runs/__init__.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | 3 | from .functions import ( 4 | delete_run, 5 | get_run, 6 | get_run_trace, 7 | get_runs, 8 | initialize_model_from_run, 9 | initialize_model_from_trace, 10 | list_runs, 11 | run_exists, 12 | run_flow_on_task, 13 | run_model_on_task, 14 | ) 15 | from .run import OpenMLRun 16 | from .trace import OpenMLRunTrace, OpenMLTraceIteration 17 | 18 | __all__ = [ 19 | "OpenMLRun", 20 | "OpenMLRunTrace", 21 | "OpenMLTraceIteration", 22 | "run_model_on_task", 23 | "run_flow_on_task", 24 | "get_run", 25 | "list_runs", 26 | "get_runs", 27 | "get_run_trace", 28 | "run_exists", 29 | "initialize_model_from_run", 30 | "initialize_model_from_trace", 31 | "delete_run", 32 | ] 33 | -------------------------------------------------------------------------------- /openml/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | 3 | from .data_feature import OpenMLDataFeature 4 | from .dataset import OpenMLDataset 5 | from .functions import ( 6 | attributes_arff_from_df, 7 | check_datasets_active, 8 | create_dataset, 9 | delete_dataset, 10 | edit_dataset, 11 | fork_dataset, 12 | get_dataset, 13 | get_datasets, 14 | list_datasets, 15 | list_qualities, 16 | status_update, 17 | ) 18 | 19 | __all__ = [ 20 | "attributes_arff_from_df", 21 | "check_datasets_active", 22 | "create_dataset", 23 | "get_dataset", 24 | "get_datasets", 25 | "list_datasets", 26 | "OpenMLDataset", 27 | "OpenMLDataFeature", 28 | "status_update", 29 | "list_qualities", 30 | "edit_dataset", 31 | "fork_dataset", 32 | "delete_dataset", 33 | ] 34 | -------------------------------------------------------------------------------- /openml/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | 3 | from .functions import ( 4 | create_task, 5 | delete_task, 6 | get_task, 7 | get_tasks, 8 | list_tasks, 9 | ) 10 | from .split import OpenMLSplit 11 | from .task import ( 12 | OpenMLClassificationTask, 13 | OpenMLClusteringTask, 14 | OpenMLLearningCurveTask, 15 | OpenMLRegressionTask, 16 | OpenMLSupervisedTask, 17 | OpenMLTask, 18 | TaskType, 19 | ) 20 | 21 | __all__ = [ 22 | "OpenMLTask", 23 | "OpenMLSupervisedTask", 24 | "OpenMLClusteringTask", 25 | "OpenMLRegressionTask", 26 | "OpenMLClassificationTask", 27 | "OpenMLLearningCurveTask", 28 | "create_task", 29 | 
"get_task", 30 | "get_tasks", 31 | "list_tasks", 32 | "OpenMLSplit", 33 | "TaskType", 34 | "delete_task", 35 | ] 36 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | # Dockerfile to build an image with preinstalled dependencies 2 | # Useful building docs or running unix tests from a Windows host. 3 | FROM python:3.10 4 | 5 | RUN git clone https://github.com/openml/openml-python.git openml 6 | WORKDIR openml 7 | RUN python -m venv venv 8 | RUN venv/bin/pip install wheel setuptools 9 | RUN venv/bin/pip install -e .[test,examples,docs,examples_unix] 10 | 11 | WORKDIR / 12 | RUN mkdir scripts 13 | ADD startup.sh scripts/ 14 | ADD readme.md / 15 | 16 | # Due to the nature of the Docker container it might often be built from Windows. 17 | # It is typical to have the files with \r\n line-ending, we want to remove it for the unix image. 18 | RUN sed -i 's/\r//g' scripts/startup.sh 19 | 20 | # overwrite the default `python` entrypoint 21 | ENTRYPOINT ["/bin/bash", "/scripts/startup.sh"] 22 | -------------------------------------------------------------------------------- /examples/Basics/simple_tasks_tutorial.py: -------------------------------------------------------------------------------- 1 | # %% [markdown] 2 | # A brief example on how to use tasks from OpenML. 3 | 4 | # %% 5 | 6 | import openml 7 | 8 | # %% [markdown] 9 | # Get a [task](https://docs.openml.org/concepts/tasks/) for 10 | # [supervised classification on credit-g](https://www.openml.org/search?type=task&id=31&source_data.data_id=31): 11 | 12 | # %% 13 | task = openml.tasks.get_task(31) 14 | 15 | # %% [markdown] 16 | # Get the dataset and its data from the task. 17 | 18 | # %% 19 | dataset = task.get_dataset() 20 | X, y, categorical_indicator, attribute_names = dataset.get_data(target=task.target_name) 21 | 22 | # %% [markdown] 23 | # Get the first out of the 10 cross-validation splits from the task. 
24 | 25 | # %% 26 | train_indices, test_indices = task.get_train_test_split_indices(fold=0) 27 | print(train_indices[:10]) # print the first 10 indices of the training set 28 | -------------------------------------------------------------------------------- /openml/study/__init__.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | 3 | from .functions import ( 4 | attach_to_study, 5 | attach_to_suite, 6 | create_benchmark_suite, 7 | create_study, 8 | delete_study, 9 | delete_suite, 10 | detach_from_study, 11 | detach_from_suite, 12 | get_study, 13 | get_suite, 14 | list_studies, 15 | list_suites, 16 | update_study_status, 17 | update_suite_status, 18 | ) 19 | from .study import OpenMLBenchmarkSuite, OpenMLStudy 20 | 21 | __all__ = [ 22 | "OpenMLStudy", 23 | "OpenMLBenchmarkSuite", 24 | "attach_to_study", 25 | "attach_to_suite", 26 | "create_benchmark_suite", 27 | "create_study", 28 | "delete_study", 29 | "delete_suite", 30 | "detach_from_study", 31 | "detach_from_suite", 32 | "get_study", 33 | "get_suite", 34 | "list_studies", 35 | "list_suites", 36 | "update_suite_status", 37 | "update_study_status", 38 | ] 39 | -------------------------------------------------------------------------------- /tests/files/org/openml/test/setups/1/description.xml: -------------------------------------------------------------------------------- 1 | 2 | 100 3 | 60 4 | 5 | 3432 6 | 60 7 | weka.J48 8 | weka.J48(1)_C 9 | C 10 | option 11 | 0.25 12 | 0.9 13 | 14 | 15 | 3435 16 | 60 17 | weka.J48 18 | weka.J48(1)_M 19 | M 20 | option 21 | 2 22 | 2 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /tests/test_tasks/test_supervised_task.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | from __future__ import annotations 3 | 4 | import unittest 5 | 6 | import pandas as pd 7 | 8 | from openml.tasks import get_task 9 | 10 | from .test_task import OpenMLTaskTest 11 | 12 | 13 | class OpenMLSupervisedTaskTest(OpenMLTaskTest): 14 | """ 15 | A helper class. The methods of the test case 16 | are only executed in subclasses of the test case. 17 | """ 18 | 19 | __test__ = False 20 | 21 | @classmethod 22 | def setUpClass(cls): 23 | if cls is OpenMLSupervisedTaskTest: 24 | raise unittest.SkipTest("Skip OpenMLSupervisedTaskTest tests," " it's a base class") 25 | super().setUpClass() 26 | 27 | def setUp(self, n_levels: int = 1): 28 | super().setUp() 29 | 30 | def test_get_X_and_Y(self) -> tuple[pd.DataFrame, pd.Series]: 31 | task = get_task(self.task_id) 32 | X, Y = task.get_X_and_y() 33 | return X, Y 34 | -------------------------------------------------------------------------------- /tests/files/misc/features_with_whitespaces.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 0 4 | V1 5 | numeric 6 | false 7 | false 8 | false 9 | 0 10 | 11 | 12 | 1 13 | V42 14 | nominal 15 | - 50000. 16 | 50000+. 
17 | false 18 | false 19 | false 20 | 0 21 | 22 | 23 | -------------------------------------------------------------------------------- /tests/test_tasks/test_learning_curve_task.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | from __future__ import annotations 3 | 4 | import pandas as pd 5 | 6 | from openml.tasks import TaskType, get_task 7 | 8 | from .test_supervised_task import OpenMLSupervisedTaskTest 9 | 10 | 11 | class OpenMLLearningCurveTaskTest(OpenMLSupervisedTaskTest): 12 | __test__ = True 13 | 14 | def setUp(self, n_levels: int = 1): 15 | super().setUp() 16 | self.task_id = 801 # diabetes 17 | self.task_type = TaskType.LEARNING_CURVE 18 | self.estimation_procedure = 13 19 | 20 | def test_get_X_and_Y(self): 21 | X, Y = super().test_get_X_and_Y() 22 | assert X.shape == (768, 8) 23 | assert isinstance(X, pd.DataFrame) 24 | assert Y.shape == (768,) 25 | assert isinstance(Y, pd.Series) 26 | assert pd.api.types.is_categorical_dtype(Y) 27 | 28 | def test_download_task(self): 29 | task = super().test_download_task() 30 | assert task.task_id == self.task_id 31 | assert task.task_type_id == TaskType.LEARNING_CURVE 32 | assert task.dataset_id == 20 33 | 34 | def test_class_labels(self): 35 | task = get_task(self.task_id) 36 | assert task.class_labels == ["tested_negative", "tested_positive"] 37 | -------------------------------------------------------------------------------- /.github/workflows/dist.yaml: -------------------------------------------------------------------------------- 1 | name: dist-check 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | push: 7 | branches: 8 | - main 9 | - develop 10 | tags: 11 | - "v*.*.*" 12 | 13 | pull_request: 14 | branches: 15 | - main 16 | - develop 17 | 18 | concurrency: 19 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} 20 | cancel-in-progress: true 21 | 22 | jobs: 23 | dist: 24 | runs-on: ubuntu-latest 25 | steps: 26 | - uses: actions/checkout@v4 27 | - name: Setup Python 28 | uses: actions/setup-python@v5 29 | with: 30 | python-version: 3.8 31 | - name: Build dist 32 | run: | 33 | pip install build 34 | python -m build --sdist 35 | - name: Twine check 36 | run: | 37 | pip install twine 38 | last_dist=$(ls -t dist/openml-*.tar.gz | head -n 1) 39 | twine check $last_dist 40 | - name: Install dist 41 | run: | 42 | last_dist=$(ls -t dist/openml-*.tar.gz | head -n 1) 43 | pip install $last_dist 44 | - name: PEP 561 Compliance 45 | run: | 46 | pip install mypy 47 | cd .. # required to use the installed version of openml 48 | if ! python -m mypy -c "import openml"; then exit 1; fi 49 | -------------------------------------------------------------------------------- /docs/contributing.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | Contribution to the OpenML package is highly appreciated in all forms. 4 | In particular, a few ways to contribute to openml-python are: 5 | 6 | - A direct contribution to the package, by means of improving the 7 | code, documentation or examples. To get started, see [this 8 | file](https://github.com/openml/openml-python/blob/main/CONTRIBUTING.md) 9 | with details on how to set up your environment to develop for 10 | openml-python. 11 | - A contribution to an openml-python extension. An extension package 12 | allows OpenML to interface with a machine learning package (such 13 | as scikit-learn or keras). 
These extensions are hosted in separate 14 | repositories and may have their own guidelines. For more 15 | information, see also [extensions](extensions.md). 16 | - Bug reports. If something doesn't work for you or is cumbersome, 17 | please open a new issue to let us know about the problem. 18 | - [Cite OpenML](https://www.openml.org/terms) if you use it in a 19 | scientific publication. 20 | - Visit one of our [hackathons](https://www.openml.org/meet). 21 | - Contribute to another OpenML project, such as [the main OpenML 22 | project](https://github.com/openml/OpenML/blob/master/CONTRIBUTING.md). 23 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 14 | 15 | #### Metadata 16 | * Reference Issue: 17 | * New Tests Added: 18 | * Documentation Updated: 19 | * Change Log Entry: 20 | 21 | 22 | #### Details 23 | 31 | 32 | -------------------------------------------------------------------------------- /examples/introduction.py: -------------------------------------------------------------------------------- 1 | # %% [markdown] 2 | # 3 | # We provide a set of examples here to get started with OpenML-Python. These examples cover various aspects of using the 4 | # OpenML API, including downloading datasets, uploading results, and working with tasks. 5 | # 6 | # ## Basics 7 | # 8 | # 1. [Installing and setting up OpenML-Python](../Basics/introduction_tutorial/) 9 | # 2. [Downloading datasets](../Basics/simple_datasets_tutorial/) 10 | # 3. [Using tasks](../Basics/simple_tasks_tutorial/) 11 | # 4. [Uploading experiment results](../Basics/simple_flows_and_runs_tutorial/) 12 | # 5. [Working with collections of tasks](../Basics/simple_suites_tutorial/) 13 | # 14 | # ## Advanced 15 | # 1. [Getting splits for datasets from tasks](../Advanced/task_manual_iteration_tutorial/) 16 | # 2. [Creating and uploading datasets](../Advanced/create_upload_tutorial/) 17 | # 3. [Searching and editing datasets](../Advanced/datasets_tutorial/) 18 | # 4. [Searching and creating tasks](../Advanced/task_tutorial/) 19 | # 5. [Listing, downloading, and uploading suites](../Advanced/suites_tutorial/) 20 | # 6. [Listing, downloading, and uploading studies](../Advanced/study_tutorial/) 21 | # 7. [Downloading evaluation results](../Advanced/fetch_evaluations_tutorial/) 22 | # 8. 
[Configuring logging](../Advanced/configure_logging/) 23 | -------------------------------------------------------------------------------- /tests/test_tasks/test_classification_task.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | from __future__ import annotations 3 | 4 | import pandas as pd 5 | import pytest 6 | 7 | from openml.tasks import TaskType, get_task 8 | 9 | from .test_supervised_task import OpenMLSupervisedTaskTest 10 | 11 | 12 | class OpenMLClassificationTaskTest(OpenMLSupervisedTaskTest): 13 | __test__ = True 14 | 15 | def setUp(self, n_levels: int = 1): 16 | super().setUp() 17 | self.task_id = 119 # diabetes 18 | self.task_type = TaskType.SUPERVISED_CLASSIFICATION 19 | self.estimation_procedure = 5 20 | 21 | def test_download_task(self): 22 | task = super().test_download_task() 23 | assert task.task_id == self.task_id 24 | assert task.task_type_id == TaskType.SUPERVISED_CLASSIFICATION 25 | assert task.dataset_id == 20 26 | assert task.estimation_procedure_id == self.estimation_procedure 27 | 28 | def test_class_labels(self): 29 | task = get_task(self.task_id) 30 | assert task.class_labels == ["tested_negative", "tested_positive"] 31 | 32 | 33 | @pytest.mark.server() 34 | def test_get_X_and_Y(): 35 | task = get_task(119) 36 | X, Y = task.get_X_and_y() 37 | assert X.shape == (768, 8) 38 | assert isinstance(X, pd.DataFrame) 39 | assert Y.shape == (768,) 40 | assert isinstance(Y, pd.Series) 41 | assert pd.api.types.is_categorical_dtype(Y) 42 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | default_language_version: 2 | python: python3 3 | files: | 4 | (?x)^( 5 | openml| 6 | tests 7 | )/.*\.py$ 8 | repos: 9 | - repo: https://github.com/astral-sh/ruff-pre-commit 10 | rev: v0.7.3 11 | hooks: 12 | - id: ruff 13 | args: [--fix, --exit-non-zero-on-fix, --no-cache] 14 | - id: ruff-format 15 | - repo: https://github.com/pre-commit/mirrors-mypy 16 | rev: v1.13.0 17 | hooks: 18 | - id: mypy 19 | additional_dependencies: 20 | - types-requests 21 | - types-python-dateutil 22 | - repo: https://github.com/python-jsonschema/check-jsonschema 23 | rev: 0.29.4 24 | hooks: 25 | - id: check-github-workflows 26 | files: '^\.github/workflows/.*\.ya?ml$' 27 | types: ["yaml"] 28 | - id: check-dependabot 29 | files: '^\.github/dependabot\.ya?ml$' 30 | - repo: https://github.com/pre-commit/pre-commit-hooks 31 | rev: v5.0.0 32 | hooks: 33 | - id: check-added-large-files 34 | files: ".*" 35 | - id: check-case-conflict 36 | files: ".*" 37 | - id: check-merge-conflict 38 | files: ".*" 39 | - id: check-yaml 40 | files: ".*" 41 | - id: end-of-file-fixer 42 | files: ".*" 43 | types: ["yaml"] 44 | - id: check-toml 45 | files: ".*" 46 | types: ["toml"] 47 | - id: debug-statements 48 | files: '^openml/.*\.py$' 49 | -------------------------------------------------------------------------------- /tests/test_openml/test_openml.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | from __future__ import annotations 3 | 4 | from unittest import mock 5 | 6 | import openml 7 | from openml.testing import TestBase 8 | 9 | 10 | class TestInit(TestBase): 11 | # Splitting not helpful, these tests don't rely on the server and take less 12 | # than a second 13 | 14 | @mock.patch("openml.tasks.functions.get_task") 15 | 
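# Patches apply bottom-up: the patch closest to the function (get_run)
# is passed as the first mock argument (run_mock).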
@mock.patch("openml.datasets.functions.get_dataset") 16 | @mock.patch("openml.flows.functions.get_flow") 17 | @mock.patch("openml.runs.functions.get_run") 18 | def test_populate_cache( 19 | self, 20 | run_mock, 21 | flow_mock, 22 | dataset_mock, 23 | task_mock, 24 | ): 25 | openml.populate_cache(task_ids=[1, 2], dataset_ids=[3, 4], flow_ids=[5, 6], run_ids=[7, 8]) 26 | assert run_mock.call_count == 2 27 | for argument, fixture in zip(run_mock.call_args_list, [(7,), (8,)]): 28 | assert argument[0] == fixture 29 | 30 | assert flow_mock.call_count == 2 31 | for argument, fixture in zip(flow_mock.call_args_list, [(5,), (6,)]): 32 | assert argument[0] == fixture 33 | 34 | assert dataset_mock.call_count == 2 35 | for argument, fixture in zip( 36 | dataset_mock.call_args_list, 37 | [(3,), (4,)], 38 | ): 39 | assert argument[0] == fixture 40 | 41 | assert task_mock.call_count == 2 42 | for argument, fixture in zip(task_mock.call_args_list, [(1,), (2,)]): 43 | assert argument[0] == fixture 44 | -------------------------------------------------------------------------------- /tests/files/org/openml/test/tasks/1882/task.xml: -------------------------------------------------------------------------------- 1 | 2 | 1882 3 | 1 4 | Supervised Classification 5 | 6 | 7 | 2 8 | class 9 | 10 | 11 | 12 | 3 13 | crossvalidation 14 | http://capa.win.tue.nl/api_splits/get/1882/Task_1882_splits.arff 15 | 10 16 | 10 17 | 18 | true 19 | 20 | 21 | 22 | 23 | 24 | predictive_accuracy 25 | 26 | 27 | 28 | ARFF 29 | 30 | 31 | 32 | 33 | 34 | 35 | under100k 36 | under1m 37 | 38 | -------------------------------------------------------------------------------- /examples/Basics/simple_datasets_tutorial.py: -------------------------------------------------------------------------------- 1 | # %% [markdown] 2 | # A basic tutorial on how to list, load and visualize datasets. 3 | # 4 | # In general, we recommend working with tasks, so that the results can 5 | # be easily reproduced. Furthermore, the results can be compared to existing results 6 | # at OpenML. However, for the purposes of this tutorial, we are going to work with 7 | # the datasets directly. 8 | 9 | # %% 10 | 11 | import openml 12 | 13 | # %% [markdown] 14 | # ## List datasets stored on OpenML 15 | 16 | # %% 17 | datasets_df = openml.datasets.list_datasets() 18 | print(datasets_df.head(n=10)) 19 | 20 | # %% [markdown] 21 | # ## Download a dataset 22 | 23 | # %% 24 | # Iris dataset https://www.openml.org/d/61 25 | dataset = openml.datasets.get_dataset(dataset_id=61) 26 | 27 | # Print a summary 28 | print( 29 | f"This is dataset '{dataset.name}', the target feature is '{dataset.default_target_attribute}'" 30 | ) 31 | print(f"URL: {dataset.url}") 32 | print(dataset.description[:500]) 33 | 34 | # %% [markdown] 35 | # ## Load a dataset 36 | # * `X` - A dataframe where each row represents one example with 37 | # the corresponding feature values. 
38 | # * `y` - the classes for each example 39 | # * `categorical_indicator` - a list that indicates which feature is categorical 40 | # * `attribute_names` - the names of the features for the examples (X) and 41 | # target feature (y) 42 | 43 | # %% 44 | X, y, categorical_indicator, attribute_names = dataset.get_data( 45 | target=dataset.default_target_attribute 46 | ) 47 | 48 | # %% [markdown] 49 | # Visualize the dataset 50 | 51 | # %% 52 | import matplotlib.pyplot as plt 53 | import pandas as pd 54 | import seaborn as sns 55 | 56 | iris_plot = sns.pairplot(pd.concat([X, y], axis=1), hue="class") 57 | plt.show() 58 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | doc/generated 3 | examples/.ipynb_checkpoints 4 | venv 5 | .uv-lock 6 | uv.lock 7 | 8 | # Byte-compiled / optimized / DLL files 9 | __pycache__/ 10 | *.py[cod] 11 | *$py.class 12 | 13 | # C extensions 14 | *.so 15 | 16 | # scikit-learn specific 17 | doc/_build/ 18 | doc/auto_examples/ 19 | doc/modules/generated/ 20 | doc/datasets/generated/ 21 | 22 | # Some stuff from testing? 23 | tests/files/org/openml/test/datasets/1/ 24 | tests/files/org/openml/test/datasets/2/features.xml.pkl 25 | tests/files/org/openml/test/datasets/2/qualities.xml.pkl 26 | tests/files/org/openml/test/locks/ 27 | tests/files/org/openml/test/tasks/1/datasplits.pkl.py3 28 | tests/files/org/openml/test/tasks/1882/datasplits.pkl.py3 29 | 30 | # Distribution / packaging 31 | 32 | .Python 33 | env/ 34 | build/ 35 | develop-eggs/ 36 | dist/ 37 | downloads/ 38 | eggs/ 39 | .eggs/ 40 | lib/ 41 | lib64/ 42 | parts/ 43 | sdist/ 44 | var/ 45 | *.egg-info/ 46 | .installed.cfg 47 | *.egg 48 | 49 | # PyInstaller 50 | # Usually these files are written by a python script from a template 51 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
52 | *.manifest 53 | *.spec 54 | 55 | # Installer logs 56 | pip-log.txt 57 | pip-delete-this-directory.txt 58 | 59 | # Unit test / coverage reports 60 | htmlcov/ 61 | cover 62 | coverage 63 | htmlcov 64 | .tox/ 65 | .coverage 66 | .coverage.* 67 | .cache 68 | nosetests.xml 69 | coverage.xml 70 | *,cover 71 | .hypothesis/ 72 | prof/ 73 | 74 | # Translations 75 | *.mo 76 | *.pot 77 | 78 | # Django stuff: 79 | *.log 80 | 81 | # Sphinx documentation 82 | docs/_build/ 83 | 84 | # PyBuilder 85 | target/ 86 | 87 | # IDE 88 | .idea 89 | *.swp 90 | .vscode 91 | 92 | # MYPY 93 | .mypy_cache 94 | dmypy.json 95 | dmypy.sock 96 | 97 | # Tests 98 | .pytest_cache 99 | .venv -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 12 | 13 | #### Description 14 | 15 | 16 | #### Steps/Code to Reproduce 17 | 26 | 27 | #### Expected Results 28 | 29 | 30 | #### Actual Results 31 | 32 | 33 | #### Versions 34 | 46 | 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /tests/files/org/openml/test/tasks/1/task.xml: -------------------------------------------------------------------------------- 1 | 2 | 1 3 | 1 4 | Supervised Classification 5 | 6 | 7 | 1 8 | class 9 | 10 | 11 | 12 | 1 13 | crossvalidation 14 | http://www.openml.org/api_splits/get/1/Task_1_splits.arff 15 | 1 16 | 10 17 | 18 | true 19 | 20 | 21 | 22 | 23 | 24 | predictive_accuracy 25 | 26 | 27 | 28 | ARFF 29 | 30 | 31 | 32 | 33 | 34 | 35 | basic 36 | study_1 37 | study_7 38 | under100k 39 | under1m 40 | 41 | -------------------------------------------------------------------------------- /tests/files/org/openml/test/tasks/3/task.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 3 | 1 4 | Supervised Classification 5 | 6 | 7 | 3 8 | class 9 | 10 | 11 | 12 | 1 13 | crossvalidation 14 | http://www.openml.org/api_splits/get/3/Task_3_splits.arff 15 | 1 16 | 10 17 | 18 | true 19 | 20 | 21 | 22 | 23 | 24 | predictive_accuracy 25 | 26 | 27 | 28 | ARFF 29 | 30 | 31 | 32 | 33 | 34 | 35 | basic 36 | mythbusting 37 | mythbusting_1 38 | study_1 39 | study_7 40 | under100k 41 | under1m 42 | 43 | -------------------------------------------------------------------------------- /tests/test_evaluations/test_evaluations_example.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | from __future__ import annotations 3 | 4 | import unittest 5 | 6 | from openml.config import overwrite_config_context 7 | 8 | 9 | class TestEvaluationsExample(unittest.TestCase): 10 | def test_example_python_paper(self): 11 | # Example script which will appear in the upcoming OpenML-Python paper 12 | # This test ensures that the example will keep running! 13 | with overwrite_config_context( 14 | { 15 | "server": "https://www.openml.org/api/v1/xml", 16 | "apikey": None, 17 | } 18 | ): 19 | import matplotlib.pyplot as plt 20 | import numpy as np 21 | import openml 22 | 23 | df = openml.evaluations.list_evaluations_setups( 24 | "predictive_accuracy", 25 | flows=[8353], 26 | tasks=[6], 27 | parameters_in_separate_columns=True, 28 | ) # Choose an SVM flow, for example 8353, and a task. 29 | 30 | assert len(df) > 0, ( 31 | "No evaluation found for flow 8353 on task 6, could " 32 | "be that this task is not available on the test server." 
33 | ) 34 | 35 | hp_names = ["sklearn.svm.classes.SVC(16)_C", "sklearn.svm.classes.SVC(16)_gamma"] 36 | df[hp_names] = df[hp_names].astype(float).apply(np.log) 37 | C, gamma, score = df[hp_names[0]], df[hp_names[1]], df["value"] 38 | 39 | cntr = plt.tricontourf(C, gamma, score, levels=12, cmap="RdBu_r") 40 | plt.colorbar(cntr, label="accuracy") 41 | plt.xlim((min(C), max(C))) 42 | plt.ylim((min(gamma), max(gamma))) 43 | plt.xlabel("C (log10)", size=16) 44 | plt.ylabel("gamma (log10)", size=16) 45 | plt.title("SVM performance landscape", size=20) 46 | 47 | plt.tight_layout() 48 | -------------------------------------------------------------------------------- /.github/workflows/docs.yaml: -------------------------------------------------------------------------------- 1 | name: Docs 2 | on: 3 | workflow_dispatch: 4 | 5 | push: 6 | branches: 7 | - main 8 | - develop 9 | tags: 10 | - "v*.*.*" 11 | 12 | pull_request: 13 | branches: 14 | - main 15 | - develop 16 | 17 | concurrency: 18 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} 19 | cancel-in-progress: true 20 | 21 | jobs: 22 | build-and-deploy: 23 | runs-on: ubuntu-latest 24 | steps: 25 | - uses: actions/checkout@v4 26 | with: 27 | fetch-depth: 0 28 | - name: Setup Python 29 | uses: actions/setup-python@v5 30 | with: 31 | python-version: 3.8 32 | - name: Install dependencies 33 | run: | 34 | pip install -e .[docs,examples] 35 | - name: Make docs 36 | run: | 37 | mkdocs build 38 | - name: Deploy to GitHub Pages 39 | env: 40 | CI: false 41 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 42 | PAGES_BRANCH: gh-pages 43 | if: (contains(github.ref, 'develop') || contains(github.ref, 'main')) && github.event_name == 'push' 44 | run: | 45 | git config user.name doc-bot 46 | git config user.email doc-bot@openml.com 47 | current_version=$(git tag | sort --version-sort | tail -n 1) 48 | # This block will rename previous retitled versions 49 | retitled_versions=$(mike list -j | jq ".[] | select(.title != .version) | .version" | tr -d '"') 50 | for version in $retitled_versions; do 51 | mike retitle "${version}" "${version}" 52 | done 53 | 54 | echo "Deploying docs for ${current_version}" 55 | mike set-default latest 56 | mike deploy \ 57 | --push \ 58 | --title "${current_version} (latest)" \ 59 | --update-aliases \ 60 | "${current_version}" \ 61 | "latest"\ 62 | -b $PAGES_BRANCH 63 | -------------------------------------------------------------------------------- /scripts/gen_ref_pages.py: -------------------------------------------------------------------------------- 1 | """Generate the code reference pages. 2 | 3 | based on https://github.com/mkdocstrings/mkdocstrings/blob/33aa573efb17b13e7b9da77e29aeccb3fbddd8e8/docs/recipes.md 4 | but modified for lack of "src/" file structure. 
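The script walks every module under ``openml/``, writes a mkdocstrings stub
page per module into ``reference/``, and mirrors the ``examples`` directory
(excluding ``_external_or_deprecated``) into the documentation navigation.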
5 | 6 | """ 7 | 8 | from __future__ import annotations 9 | 10 | from pathlib import Path 11 | 12 | import mkdocs_gen_files 13 | 14 | nav = mkdocs_gen_files.Nav() 15 | 16 | root = Path(__file__).parent.parent 17 | src = root / "openml" 18 | 19 | for path in sorted(src.rglob("*.py")): 20 | module_path = path.relative_to(root).with_suffix("") 21 | doc_path = path.relative_to(src).with_suffix(".md") 22 | full_doc_path = Path("reference", doc_path) 23 | 24 | parts = tuple(module_path.parts) 25 | 26 | if parts[-1] == "__init__": 27 | parts = parts[:-1] 28 | doc_path = doc_path.with_name("index.md") 29 | full_doc_path = full_doc_path.with_name("index.md") 30 | elif parts[-1] == "__main__": 31 | continue 32 | 33 | nav[parts] = doc_path.as_posix() 34 | 35 | with mkdocs_gen_files.open(full_doc_path, "w") as fd: 36 | identifier = ".".join(parts) 37 | print("::: " + identifier, file=fd) 38 | 39 | mkdocs_gen_files.set_edit_path(full_doc_path, path.relative_to(root)) 40 | 41 | with mkdocs_gen_files.open("reference/SUMMARY.md", "w") as nav_file: 42 | nav_file.writelines(nav.build_literate_nav()) 43 | 44 | nav = mkdocs_gen_files.Nav() 45 | examples_dir = root / "examples" 46 | examples_doc_dir = root / "docs" / "examples" 47 | for path in sorted(examples_dir.rglob("*.py")): 48 | if "_external_or_deprecated" in path.parts: 49 | continue 50 | dest_path = Path("examples") / path.relative_to(examples_dir) 51 | with mkdocs_gen_files.open(dest_path, "w") as dest_file: 52 | print(path.read_text(), file=dest_file) 53 | 54 | new_relative_location = Path("../") / dest_path 55 | nav[new_relative_location.parts[2:]] = new_relative_location.as_posix() 56 | 57 | with mkdocs_gen_files.open("examples/SUMMARY.md", "w") as nav_file: 58 | nav_file.writelines(nav.build_literate_nav()) 59 | -------------------------------------------------------------------------------- /.github/workflows/release_docker.yaml: -------------------------------------------------------------------------------- 1 | name: release-docker 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | branches: 7 | - 'develop' 8 | - 'docker' 9 | tags: 10 | - 'v*' 11 | 12 | concurrency: 13 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} 14 | cancel-in-progress: true 15 | 16 | jobs: 17 | 18 | docker: 19 | 20 | runs-on: ubuntu-latest 21 | 22 | steps: 23 | - name: Set up QEMU 24 | uses: docker/setup-qemu-action@v3 25 | 26 | - name: Set up Docker Buildx 27 | uses: docker/setup-buildx-action@v3 28 | 29 | - name: Login to DockerHub 30 | if: github.event_name != 'pull_request' 31 | uses: docker/login-action@v3 32 | with: 33 | username: ${{ secrets.DOCKERHUB_USERNAME }} 34 | password: ${{ secrets.DOCKERHUB_TOKEN }} 35 | 36 | - name: Check out the repo 37 | uses: actions/checkout@v4 38 | 39 | - name: Extract metadata (tags, labels) for Docker Hub 40 | id: meta_dockerhub 41 | uses: docker/metadata-action@v5 42 | with: 43 | images: "openml/openml-python" 44 | 45 | - name: Build and push 46 | id: docker_build 47 | uses: docker/build-push-action@v6 48 | with: 49 | context: ./docker/ 50 | tags: ${{ steps.meta_dockerhub.outputs.tags }} 51 | labels: ${{ steps.meta_dockerhub.outputs.labels }} 52 | platforms: linux/amd64,linux/arm64 53 | push: ${{ github.event_name == 'push' }} 54 | 55 | - name: Update repo description 56 | if: ${{ startsWith(github.ref, 'refs/tags/v') }} 57 | uses: peter-evans/dockerhub-description@v4 58 | with: 59 | username: ${{ secrets.DOCKERHUB_USERNAME }} 60 | password: ${{ secrets.DOCKERHUB_TOKEN }} 61 | repository: 
openml/openml-python 62 | short-description: "pre-installed openml-python environment" 63 | readme-filepath: ./docker/readme.md 64 | 65 | - name: Image digest 66 | run: echo ${{ steps.docker_build.outputs.digest }} 67 | -------------------------------------------------------------------------------- /tests/test_tasks/test_task_methods.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | from __future__ import annotations 3 | 4 | from time import time 5 | 6 | import openml 7 | from openml.testing import TestBase 8 | 9 | 10 | # Common methods between tasks 11 | class OpenMLTaskMethodsTest(TestBase): 12 | def setUp(self): 13 | super().setUp() 14 | 15 | def tearDown(self): 16 | super().tearDown() 17 | 18 | def test_tagging(self): 19 | task = openml.tasks.get_task(1) # anneal; crossvalidation 20 | # tags can be at most 64 alphanumeric (+ underscore) chars 21 | unique_indicator = str(time()).replace(".", "") 22 | tag = f"test_tag_OpenMLTaskMethodsTest_{unique_indicator}" 23 | tasks = openml.tasks.list_tasks(tag=tag) 24 | assert len(tasks) == 0 25 | task.push_tag(tag) 26 | tasks = openml.tasks.list_tasks(tag=tag) 27 | assert len(tasks) == 1 28 | assert 1 in tasks["tid"] 29 | task.remove_tag(tag) 30 | tasks = openml.tasks.list_tasks(tag=tag) 31 | assert len(tasks) == 0 32 | 33 | def test_get_train_and_test_split_indices(self): 34 | openml.config.set_root_cache_directory(self.static_cache_dir) 35 | task = openml.tasks.get_task(1882) 36 | train_indices, test_indices = task.get_train_test_split_indices(0, 0) 37 | assert train_indices[0] == 16 38 | assert train_indices[-1] == 395 39 | assert test_indices[0] == 412 40 | assert test_indices[-1] == 364 41 | train_indices, test_indices = task.get_train_test_split_indices(2, 2) 42 | assert train_indices[0] == 237 43 | assert train_indices[-1] == 681 44 | assert test_indices[0] == 583 45 | assert test_indices[-1] == 24 46 | self.assertRaisesRegex( 47 | ValueError, 48 | "Fold 10 not known", 49 | task.get_train_test_split_indices, 50 | 10, 51 | 0, 52 | ) 53 | self.assertRaisesRegex( 54 | ValueError, 55 | "Repeat 10 not known", 56 | task.get_train_test_split_indices, 57 | 0, 58 | 10, 59 | ) 60 | -------------------------------------------------------------------------------- /examples/Basics/introduction_tutorial.py: -------------------------------------------------------------------------------- 1 | # %% [markdown] 2 | # ## Installation 3 | # Installation is done via ``pip``: 4 | # 5 | # ```bash 6 | # pip install openml 7 | # ``` 8 | 9 | # %% [markdown] 10 | # ## Authentication 11 | # 12 | # For certain functionality, such as uploading tasks or datasets, users have to 13 | # sign up. Only accessing the data on OpenML does not require an account! 14 | # 15 | # If you don’t have an account yet, sign up now. 16 | # You will receive an API key, which will authenticate you to the server 17 | # and allow you to download and upload datasets, tasks, runs and flows. 18 | # 19 | # * Create an OpenML account (free) on https://www.openml.org. 20 | # * After logging in, open your account page (avatar on the top right) 21 | # * Open 'Account Settings', then 'API authentication' to find your API key. 22 | # 23 | # There are two ways to permanently authenticate: 24 | # 25 | # * Use the ``openml`` CLI tool with ``openml configure apikey MYKEY``, 26 | # replacing **MYKEY** with your API key. 
27 | # * Create a plain text file **~/.openml/config** with the line 28 | # **'apikey=MYKEY'**, replacing **MYKEY** with your API key. The config 29 | # file must be at ~/.openml/config and exist prior to 30 | # importing the openml module. 31 | # 32 | # Alternatively, by running the code below and replacing 'YOURKEY' with your API key, 33 | # you authenticate for the duration of the Python process. 34 | 35 | # %% 36 | import openml 37 | 38 | openml.config.apikey = "YOURKEY" 39 | 40 | # %% [markdown] 41 | # ## Caching 42 | # When downloading datasets, tasks, runs and flows, they will be cached to 43 | # retrieve them without calling the server later. As with the API key, 44 | # the cache directory can be specified either through the config file or 45 | # through the API: 46 | # 47 | # * Add the line **cachedir = 'MYDIR'** to the config file, replacing 48 | # 'MYDIR' with the path to the cache directory. By default, OpenML 49 | # will use **~/.openml/cache** as the cache directory. 50 | # * Run the code below, replacing 'YOURDIR' with the path to the cache directory. 51 | 52 | # %% 53 | import openml 54 | 55 | openml.config.set_root_cache_directory("YOURDIR") -------------------------------------------------------------------------------- /examples/Basics/simple_suites_tutorial.py: -------------------------------------------------------------------------------- 1 | # %% [markdown] 2 | # This is a brief showcase of OpenML benchmark suites, which were introduced by 3 | # [Bischl et al. (2019)](https://arxiv.org/abs/1708.03731v2). Benchmark suites standardize the 4 | # datasets and splits to be used in an experiment or paper. They are fully integrated into OpenML 5 | # and simplify both the sharing of the setup and the results. 6 | 7 | # %% 8 | import openml 9 | 10 | # %% [markdown] 11 | # ## OpenML-CC18 12 | # 13 | # As an example we have a look at the OpenML-CC18, which is a suite of 72 classification datasets 14 | # from OpenML which were carefully selected to be usable by many algorithms. These are all datasets 15 | # from mid-2018 that satisfy a large set of clear requirements for thorough yet practical benchmarking: 16 | # 17 | # 1. the number of observations is between 500 and 100,000, to focus on medium-sized datasets, 18 | # 2. the number of features does not exceed 5,000 to keep the runtime of the algorithms 19 | # low 20 | # 3. the target attribute has at least two classes with no class having fewer than 20 observations 21 | # 4. the ratio of the minority class and the majority class is above 0.05 (to eliminate highly 22 | # imbalanced datasets which require special treatment for both algorithms and evaluation 23 | # measures). 24 | # 25 | # A full description can be found in the 26 | # [OpenML benchmarking docs](https://docs.openml.org/benchmark/#openml-cc18). 27 | # 28 | # In this example, we'll focus on how to use benchmark suites in practice. 29 | 30 | # %% [markdown] 31 | # ## Downloading benchmark suites 32 | 33 | # %% 34 | suite = openml.study.get_suite(99) 35 | print(suite) 36 | 37 | # %% [markdown] 38 | # The benchmark suite does not download the included tasks and datasets itself, but only contains 39 | # a list of which tasks constitute the study. 40 | # 41 | # Tasks can then be accessed via 42 | 43 | # %% 44 | tasks = suite.tasks 45 | print(tasks) 46 | 47 | # %% [markdown] 48 | # and iterated over for benchmarking. 
For speed reasons, we only iterate over the first three tasks: 49 | 50 | # %% 51 | for task_id in tasks[:3]: 52 | task = openml.tasks.get_task(task_id) 53 | print(task) 54 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software in a publication, please cite the metadata from preferred-citation." 3 | preferred-citation: 4 | type: article 5 | authors: 6 | - family-names: "Feurer" 7 | given-names: "Matthias" 8 | orcid: "https://orcid.org/0000-0001-9611-8588" 9 | - family-names: "van Rijn" 10 | given-names: "Jan N." 11 | orcid: "https://orcid.org/0000-0003-2898-2168" 12 | - family-names: "Kadra" 13 | given-names: "Arlind" 14 | - family-names: "Gijsbers" 15 | given-names: "Pieter" 16 | orcid: "https://orcid.org/0000-0001-7346-8075" 17 | - family-names: "Mallik" 18 | given-names: "Neeratyoy" 19 | orcid: "https://orcid.org/0000-0002-0598-1608" 20 | - family-names: "Ravi" 21 | given-names: "Sahithya" 22 | - family-names: "Müller" 23 | given-names: "Andreas" 24 | orcid: "https://orcid.org/0000-0002-2349-9428" 25 | - family-names: "Vanschoren" 26 | given-names: "Joaquin" 27 | orcid: "https://orcid.org/0000-0001-7044-9805" 28 | - family-names: "Hutter" 29 | given-names: "Frank" 30 | orcid: "https://orcid.org/0000-0002-2037-3694" 31 | journal: "Journal of Machine Learning Research" 32 | title: "OpenML-Python: an extensible Python API for OpenML" 33 | abstract: "OpenML is an online platform for open science collaboration in machine learning, used to share datasets and results of machine learning experiments. In this paper, we introduce OpenML-Python, a client API for Python, which opens up the OpenML platform for a wide range of Python-based machine learning tools. It provides easy access to all datasets, tasks and experiments on OpenML from within Python. It also provides functionality to conduct machine learning experiments, upload the results to OpenML, and reproduce results which are stored on OpenML. Furthermore, it comes with a scikit-learn extension and an extension mechanism to easily integrate other machine learning libraries written in Python into the OpenML ecosystem. Source code and documentation are available at https://github.com/openml/openml-python/." 34 | volume: 22 35 | year: 2021 36 | start: 1 37 | end: 5 38 | pages: 5 39 | number: 100 40 | url: https://jmlr.org/papers/v22/19-920.html 41 | -------------------------------------------------------------------------------- /openml/exceptions.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | from __future__ import annotations 3 | 4 | 5 | class PyOpenMLError(Exception): 6 | """Base class for all exceptions in OpenML-Python.""" 7 | 8 | def __init__(self, message: str): 9 | self.message = message 10 | super().__init__(message) 11 | 12 | 13 | class OpenMLServerError(PyOpenMLError): 14 | """class for when something is really wrong on the server 15 | (result did not parse to dict), contains unparsed error. 16 | """ 17 | 18 | 19 | class OpenMLServerException(OpenMLServerError): # noqa: N818 20 | """exception for when the result of the server was 21 | not 200 (e.g., listing call w/o results). 
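Carries the server-assigned error ``code`` and the ``url`` of the failed
call, when available.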
22 | """ 23 | 24 | # Code needs to be optional to allow the exception to be picklable: 25 | # https://stackoverflow.com/questions/16244923/how-to-make-a-custom-exception-class-with-multiple-init-args-pickleable # noqa: E501 26 | def __init__(self, message: str, code: int | None = None, url: str | None = None): 27 | self.message = message 28 | self.code = code 29 | self.url = url 30 | super().__init__(message) 31 | 32 | def __str__(self) -> str: 33 | return f"{self.url} returned code {self.code}: {self.message}" 34 | 35 | 36 | class OpenMLServerNoResult(OpenMLServerException): 37 | """Exception for when the result of the server is empty.""" 38 | 39 | 40 | class OpenMLCacheException(PyOpenMLError): # noqa: N818 41 | """Dataset / task etc not found in cache""" 42 | 43 | 44 | class OpenMLHashException(PyOpenMLError): # noqa: N818 45 | """Locally computed hash is different than hash announced by the server.""" 46 | 47 | 48 | class OpenMLPrivateDatasetError(PyOpenMLError): 49 | """Exception thrown when the user has no rights to access the dataset.""" 50 | 51 | 52 | class OpenMLRunsExistError(PyOpenMLError): 53 | """Indicates run(s) already exists on the server when they should not be duplicated.""" 54 | 55 | def __init__(self, run_ids: set[int], message: str) -> None: 56 | if len(run_ids) < 1: 57 | raise ValueError("Set of run ids must be non-empty.") 58 | self.run_ids = run_ids 59 | super().__init__(message) 60 | 61 | 62 | class OpenMLNotAuthorizedError(OpenMLServerError): 63 | """Indicates an authenticated user is not authorized to execute the requested action.""" 64 | 65 | 66 | class ObjectNotPublishedError(PyOpenMLError): 67 | """Indicates an object has not been published yet.""" 68 | -------------------------------------------------------------------------------- /examples/Advanced/configure_logging.py: -------------------------------------------------------------------------------- 1 | # %% [markdown] 2 | # This tutorial explains openml-python logging, and shows how to configure it. 3 | # Openml-python uses the [Python logging module](https://docs.python.org/3/library/logging.html) 4 | # to provide users with log messages. Each log message is assigned a level of importance, see 5 | # the table in Python's logging tutorial 6 | # [here](https://docs.python.org/3/howto/logging.html#when-to-use-logging). 7 | # 8 | # By default, openml-python will print log messages of level `WARNING` and above to console. 9 | # All log messages (including `DEBUG` and `INFO`) are also saved in a file, which can be 10 | # found in your cache directory (see also the 11 | # [introduction tutorial](../Basics/introduction_tutorial). 12 | # These file logs are automatically deleted if needed, and use at most 2MB of space. 13 | # 14 | # It is possible to configure what log levels to send to console and file. 15 | # When downloading a dataset from OpenML, a `DEBUG`-level message is written: 16 | 17 | # %% 18 | import openml 19 | 20 | openml.datasets.get_dataset("iris", version=1) 21 | 22 | # %% [markdown] 23 | # With default configuration, the above example will show no output to console. 24 | # However, in your cache directory you should find a file named 'openml_python.log', 25 | # which has a DEBUG message written to it. It should be either like 26 | # "[DEBUG] [10:46:19:openml.datasets.dataset] Saved dataset 61: iris to file ..." 27 | # or like 28 | # "[DEBUG] [10:49:38:openml.datasets.dataset] Data pickle file already exists and is up to date." 29 | # , depending on whether or not you had downloaded iris before. 
30 | # The log levels that are processed can be configured programmatically: 31 | 32 | # %% 33 | import logging 34 | 35 | openml.config.set_console_log_level(logging.DEBUG) 36 | openml.config.set_file_log_level(logging.WARNING) 37 | openml.datasets.get_dataset("iris", version=1) 38 | 39 | # %% [markdown] 40 | # Now the log message that was previously only written to file should also be shown in the console. 41 | # The message is no longer written to the log file, as the file log level was set to `WARNING`. 42 | # 43 | # It is also possible to specify the desired log levels through the configuration file. 44 | # This way you will not need to set them in each script separately. 45 | # Add the line **verbosity = NUMBER** and/or **file_verbosity = NUMBER** to the config file, 46 | # where 'NUMBER' should be one of: 47 | # 48 | # * 0: `logging.WARNING` and up. 49 | # * 1: `logging.INFO` and up. 50 | # * 2: `logging.DEBUG` and up (i.e. all messages). 51 | -------------------------------------------------------------------------------- /examples/Advanced/suites_tutorial.py: -------------------------------------------------------------------------------- 1 | # %% [markdown] 2 | # How to list, download and upload benchmark suites. 3 | 4 | # %% 5 | import uuid 6 | 7 | import numpy as np 8 | 9 | import openml 10 | 11 | # %% [markdown] 12 | # ## Listing suites 13 | # 14 | # * ``list_suites`` returns the available suites as a ``pandas.DataFrame``, 15 | # an easy-to-work-with data structure. 16 | # * Passing ``status="all"`` lists suites of any status, not only active ones. 17 | 18 | # %% 19 | suites = openml.study.list_suites(status="all") 20 | print(suites.head(n=10)) 21 | 22 | # %% [markdown] 23 | # ## Downloading suites 24 | # This is done based on the suite ID. 25 | 26 | # %% 27 | suite = openml.study.get_suite(99) 28 | print(suite) 29 | 30 | # %% [markdown] 31 | # Suites also feature a description: 32 | 33 | # %% 34 | print(suite.description) 35 | 36 | # %% [markdown] 37 | # Suites are a container for tasks: 38 | 39 | # %% 40 | print(suite.tasks) 41 | 42 | # %% [markdown] 43 | # And we can use the task listing functionality to learn more about them: 44 | 45 | # %% 46 | tasks = openml.tasks.list_tasks() 47 | 48 | # %% [markdown] 49 | # Using ``@`` in 50 | # [pd.DataFrame.query](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html) 51 | # accesses variables outside of the current dataframe. 52 | 53 | # %% 54 | tasks = tasks.query("tid in @suite.tasks") 55 | print(tasks.describe().transpose()) 56 | 57 | # %% [markdown] 58 | # We'll use the test server for the rest of this tutorial. 59 | 60 | # %% 61 | openml.config.start_using_configuration_for_example() 62 | 63 | # %% [markdown] 64 | # ## Uploading suites 65 | # 66 | # Uploading suites is as simple as uploading any other kind of OpenML 67 | # entity; the only reason we need this much code in this example is 68 | # that we upload some random data. 69 | 70 | # We'll take a random subset of 20 tasks from all available tasks on 71 | # the test server: 72 | 73 | # %% 74 | all_tasks = list(openml.tasks.list_tasks()["tid"]) 75 | task_ids_for_suite = sorted(np.random.choice(all_tasks, replace=False, size=20)) 76 | 77 | # The study needs a machine-readable and unique alias. To obtain this, 78 | # we simply generate a random uuid.
79 | 80 | alias = uuid.uuid4().hex 81 | 82 | new_suite = openml.study.create_benchmark_suite( 83 | name="Test-Suite", 84 | description="Test suite for the Python tutorial on benchmark suites", 85 | task_ids=task_ids_for_suite, 86 | alias=alias, 87 | ) 88 | new_suite.publish() 89 | print(new_suite) 90 | 91 | # %% 92 | openml.config.stop_using_configuration_for_example() 93 | -------------------------------------------------------------------------------- /tests/test_tasks/test_clustering_task.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | from __future__ import annotations 3 | 4 | import pytest 5 | 6 | import openml 7 | from openml.exceptions import OpenMLServerException 8 | from openml.tasks import TaskType 9 | from openml.testing import TestBase 10 | 11 | from .test_task import OpenMLTaskTest 12 | 13 | 14 | class OpenMLClusteringTaskTest(OpenMLTaskTest): 15 | __test__ = True 16 | 17 | def setUp(self, n_levels: int = 1): 18 | super().setUp() 19 | self.task_id = 146714 20 | self.task_type = TaskType.CLUSTERING 21 | self.estimation_procedure = 17 22 | 23 | @pytest.mark.production() 24 | def test_get_dataset(self): 25 | # no clustering tasks on test server 26 | self.use_production_server() 27 | task = openml.tasks.get_task(self.task_id) 28 | task.get_dataset() 29 | 30 | @pytest.mark.production() 31 | def test_download_task(self): 32 | # no clustering tasks on test server 33 | self.use_production_server() 34 | task = super().test_download_task() 35 | assert task.task_id == self.task_id 36 | assert task.task_type_id == TaskType.CLUSTERING 37 | assert task.dataset_id == 36 38 | 39 | def test_upload_task(self): 40 | compatible_datasets = self._get_compatible_rand_dataset() 41 | for i in range(100): 42 | try: 43 | dataset_id = compatible_datasets[i % len(compatible_datasets)] 44 | # Upload a clustering task without a ground truth. 45 | task = openml.tasks.create_task( 46 | task_type=self.task_type, 47 | dataset_id=dataset_id, 48 | estimation_procedure_id=self.estimation_procedure, 49 | ) 50 | task = task.publish() 51 | TestBase._mark_entity_for_removal("task", task.id) 52 | TestBase.logger.info( 53 | f"collected from {__file__.split('/')[-1]}: {task.id}", 54 | ) 55 | # success 56 | break 57 | except OpenMLServerException as e: 58 | # Error code for 'task already exists' 59 | # Should be 533 according to the docs 60 | # (# https://www.openml.org/api_docs#!/task/post_task) 61 | if e.code == 614: 62 | continue 63 | else: 64 | raise e 65 | else: 66 | raise ValueError( 67 | f"Could not create a valid task for task type ID {self.task_type}", 68 | ) 69 | -------------------------------------------------------------------------------- /examples/_external_or_deprecated/plot_svm_hyperparameters_tutorial.py: -------------------------------------------------------------------------------- 1 | # %% [markdown] 2 | # # Plotting hyperparameter surfaces 3 | 4 | # %% 5 | import openml 6 | import numpy as np 7 | 8 | # %% [markdown] 9 | # # First step - obtaining the data 10 | # First, we need to choose an SVM flow, for example 8353, and a task. Finding their IDs is 11 | # not part of this tutorial; this could, for example, be done via the website. 12 | # 13 | # For this we use the function ``list_evaluations_setups``, which can automatically join 14 | # evaluations conducted by the server with the hyperparameter settings extracted from the 15 | # uploaded runs (called *setup*).
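# A *setup* is a flow combined with a concrete assignment of its hyperparameters.
# As a side note, a single setup can also be inspected directly. A minimal sketch
# (the ``setup_id`` below is a placeholder, not an ID used in this tutorial):
#
#     setup = openml.setups.get_setup(setup_id)
#     print(setup.parameters)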
16 | 17 | # %% 18 | df = openml.evaluations.list_evaluations_setups( 19 | function="predictive_accuracy", 20 | flows=[8353], 21 | tasks=[6], 22 | # Using this flag incorporates the hyperparameters into the returned dataframe. Otherwise, 23 | # the dataframe would contain a field ``paramaters`` containing an unparsed dictionary. 24 | parameters_in_separate_columns=True, 25 | ) 26 | print(df.head(n=10)) 27 | 28 | # %% [markdown] 29 | # We can see all the hyperparameter names in the columns of the dataframe: 30 | 31 | # %% 32 | for name in df.columns: 33 | print(name) 34 | 35 | # %% [markdown] 36 | # Next, we cast and transform the hyperparameters of interest (``C`` and ``gamma``) so that we 37 | # can nicely plot them. 38 | 39 | # %% 40 | hyperparameters = ["sklearn.svm.classes.SVC(16)_C", "sklearn.svm.classes.SVC(16)_gamma"] 41 | df[hyperparameters] = df[hyperparameters].astype(float).apply(np.log10) 42 | 43 | # %% [markdown] 44 | # ## Option 1 - plotting via the pandas helper functions 45 | 46 | # %% 47 | df.plot.hexbin( 48 | x="sklearn.svm.classes.SVC(16)_C", 49 | y="sklearn.svm.classes.SVC(16)_gamma", 50 | C="value", 51 | reduce_C_function=np.mean, 52 | gridsize=25, 53 | title="SVM performance landscape", 54 | ) 55 | 56 | # %% [markdown] 57 | # ## Option 2 - plotting via matplotlib 58 | 59 | # %% 60 | import matplotlib.pyplot as plt 61 | 62 | fig, ax = plt.subplots() 63 | 64 | C = df["sklearn.svm.classes.SVC(16)_C"] 65 | gamma = df["sklearn.svm.classes.SVC(16)_gamma"] 66 | score = df["value"] 67 | 68 | # Plotting all evaluations: 69 | ax.plot(C, gamma, "ko", ms=1) 70 | # Create a contour plot 71 | cntr = ax.tricontourf(C, gamma, score, levels=12, cmap="RdBu_r") 72 | # Adjusting the colorbar 73 | fig.colorbar(cntr, ax=ax, label="accuracy") 74 | # Adjusting the axis limits 75 | ax.set( 76 | xlim=(min(C), max(C)), 77 | ylim=(min(gamma), max(gamma)), 78 | xlabel="C (log10)", 79 | ylabel="gamma (log10)", 80 | ) 81 | ax.set_title("SVM performance landscape") 82 | # License: BSD 3-Clause 83 | -------------------------------------------------------------------------------- /tests/test_tasks/test_regression_task.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | from __future__ import annotations 3 | 4 | import ast 5 | 6 | import pandas as pd 7 | 8 | import openml 9 | from openml.exceptions import OpenMLServerException 10 | from openml.tasks import TaskType 11 | from openml.testing import TestBase, check_task_existence 12 | 13 | from .test_supervised_task import OpenMLSupervisedTaskTest 14 | 15 | 16 | class OpenMLRegressionTaskTest(OpenMLSupervisedTaskTest): 17 | __test__ = True 18 | 19 | def setUp(self, n_levels: int = 1): 20 | super().setUp() 21 | self.estimation_procedure = 9 22 | task_meta_data = { 23 | "task_type": TaskType.SUPERVISED_REGRESSION, 24 | "dataset_id": 105, # wisconsin 25 | "estimation_procedure_id": self.estimation_procedure, # non default value to test estimation procedure id 26 | "target_name": "time", 27 | } 28 | _task_id = check_task_existence(**task_meta_data) 29 | if _task_id is not None: 30 | task_id = _task_id 31 | else: 32 | new_task = openml.tasks.create_task(**task_meta_data) 33 | # publishes the new task 34 | try: 35 | new_task = new_task.publish() 36 | task_id = new_task.task_id 37 | # mark to remove the uploaded task 38 | TestBase._mark_entity_for_removal("task", task_id) 39 | TestBase.logger.info(f"collected from test_run_functions: {task_id}") 40 | except OpenMLServerException as e: 41 | if e.code 
== 614: # Task already exists 42 | # the exception message contains the task_id that was matched in the format 43 | # 'Task already exists. - matched id(s): [xxxx]' 44 | task_id = ast.literal_eval(e.message.split("matched id(s):")[-1].strip())[0] 45 | else: 46 | raise Exception(repr(e)) 47 | self.task_id = task_id 48 | self.task_type = TaskType.SUPERVISED_REGRESSION 49 | 50 | 51 | def test_get_X_and_Y(self): 52 | X, Y = super().test_get_X_and_Y() 53 | assert X.shape == (194, 32) 54 | assert isinstance(X, pd.DataFrame) 55 | assert Y.shape == (194,) 56 | assert isinstance(Y, pd.Series) 57 | assert pd.api.types.is_numeric_dtype(Y) 58 | 59 | def test_download_task(self): 60 | task = super().test_download_task() 61 | assert task.task_id == self.task_id 62 | assert task.task_type_id == TaskType.SUPERVISED_REGRESSION 63 | assert task.dataset_id == 105 64 | assert task.estimation_procedure_id == self.estimation_procedure 65 | -------------------------------------------------------------------------------- /tests/files/org/openml/test/datasets/-1/qualities.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | DefaultAccuracy 5 | 0.5 6 | 7 | 8 | Dimensionality 9 | 33.335 10 | 11 | 12 | MajorityClassPercentage 13 | 50.0 14 | 15 | 16 | MajorityClassSize 17 | 300.0 18 | 19 | 20 | MinorityClassPerentage 21 | 50.0 22 | 23 | 24 | MinorityClassSize 25 | 300.0 26 | 27 | 28 | NumberOfBinaryFeatures 29 | 1.0 30 | 31 | 32 | NumberOfClasses 33 | 2.0 34 | 35 | 36 | NumberOfFeatures 37 | 20001.0 38 | 39 | 40 | NumberOfInstances 41 | 600.0 42 | 43 | 44 | NumberOfInstancesWithMissingValues 45 | 0.0 46 | 47 | 48 | NumberOfMissingValues 49 | 0.0 50 | 51 | 52 | NumberOfNumericFeatures 53 | 20000.0 54 | 55 | 56 | NumberOfSymbolicFeatures 57 | 1.0 58 | 59 | 60 | PercentageOfBinaryFeatures 61 | 0.004999750012499375 62 | 63 | 64 | PercentageOfInstancesWithMissingValues 65 | 0.0 66 | 67 | 68 | PercentageOfMissingValues 69 | 0.0 70 | 71 | 72 | PercentageOfNumericFeatures 73 | 99.9950002499875 74 | 75 | 76 | PercentageOfSymbolicFeatures 77 | 0.004999750012499375 78 | 79 | 80 | 81 | -------------------------------------------------------------------------------- /tests/files/mock_responses/datasets/data_description_61.xml: -------------------------------------------------------------------------------- 1 | 2 | 61 3 | iris 4 | 1 5 | **Author**: R.A. Fisher 6 | **Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/Iris) - 1936 - Donated by Michael Marshall 7 | **Please cite**: 8 | 9 | **Iris Plants Database** 10 | This is perhaps the best known database to be found in the pattern recognition literature. Fisher's paper is a classic in the field and is referenced frequently to this day. (See Duda & Hart, for example.) The data set contains 3 classes of 50 instances each, where each class refers to a type of iris plant. One class is linearly separable from the other 2; the latter are NOT linearly separable from each other. 11 | 12 | Predicted attribute: class of iris plant. 13 | This is an exceedingly simple domain. 14 | 15 | ### Attribute Information: 16 | 1. sepal length in cm 17 | 2. sepal width in cm 18 | 3. petal length in cm 19 | 4. petal width in cm 20 | 5. class: 21 | -- Iris Setosa 22 | -- Iris Versicolour 23 | -- Iris Virginica 24 | 4 25 | ARFF 26 | R.A. 
Fisher 1936 2014-04-06T23:23:39 27 | English Public https://api.openml.org/data/v1/download/61/iris.arff 28 | https://data.openml.org/datasets/0000/0061/dataset_61.pq 61 class 1 https://archive.ics.uci.edu/ml/citation_policy.html BotanyEcologyKaggleMachine Learningstudy_1study_25study_4study_41study_50study_52study_7study_86study_88study_89uci public https://archive.ics.uci.edu/ml/datasets/Iris http://digital.library.adelaide.edu.au/dspace/handle/2440/15227 https://data.openml.org/datasets/0000/0061/dataset_61.pq active 29 | 2020-11-20 19:02:18 ad484452702105cbf3d30f8deaba39a9 30 | 31 | -------------------------------------------------------------------------------- /openml/datasets/data_feature.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | from __future__ import annotations 3 | 4 | from typing import TYPE_CHECKING, Any, ClassVar, Sequence 5 | 6 | if TYPE_CHECKING: 7 | from IPython.lib import pretty 8 | 9 | 10 | class OpenMLDataFeature: 11 | """ 12 | Data Feature (a.k.a. Attribute) object. 13 | 14 | Parameters 15 | ---------- 16 | index : int 17 | The index of this feature 18 | name : str 19 | Name of the feature 20 | data_type : str 21 | can be nominal, numeric, string, date (corresponds to arff) 22 | nominal_values : list(str) 23 | list of the possible values, in case of nominal attribute 24 | number_missing_values : int 25 | Number of rows that have a missing value for this feature. 26 | ontologies : list(str) 27 | list of ontologies attached to this feature. An ontology describes the 28 | concept that are described in a feature. An ontology is defined by an 29 | URL where the information is provided. 30 | """ 31 | 32 | LEGAL_DATA_TYPES: ClassVar[Sequence[str]] = ["nominal", "numeric", "string", "date"] 33 | 34 | def __init__( # noqa: PLR0913 35 | self, 36 | index: int, 37 | name: str, 38 | data_type: str, 39 | nominal_values: list[str], 40 | number_missing_values: int, 41 | ontologies: list[str] | None = None, 42 | ): 43 | if not isinstance(index, int): 44 | raise TypeError(f"Index must be `int` but is {type(index)}") 45 | 46 | if data_type not in self.LEGAL_DATA_TYPES: 47 | raise ValueError( 48 | f"data type should be in {self.LEGAL_DATA_TYPES!s}, found: {data_type}", 49 | ) 50 | 51 | if data_type == "nominal": 52 | if nominal_values is None: 53 | raise TypeError( 54 | "Dataset features require attribute `nominal_values` for nominal " 55 | "feature type.", 56 | ) 57 | 58 | if not isinstance(nominal_values, list): 59 | raise TypeError( 60 | "Argument `nominal_values` is of wrong datatype, should be list, " 61 | f"but is {type(nominal_values)}", 62 | ) 63 | elif nominal_values is not None: 64 | raise TypeError("Argument `nominal_values` must be None for non-nominal feature.") 65 | 66 | if not isinstance(number_missing_values, int): 67 | msg = f"number_missing_values must be int but is {type(number_missing_values)}" 68 | raise TypeError(msg) 69 | 70 | self.index = index 71 | self.name = str(name) 72 | self.data_type = str(data_type) 73 | self.nominal_values = nominal_values 74 | self.number_missing_values = number_missing_values 75 | self.ontologies = ontologies 76 | 77 | def __repr__(self) -> str: 78 | return "[%d - %s (%s)]" % (self.index, self.name, self.data_type) 79 | 80 | def __eq__(self, other: Any) -> bool: 81 | return isinstance(other, OpenMLDataFeature) and self.__dict__ == other.__dict__ 82 | 83 | def _repr_pretty_(self, pp: pretty.PrettyPrinter, cycle: bool) -> None: # noqa: FBT001, ARG002 84 | 
pp.text(str(self)) 85 | -------------------------------------------------------------------------------- /tests/test_extensions/test_functions.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | from __future__ import annotations 3 | 4 | import inspect 5 | 6 | import pytest 7 | 8 | import openml.testing 9 | from openml.extensions import get_extension_by_flow, get_extension_by_model, register_extension 10 | 11 | 12 | class DummyFlow: 13 | external_version = "DummyFlow==0.1" 14 | name = "Dummy Flow" 15 | flow_id = 1 16 | dependencies = None 17 | 18 | 19 | class DummyModel: 20 | pass 21 | 22 | 23 | class DummyExtension1: 24 | @staticmethod 25 | def can_handle_flow(flow): 26 | return inspect.stack()[2].filename.endswith("test_functions.py") 27 | 28 | @staticmethod 29 | def can_handle_model(model): 30 | return inspect.stack()[2].filename.endswith("test_functions.py") 31 | 32 | 33 | class DummyExtension2: 34 | @staticmethod 35 | def can_handle_flow(flow): 36 | return False 37 | 38 | @staticmethod 39 | def can_handle_model(model): 40 | return False 41 | 42 | 43 | def _unregister(): 44 | # "Un-register" the test extensions 45 | while True: 46 | rem_dum_ext1 = False 47 | rem_dum_ext2 = False 48 | try: 49 | openml.extensions.extensions.remove(DummyExtension1) 50 | rem_dum_ext1 = True 51 | except ValueError: 52 | pass 53 | try: 54 | openml.extensions.extensions.remove(DummyExtension2) 55 | rem_dum_ext2 = True 56 | except ValueError: 57 | pass 58 | if not rem_dum_ext1 and not rem_dum_ext2: 59 | break 60 | 61 | 62 | class TestInit(openml.testing.TestBase): 63 | def setUp(self): 64 | super().setUp() 65 | _unregister() 66 | 67 | def test_get_extension_by_flow(self): 68 | assert get_extension_by_flow(DummyFlow()) is None 69 | with pytest.raises(ValueError, match="No extension registered which can handle flow:"): 70 | get_extension_by_flow(DummyFlow(), raise_if_no_extension=True) 71 | register_extension(DummyExtension1) 72 | assert isinstance(get_extension_by_flow(DummyFlow()), DummyExtension1) 73 | register_extension(DummyExtension2) 74 | assert isinstance(get_extension_by_flow(DummyFlow()), DummyExtension1) 75 | register_extension(DummyExtension1) 76 | with pytest.raises( 77 | ValueError, match="Multiple extensions registered which can handle flow:" 78 | ): 79 | get_extension_by_flow(DummyFlow()) 80 | 81 | def test_get_extension_by_model(self): 82 | assert get_extension_by_model(DummyModel()) is None 83 | with pytest.raises(ValueError, match="No extension registered which can handle model:"): 84 | get_extension_by_model(DummyModel(), raise_if_no_extension=True) 85 | register_extension(DummyExtension1) 86 | assert isinstance(get_extension_by_model(DummyModel()), DummyExtension1) 87 | register_extension(DummyExtension2) 88 | assert isinstance(get_extension_by_model(DummyModel()), DummyExtension1) 89 | register_extension(DummyExtension1) 90 | with pytest.raises( 91 | ValueError, match="Multiple extensions registered which can handle model:" 92 | ): 93 | get_extension_by_model(DummyModel()) 94 | -------------------------------------------------------------------------------- /examples/_external_or_deprecated/flow_id_tutorial.py: -------------------------------------------------------------------------------- 1 | # %% [markdown] 2 | # # Obtaining Flow IDs 3 | # This tutorial discusses different ways to obtain the ID of a flow in order to perform further 4 | # analysis. 
5 | 6 | 7 | # %% 8 | import sklearn.tree 9 | 10 | import openml 11 | 12 | 13 | # %% [markdown] 14 | # .. warning:: 15 | # .. include:: ../../test_server_usage_warning.txt 16 | 17 | # %% 18 | openml.config.start_using_configuration_for_example() 19 | openml.config.server = "https://api.openml.org/api/v1/xml" 20 | 21 | # %% 22 | # Defining a classifier 23 | clf = sklearn.tree.DecisionTreeClassifier() 24 | 25 | # %% [markdown] 26 | # ## 1. Obtaining a flow given a classifier 27 | 28 | # %% 29 | flow = openml.extensions.get_extension_by_model(clf).model_to_flow(clf).publish() 30 | flow_id = flow.flow_id 31 | print(flow_id) 32 | 33 | # %% [markdown] 34 | # This piece of code is rather involved. First, it retrieves a 35 | # :class:`~openml.extensions.Extension` which is registered and can handle the given model, 36 | # in our case it is :class:`openml.extensions.sklearn.SklearnExtension`. Second, the extension 37 | # converts the classifier into an instance of :class:`openml.OpenMLFlow`. Third and finally, 38 | # the publish method checks whether the current flow is already present on OpenML. If not, 39 | # it uploads the flow, otherwise, it updates the current instance with all information computed 40 | # by the server (which is obviously also done when uploading/publishing a flow). 41 | # 42 | # To simplify the usage we have created a helper function which automates all these steps: 43 | 44 | # %% 45 | flow_id = openml.flows.get_flow_id(model=clf) 46 | print(flow_id) 47 | 48 | # %% [markdown] 49 | # ## 2. Obtaining a flow given its name 50 | # The schema of a flow is given in XSD ( 51 | # [here](https://github.com/openml/OpenML/blob/master/openml_OS/views/pages/api_new/v1/xsd/openml.implementation.upload.xsd)). # noqa E501 52 | # Only two fields are required, a unique name, and an external version. While it should be pretty 53 | # obvious why we need a name, the need for the additional external version information might not 54 | # be immediately clear. However, this information is very important as it allows to have multiple 55 | # flows with the same name for different versions of a software. This might be necessary if an 56 | # algorithm or implementation introduces, renames or drop hyperparameters over time. 57 | 58 | # %% 59 | print(flow.name, flow.external_version) 60 | 61 | # %% [markdown] 62 | # The name and external version are automatically added to a flow when constructing it from a 63 | # model. We can then use them to retrieve the flow id as follows: 64 | 65 | # %% 66 | flow_id = openml.flows.flow_exists(name=flow.name, external_version=flow.external_version) 67 | print(flow_id) 68 | 69 | # %% [markdown] 70 | # We can also retrieve all flows for a given name: 71 | 72 | # %% 73 | flow_ids = openml.flows.get_flow_id(name=flow.name) 74 | print(flow_ids) 75 | 76 | # %% [markdown] 77 | # This also works with the actual model (generalizing the first part of this example): 78 | 79 | # %% 80 | flow_ids = openml.flows.get_flow_id(model=clf, exact_version=False) 81 | print(flow_ids) 82 | 83 | # %% 84 | # Deactivating test configuration 85 | openml.config.stop_using_configuration_for_example() 86 | # License: BSD 3-Clause 87 | -------------------------------------------------------------------------------- /docker/startup.sh: -------------------------------------------------------------------------------- 1 | # Entry script to switch between the different Docker functionalities. 
2 | # By default, execute Python with OpenML pre-installed 3 | # 4 | # Entry script to allow docker to be ran for bash, tests and docs. 5 | # The script assumes a code repository can be mounted to ``/code`` and an output directory to ``/output``. 6 | # Executes ``mode`` on ``branch`` or the provided ``code`` directory. 7 | # $1: Mode, optional. Options: 8 | # - test: execute unit tests 9 | # - doc: build documentation, requires a mounted ``output`` directory if built from a branch. 10 | # - if not provided: execute bash. 11 | # $2: Branch, optional. 12 | # Mutually exclusive with mounting a ``code`` directory. 13 | # Can be a branch on a Github fork, specified with the USERNAME#BRANCH format. 14 | # The test or doc build is executed on this branch. 15 | 16 | if [[ ! ( $1 = "doc" || $1 = "test" ) ]]; then 17 | cd openml 18 | source venv/bin/activate 19 | python "$@" 20 | exit 0 21 | fi 22 | 23 | # doc and test modes require mounted directories and/or specified branches 24 | if ! [ -d "/code" ] && [ -z "$2" ]; then 25 | echo "To perform $1 a code repository must be mounted to '/code' or a branch must be specified." >> /dev/stderr 26 | exit 1 27 | fi 28 | if [ -d "/code" ] && [ -n "$2" ]; then 29 | # We want to avoid switching the git environment from within the docker container 30 | echo "You can not specify a branch for a mounted code repository." >> /dev/stderr 31 | exit 1 32 | fi 33 | if [ "$1" == "doc" ] && [ -n "$2" ] && ! [ -d "/output" ]; then 34 | echo "To build docs from an online repository, you need to mount an output directory." >> /dev/stderr 35 | exit 1 36 | fi 37 | 38 | if [ -n "$2" ]; then 39 | # if a branch is provided, we will pull it into the `openml` local repository that was created with the image. 40 | cd openml 41 | if [[ $2 == *#* ]]; then 42 | # If a branch is specified on a fork (with NAME#BRANCH format), we have to construct the url before pulling 43 | # We add a trailing '#' delimiter so the second element doesn't get the trailing newline from <<< 44 | readarray -d '#' -t fork_name_and_branch<<<"$2#" 45 | fork_url="https://github.com/${fork_name_and_branch[0]}/openml-python.git" 46 | fork_branch="${fork_name_and_branch[1]}" 47 | echo git fetch "$fork_url" "$fork_branch":branch_from_fork 48 | git fetch "$fork_url" "$fork_branch":branch_from_fork 49 | branch=branch_from_fork 50 | else 51 | git fetch origin "$2" 52 | branch=$2 53 | fi 54 | if ! git checkout "$branch" ; then 55 | echo "Could not checkout $branch. If the branch lives on a fork, specify it as USER#BRANCH. Make sure to push the branch." >> /dev/stderr 56 | exit 1 57 | fi 58 | git pull 59 | code_dir="/openml" 60 | else 61 | code_dir="/code" 62 | fi 63 | 64 | source /openml/venv/bin/activate 65 | cd $code_dir 66 | # The most recent ``main`` is already installed, but we want to update any outdated dependencies 67 | pip install -e .[test,examples,docs,examples_unix] 68 | 69 | if [ "$1" == "test" ]; then 70 | pytest -n 4 --durations=20 --timeout=600 --timeout-method=thread --dist load -sv 71 | fi 72 | 73 | if [ "$1" == "doc" ]; then 74 | cd doc 75 | make html 76 | make linkcheck 77 | if [ -d "/output" ]; then 78 | cp -r /openml/doc/build /output 79 | fi 80 | fi -------------------------------------------------------------------------------- /openml/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The OpenML module implements a python interface to 3 | `OpenML `_, a collaborative platform for machine 4 | learning. 
OpenML can be used to 5 | 6 | * store, download and analyze datasets 7 | * make experiments and their results (e.g. models, predictions) 8 | accesible and reproducible for everybody 9 | * analyze experiments (uploaded by you and other collaborators) and conduct 10 | meta studies 11 | 12 | In particular, this module implements a python interface for the 13 | `OpenML REST API `_ 14 | (`REST on wikipedia 15 | `_). 16 | """ 17 | 18 | # License: BSD 3-Clause 19 | from __future__ import annotations 20 | 21 | from . import ( 22 | _api_calls, 23 | config, 24 | datasets, 25 | evaluations, 26 | exceptions, 27 | extensions, 28 | flows, 29 | runs, 30 | setups, 31 | study, 32 | tasks, 33 | utils, 34 | ) 35 | from .__version__ import __version__ 36 | from .datasets import OpenMLDataFeature, OpenMLDataset 37 | from .evaluations import OpenMLEvaluation 38 | from .flows import OpenMLFlow 39 | from .runs import OpenMLRun 40 | from .setups import OpenMLParameter, OpenMLSetup 41 | from .study import OpenMLBenchmarkSuite, OpenMLStudy 42 | from .tasks import ( 43 | OpenMLClassificationTask, 44 | OpenMLClusteringTask, 45 | OpenMLLearningCurveTask, 46 | OpenMLRegressionTask, 47 | OpenMLSplit, 48 | OpenMLSupervisedTask, 49 | OpenMLTask, 50 | ) 51 | 52 | 53 | def populate_cache( 54 | task_ids: list[int] | None = None, 55 | dataset_ids: list[int | str] | None = None, 56 | flow_ids: list[int] | None = None, 57 | run_ids: list[int] | None = None, 58 | ) -> None: 59 | """ 60 | Populate a cache for offline and parallel usage of the OpenML connector. 61 | 62 | Parameters 63 | ---------- 64 | task_ids : iterable 65 | 66 | dataset_ids : iterable 67 | 68 | flow_ids : iterable 69 | 70 | run_ids : iterable 71 | 72 | Returns 73 | ------- 74 | None 75 | """ 76 | if task_ids is not None: 77 | for task_id in task_ids: 78 | tasks.functions.get_task(task_id) 79 | 80 | if dataset_ids is not None: 81 | for dataset_id in dataset_ids: 82 | datasets.functions.get_dataset(dataset_id) 83 | 84 | if flow_ids is not None: 85 | for flow_id in flow_ids: 86 | flows.functions.get_flow(flow_id) 87 | 88 | if run_ids is not None: 89 | for run_id in run_ids: 90 | runs.functions.get_run(run_id) 91 | 92 | 93 | __all__ = [ 94 | "OpenMLDataset", 95 | "OpenMLDataFeature", 96 | "OpenMLRun", 97 | "OpenMLSplit", 98 | "OpenMLEvaluation", 99 | "OpenMLSetup", 100 | "OpenMLParameter", 101 | "OpenMLTask", 102 | "OpenMLSupervisedTask", 103 | "OpenMLClusteringTask", 104 | "OpenMLLearningCurveTask", 105 | "OpenMLRegressionTask", 106 | "OpenMLClassificationTask", 107 | "OpenMLFlow", 108 | "OpenMLStudy", 109 | "OpenMLBenchmarkSuite", 110 | "datasets", 111 | "evaluations", 112 | "exceptions", 113 | "extensions", 114 | "config", 115 | "runs", 116 | "flows", 117 | "tasks", 118 | "setups", 119 | "study", 120 | "utils", 121 | "_api_calls", 122 | "__version__", 123 | ] 124 | -------------------------------------------------------------------------------- /tests/test_runs/test_trace.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | from __future__ import annotations 3 | 4 | import pytest 5 | 6 | from openml.runs import OpenMLRunTrace, OpenMLTraceIteration 7 | from openml.testing import TestBase 8 | 9 | 10 | class TestTrace(TestBase): 11 | def test_get_selected_iteration(self): 12 | trace_iterations = {} 13 | for i in range(5): 14 | for j in range(5): 15 | for k in range(5): 16 | t = OpenMLTraceIteration( 17 | repeat=i, 18 | fold=j, 19 | iteration=5, 20 | setup_string="parameter_%d%d%d" % (i, j, k), 21 | 
evaluation=1.0 * i + 0.1 * j + 0.01 * k, 22 | selected=(i == j and i == k and i == 2), 23 | parameters=None, 24 | ) 25 | trace_iterations[(i, j, k)] = t 26 | 27 | trace = OpenMLRunTrace(-1, trace_iterations=trace_iterations) 28 | # This next one should simply not fail 29 | assert trace.get_selected_iteration(2, 2) == 2 30 | with pytest.raises( 31 | ValueError, match="Could not find the selected iteration for rep/fold 3/3" 32 | ): 33 | trace.get_selected_iteration(3, 3) 34 | 35 | def test_initialization(self): 36 | """Check all different ways to fail the initialization""" 37 | with pytest.raises(ValueError, match="Trace content not available."): 38 | OpenMLRunTrace.generate(attributes="foo", content=None) 39 | with pytest.raises(ValueError, match="Trace attributes not available."): 40 | OpenMLRunTrace.generate(attributes=None, content="foo") 41 | with pytest.raises(ValueError, match="Trace content is empty."): 42 | OpenMLRunTrace.generate(attributes="foo", content=[]) 43 | with pytest.raises(ValueError, match="Trace_attributes and trace_content not compatible:"): 44 | OpenMLRunTrace.generate(attributes=["abc"], content=[[1, 2]]) 45 | 46 | def test_duplicate_name(self): 47 | # Test that the user does not pass a parameter which has the same name 48 | # as one of the required trace attributes 49 | trace_attributes = [ 50 | ("repeat", "NUMERICAL"), 51 | ("fold", "NUMERICAL"), 52 | ("iteration", "NUMERICAL"), 53 | ("evaluation", "NUMERICAL"), 54 | ("selected", ["true", "false"]), 55 | ("repeat", "NUMERICAL"), 56 | ] 57 | trace_content = [[0, 0, 0, 0.5, "true", 1], [0, 0, 0, 0.9, "false", 2]] 58 | with pytest.raises( 59 | ValueError, 60 | match="Either `setup_string` or `parameters` needs to be passed as argument.", 61 | ): 62 | OpenMLRunTrace.generate(trace_attributes, trace_content) 63 | 64 | trace_attributes = [ 65 | ("repeat", "NUMERICAL"), 66 | ("fold", "NUMERICAL"), 67 | ("iteration", "NUMERICAL"), 68 | ("evaluation", "NUMERICAL"), 69 | ("selected", ["true", "false"]), 70 | ("sunshine", "NUMERICAL"), 71 | ] 72 | trace_content = [[0, 0, 0, 0.5, "true", 1], [0, 0, 0, 0.9, "false", 2]] 73 | with pytest.raises( 74 | ValueError, 75 | match="Encountered unknown attribute sunshine that does not start with " 76 | "prefix parameter_", 77 | ): 78 | OpenMLRunTrace.generate(trace_attributes, trace_content) 79 | -------------------------------------------------------------------------------- /docs/details.md: -------------------------------------------------------------------------------- 1 | # Advanced User Guide 2 | 3 | This document highlights some of the more advanced features of 4 | `openml-python`. 5 | 6 | ## Configuration 7 | 8 | The configuration file resides in a directory `.config/openml` in the 9 | home directory of the user and is called config (More specifically, it 10 | resides in the [configuration directory specified by the XDGB Base 11 | Directory 12 | Specification](https://specifications.freedesktop.org/basedir-spec/basedir-spec-latest.html)). 13 | It consists of `key = value` pairs which are separated by newlines. The 14 | following keys are defined: 15 | 16 | - apikey: required to access the server. 17 | - server: the server to connect to (default: `http://www.openml.org`). 18 | For connection to the test server, set this to `test.openml.org`. 19 | - cachedir: the root folder where the cache file directories should be created. 
20 | If not given, will default to `~/.openml/cache` 21 | - avoid_duplicate_runs: if set to `True` (default), when certain functions 22 | are called a lookup is performed to see if there already 23 | exists such a run on the server. If so, download those 24 | results instead. 25 | - retry_policy: Defines how to react when the server is unavailable or 26 | experiencing high load. It determines both how often to 27 | attempt to reconnect and how quickly to do so. Please don't 28 | use `human` in an automated script that you run more than 29 | one instance of, it might increase the time to complete your 30 | jobs and that of others. One of: 31 | - human (default): For people running openml in interactive 32 | fashion. Try only a few times, but in quick succession. 33 | - robot: For people using openml in an automated fashion. Keep 34 | trying to reconnect for a longer time, quickly increasing 35 | the time between retries. 36 | 37 | - connection_n_retries: number of times to retry a request if they fail. 38 | Default depends on retry_policy (5 for `human`, 50 for `robot`) 39 | - verbosity: the level of output: 40 | - 0: normal output 41 | - 1: info output 42 | - 2: debug output 43 | 44 | This file is easily configurable by the `openml` command line interface. 45 | To see where the file is stored, and what its values are, use openml 46 | configure none. 47 | 48 | ## Docker 49 | 50 | It is also possible to try out the latest development version of 51 | `openml-python` with docker: 52 | 53 | ``` bash 54 | docker run -it openml/openml-python 55 | ``` 56 | 57 | See the [openml-python docker 58 | documentation](https://github.com/openml/openml-python/blob/main/docker/readme.md) 59 | for more information. 60 | 61 | ## Key concepts 62 | 63 | OpenML contains several key concepts which it needs to make machine 64 | learning research shareable. A machine learning experiment consists of 65 | one or several **runs**, which describe the performance of an algorithm 66 | (called a **flow** in OpenML), its hyperparameter settings (called a 67 | **setup**) on a **task**. A **Task** is the combination of a 68 | **dataset**, a split and an evaluation metric. In this user guide we 69 | will go through listing and exploring existing **tasks** to actually 70 | running machine learning algorithms on them. In a further user guide we 71 | will examine how to search through **datasets** in order to curate a 72 | list of **tasks**. 73 | 74 | A further explanation is given in the [OpenML user 75 | guide](https://docs.openml.org/concepts/). 76 | 77 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2014-2019, Matthias Feurer, Jan van Rijn, Andreas Müller, 4 | Joaquin Vanschoren and others. 5 | All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright notice, this 11 | list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above copyright notice, 14 | this list of conditions and the following disclaimer in the documentation 15 | and/or other materials provided with the distribution. 
16 | 17 | * Neither the name of the copyright holder nor the names of its 18 | contributors may be used to endorse or promote products derived from 19 | this software without specific prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 25 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 27 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 28 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 29 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | 32 | License of the files CONTRIBUTING.md, ISSUE_TEMPLATE.md and 33 | PULL_REQUEST_TEMPLATE.md: 34 | 35 | Those files are modifications of the respecting templates in scikit-learn and 36 | they are licensed under a New BSD license: 37 | 38 | New BSD License 39 | 40 | Copyright (c) 2007–2018 The scikit-learn developers. 41 | All rights reserved. 42 | 43 | 44 | Redistribution and use in source and binary forms, with or without 45 | modification, are permitted provided that the following conditions are met: 46 | 47 | a. Redistributions of source code must retain the above copyright notice, 48 | this list of conditions and the following disclaimer. 49 | b. Redistributions in binary form must reproduce the above copyright 50 | notice, this list of conditions and the following disclaimer in the 51 | documentation and/or other materials provided with the distribution. 52 | c. Neither the name of the Scikit-learn Developers nor the names of 53 | its contributors may be used to endorse or promote products 54 | derived from this software without specific prior written 55 | permission. 56 | 57 | 58 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 59 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 60 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 61 | ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR 62 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 63 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 64 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 65 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 66 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 67 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 68 | DAMAGE. 
69 | -------------------------------------------------------------------------------- /tests/test_tasks/test_split.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | from __future__ import annotations 3 | 4 | import inspect 5 | import os 6 | from pathlib import Path 7 | 8 | import numpy as np 9 | 10 | from openml import OpenMLSplit 11 | from openml.testing import TestBase 12 | 13 | 14 | class OpenMLSplitTest(TestBase): 15 | # Splitting not helpful, these test's don't rely on the server and take less 16 | # than 5 seconds + rebuilding the test would potentially be costly 17 | 18 | def setUp(self): 19 | __file__ = inspect.getfile(OpenMLSplitTest) 20 | self.directory = os.path.dirname(__file__) 21 | # This is for dataset 22 | self.arff_filepath = ( 23 | Path(self.directory).parent 24 | / "files" 25 | / "org" 26 | / "openml" 27 | / "test" 28 | / "tasks" 29 | / "1882" 30 | / "datasplits.arff" 31 | ) 32 | self.pd_filename = self.arff_filepath.with_suffix(".pkl.py3") 33 | 34 | def tearDown(self): 35 | try: 36 | os.remove(self.pd_filename) 37 | except (OSError, FileNotFoundError): 38 | # Replaced bare except. Not sure why these exceptions are acceptable. 39 | pass 40 | 41 | def test_eq(self): 42 | split = OpenMLSplit._from_arff_file(self.arff_filepath) 43 | assert split == split 44 | 45 | split2 = OpenMLSplit._from_arff_file(self.arff_filepath) 46 | split2.name = "a" 47 | assert split != split2 48 | 49 | split2 = OpenMLSplit._from_arff_file(self.arff_filepath) 50 | split2.description = "a" 51 | assert split != split2 52 | 53 | split2 = OpenMLSplit._from_arff_file(self.arff_filepath) 54 | split2.split[10] = {} 55 | assert split != split2 56 | 57 | split2 = OpenMLSplit._from_arff_file(self.arff_filepath) 58 | split2.split[0][10] = {} 59 | assert split != split2 60 | 61 | def test_from_arff_file(self): 62 | split = OpenMLSplit._from_arff_file(self.arff_filepath) 63 | assert isinstance(split.split, dict) 64 | assert isinstance(split.split[0], dict) 65 | assert isinstance(split.split[0][0], dict) 66 | assert isinstance(split.split[0][0][0][0], np.ndarray) 67 | assert isinstance(split.split[0][0][0].train, np.ndarray) 68 | assert isinstance(split.split[0][0][0].train, np.ndarray) 69 | assert isinstance(split.split[0][0][0][1], np.ndarray) 70 | assert isinstance(split.split[0][0][0].test, np.ndarray) 71 | assert isinstance(split.split[0][0][0].test, np.ndarray) 72 | for i in range(10): 73 | for j in range(10): 74 | assert split.split[i][j][0].train.shape[0] >= 808 75 | assert split.split[i][j][0].test.shape[0] >= 89 76 | assert ( 77 | split.split[i][j][0].train.shape[0] + split.split[i][j][0].test.shape[0] == 898 78 | ) 79 | 80 | def test_get_split(self): 81 | split = OpenMLSplit._from_arff_file(self.arff_filepath) 82 | train_split, test_split = split.get(fold=5, repeat=2) 83 | assert train_split.shape[0] == 808 84 | assert test_split.shape[0] == 90 85 | self.assertRaisesRegex( 86 | ValueError, 87 | "Repeat 10 not known", 88 | split.get, 89 | 10, 90 | 2, 91 | ) 92 | self.assertRaisesRegex( 93 | ValueError, 94 | "Fold 10 not known", 95 | split.get, 96 | 2, 97 | 10, 98 | ) 99 | -------------------------------------------------------------------------------- /examples/Advanced/study_tutorial.py: -------------------------------------------------------------------------------- 1 | # %% [markdown] 2 | # How to list, download and upload benchmark studies. 
3 | # In contrast to 4 | # [benchmark suites](https://docs.openml.org/benchmark/#benchmarking-suites), which 5 | # hold a list of tasks, studies hold a list of runs. As runs contain all information on flows and 6 | # tasks, all required information about a study can be retrieved. 7 | 8 | # %% 9 | import uuid 10 | 11 | from sklearn.ensemble import RandomForestClassifier 12 | 13 | import openml 14 | 15 | # %% [markdown] 16 | # ## Listing studies 17 | # 18 | # * ``list_studies`` returns the available studies as a ``pandas.DataFrame``, 19 | # an easy-to-work-with data structure. 20 | # * Passing ``status="all"`` lists studies of any status, not only active ones. 21 | 22 | # %% 23 | studies = openml.study.list_studies(status="all") 24 | print(studies.head(n=10)) 25 | 26 | 27 | # %% [markdown] 28 | # ## Downloading studies 29 | # This is done based on the study ID. 30 | 31 | # %% 32 | study = openml.study.get_study(123) 33 | print(study) 34 | 35 | # %% [markdown] 36 | # Studies also feature a description: 37 | 38 | # %% 39 | print(study.description) 40 | 41 | # %% [markdown] 42 | # Studies are a container for runs: 43 | 44 | # %% 45 | print(study.runs) 46 | 47 | # %% [markdown] 48 | # And we can use the evaluation listing functionality to learn more about 49 | # the evaluations available for the conducted runs: 50 | 51 | # %% 52 | evaluations = openml.evaluations.list_evaluations( 53 | function="predictive_accuracy", 54 | study=study.study_id, 55 | output_format="dataframe", 56 | ) 57 | print(evaluations.head()) 58 | 59 | # %% [markdown] 60 | # We'll use the test server for the rest of this tutorial. 61 | 62 | # %% 63 | openml.config.start_using_configuration_for_example() 64 | 65 | # %% [markdown] 66 | # ## Uploading studies 67 | # 68 | # Creating a study is as simple as creating any other kind of OpenML entity. 69 | # In this example, we'll create a few runs for the OpenML-100 benchmark 70 | # suite, which is available on the OpenML test server. 71 | 72 | #
<div class="admonition warning"> 73 | #     <p class="admonition-title">Warning</p> 74 | #     <p> 75 | # For the rest of this tutorial, we will require the `openml-sklearn` package. 76 | # Install it with `pip install openml-sklearn`. 77 | #     </p> 78 | # </div>
79 | 80 | # %% 81 | # Get sklearn extension to run sklearn models easily on OpenML tasks. 82 | from openml_sklearn import SklearnExtension 83 | 84 | extension = SklearnExtension() 85 | 86 | # Model to be used 87 | clf = RandomForestClassifier() 88 | 89 | # We'll create a study with one run on 3 datasets present in the suite 90 | tasks = [115, 259, 307] 91 | 92 | # To verify 93 | # https://test.openml.org/api/v1/study/1 94 | suite = openml.study.get_suite("OpenML100") 95 | print(all(t_id in suite.tasks for t_id in tasks)) 96 | 97 | run_ids = [] 98 | for task_id in tasks: 99 | task = openml.tasks.get_task(task_id) 100 | run = openml.runs.run_model_on_task(clf, task) 101 | run.publish() 102 | run_ids.append(run.run_id) 103 | 104 | # The study needs a machine-readable and unique alias. To obtain this, 105 | # we simply generate a random uuid. 106 | alias = uuid.uuid4().hex 107 | 108 | new_study = openml.study.create_study( 109 | name="Test-Study", 110 | description="Test study for the Python tutorial on studies", 111 | run_ids=run_ids, 112 | alias=alias, 113 | benchmark_suite=suite.study_id, 114 | ) 115 | new_study.publish() 116 | print(new_study) 117 | 118 | 119 | # %% 120 | openml.config.stop_using_configuration_for_example() 121 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # OpenML 2 | 3 | **The Python API for a World of Data and More** 4 | 5 | Welcome to the documentation of the OpenML Python API, a connector to 6 | the collaborative machine learning platform 7 | [OpenML.org](https://www.openml.org). 8 | OpenML-Python can download or upload data from OpenML, such as datasets 9 | and machine learning experiment results. 10 | 11 | If you are new to OpenML, we recommend checking out the [OpenML documentation](https://docs.openml.org/) 12 | to get familiar with the concepts and features of OpenML. In particular, we recommend 13 | reading more about the [OpenML concepts](https://docs.openml.org/concepts/). 14 | 15 | ## :joystick: Minimal Examples 16 | 17 | Use the following code to get the [credit-g](https://www.openml.org/search?type=data&sort=runs&status=active&id=31) [dataset](https://docs.openml.org/concepts/data/): 18 | 19 | ```python 20 | import openml 21 | 22 | dataset = openml.datasets.get_dataset("credit-g") # or by ID get_dataset(31) 23 | X, y, categorical_indicator, attribute_names = dataset.get_data(target="class") 24 | ``` 25 | 26 | Get a [task](https://docs.openml.org/concepts/tasks/) for [supervised classification on credit-g](https://www.openml.org/search?type=task&id=31&source_data.data_id=31): 27 | 28 | ```python 29 | import openml 30 | 31 | task = openml.tasks.get_task(31) 32 | dataset = task.get_dataset() 33 | X, y, categorical_indicator, attribute_names = dataset.get_data(target=task.target_name) 34 | # get splits for the first fold of 10-fold cross-validation 35 | train_indices, test_indices = task.get_train_test_split_indices(fold=0) 36 | ``` 37 | 38 | Use an [OpenML benchmarking suite](https://docs.openml.org/concepts/benchmarking/) to get a curated list of machine-learning tasks: 39 | ```python 40 | import openml 41 | 42 | suite = openml.study.get_suite("amlb-classification-all") # Get a curated list of tasks for classification 43 | for task_id in suite.tasks: 44 | task = openml.tasks.get_task(task_id) 45 | ``` 46 | Find more examples in the navbar at the top. 
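One more minimal sketch: run a scikit-learn model on a task and publish the result. This assumes the separately installed [openml-sklearn](https://github.com/openml/openml-sklearn) extension, which provides scikit-learn support for OpenML-Python:

```python
import openml
import openml_sklearn  # assumption: importing this registers the scikit-learn extension
from sklearn.ensemble import RandomForestClassifier

task = openml.tasks.get_task(31)  # supervised classification on credit-g
run = openml.runs.run_model_on_task(RandomForestClassifier(), task)
run.publish()  # uploading requires an API key
```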
47 | 48 | ## :magic_wand: Installation 49 | 50 | OpenML-Python is available on Linux, MacOS, and Windows. 51 | 52 | You can install OpenML-Python with: 53 | 54 | ```bash 55 | pip install openml 56 | ``` 57 | 58 | For more advanced installation information, please see the 59 | ["Introduction"](../examples/Basics/introduction_tutorial) example. 60 | 61 | 62 | ## Further information 63 | 64 | - [OpenML documentation](https://docs.openml.org/) 65 | - [OpenML client APIs](https://docs.openml.org/APIs/) 66 | - [OpenML developer guide](https://docs.openml.org/contributing/) 67 | - [Contact information](https://www.openml.org/contact) 68 | - [Citation request](https://www.openml.org/cite) 69 | - [OpenML blog](https://medium.com/open-machine-learning) 70 | - [OpenML twitter account](https://twitter.com/open_ml) 71 | 72 | 73 | ## Contributing 74 | 75 | Contributing to the OpenML package is highly appreciated. Please see the 76 | ["Contributing"](contributing.md) page for more information. 77 | 78 | ## Citing OpenML-Python 79 | 80 | If you use OpenML-Python in a scientific publication, we would 81 | appreciate a reference to our JMLR-MLOSS paper 82 | ["OpenML-Python: an extensible Python API for OpenML"](https://www.jmlr.org/papers/v22/19-920.html): 83 | 84 | === "Bibtex" 85 | 86 | ```bibtex 87 | @article{JMLR:v22:19-920, 88 | author = {Matthias Feurer and Jan N. van Rijn and Arlind Kadra and Pieter Gijsbers and Neeratyoy Mallik and Sahithya Ravi and Andreas Müller and Joaquin Vanschoren and Frank Hutter}, 89 | title = {OpenML-Python: an extensible Python API for OpenML}, 90 | journal = {Journal of Machine Learning Research}, 91 | year = {2021}, 92 | volume = {22}, 93 | number = {100}, 94 | pages = {1--5}, 95 | url = {http://jmlr.org/papers/v22/19-920.html} 96 | } 97 | ``` 98 | 99 | === "MLA" 100 | 101 | Feurer, Matthias, et al. 102 | "OpenML-Python: an extensible Python API for OpenML." 103 | _Journal of Machine Learning Research_ 22.100 (2021):1−5. 104 | -------------------------------------------------------------------------------- /examples/Basics/simple_flows_and_runs_tutorial.py: -------------------------------------------------------------------------------- 1 | # %% [markdown] 2 | # A simple tutorial on how to upload results from a machine learning experiment to OpenML. 3 | 4 | # %% 5 | import sklearn 6 | from sklearn.neighbors import KNeighborsClassifier 7 | 8 | import openml 9 | 10 | # %% [markdown] 11 | #
<div class="admonition warning"> 12 | #     <p class="admonition-title">Warning</p> 13 | #     <p> 14 | # This example uploads data. For that reason, this example connects to the 15 | # test server at test.openml.org. 16 | #     </p><p> 17 | # This prevents the main server from becoming overloaded with example datasets, tasks, 18 | # runs, and other submissions. 19 | # Using this test server may affect the behavior and performance of the 20 | # OpenML-Python API. 21 | #     </p> 22 | # </div>
23 | 24 | # %% 25 | openml.config.start_using_configuration_for_example() 26 | 27 | # %% [markdown] 28 | # ## Train a machine learning model and evaluate it 29 | # NOTE: We are using task 119 from the test server: https://test.openml.org/d/20 30 | 31 | # %% 32 | task = openml.tasks.get_task(119) 33 | 34 | # Get the data 35 | dataset = task.get_dataset() 36 | X, y, categorical_indicator, attribute_names = dataset.get_data( 37 | target=dataset.default_target_attribute 38 | ) 39 | 40 | # Get the holdout split from the task 41 | train_indices, test_indices = task.get_train_test_split_indices(fold=0, repeat=0) 42 | X_train, X_test = X.iloc[train_indices], X.iloc[test_indices] 43 | y_train, y_test = y.iloc[train_indices], y.iloc[test_indices] 44 | 45 | knn_parameters = { 46 | "n_neighbors": 3, 47 | } 48 | clf = KNeighborsClassifier(**knn_parameters) 49 | clf.fit(X_train, y_train) 50 | 51 | # Get experiment results 52 | y_pred = clf.predict(X_test) 53 | y_pred_proba = clf.predict_proba(X_test) 54 | 55 | # %% [markdown] 56 | # ## Upload the machine learning experiments to OpenML 57 | # First, create a flow and fill it with metadata about the machine learning model. 58 | 59 | # %% 60 | knn_flow = openml.flows.OpenMLFlow( 61 | # Metadata 62 | model=clf, # or None, if you do not want to upload the model object. 63 | name="CustomKNeighborsClassifier", 64 | description="A custom KNeighborsClassifier flow for OpenML.", 65 | external_version=f"{sklearn.__version__}", 66 | language="English", 67 | tags=["openml_tutorial_knn"], 68 | dependencies=f"{sklearn.__version__}", 69 | # Hyperparameters 70 | parameters={k: str(v) for k, v in knn_parameters.items()}, 71 | parameters_meta_info={ 72 | "n_neighbors": {"description": "number of neighbors to use", "data_type": "int"} 73 | }, 74 | # If you have a pipeline with subcomponents, such as preprocessing, add them here. 75 | components={}, 76 | ) 77 | knn_flow.publish() 78 | print(f"knn_flow was published with the ID {knn_flow.flow_id}") 79 | 80 | # %% [markdown] 81 | # Second, we create a run to store the results associated with the flow.
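# A run ties the flow to a task and stores the predictions. Each prediction row below
# is built with ``format_prediction``, which records the repeat, fold, row index,
# predicted label, true label, and the per-class probabilities for one test instance.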
82 | 83 | # %% 84 | 85 | # Format the predictions for OpenML 86 | predictions = [] 87 | for test_index, y_true_i, y_pred_i, y_pred_proba_i in zip( 88 | test_indices, y_test, y_pred, y_pred_proba 89 | ): 90 | predictions.append( 91 | openml.runs.functions.format_prediction( 92 | task=task, 93 | repeat=0, 94 | fold=0, 95 | index=test_index, 96 | prediction=y_pred_i, 97 | truth=y_true_i, 98 | proba=dict(zip(task.class_labels, y_pred_proba_i)), 99 | ) 100 | ) 101 | 102 | # Format the parameters for OpenML 103 | oml_knn_parameters = [ 104 | {"oml:name": k, "oml:value": v, "oml:component": knn_flow.flow_id} 105 | for k, v in knn_parameters.items() 106 | ] 107 | 108 | knn_run = openml.runs.OpenMLRun( 109 | task_id=task.task_id, 110 | flow_id=knn_flow.flow_id, 111 | dataset_id=dataset.dataset_id, 112 | parameter_settings=oml_knn_parameters, 113 | data_content=predictions, 114 | tags=["openml_tutorial_knn"], 115 | description_text="Run generated by the tutorial.", 116 | ) 117 | knn_run = knn_run.publish() 118 | print(f"Run was uploaded to {knn_run.openml_url}") 119 | print(f"The flow can be found at {knn_run.flow.openml_url}") 120 | 121 | # %% 122 | openml.config.stop_using_configuration_for_example() 123 | -------------------------------------------------------------------------------- /openml/extensions/functions.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | from __future__ import annotations 3 | 4 | from typing import TYPE_CHECKING, Any 5 | 6 | # Need to implement the following by its full path because otherwise it won't be possible to 7 | # access openml.extensions.extensions 8 | import openml.extensions 9 | 10 | # Avoid import cycles: https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles 11 | if TYPE_CHECKING: 12 | from openml.flows import OpenMLFlow 13 | 14 | from . import Extension 15 | 16 | SKLEARN_HINT = ( 17 | "But it looks related to scikit-learn. " 18 | "Please install the OpenML scikit-learn extension (openml-sklearn) and try again. " 19 | "For more information, see " 20 | "https://github.com/openml/openml-sklearn?tab=readme-ov-file#installation" 21 | ) 22 | 23 | 24 | def register_extension(extension: type[Extension]) -> None: 25 | """Register an extension. 26 | 27 | Registered extensions are considered by ``get_extension_by_flow`` and 28 | ``get_extension_by_model``, which are used by ``openml.flow`` and ``openml.runs``. 29 | 30 | Parameters 31 | ---------- 32 | extension : Type[Extension] 33 | 34 | Returns 35 | ------- 36 | None 37 | """ 38 | openml.extensions.extensions.append(extension) 39 | 40 | 41 | def get_extension_by_flow( 42 | flow: OpenMLFlow, 43 | raise_if_no_extension: bool = False, # noqa: FBT001, FBT002 44 | ) -> Extension | None: 45 | """Get an extension which can handle the given flow. 46 | 47 | Iterates all registered extensions and checks whether they can handle the presented flow. 48 | Raises an exception if two extensions can handle a flow. 49 | 50 | Parameters 51 | ---------- 52 | flow : OpenMLFlow 53 | 54 | raise_if_no_extension : bool (optional, default=False) 55 | Raise an exception if no registered extension can handle the presented flow. 
56 | 
57 |     Returns
58 |     -------
59 |     Extension or None
60 |     """
61 |     candidates = []
62 |     for extension_class in openml.extensions.extensions:
63 |         if extension_class.can_handle_flow(flow):
64 |             candidates.append(extension_class())
65 |     if len(candidates) == 0:
66 |         if raise_if_no_extension:
67 |             install_instruction = ""
68 |             if flow.name.startswith("sklearn"):
69 |                 install_instruction = SKLEARN_HINT
70 |             raise ValueError(
71 |                 f"No extension registered which can handle flow: {flow.flow_id} ({flow.name}). "
72 |                 f"{install_instruction}"
73 |             )
74 | 
75 |         return None
76 | 
77 |     if len(candidates) == 1:
78 |         return candidates[0]
79 | 
80 |     raise ValueError(
81 |         f"Multiple extensions registered which can handle flow: {flow}, but only one "
82 |         f"is allowed ({candidates}).",
83 |     )
84 | 
85 | 
86 | def get_extension_by_model(
87 |     model: Any,
88 |     raise_if_no_extension: bool = False,  # noqa: FBT001, FBT002
89 | ) -> Extension | None:
90 |     """Get an extension which can handle the given model.
91 | 
92 |     Iterates all registered extensions and checks whether they can handle the presented model.
93 |     Raises an exception if two extensions can handle a model.
94 | 
95 |     Parameters
96 |     ----------
97 |     model : Any
98 | 
99 |     raise_if_no_extension : bool (optional, default=False)
100 |         Raise an exception if no registered extension can handle the presented model.
101 | 
102 |     Returns
103 |     -------
104 |     Extension or None
105 |     """
106 |     candidates = []
107 |     for extension_class in openml.extensions.extensions:
108 |         if extension_class.can_handle_model(model):
109 |             candidates.append(extension_class())
110 |     if len(candidates) == 0:
111 |         if raise_if_no_extension:
112 |             install_instruction = ""
113 |             if type(model).__module__.startswith("sklearn"):
114 |                 install_instruction = SKLEARN_HINT
115 |             raise ValueError(
116 |                 f"No extension registered which can handle model: {model}. {install_instruction}"
117 |             )
118 | 
119 |         return None
120 | 
121 |     if len(candidates) == 1:
122 |         return candidates[0]
123 | 
124 |     raise ValueError(
125 |         f"Multiple extensions registered which can handle model: {model}, but only one "
126 |         f"is allowed ({candidates}).",
127 |     )
128 | 
-------------------------------------------------------------------------------- /examples/_external_or_deprecated/2015_neurips_feurer_example.py: --------------------------------------------------------------------------------
1 | """
2 | Feurer et al. (2015)
3 | ====================
4 | 
5 | A tutorial on how to get the datasets used in the paper introducing *Auto-sklearn* by Feurer et al.
6 | 
7 | Auto-sklearn website: https://automl.github.io/auto-sklearn/
8 | 
9 | Publication
10 | ~~~~~~~~~~~
11 | 
12 | | Efficient and Robust Automated Machine Learning
13 | | Matthias Feurer, Aaron Klein, Katharina Eggensperger, Jost Springenberg, Manuel Blum and Frank Hutter
14 | | In *Advances in Neural Information Processing Systems 28*, 2015
15 | | Available at https://papers.nips.cc/paper/5872-efficient-and-robust-automated-machine-learning.pdf
16 | """  # noqa F401
17 | 
18 | # License: BSD 3-Clause
19 | 
20 | import pandas as pd
21 | 
22 | import openml
23 | 
24 | ####################################################################################################
25 | # List of dataset IDs given in the supplementary material of Feurer et al.:
26 | # https://papers.nips.cc/paper/5872-efficient-and-robust-automated-machine-learning-supplemental.zip
27 | # fmt: off
28 | dataset_ids = [
29 |     3, 6, 12, 14, 16, 18, 21, 22, 23, 24, 26, 28, 30, 31, 32, 36, 38, 44, 46,
30 |     57, 60, 179, 180, 181, 182, 184, 185, 273, 293, 300, 351, 354, 357, 389,
31 |     390, 391, 392, 393, 395, 396, 398, 399, 401, 554, 679, 715, 718, 720, 722,
32 |     723, 727, 728, 734, 735, 737, 740, 741, 743, 751, 752, 761, 772, 797, 799,
33 |     803, 806, 807, 813, 816, 819, 821, 822, 823, 833, 837, 843, 845, 846, 847,
34 |     849, 866, 871, 881, 897, 901, 903, 904, 910, 912, 913, 914, 917, 923, 930,
35 |     934, 953, 958, 959, 962, 966, 971, 976, 977, 978, 979, 980, 991, 993, 995,
36 |     1000, 1002, 1018, 1019, 1020, 1021, 1036, 1040, 1041, 1049, 1050, 1053,
37 |     1056, 1067, 1068, 1069, 1111, 1112, 1114, 1116, 1119, 1120, 1128, 1130,
38 |     1134, 1138, 1139, 1142, 1146, 1161, 1166,
39 | ]
40 | # fmt: on
41 | 
42 | ####################################################################################################
43 | # The dataset IDs could be used directly to load the dataset and split the data into a training set
44 | # and a test set. However, to be reproducible, we will first obtain the respective tasks from
45 | # OpenML, which define both the target feature and the train/test split.
46 | #
47 | # .. note::
48 | #    It is discouraged to work directly on datasets and only provide dataset IDs in a paper, as
49 | #    this does not allow reproducibility (unclear splitting). Please do not use datasets but the
50 | #    respective tasks as the basis for a paper, and publish task IDs. This example is only given to
51 | #    showcase the use of OpenML-Python for a published paper and as a warning on how not to do it.
52 | #    Please check the `OpenML documentation of tasks `_ if you
53 | #    want to learn more about them.
54 | 
55 | ####################################################################################################
56 | # This lists both active and inactive tasks (because of ``status='all'``). Unfortunately,
57 | # this is necessary as some of the datasets contain issues found after the publication and became
58 | # deactivated, which also deactivated the tasks on them. More information on active or inactive
59 | # datasets can be found in the `online docs `_.
60 | tasks = openml.tasks.list_tasks(
61 |     task_type=openml.tasks.TaskType.SUPERVISED_CLASSIFICATION,
62 |     status="all",
63 |     output_format="dataframe",
64 | )
65 | 
66 | # Query only those with holdout as the resampling strategy.
67 | tasks = tasks.query('estimation_procedure == "33% Holdout set"')
68 | 
69 | task_ids = []
70 | for did in dataset_ids:
71 |     tasks_ = list(tasks.query("did == {}".format(did)).tid)
72 |     if len(tasks_) >= 1:  # if there are multiple tasks, take the one with the lowest ID (oldest).
73 |         task_id = min(tasks_)
74 |     else:
75 |         raise ValueError(did)
76 | 
77 |     # Optional - Check that the task has the same target attribute as the
78 |     # dataset default target attribute
79 |     # (disabled for this example as it needs to run fast to be rendered online)
80 |     # task = openml.tasks.get_task(task_id)
81 |     # dataset = task.get_dataset()
82 |     # if task.target_name != dataset.default_target_attribute:
83 |     #     raise ValueError(
84 |     #         (task.target_name, dataset.default_target_attribute)
85 |     #     )
86 | 
87 |     task_ids.append(task_id)
88 | 
89 | assert len(task_ids) == 140
90 | task_ids.sort()
91 | 
92 | # These are the tasks to work with:
93 | print(task_ids)
94 | 
-------------------------------------------------------------------------------- /tests/test_tasks/test_task.py: --------------------------------------------------------------------------------
1 | # License: BSD 3-Clause
2 | from __future__ import annotations
3 | 
4 | import unittest
5 | from random import randint, shuffle
6 | 
7 | from openml.datasets import (
8 |     get_dataset,
9 |     list_datasets,
10 | )
11 | from openml.exceptions import OpenMLServerException
12 | from openml.tasks import TaskType, create_task, get_task
13 | from openml.testing import TestBase
14 | 
15 | 
16 | class OpenMLTaskTest(TestBase):
17 |     """
18 |     A helper class. The methods of the test case
19 |     are only executed in subclasses of the test case.
20 |     """
21 | 
22 |     __test__ = False
23 | 
24 |     @classmethod
25 |     def setUpClass(cls):
26 |         if cls is OpenMLTaskTest:
27 |             raise unittest.SkipTest("Skip OpenMLTaskTest tests, it's a base class")
28 |         super().setUpClass()
29 | 
30 |     def setUp(self, n_levels: int = 1):
31 |         super().setUp()
32 | 
33 |     def test_download_task(self):
34 |         return get_task(self.task_id)
35 | 
36 |     def test_upload_task(self):
37 |         # We don't know if the task in question already exists, so we try a few times. Checking
38 |         # beforehand would not be an option because a concurrent unit test could potentially
39 |         # create the same task and make this unit test fail (i.e. getting a dataset and creating
40 |         # a task for it is not atomic).
41 |         compatible_datasets = self._get_compatible_rand_dataset()
42 |         for i in range(100):
43 |             try:
44 |                 dataset_id = compatible_datasets[i % len(compatible_datasets)]
45 |                 # TODO consider implementing on the different task types.
46 | task = create_task( 47 | task_type=self.task_type, 48 | dataset_id=dataset_id, 49 | target_name=self._get_random_feature(dataset_id), 50 | estimation_procedure_id=self.estimation_procedure, 51 | ) 52 | 53 | task.publish() 54 | TestBase._mark_entity_for_removal("task", task.id) 55 | TestBase.logger.info( 56 | f"collected from {__file__.split('/')[-1]}: {task.id}", 57 | ) 58 | # success 59 | break 60 | except OpenMLServerException as e: 61 | # Error code for 'task already exists' 62 | # Should be 533 according to the docs 63 | # (# https://www.openml.org/api_docs#!/task/post_task) 64 | if e.code == 614: 65 | continue 66 | else: 67 | raise e 68 | else: 69 | raise ValueError( 70 | f"Could not create a valid task for task type ID {self.task_type}", 71 | ) 72 | 73 | def _get_compatible_rand_dataset(self) -> list: 74 | active_datasets = list_datasets(status="active") 75 | 76 | # depending on the task type, find either datasets 77 | # with only symbolic features or datasets with only 78 | # numerical features. 79 | if self.task_type == TaskType.SUPERVISED_REGRESSION: 80 | compatible_datasets = active_datasets[active_datasets["NumberOfSymbolicFeatures"] == 0] 81 | elif self.task_type == TaskType.CLUSTERING: 82 | compatible_datasets = active_datasets 83 | else: 84 | compatible_datasets = active_datasets[active_datasets["NumberOfNumericFeatures"] == 0] 85 | 86 | compatible_datasets = list(compatible_datasets["did"]) 87 | # in-place shuffling 88 | shuffle(compatible_datasets) 89 | return compatible_datasets 90 | 91 | # random_dataset_pos = randint(0, len(compatible_datasets) - 1) 92 | # 93 | # return compatible_datasets[random_dataset_pos] 94 | 95 | def _get_random_feature(self, dataset_id: int) -> str: 96 | random_dataset = get_dataset(dataset_id) 97 | # necessary loop to overcome string and date type 98 | # features. 99 | while True: 100 | random_feature_index = randint(0, len(random_dataset.features) - 1) 101 | random_feature = random_dataset.features[random_feature_index] 102 | if self.task_type == TaskType.SUPERVISED_REGRESSION: 103 | if random_feature.data_type == "numeric": 104 | break 105 | else: 106 | if random_feature.data_type == "nominal": 107 | break 108 | return random_feature.name 109 | -------------------------------------------------------------------------------- /tests/files/org/openml/test/datasets/2/description.xml: -------------------------------------------------------------------------------- 1 | 2 | 2 3 | anneal 4 | 1 5 | **Author**: 6 | **Source**: Unknown - 7 | **Please cite**: 8 | 9 | 1. Title of Database: Annealing Data 10 | 11 | 2. Source Information: donated by David Sterling and Wray Buntine. 12 | 13 | 3. Past Usage: unknown 14 | 15 | 4. Relevant Information: 16 | -- Explanation: I suspect this was left by Ross Quinlan in 1987 at the 17 | 4th Machine Learning Workshop. I'd have to check with Jeff Schlimmer 18 | to double check this. 19 | 20 | 5. Number of Instances: 798 21 | 22 | 6. Number of Attributes: 38 23 | -- 6 continuously-valued 24 | -- 3 integer-valued 25 | -- 29 nominal-valued 26 | 27 | 7. Attribute Information: 28 | 1. family: --,GB,GK,GS,TN,ZA,ZF,ZH,ZM,ZS 29 | 2. product-type: C, H, G 30 | 3. steel: -,R,A,U,K,M,S,W,V 31 | 4. carbon: continuous 32 | 5. hardness: continuous 33 | 6. temper_rolling: -,T 34 | 7. condition: -,S,A,X 35 | 8. formability: -,1,2,3,4,5 36 | 9. strength: continuous 37 | 10. non-ageing: -,N 38 | 11. surface-finish: P,M,- 39 | 12. surface-quality: -,D,E,F,G 40 | 13. enamelability: -,1,2,3,4,5 41 | 14. bc: Y,- 42 | 15. 
bf: Y,- 43 | 16. bt: Y,- 44 | 17. bw/me: B,M,- 45 | 18. bl: Y,- 46 | 19. m: Y,- 47 | 20. chrom: C,- 48 | 21. phos: P,- 49 | 22. cbond: Y,- 50 | 23. marvi: Y,- 51 | 24. exptl: Y,- 52 | 25. ferro: Y,- 53 | 26. corr: Y,- 54 | 27. blue/bright/varn/clean: B,R,V,C,- 55 | 28. lustre: Y,- 56 | 29. jurofm: Y,- 57 | 30. s: Y,- 58 | 31. p: Y,- 59 | 32. shape: COIL, SHEET 60 | 33. thick: continuous 61 | 34. width: continuous 62 | 35. len: continuous 63 | 36. oil: -,Y,N 64 | 37. bore: 0000,0500,0600,0760 65 | 38. packing: -,1,2,3 66 | classes: 1,2,3,4,5,U 67 | 68 | -- The '-' values are actually 'not_applicable' values rather than 69 | 'missing_values' (and so can be treated as legal discrete 70 | values rather than as showing the absence of a discrete value). 71 | 72 | 8. Missing Attribute Values: Signified with "?" 73 | Attribute: Number of instances missing its value: 74 | 1 0 75 | 2 0 76 | 3 70 77 | 4 0 78 | 5 0 79 | 6 675 80 | 7 271 81 | 8 283 82 | 9 0 83 | 10 703 84 | 11 790 85 | 12 217 86 | 13 785 87 | 14 797 88 | 15 680 89 | 16 736 90 | 17 609 91 | 18 662 92 | 19 798 93 | 20 775 94 | 21 791 95 | 22 730 96 | 23 798 97 | 24 796 98 | 25 772 99 | 26 798 100 | 27 793 101 | 28 753 102 | 29 798 103 | 30 798 104 | 31 798 105 | 32 0 106 | 33 0 107 | 34 0 108 | 35 0 109 | 36 740 110 | 37 0 111 | 38 789 112 | 39 0 113 | 114 | 9. Distribution of Classes 115 | Class Name: Number of Instances: 116 | 1 8 117 | 2 88 118 | 3 608 119 | 4 0 120 | 5 60 121 | U 34 122 | --- 123 | 798 124 | ARFF 125 | 2014-04-06T23:19:24 126 | Public http://www.openml.org/data/download/1666876/phpFsFYVN 127 | 128 | 1666876 class 1 hallostudy_1uciwelt public active 129 | 4eaed8b6ec9d8211024b6c089b064761 130 | 131 | -------------------------------------------------------------------------------- /examples/_external_or_deprecated/run_setup_tutorial.py: -------------------------------------------------------------------------------- 1 | # %% [markdown] 2 | # # Run Setup 3 | # One of the key features of the openml-python library is that is allows to 4 | # reinstantiate flows with hyperparameter settings that were uploaded before. 5 | # This tutorial uses the concept of setups. Although setups are not extensively 6 | # described in the OpenML documentation (because most users will not directly 7 | # use them), they form a important concept within OpenML distinguishing between 8 | # hyperparameter configurations. 9 | # A setup is the combination of a flow with all its hyperparameters set. 10 | # 11 | # A key requirement for reinstantiating a flow is to have the same scikit-learn 12 | # version as the flow that was uploaded. However, this tutorial will upload the 13 | # flow (that will later be reinstantiated) itself, so it can be ran with any 14 | # scikit-learn version that is supported by this library. In this case, the 15 | # requirement of the corresponding scikit-learn versions is automatically met. 16 | # 17 | # In this tutorial we will 18 | # 1) Create a flow and use it to solve a task; 19 | # 2) Download the flow, reinstantiate the model with same hyperparameters, 20 | # and solve the same task again; 21 | # 3) We will verify that the obtained results are exactly the same. 
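Because the tutorial below leans on the setup concept, a minimal sketch of the two setup calls it builds on may help. The `setup_id` here is hypothetical and only meaningful for a setup that actually exists on the configured server:

```python
import openml

setup_id = 42  # hypothetical ID of an existing setup on the server

# A setup is a flow plus concrete hyperparameter values; fetch its metadata:
setup = openml.setups.get_setup(setup_id)
print(setup.flow_id, setup.parameters)

# Rebuild the underlying model object with exactly those hyperparameter values:
model = openml.setups.initialize_model(setup_id)
```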
22 | 23 | # %% 24 | 25 | import numpy as np 26 | import openml 27 | from openml.extensions.sklearn import cat, cont 28 | 29 | from sklearn.pipeline import make_pipeline, Pipeline 30 | from sklearn.compose import ColumnTransformer 31 | from sklearn.impute import SimpleImputer 32 | from sklearn.preprocessing import OneHotEncoder, FunctionTransformer 33 | from sklearn.ensemble import RandomForestClassifier 34 | from sklearn.decomposition import TruncatedSVD 35 | 36 | # %% [markdown] 37 | # .. warning:: 38 | # .. include:: ../../test_server_usage_warning.txt 39 | 40 | # %% 41 | openml.config.start_using_configuration_for_example() 42 | 43 | # %% [markdown] 44 | # 1) Create a flow and use it to solve a task 45 | 46 | # First, let's download the task that we are interested in 47 | 48 | # %% 49 | task = openml.tasks.get_task(6) 50 | 51 | # %% [markdown] 52 | # we will create a fairly complex model, with many preprocessing components and 53 | # many potential hyperparameters. Of course, the model can be as complex and as 54 | # easy as you want it to be 55 | 56 | 57 | # %% 58 | cat_imp = make_pipeline( 59 | OneHotEncoder(handle_unknown="ignore"), 60 | TruncatedSVD(), 61 | ) 62 | cont_imp = SimpleImputer(strategy="median") 63 | ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)]) 64 | model_original = Pipeline( 65 | steps=[ 66 | ("transform", ct), 67 | ("estimator", RandomForestClassifier()), 68 | ] 69 | ) 70 | 71 | # %% [markdown] 72 | # Let's change some hyperparameters. Of course, in any good application we 73 | # would tune them using, e.g., Random Search or Bayesian Optimization, but for 74 | # the purpose of this tutorial we set them to some specific values that might 75 | # or might not be optimal 76 | 77 | # %% 78 | hyperparameters_original = { 79 | "estimator__criterion": "gini", 80 | "estimator__n_estimators": 50, 81 | "estimator__max_depth": 10, 82 | "estimator__min_samples_leaf": 1, 83 | } 84 | model_original.set_params(**hyperparameters_original) 85 | 86 | # solve the task and upload the result (this implicitly creates the flow) 87 | run = openml.runs.run_model_on_task(model_original, task, avoid_duplicate_runs=False) 88 | run_original = run.publish() # this implicitly uploads the flow 89 | 90 | # %% [markdown] 91 | # ## 2) Download the flow and solve the same task again. 92 | 93 | # %% 94 | # obtain setup id (note that the setup id is assigned by the OpenML server - 95 | # therefore it was not yet available in our local copy of the run) 96 | run_downloaded = openml.runs.get_run(run_original.run_id) 97 | setup_id = run_downloaded.setup_id 98 | 99 | # after this, we can easily reinstantiate the model 100 | model_duplicate = openml.setups.initialize_model(setup_id) 101 | # it will automatically have all the hyperparameters set 102 | 103 | # and run the task again 104 | run_duplicate = openml.runs.run_model_on_task(model_duplicate, task, avoid_duplicate_runs=False) 105 | 106 | 107 | # %% [markdown] 108 | # ## 3) We will verify that the obtained results are exactly the same. 109 | 110 | # %% 111 | # the run has stored all predictions in the field data content 112 | np.testing.assert_array_equal(run_original.data_content, run_duplicate.data_content) 113 | 114 | 115 | # %% 116 | openml.config.stop_using_configuration_for_example() 117 | 118 | # By: Jan N. 
van Rijn 119 | # License: BSD 3-Clause 120 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 
5 | <!-- Centered header: OpenML logo, "OpenML-Python" title, Python logo (original <div>/<img> markup not recoverable) -->
14 | 15 | ## The Python API for a World of Data and More :dizzy: 16 | 17 | [![Latest Release](https://img.shields.io/github/v/release/openml/openml-python)](https://github.com/openml/openml-python/releases) 18 | [![Python Versions](https://img.shields.io/badge/python-3.8%20%7C%203.9%20%7C%203.10%20%7C%203.11%20%7C%203.12%20%7C%203.13-blue)](https://pypi.org/project/openml/) 19 | [![Downloads](https://static.pepy.tech/badge/openml)](https://pepy.tech/project/openml) 20 | [![License](https://img.shields.io/badge/License-BSD%203--Clause-blue.svg)](https://opensource.org/licenses/BSD-3-Clause) 21 | 22 | 23 | [Installation](https://openml.github.io/openml-python/main/#how-to-get-openml-for-python) | [Documentation](https://openml.github.io/openml-python) | [Contribution guidelines](https://github.com/openml/openml-python/blob/develop/CONTRIBUTING.md) 24 |
25 | 26 | OpenML-Python provides an easy-to-use and straightforward Python interface for [OpenML](http://openml.org), an online platform for open science collaboration in machine learning. 27 | It can download or upload data from OpenML, such as datasets and machine learning experiment results. 28 | 29 | ## :joystick: Minimal Example 30 | 31 | Use the following code to get the [credit-g](https://www.openml.org/search?type=data&sort=runs&status=active&id=31) [dataset](https://docs.openml.org/concepts/data/): 32 | 33 | ```python 34 | import openml 35 | 36 | dataset = openml.datasets.get_dataset("credit-g") # or by ID get_dataset(31) 37 | X, y, categorical_indicator, attribute_names = dataset.get_data(target="class") 38 | ``` 39 | 40 | Get a [task](https://docs.openml.org/concepts/tasks/) for [supervised classification on credit-g](https://www.openml.org/search?type=task&id=31&source_data.data_id=31): 41 | 42 | ```python 43 | import openml 44 | 45 | task = openml.tasks.get_task(31) 46 | dataset = task.get_dataset() 47 | X, y, categorical_indicator, attribute_names = dataset.get_data(target=task.target_name) 48 | # get splits for the first fold of 10-fold cross-validation 49 | train_indices, test_indices = task.get_train_test_split_indices(fold=0) 50 | ``` 51 | 52 | Use an [OpenML benchmarking suite](https://docs.openml.org/concepts/benchmarking/) to get a curated list of machine-learning tasks: 53 | ```python 54 | import openml 55 | 56 | suite = openml.study.get_suite("amlb-classification-all") # Get a curated list of tasks for classification 57 | for task_id in suite.tasks: 58 | task = openml.tasks.get_task(task_id) 59 | ``` 60 | 61 | ## :magic_wand: Installation 62 | 63 | OpenML-Python is supported on Python 3.8 - 3.13 and is available on Linux, MacOS, and Windows. 64 | 65 | You can install OpenML-Python with: 66 | 67 | ```bash 68 | pip install openml 69 | ``` 70 | 71 | ## :page_facing_up: Citing OpenML-Python 72 | 73 | If you use OpenML-Python in a scientific publication, we would appreciate a reference to the following paper: 74 | 75 | [Matthias Feurer, Jan N. van Rijn, Arlind Kadra, Pieter Gijsbers, Neeratyoy Mallik, Sahithya Ravi, Andreas Müller, Joaquin Vanschoren, Frank Hutter
76 | **OpenML-Python: an extensible Python API for OpenML**
77 | Journal of Machine Learning Research, 22(100):1−5, 2021](https://www.jmlr.org/papers/v22/19-920.html) 78 | 79 | Bibtex entry: 80 | ```bibtex 81 | @article{JMLR:v22:19-920, 82 | author = {Matthias Feurer and Jan N. van Rijn and Arlind Kadra and Pieter Gijsbers and Neeratyoy Mallik and Sahithya Ravi and Andreas Müller and Joaquin Vanschoren and Frank Hutter}, 83 | title = {OpenML-Python: an extensible Python API for OpenML}, 84 | journal = {Journal of Machine Learning Research}, 85 | year = {2021}, 86 | volume = {22}, 87 | number = {100}, 88 | pages = {1--5}, 89 | url = {http://jmlr.org/papers/v22/19-920.html} 90 | } 91 | ``` 92 | ## :handshake: Contributing 93 | 94 | We welcome contributions from both new and experienced developers! 95 | 96 | If you would like to contribute to OpenML-Python, please read our 97 | [Contribution Guidelines](https://github.com/openml/openml-python/blob/develop/CONTRIBUTING.md). 98 | 99 | If you are new to open-source development, a great way to get started is by 100 | looking at issues labeled **"good first issue"** in our GitHub issue tracker. 101 | These tasks are beginner-friendly and help you understand the project structure, 102 | development workflow, and how to submit a pull request. 103 | -------------------------------------------------------------------------------- /openml/evaluations/evaluation.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | from __future__ import annotations 3 | 4 | import openml.config 5 | import openml.datasets 6 | import openml.flows 7 | import openml.runs 8 | import openml.tasks 9 | 10 | 11 | # TODO(eddiebergman): A lot of this class is automatically 12 | # handled by a dataclass 13 | class OpenMLEvaluation: 14 | """ 15 | Contains all meta-information about a run / evaluation combination, 16 | according to the evaluation/list function 17 | 18 | Parameters 19 | ---------- 20 | run_id : int 21 | Refers to the run. 22 | task_id : int 23 | Refers to the task. 24 | setup_id : int 25 | Refers to the setup. 26 | flow_id : int 27 | Refers to the flow. 28 | flow_name : str 29 | Name of the referred flow. 30 | data_id : int 31 | Refers to the dataset. 32 | data_name : str 33 | The name of the dataset. 34 | function : str 35 | The evaluation metric of this item (e.g., accuracy). 36 | upload_time : str 37 | The time of evaluation. 38 | uploader: int 39 | Uploader ID (user ID) 40 | upload_name : str 41 | Name of the uploader of this evaluation 42 | value : float 43 | The value (score) of this evaluation. 44 | values : List[float] 45 | The values (scores) per repeat and fold (if requested) 46 | array_data : str 47 | list of information per class. 
48 |         (e.g., in case of precision, auroc, recall)
49 |     """
50 | 
51 |     def __init__(  # noqa: PLR0913
52 |         self,
53 |         run_id: int,
54 |         task_id: int,
55 |         setup_id: int,
56 |         flow_id: int,
57 |         flow_name: str,
58 |         data_id: int,
59 |         data_name: str,
60 |         function: str,
61 |         upload_time: str,
62 |         uploader: int,
63 |         uploader_name: str,
64 |         value: float | None,
65 |         values: list[float] | None,
66 |         array_data: str | None = None,
67 |     ):
68 |         self.run_id = run_id
69 |         self.task_id = task_id
70 |         self.setup_id = setup_id
71 |         self.flow_id = flow_id
72 |         self.flow_name = flow_name
73 |         self.data_id = data_id
74 |         self.data_name = data_name
75 |         self.function = function
76 |         self.upload_time = upload_time
77 |         self.uploader = uploader
78 |         self.uploader_name = uploader_name
79 |         self.value = value
80 |         self.values = values
81 |         self.array_data = array_data
82 | 
83 |     def _to_dict(self) -> dict:
84 |         return {
85 |             "run_id": self.run_id,
86 |             "task_id": self.task_id,
87 |             "setup_id": self.setup_id,
88 |             "flow_id": self.flow_id,
89 |             "flow_name": self.flow_name,
90 |             "data_id": self.data_id,
91 |             "data_name": self.data_name,
92 |             "function": self.function,
93 |             "upload_time": self.upload_time,
94 |             "uploader": self.uploader,
95 |             "uploader_name": self.uploader_name,
96 |             "value": self.value,
97 |             "values": self.values,
98 |             "array_data": self.array_data,
99 |         }
100 | 
101 |     def __repr__(self) -> str:
102 |         header = "OpenML Evaluation"
103 |         header = f"{header}\n{'=' * len(header)}\n"
104 | 
105 |         fields = {
106 |             "Upload Date": self.upload_time,
107 |             "Run ID": self.run_id,
108 |             "OpenML Run URL": openml.runs.OpenMLRun.url_for_id(self.run_id),
109 |             "Task ID": self.task_id,
110 |             "OpenML Task URL": openml.tasks.OpenMLTask.url_for_id(self.task_id),
111 |             "Flow ID": self.flow_id,
112 |             "OpenML Flow URL": openml.flows.OpenMLFlow.url_for_id(self.flow_id),
113 |             "Setup ID": self.setup_id,
114 |             "Data ID": self.data_id,
115 |             "Data Name": self.data_name,
116 |             "OpenML Data URL": openml.datasets.OpenMLDataset.url_for_id(self.data_id),
117 |             "Metric Used": self.function,
118 |             "Result": self.value,
119 |         }
120 | 
121 |         order = [
122 |             "Upload Date",
123 |             "Run ID",
124 |             "OpenML Run URL",
125 |             "Task ID",
126 |             "OpenML Task URL", "Flow ID",
127 |             "OpenML Flow URL",
128 |             "Setup ID",
129 |             "Data ID",
130 |             "Data Name",
131 |             "OpenML Data URL",
132 |             "Metric Used",
133 |             "Result",
134 |         ]
135 |         _fields = [(key, fields[key]) for key in order if key in fields]
136 | 
137 |         longest_field_name_length = max(len(name) for name, _ in _fields)
138 |         field_line_format = f"{{:.<{longest_field_name_length}}}: {{}}"
139 |         body = "\n".join(field_line_format.format(name, value) for name, value in _fields)
140 |         return header + body
141 | 
-------------------------------------------------------------------------------- /tests/test_openml/test_api_calls.py: --------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | 
3 | import unittest.mock
4 | from pathlib import Path
5 | from typing import NamedTuple, Iterable, Iterator
6 | from unittest import mock
7 | 
8 | import minio
9 | import pytest
10 | 
11 | import openml
12 | from openml.config import ConfigurationForExamples
13 | import openml.testing
14 | from openml._api_calls import _download_minio_bucket, API_TOKEN_HELP_LINK
15 | 
16 | 
17 | class TestConfig(openml.testing.TestBase):
18 |     def test_too_long_uri(self):
19 |         with pytest.raises(openml.exceptions.OpenMLServerError, match="URI too long!"):
20 | 
openml.datasets.list_datasets(data_id=list(range(10000))) 21 | 22 | @unittest.mock.patch("time.sleep") 23 | @unittest.mock.patch("requests.Session") 24 | def test_retry_on_database_error(self, Session_class_mock, _): 25 | response_mock = unittest.mock.Mock() 26 | response_mock.text = ( 27 | "\n" 28 | "107" 29 | "Database connection error. " 30 | "Usually due to high server load. " 31 | "Please wait for N seconds and try again.\n" 32 | "" 33 | ) 34 | Session_class_mock.return_value.__enter__.return_value.get.return_value = response_mock 35 | with pytest.raises(openml.exceptions.OpenMLServerException, match="/abc returned code 107"): 36 | openml._api_calls._send_request("get", "/abc", {}) 37 | 38 | assert Session_class_mock.return_value.__enter__.return_value.get.call_count == 20 39 | 40 | 41 | class FakeObject(NamedTuple): 42 | object_name: str 43 | etag: str 44 | """We use the etag of a Minio object as the name of a marker if we already downloaded it.""" 45 | 46 | 47 | class FakeMinio: 48 | def __init__(self, objects: Iterable[FakeObject] | None = None): 49 | self._objects = objects or [] 50 | 51 | def list_objects(self, *args, **kwargs) -> Iterator[FakeObject]: 52 | yield from self._objects 53 | 54 | def fget_object(self, object_name: str, file_path: str, *args, **kwargs) -> None: 55 | if object_name in [obj.object_name for obj in self._objects]: 56 | Path(file_path).write_text("foo") 57 | return 58 | raise FileNotFoundError 59 | 60 | 61 | @mock.patch.object(minio, "Minio") 62 | def test_download_all_files_observes_cache(mock_minio, tmp_path: Path) -> None: 63 | some_prefix, some_filename = "some/prefix", "dataset.arff" 64 | some_object_path = f"{some_prefix}/{some_filename}" 65 | some_url = f"https://not.real.com/bucket/{some_object_path}" 66 | mock_minio.return_value = FakeMinio( 67 | objects=[ 68 | FakeObject(object_name=some_object_path, etag=str(hash(some_object_path))), 69 | ], 70 | ) 71 | 72 | _download_minio_bucket(source=some_url, destination=tmp_path) 73 | time_created = (tmp_path / "dataset.arff").stat().st_ctime 74 | 75 | _download_minio_bucket(source=some_url, destination=tmp_path) 76 | time_modified = (tmp_path / some_filename).stat().st_mtime 77 | 78 | assert time_created == time_modified 79 | 80 | 81 | @mock.patch.object(minio, "Minio") 82 | def test_download_minio_failure(mock_minio, tmp_path: Path) -> None: 83 | some_prefix, some_filename = "some/prefix", "dataset.arff" 84 | some_object_path = f"{some_prefix}/{some_filename}" 85 | some_url = f"https://not.real.com/bucket/{some_object_path}" 86 | mock_minio.return_value = FakeMinio( 87 | objects=[ 88 | FakeObject(object_name=None, etag="tmp"), 89 | ], 90 | ) 91 | 92 | with pytest.raises(ValueError): 93 | _download_minio_bucket(source=some_url, destination=tmp_path) 94 | 95 | mock_minio.return_value = FakeMinio( 96 | objects=[ 97 | FakeObject(object_name="tmp", etag=None), 98 | ], 99 | ) 100 | 101 | with pytest.raises(ValueError): 102 | _download_minio_bucket(source=some_url, destination=tmp_path) 103 | 104 | 105 | @pytest.mark.parametrize( 106 | "endpoint, method", 107 | [ 108 | # https://github.com/openml/OpenML/blob/develop/openml_OS/views/pages/api_new/v1/xml/pre.php 109 | ("flow/exists", "post"), # 102 110 | ("dataset", "post"), # 137 111 | ("dataset/42", "delete"), # 350 112 | # ("flow/owned", "post"), # 310 - Couldn't find what would trigger this 113 | ("flow/42", "delete"), # 320 114 | ("run/42", "delete"), # 400 115 | ("task/42", "delete"), # 460 116 | ], 117 | ) 118 | def 
test_authentication_endpoints_requiring_api_key_show_relevant_help_link( 119 | endpoint: str, 120 | method: str, 121 | ) -> None: 122 | # We need to temporarily disable the API key to test the error message 123 | with openml.config.overwrite_config_context({"apikey": None}): 124 | with pytest.raises(openml.exceptions.OpenMLNotAuthorizedError, match=API_TOKEN_HELP_LINK): 125 | openml._api_calls._perform_api_call(call=endpoint, request_method=method, data=None) 126 | -------------------------------------------------------------------------------- /examples/_external_or_deprecated/2018_ida_strang_example.py: -------------------------------------------------------------------------------- 1 | """ 2 | Strang et al. (2018) 3 | ==================== 4 | 5 | A tutorial on how to reproduce the analysis conducted for *Don't Rule Out Simple Models 6 | Prematurely: A Large Scale Benchmark Comparing Linear and Non-linear Classifiers in OpenML*. 7 | 8 | Publication 9 | ~~~~~~~~~~~ 10 | 11 | | Don't Rule Out Simple Models Prematurely: A Large Scale Benchmark Comparing Linear and Non-linear Classifiers in OpenML 12 | | Benjamin Strang, Peter van der Putten, Jan N. van Rijn and Frank Hutter 13 | | In *Advances in Intelligent Data Analysis XVII 17th International Symposium*, 2018 14 | | Available at https://link.springer.com/chapter/10.1007%2F978-3-030-01768-2_25 15 | """ 16 | 17 | # License: BSD 3-Clause 18 | 19 | import matplotlib.pyplot as plt 20 | import openml 21 | import pandas as pd 22 | 23 | ############################################################################## 24 | # A basic step for each data-mining or machine learning task is to determine 25 | # which model to choose based on the problem and the data at hand. In this 26 | # work we investigate when non-linear classifiers outperform linear 27 | # classifiers by means of a large scale experiment. 28 | # 29 | # The paper is accompanied with a study object, containing all relevant tasks 30 | # and runs (``study_id=123``). The paper features three experiment classes: 31 | # Support Vector Machines (SVM), Neural Networks (NN) and Decision Trees (DT). 32 | # This example demonstrates how to reproduce the plots, comparing two 33 | # classifiers given the OpenML flow ids. Note that this allows us to reproduce 34 | # the SVM and NN experiment, but not the DT experiment, as this requires a bit 35 | # more effort to distinguish the same flow with different hyperparameter 36 | # values. 
37 | 38 | study_id = 123 39 | # for comparing svms: flow_ids = [7754, 7756] 40 | # for comparing nns: flow_ids = [7722, 7729] 41 | # for comparing dts: flow_ids = [7725], differentiate on hyper-parameter value 42 | classifier_family = "SVM" 43 | flow_ids = [7754, 7756] 44 | measure = "predictive_accuracy" 45 | meta_features = ["NumberOfInstances", "NumberOfFeatures"] 46 | class_values = ["non-linear better", "linear better", "equal"] 47 | 48 | # Downloads all evaluation records related to this study 49 | evaluations = openml.evaluations.list_evaluations( 50 | measure, size=None, flows=flow_ids, study=study_id, output_format="dataframe" 51 | ) 52 | # gives us a table with columns data_id, flow1_value, flow2_value 53 | evaluations = evaluations.pivot(index="data_id", columns="flow_id", values="value").dropna() 54 | # downloads all data qualities (for scatter plot) 55 | data_qualities = openml.datasets.list_datasets( 56 | data_id=list(evaluations.index.values), output_format="dataframe" 57 | ) 58 | # removes irrelevant data qualities 59 | data_qualities = data_qualities[meta_features] 60 | # makes a join between evaluation table and data qualities table, 61 | # now we have columns data_id, flow1_value, flow2_value, meta_feature_1, 62 | # meta_feature_2 63 | evaluations = evaluations.join(data_qualities, how="inner") 64 | 65 | # adds column that indicates the difference between the two classifiers 66 | evaluations["diff"] = evaluations[flow_ids[0]] - evaluations[flow_ids[1]] 67 | 68 | 69 | ############################################################################## 70 | # makes the s-plot 71 | 72 | fig_splot, ax_splot = plt.subplots() 73 | ax_splot.plot(range(len(evaluations)), sorted(evaluations["diff"])) 74 | ax_splot.set_title(classifier_family) 75 | ax_splot.set_xlabel("Dataset (sorted)") 76 | ax_splot.set_ylabel("difference between linear and non-linear classifier") 77 | ax_splot.grid(linestyle="--", axis="y") 78 | plt.show() 79 | 80 | 81 | ############################################################################## 82 | # adds column that indicates the difference between the two classifiers, 83 | # needed for the scatter plot 84 | 85 | 86 | def determine_class(val_lin, val_nonlin): 87 | if val_lin < val_nonlin: 88 | return class_values[0] 89 | elif val_nonlin < val_lin: 90 | return class_values[1] 91 | else: 92 | return class_values[2] 93 | 94 | 95 | evaluations["class"] = evaluations.apply( 96 | lambda row: determine_class(row[flow_ids[0]], row[flow_ids[1]]), axis=1 97 | ) 98 | 99 | # does the plotting and formatting 100 | fig_scatter, ax_scatter = plt.subplots() 101 | for class_val in class_values: 102 | df_class = evaluations[evaluations["class"] == class_val] 103 | plt.scatter(df_class[meta_features[0]], df_class[meta_features[1]], label=class_val) 104 | ax_scatter.set_title(classifier_family) 105 | ax_scatter.set_xlabel(meta_features[0]) 106 | ax_scatter.set_ylabel(meta_features[1]) 107 | ax_scatter.legend() 108 | ax_scatter.set_xscale("log") 109 | ax_scatter.set_yscale("log") 110 | plt.show() 111 | 112 | ############################################################################## 113 | # makes a scatter plot where each data point represents the performance of the 114 | # two algorithms on various axis (not in the paper) 115 | 116 | fig_diagplot, ax_diagplot = plt.subplots() 117 | ax_diagplot.grid(linestyle="--") 118 | ax_diagplot.plot([0, 1], ls="-", color="black") 119 | ax_diagplot.plot([0.2, 1.2], ls="--", color="black") 120 | ax_diagplot.plot([-0.2, 0.8], ls="--", 
color="black") 121 | ax_diagplot.scatter(evaluations[flow_ids[0]], evaluations[flow_ids[1]]) 122 | ax_diagplot.set_xlabel(measure) 123 | ax_diagplot.set_ylabel(measure) 124 | plt.show() 125 | -------------------------------------------------------------------------------- /examples/_external_or_deprecated/benchmark_with_optunahub.py: -------------------------------------------------------------------------------- 1 | """ 2 | ==================================================== 3 | Hyperparameter Optimization Benchmark with OptunaHub 4 | ==================================================== 5 | 6 | In this tutorial, we walk through how to conduct hyperparameter optimization experiments using OpenML and OptunaHub. 7 | """ 8 | ############################################################################ 9 | # Please make sure to install the dependencies with: 10 | # ``pip install "openml>=0.15.1" plotly`` 11 | # Then we import all the necessary modules. 12 | 13 | # License: BSD 3-Clause 14 | 15 | import logging 16 | 17 | import optuna 18 | from sklearn.compose import ColumnTransformer 19 | from sklearn.ensemble import RandomForestClassifier 20 | from sklearn.impute import SimpleImputer 21 | from sklearn.pipeline import Pipeline 22 | from sklearn.preprocessing import OneHotEncoder 23 | 24 | import openml 25 | 26 | logger = logging.Logger(name="Experiment Logger", level=1) 27 | 28 | #
29 | # <div class="admonition warning"><p class="admonition-title">Warning</p>
30 | # <p>
31 | # For the rest of this tutorial, we will require the `openml-sklearn` package.
32 | # Install it with `pip install openml-sklearn`.
33 | # </p>
34 | # </div>
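The `openml-sklearn` package plugs into the extension registry shown in `openml/extensions/functions.py` earlier in this document. As a minimal sketch, and assuming (as is conventional for OpenML extensions) that importing the package registers its extension class, model objects then resolve to the extension automatically:

```python
import openml.extensions
from openml_sklearn import SklearnExtension  # assumed to register itself on import

from sklearn.ensemble import RandomForestClassifier

# Defensive sketch: register manually only if the import did not already do so.
if SklearnExtension not in openml.extensions.extensions:
    openml.extensions.register_extension(SklearnExtension)

ext = openml.extensions.get_extension_by_model(
    RandomForestClassifier(), raise_if_no_extension=True
)
print(type(ext).__name__)  # SklearnExtension
```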
35 | 
36 | # %%
37 | # Get the sklearn extension to run sklearn models easily on OpenML tasks.
38 | from openml_sklearn import SklearnExtension, cat, cont
39 | 
40 | extension = SklearnExtension()
41 | 
42 | # Set your OpenML API key if you want to upload your results to OpenML (e.g.:
43 | # https://openml.org/search?type=run&sort=date). To get one, simply make an
44 | # account (you don't need one for anything else, just to upload your results),
45 | # go to your profile and select the API-KEY.
46 | # Or log in, and navigate to https://www.openml.org/auth/api-key
47 | openml.config.apikey = ""
48 | ############################################################################
49 | # Prepare the preprocessors and an OpenML task
50 | # ============================================
51 | 
52 | # OpenML contains several key concepts which it needs to make machine learning research shareable.
53 | # A machine learning experiment consists of one or several runs, which describe the performance of
54 | # an algorithm (called a flow in OpenML) with its hyperparameter settings (called a setup) on a task.
55 | # A task is the combination of a dataset, a split, and an evaluation metric. We choose a dataset from
56 | # OpenML (https://www.openml.org/d/1464) and a subsequent task (https://www.openml.org/t/10101). To
57 | # make your own dataset and task, please refer to
58 | # https://openml.github.io/openml-python/main/examples/30_extended/create_upload_tutorial.html
59 | 
60 | # https://www.openml.org/search?type=study&study_type=task&id=218
61 | task_id = 10101
62 | seed = 42
63 | categorical_preproc = (
64 |     "categorical",
65 |     OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
66 |     cat,
67 | )
68 | numerical_preproc = ("numerical", SimpleImputer(strategy="median"), cont)
69 | preproc = ColumnTransformer([categorical_preproc, numerical_preproc])
70 | 
71 | ############################################################################
72 | # Define a pipeline for the hyperparameter optimization (this is standard for Optuna)
73 | # =====================================================
74 | 
75 | # Optuna explanation
76 | # We follow the `Optuna `__ search space design.
77 | 
78 | # OpenML runs
79 | # We can simply pass the parametrized classifier to `run_model_on_task` to obtain the performance
80 | # of the pipeline
81 | # on the specified OpenML task.
82 | # If you want to share your results along with an easily reproducible pipeline, you can set an API
83 | # key and just upload your results.
84 | # You can find more examples on https://www.openml.org/ 85 | 86 | 87 | def objective(trial: optuna.Trial) -> Pipeline: 88 | clf = RandomForestClassifier( 89 | max_depth=trial.suggest_int("max_depth", 2, 32, log=True), 90 | min_samples_leaf=trial.suggest_float("min_samples_leaf", 0.0, 1.0), 91 | random_state=seed, 92 | ) 93 | pipe = Pipeline(steps=[("preproc", preproc), ("model", clf)]) 94 | logger.log(1, f"Running pipeline - {pipe}") 95 | run = openml.runs.run_model_on_task(pipe, task=task_id, avoid_duplicate_runs=False) 96 | 97 | logger.log(1, f"Model has been trained - {run}") 98 | if openml.config.apikey != "": 99 | try: 100 | run.publish() 101 | 102 | logger.log(1, f"Run was uploaded to - {run.openml_url}") 103 | except Exception as e: 104 | logger.log(1, f"Could not publish run - {e}") 105 | else: 106 | logger.log( 107 | 0, 108 | "If you want to publish your results to OpenML, please set an apikey", 109 | ) 110 | accuracy = max(run.fold_evaluations["predictive_accuracy"][0].values()) 111 | logger.log(0, f"Accuracy {accuracy}") 112 | 113 | return accuracy 114 | 115 | 116 | ############################################################################ 117 | # Optimize the pipeline 118 | # ===================== 119 | study = optuna.create_study(direction="maximize") 120 | logger.log(0, f"Study {study}") 121 | study.optimize(objective, n_trials=15) 122 | 123 | ############################################################################ 124 | # Visualize the optimization history 125 | # ================================== 126 | fig = optuna.visualization.plot_optimization_history(study) 127 | fig.show() 128 | -------------------------------------------------------------------------------- /examples/Advanced/datasets_tutorial.py: -------------------------------------------------------------------------------- 1 | # %% [markdown] 2 | # How to list and download datasets. 3 | 4 | # %% 5 | import pandas as pd 6 | 7 | import openml 8 | from openml.datasets import edit_dataset, fork_dataset, get_dataset 9 | 10 | # %% [markdown] 11 | # ## Exercise 0 12 | # 13 | # * List datasets and return a dataframe 14 | 15 | # %% 16 | datalist = openml.datasets.list_datasets() 17 | datalist = datalist[["did", "name", "NumberOfInstances", "NumberOfFeatures", "NumberOfClasses"]] 18 | 19 | print(f"First 10 of {len(datalist)} datasets...") 20 | datalist.head(n=10) 21 | 22 | # The same can be done with lesser lines of code 23 | openml_df = openml.datasets.list_datasets() 24 | openml_df.head(n=10) 25 | 26 | # %% [markdown] 27 | # ## Exercise 1 28 | # 29 | # * Find datasets with more than 10000 examples. 30 | # * Find a dataset called 'eeg_eye_state'. 31 | # * Find all datasets with more than 50 classes. 32 | 33 | # %% 34 | datalist[datalist.NumberOfInstances > 10000].sort_values(["NumberOfInstances"]).head(n=20) 35 | 36 | # %% 37 | datalist.query('name == "eeg-eye-state"') 38 | 39 | # %% 40 | datalist.query("NumberOfClasses > 50") 41 | 42 | # %% [markdown] 43 | # ## Download datasets 44 | 45 | # %% 46 | # This is done based on the dataset ID. 47 | dataset = openml.datasets.get_dataset(dataset_id="eeg-eye-state", version=1) 48 | 49 | # Print a summary 50 | print( 51 | f"This is dataset '{dataset.name}', the target feature is '{dataset.default_target_attribute}'" 52 | ) 53 | print(f"URL: {dataset.url}") 54 | print(dataset.description[:500]) 55 | 56 | # %% [markdown] 57 | # Get the actual data. 
58 | # 59 | # openml-python returns data as pandas dataframes (stored in the `eeg` variable below), 60 | # and also some additional metadata that we don't care about right now. 61 | 62 | # %% 63 | eeg, *_ = dataset.get_data() 64 | 65 | # %% [markdown] 66 | # You can optionally choose to have openml separate out a column from the 67 | # dataset. In particular, many datasets for supervised problems have a set 68 | # `default_target_attribute` which may help identify the target variable. 69 | 70 | # %% 71 | X, y, categorical_indicator, attribute_names = dataset.get_data( 72 | target=dataset.default_target_attribute 73 | ) 74 | print(X.head()) 75 | print(X.info()) 76 | 77 | # %% [markdown] 78 | # Sometimes you only need access to a dataset's metadata. 79 | # In those cases, you can download the dataset without downloading the 80 | # data file. The dataset object can be used as normal. 81 | # Whenever you use any functionality that requires the data, 82 | # such as `get_data`, the data will be downloaded. 83 | # Starting from 0.15, not downloading data will be the default behavior instead. 84 | # The data will be downloading automatically when you try to access it through 85 | # openml objects, e.g., using `dataset.features`. 86 | 87 | # %% 88 | dataset = openml.datasets.get_dataset(1471) 89 | 90 | # %% [markdown] 91 | # ## Exercise 2 92 | # * Explore the data visually. 93 | 94 | # %% 95 | eegs = eeg.sample(n=1000) 96 | _ = pd.plotting.scatter_matrix( 97 | X.iloc[:100, :4], 98 | c=y[:100], 99 | figsize=(10, 10), 100 | marker="o", 101 | hist_kwds={"bins": 20}, 102 | alpha=0.8, 103 | cmap="plasma", 104 | ) 105 | 106 | 107 | # %% [markdown] 108 | # ## Edit a created dataset 109 | # This example uses the test server, to avoid editing a dataset on the main server. 110 | 111 | # %% 112 | openml.config.start_using_configuration_for_example() 113 | # %% [markdown] 114 | # Edit non-critical fields, allowed for all authorized users: 115 | # description, creator, contributor, collection_date, language, citation, 116 | # original_data_url, paper_url 117 | 118 | # %% 119 | desc = ( 120 | "This data sets consists of 3 different types of irises' " 121 | "(Setosa, Versicolour, and Virginica) petal and sepal length," 122 | " stored in a 150x4 numpy.ndarray" 123 | ) 124 | did = 128 125 | data_id = edit_dataset( 126 | did, 127 | description=desc, 128 | creator="R.A.Fisher", 129 | collection_date="1937", 130 | citation="The use of multiple measurements in taxonomic problems", 131 | language="English", 132 | ) 133 | edited_dataset = get_dataset(data_id) 134 | print(f"Edited dataset ID: {data_id}") 135 | 136 | 137 | # %% [markdown] 138 | # Editing critical fields (default_target_attribute, row_id_attribute, ignore_attribute) is allowed 139 | # only for the dataset owner. Further, critical fields cannot be edited if the dataset has any 140 | # tasks associated with it. To edit critical fields of a dataset (without tasks) owned by you, 141 | # configure the API key: 142 | # openml.config.apikey = 'FILL_IN_OPENML_API_KEY' 143 | # This example here only shows a failure when trying to work on a dataset not owned by you: 144 | 145 | # %% 146 | try: 147 | data_id = edit_dataset(1, default_target_attribute="shape") 148 | except openml.exceptions.OpenMLServerException as e: 149 | print(e) 150 | 151 | # %% [markdown] 152 | # ## Fork dataset 153 | # Used to create a copy of the dataset with you as the owner. 
154 | # Use this API only if you are unable to edit the critical fields (default_target_attribute, 155 | # ignore_attribute, row_id_attribute) of a dataset through the edit_dataset API. 156 | # After the dataset is forked, you can edit the new version of the dataset using edit_dataset. 157 | 158 | # %% 159 | data_id = fork_dataset(1) 160 | print(data_id) 161 | data_id = edit_dataset(data_id, default_target_attribute="shape") 162 | print(f"Forked dataset ID: {data_id}") 163 | 164 | # %% 165 | openml.config.stop_using_configuration_for_example() 166 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: openml-python 2 | repo_url: https://github.com/openml/openml-python 3 | repo_name: openml/openml-python 4 | theme: 5 | logo: images/openml_icon.png 6 | favicon: images/openml_icon.png 7 | name: material 8 | features: 9 | - content.code.annotate 10 | - content.code.copy 11 | - navigation.footer 12 | - navigation.sections 13 | - toc.follow 14 | - toc.integrate 15 | - navigation.tabs 16 | - navigation.tabs.sticky 17 | - header.autohide 18 | - header.social 19 | - search.suggest 20 | - search.highlight 21 | - search.share 22 | palette: 23 | - scheme: slate 24 | media: "(prefers-color-scheme: dark)" 25 | primary: indigo 26 | accent: deep purple 27 | toggle: 28 | icon: material/eye-outline 29 | name: Switch to light mode 30 | 31 | # Palette toggle for light mode 32 | - scheme: default 33 | media: "(prefers-color-scheme: light)" 34 | primary: indigo 35 | accent: deep purple 36 | toggle: 37 | icon: material/eye 38 | name: Switch to dark mode 39 | 40 | extra_css: 41 | - stylesheets/extra.css 42 | 43 | nav: 44 | - index.md 45 | - Examples: 46 | - Overview: examples/introduction.py 47 | - Basics: 48 | - Setup: examples/Basics/introduction_tutorial.py 49 | - Datasets: examples/Basics/simple_datasets_tutorial.py 50 | - Tasks: examples/Basics/simple_tasks_tutorial.py 51 | - Flows and Runs: examples/Basics/simple_flows_and_runs_tutorial.py 52 | - Suites: examples/Basics/simple_suites_tutorial.py 53 | - Advanced: 54 | - Dataset Splits from Tasks: examples/Advanced/task_manual_iteration_tutorial.py 55 | - Creating and Uploading Datasets: examples/Advanced/create_upload_tutorial.py 56 | - Searching and Editing Datasets: examples/Advanced/datasets_tutorial.py 57 | - Searching and Creating Tasks: examples/Advanced/tasks_tutorial.py 58 | - List, Download, and Upload Suites: examples/Advanced/suites_tutorial.py 59 | - List, Download, and Upload Studies: examples/Advanced/study_tutorial.py 60 | - Downloading Evaluation Results: examples/Advanced/fetch_evaluations_tutorial.py 61 | - Configuring Logging: examples/Advanced/configure_logging.py 62 | 63 | 64 | - Extensions: extensions.md 65 | - Advanced User Guide: details.md 66 | - API: reference/ 67 | - Contributing: contributing.md 68 | 69 | markdown_extensions: 70 | - pymdownx.highlight: 71 | anchor_linenums: true 72 | - pymdownx.superfences 73 | - attr_list 74 | - admonition 75 | - tables 76 | - attr_list 77 | - md_in_html 78 | - toc: 79 | permalink: "#" 80 | - pymdownx.highlight: 81 | anchor_linenums: true 82 | - pymdownx.magiclink: 83 | hide_protocol: true 84 | repo_url_shortener: true 85 | repo_url_shorthand: true 86 | user: openml 87 | repo: openml-python 88 | - pymdownx.highlight 89 | - pymdownx.inlinehilite 90 | - pymdownx.snippets 91 | - pymdownx.details 92 | - pymdownx.tabbed: 93 | alternate_style: true 94 | - 
pymdownx.superfences: 95 | custom_fences: 96 | - name: mermaid 97 | class: mermaid 98 | format: !!python/name:pymdownx.superfences.fence_code_format 99 | - pymdownx.emoji: 100 | emoji_index: !!python/name:material.extensions.emoji.twemoji 101 | emoji_generator: !!python/name:material.extensions.emoji.to_svg 102 | - pymdownx.tabbed: 103 | alternate_style: true 104 | 105 | extra: 106 | version: 107 | provider: mike 108 | social: 109 | - icon: fontawesome/brands/github 110 | link: https://github.com/openml 111 | - icon: fontawesome/brands/twitter 112 | link: https://x.com/open_ml 113 | 114 | plugins: 115 | - search 116 | - autorefs 117 | - section-index 118 | # - mkdocstrings: 119 | - mkdocstrings: 120 | default_handler: python 121 | enable_inventory: true 122 | handlers: 123 | python: 124 | # paths: [openml] 125 | options: # https://mkdocstrings.github.io/python/usage/ 126 | docstring_section_style: spacy 127 | docstring_options: 128 | ignore_init_summary: true 129 | trim_doctest_flags: true 130 | returns_multiple_items: false 131 | show_docstring_attributes: true 132 | show_docstring_description: true 133 | show_root_heading: true 134 | show_root_toc_entry: true 135 | show_object_full_path: false 136 | show_root_members_full_path: false 137 | signature_crossrefs: true 138 | merge_init_into_class: true 139 | show_symbol_type_heading: true 140 | show_symbol_type_toc: true 141 | docstring_style: google 142 | inherited_members: true 143 | show_if_no_docstring: false 144 | show_bases: true 145 | show_source: true 146 | members_order: "alphabetical" 147 | group_by_category: true 148 | show_signature: true 149 | separate_signature: true 150 | show_signature_annotations: true 151 | filters: 152 | - "!^_[^_]" 153 | 154 | - gen-files: 155 | scripts: 156 | - scripts/gen_ref_pages.py 157 | - literate-nav: 158 | nav_file: SUMMARY.md 159 | - mkdocs-jupyter: 160 | theme: light 161 | - mike: 162 | version_selector: true 163 | css_dir: css 164 | javascript_dir: js 165 | canonical_version: latest 166 | -------------------------------------------------------------------------------- /docker/readme.md: -------------------------------------------------------------------------------- 1 | # OpenML Python Container 2 | 3 | This docker container has the latest version of openml-python downloaded and pre-installed. 4 | It can also be used by developers to run unit tests or build the docs in 5 | a fresh and/or isolated unix environment. 6 | This document contains information about: 7 | 8 | 1. [Usage](#usage): how to use the image and its main modes. 9 | 2. [Using local or remote code](#using-local-or-remote-code): useful when testing your own latest changes. 10 | 3. [Versions](#versions): identify which image to use. 11 | 4. [Development](#for-developers): information about the Docker image for developers. 12 | 13 | *note:* each docker image is shipped with a readme, which you can read with: 14 | `docker run --entrypoint=/bin/cat openml/openml-python:TAG readme.md` 15 | 16 | ## Usage 17 | 18 | There are three main ways to use the image: running a pre-installed Python environment, 19 | running tests, and building documentation. 
20 | 21 | ### Running `Python` with pre-installed `OpenML-Python` (default): 22 | 23 | To run `Python` with a pre-installed `OpenML-Python` environment run: 24 | 25 | ```text 26 | docker run -it openml/openml-python 27 | ``` 28 | 29 | This accepts the normal `Python` arguments, e.g.: 30 | 31 | ```text 32 | docker run openml/openml-python -c "import openml; print(openml.__version__)" 33 | ``` 34 | 35 | If you want to run a local script, it needs to be mounted first. Mount it into the 36 | `openml` folder: 37 | 38 | ```text 39 | docker run -v PATH/TO/FILE:/openml/MY_SCRIPT.py openml/openml-python MY_SCRIPT.py 40 | ``` 41 | 42 | ### Running unit tests 43 | 44 | You can run the unit tests by passing `test` as the first argument. 45 | It also requires a local or remote repository to be specified, which is explained 46 | [below](#using-local-or-remote-code). For this example, we specify to test the 47 | `develop` branch: 48 | 49 | ```text 50 | docker run openml/openml-python test develop 51 | ``` 52 | 53 | ### Building documentation 54 | 55 | You can build the documentation by passing `doc` as the first argument. 56 | You should [mount](https://docs.docker.com/storage/bind-mounts/#start-a-container-with-a-bind-mount) 57 | an output directory in which the docs will be stored. You also need to provide a remote 58 | or local repository as explained in [the section below](#using-local-or-remote-code). 59 | In this example, we build documentation for the `develop` branch. 60 | On Windows: 61 | 62 | ```text 63 | docker run --mount type=bind,source="E:\\files/output",destination="/output" openml/openml-python doc develop 64 | ``` 65 | 66 | On Linux: 67 | ```text 68 | docker run --mount type=bind,source="./output",destination="/output" openml/openml-python doc develop 69 | ``` 70 | 71 | See [the section below](#using-local-or-remote-code) for running against local changes 72 | or a remote branch. 73 | 74 | *Note: you can forgo mounting an output directory to test whether the docs build successfully, 75 | but the result will only be available within the docker container under `/openml/docs/build`.* 76 | 77 | ## Using local or remote code 78 | 79 | You can build docs or run tests against your local repository or a GitHub repository. 80 | In the examples below, change the `source` to match the location of your local repository. 81 | 82 | ### Using a local repository 83 | 84 | To use a local directory, mount it in the `/code` directory. On Windows: 85 | 86 | ```text 87 | docker run --mount type=bind,source="E:\\repositories/openml-python",destination="/code" openml/openml-python test 88 | ``` 89 | 90 | On Linux: 91 | ```text 92 | docker run --mount type=bind,source="/Users/pietergijsbers/repositories/openml-python",destination="/code" openml/openml-python test 93 | ``` 94 | 95 | When building docs, you also need to mount an output directory as shown above, so add both: 96 | 97 | ```text 98 | docker run --mount type=bind,source="./output",destination="/output" --mount type=bind,source="/Users/pietergijsbers/repositories/openml-python",destination="/code" openml/openml-python doc 99 | ``` 100 | 101 | ### Using a GitHub repository 102 | Building from a remote repository requires you to specify a branch.
103 | The branch may be specified by name directly if it exists on the original repository (https://github.com/openml/openml-python/): 104 | 105 | docker run --mount type=bind,source=PATH_TO_OUTPUT,destination=/output openml/openml-python [test,doc] BRANCH 106 | 107 | where `BRANCH` is the name of the branch to test or to generate documentation for. 108 | It is also possible to build the documentation from a branch on a fork; 109 | in this case, `BRANCH` should be specified as `GITHUB_NAME#BRANCH` (e.g. 110 | `PGijsbers#my_feature_branch`) and the name of the forked repository should be `openml-python`. 111 | 112 | ## For developers 113 | This section contains some notes about the structure of the image, 114 | intended for those who want to work on it. 115 | 116 | ### Added Directories 117 | The `openml/openml-python` image is built on a vanilla `python:3` image. 118 | Additionally, it contains the following files and directories: 119 | 120 | - `/openml`: contains the openml-python repository in the state with which the image 121 | was built by default. If working with a `BRANCH`, this repository will be set to 122 | the `HEAD` of `BRANCH`. 123 | - `/openml/venv/`: contains the virtual environment used for `doc` and `test`. It has 124 | `openml-python` dependencies pre-installed. When invoked with `doc` or `test`, the 125 | dependencies will be updated based on the `setup.py` of the `BRANCH` or mounted `/code`. 126 | - `/scripts/startup.sh`: the entrypoint of the image. Takes care of the automated features (e.g. `doc` and `test`). 127 | 128 | ## Building the image 129 | To build the image yourself, execute `docker build -f Dockerfile .` from the `docker` 130 | directory of the `openml-python` repository. It will use the `startup.sh` as is, so any 131 | local changes will be present in the image. 132 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | push: 7 | branches: 8 | - main 9 | - develop 10 | tags: 11 | - "v*.*.*" 12 | 13 | pull_request: 14 | branches: 15 | - main 16 | - develop 17 | 18 | concurrency: 19 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} 20 | cancel-in-progress: true 21 | 22 | jobs: 23 | test: 24 | name: (${{ matrix.os }}, Py${{ matrix.python-version }}, sk${{ matrix.scikit-learn }}, sk-only:${{ matrix.sklearn-only }}) 25 | runs-on: ${{ matrix.os }} 26 | strategy: 27 | matrix: 28 | python-version: ["3.9"] 29 | scikit-learn: ["1.0.*", "1.1.*", "1.2.*", "1.3.*", "1.4.*", "1.5.*"] 30 | os: [ubuntu-latest] 31 | sklearn-only: ["true"] 32 | include: 33 | - os: ubuntu-latest 34 | python-version: "3.8" # no scikit-learn 0.23 release for Python 3.9 35 | scikit-learn: "0.23.1" 36 | sklearn-only: "true" 37 | # scikit-learn 0.24 relies on scipy defaults, so we need to fix the version 38 | # c.f.
https://github.com/openml/openml-python/pull/1267 39 | - os: ubuntu-latest 40 | python-version: "3.9" 41 | scikit-learn: "0.24" 42 | scipy: "1.10.0" 43 | sklearn-only: "true" 44 | # Do a Windows and Ubuntu test for _all_ openml functionality 45 | # I am not sure why these are on 3.8 and older scikit-learn 46 | - os: windows-latest 47 | python-version: "3.8" 48 | scikit-learn: 0.24.* 49 | scipy: "1.10.0" 50 | sklearn-only: 'false' 51 | # Include a code cov version 52 | - os: ubuntu-latest 53 | code-cov: true 54 | python-version: "3.8" 55 | scikit-learn: 0.23.1 56 | sklearn-only: 'false' 57 | fail-fast: false 58 | 59 | steps: 60 | - uses: actions/checkout@v4 61 | with: 62 | fetch-depth: 2 63 | - name: Setup Python ${{ matrix.python-version }} 64 | if: matrix.os != 'windows-latest' # windows-latest only uses preinstalled Python (3.9.13) 65 | uses: actions/setup-python@v5 66 | with: 67 | python-version: ${{ matrix.python-version }} 68 | - name: Install test dependencies 69 | run: | 70 | python -m pip install --upgrade pip 71 | pip install -e .[test] 72 | - name: Install scikit-learn ${{ matrix.scikit-learn }} 73 | run: | 74 | pip install scikit-learn==${{ matrix.scikit-learn }} 75 | - name: Install numpy for Python 3.8 76 | # Python 3.8 & scikit-learn<0.24 requires numpy<=1.23.5 77 | if: ${{ matrix.python-version == '3.8' && matrix.scikit-learn == '0.23.1' }} 78 | run: | 79 | pip install numpy==1.23.5 80 | - name: "Install NumPy 1.x and SciPy <1.11 for scikit-learn < 1.4" 81 | if: ${{ contains(fromJSON('["1.0.*", "1.1.*", "1.2.*", "1.3.*"]'), matrix.scikit-learn) }} 82 | run: | 83 | # scipy has a change to the 'mode' behavior which breaks scikit-learn < 1.4 84 | # numpy 2.0 has several breaking changes 85 | pip install "numpy<2.0" "scipy<1.11" 86 | - name: Install scipy ${{ matrix.scipy }} 87 | if: ${{ matrix.scipy }} 88 | run: | 89 | pip install scipy==${{ matrix.scipy }} 90 | - name: Store repository status 91 | id: status-before 92 | if: matrix.os != 'windows-latest' 93 | run: | 94 | git_status=$(git status --porcelain -b) 95 | echo "BEFORE=$git_status" >> $GITHUB_ENV 96 | echo "Repository status before tests: $git_status" 97 | - name: Show installed dependencies 98 | run: python -m pip list 99 | - name: Run tests on Ubuntu Test 100 | if: matrix.os == 'ubuntu-latest' 101 | run: | 102 | if [ ${{ matrix.code-cov }} ]; then codecov='--cov=openml --long --cov-report=xml'; fi 103 | # Most of the time, running only the scikit-learn tests is sufficient 104 | if [ ${{ matrix.sklearn-only }} = 'true' ]; then marks='sklearn and not production'; else marks='not production'; fi 105 | echo pytest -n 4 --durations=20 --dist load -sv $codecov -o log_cli=true -m "$marks" 106 | pytest -n 4 --durations=20 --dist load -sv $codecov -o log_cli=true -m "$marks" 107 | - name: Run tests on Ubuntu Production 108 | if: matrix.os == 'ubuntu-latest' 109 | run: | 110 | if [ ${{ matrix.code-cov }} ]; then codecov='--cov=openml --long --cov-report=xml'; fi 111 | # Most of the time, running only the scikit-learn tests is sufficient 112 | if [ ${{ matrix.sklearn-only }} = 'true' ]; then marks='sklearn and production'; else marks='production'; fi 113 | echo pytest -n 4 --durations=20 --dist load -sv $codecov -o log_cli=true -m "$marks" 114 | pytest -n 4 --durations=20 --dist load -sv $codecov -o log_cli=true -m "$marks" 115 | - name: Run tests on Windows 116 | if: matrix.os == 'windows-latest' 117 | run: | # we need a separate step because of the bash-specific if-statement in the previous one. 
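# Note: the --reruns/--reruns-delay flags below come from the pytest-rerunfailures
# plugin (assumed here to be installed via the test dependencies); they retry flaky tests.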
118 | pytest -n 4 --durations=20 --dist load -sv --reruns 5 --reruns-delay 1 119 | - name: Check for files left behind by test 120 | if: matrix.os != 'windows-latest' && always() 121 | run: | 122 | before="${{ env.BEFORE }}" 123 | after="$(git status --porcelain -b)" 124 | if [[ "$before" != "$after" ]]; then 125 | echo "git status from before: $before" 126 | echo "git status from after: $after" 127 | echo "Not all generated files have been deleted!" 128 | exit 1 129 | fi 130 | - name: Upload coverage 131 | if: matrix.code-cov && always() 132 | uses: codecov/codecov-action@v4 133 | with: 134 | files: coverage.xml 135 | token: ${{ secrets.CODECOV_TOKEN }} 136 | fail_ci_if_error: true 137 | verbose: true 138 | -------------------------------------------------------------------------------- /openml/setups/setup.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | from __future__ import annotations 3 | 4 | from typing import Any 5 | 6 | import openml.config 7 | import openml.flows 8 | 9 | 10 | class OpenMLSetup: 11 | """Setup object (a.k.a. Configuration). 12 | 13 | Parameters 14 | ---------- 15 | setup_id : int 16 | The OpenML setup id 17 | flow_id : int 18 | The flow it is built upon 19 | parameters : dict 20 | The setting of the parameters 21 | """ 22 | 23 | def __init__(self, setup_id: int, flow_id: int, parameters: dict[int, Any] | None): 24 | if not isinstance(setup_id, int): 25 | raise ValueError("setup id should be int") 26 | 27 | if not isinstance(flow_id, int): 28 | raise ValueError("flow id should be int") 29 | 30 | if parameters is not None and not isinstance(parameters, dict): 31 | raise ValueError("parameters should be dict") 32 | 33 | self.setup_id = setup_id 34 | self.flow_id = flow_id 35 | self.parameters = parameters 36 | 37 | def _to_dict(self) -> dict[str, Any]: 38 | return { 39 | "setup_id": self.setup_id, 40 | "flow_id": self.flow_id, 41 | "parameters": {p.id: p._to_dict() for p in self.parameters.values()} 42 | if self.parameters is not None 43 | else None, 44 | } 45 | 46 | def __repr__(self) -> str: 47 | header = "OpenML Setup" 48 | header = f"{header}\n{'=' * len(header)}\n" 49 | 50 | fields = { 51 | "Setup ID": self.setup_id, 52 | "Flow ID": self.flow_id, 53 | "Flow URL": openml.flows.OpenMLFlow.url_for_id(self.flow_id), 54 | "# of Parameters": ( 55 | len(self.parameters) if self.parameters is not None else float("nan") 56 | ), 57 | } 58 | 59 | # determines the order in which the information will be printed 60 | order = ["Setup ID", "Flow ID", "Flow URL", "# of Parameters"] 61 | _fields = [(key, fields[key]) for key in order if key in fields] 62 | 63 | longest_field_name_length = max(len(name) for name, _ in _fields) 64 | field_line_format = f"{{:.<{longest_field_name_length}}}: {{}}" 65 | body = "\n".join(field_line_format.format(name, value) for name, value in _fields) 66 | return header + body 67 | 68 | 69 | class OpenMLParameter: 70 | """Parameter object (used in setup). 71 | 72 | Parameters 73 | ---------- 74 | input_id : int 75 | The input id from the openml database 76 | flow_id : int 77 | The flow with which this parameter is associated 78 | flow_name : str 79 | The name of the flow (no version number) with which this parameter 80 | is associated 81 | full_name : str 82 | The name of the flow and parameter combined 83 | parameter_name : str 84 | The name of the parameter 85 | data_type : str 86 | The datatype of the parameter.
Generally unused for sklearn flows 87 | default_value : str 88 | The default value. For sklearn parameters, this is unknown and a 89 | default value is selected arbitrarily 90 | value : str 91 | If the parameter was set, the value that it was set to. 92 | """ 93 | 94 | def __init__( # noqa: PLR0913 95 | self, 96 | input_id: int, 97 | flow_id: int, 98 | flow_name: str, 99 | full_name: str, 100 | parameter_name: str, 101 | data_type: str, 102 | default_value: str, 103 | value: str, 104 | ): 105 | self.id = input_id 106 | self.flow_id = flow_id 107 | self.flow_name = flow_name 108 | self.full_name = full_name 109 | self.parameter_name = parameter_name 110 | self.data_type = data_type 111 | self.default_value = default_value 112 | self.value = value 113 | 114 | def _to_dict(self) -> dict[str, Any]: 115 | return { 116 | "id": self.id, 117 | "flow_id": self.flow_id, 118 | "flow_name": self.flow_name, 119 | "full_name": self.full_name, 120 | "parameter_name": self.parameter_name, 121 | "data_type": self.data_type, 122 | "default_value": self.default_value, 123 | "value": self.value, 124 | } 125 | 126 | def __repr__(self) -> str: 127 | header = "OpenML Parameter" 128 | header = f"{header}\n{'=' * len(header)}\n" 129 | 130 | fields = { 131 | "ID": self.id, 132 | "Flow ID": self.flow_id, 133 | # "Flow Name": self.flow_name, 134 | "Flow Name": self.full_name, 135 | "Flow URL": openml.flows.OpenMLFlow.url_for_id(self.flow_id), 136 | "Parameter Name": self.parameter_name, 137 | } 138 | # indented prints for parameter attributes 139 | # indentation = 2 spaces + 1 | + 2 underscores 140 | indent = f"{' ' * 2}|{'_' * 2}" 141 | parameter_data_type = f"{indent}Data Type" 142 | fields[parameter_data_type] = self.data_type 143 | parameter_default = f"{indent}Default" 144 | fields[parameter_default] = self.default_value 145 | parameter_value = f"{indent}Value" 146 | fields[parameter_value] = self.value 147 | 148 | # determines the order in which the information will be printed 149 | order = [ 150 | "ID", 151 | "Flow ID", 152 | "Flow Name", 153 | "Flow URL", 154 | "Parameter Name", 155 | parameter_data_type, 156 | parameter_default, 157 | parameter_value, 158 | ] 159 | _fields = [(key, fields[key]) for key in order if key in fields] 160 | 161 | longest_field_name_length = max(len(name) for name, _ in _fields) 162 | field_line_format = f"{{:.<{longest_field_name_length}}}: {{}}" 163 | body = "\n".join(field_line_format.format(name, value) for name, value in _fields) 164 | return header + body 165 | -------------------------------------------------------------------------------- /tests/test_utils/test_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | import unittest.mock 5 | import pytest 6 | import openml 7 | from openml.testing import _check_dataset 8 | 9 | 10 | @pytest.fixture() 11 | def min_number_tasks_on_test_server() -> int: 12 | """After a reset at least 1068 tasks are on the test server""" 13 | return 1068 14 | 15 | 16 | @pytest.fixture() 17 | def min_number_datasets_on_test_server() -> int: 18 | """After a reset at least 127 datasets are on the test server""" 19 | return 127 20 | 21 | 22 | @pytest.fixture() 23 | def min_number_flows_on_test_server() -> int: 24 | """After a reset at least 15 flows are on the test server""" 25 | return 15 26 | 27 | 28 | @pytest.fixture() 29 | def min_number_setups_on_test_server() -> int: 30 | """After a reset at least 50 setups are on the test server""" 31 | return 50 32 |
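# NOTE: the fixtures in this module are lower bounds measured right after a
# test-server reset; other tests may create additional entities while the suite
# runs, which is why several listing tests below are marked as flaky.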
33 | 34 | @pytest.fixture() 35 | def min_number_runs_on_test_server() -> int: 36 | """After a reset at least 21 runs are on the test server""" 37 | return 21 38 | 39 | 40 | @pytest.fixture() 41 | def min_number_evaluations_on_test_server() -> int: 42 | """After a reset at least 8 evaluations are on the test server""" 43 | return 8 44 | 45 | 46 | def _mocked_perform_api_call(call, request_method): 47 | url = openml.config.server + "/" + call 48 | return openml._api_calls._download_text_file(url) 49 | 50 | 51 | @pytest.mark.server() 52 | def test_list_all(): 53 | openml.utils._list_all(listing_call=openml.tasks.functions._list_tasks) 54 | 55 | 56 | @pytest.mark.server() 57 | def test_list_all_for_tasks(min_number_tasks_on_test_server): 58 | tasks = openml.tasks.list_tasks(size=min_number_tasks_on_test_server) 59 | assert min_number_tasks_on_test_server == len(tasks) 60 | 61 | 62 | @pytest.mark.server() 63 | def test_list_all_with_multiple_batches(min_number_tasks_on_test_server): 64 | # By setting the batch size one lower than the minimum we guarantee at least two 65 | # batches and at the same time do as few batches (roundtrips) as possible. 66 | batch_size = min_number_tasks_on_test_server - 1 67 | batches = openml.utils._list_all( 68 | listing_call=openml.tasks.functions._list_tasks, 69 | batch_size=batch_size, 70 | ) 71 | assert len(batches) >= 2 72 | assert min_number_tasks_on_test_server <= sum(len(batch) for batch in batches) 73 | 74 | 75 | @pytest.mark.server() 76 | def test_list_all_for_datasets(min_number_datasets_on_test_server): 77 | datasets = openml.datasets.list_datasets( 78 | size=min_number_datasets_on_test_server, 79 | ) 80 | 81 | assert min_number_datasets_on_test_server == len(datasets) 82 | for dataset in datasets.to_dict(orient="index").values(): 83 | _check_dataset(dataset) 84 | 85 | 86 | @pytest.mark.server() 87 | def test_list_all_for_flows(min_number_flows_on_test_server): 88 | flows = openml.flows.list_flows(size=min_number_flows_on_test_server) 89 | assert min_number_flows_on_test_server == len(flows) 90 | 91 | 92 | @pytest.mark.server() 93 | @pytest.mark.flaky() # Other tests might need to upload runs first 94 | def test_list_all_for_setups(min_number_setups_on_test_server): 95 | # TODO apparently list_setups function does not support kwargs 96 | setups = openml.setups.list_setups(size=min_number_setups_on_test_server) 97 | assert min_number_setups_on_test_server == len(setups) 98 | 99 | 100 | @pytest.mark.server() 101 | @pytest.mark.flaky() # Other tests might need to upload runs first 102 | def test_list_all_for_runs(min_number_runs_on_test_server): 103 | runs = openml.runs.list_runs(size=min_number_runs_on_test_server) 104 | assert min_number_runs_on_test_server == len(runs) 105 | 106 | 107 | @pytest.mark.server() 108 | @pytest.mark.flaky() # Other tests might need to upload runs first 109 | def test_list_all_for_evaluations(min_number_evaluations_on_test_server): 110 | # TODO apparently list_evaluations function does not support kwargs 111 | evaluations = openml.evaluations.list_evaluations( 112 | function="predictive_accuracy", 113 | size=min_number_evaluations_on_test_server, 114 | ) 115 | assert min_number_evaluations_on_test_server == len(evaluations) 116 | 117 | 118 | @pytest.mark.server() 119 | @unittest.mock.patch("openml._api_calls._perform_api_call", side_effect=_mocked_perform_api_call) 120 | def test_list_all_few_results_available(_perform_api_call): 121 | datasets = openml.datasets.list_datasets(size=1000, data_name="iris", data_version=1) 122 | 
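# Only one dataset matches these filters, so a single (mocked) listing call should suffice.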
assert len(datasets) == 1, "only one iris dataset version 1 should be present" 123 | assert _perform_api_call.call_count == 1, "expect just one call to get one dataset" 124 | 125 | 126 | @unittest.skipIf(os.name == "nt", "https://github.com/openml/openml-python/issues/1033") 127 | @unittest.mock.patch("openml.config.get_cache_directory") 128 | def test__create_cache_directory(config_mock, tmp_path): 129 | config_mock.return_value = tmp_path 130 | openml.utils._create_cache_directory("abc") 131 | assert (tmp_path / "abc").exists() 132 | 133 | subdir = tmp_path / "def" 134 | subdir.mkdir() 135 | subdir.chmod(0o444) 136 | config_mock.return_value = subdir 137 | with pytest.raises( 138 | openml.exceptions.OpenMLCacheException, 139 | match="Cannot create cache directory", 140 | ): 141 | openml.utils._create_cache_directory("ghi") 142 | 143 | 144 | @pytest.mark.server() 145 | def test_correct_test_server_download_state(): 146 | """This test verifies that the test server downloads the data from the correct source. 147 | 148 | If this tests fails, it is highly likely that the test server is not configured correctly. 149 | Usually, this means that the test server is serving data from the task with the same ID from the production server. 150 | That is, it serves parquet files wrongly associated with the test server's task. 151 | """ 152 | task = openml.tasks.get_task(119) 153 | dataset = task.get_dataset() 154 | assert len(dataset.features) == dataset.get_data()[0].shape[1] 155 | -------------------------------------------------------------------------------- /examples/Advanced/task_manual_iteration_tutorial.py: -------------------------------------------------------------------------------- 1 | # %% [markdown] 2 | # Tasks define a target and a train/test split, which we can use for benchmarking. 3 | 4 | # %% 5 | import openml 6 | 7 | # %% [markdown] 8 | # For this tutorial we will use the famous King+Rook versus King+Pawn on A7 dataset, which has 9 | # the dataset ID 3 ([dataset on OpenML](https://www.openml.org/d/3)), and for which there exist 10 | # tasks with all important estimation procedures. It is small enough (less than 5000 samples) to 11 | # efficiently use it in an example. 12 | # 13 | # We will first start with ([task 233](https://www.openml.org/t/233)), which is a task with a 14 | # holdout estimation procedure. 15 | 16 | # %% 17 | task_id = 233 18 | task = openml.tasks.get_task(task_id) 19 | 20 | # %% [markdown] 21 | # Now that we have a task object we can obtain the number of repetitions, folds and samples as 22 | # defined by the task: 23 | 24 | # %% 25 | n_repeats, n_folds, n_samples = task.get_split_dimensions() 26 | 27 | # %% [markdown] 28 | # * ``n_repeats``: Number of times the model quality estimation is performed 29 | # * ``n_folds``: Number of folds per repeat 30 | # * ``n_samples``: How many data points to use. This is only relevant for learning curve tasks 31 | # 32 | # A list of all available estimation procedures is available 33 | # [here](https://www.openml.org/search?q=%2520measure_type%3Aestimation_procedure&type=measure). 34 | # 35 | # Task ``233`` is a simple task using the holdout estimation procedure and therefore has only a 36 | # single repeat, a single fold and a single sample size: 37 | 38 | # %% 39 | print( 40 | f"Task {task_id}: number of repeats: {n_repeats}, number of folds: {n_folds}, number of samples {n_samples}." 
41 | ) 42 | 43 | # %% [markdown] 44 | # We can now retrieve the train/test split for this combination of repeats, folds and number of 45 | # samples (indexing is zero-based). Usually, one would loop over all repeats, folds and sample 46 | # sizes, but we can neglect this here as there is only a single repetition. 47 | 48 | # %% 49 | train_indices, test_indices = task.get_train_test_split_indices( 50 | repeat=0, 51 | fold=0, 52 | sample=0, 53 | ) 54 | 55 | print(train_indices.shape, train_indices.dtype) 56 | print(test_indices.shape, test_indices.dtype) 57 | 58 | # %% [markdown] 59 | # And then split the data based on this: 60 | 61 | # %% 62 | X, y = task.get_X_and_y() 63 | X_train = X.iloc[train_indices] 64 | y_train = y.iloc[train_indices] 65 | X_test = X.iloc[test_indices] 66 | y_test = y.iloc[test_indices] 67 | 68 | print( 69 | f"X_train.shape: {X_train.shape}, y_train.shape: {y_train.shape}, X_test.shape: {X_test.shape}, y_test.shape: {y_test.shape}" 70 | ) 71 | 72 | # %% [markdown] 73 | # Obviously, we can also retrieve cross-validation versions of the dataset used in task ``233``: 74 | 75 | # %% 76 | task_id = 3 77 | task = openml.tasks.get_task(task_id) 78 | X, y = task.get_X_and_y() 79 | n_repeats, n_folds, n_samples = task.get_split_dimensions() 80 | print( 81 | f"Task {task_id}: number of repeats: {n_repeats}, number of folds: {n_folds}, number of samples {n_samples}." 82 | ) 83 | 84 | # %% [markdown] 85 | # And then perform the aforementioned iteration over all splits: 86 | 87 | # %% 88 | for repeat_idx in range(n_repeats): 89 | for fold_idx in range(n_folds): 90 | for sample_idx in range(n_samples): 91 | train_indices, test_indices = task.get_train_test_split_indices( 92 | repeat=repeat_idx, 93 | fold=fold_idx, 94 | sample=sample_idx, 95 | ) 96 | X_train = X.iloc[train_indices] 97 | y_train = y.iloc[train_indices] 98 | X_test = X.iloc[test_indices] 99 | y_test = y.iloc[test_indices] 100 | 101 | print( 102 | f"Repeat #{repeat_idx}, fold #{fold_idx}, samples {sample_idx}: X_train.shape: {X_train.shape}, " 103 | f"y_train.shape {y_train.shape}, X_test.shape {X_test.shape}, y_test.shape {y_test.shape}" 104 | ) 105 | 106 | # %% [markdown] 107 | # And also versions with multiple repeats: 108 | 109 | # %% 110 | task_id = 1767 111 | task = openml.tasks.get_task(task_id) 112 | X, y = task.get_X_and_y() 113 | n_repeats, n_folds, n_samples = task.get_split_dimensions() 114 | print( 115 | f"Task {task_id}: number of repeats: {n_repeats}, number of folds: {n_folds}, number of samples {n_samples}." 
116 | ) 117 | 118 | # %% [markdown] 119 | # And then again perform the aforementioned iteration over all splits: 120 | 121 | # %% 122 | for repeat_idx in range(n_repeats): 123 | for fold_idx in range(n_folds): 124 | for sample_idx in range(n_samples): 125 | train_indices, test_indices = task.get_train_test_split_indices( 126 | repeat=repeat_idx, 127 | fold=fold_idx, 128 | sample=sample_idx, 129 | ) 130 | X_train = X.iloc[train_indices] 131 | y_train = y.iloc[train_indices] 132 | X_test = X.iloc[test_indices] 133 | y_test = y.iloc[test_indices] 134 | 135 | print( 136 | f"Repeat #{repeat_idx}, fold #{fold_idx}, samples {sample_idx}: X_train.shape: {X_train.shape}, " 137 | f"y_train.shape {y_train.shape}, X_test.shape {X_test.shape}, y_test.shape {y_test.shape}" 138 | ) 139 | 140 | # %% [markdown] 141 | # And finally a task based on learning curves: 142 | 143 | # %% 144 | task_id = 1702 145 | task = openml.tasks.get_task(task_id) 146 | X, y = task.get_X_and_y() 147 | n_repeats, n_folds, n_samples = task.get_split_dimensions() 148 | print( 149 | f"Task {task_id}: number of repeats: {n_repeats}, number of folds: {n_folds}, number of samples {n_samples}." 150 | ) 151 | 152 | # %% [markdown] 153 | # And then again perform the aforementioned iteration over all splits: 154 | 155 | # %% 156 | for repeat_idx in range(n_repeats): 157 | for fold_idx in range(n_folds): 158 | for sample_idx in range(n_samples): 159 | train_indices, test_indices = task.get_train_test_split_indices( 160 | repeat=repeat_idx, 161 | fold=fold_idx, 162 | sample=sample_idx, 163 | ) 164 | X_train = X.iloc[train_indices] 165 | y_train = y.iloc[train_indices] 166 | X_test = X.iloc[test_indices] 167 | y_test = y.iloc[test_indices] 168 | 169 | print( 170 | f"Repeat #{repeat_idx}, fold #{fold_idx}, samples {sample_idx}: X_train.shape: {X_train.shape}, " 171 | f"y_train.shape {y_train.shape}, X_test.shape {X_test.shape}, y_test.shape {y_test.shape}" 172 | ) 173 | -------------------------------------------------------------------------------- /openml/base.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | from __future__ import annotations 3 | 4 | import re 5 | import webbrowser 6 | from abc import ABC, abstractmethod 7 | from typing import Iterable, Sequence 8 | 9 | import xmltodict 10 | 11 | import openml._api_calls 12 | import openml.config 13 | 14 | from .utils import _get_rest_api_type_alias, _tag_openml_base 15 | 16 | 17 | class OpenMLBase(ABC): 18 | """Base object for functionality that is shared across entities.""" 19 | 20 | def __repr__(self) -> str: 21 | body_fields = self._get_repr_body_fields() 22 | return self._apply_repr_template(body_fields) 23 | 24 | @property 25 | @abstractmethod 26 | def id(self) -> int | None: 27 | """The id of the entity, it is unique for its entity type.""" 28 | 29 | @property 30 | def openml_url(self) -> str | None: 31 | """The URL of the object on the server, if it was uploaded, else None.""" 32 | if self.id is None: 33 | return None 34 | return self.__class__.url_for_id(self.id) 35 | 36 | @classmethod 37 | def url_for_id(cls, id_: int) -> str: 38 | """Return the OpenML URL for the object of the class entity with the given id.""" 39 | # Sample url for a flow: openml.org/f/123 40 | return f"{openml.config.get_server_base_url()}/{cls._entity_letter()}/{id_}" 41 | 42 | @classmethod 43 | def _entity_letter(cls) -> str: 44 | """Return the letter which represents the entity type in urls, e.g. 
'f' for flow.""" 45 | # We take advantage of the class naming convention (OpenMLX), 46 | # which holds for all entities except studies and tasks, which overwrite this method. 47 | return cls.__name__.lower()[len("OpenML") :][0] 48 | 49 | # TODO(eddiebergman): This would be much cleaner as an iterator... 50 | @abstractmethod 51 | def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str] | None]]: 52 | """Collect all information to display in the __repr__ body. 53 | 54 | Returns 55 | ------- 56 | body_fields : List[Tuple[str, Union[str, int, List[str]]]] 57 | A list of (name, value) pairs to display in the body of the __repr__. 58 | E.g.: [('metric', 'accuracy'), ('dataset', 'iris')] 59 | If value is a List of str, then each item of the list will appear in a separate row. 60 | """ 61 | # Should be implemented in the child class. 62 | 63 | def _apply_repr_template( 64 | self, 65 | body_fields: Iterable[tuple[str, str | int | list[str] | None]], 66 | ) -> str: 67 | """Generates the header and formats the body for string representation of the object. 68 | 69 | Parameters 70 | ---------- 71 | body_fields: List[Tuple[str, str]] 72 | A list of (name, value) pairs to display in the body of the __repr__. 73 | """ 74 | # We add spaces between capitals, e.g. ClassificationTask -> Classification Task 75 | name_with_spaces = re.sub( 76 | r"(\w)([A-Z])", 77 | r"\1 \2", 78 | self.__class__.__name__[len("OpenML") :], 79 | ) 80 | header_text = f"OpenML {name_with_spaces}" 81 | header = f"{header_text}\n{'=' * len(header_text)}\n" 82 | 83 | _body_fields: list[tuple[str, str | int | list[str]]] = [ 84 | (k, "None" if v is None else v) for k, v in body_fields 85 | ] 86 | longest_field_name_length = max(len(name) for name, _ in _body_fields) 87 | field_line_format = f"{{:.<{longest_field_name_length}}}: {{}}" 88 | body = "\n".join(field_line_format.format(name, value) for name, value in _body_fields) 89 | return header + body 90 | 91 | @abstractmethod 92 | def _to_dict(self) -> dict[str, dict]: 93 | """Creates a dictionary representation of self. 94 | 95 | The return value will be used to create the upload xml file. 96 | The xml file must have the tags in exactly the order of the object's xsd. 97 | (see https://github.com/openml/OpenML/blob/master/openml_OS/views/pages/api_new/v1/xsd/). 98 | 99 | Returns 100 | ------- 101 | Thing represented as dict. 102 | """ 103 | # Should be implemented in the child class. 104 | 105 | def _to_xml(self) -> str: 106 | """Generate xml representation of self for upload to server.""" 107 | dict_representation = self._to_dict() 108 | xml_representation = xmltodict.unparse(dict_representation, pretty=True) 109 | 110 | # A task may not be uploaded with the xml encoding specification: 111 | # <?xml version="1.0" encoding="utf-8"?> 112 | _encoding_specification, xml_body = xml_representation.split("\n", 1) 113 | return str(xml_body) 114 | 115 | def _get_file_elements(self) -> openml._api_calls.FILE_ELEMENTS_TYPE: 116 | """Get file_elements to upload to the server, called during Publish. 117 | 118 | Derived child classes should overwrite this method as necessary. 119 | The description field will be populated automatically if not provided.
120 | """ 121 | return {} 122 | 123 | @abstractmethod 124 | def _parse_publish_response(self, xml_response: dict[str, str]) -> None: 125 | """Parse the id from the xml_response and assign it to self.""" 126 | 127 | def publish(self) -> OpenMLBase: 128 | """Publish the object on the OpenML server.""" 129 | file_elements = self._get_file_elements() 130 | 131 | if "description" not in file_elements: 132 | file_elements["description"] = self._to_xml() 133 | 134 | call = f"{_get_rest_api_type_alias(self)}/" 135 | response_text = openml._api_calls._perform_api_call( 136 | call, 137 | "post", 138 | file_elements=file_elements, 139 | ) 140 | xml_response = xmltodict.parse(response_text) 141 | 142 | self._parse_publish_response(xml_response) 143 | return self 144 | 145 | def open_in_browser(self) -> None: 146 | """Opens the OpenML web page corresponding to this object in your default browser.""" 147 | if self.openml_url is None: 148 | raise ValueError( 149 | "Cannot open element on OpenML.org when attribute `openml_url` is `None`", 150 | ) 151 | 152 | webbrowser.open(self.openml_url) 153 | 154 | def push_tag(self, tag: str) -> None: 155 | """Annotates this entity with a tag on the server. 156 | 157 | Parameters 158 | ---------- 159 | tag : str 160 | Tag to attach to the flow. 161 | """ 162 | _tag_openml_base(self, tag) 163 | 164 | def remove_tag(self, tag: str) -> None: 165 | """Removes a tag from this entity on the server. 166 | 167 | Parameters 168 | ---------- 169 | tag : str 170 | Tag to attach to the flow. 171 | """ 172 | _tag_openml_base(self, tag, untag=True) 173 | --------------------------------------------------------------------------------