├── openml ├── py.typed ├── __version__.py ├── evaluations │ ├── __init__.py │ └── evaluation.py ├── setups │ ├── __init__.py │ └── setup.py ├── flows │ └── __init__.py ├── extensions │ ├── __init__.py │ └── functions.py ├── runs │ └── __init__.py ├── datasets │ ├── __init__.py │ └── data_feature.py ├── tasks │ └── __init__.py ├── study │ └── __init__.py ├── exceptions.py ├── __init__.py └── base.py ├── tests ├── test_flows │ ├── __init__.py │ └── dummy_learn │ │ ├── __init__.py │ │ └── dummy_forest.py ├── test_runs │ ├── __init__.py │ └── test_trace.py ├── test_study │ └── __init__.py ├── test_utils │ ├── __init__.py │ └── test_utils.py ├── test_datasets │ └── __init__.py ├── test_extensions │ ├── __init__.py │ └── test_functions.py ├── test_openml │ ├── __init__.py │ ├── test_openml.py │ └── test_api_calls.py ├── test_evaluations │ ├── __init__.py │ └── test_evaluations_example.py ├── files │ ├── org │ │ └── openml │ │ │ └── test │ │ │ ├── datasets │ │ │ ├── 2 │ │ │ │ └── description.xml │ │ │ ├── 30 │ │ │ │ └── dataset_30.pq │ │ │ └── -1 │ │ │ │ ├── description.xml │ │ │ │ └── qualities.xml │ │ │ ├── setups │ │ │ └── 1 │ │ │ │ └── description.xml │ │ │ └── tasks │ │ │ ├── 1 │ │ │ └── task.xml │ │ │ ├── 3 │ │ │ └── task.xml │ │ │ └── 1882 │ │ │ └── task.xml │ ├── mock_responses │ │ ├── datasets │ │ │ ├── data_delete_successful.xml │ │ │ ├── data_delete_not_exist.xml │ │ │ ├── data_delete_not_owned.xml │ │ │ ├── data_delete_has_tasks.xml │ │ │ └── data_description_61.xml │ │ ├── flows │ │ │ ├── flow_delete_successful.xml │ │ │ ├── flow_delete_not_exist.xml │ │ │ ├── flow_delete_not_owned.xml │ │ │ ├── flow_delete_has_runs.xml │ │ │ └── flow_delete_is_subflow.xml │ │ ├── runs │ │ │ ├── run_delete_successful.xml │ │ │ ├── run_delete_not_exist.xml │ │ │ └── run_delete_not_owned.xml │ │ └── tasks │ │ │ ├── task_delete_successful.xml │ │ │ ├── task_delete_not_exist.xml │ │ │ ├── task_delete_not_owned.xml │ │ │ └── task_delete_has_runs.xml │ └── misc │ │ └── features_with_whitespaces.xml ├── __init__.py ├── test_setups │ └── __init__.py └── test_tasks │ ├── __init__.py │ ├── test_supervised_task.py │ ├── test_learning_curve_task.py │ ├── test_classification_task.py │ ├── test_task_methods.py │ ├── test_clustering_task.py │ ├── test_regression_task.py │ ├── test_split.py │ └── test_task.py ├── MANIFEST.in ├── docs ├── images │ └── openml_icon.png ├── stylesheets │ └── extra.css ├── contributing.md ├── details.md └── index.md ├── examples ├── _external_or_deprecated │ ├── README.md │ ├── plot_svm_hyperparameters_tutorial.py │ ├── flow_id_tutorial.py │ ├── 2015_neurips_feurer_example.py │ ├── run_setup_tutorial.py │ ├── 2018_ida_strang_example.py │ └── benchmark_with_optunahub.py ├── Basics │ ├── simple_tasks_tutorial.py │ ├── simple_datasets_tutorial.py │ ├── introduction_tutorial.py │ ├── simple_suites_tutorial.py │ └── simple_flows_and_runs_tutorial.py ├── introduction.py └── Advanced │ ├── configure_logging.py │ ├── suites_tutorial.py │ ├── study_tutorial.py │ ├── datasets_tutorial.py │ └── task_manual_iteration_tutorial.py ├── .github ├── dependabot.yml ├── workflows │ ├── dist.yaml │ ├── docs.yaml │ ├── release_docker.yaml │ └── test.yml ├── PULL_REQUEST_TEMPLATE.md └── ISSUE_TEMPLATE │ └── ISSUE_TEMPLATE.md ├── Makefile ├── docker ├── Dockerfile ├── startup.sh └── readme.md ├── .pre-commit-config.yaml ├── .gitignore ├── scripts └── gen_ref_pages.py ├── CITATION.cff ├── LICENSE ├── README.md └── mkdocs.yml /openml/py.typed: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_flows/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_runs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_study/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_extensions/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_openml/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_evaluations/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt 2 | -------------------------------------------------------------------------------- /tests/test_flows/dummy_learn/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/images/openml_icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openml/openml-python/HEAD/docs/images/openml_icon.png -------------------------------------------------------------------------------- /docs/stylesheets/extra.css: -------------------------------------------------------------------------------- 1 | .jp-InputArea-prompt, .jp-InputPrompt { 2 | display: none !important; 3 | } 4 | -------------------------------------------------------------------------------- /tests/files/org/openml/test/datasets/30/dataset_30.pq: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openml/openml-python/HEAD/tests/files/org/openml/test/datasets/30/dataset_30.pq -------------------------------------------------------------------------------- /tests/files/mock_responses/datasets/data_delete_successful.xml: -------------------------------------------------------------------------------- 1 | 2 | 40000 3 | 4 | -------------------------------------------------------------------------------- /tests/files/mock_responses/flows/flow_delete_successful.xml: -------------------------------------------------------------------------------- 1 | 2 | 33364 3 | 4 | 
-------------------------------------------------------------------------------- /tests/files/mock_responses/runs/run_delete_successful.xml: -------------------------------------------------------------------------------- 1 | 2 | 10591880 3 | 4 | -------------------------------------------------------------------------------- /tests/files/mock_responses/tasks/task_delete_successful.xml: -------------------------------------------------------------------------------- 1 | 2 | 361323 3 | 4 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | 3 | # Dummy to allow mock classes in the test files to have a version number for 4 | # their parent module 5 | __version__ = "0.1" 6 | -------------------------------------------------------------------------------- /tests/test_setups/__init__.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | 3 | # Dummy to allow mock classes in the test files to have a version number for 4 | # their parent module 5 | __version__ = "0.1" 6 | -------------------------------------------------------------------------------- /tests/files/mock_responses/runs/run_delete_not_exist.xml: -------------------------------------------------------------------------------- 1 | 2 | 392 3 | Run does not exist 4 | 5 | -------------------------------------------------------------------------------- /tests/files/mock_responses/flows/flow_delete_not_exist.xml: -------------------------------------------------------------------------------- 1 | 2 | 322 3 | flow does not exist 4 | 5 | -------------------------------------------------------------------------------- /tests/files/mock_responses/runs/run_delete_not_owned.xml: -------------------------------------------------------------------------------- 1 | 2 | 393 3 | Run is not owned by you 4 | 5 | -------------------------------------------------------------------------------- /tests/files/mock_responses/tasks/task_delete_not_exist.xml: -------------------------------------------------------------------------------- 1 | 2 | 452 3 | Task does not exist 4 | 5 | -------------------------------------------------------------------------------- /tests/files/mock_responses/datasets/data_delete_not_exist.xml: -------------------------------------------------------------------------------- 1 | 2 | 352 3 | Dataset does not exist 4 | 5 | -------------------------------------------------------------------------------- /tests/files/mock_responses/datasets/data_delete_not_owned.xml: -------------------------------------------------------------------------------- 1 | 2 | 353 3 | Dataset is not owned by you 4 | -------------------------------------------------------------------------------- /tests/files/mock_responses/flows/flow_delete_not_owned.xml: -------------------------------------------------------------------------------- 1 | 2 | 323 3 | flow is not owned by you 4 | 5 | -------------------------------------------------------------------------------- /tests/files/mock_responses/tasks/task_delete_not_owned.xml: -------------------------------------------------------------------------------- 1 | 2 | 453 3 | Task is not owned by you 4 | 5 | -------------------------------------------------------------------------------- /openml/__version__.py: -------------------------------------------------------------------------------- 1 | 
"""Version information.""" 2 | 3 | # License: BSD 3-Clause 4 | 5 | # The following line *must* be the last in the module, exactly as formatted: 6 | from __future__ import annotations 7 | 8 | __version__ = "0.16.0" 9 | -------------------------------------------------------------------------------- /tests/files/mock_responses/tasks/task_delete_has_runs.xml: -------------------------------------------------------------------------------- 1 | 2 | 454 3 | Task is executed in some runs. Delete these first 4 | 5 | -------------------------------------------------------------------------------- /tests/files/mock_responses/datasets/data_delete_has_tasks.xml: -------------------------------------------------------------------------------- 1 | 2 | 354 3 | Dataset is in use by other content. Can not be deleted 4 | 5 | -------------------------------------------------------------------------------- /tests/test_tasks/__init__.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | 3 | from .test_supervised_task import OpenMLSupervisedTaskTest 4 | from .test_task import OpenMLTaskTest 5 | 6 | __all__ = [ 7 | "OpenMLTaskTest", 8 | "OpenMLSupervisedTaskTest", 9 | ] 10 | -------------------------------------------------------------------------------- /tests/files/mock_responses/flows/flow_delete_has_runs.xml: -------------------------------------------------------------------------------- 1 | 2 | 324 3 | flow is in use by other content (runs). Can not be deleted 4 | {10716, 10707} () 5 | 6 | -------------------------------------------------------------------------------- /tests/files/mock_responses/flows/flow_delete_is_subflow.xml: -------------------------------------------------------------------------------- 1 | 2 | 328 3 | flow is in use by other content (it is a subflow). Can not be deleted 4 | {37661} 5 | 6 | -------------------------------------------------------------------------------- /examples/_external_or_deprecated/README.md: -------------------------------------------------------------------------------- 1 | # External or Deprecated Examples 2 | 3 | This directory contains examples that are either external or deprecated. They may not be maintained or updated 4 | regularly, and their functionality might not align with the latest version of the library. Moreover, 5 | they are not shown on the documentation website. 
-------------------------------------------------------------------------------- /openml/evaluations/__init__.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | 3 | from .evaluation import OpenMLEvaluation 4 | from .functions import list_evaluation_measures, list_evaluations, list_evaluations_setups 5 | 6 | __all__ = [ 7 | "OpenMLEvaluation", 8 | "list_evaluations", 9 | "list_evaluation_measures", 10 | "list_evaluations_setups", 11 | ] 12 | -------------------------------------------------------------------------------- /openml/setups/__init__.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | 3 | from .functions import get_setup, initialize_model, list_setups, setup_exists 4 | from .setup import OpenMLParameter, OpenMLSetup 5 | 6 | __all__ = [ 7 | "OpenMLSetup", 8 | "OpenMLParameter", 9 | "get_setup", 10 | "list_setups", 11 | "setup_exists", 12 | "initialize_model", 13 | ] 14 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | updates: 4 | # This will check for updates to github actions every day 5 | # https://docs.github.com/en/enterprise-server@3.4/code-security/dependabot/working-with-dependabot/keeping-your-actions-up-to-date-with-dependabot 6 | - package-ecosystem: "github-actions" 7 | directory: "/" 8 | schedule: 9 | interval: "daily" 10 | -------------------------------------------------------------------------------- /tests/test_flows/dummy_learn/dummy_forest.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | from __future__ import annotations 3 | 4 | 5 | class DummyRegressor: 6 | def fit(self, X, y): 7 | return self 8 | 9 | def predict(self, X): 10 | return X[:, 0] 11 | 12 | def get_params(self, deep=False): 13 | return {} 14 | 15 | def set_params(self, params): 16 | return self 17 | -------------------------------------------------------------------------------- /openml/flows/__init__.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | 3 | from .flow import OpenMLFlow 4 | from .functions import ( 5 | assert_flows_equal, 6 | delete_flow, 7 | flow_exists, 8 | get_flow, 9 | get_flow_id, 10 | list_flows, 11 | ) 12 | 13 | __all__ = [ 14 | "OpenMLFlow", 15 | "get_flow", 16 | "list_flows", 17 | "get_flow_id", 18 | "flow_exists", 19 | "assert_flows_equal", 20 | "delete_flow", 21 | ] 22 | -------------------------------------------------------------------------------- /openml/extensions/__init__.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | 3 | from typing import List, Type # noqa: F401 4 | 5 | from .extension_interface import Extension 6 | from .functions import get_extension_by_flow, get_extension_by_model, register_extension 7 | 8 | extensions = [] # type: List[Type[Extension]] 9 | 10 | 11 | __all__ = [ 12 | "Extension", 13 | "register_extension", 14 | "get_extension_by_model", 15 | "get_extension_by_flow", 16 | ] 17 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # simple makefile to simplify repetitive build env management tasks under posix 2 | 3 | PYTHON ?= python 4 | 
CYTHON ?= cython 5 | PYTEST ?= pytest 6 | CTAGS ?= ctags 7 | 8 | all: clean inplace test 9 | 10 | check: 11 | pre-commit run --all-files 12 | 13 | clean: 14 | $(PYTHON) setup.py clean 15 | rm -rf dist openml.egg-info 16 | 17 | in: inplace # just a shortcut 18 | inplace: 19 | $(PYTHON) setup.py build_ext -i 20 | 21 | test-code: in 22 | $(PYTEST) -s -v tests 23 | 24 | test-coverage: 25 | rm -rf coverage .coverage 26 | $(PYTEST) -s -v --cov=. tests 27 | 28 | test: test-code 29 | -------------------------------------------------------------------------------- /tests/files/org/openml/test/datasets/-1/description.xml: -------------------------------------------------------------------------------- 1 | 2 | -1 3 | dexter 4 | 1 5 | Test set of the dexter dataset as used 6 | in the AutoWEKA paper (Thornton et al. 2013) 7 | ARFF 8 | 9 | Public 10 | http://www.cs.ubc.ca/labs/beta/Projects/autoweka/datasets/dexter.zip 11 | class 12 | 13 | 14 | -------------------------------------------------------------------------------- /openml/runs/__init__.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | 3 | from .functions import ( 4 | delete_run, 5 | get_run, 6 | get_run_trace, 7 | get_runs, 8 | initialize_model_from_run, 9 | initialize_model_from_trace, 10 | list_runs, 11 | run_exists, 12 | run_flow_on_task, 13 | run_model_on_task, 14 | ) 15 | from .run import OpenMLRun 16 | from .trace import OpenMLRunTrace, OpenMLTraceIteration 17 | 18 | __all__ = [ 19 | "OpenMLRun", 20 | "OpenMLRunTrace", 21 | "OpenMLTraceIteration", 22 | "run_model_on_task", 23 | "run_flow_on_task", 24 | "get_run", 25 | "list_runs", 26 | "get_runs", 27 | "get_run_trace", 28 | "run_exists", 29 | "initialize_model_from_run", 30 | "initialize_model_from_trace", 31 | "delete_run", 32 | ] 33 | -------------------------------------------------------------------------------- /openml/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | 3 | from .data_feature import OpenMLDataFeature 4 | from .dataset import OpenMLDataset 5 | from .functions import ( 6 | attributes_arff_from_df, 7 | check_datasets_active, 8 | create_dataset, 9 | delete_dataset, 10 | edit_dataset, 11 | fork_dataset, 12 | get_dataset, 13 | get_datasets, 14 | list_datasets, 15 | list_qualities, 16 | status_update, 17 | ) 18 | 19 | __all__ = [ 20 | "attributes_arff_from_df", 21 | "check_datasets_active", 22 | "create_dataset", 23 | "get_dataset", 24 | "get_datasets", 25 | "list_datasets", 26 | "OpenMLDataset", 27 | "OpenMLDataFeature", 28 | "status_update", 29 | "list_qualities", 30 | "edit_dataset", 31 | "fork_dataset", 32 | "delete_dataset", 33 | ] 34 | -------------------------------------------------------------------------------- /openml/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | 3 | from .functions import ( 4 | create_task, 5 | delete_task, 6 | get_task, 7 | get_tasks, 8 | list_tasks, 9 | ) 10 | from .split import OpenMLSplit 11 | from .task import ( 12 | OpenMLClassificationTask, 13 | OpenMLClusteringTask, 14 | OpenMLLearningCurveTask, 15 | OpenMLRegressionTask, 16 | OpenMLSupervisedTask, 17 | OpenMLTask, 18 | TaskType, 19 | ) 20 | 21 | __all__ = [ 22 | "OpenMLTask", 23 | "OpenMLSupervisedTask", 24 | "OpenMLClusteringTask", 25 | "OpenMLRegressionTask", 26 | "OpenMLClassificationTask", 27 | "OpenMLLearningCurveTask", 28 | "create_task", 29 | 
"get_task", 30 | "get_tasks", 31 | "list_tasks", 32 | "OpenMLSplit", 33 | "TaskType", 34 | "delete_task", 35 | ] 36 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | # Dockerfile to build an image with preinstalled dependencies 2 | # Useful building docs or running unix tests from a Windows host. 3 | FROM python:3.10 4 | 5 | RUN git clone https://github.com/openml/openml-python.git openml 6 | WORKDIR openml 7 | RUN python -m venv venv 8 | RUN venv/bin/pip install wheel setuptools 9 | RUN venv/bin/pip install -e .[test,examples,docs,examples_unix] 10 | 11 | WORKDIR / 12 | RUN mkdir scripts 13 | ADD startup.sh scripts/ 14 | ADD readme.md / 15 | 16 | # Due to the nature of the Docker container it might often be built from Windows. 17 | # It is typical to have the files with \r\n line-ending, we want to remove it for the unix image. 18 | RUN sed -i 's/\r//g' scripts/startup.sh 19 | 20 | # overwrite the default `python` entrypoint 21 | ENTRYPOINT ["/bin/bash", "/scripts/startup.sh"] 22 | -------------------------------------------------------------------------------- /examples/Basics/simple_tasks_tutorial.py: -------------------------------------------------------------------------------- 1 | # %% [markdown] 2 | # A brief example on how to use tasks from OpenML. 3 | 4 | # %% 5 | 6 | import openml 7 | 8 | # %% [markdown] 9 | # Get a [task](https://docs.openml.org/concepts/tasks/) for 10 | # [supervised classification on credit-g](https://www.openml.org/search?type=task&id=31&source_data.data_id=31): 11 | 12 | # %% 13 | task = openml.tasks.get_task(31) 14 | 15 | # %% [markdown] 16 | # Get the dataset and its data from the task. 17 | 18 | # %% 19 | dataset = task.get_dataset() 20 | X, y, categorical_indicator, attribute_names = dataset.get_data(target=task.target_name) 21 | 22 | # %% [markdown] 23 | # Get the first out of the 10 cross-validation splits from the task. 
24 | 25 | # %% 26 | train_indices, test_indices = task.get_train_test_split_indices(fold=0) 27 | print(train_indices[:10]) # print the first 10 indices of the training set 28 | -------------------------------------------------------------------------------- /openml/study/__init__.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | 3 | from .functions import ( 4 | attach_to_study, 5 | attach_to_suite, 6 | create_benchmark_suite, 7 | create_study, 8 | delete_study, 9 | delete_suite, 10 | detach_from_study, 11 | detach_from_suite, 12 | get_study, 13 | get_suite, 14 | list_studies, 15 | list_suites, 16 | update_study_status, 17 | update_suite_status, 18 | ) 19 | from .study import OpenMLBenchmarkSuite, OpenMLStudy 20 | 21 | __all__ = [ 22 | "OpenMLStudy", 23 | "OpenMLBenchmarkSuite", 24 | "attach_to_study", 25 | "attach_to_suite", 26 | "create_benchmark_suite", 27 | "create_study", 28 | "delete_study", 29 | "delete_suite", 30 | "detach_from_study", 31 | "detach_from_suite", 32 | "get_study", 33 | "get_suite", 34 | "list_studies", 35 | "list_suites", 36 | "update_suite_status", 37 | "update_study_status", 38 | ] 39 | -------------------------------------------------------------------------------- /tests/files/org/openml/test/setups/1/description.xml: -------------------------------------------------------------------------------- 1 | 2 | 100 3 | 60 4 | 5 | 3432 6 | 60 7 | weka.J48 8 | weka.J48(1)_C 9 | C 10 | option 11 | 0.25 12 | 0.9 13 | 14 | 15 | 3435 16 | 60 17 | weka.J48 18 | weka.J48(1)_M 19 | M 20 | option 21 | 2 22 | 2 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /tests/test_tasks/test_supervised_task.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | from __future__ import annotations 3 | 4 | import unittest 5 | 6 | import pandas as pd 7 | 8 | from openml.tasks import get_task 9 | 10 | from .test_task import OpenMLTaskTest 11 | 12 | 13 | class OpenMLSupervisedTaskTest(OpenMLTaskTest): 14 | """ 15 | A helper class. The methods of the test case 16 | are only executed in subclasses of the test case. 17 | """ 18 | 19 | __test__ = False 20 | 21 | @classmethod 22 | def setUpClass(cls): 23 | if cls is OpenMLSupervisedTaskTest: 24 | raise unittest.SkipTest("Skip OpenMLSupervisedTaskTest tests," " it's a base class") 25 | super().setUpClass() 26 | 27 | def setUp(self, n_levels: int = 1): 28 | super().setUp() 29 | 30 | def test_get_X_and_Y(self) -> tuple[pd.DataFrame, pd.Series]: 31 | task = get_task(self.task_id) 32 | X, Y = task.get_X_and_y() 33 | return X, Y 34 | -------------------------------------------------------------------------------- /tests/files/misc/features_with_whitespaces.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 0 4 | V1 5 | numeric 6 | false 7 | false 8 | false 9 | 0 10 | 11 | 12 | 1 13 | V42 14 | nominal 15 | - 50000. 16 | 50000+. 
17 | false 18 | false 19 | false 20 | 0 21 | 22 | 23 | -------------------------------------------------------------------------------- /tests/test_tasks/test_learning_curve_task.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | from __future__ import annotations 3 | 4 | import pandas as pd 5 | 6 | from openml.tasks import TaskType, get_task 7 | 8 | from .test_supervised_task import OpenMLSupervisedTaskTest 9 | 10 | 11 | class OpenMLLearningCurveTaskTest(OpenMLSupervisedTaskTest): 12 | __test__ = True 13 | 14 | def setUp(self, n_levels: int = 1): 15 | super().setUp() 16 | self.task_id = 801 # diabetes 17 | self.task_type = TaskType.LEARNING_CURVE 18 | self.estimation_procedure = 13 19 | 20 | def test_get_X_and_Y(self): 21 | X, Y = super().test_get_X_and_Y() 22 | assert X.shape == (768, 8) 23 | assert isinstance(X, pd.DataFrame) 24 | assert Y.shape == (768,) 25 | assert isinstance(Y, pd.Series) 26 | assert pd.api.types.is_categorical_dtype(Y) 27 | 28 | def test_download_task(self): 29 | task = super().test_download_task() 30 | assert task.task_id == self.task_id 31 | assert task.task_type_id == TaskType.LEARNING_CURVE 32 | assert task.dataset_id == 20 33 | 34 | def test_class_labels(self): 35 | task = get_task(self.task_id) 36 | assert task.class_labels == ["tested_negative", "tested_positive"] 37 | -------------------------------------------------------------------------------- /.github/workflows/dist.yaml: -------------------------------------------------------------------------------- 1 | name: dist-check 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | push: 7 | branches: 8 | - main 9 | - develop 10 | tags: 11 | - "v*.*.*" 12 | 13 | pull_request: 14 | branches: 15 | - main 16 | - develop 17 | 18 | concurrency: 19 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} 20 | cancel-in-progress: true 21 | 22 | jobs: 23 | dist: 24 | runs-on: ubuntu-latest 25 | steps: 26 | - uses: actions/checkout@v4 27 | - name: Setup Python 28 | uses: actions/setup-python@v5 29 | with: 30 | python-version: 3.8 31 | - name: Build dist 32 | run: | 33 | pip install build 34 | python -m build --sdist 35 | - name: Twine check 36 | run: | 37 | pip install twine 38 | last_dist=$(ls -t dist/openml-*.tar.gz | head -n 1) 39 | twine check $last_dist 40 | - name: Install dist 41 | run: | 42 | last_dist=$(ls -t dist/openml-*.tar.gz | head -n 1) 43 | pip install $last_dist 44 | - name: PEP 561 Compliance 45 | run: | 46 | pip install mypy 47 | cd .. # required to use the installed version of openml 48 | if ! python -m mypy -c "import openml"; then exit 1; fi 49 | -------------------------------------------------------------------------------- /docs/contributing.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | Contribution to the OpenML package is highly appreciated in all forms. 4 | In particular, a few ways to contribute to openml-python are: 5 | 6 | - A direct contribution to the package, by means of improving the 7 | code, documentation or examples. To get started, see [this 8 | file](https://github.com/openml/openml-python/blob/main/CONTRIBUTING.md) 9 | with details on how to set up your environment to develop for 10 | openml-python. 11 | - A contribution to an openml-python extension. An extension package 12 | allows OpenML to interface with a machine learning package (such 13 | as scikit-learn or keras). 
These extensions are hosted in separate 14 | repositories and may have their own guidelines. For more 15 | information, see also [extensions](extensions.md). 16 | - Bug reports. If something doesn't work for you or is cumbersome, 17 | please open a new issue to let us know about the problem. 18 | - [Cite OpenML](https://www.openml.org/terms) if you use it in a 19 | scientific publication. 20 | - Visit one of our [hackathons](https://www.openml.org/meet). 21 | - Contribute to another OpenML project, such as [the main OpenML 22 | project](https://github.com/openml/OpenML/blob/master/CONTRIBUTING.md). 23 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 14 | 15 | #### Metadata 16 | * Reference Issue: 17 | * New Tests Added: 18 | * Documentation Updated: 19 | * Change Log Entry: 20 | 21 | 22 | #### Details 23 | 31 | 32 | -------------------------------------------------------------------------------- /examples/introduction.py: -------------------------------------------------------------------------------- 1 | # %% [markdown] 2 | # 3 | # We provide a set of examples here to get started with OpenML-Python. These examples cover various aspects of using the 4 | # OpenML API, including downloading datasets, uploading results, and working with tasks. 5 | # 6 | # ## Basics 7 | # 8 | # 1. [Installing and setting up OpenML-Python](../Basics/introduction_tutorial/) 9 | # 2. [Downloading datasets](../Basics/simple_datasets_tutorial/) 10 | # 3. [Using tasks](../Basics/simple_tasks_tutorial/) 11 | # 4. [Uploading experiment results](../Basics/simple_flows_and_runs_tutorial/) 12 | # 5. [Working with collections of tasks](../Basics/simple_suites_tutorial/) 13 | # 14 | # ## Advanced 15 | # 1. [Getting splits for datasets from tasks](../Advanced/task_manual_iteration_tutorial/) 16 | # 2. [Creating and uploading datasets](../Advanced/create_upload_tutorial/) 17 | # 3. [Searching and editing datasets](../Advanced/datasets_tutorial/) 18 | # 4. [Searching and creating tasks](../Advanced/task_tutorial/) 19 | # 5. [Listing, downloading, and uploading suites](../Advanced/suites_tutorial/) 20 | # 6. [Listing, downloading, and uploading studies](../Advanced/study_tutorial/) 21 | # 7. [Downloading evaluation results](../Advanced/fetch_evaluations_tutorial/) 22 | # 8. 
[Configuring logging](../Advanced/configure_logging/) 23 | -------------------------------------------------------------------------------- /tests/test_tasks/test_classification_task.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | from __future__ import annotations 3 | 4 | import pandas as pd 5 | import pytest 6 | 7 | from openml.tasks import TaskType, get_task 8 | 9 | from .test_supervised_task import OpenMLSupervisedTaskTest 10 | 11 | 12 | class OpenMLClassificationTaskTest(OpenMLSupervisedTaskTest): 13 | __test__ = True 14 | 15 | def setUp(self, n_levels: int = 1): 16 | super().setUp() 17 | self.task_id = 119 # diabetes 18 | self.task_type = TaskType.SUPERVISED_CLASSIFICATION 19 | self.estimation_procedure = 5 20 | 21 | def test_download_task(self): 22 | task = super().test_download_task() 23 | assert task.task_id == self.task_id 24 | assert task.task_type_id == TaskType.SUPERVISED_CLASSIFICATION 25 | assert task.dataset_id == 20 26 | assert task.estimation_procedure_id == self.estimation_procedure 27 | 28 | def test_class_labels(self): 29 | task = get_task(self.task_id) 30 | assert task.class_labels == ["tested_negative", "tested_positive"] 31 | 32 | 33 | @pytest.mark.server() 34 | def test_get_X_and_Y(): 35 | task = get_task(119) 36 | X, Y = task.get_X_and_y() 37 | assert X.shape == (768, 8) 38 | assert isinstance(X, pd.DataFrame) 39 | assert Y.shape == (768,) 40 | assert isinstance(Y, pd.Series) 41 | assert pd.api.types.is_categorical_dtype(Y) 42 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | default_language_version: 2 | python: python3 3 | files: | 4 | (?x)^( 5 | openml| 6 | tests 7 | )/.*\.py$ 8 | repos: 9 | - repo: https://github.com/astral-sh/ruff-pre-commit 10 | rev: v0.7.3 11 | hooks: 12 | - id: ruff 13 | args: [--fix, --exit-non-zero-on-fix, --no-cache] 14 | - id: ruff-format 15 | - repo: https://github.com/pre-commit/mirrors-mypy 16 | rev: v1.13.0 17 | hooks: 18 | - id: mypy 19 | additional_dependencies: 20 | - types-requests 21 | - types-python-dateutil 22 | - repo: https://github.com/python-jsonschema/check-jsonschema 23 | rev: 0.29.4 24 | hooks: 25 | - id: check-github-workflows 26 | files: '^\.github/workflows/.*\.ya?ml$' 27 | types: ["yaml"] 28 | - id: check-dependabot 29 | files: '^\.github/dependabot\.ya?ml$' 30 | - repo: https://github.com/pre-commit/pre-commit-hooks 31 | rev: v5.0.0 32 | hooks: 33 | - id: check-added-large-files 34 | files: ".*" 35 | - id: check-case-conflict 36 | files: ".*" 37 | - id: check-merge-conflict 38 | files: ".*" 39 | - id: check-yaml 40 | files: ".*" 41 | - id: end-of-file-fixer 42 | files: ".*" 43 | types: ["yaml"] 44 | - id: check-toml 45 | files: ".*" 46 | types: ["toml"] 47 | - id: debug-statements 48 | files: '^openml/.*\.py$' 49 | -------------------------------------------------------------------------------- /tests/test_openml/test_openml.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | from __future__ import annotations 3 | 4 | from unittest import mock 5 | 6 | import openml 7 | from openml.testing import TestBase 8 | 9 | 10 | class TestInit(TestBase): 11 | # Splitting not helpful, these tests don't rely on the server and take less 12 | # than a second 13 | 14 | @mock.patch("openml.tasks.functions.get_task") 15 | 
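# Patches apply bottom-up: the patch closest to the function (get_run)
# is passed as the first mock argument (run_mock).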
@mock.patch("openml.datasets.functions.get_dataset") 16 | @mock.patch("openml.flows.functions.get_flow") 17 | @mock.patch("openml.runs.functions.get_run") 18 | def test_populate_cache( 19 | self, 20 | run_mock, 21 | flow_mock, 22 | dataset_mock, 23 | task_mock, 24 | ): 25 | openml.populate_cache(task_ids=[1, 2], dataset_ids=[3, 4], flow_ids=[5, 6], run_ids=[7, 8]) 26 | assert run_mock.call_count == 2 27 | for argument, fixture in zip(run_mock.call_args_list, [(7,), (8,)]): 28 | assert argument[0] == fixture 29 | 30 | assert flow_mock.call_count == 2 31 | for argument, fixture in zip(flow_mock.call_args_list, [(5,), (6,)]): 32 | assert argument[0] == fixture 33 | 34 | assert dataset_mock.call_count == 2 35 | for argument, fixture in zip( 36 | dataset_mock.call_args_list, 37 | [(3,), (4,)], 38 | ): 39 | assert argument[0] == fixture 40 | 41 | assert task_mock.call_count == 2 42 | for argument, fixture in zip(task_mock.call_args_list, [(1,), (2,)]): 43 | assert argument[0] == fixture 44 | -------------------------------------------------------------------------------- /tests/files/org/openml/test/tasks/1882/task.xml: -------------------------------------------------------------------------------- 1 | 2 | 1882 3 | 1 4 | Supervised Classification 5 | 6 | 7 | 2 8 | class 9 | 10 | 11 | 12 | 3 13 | crossvalidation 14 | http://capa.win.tue.nl/api_splits/get/1882/Task_1882_splits.arff 15 | 10 16 | 10 17 | 18 | true 19 | 20 | 21 | 22 | 23 | 24 | predictive_accuracy 25 | 26 | 27 | 28 | ARFF 29 | 30 | 31 | 32 | 33 | 34 | 35 | under100k 36 | under1m 37 | 38 | -------------------------------------------------------------------------------- /examples/Basics/simple_datasets_tutorial.py: -------------------------------------------------------------------------------- 1 | # %% [markdown] 2 | # A basic tutorial on how to list, load and visualize datasets. 3 | # 4 | # In general, we recommend working with tasks, so that the results can 5 | # be easily reproduced. Furthermore, the results can be compared to existing results 6 | # at OpenML. However, for the purposes of this tutorial, we are going to work with 7 | # the datasets directly. 8 | 9 | # %% 10 | 11 | import openml 12 | 13 | # %% [markdown] 14 | # ## List datasets stored on OpenML 15 | 16 | # %% 17 | datasets_df = openml.datasets.list_datasets() 18 | print(datasets_df.head(n=10)) 19 | 20 | # %% [markdown] 21 | # ## Download a dataset 22 | 23 | # %% 24 | # Iris dataset https://www.openml.org/d/61 25 | dataset = openml.datasets.get_dataset(dataset_id=61) 26 | 27 | # Print a summary 28 | print( 29 | f"This is dataset '{dataset.name}', the target feature is '{dataset.default_target_attribute}'" 30 | ) 31 | print(f"URL: {dataset.url}") 32 | print(dataset.description[:500]) 33 | 34 | # %% [markdown] 35 | # ## Load a dataset 36 | # * `X` - A dataframe where each row represents one example with 37 | # the corresponding feature values. 
38 | # * `y` - the classes for each example 39 | # * `categorical_indicator` - a list that indicates which feature is categorical 40 | # * `attribute_names` - the names of the features for the examples (X) and 41 | # target feature (y) 42 | 43 | # %% 44 | X, y, categorical_indicator, attribute_names = dataset.get_data( 45 | target=dataset.default_target_attribute 46 | ) 47 | 48 | # %% [markdown] 49 | # Visualize the dataset 50 | 51 | # %% 52 | import matplotlib.pyplot as plt 53 | import pandas as pd 54 | import seaborn as sns 55 | 56 | iris_plot = sns.pairplot(pd.concat([X, y], axis=1), hue="class") 57 | plt.show() 58 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | doc/generated 3 | examples/.ipynb_checkpoints 4 | venv 5 | .uv-lock 6 | uv.lock 7 | 8 | # Byte-compiled / optimized / DLL files 9 | __pycache__/ 10 | *.py[cod] 11 | *$py.class 12 | 13 | # C extensions 14 | *.so 15 | 16 | # scikit-learn specific 17 | doc/_build/ 18 | doc/auto_examples/ 19 | doc/modules/generated/ 20 | doc/datasets/generated/ 21 | 22 | # Some stuff from testing? 23 | tests/files/org/openml/test/datasets/1/ 24 | tests/files/org/openml/test/datasets/2/features.xml.pkl 25 | tests/files/org/openml/test/datasets/2/qualities.xml.pkl 26 | tests/files/org/openml/test/locks/ 27 | tests/files/org/openml/test/tasks/1/datasplits.pkl.py3 28 | tests/files/org/openml/test/tasks/1882/datasplits.pkl.py3 29 | 30 | # Distribution / packaging 31 | 32 | .Python 33 | env/ 34 | build/ 35 | develop-eggs/ 36 | dist/ 37 | downloads/ 38 | eggs/ 39 | .eggs/ 40 | lib/ 41 | lib64/ 42 | parts/ 43 | sdist/ 44 | var/ 45 | *.egg-info/ 46 | .installed.cfg 47 | *.egg 48 | 49 | # PyInstaller 50 | # Usually these files are written by a python script from a template 51 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
52 | *.manifest 53 | *.spec 54 | 55 | # Installer logs 56 | pip-log.txt 57 | pip-delete-this-directory.txt 58 | 59 | # Unit test / coverage reports 60 | htmlcov/ 61 | cover 62 | coverage 63 | htmlcov 64 | .tox/ 65 | .coverage 66 | .coverage.* 67 | .cache 68 | nosetests.xml 69 | coverage.xml 70 | *,cover 71 | .hypothesis/ 72 | prof/ 73 | 74 | # Translations 75 | *.mo 76 | *.pot 77 | 78 | # Django stuff: 79 | *.log 80 | 81 | # Sphinx documentation 82 | docs/_build/ 83 | 84 | # PyBuilder 85 | target/ 86 | 87 | # IDE 88 | .idea 89 | *.swp 90 | .vscode 91 | 92 | # MYPY 93 | .mypy_cache 94 | dmypy.json 95 | dmypy.sock 96 | 97 | # Tests 98 | .pytest_cache 99 | .venv -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 12 | 13 | #### Description 14 | 15 | 16 | #### Steps/Code to Reproduce 17 | 26 | 27 | #### Expected Results 28 | 29 | 30 | #### Actual Results 31 | 32 | 33 | #### Versions 34 | 46 | 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /tests/files/org/openml/test/tasks/1/task.xml: -------------------------------------------------------------------------------- 1 | 2 | 1 3 | 1 4 | Supervised Classification 5 | 6 | 7 | 1 8 | class 9 | 10 | 11 | 12 | 1 13 | crossvalidation 14 | http://www.openml.org/api_splits/get/1/Task_1_splits.arff 15 | 1 16 | 10 17 | 18 | true 19 | 20 | 21 | 22 | 23 | 24 | predictive_accuracy 25 | 26 | 27 | 28 | ARFF 29 | 30 | 31 | 32 | 33 | 34 | 35 | basic 36 | study_1 37 | study_7 38 | under100k 39 | under1m 40 | 41 | -------------------------------------------------------------------------------- /tests/files/org/openml/test/tasks/3/task.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 3 | 1 4 | Supervised Classification 5 | 6 | 7 | 3 8 | class 9 | 10 | 11 | 12 | 1 13 | crossvalidation 14 | http://www.openml.org/api_splits/get/3/Task_3_splits.arff 15 | 1 16 | 10 17 | 18 | true 19 | 20 | 21 | 22 | 23 | 24 | predictive_accuracy 25 | 26 | 27 | 28 | ARFF 29 | 30 | 31 | 32 | 33 | 34 | 35 | basic 36 | mythbusting 37 | mythbusting_1 38 | study_1 39 | study_7 40 | under100k 41 | under1m 42 | 43 | -------------------------------------------------------------------------------- /tests/test_evaluations/test_evaluations_example.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | from __future__ import annotations 3 | 4 | import unittest 5 | 6 | from openml.config import overwrite_config_context 7 | 8 | 9 | class TestEvaluationsExample(unittest.TestCase): 10 | def test_example_python_paper(self): 11 | # Example script which will appear in the upcoming OpenML-Python paper 12 | # This test ensures that the example will keep running! 13 | with overwrite_config_context( 14 | { 15 | "server": "https://www.openml.org/api/v1/xml", 16 | "apikey": None, 17 | } 18 | ): 19 | import matplotlib.pyplot as plt 20 | import numpy as np 21 | import openml 22 | 23 | df = openml.evaluations.list_evaluations_setups( 24 | "predictive_accuracy", 25 | flows=[8353], 26 | tasks=[6], 27 | parameters_in_separate_columns=True, 28 | ) # Choose an SVM flow, for example 8353, and a task. 29 | 30 | assert len(df) > 0, ( 31 | "No evaluation found for flow 8353 on task 6, could " 32 | "be that this task is not available on the test server." 
33 | ) 34 | 35 | hp_names = ["sklearn.svm.classes.SVC(16)_C", "sklearn.svm.classes.SVC(16)_gamma"] 36 | df[hp_names] = df[hp_names].astype(float).apply(np.log) 37 | C, gamma, score = df[hp_names[0]], df[hp_names[1]], df["value"] 38 | 39 | cntr = plt.tricontourf(C, gamma, score, levels=12, cmap="RdBu_r") 40 | plt.colorbar(cntr, label="accuracy") 41 | plt.xlim((min(C), max(C))) 42 | plt.ylim((min(gamma), max(gamma))) 43 | plt.xlabel("C (log10)", size=16) 44 | plt.ylabel("gamma (log10)", size=16) 45 | plt.title("SVM performance landscape", size=20) 46 | 47 | plt.tight_layout() 48 | -------------------------------------------------------------------------------- /.github/workflows/docs.yaml: -------------------------------------------------------------------------------- 1 | name: Docs 2 | on: 3 | workflow_dispatch: 4 | 5 | push: 6 | branches: 7 | - main 8 | - develop 9 | tags: 10 | - "v*.*.*" 11 | 12 | pull_request: 13 | branches: 14 | - main 15 | - develop 16 | 17 | concurrency: 18 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} 19 | cancel-in-progress: true 20 | 21 | jobs: 22 | build-and-deploy: 23 | runs-on: ubuntu-latest 24 | steps: 25 | - uses: actions/checkout@v4 26 | with: 27 | fetch-depth: 0 28 | - name: Setup Python 29 | uses: actions/setup-python@v5 30 | with: 31 | python-version: 3.8 32 | - name: Install dependencies 33 | run: | 34 | pip install -e .[docs,examples] 35 | - name: Make docs 36 | run: | 37 | mkdocs build 38 | - name: Deploy to GitHub Pages 39 | env: 40 | CI: false 41 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 42 | PAGES_BRANCH: gh-pages 43 | if: (contains(github.ref, 'develop') || contains(github.ref, 'main')) && github.event_name == 'push' 44 | run: | 45 | git config user.name doc-bot 46 | git config user.email doc-bot@openml.com 47 | current_version=$(git tag | sort --version-sort | tail -n 1) 48 | # This block will rename previous retitled versions 49 | retitled_versions=$(mike list -j | jq ".[] | select(.title != .version) | .version" | tr -d '"') 50 | for version in $retitled_versions; do 51 | mike retitle "${version}" "${version}" 52 | done 53 | 54 | echo "Deploying docs for ${current_version}" 55 | mike set-default latest 56 | mike deploy \ 57 | --push \ 58 | --title "${current_version} (latest)" \ 59 | --update-aliases \ 60 | "${current_version}" \ 61 | "latest"\ 62 | -b $PAGES_BRANCH 63 | -------------------------------------------------------------------------------- /scripts/gen_ref_pages.py: -------------------------------------------------------------------------------- 1 | """Generate the code reference pages. 2 | 3 | based on https://github.com/mkdocstrings/mkdocstrings/blob/33aa573efb17b13e7b9da77e29aeccb3fbddd8e8/docs/recipes.md 4 | but modified for lack of "src/" file structure. 
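The script walks every module under ``openml/``, writes a mkdocstrings stub
page per module into ``reference/``, and mirrors the ``examples`` directory
(excluding ``_external_or_deprecated``) into the documentation navigation.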
5 | 6 | """ 7 | 8 | from __future__ import annotations 9 | 10 | from pathlib import Path 11 | 12 | import mkdocs_gen_files 13 | 14 | nav = mkdocs_gen_files.Nav() 15 | 16 | root = Path(__file__).parent.parent 17 | src = root / "openml" 18 | 19 | for path in sorted(src.rglob("*.py")): 20 | module_path = path.relative_to(root).with_suffix("") 21 | doc_path = path.relative_to(src).with_suffix(".md") 22 | full_doc_path = Path("reference", doc_path) 23 | 24 | parts = tuple(module_path.parts) 25 | 26 | if parts[-1] == "__init__": 27 | parts = parts[:-1] 28 | doc_path = doc_path.with_name("index.md") 29 | full_doc_path = full_doc_path.with_name("index.md") 30 | elif parts[-1] == "__main__": 31 | continue 32 | 33 | nav[parts] = doc_path.as_posix() 34 | 35 | with mkdocs_gen_files.open(full_doc_path, "w") as fd: 36 | identifier = ".".join(parts) 37 | print("::: " + identifier, file=fd) 38 | 39 | mkdocs_gen_files.set_edit_path(full_doc_path, path.relative_to(root)) 40 | 41 | with mkdocs_gen_files.open("reference/SUMMARY.md", "w") as nav_file: 42 | nav_file.writelines(nav.build_literate_nav()) 43 | 44 | nav = mkdocs_gen_files.Nav() 45 | examples_dir = root / "examples" 46 | examples_doc_dir = root / "docs" / "examples" 47 | for path in sorted(examples_dir.rglob("*.py")): 48 | if "_external_or_deprecated" in path.parts: 49 | continue 50 | dest_path = Path("examples") / path.relative_to(examples_dir) 51 | with mkdocs_gen_files.open(dest_path, "w") as dest_file: 52 | print(path.read_text(), file=dest_file) 53 | 54 | new_relative_location = Path("../") / dest_path 55 | nav[new_relative_location.parts[2:]] = new_relative_location.as_posix() 56 | 57 | with mkdocs_gen_files.open("examples/SUMMARY.md", "w") as nav_file: 58 | nav_file.writelines(nav.build_literate_nav()) 59 | -------------------------------------------------------------------------------- /.github/workflows/release_docker.yaml: -------------------------------------------------------------------------------- 1 | name: release-docker 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | branches: 7 | - 'develop' 8 | - 'docker' 9 | tags: 10 | - 'v*' 11 | 12 | concurrency: 13 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} 14 | cancel-in-progress: true 15 | 16 | jobs: 17 | 18 | docker: 19 | 20 | runs-on: ubuntu-latest 21 | 22 | steps: 23 | - name: Set up QEMU 24 | uses: docker/setup-qemu-action@v3 25 | 26 | - name: Set up Docker Buildx 27 | uses: docker/setup-buildx-action@v3 28 | 29 | - name: Login to DockerHub 30 | if: github.event_name != 'pull_request' 31 | uses: docker/login-action@v3 32 | with: 33 | username: ${{ secrets.DOCKERHUB_USERNAME }} 34 | password: ${{ secrets.DOCKERHUB_TOKEN }} 35 | 36 | - name: Check out the repo 37 | uses: actions/checkout@v4 38 | 39 | - name: Extract metadata (tags, labels) for Docker Hub 40 | id: meta_dockerhub 41 | uses: docker/metadata-action@v5 42 | with: 43 | images: "openml/openml-python" 44 | 45 | - name: Build and push 46 | id: docker_build 47 | uses: docker/build-push-action@v6 48 | with: 49 | context: ./docker/ 50 | tags: ${{ steps.meta_dockerhub.outputs.tags }} 51 | labels: ${{ steps.meta_dockerhub.outputs.labels }} 52 | platforms: linux/amd64,linux/arm64 53 | push: ${{ github.event_name == 'push' }} 54 | 55 | - name: Update repo description 56 | if: ${{ startsWith(github.ref, 'refs/tags/v') }} 57 | uses: peter-evans/dockerhub-description@v4 58 | with: 59 | username: ${{ secrets.DOCKERHUB_USERNAME }} 60 | password: ${{ secrets.DOCKERHUB_TOKEN }} 61 | repository: 
openml/openml-python 62 | short-description: "pre-installed openml-python environment" 63 | readme-filepath: ./docker/readme.md 64 | 65 | - name: Image digest 66 | run: echo ${{ steps.docker_build.outputs.digest }} 67 | -------------------------------------------------------------------------------- /tests/test_tasks/test_task_methods.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | from __future__ import annotations 3 | 4 | from time import time 5 | 6 | import openml 7 | from openml.testing import TestBase 8 | 9 | 10 | # Common methods between tasks 11 | class OpenMLTaskMethodsTest(TestBase): 12 | def setUp(self): 13 | super().setUp() 14 | 15 | def tearDown(self): 16 | super().tearDown() 17 | 18 | def test_tagging(self): 19 | task = openml.tasks.get_task(1) # anneal; crossvalidation 20 | # tags can be at most 64 alphanumeric (+ underscore) chars 21 | unique_indicator = str(time()).replace(".", "") 22 | tag = f"test_tag_OpenMLTaskMethodsTest_{unique_indicator}" 23 | tasks = openml.tasks.list_tasks(tag=tag) 24 | assert len(tasks) == 0 25 | task.push_tag(tag) 26 | tasks = openml.tasks.list_tasks(tag=tag) 27 | assert len(tasks) == 1 28 | assert 1 in tasks["tid"] 29 | task.remove_tag(tag) 30 | tasks = openml.tasks.list_tasks(tag=tag) 31 | assert len(tasks) == 0 32 | 33 | def test_get_train_and_test_split_indices(self): 34 | openml.config.set_root_cache_directory(self.static_cache_dir) 35 | task = openml.tasks.get_task(1882) 36 | train_indices, test_indices = task.get_train_test_split_indices(0, 0) 37 | assert train_indices[0] == 16 38 | assert train_indices[-1] == 395 39 | assert test_indices[0] == 412 40 | assert test_indices[-1] == 364 41 | train_indices, test_indices = task.get_train_test_split_indices(2, 2) 42 | assert train_indices[0] == 237 43 | assert train_indices[-1] == 681 44 | assert test_indices[0] == 583 45 | assert test_indices[-1] == 24 46 | self.assertRaisesRegex( 47 | ValueError, 48 | "Fold 10 not known", 49 | task.get_train_test_split_indices, 50 | 10, 51 | 0, 52 | ) 53 | self.assertRaisesRegex( 54 | ValueError, 55 | "Repeat 10 not known", 56 | task.get_train_test_split_indices, 57 | 0, 58 | 10, 59 | ) 60 | -------------------------------------------------------------------------------- /examples/Basics/introduction_tutorial.py: -------------------------------------------------------------------------------- 1 | # %% [markdown] 2 | # ## Installation 3 | # Installation is done via ``pip``: 4 | # 5 | # ```bash 6 | # pip install openml 7 | # ``` 8 | 9 | # %% [markdown] 10 | # ## Authentication 11 | # 12 | # For certain functionality, such as uploading tasks or datasets, users have to 13 | # sign up. Only accessing the data on OpenML does not require an account! 14 | # 15 | # If you don’t have an account yet, sign up now. 16 | # You will receive an API key, which will authenticate you to the server 17 | # and allow you to download and upload datasets, tasks, runs and flows. 18 | # 19 | # * Create an OpenML account (free) on https://www.openml.org. 20 | # * After logging in, open your account page (avatar on the top right) 21 | # * Open 'Account Settings', then 'API authentication' to find your API key. 22 | # 23 | # There are two ways to permanently authenticate: 24 | # 25 | # * Use the ``openml`` CLI tool with ``openml configure apikey MYKEY``, 26 | # replacing **MYKEY** with your API key. 
27 | # * Create a plain text file **~/.openml/config** with the line 28 | # **'apikey=MYKEY'**, replacing **MYKEY** with your API key. The config 29 | # file must be at ~/.openml/config and exist prior to 30 | # importing the openml module. 31 | # 32 | # Alternatively, by running the code below and replacing 'YOURKEY' with your API key, 33 | # you authenticate for the duration of the Python process. 34 | 35 | # %% 36 | import openml 37 | 38 | openml.config.apikey = "YOURKEY" 39 | 40 | # %% [markdown] 41 | # ## Caching 42 | # When downloading datasets, tasks, runs and flows, they will be cached to 43 | # retrieve them without calling the server later. As with the API key, 44 | # the cache directory can be specified either through the config file or 45 | # through the API: 46 | # 47 | # * Add the line **cachedir = 'MYDIR'** to the config file, replacing 48 | # 'MYDIR' with the path to the cache directory. By default, OpenML 49 | # will use **~/.openml/cache** as the cache directory. 50 | # * Run the code below, replacing 'YOURDIR' with the path to the cache directory. 51 | 52 | # %% 53 | import openml 54 | 55 | openml.config.set_root_cache_directory("YOURDIR") -------------------------------------------------------------------------------- /examples/Basics/simple_suites_tutorial.py: -------------------------------------------------------------------------------- 1 | # %% [markdown] 2 | # This is a brief showcase of OpenML benchmark suites, which were introduced by 3 | # [Bischl et al. (2019)](https://arxiv.org/abs/1708.03731v2). Benchmark suites standardize the 4 | # datasets and splits to be used in an experiment or paper. They are fully integrated into OpenML 5 | # and simplify both the sharing of the setup and the results. 6 | 7 | # %% 8 | import openml 9 | 10 | # %% [markdown] 11 | # ## OpenML-CC18 12 | # 13 | # As an example we have a look at the OpenML-CC18, which is a suite of 72 classification datasets 14 | # from OpenML which were carefully selected to be usable by many algorithms. These are all datasets 15 | # from mid-2018 that satisfy a large set of clear requirements for thorough yet practical benchmarking: 16 | # 17 | # 1. the number of observations is between 500 and 100,000, to focus on medium-sized datasets, 18 | # 2. the number of features does not exceed 5,000 to keep the runtime of the algorithms 19 | # low 20 | # 3. the target attribute has at least two classes with no class having fewer than 20 observations 21 | # 4. the ratio of the minority class and the majority class is above 0.05 (to eliminate highly 22 | # imbalanced datasets which require special treatment for both algorithms and evaluation 23 | # measures). 24 | # 25 | # A full description can be found in the 26 | # [OpenML benchmarking docs](https://docs.openml.org/benchmark/#openml-cc18). 27 | # 28 | # In this example, we'll focus on how to use benchmark suites in practice. 29 | 30 | # %% [markdown] 31 | # ## Downloading benchmark suites 32 | 33 | # %% 34 | suite = openml.study.get_suite(99) 35 | print(suite) 36 | 37 | # %% [markdown] 38 | # The benchmark suite does not download the included tasks and datasets itself, but only contains 39 | # a list of which tasks constitute the study. 40 | # 41 | # Tasks can then be accessed via 42 | 43 | # %% 44 | tasks = suite.tasks 45 | print(tasks) 46 | 47 | # %% [markdown] 48 | # and iterated over for benchmarking. 
For speed reasons, we only iterate over the first three tasks: 49 | 50 | # %% 51 | for task_id in tasks[:3]: 52 | task = openml.tasks.get_task(task_id) 53 | print(task) 54 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software in a publication, please cite the metadata from preferred-citation." 3 | preferred-citation: 4 | type: article 5 | authors: 6 | - family-names: "Feurer" 7 | given-names: "Matthias" 8 | orcid: "https://orcid.org/0000-0001-9611-8588" 9 | - family-names: "van Rijn" 10 | given-names: "Jan N." 11 | orcid: "https://orcid.org/0000-0003-2898-2168" 12 | - family-names: "Kadra" 13 | given-names: "Arlind" 14 | - family-names: "Gijsbers" 15 | given-names: "Pieter" 16 | orcid: "https://orcid.org/0000-0001-7346-8075" 17 | - family-names: "Mallik" 18 | given-names: "Neeratyoy" 19 | orcid: "https://orcid.org/0000-0002-0598-1608" 20 | - family-names: "Ravi" 21 | given-names: "Sahithya" 22 | - family-names: "Müller" 23 | given-names: "Andreas" 24 | orcid: "https://orcid.org/0000-0002-2349-9428" 25 | - family-names: "Vanschoren" 26 | given-names: "Joaquin" 27 | orcid: "https://orcid.org/0000-0001-7044-9805" 28 | - family-names: "Hutter" 29 | given-names: "Frank" 30 | orcid: "https://orcid.org/0000-0002-2037-3694" 31 | journal: "Journal of Machine Learning Research" 32 | title: "OpenML-Python: an extensible Python API for OpenML" 33 | abstract: "OpenML is an online platform for open science collaboration in machine learning, used to share datasets and results of machine learning experiments. In this paper, we introduce OpenML-Python, a client API for Python, which opens up the OpenML platform for a wide range of Python-based machine learning tools. It provides easy access to all datasets, tasks and experiments on OpenML from within Python. It also provides functionality to conduct machine learning experiments, upload the results to OpenML, and reproduce results which are stored on OpenML. Furthermore, it comes with a scikit-learn extension and an extension mechanism to easily integrate other machine learning libraries written in Python into the OpenML ecosystem. Source code and documentation are available at https://github.com/openml/openml-python/." 34 | volume: 22 35 | year: 2021 36 | start: 1 37 | end: 5 38 | pages: 5 39 | number: 100 40 | url: https://jmlr.org/papers/v22/19-920.html 41 | -------------------------------------------------------------------------------- /openml/exceptions.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | from __future__ import annotations 3 | 4 | 5 | class PyOpenMLError(Exception): 6 | """Base class for all exceptions in OpenML-Python.""" 7 | 8 | def __init__(self, message: str): 9 | self.message = message 10 | super().__init__(message) 11 | 12 | 13 | class OpenMLServerError(PyOpenMLError): 14 | """class for when something is really wrong on the server 15 | (result did not parse to dict), contains unparsed error. 16 | """ 17 | 18 | 19 | class OpenMLServerException(OpenMLServerError): # noqa: N818 20 | """exception for when the result of the server was 21 | not 200 (e.g., listing call w/o results). 
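Carries the server-assigned error ``code`` and the ``url`` of the failed
call, when available.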
22 | """ 23 | 24 | # Code needs to be optional to allow the exception to be picklable: 25 | # https://stackoverflow.com/questions/16244923/how-to-make-a-custom-exception-class-with-multiple-init-args-pickleable # noqa: E501 26 | def __init__(self, message: str, code: int | None = None, url: str | None = None): 27 | self.message = message 28 | self.code = code 29 | self.url = url 30 | super().__init__(message) 31 | 32 | def __str__(self) -> str: 33 | return f"{self.url} returned code {self.code}: {self.message}" 34 | 35 | 36 | class OpenMLServerNoResult(OpenMLServerException): 37 | """Exception for when the result of the server is empty.""" 38 | 39 | 40 | class OpenMLCacheException(PyOpenMLError): # noqa: N818 41 | """Dataset / task etc not found in cache""" 42 | 43 | 44 | class OpenMLHashException(PyOpenMLError): # noqa: N818 45 | """Locally computed hash is different than hash announced by the server.""" 46 | 47 | 48 | class OpenMLPrivateDatasetError(PyOpenMLError): 49 | """Exception thrown when the user has no rights to access the dataset.""" 50 | 51 | 52 | class OpenMLRunsExistError(PyOpenMLError): 53 | """Indicates run(s) already exists on the server when they should not be duplicated.""" 54 | 55 | def __init__(self, run_ids: set[int], message: str) -> None: 56 | if len(run_ids) < 1: 57 | raise ValueError("Set of run ids must be non-empty.") 58 | self.run_ids = run_ids 59 | super().__init__(message) 60 | 61 | 62 | class OpenMLNotAuthorizedError(OpenMLServerError): 63 | """Indicates an authenticated user is not authorized to execute the requested action.""" 64 | 65 | 66 | class ObjectNotPublishedError(PyOpenMLError): 67 | """Indicates an object has not been published yet.""" 68 | -------------------------------------------------------------------------------- /examples/Advanced/configure_logging.py: -------------------------------------------------------------------------------- 1 | # %% [markdown] 2 | # This tutorial explains openml-python logging, and shows how to configure it. 3 | # Openml-python uses the [Python logging module](https://docs.python.org/3/library/logging.html) 4 | # to provide users with log messages. Each log message is assigned a level of importance, see 5 | # the table in Python's logging tutorial 6 | # [here](https://docs.python.org/3/howto/logging.html#when-to-use-logging). 7 | # 8 | # By default, openml-python will print log messages of level `WARNING` and above to console. 9 | # All log messages (including `DEBUG` and `INFO`) are also saved in a file, which can be 10 | # found in your cache directory (see also the 11 | # [introduction tutorial](../Basics/introduction_tutorial). 12 | # These file logs are automatically deleted if needed, and use at most 2MB of space. 13 | # 14 | # It is possible to configure what log levels to send to console and file. 15 | # When downloading a dataset from OpenML, a `DEBUG`-level message is written: 16 | 17 | # %% 18 | import openml 19 | 20 | openml.datasets.get_dataset("iris", version=1) 21 | 22 | # %% [markdown] 23 | # With default configuration, the above example will show no output to console. 24 | # However, in your cache directory you should find a file named 'openml_python.log', 25 | # which has a DEBUG message written to it. It should be either like 26 | # "[DEBUG] [10:46:19:openml.datasets.dataset] Saved dataset 61: iris to file ..." 27 | # or like 28 | # "[DEBUG] [10:49:38:openml.datasets.dataset] Data pickle file already exists and is up to date." 29 | # , depending on whether or not you had downloaded iris before. 
30 | # The log levels that are processed can be configured programmatically: 31 | 32 | # %% 33 | import logging 34 | 35 | openml.config.set_console_log_level(logging.DEBUG) 36 | openml.config.set_file_log_level(logging.WARNING) 37 | openml.datasets.get_dataset("iris", version=1) 38 | 39 | # %% [markdown] 40 | # Now the log message that was previously only written to file should also be shown in the console. 41 | # The message is no longer written to the log file, as the file log level was set to `WARNING`. 42 | # 43 | # It is also possible to specify the desired log levels through the configuration file. 44 | # This way you will not need to set them in each script separately. 45 | # Add the line **verbosity = NUMBER** and/or **file_verbosity = NUMBER** to the config file, 46 | # where 'NUMBER' should be one of: 47 | # 48 | # * 0: `logging.WARNING` and up. 49 | # * 1: `logging.INFO` and up. 50 | # * 2: `logging.DEBUG` and up (i.e. all messages). 51 | -------------------------------------------------------------------------------- /examples/Advanced/suites_tutorial.py: -------------------------------------------------------------------------------- 1 | # %% [markdown] 2 | # How to list, download and upload benchmark suites. 3 | 4 | # %% 5 | import uuid 6 | 7 | import numpy as np 8 | 9 | import openml 10 | 11 | # %% [markdown] 12 | # ## Listing suites 13 | # 14 | # * ``list_suites`` returns the available suites as a ``pandas.DataFrame``, 15 | # an easy-to-work-with data structure. 16 | # * Passing ``status="all"`` lists suites of any status, not only active ones. 17 | 18 | # %% 19 | suites = openml.study.list_suites(status="all") 20 | print(suites.head(n=10)) 21 | 22 | # %% [markdown] 23 | # ## Downloading suites 24 | # This is done based on the suite ID. 25 | 26 | # %% 27 | suite = openml.study.get_suite(99) 28 | print(suite) 29 | 30 | # %% [markdown] 31 | # Suites also feature a description: 32 | 33 | # %% 34 | print(suite.description) 35 | 36 | # %% [markdown] 37 | # Suites are a container for tasks: 38 | 39 | # %% 40 | print(suite.tasks) 41 | 42 | # %% [markdown] 43 | # And we can use the task listing functionality to learn more about them: 44 | 45 | # %% 46 | tasks = openml.tasks.list_tasks() 47 | 48 | # %% [markdown] 49 | # Using ``@`` in 50 | # [pd.DataFrame.query](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html) 51 | # accesses variables outside of the current dataframe. 52 | 53 | # %% 54 | tasks = tasks.query("tid in @suite.tasks") 55 | print(tasks.describe().transpose()) 56 | 57 | # %% [markdown] 58 | # We'll use the test server for the rest of this tutorial. 59 | 60 | # %% 61 | openml.config.start_using_configuration_for_example() 62 | 63 | # %% [markdown] 64 | # ## Uploading suites 65 | # 66 | # Uploading suites is as simple as uploading any other kind of OpenML 67 | # entity; the only reason we need this much code in this example is 68 | # that we upload some random data. 69 | 70 | # We'll take a random subset of 20 tasks from all available tasks on 71 | # the test server: 72 | 73 | # %% 74 | all_tasks = list(openml.tasks.list_tasks()["tid"]) 75 | task_ids_for_suite = sorted(np.random.choice(all_tasks, replace=False, size=20)) 76 | 77 | # The study needs a machine-readable and unique alias. To obtain this, 78 | # we simply generate a random uuid.
79 | 80 | alias = uuid.uuid4().hex 81 | 82 | new_suite = openml.study.create_benchmark_suite( 83 | name="Test-Suite", 84 | description="Test suite for the Python tutorial on benchmark suites", 85 | task_ids=task_ids_for_suite, 86 | alias=alias, 87 | ) 88 | new_suite.publish() 89 | print(new_suite) 90 | 91 | # %% 92 | openml.config.stop_using_configuration_for_example() 93 | -------------------------------------------------------------------------------- /tests/test_tasks/test_clustering_task.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | from __future__ import annotations 3 | 4 | import pytest 5 | 6 | import openml 7 | from openml.exceptions import OpenMLServerException 8 | from openml.tasks import TaskType 9 | from openml.testing import TestBase 10 | 11 | from .test_task import OpenMLTaskTest 12 | 13 | 14 | class OpenMLClusteringTaskTest(OpenMLTaskTest): 15 | __test__ = True 16 | 17 | def setUp(self, n_levels: int = 1): 18 | super().setUp() 19 | self.task_id = 146714 20 | self.task_type = TaskType.CLUSTERING 21 | self.estimation_procedure = 17 22 | 23 | @pytest.mark.production() 24 | def test_get_dataset(self): 25 | # no clustering tasks on test server 26 | self.use_production_server() 27 | task = openml.tasks.get_task(self.task_id) 28 | task.get_dataset() 29 | 30 | @pytest.mark.production() 31 | def test_download_task(self): 32 | # no clustering tasks on test server 33 | self.use_production_server() 34 | task = super().test_download_task() 35 | assert task.task_id == self.task_id 36 | assert task.task_type_id == TaskType.CLUSTERING 37 | assert task.dataset_id == 36 38 | 39 | def test_upload_task(self): 40 | compatible_datasets = self._get_compatible_rand_dataset() 41 | for i in range(100): 42 | try: 43 | dataset_id = compatible_datasets[i % len(compatible_datasets)] 44 | # Upload a clustering task without a ground truth. 45 | task = openml.tasks.create_task( 46 | task_type=self.task_type, 47 | dataset_id=dataset_id, 48 | estimation_procedure_id=self.estimation_procedure, 49 | ) 50 | task = task.publish() 51 | TestBase._mark_entity_for_removal("task", task.id) 52 | TestBase.logger.info( 53 | f"collected from {__file__.split('/')[-1]}: {task.id}", 54 | ) 55 | # success 56 | break 57 | except OpenMLServerException as e: 58 | # Error code for 'task already exists' 59 | # Should be 533 according to the docs 60 | # (# https://www.openml.org/api_docs#!/task/post_task) 61 | if e.code == 614: 62 | continue 63 | else: 64 | raise e 65 | else: 66 | raise ValueError( 67 | f"Could not create a valid task for task type ID {self.task_type}", 68 | ) 69 | -------------------------------------------------------------------------------- /examples/_external_or_deprecated/plot_svm_hyperparameters_tutorial.py: -------------------------------------------------------------------------------- 1 | # %% [markdown] 2 | # # Plotting hyperparameter surfaces 3 | 4 | # %% 5 | import openml 6 | import numpy as np 7 | 8 | # %% [markdown] 9 | # # First step - obtaining the data 10 | # First, we need to choose an SVM flow, for example 8353, and a task. Finding their IDs is 11 | # not part of this tutorial; this could, for example, be done via the website. 12 | # 13 | # For this we use the function ``list_evaluations_setups``, which can automatically join 14 | # evaluations conducted by the server with the hyperparameter settings extracted from the 15 | # uploaded runs (called *setup*).
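# A *setup* is a flow combined with a concrete assignment of its hyperparameters.
# As a side note, a single setup can also be inspected directly. A minimal sketch
# (the ``setup_id`` below is a placeholder, not an ID used in this tutorial):
#
#     setup = openml.setups.get_setup(setup_id)
#     print(setup.parameters)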
16 | 17 | # %% 18 | df = openml.evaluations.list_evaluations_setups( 19 | function="predictive_accuracy", 20 | flows=[8353], 21 | tasks=[6], 22 | # Using this flag incorporates the hyperparameters into the returned dataframe. Otherwise, 23 | # the dataframe would contain a field ``paramaters`` containing an unparsed dictionary. 24 | parameters_in_separate_columns=True, 25 | ) 26 | print(df.head(n=10)) 27 | 28 | # %% [markdown] 29 | # We can see all the hyperparameter names in the columns of the dataframe: 30 | 31 | # %% 32 | for name in df.columns: 33 | print(name) 34 | 35 | # %% [markdown] 36 | # Next, we cast and transform the hyperparameters of interest (``C`` and ``gamma``) so that we 37 | # can nicely plot them. 38 | 39 | # %% 40 | hyperparameters = ["sklearn.svm.classes.SVC(16)_C", "sklearn.svm.classes.SVC(16)_gamma"] 41 | df[hyperparameters] = df[hyperparameters].astype(float).apply(np.log10) 42 | 43 | # %% [markdown] 44 | # ## Option 1 - plotting via the pandas helper functions 45 | 46 | # %% 47 | df.plot.hexbin( 48 | x="sklearn.svm.classes.SVC(16)_C", 49 | y="sklearn.svm.classes.SVC(16)_gamma", 50 | C="value", 51 | reduce_C_function=np.mean, 52 | gridsize=25, 53 | title="SVM performance landscape", 54 | ) 55 | 56 | # %% [markdown] 57 | # ## Option 2 - plotting via matplotlib 58 | 59 | # %% 60 | import matplotlib.pyplot as plt 61 | 62 | fig, ax = plt.subplots() 63 | 64 | C = df["sklearn.svm.classes.SVC(16)_C"] 65 | gamma = df["sklearn.svm.classes.SVC(16)_gamma"] 66 | score = df["value"] 67 | 68 | # Plotting all evaluations: 69 | ax.plot(C, gamma, "ko", ms=1) 70 | # Create a contour plot 71 | cntr = ax.tricontourf(C, gamma, score, levels=12, cmap="RdBu_r") 72 | # Adjusting the colorbar 73 | fig.colorbar(cntr, ax=ax, label="accuracy") 74 | # Adjusting the axis limits 75 | ax.set( 76 | xlim=(min(C), max(C)), 77 | ylim=(min(gamma), max(gamma)), 78 | xlabel="C (log10)", 79 | ylabel="gamma (log10)", 80 | ) 81 | ax.set_title("SVM performance landscape") 82 | # License: BSD 3-Clause 83 | -------------------------------------------------------------------------------- /tests/test_tasks/test_regression_task.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | from __future__ import annotations 3 | 4 | import ast 5 | 6 | import pandas as pd 7 | 8 | import openml 9 | from openml.exceptions import OpenMLServerException 10 | from openml.tasks import TaskType 11 | from openml.testing import TestBase, check_task_existence 12 | 13 | from .test_supervised_task import OpenMLSupervisedTaskTest 14 | 15 | 16 | class OpenMLRegressionTaskTest(OpenMLSupervisedTaskTest): 17 | __test__ = True 18 | 19 | def setUp(self, n_levels: int = 1): 20 | super().setUp() 21 | self.estimation_procedure = 9 22 | task_meta_data = { 23 | "task_type": TaskType.SUPERVISED_REGRESSION, 24 | "dataset_id": 105, # wisconsin 25 | "estimation_procedure_id": self.estimation_procedure, # non default value to test estimation procedure id 26 | "target_name": "time", 27 | } 28 | _task_id = check_task_existence(**task_meta_data) 29 | if _task_id is not None: 30 | task_id = _task_id 31 | else: 32 | new_task = openml.tasks.create_task(**task_meta_data) 33 | # publishes the new task 34 | try: 35 | new_task = new_task.publish() 36 | task_id = new_task.task_id 37 | # mark to remove the uploaded task 38 | TestBase._mark_entity_for_removal("task", task_id) 39 | TestBase.logger.info(f"collected from test_run_functions: {task_id}") 40 | except OpenMLServerException as e: 41 | if e.code 
== 614: # Task already exists 42 | # the exception message contains the task_id that was matched in the format 43 | # 'Task already exists. - matched id(s): [xxxx]' 44 | task_id = ast.literal_eval(e.message.split("matched id(s):")[-1].strip())[0] 45 | else: 46 | raise Exception(repr(e)) 47 | self.task_id = task_id 48 | self.task_type = TaskType.SUPERVISED_REGRESSION 49 | 50 | 51 | def test_get_X_and_Y(self): 52 | X, Y = super().test_get_X_and_Y() 53 | assert X.shape == (194, 32) 54 | assert isinstance(X, pd.DataFrame) 55 | assert Y.shape == (194,) 56 | assert isinstance(Y, pd.Series) 57 | assert pd.api.types.is_numeric_dtype(Y) 58 | 59 | def test_download_task(self): 60 | task = super().test_download_task() 61 | assert task.task_id == self.task_id 62 | assert task.task_type_id == TaskType.SUPERVISED_REGRESSION 63 | assert task.dataset_id == 105 64 | assert task.estimation_procedure_id == self.estimation_procedure 65 | -------------------------------------------------------------------------------- /tests/files/org/openml/test/datasets/-1/qualities.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | DefaultAccuracy 5 | 0.5 6 | 7 | 8 | Dimensionality 9 | 33.335 10 | 11 | 12 | MajorityClassPercentage 13 | 50.0 14 | 15 | 16 | MajorityClassSize 17 | 300.0 18 | 19 | 20 | MinorityClassPerentage 21 | 50.0 22 | 23 | 24 | MinorityClassSize 25 | 300.0 26 | 27 | 28 | NumberOfBinaryFeatures 29 | 1.0 30 | 31 | 32 | NumberOfClasses 33 | 2.0 34 | 35 | 36 | NumberOfFeatures 37 | 20001.0 38 | 39 | 40 | NumberOfInstances 41 | 600.0 42 | 43 | 44 | NumberOfInstancesWithMissingValues 45 | 0.0 46 | 47 | 48 | NumberOfMissingValues 49 | 0.0 50 | 51 | 52 | NumberOfNumericFeatures 53 | 20000.0 54 | 55 | 56 | NumberOfSymbolicFeatures 57 | 1.0 58 | 59 | 60 | PercentageOfBinaryFeatures 61 | 0.004999750012499375 62 | 63 | 64 | PercentageOfInstancesWithMissingValues 65 | 0.0 66 | 67 | 68 | PercentageOfMissingValues 69 | 0.0 70 | 71 | 72 | PercentageOfNumericFeatures 73 | 99.9950002499875 74 | 75 | 76 | PercentageOfSymbolicFeatures 77 | 0.004999750012499375 78 | 79 | 80 | 81 | -------------------------------------------------------------------------------- /tests/files/mock_responses/datasets/data_description_61.xml: -------------------------------------------------------------------------------- 1 | 2 | 61 3 | iris 4 | 1 5 | **Author**: R.A. Fisher 6 | **Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/Iris) - 1936 - Donated by Michael Marshall 7 | **Please cite**: 8 | 9 | **Iris Plants Database** 10 | This is perhaps the best known database to be found in the pattern recognition literature. Fisher's paper is a classic in the field and is referenced frequently to this day. (See Duda & Hart, for example.) The data set contains 3 classes of 50 instances each, where each class refers to a type of iris plant. One class is linearly separable from the other 2; the latter are NOT linearly separable from each other. 11 | 12 | Predicted attribute: class of iris plant. 13 | This is an exceedingly simple domain. 14 | 15 | ### Attribute Information: 16 | 1. sepal length in cm 17 | 2. sepal width in cm 18 | 3. petal length in cm 19 | 4. petal width in cm 20 | 5. class: 21 | -- Iris Setosa 22 | -- Iris Versicolour 23 | -- Iris Virginica 24 | 4 25 | ARFF 26 | R.A. 
Fisher 1936 2014-04-06T23:23:39 27 | English Public https://api.openml.org/data/v1/download/61/iris.arff 28 | https://data.openml.org/datasets/0000/0061/dataset_61.pq 61 class 1 https://archive.ics.uci.edu/ml/citation_policy.html BotanyEcologyKaggleMachine Learningstudy_1study_25study_4study_41study_50study_52study_7study_86study_88study_89uci public https://archive.ics.uci.edu/ml/datasets/Iris http://digital.library.adelaide.edu.au/dspace/handle/2440/15227 https://data.openml.org/datasets/0000/0061/dataset_61.pq active 29 | 2020-11-20 19:02:18 ad484452702105cbf3d30f8deaba39a9 30 | 31 | -------------------------------------------------------------------------------- /openml/datasets/data_feature.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | from __future__ import annotations 3 | 4 | from typing import TYPE_CHECKING, Any, ClassVar, Sequence 5 | 6 | if TYPE_CHECKING: 7 | from IPython.lib import pretty 8 | 9 | 10 | class OpenMLDataFeature: 11 | """ 12 | Data Feature (a.k.a. Attribute) object. 13 | 14 | Parameters 15 | ---------- 16 | index : int 17 | The index of this feature 18 | name : str 19 | Name of the feature 20 | data_type : str 21 | can be nominal, numeric, string, date (corresponds to arff) 22 | nominal_values : list(str) 23 | list of the possible values, in case of nominal attribute 24 | number_missing_values : int 25 | Number of rows that have a missing value for this feature. 26 | ontologies : list(str) 27 | list of ontologies attached to this feature. An ontology describes the 28 | concept that are described in a feature. An ontology is defined by an 29 | URL where the information is provided. 30 | """ 31 | 32 | LEGAL_DATA_TYPES: ClassVar[Sequence[str]] = ["nominal", "numeric", "string", "date"] 33 | 34 | def __init__( # noqa: PLR0913 35 | self, 36 | index: int, 37 | name: str, 38 | data_type: str, 39 | nominal_values: list[str], 40 | number_missing_values: int, 41 | ontologies: list[str] | None = None, 42 | ): 43 | if not isinstance(index, int): 44 | raise TypeError(f"Index must be `int` but is {type(index)}") 45 | 46 | if data_type not in self.LEGAL_DATA_TYPES: 47 | raise ValueError( 48 | f"data type should be in {self.LEGAL_DATA_TYPES!s}, found: {data_type}", 49 | ) 50 | 51 | if data_type == "nominal": 52 | if nominal_values is None: 53 | raise TypeError( 54 | "Dataset features require attribute `nominal_values` for nominal " 55 | "feature type.", 56 | ) 57 | 58 | if not isinstance(nominal_values, list): 59 | raise TypeError( 60 | "Argument `nominal_values` is of wrong datatype, should be list, " 61 | f"but is {type(nominal_values)}", 62 | ) 63 | elif nominal_values is not None: 64 | raise TypeError("Argument `nominal_values` must be None for non-nominal feature.") 65 | 66 | if not isinstance(number_missing_values, int): 67 | msg = f"number_missing_values must be int but is {type(number_missing_values)}" 68 | raise TypeError(msg) 69 | 70 | self.index = index 71 | self.name = str(name) 72 | self.data_type = str(data_type) 73 | self.nominal_values = nominal_values 74 | self.number_missing_values = number_missing_values 75 | self.ontologies = ontologies 76 | 77 | def __repr__(self) -> str: 78 | return "[%d - %s (%s)]" % (self.index, self.name, self.data_type) 79 | 80 | def __eq__(self, other: Any) -> bool: 81 | return isinstance(other, OpenMLDataFeature) and self.__dict__ == other.__dict__ 82 | 83 | def _repr_pretty_(self, pp: pretty.PrettyPrinter, cycle: bool) -> None: # noqa: FBT001, ARG002 84 | 
pp.text(str(self)) 85 | -------------------------------------------------------------------------------- /tests/test_extensions/test_functions.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | from __future__ import annotations 3 | 4 | import inspect 5 | 6 | import pytest 7 | 8 | import openml.testing 9 | from openml.extensions import get_extension_by_flow, get_extension_by_model, register_extension 10 | 11 | 12 | class DummyFlow: 13 | external_version = "DummyFlow==0.1" 14 | name = "Dummy Flow" 15 | flow_id = 1 16 | dependencies = None 17 | 18 | 19 | class DummyModel: 20 | pass 21 | 22 | 23 | class DummyExtension1: 24 | @staticmethod 25 | def can_handle_flow(flow): 26 | return inspect.stack()[2].filename.endswith("test_functions.py") 27 | 28 | @staticmethod 29 | def can_handle_model(model): 30 | return inspect.stack()[2].filename.endswith("test_functions.py") 31 | 32 | 33 | class DummyExtension2: 34 | @staticmethod 35 | def can_handle_flow(flow): 36 | return False 37 | 38 | @staticmethod 39 | def can_handle_model(model): 40 | return False 41 | 42 | 43 | def _unregister(): 44 | # "Un-register" the test extensions 45 | while True: 46 | rem_dum_ext1 = False 47 | rem_dum_ext2 = False 48 | try: 49 | openml.extensions.extensions.remove(DummyExtension1) 50 | rem_dum_ext1 = True 51 | except ValueError: 52 | pass 53 | try: 54 | openml.extensions.extensions.remove(DummyExtension2) 55 | rem_dum_ext2 = True 56 | except ValueError: 57 | pass 58 | if not rem_dum_ext1 and not rem_dum_ext2: 59 | break 60 | 61 | 62 | class TestInit(openml.testing.TestBase): 63 | def setUp(self): 64 | super().setUp() 65 | _unregister() 66 | 67 | def test_get_extension_by_flow(self): 68 | assert get_extension_by_flow(DummyFlow()) is None 69 | with pytest.raises(ValueError, match="No extension registered which can handle flow:"): 70 | get_extension_by_flow(DummyFlow(), raise_if_no_extension=True) 71 | register_extension(DummyExtension1) 72 | assert isinstance(get_extension_by_flow(DummyFlow()), DummyExtension1) 73 | register_extension(DummyExtension2) 74 | assert isinstance(get_extension_by_flow(DummyFlow()), DummyExtension1) 75 | register_extension(DummyExtension1) 76 | with pytest.raises( 77 | ValueError, match="Multiple extensions registered which can handle flow:" 78 | ): 79 | get_extension_by_flow(DummyFlow()) 80 | 81 | def test_get_extension_by_model(self): 82 | assert get_extension_by_model(DummyModel()) is None 83 | with pytest.raises(ValueError, match="No extension registered which can handle model:"): 84 | get_extension_by_model(DummyModel(), raise_if_no_extension=True) 85 | register_extension(DummyExtension1) 86 | assert isinstance(get_extension_by_model(DummyModel()), DummyExtension1) 87 | register_extension(DummyExtension2) 88 | assert isinstance(get_extension_by_model(DummyModel()), DummyExtension1) 89 | register_extension(DummyExtension1) 90 | with pytest.raises( 91 | ValueError, match="Multiple extensions registered which can handle model:" 92 | ): 93 | get_extension_by_model(DummyModel()) 94 | -------------------------------------------------------------------------------- /examples/_external_or_deprecated/flow_id_tutorial.py: -------------------------------------------------------------------------------- 1 | # %% [markdown] 2 | # # Obtaining Flow IDs 3 | # This tutorial discusses different ways to obtain the ID of a flow in order to perform further 4 | # analysis. 
5 | 6 | 7 | # %% 8 | import sklearn.tree 9 | 10 | import openml 11 | 12 | 13 | # %% [markdown] 14 | # .. warning:: 15 | # .. include:: ../../test_server_usage_warning.txt 16 | 17 | # %% 18 | openml.config.start_using_configuration_for_example() 19 | openml.config.server = "https://api.openml.org/api/v1/xml" 20 | 21 | # %% 22 | # Defining a classifier 23 | clf = sklearn.tree.DecisionTreeClassifier() 24 | 25 | # %% [markdown] 26 | # ## 1. Obtaining a flow given a classifier 27 | 28 | # %% 29 | flow = openml.extensions.get_extension_by_model(clf).model_to_flow(clf).publish() 30 | flow_id = flow.flow_id 31 | print(flow_id) 32 | 33 | # %% [markdown] 34 | # This piece of code is rather involved. First, it retrieves a 35 | # :class:`~openml.extensions.Extension` which is registered and can handle the given model, 36 | # in our case it is :class:`openml.extensions.sklearn.SklearnExtension`. Second, the extension 37 | # converts the classifier into an instance of :class:`openml.OpenMLFlow`. Third and finally, 38 | # the publish method checks whether the current flow is already present on OpenML. If not, 39 | # it uploads the flow, otherwise, it updates the current instance with all information computed 40 | # by the server (which is obviously also done when uploading/publishing a flow). 41 | # 42 | # To simplify the usage we have created a helper function which automates all these steps: 43 | 44 | # %% 45 | flow_id = openml.flows.get_flow_id(model=clf) 46 | print(flow_id) 47 | 48 | # %% [markdown] 49 | # ## 2. Obtaining a flow given its name 50 | # The schema of a flow is given in XSD ( 51 | # [here](https://github.com/openml/OpenML/blob/master/openml_OS/views/pages/api_new/v1/xsd/openml.implementation.upload.xsd)). # noqa E501 52 | # Only two fields are required, a unique name, and an external version. While it should be pretty 53 | # obvious why we need a name, the need for the additional external version information might not 54 | # be immediately clear. However, this information is very important as it allows to have multiple 55 | # flows with the same name for different versions of a software. This might be necessary if an 56 | # algorithm or implementation introduces, renames or drop hyperparameters over time. 57 | 58 | # %% 59 | print(flow.name, flow.external_version) 60 | 61 | # %% [markdown] 62 | # The name and external version are automatically added to a flow when constructing it from a 63 | # model. We can then use them to retrieve the flow id as follows: 64 | 65 | # %% 66 | flow_id = openml.flows.flow_exists(name=flow.name, external_version=flow.external_version) 67 | print(flow_id) 68 | 69 | # %% [markdown] 70 | # We can also retrieve all flows for a given name: 71 | 72 | # %% 73 | flow_ids = openml.flows.get_flow_id(name=flow.name) 74 | print(flow_ids) 75 | 76 | # %% [markdown] 77 | # This also works with the actual model (generalizing the first part of this example): 78 | 79 | # %% 80 | flow_ids = openml.flows.get_flow_id(model=clf, exact_version=False) 81 | print(flow_ids) 82 | 83 | # %% 84 | # Deactivating test configuration 85 | openml.config.stop_using_configuration_for_example() 86 | # License: BSD 3-Clause 87 | -------------------------------------------------------------------------------- /docker/startup.sh: -------------------------------------------------------------------------------- 1 | # Entry script to switch between the different Docker functionalities. 
2 | # By default, execute Python with OpenML pre-installed 3 | # 4 | # Entry script to allow docker to be ran for bash, tests and docs. 5 | # The script assumes a code repository can be mounted to ``/code`` and an output directory to ``/output``. 6 | # Executes ``mode`` on ``branch`` or the provided ``code`` directory. 7 | # $1: Mode, optional. Options: 8 | # - test: execute unit tests 9 | # - doc: build documentation, requires a mounted ``output`` directory if built from a branch. 10 | # - if not provided: execute bash. 11 | # $2: Branch, optional. 12 | # Mutually exclusive with mounting a ``code`` directory. 13 | # Can be a branch on a Github fork, specified with the USERNAME#BRANCH format. 14 | # The test or doc build is executed on this branch. 15 | 16 | if [[ ! ( $1 = "doc" || $1 = "test" ) ]]; then 17 | cd openml 18 | source venv/bin/activate 19 | python "$@" 20 | exit 0 21 | fi 22 | 23 | # doc and test modes require mounted directories and/or specified branches 24 | if ! [ -d "/code" ] && [ -z "$2" ]; then 25 | echo "To perform $1 a code repository must be mounted to '/code' or a branch must be specified." >> /dev/stderr 26 | exit 1 27 | fi 28 | if [ -d "/code" ] && [ -n "$2" ]; then 29 | # We want to avoid switching the git environment from within the docker container 30 | echo "You can not specify a branch for a mounted code repository." >> /dev/stderr 31 | exit 1 32 | fi 33 | if [ "$1" == "doc" ] && [ -n "$2" ] && ! [ -d "/output" ]; then 34 | echo "To build docs from an online repository, you need to mount an output directory." >> /dev/stderr 35 | exit 1 36 | fi 37 | 38 | if [ -n "$2" ]; then 39 | # if a branch is provided, we will pull it into the `openml` local repository that was created with the image. 40 | cd openml 41 | if [[ $2 == *#* ]]; then 42 | # If a branch is specified on a fork (with NAME#BRANCH format), we have to construct the url before pulling 43 | # We add a trailing '#' delimiter so the second element doesn't get the trailing newline from <<< 44 | readarray -d '#' -t fork_name_and_branch<<<"$2#" 45 | fork_url="https://github.com/${fork_name_and_branch[0]}/openml-python.git" 46 | fork_branch="${fork_name_and_branch[1]}" 47 | echo git fetch "$fork_url" "$fork_branch":branch_from_fork 48 | git fetch "$fork_url" "$fork_branch":branch_from_fork 49 | branch=branch_from_fork 50 | else 51 | git fetch origin "$2" 52 | branch=$2 53 | fi 54 | if ! git checkout "$branch" ; then 55 | echo "Could not checkout $branch. If the branch lives on a fork, specify it as USER#BRANCH. Make sure to push the branch." >> /dev/stderr 56 | exit 1 57 | fi 58 | git pull 59 | code_dir="/openml" 60 | else 61 | code_dir="/code" 62 | fi 63 | 64 | source /openml/venv/bin/activate 65 | cd $code_dir 66 | # The most recent ``main`` is already installed, but we want to update any outdated dependencies 67 | pip install -e .[test,examples,docs,examples_unix] 68 | 69 | if [ "$1" == "test" ]; then 70 | pytest -n 4 --durations=20 --timeout=600 --timeout-method=thread --dist load -sv 71 | fi 72 | 73 | if [ "$1" == "doc" ]; then 74 | cd doc 75 | make html 76 | make linkcheck 77 | if [ -d "/output" ]; then 78 | cp -r /openml/doc/build /output 79 | fi 80 | fi -------------------------------------------------------------------------------- /openml/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The OpenML module implements a python interface to 3 | `OpenML `_, a collaborative platform for machine 4 | learning. 
OpenML can be used to 5 | 6 | * store, download and analyze datasets 7 | * make experiments and their results (e.g. models, predictions) 8 | accesible and reproducible for everybody 9 | * analyze experiments (uploaded by you and other collaborators) and conduct 10 | meta studies 11 | 12 | In particular, this module implements a python interface for the 13 | `OpenML REST API `_ 14 | (`REST on wikipedia 15 | `_). 16 | """ 17 | 18 | # License: BSD 3-Clause 19 | from __future__ import annotations 20 | 21 | from . import ( 22 | _api_calls, 23 | config, 24 | datasets, 25 | evaluations, 26 | exceptions, 27 | extensions, 28 | flows, 29 | runs, 30 | setups, 31 | study, 32 | tasks, 33 | utils, 34 | ) 35 | from .__version__ import __version__ 36 | from .datasets import OpenMLDataFeature, OpenMLDataset 37 | from .evaluations import OpenMLEvaluation 38 | from .flows import OpenMLFlow 39 | from .runs import OpenMLRun 40 | from .setups import OpenMLParameter, OpenMLSetup 41 | from .study import OpenMLBenchmarkSuite, OpenMLStudy 42 | from .tasks import ( 43 | OpenMLClassificationTask, 44 | OpenMLClusteringTask, 45 | OpenMLLearningCurveTask, 46 | OpenMLRegressionTask, 47 | OpenMLSplit, 48 | OpenMLSupervisedTask, 49 | OpenMLTask, 50 | ) 51 | 52 | 53 | def populate_cache( 54 | task_ids: list[int] | None = None, 55 | dataset_ids: list[int | str] | None = None, 56 | flow_ids: list[int] | None = None, 57 | run_ids: list[int] | None = None, 58 | ) -> None: 59 | """ 60 | Populate a cache for offline and parallel usage of the OpenML connector. 61 | 62 | Parameters 63 | ---------- 64 | task_ids : iterable 65 | 66 | dataset_ids : iterable 67 | 68 | flow_ids : iterable 69 | 70 | run_ids : iterable 71 | 72 | Returns 73 | ------- 74 | None 75 | """ 76 | if task_ids is not None: 77 | for task_id in task_ids: 78 | tasks.functions.get_task(task_id) 79 | 80 | if dataset_ids is not None: 81 | for dataset_id in dataset_ids: 82 | datasets.functions.get_dataset(dataset_id) 83 | 84 | if flow_ids is not None: 85 | for flow_id in flow_ids: 86 | flows.functions.get_flow(flow_id) 87 | 88 | if run_ids is not None: 89 | for run_id in run_ids: 90 | runs.functions.get_run(run_id) 91 | 92 | 93 | __all__ = [ 94 | "OpenMLDataset", 95 | "OpenMLDataFeature", 96 | "OpenMLRun", 97 | "OpenMLSplit", 98 | "OpenMLEvaluation", 99 | "OpenMLSetup", 100 | "OpenMLParameter", 101 | "OpenMLTask", 102 | "OpenMLSupervisedTask", 103 | "OpenMLClusteringTask", 104 | "OpenMLLearningCurveTask", 105 | "OpenMLRegressionTask", 106 | "OpenMLClassificationTask", 107 | "OpenMLFlow", 108 | "OpenMLStudy", 109 | "OpenMLBenchmarkSuite", 110 | "datasets", 111 | "evaluations", 112 | "exceptions", 113 | "extensions", 114 | "config", 115 | "runs", 116 | "flows", 117 | "tasks", 118 | "setups", 119 | "study", 120 | "utils", 121 | "_api_calls", 122 | "__version__", 123 | ] 124 | -------------------------------------------------------------------------------- /tests/test_runs/test_trace.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | from __future__ import annotations 3 | 4 | import pytest 5 | 6 | from openml.runs import OpenMLRunTrace, OpenMLTraceIteration 7 | from openml.testing import TestBase 8 | 9 | 10 | class TestTrace(TestBase): 11 | def test_get_selected_iteration(self): 12 | trace_iterations = {} 13 | for i in range(5): 14 | for j in range(5): 15 | for k in range(5): 16 | t = OpenMLTraceIteration( 17 | repeat=i, 18 | fold=j, 19 | iteration=5, 20 | setup_string="parameter_%d%d%d" % (i, j, k), 21 | 
evaluation=1.0 * i + 0.1 * j + 0.01 * k, 22 | selected=(i == j and i == k and i == 2), 23 | parameters=None, 24 | ) 25 | trace_iterations[(i, j, k)] = t 26 | 27 | trace = OpenMLRunTrace(-1, trace_iterations=trace_iterations) 28 | # This next one should simply not fail 29 | assert trace.get_selected_iteration(2, 2) == 2 30 | with pytest.raises( 31 | ValueError, match="Could not find the selected iteration for rep/fold 3/3" 32 | ): 33 | trace.get_selected_iteration(3, 3) 34 | 35 | def test_initialization(self): 36 | """Check all different ways to fail the initialization""" 37 | with pytest.raises(ValueError, match="Trace content not available."): 38 | OpenMLRunTrace.generate(attributes="foo", content=None) 39 | with pytest.raises(ValueError, match="Trace attributes not available."): 40 | OpenMLRunTrace.generate(attributes=None, content="foo") 41 | with pytest.raises(ValueError, match="Trace content is empty."): 42 | OpenMLRunTrace.generate(attributes="foo", content=[]) 43 | with pytest.raises(ValueError, match="Trace_attributes and trace_content not compatible:"): 44 | OpenMLRunTrace.generate(attributes=["abc"], content=[[1, 2]]) 45 | 46 | def test_duplicate_name(self): 47 | # Test that the user does not pass a parameter which has the same name 48 | # as one of the required trace attributes 49 | trace_attributes = [ 50 | ("repeat", "NUMERICAL"), 51 | ("fold", "NUMERICAL"), 52 | ("iteration", "NUMERICAL"), 53 | ("evaluation", "NUMERICAL"), 54 | ("selected", ["true", "false"]), 55 | ("repeat", "NUMERICAL"), 56 | ] 57 | trace_content = [[0, 0, 0, 0.5, "true", 1], [0, 0, 0, 0.9, "false", 2]] 58 | with pytest.raises( 59 | ValueError, 60 | match="Either `setup_string` or `parameters` needs to be passed as argument.", 61 | ): 62 | OpenMLRunTrace.generate(trace_attributes, trace_content) 63 | 64 | trace_attributes = [ 65 | ("repeat", "NUMERICAL"), 66 | ("fold", "NUMERICAL"), 67 | ("iteration", "NUMERICAL"), 68 | ("evaluation", "NUMERICAL"), 69 | ("selected", ["true", "false"]), 70 | ("sunshine", "NUMERICAL"), 71 | ] 72 | trace_content = [[0, 0, 0, 0.5, "true", 1], [0, 0, 0, 0.9, "false", 2]] 73 | with pytest.raises( 74 | ValueError, 75 | match="Encountered unknown attribute sunshine that does not start with " 76 | "prefix parameter_", 77 | ): 78 | OpenMLRunTrace.generate(trace_attributes, trace_content) 79 | -------------------------------------------------------------------------------- /docs/details.md: -------------------------------------------------------------------------------- 1 | # Advanced User Guide 2 | 3 | This document highlights some of the more advanced features of 4 | `openml-python`. 5 | 6 | ## Configuration 7 | 8 | The configuration file resides in a directory `.config/openml` in the 9 | home directory of the user and is called config (More specifically, it 10 | resides in the [configuration directory specified by the XDGB Base 11 | Directory 12 | Specification](https://specifications.freedesktop.org/basedir-spec/basedir-spec-latest.html)). 13 | It consists of `key = value` pairs which are separated by newlines. The 14 | following keys are defined: 15 | 16 | - apikey: required to access the server. 17 | - server: the server to connect to (default: `http://www.openml.org`). 18 | For connection to the test server, set this to `test.openml.org`. 19 | - cachedir: the root folder where the cache file directories should be created. 
20 | If not given, will default to `~/.openml/cache` 21 | - avoid_duplicate_runs: if set to `True` (default), when certain functions 22 | are called a lookup is performed to see if there already 23 | exists such a run on the server. If so, download those 24 | results instead. 25 | - retry_policy: Defines how to react when the server is unavailable or 26 | experiencing high load. It determines both how often to 27 | attempt to reconnect and how quickly to do so. Please don't 28 | use `human` in an automated script that you run more than 29 | one instance of, it might increase the time to complete your 30 | jobs and that of others. One of: 31 | - human (default): For people running openml in interactive 32 | fashion. Try only a few times, but in quick succession. 33 | - robot: For people using openml in an automated fashion. Keep 34 | trying to reconnect for a longer time, quickly increasing 35 | the time between retries. 36 | 37 | - connection_n_retries: number of times to retry a request if they fail. 38 | Default depends on retry_policy (5 for `human`, 50 for `robot`) 39 | - verbosity: the level of output: 40 | - 0: normal output 41 | - 1: info output 42 | - 2: debug output 43 | 44 | This file is easily configurable by the `openml` command line interface. 45 | To see where the file is stored, and what its values are, use openml 46 | configure none. 47 | 48 | ## Docker 49 | 50 | It is also possible to try out the latest development version of 51 | `openml-python` with docker: 52 | 53 | ``` bash 54 | docker run -it openml/openml-python 55 | ``` 56 | 57 | See the [openml-python docker 58 | documentation](https://github.com/openml/openml-python/blob/main/docker/readme.md) 59 | for more information. 60 | 61 | ## Key concepts 62 | 63 | OpenML contains several key concepts which it needs to make machine 64 | learning research shareable. A machine learning experiment consists of 65 | one or several **runs**, which describe the performance of an algorithm 66 | (called a **flow** in OpenML), its hyperparameter settings (called a 67 | **setup**) on a **task**. A **Task** is the combination of a 68 | **dataset**, a split and an evaluation metric. In this user guide we 69 | will go through listing and exploring existing **tasks** to actually 70 | running machine learning algorithms on them. In a further user guide we 71 | will examine how to search through **datasets** in order to curate a 72 | list of **tasks**. 73 | 74 | A further explanation is given in the [OpenML user 75 | guide](https://docs.openml.org/concepts/). 76 | 77 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2014-2019, Matthias Feurer, Jan van Rijn, Andreas Müller, 4 | Joaquin Vanschoren and others. 5 | All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright notice, this 11 | list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above copyright notice, 14 | this list of conditions and the following disclaimer in the documentation 15 | and/or other materials provided with the distribution. 
16 | 17 | * Neither the name of the copyright holder nor the names of its 18 | contributors may be used to endorse or promote products derived from 19 | this software without specific prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 25 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 27 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 28 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 29 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | 32 | License of the files CONTRIBUTING.md, ISSUE_TEMPLATE.md and 33 | PULL_REQUEST_TEMPLATE.md: 34 | 35 | Those files are modifications of the respecting templates in scikit-learn and 36 | they are licensed under a New BSD license: 37 | 38 | New BSD License 39 | 40 | Copyright (c) 2007–2018 The scikit-learn developers. 41 | All rights reserved. 42 | 43 | 44 | Redistribution and use in source and binary forms, with or without 45 | modification, are permitted provided that the following conditions are met: 46 | 47 | a. Redistributions of source code must retain the above copyright notice, 48 | this list of conditions and the following disclaimer. 49 | b. Redistributions in binary form must reproduce the above copyright 50 | notice, this list of conditions and the following disclaimer in the 51 | documentation and/or other materials provided with the distribution. 52 | c. Neither the name of the Scikit-learn Developers nor the names of 53 | its contributors may be used to endorse or promote products 54 | derived from this software without specific prior written 55 | permission. 56 | 57 | 58 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 59 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 60 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 61 | ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR 62 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 63 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 64 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 65 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 66 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 67 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 68 | DAMAGE. 
69 | -------------------------------------------------------------------------------- /tests/test_tasks/test_split.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | from __future__ import annotations 3 | 4 | import inspect 5 | import os 6 | from pathlib import Path 7 | 8 | import numpy as np 9 | 10 | from openml import OpenMLSplit 11 | from openml.testing import TestBase 12 | 13 | 14 | class OpenMLSplitTest(TestBase): 15 | # Splitting not helpful, these test's don't rely on the server and take less 16 | # than 5 seconds + rebuilding the test would potentially be costly 17 | 18 | def setUp(self): 19 | __file__ = inspect.getfile(OpenMLSplitTest) 20 | self.directory = os.path.dirname(__file__) 21 | # This is for dataset 22 | self.arff_filepath = ( 23 | Path(self.directory).parent 24 | / "files" 25 | / "org" 26 | / "openml" 27 | / "test" 28 | / "tasks" 29 | / "1882" 30 | / "datasplits.arff" 31 | ) 32 | self.pd_filename = self.arff_filepath.with_suffix(".pkl.py3") 33 | 34 | def tearDown(self): 35 | try: 36 | os.remove(self.pd_filename) 37 | except (OSError, FileNotFoundError): 38 | # Replaced bare except. Not sure why these exceptions are acceptable. 39 | pass 40 | 41 | def test_eq(self): 42 | split = OpenMLSplit._from_arff_file(self.arff_filepath) 43 | assert split == split 44 | 45 | split2 = OpenMLSplit._from_arff_file(self.arff_filepath) 46 | split2.name = "a" 47 | assert split != split2 48 | 49 | split2 = OpenMLSplit._from_arff_file(self.arff_filepath) 50 | split2.description = "a" 51 | assert split != split2 52 | 53 | split2 = OpenMLSplit._from_arff_file(self.arff_filepath) 54 | split2.split[10] = {} 55 | assert split != split2 56 | 57 | split2 = OpenMLSplit._from_arff_file(self.arff_filepath) 58 | split2.split[0][10] = {} 59 | assert split != split2 60 | 61 | def test_from_arff_file(self): 62 | split = OpenMLSplit._from_arff_file(self.arff_filepath) 63 | assert isinstance(split.split, dict) 64 | assert isinstance(split.split[0], dict) 65 | assert isinstance(split.split[0][0], dict) 66 | assert isinstance(split.split[0][0][0][0], np.ndarray) 67 | assert isinstance(split.split[0][0][0].train, np.ndarray) 68 | assert isinstance(split.split[0][0][0].train, np.ndarray) 69 | assert isinstance(split.split[0][0][0][1], np.ndarray) 70 | assert isinstance(split.split[0][0][0].test, np.ndarray) 71 | assert isinstance(split.split[0][0][0].test, np.ndarray) 72 | for i in range(10): 73 | for j in range(10): 74 | assert split.split[i][j][0].train.shape[0] >= 808 75 | assert split.split[i][j][0].test.shape[0] >= 89 76 | assert ( 77 | split.split[i][j][0].train.shape[0] + split.split[i][j][0].test.shape[0] == 898 78 | ) 79 | 80 | def test_get_split(self): 81 | split = OpenMLSplit._from_arff_file(self.arff_filepath) 82 | train_split, test_split = split.get(fold=5, repeat=2) 83 | assert train_split.shape[0] == 808 84 | assert test_split.shape[0] == 90 85 | self.assertRaisesRegex( 86 | ValueError, 87 | "Repeat 10 not known", 88 | split.get, 89 | 10, 90 | 2, 91 | ) 92 | self.assertRaisesRegex( 93 | ValueError, 94 | "Fold 10 not known", 95 | split.get, 96 | 2, 97 | 10, 98 | ) 99 | -------------------------------------------------------------------------------- /examples/Advanced/study_tutorial.py: -------------------------------------------------------------------------------- 1 | # %% [markdown] 2 | # How to list, download and upload benchmark studies. 
3 | # In contrast to 4 | # [benchmark suites](https://docs.openml.org/benchmark/#benchmarking-suites), which 5 | # hold a list of tasks, studies hold a list of runs. As runs contain all information on flows and 6 | # tasks, all required information about a study can be retrieved. 7 | 8 | # %% 9 | import uuid 10 | 11 | from sklearn.ensemble import RandomForestClassifier 12 | 13 | import openml 14 | 15 | # %% [markdown] 16 | # ## Listing studies 17 | # 18 | # * ``list_studies`` returns the available studies as a ``pandas.DataFrame``, 19 | # an easy-to-work-with data structure. 20 | # * Passing ``status="all"`` lists studies of any status, not only active ones. 21 | 22 | # %% 23 | studies = openml.study.list_studies(status="all") 24 | print(studies.head(n=10)) 25 | 26 | 27 | # %% [markdown] 28 | # ## Downloading studies 29 | # This is done based on the study ID. 30 | 31 | # %% 32 | study = openml.study.get_study(123) 33 | print(study) 34 | 35 | # %% [markdown] 36 | # Studies also feature a description: 37 | 38 | # %% 39 | print(study.description) 40 | 41 | # %% [markdown] 42 | # Studies are a container for runs: 43 | 44 | # %% 45 | print(study.runs) 46 | 47 | # %% [markdown] 48 | # And we can use the evaluation listing functionality to learn more about 49 | # the evaluations available for the conducted runs: 50 | 51 | # %% 52 | evaluations = openml.evaluations.list_evaluations( 53 | function="predictive_accuracy", 54 | study=study.study_id, 55 | output_format="dataframe", 56 | ) 57 | print(evaluations.head()) 58 | 59 | # %% [markdown] 60 | # We'll use the test server for the rest of this tutorial. 61 | 62 | # %% 63 | openml.config.start_using_configuration_for_example() 64 | 65 | # %% [markdown] 66 | # ## Uploading studies 67 | # 68 | # Creating a study is as simple as creating any other kind of OpenML entity. 69 | # In this example, we'll create a few runs for the OpenML-100 benchmark 70 | # suite, which is available on the OpenML test server. 71 | 72 | #
<div class="admonition warning"> 73 | #     <p class="admonition-title">Warning</p> 74 | #     <p> 75 | # For the rest of this tutorial, we will require the `openml-sklearn` package. 76 | # Install it with `pip install openml-sklearn`. 77 | #     </p> 78 | # </div>
79 | 80 | # %% 81 | # Get sklearn extension to run sklearn models easily on OpenML tasks. 82 | from openml_sklearn import SklearnExtension 83 | 84 | extension = SklearnExtension() 85 | 86 | # Model to be used 87 | clf = RandomForestClassifier() 88 | 89 | # We'll create a study with one run on 3 datasets present in the suite 90 | tasks = [115, 259, 307] 91 | 92 | # To verify 93 | # https://test.openml.org/api/v1/study/1 94 | suite = openml.study.get_suite("OpenML100") 95 | print(all(t_id in suite.tasks for t_id in tasks)) 96 | 97 | run_ids = [] 98 | for task_id in tasks: 99 | task = openml.tasks.get_task(task_id) 100 | run = openml.runs.run_model_on_task(clf, task) 101 | run.publish() 102 | run_ids.append(run.run_id) 103 | 104 | # The study needs a machine-readable and unique alias. To obtain this, 105 | # we simply generate a random uuid. 106 | alias = uuid.uuid4().hex 107 | 108 | new_study = openml.study.create_study( 109 | name="Test-Study", 110 | description="Test study for the Python tutorial on studies", 111 | run_ids=run_ids, 112 | alias=alias, 113 | benchmark_suite=suite.study_id, 114 | ) 115 | new_study.publish() 116 | print(new_study) 117 | 118 | 119 | # %% 120 | openml.config.stop_using_configuration_for_example() 121 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # OpenML 2 | 3 | **The Python API for a World of Data and More** 4 | 5 | Welcome to the documentation of the OpenML Python API, a connector to 6 | the collaborative machine learning platform 7 | [OpenML.org](https://www.openml.org). 8 | OpenML-Python can download or upload data from OpenML, such as datasets 9 | and machine learning experiment results. 10 | 11 | If you are new to OpenML, we recommend checking out the [OpenML documentation](https://docs.openml.org/) 12 | to get familiar with the concepts and features of OpenML. In particular, we recommend 13 | reading more about the [OpenML concepts](https://docs.openml.org/concepts/). 14 | 15 | ## :joystick: Minimal Examples 16 | 17 | Use the following code to get the [credit-g](https://www.openml.org/search?type=data&sort=runs&status=active&id=31) [dataset](https://docs.openml.org/concepts/data/): 18 | 19 | ```python 20 | import openml 21 | 22 | dataset = openml.datasets.get_dataset("credit-g") # or by ID get_dataset(31) 23 | X, y, categorical_indicator, attribute_names = dataset.get_data(target="class") 24 | ``` 25 | 26 | Get a [task](https://docs.openml.org/concepts/tasks/) for [supervised classification on credit-g](https://www.openml.org/search?type=task&id=31&source_data.data_id=31): 27 | 28 | ```python 29 | import openml 30 | 31 | task = openml.tasks.get_task(31) 32 | dataset = task.get_dataset() 33 | X, y, categorical_indicator, attribute_names = dataset.get_data(target=task.target_name) 34 | # get splits for the first fold of 10-fold cross-validation 35 | train_indices, test_indices = task.get_train_test_split_indices(fold=0) 36 | ``` 37 | 38 | Use an [OpenML benchmarking suite](https://docs.openml.org/concepts/benchmarking/) to get a curated list of machine-learning tasks: 39 | ```python 40 | import openml 41 | 42 | suite = openml.study.get_suite("amlb-classification-all") # Get a curated list of tasks for classification 43 | for task_id in suite.tasks: 44 | task = openml.tasks.get_task(task_id) 45 | ``` 46 | Find more examples in the navbar at the top. 
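One more minimal sketch: run a scikit-learn model on a task and publish the result. This assumes the separately installed [openml-sklearn](https://github.com/openml/openml-sklearn) extension, which provides scikit-learn support for OpenML-Python:

```python
import openml
import openml_sklearn  # assumption: importing this registers the scikit-learn extension
from sklearn.ensemble import RandomForestClassifier

task = openml.tasks.get_task(31)  # supervised classification on credit-g
run = openml.runs.run_model_on_task(RandomForestClassifier(), task)
run.publish()  # uploading requires an API key
```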
47 | 48 | ## :magic_wand: Installation 49 | 50 | OpenML-Python is available on Linux, MacOS, and Windows. 51 | 52 | You can install OpenML-Python with: 53 | 54 | ```bash 55 | pip install openml 56 | ``` 57 | 58 | For more advanced installation information, please see the 59 | ["Introduction"](../examples/Basics/introduction_tutorial) example. 60 | 61 | 62 | ## Further information 63 | 64 | - [OpenML documentation](https://docs.openml.org/) 65 | - [OpenML client APIs](https://docs.openml.org/APIs/) 66 | - [OpenML developer guide](https://docs.openml.org/contributing/) 67 | - [Contact information](https://www.openml.org/contact) 68 | - [Citation request](https://www.openml.org/cite) 69 | - [OpenML blog](https://medium.com/open-machine-learning) 70 | - [OpenML twitter account](https://twitter.com/open_ml) 71 | 72 | 73 | ## Contributing 74 | 75 | Contributing to the OpenML package is highly appreciated. Please see the 76 | ["Contributing"](contributing.md) page for more information. 77 | 78 | ## Citing OpenML-Python 79 | 80 | If you use OpenML-Python in a scientific publication, we would 81 | appreciate a reference to our JMLR-MLOSS paper 82 | ["OpenML-Python: an extensible Python API for OpenML"](https://www.jmlr.org/papers/v22/19-920.html): 83 | 84 | === "Bibtex" 85 | 86 | ```bibtex 87 | @article{JMLR:v22:19-920, 88 | author = {Matthias Feurer and Jan N. van Rijn and Arlind Kadra and Pieter Gijsbers and Neeratyoy Mallik and Sahithya Ravi and Andreas Müller and Joaquin Vanschoren and Frank Hutter}, 89 | title = {OpenML-Python: an extensible Python API for OpenML}, 90 | journal = {Journal of Machine Learning Research}, 91 | year = {2021}, 92 | volume = {22}, 93 | number = {100}, 94 | pages = {1--5}, 95 | url = {http://jmlr.org/papers/v22/19-920.html} 96 | } 97 | ``` 98 | 99 | === "MLA" 100 | 101 | Feurer, Matthias, et al. 102 | "OpenML-Python: an extensible Python API for OpenML." 103 | _Journal of Machine Learning Research_ 22.100 (2021):1−5. 104 | -------------------------------------------------------------------------------- /examples/Basics/simple_flows_and_runs_tutorial.py: -------------------------------------------------------------------------------- 1 | # %% [markdown] 2 | # A simple tutorial on how to upload results from a machine learning experiment to OpenML. 3 | 4 | # %% 5 | import sklearn 6 | from sklearn.neighbors import KNeighborsClassifier 7 | 8 | import openml 9 | 10 | # %% [markdown] 11 | #
<div class="admonition warning"> 12 | #     <p class="admonition-title">Warning</p> 13 | #     <p> 14 | # This example uploads data. For that reason, this example connects to the 15 | # test server at test.openml.org. 16 | #     </p><p> 17 | # This prevents the main server from becoming overloaded with example datasets, tasks, 18 | # runs, and other submissions. 19 | # Using this test server may affect the behavior and performance of the 20 | # OpenML-Python API. 21 | #     </p> 22 | # </div>
23 | 24 | # %% 25 | openml.config.start_using_configuration_for_example() 26 | 27 | # %% [markdown] 28 | # ## Train a machine learning model and evaluate it 29 | # NOTE: We are using task 119 from the test server: https://test.openml.org/d/20 30 | 31 | # %% 32 | task = openml.tasks.get_task(119) 33 | 34 | # Get the data 35 | dataset = task.get_dataset() 36 | X, y, categorical_indicator, attribute_names = dataset.get_data( 37 | target=dataset.default_target_attribute 38 | ) 39 | 40 | # Get the holdout split from the task 41 | train_indices, test_indices = task.get_train_test_split_indices(fold=0, repeat=0) 42 | X_train, X_test = X.iloc[train_indices], X.iloc[test_indices] 43 | y_train, y_test = y.iloc[train_indices], y.iloc[test_indices] 44 | 45 | knn_parameters = { 46 | "n_neighbors": 3, 47 | } 48 | clf = KNeighborsClassifier(**knn_parameters) 49 | clf.fit(X_train, y_train) 50 | 51 | # Get experiment results 52 | y_pred = clf.predict(X_test) 53 | y_pred_proba = clf.predict_proba(X_test) 54 | 55 | # %% [markdown] 56 | # ## Upload the machine learning experiments to OpenML 57 | # First, create a flow and fill it with metadata about the machine learning model. 58 | 59 | # %% 60 | knn_flow = openml.flows.OpenMLFlow( 61 | # Metadata 62 | model=clf, # or None, if you do not want to upload the model object. 63 | name="CustomKNeighborsClassifier", 64 | description="A custom KNeighborsClassifier flow for OpenML.", 65 | external_version=f"{sklearn.__version__}", 66 | language="English", 67 | tags=["openml_tutorial_knn"], 68 | dependencies=f"{sklearn.__version__}", 69 | # Hyperparameters 70 | parameters={k: str(v) for k, v in knn_parameters.items()}, 71 | parameters_meta_info={ 72 | "n_neighbors": {"description": "number of neighbors to use", "data_type": "int"} 73 | }, 74 | # If you have a pipeline with subcomponents, such as preprocessing, add them here. 75 | components={}, 76 | ) 77 | knn_flow.publish() 78 | print(f"knn_flow was published with the ID {knn_flow.flow_id}") 79 | 80 | # %% [markdown] 81 | # Second, we create a run to store the results associated with the flow.
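# A run ties the flow to a task and stores the predictions. Each prediction row below
# is built with ``format_prediction``, which records the repeat, fold, row index,
# predicted label, true label, and the per-class probabilities for one test instance.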
82 | 83 | # %% 84 | 85 | # Format the predictions for OpenML 86 | predictions = [] 87 | for test_index, y_true_i, y_pred_i, y_pred_proba_i in zip( 88 | test_indices, y_test, y_pred, y_pred_proba 89 | ): 90 | predictions.append( 91 | openml.runs.functions.format_prediction( 92 | task=task, 93 | repeat=0, 94 | fold=0, 95 | index=test_index, 96 | prediction=y_pred_i, 97 | truth=y_true_i, 98 | proba=dict(zip(task.class_labels, y_pred_proba_i)), 99 | ) 100 | ) 101 | 102 | # Format the parameters for OpenML 103 | oml_knn_parameters = [ 104 | {"oml:name": k, "oml:value": v, "oml:component": knn_flow.flow_id} 105 | for k, v in knn_parameters.items() 106 | ] 107 | 108 | knn_run = openml.runs.OpenMLRun( 109 | task_id=task.task_id, 110 | flow_id=knn_flow.flow_id, 111 | dataset_id=dataset.dataset_id, 112 | parameter_settings=oml_knn_parameters, 113 | data_content=predictions, 114 | tags=["openml_tutorial_knn"], 115 | description_text="Run generated by the tutorial.", 116 | ) 117 | knn_run = knn_run.publish() 118 | print(f"Run was uploaded to {knn_run.openml_url}") 119 | print(f"The flow can be found at {knn_run.flow.openml_url}") 120 | 121 | # %% 122 | openml.config.stop_using_configuration_for_example() 123 | -------------------------------------------------------------------------------- /openml/extensions/functions.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | from __future__ import annotations 3 | 4 | from typing import TYPE_CHECKING, Any 5 | 6 | # Need to implement the following by its full path because otherwise it won't be possible to 7 | # access openml.extensions.extensions 8 | import openml.extensions 9 | 10 | # Avoid import cycles: https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles 11 | if TYPE_CHECKING: 12 | from openml.flows import OpenMLFlow 13 | 14 | from . import Extension 15 | 16 | SKLEARN_HINT = ( 17 | "But it looks related to scikit-learn. " 18 | "Please install the OpenML scikit-learn extension (openml-sklearn) and try again. " 19 | "For more information, see " 20 | "https://github.com/openml/openml-sklearn?tab=readme-ov-file#installation" 21 | ) 22 | 23 | 24 | def register_extension(extension: type[Extension]) -> None: 25 | """Register an extension. 26 | 27 | Registered extensions are considered by ``get_extension_by_flow`` and 28 | ``get_extension_by_model``, which are used by ``openml.flow`` and ``openml.runs``. 29 | 30 | Parameters 31 | ---------- 32 | extension : Type[Extension] 33 | 34 | Returns 35 | ------- 36 | None 37 | """ 38 | openml.extensions.extensions.append(extension) 39 | 40 | 41 | def get_extension_by_flow( 42 | flow: OpenMLFlow, 43 | raise_if_no_extension: bool = False, # noqa: FBT001, FBT002 44 | ) -> Extension | None: 45 | """Get an extension which can handle the given flow. 46 | 47 | Iterates all registered extensions and checks whether they can handle the presented flow. 48 | Raises an exception if two extensions can handle a flow. 49 | 50 | Parameters 51 | ---------- 52 | flow : OpenMLFlow 53 | 54 | raise_if_no_extension : bool (optional, default=False) 55 | Raise an exception if no registered extension can handle the presented flow. 
56 | 
57 |     Returns
58 |     -------
59 |     Extension or None
60 |     """
61 |     candidates = []
62 |     for extension_class in openml.extensions.extensions:
63 |         if extension_class.can_handle_flow(flow):
64 |             candidates.append(extension_class())
65 |     if len(candidates) == 0:
66 |         if raise_if_no_extension:
67 |             install_instruction = ""
68 |             if flow.name.startswith("sklearn"):
69 |                 install_instruction = SKLEARN_HINT
70 |             raise ValueError(
71 |                 f"No extension registered which can handle flow: {flow.flow_id} ({flow.name}). "
72 |                 f"{install_instruction}"
73 |             )
74 | 
75 |         return None
76 | 
77 |     if len(candidates) == 1:
78 |         return candidates[0]
79 | 
80 |     raise ValueError(
81 |         f"Multiple extensions registered which can handle flow: {flow}, but only one "
82 |         f"is allowed ({candidates}).",
83 |     )
84 | 
85 | 
86 | def get_extension_by_model(
87 |     model: Any,
88 |     raise_if_no_extension: bool = False,  # noqa: FBT001, FBT002
89 | ) -> Extension | None:
90 |     """Get an extension which can handle the given model.
91 | 
92 |     Iterates all registered extensions and checks whether they can handle the presented model.
93 |     Raises an exception if two extensions can handle a model.
94 | 
95 |     Parameters
96 |     ----------
97 |     model : Any
98 | 
99 |     raise_if_no_extension : bool (optional, default=False)
100 |         Raise an exception if no registered extension can handle the presented model.
101 | 
102 |     Returns
103 |     -------
104 |     Extension or None
105 |     """
106 |     candidates = []
107 |     for extension_class in openml.extensions.extensions:
108 |         if extension_class.can_handle_model(model):
109 |             candidates.append(extension_class())
110 |     if len(candidates) == 0:
111 |         if raise_if_no_extension:
112 |             install_instruction = ""
113 |             if type(model).__module__.startswith("sklearn"):
114 |                 install_instruction = SKLEARN_HINT
115 |             raise ValueError(
116 |                 f"No extension registered which can handle model: {model}. {install_instruction}"
117 |             )
118 | 
119 |         return None
120 | 
121 |     if len(candidates) == 1:
122 |         return candidates[0]
123 | 
124 |     raise ValueError(
125 |         f"Multiple extensions registered which can handle model: {model}, but only one "
126 |         f"is allowed ({candidates}).",
127 |     )
128 | 
-------------------------------------------------------------------------------- /examples/_external_or_deprecated/2015_neurips_feurer_example.py: --------------------------------------------------------------------------------
1 | """
2 | Feurer et al. (2015)
3 | ====================
4 | 
5 | A tutorial on how to get the datasets used in the paper introducing *Auto-sklearn* by Feurer et al.
6 | 
7 | Auto-sklearn website: https://automl.github.io/auto-sklearn/
8 | 
9 | Publication
10 | ~~~~~~~~~~~
11 | 
12 | | Efficient and Robust Automated Machine Learning
13 | | Matthias Feurer, Aaron Klein, Katharina Eggensperger, Jost Springenberg, Manuel Blum and Frank Hutter
14 | | In *Advances in Neural Information Processing Systems 28*, 2015
15 | | Available at https://papers.nips.cc/paper/5872-efficient-and-robust-automated-machine-learning.pdf
16 | """  # noqa F401
17 | 
18 | # License: BSD 3-Clause
19 | 
20 | import pandas as pd
21 | 
22 | import openml
23 | 
24 | ####################################################################################################
25 | # List of dataset IDs given in the supplementary material of Feurer et al.:
26 | # https://papers.nips.cc/paper/5872-efficient-and-robust-automated-machine-learning-supplemental.zip
27 | # fmt: off
28 | dataset_ids = [
29 |     3, 6, 12, 14, 16, 18, 21, 22, 23, 24, 26, 28, 30, 31, 32, 36, 38, 44, 46,
30 |     57, 60, 179, 180, 181, 182, 184, 185, 273, 293, 300, 351, 354, 357, 389,
31 |     390, 391, 392, 393, 395, 396, 398, 399, 401, 554, 679, 715, 718, 720, 722,
32 |     723, 727, 728, 734, 735, 737, 740, 741, 743, 751, 752, 761, 772, 797, 799,
33 |     803, 806, 807, 813, 816, 819, 821, 822, 823, 833, 837, 843, 845, 846, 847,
34 |     849, 866, 871, 881, 897, 901, 903, 904, 910, 912, 913, 914, 917, 923, 930,
35 |     934, 953, 958, 959, 962, 966, 971, 976, 977, 978, 979, 980, 991, 993, 995,
36 |     1000, 1002, 1018, 1019, 1020, 1021, 1036, 1040, 1041, 1049, 1050, 1053,
37 |     1056, 1067, 1068, 1069, 1111, 1112, 1114, 1116, 1119, 1120, 1128, 1130,
38 |     1134, 1138, 1139, 1142, 1146, 1161, 1166,
39 | ]
40 | # fmt: on
41 | 
42 | ####################################################################################################
43 | # The dataset IDs could be used directly to load the dataset and split the data into a training set
44 | # and a test set. However, to be reproducible, we will first obtain the respective tasks from
45 | # OpenML, which define both the target feature and the train/test split.
46 | #
47 | # .. note::
48 | #    It is discouraged to work directly on datasets and only provide dataset IDs in a paper, as
49 | #    this does not allow reproducibility (unclear splitting). Please do not use datasets but the
50 | #    respective tasks as the basis for a paper, and publish task IDs. This example is only given to
51 | #    showcase the use of OpenML-Python for a published paper and as a warning on how not to do it.
52 | #    Please check the `OpenML documentation of tasks `_ if you
53 | #    want to learn more about them.
54 | 
55 | ####################################################################################################
56 | # This lists both active and inactive tasks (because of ``status='all'``). Unfortunately,
57 | # this is necessary as some of the datasets contain issues found after the publication and became
58 | # deactivated, which also deactivated the tasks on them. More information on active or inactive
59 | # datasets can be found in the `online docs `_.
60 | tasks = openml.tasks.list_tasks(
61 |     task_type=openml.tasks.TaskType.SUPERVISED_CLASSIFICATION,
62 |     status="all",
63 |     output_format="dataframe",
64 | )
65 | 
66 | # Query only those with holdout as the resampling strategy.
67 | tasks = tasks.query('estimation_procedure == "33% Holdout set"')
68 | 
69 | task_ids = []
70 | for did in dataset_ids:
71 |     tasks_ = list(tasks.query("did == {}".format(did)).tid)
72 |     if len(tasks_) >= 1:  # if there are multiple tasks, take the one with the lowest ID (oldest).
73 |         task_id = min(tasks_)
74 |     else:
75 |         raise ValueError(did)
76 | 
77 |     # Optional - Check that the task has the same target attribute as the
78 |     # dataset default target attribute
79 |     # (disabled for this example as it needs to run fast to be rendered online)
80 |     # task = openml.tasks.get_task(task_id)
81 |     # dataset = task.get_dataset()
82 |     # if task.target_name != dataset.default_target_attribute:
83 |     #     raise ValueError(
84 |     #         (task.target_name, dataset.default_target_attribute)
85 |     #     )
86 | 
87 |     task_ids.append(task_id)
88 | 
89 | assert len(task_ids) == 140
90 | task_ids.sort()
91 | 
92 | # These are the tasks to work with:
93 | print(task_ids)
94 | 
-------------------------------------------------------------------------------- /tests/test_tasks/test_task.py: --------------------------------------------------------------------------------
1 | # License: BSD 3-Clause
2 | from __future__ import annotations
3 | 
4 | import unittest
5 | from random import randint, shuffle
6 | 
7 | from openml.datasets import (
8 |     get_dataset,
9 |     list_datasets,
10 | )
11 | from openml.exceptions import OpenMLServerException
12 | from openml.tasks import TaskType, create_task, get_task
13 | from openml.testing import TestBase
14 | 
15 | 
16 | class OpenMLTaskTest(TestBase):
17 |     """
18 |     A helper class. The methods of the test case
19 |     are only executed in subclasses of the test case.
20 |     """
21 | 
22 |     __test__ = False
23 | 
24 |     @classmethod
25 |     def setUpClass(cls):
26 |         if cls is OpenMLTaskTest:
27 |             raise unittest.SkipTest("Skip OpenMLTaskTest tests, it's a base class")
28 |         super().setUpClass()
29 | 
30 |     def setUp(self, n_levels: int = 1):
31 |         super().setUp()
32 | 
33 |     def test_download_task(self):
34 |         return get_task(self.task_id)
35 | 
36 |     def test_upload_task(self):
37 |         # We don't know if the task in question already exists, so we try a few times. Checking
38 |         # beforehand would not be an option because a concurrent unit test could potentially
39 |         # create the same task and make this unit test fail (i.e. getting a dataset and creating
40 |         # a task for it is not atomic).
41 |         compatible_datasets = self._get_compatible_rand_dataset()
42 |         for i in range(100):
43 |             try:
44 |                 dataset_id = compatible_datasets[i % len(compatible_datasets)]
45 |                 # TODO consider implementing on the different task types.
46 | task = create_task( 47 | task_type=self.task_type, 48 | dataset_id=dataset_id, 49 | target_name=self._get_random_feature(dataset_id), 50 | estimation_procedure_id=self.estimation_procedure, 51 | ) 52 | 53 | task.publish() 54 | TestBase._mark_entity_for_removal("task", task.id) 55 | TestBase.logger.info( 56 | f"collected from {__file__.split('/')[-1]}: {task.id}", 57 | ) 58 | # success 59 | break 60 | except OpenMLServerException as e: 61 | # Error code for 'task already exists' 62 | # Should be 533 according to the docs 63 | # (# https://www.openml.org/api_docs#!/task/post_task) 64 | if e.code == 614: 65 | continue 66 | else: 67 | raise e 68 | else: 69 | raise ValueError( 70 | f"Could not create a valid task for task type ID {self.task_type}", 71 | ) 72 | 73 | def _get_compatible_rand_dataset(self) -> list: 74 | active_datasets = list_datasets(status="active") 75 | 76 | # depending on the task type, find either datasets 77 | # with only symbolic features or datasets with only 78 | # numerical features. 79 | if self.task_type == TaskType.SUPERVISED_REGRESSION: 80 | compatible_datasets = active_datasets[active_datasets["NumberOfSymbolicFeatures"] == 0] 81 | elif self.task_type == TaskType.CLUSTERING: 82 | compatible_datasets = active_datasets 83 | else: 84 | compatible_datasets = active_datasets[active_datasets["NumberOfNumericFeatures"] == 0] 85 | 86 | compatible_datasets = list(compatible_datasets["did"]) 87 | # in-place shuffling 88 | shuffle(compatible_datasets) 89 | return compatible_datasets 90 | 91 | # random_dataset_pos = randint(0, len(compatible_datasets) - 1) 92 | # 93 | # return compatible_datasets[random_dataset_pos] 94 | 95 | def _get_random_feature(self, dataset_id: int) -> str: 96 | random_dataset = get_dataset(dataset_id) 97 | # necessary loop to overcome string and date type 98 | # features. 99 | while True: 100 | random_feature_index = randint(0, len(random_dataset.features) - 1) 101 | random_feature = random_dataset.features[random_feature_index] 102 | if self.task_type == TaskType.SUPERVISED_REGRESSION: 103 | if random_feature.data_type == "numeric": 104 | break 105 | else: 106 | if random_feature.data_type == "nominal": 107 | break 108 | return random_feature.name 109 | -------------------------------------------------------------------------------- /tests/files/org/openml/test/datasets/2/description.xml: -------------------------------------------------------------------------------- 1 | 2 | 2 3 | anneal 4 | 1 5 | **Author**: 6 | **Source**: Unknown - 7 | **Please cite**: 8 | 9 | 1. Title of Database: Annealing Data 10 | 11 | 2. Source Information: donated by David Sterling and Wray Buntine. 12 | 13 | 3. Past Usage: unknown 14 | 15 | 4. Relevant Information: 16 | -- Explanation: I suspect this was left by Ross Quinlan in 1987 at the 17 | 4th Machine Learning Workshop. I'd have to check with Jeff Schlimmer 18 | to double check this. 19 | 20 | 5. Number of Instances: 798 21 | 22 | 6. Number of Attributes: 38 23 | -- 6 continuously-valued 24 | -- 3 integer-valued 25 | -- 29 nominal-valued 26 | 27 | 7. Attribute Information: 28 | 1. family: --,GB,GK,GS,TN,ZA,ZF,ZH,ZM,ZS 29 | 2. product-type: C, H, G 30 | 3. steel: -,R,A,U,K,M,S,W,V 31 | 4. carbon: continuous 32 | 5. hardness: continuous 33 | 6. temper_rolling: -,T 34 | 7. condition: -,S,A,X 35 | 8. formability: -,1,2,3,4,5 36 | 9. strength: continuous 37 | 10. non-ageing: -,N 38 | 11. surface-finish: P,M,- 39 | 12. surface-quality: -,D,E,F,G 40 | 13. enamelability: -,1,2,3,4,5 41 | 14. bc: Y,- 42 | 15. 
bf: Y,- 43 | 16. bt: Y,- 44 | 17. bw/me: B,M,- 45 | 18. bl: Y,- 46 | 19. m: Y,- 47 | 20. chrom: C,- 48 | 21. phos: P,- 49 | 22. cbond: Y,- 50 | 23. marvi: Y,- 51 | 24. exptl: Y,- 52 | 25. ferro: Y,- 53 | 26. corr: Y,- 54 | 27. blue/bright/varn/clean: B,R,V,C,- 55 | 28. lustre: Y,- 56 | 29. jurofm: Y,- 57 | 30. s: Y,- 58 | 31. p: Y,- 59 | 32. shape: COIL, SHEET 60 | 33. thick: continuous 61 | 34. width: continuous 62 | 35. len: continuous 63 | 36. oil: -,Y,N 64 | 37. bore: 0000,0500,0600,0760 65 | 38. packing: -,1,2,3 66 | classes: 1,2,3,4,5,U 67 | 68 | -- The '-' values are actually 'not_applicable' values rather than 69 | 'missing_values' (and so can be treated as legal discrete 70 | values rather than as showing the absence of a discrete value). 71 | 72 | 8. Missing Attribute Values: Signified with "?" 73 | Attribute: Number of instances missing its value: 74 | 1 0 75 | 2 0 76 | 3 70 77 | 4 0 78 | 5 0 79 | 6 675 80 | 7 271 81 | 8 283 82 | 9 0 83 | 10 703 84 | 11 790 85 | 12 217 86 | 13 785 87 | 14 797 88 | 15 680 89 | 16 736 90 | 17 609 91 | 18 662 92 | 19 798 93 | 20 775 94 | 21 791 95 | 22 730 96 | 23 798 97 | 24 796 98 | 25 772 99 | 26 798 100 | 27 793 101 | 28 753 102 | 29 798 103 | 30 798 104 | 31 798 105 | 32 0 106 | 33 0 107 | 34 0 108 | 35 0 109 | 36 740 110 | 37 0 111 | 38 789 112 | 39 0 113 | 114 | 9. Distribution of Classes 115 | Class Name: Number of Instances: 116 | 1 8 117 | 2 88 118 | 3 608 119 | 4 0 120 | 5 60 121 | U 34 122 | --- 123 | 798 124 | ARFF 125 | 2014-04-06T23:19:24 126 | Public http://www.openml.org/data/download/1666876/phpFsFYVN 127 | 128 | 1666876 class 1 hallostudy_1uciwelt public active 129 | 4eaed8b6ec9d8211024b6c089b064761 130 | 131 | -------------------------------------------------------------------------------- /examples/_external_or_deprecated/run_setup_tutorial.py: -------------------------------------------------------------------------------- 1 | # %% [markdown] 2 | # # Run Setup 3 | # One of the key features of the openml-python library is that is allows to 4 | # reinstantiate flows with hyperparameter settings that were uploaded before. 5 | # This tutorial uses the concept of setups. Although setups are not extensively 6 | # described in the OpenML documentation (because most users will not directly 7 | # use them), they form a important concept within OpenML distinguishing between 8 | # hyperparameter configurations. 9 | # A setup is the combination of a flow with all its hyperparameters set. 10 | # 11 | # A key requirement for reinstantiating a flow is to have the same scikit-learn 12 | # version as the flow that was uploaded. However, this tutorial will upload the 13 | # flow (that will later be reinstantiated) itself, so it can be ran with any 14 | # scikit-learn version that is supported by this library. In this case, the 15 | # requirement of the corresponding scikit-learn versions is automatically met. 16 | # 17 | # In this tutorial we will 18 | # 1) Create a flow and use it to solve a task; 19 | # 2) Download the flow, reinstantiate the model with same hyperparameters, 20 | # and solve the same task again; 21 | # 3) We will verify that the obtained results are exactly the same. 
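Because the tutorial below leans on the setup concept, a minimal sketch of the two setup calls it builds on may help. The `setup_id` here is hypothetical and only meaningful for a setup that actually exists on the configured server:

```python
import openml

setup_id = 42  # hypothetical ID of an existing setup on the server

# A setup is a flow plus concrete hyperparameter values; fetch its metadata:
setup = openml.setups.get_setup(setup_id)
print(setup.flow_id, setup.parameters)

# Rebuild the underlying model object with exactly those hyperparameter values:
model = openml.setups.initialize_model(setup_id)
```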
22 | 23 | # %% 24 | 25 | import numpy as np 26 | import openml 27 | from openml.extensions.sklearn import cat, cont 28 | 29 | from sklearn.pipeline import make_pipeline, Pipeline 30 | from sklearn.compose import ColumnTransformer 31 | from sklearn.impute import SimpleImputer 32 | from sklearn.preprocessing import OneHotEncoder, FunctionTransformer 33 | from sklearn.ensemble import RandomForestClassifier 34 | from sklearn.decomposition import TruncatedSVD 35 | 36 | # %% [markdown] 37 | # .. warning:: 38 | # .. include:: ../../test_server_usage_warning.txt 39 | 40 | # %% 41 | openml.config.start_using_configuration_for_example() 42 | 43 | # %% [markdown] 44 | # 1) Create a flow and use it to solve a task 45 | 46 | # First, let's download the task that we are interested in 47 | 48 | # %% 49 | task = openml.tasks.get_task(6) 50 | 51 | # %% [markdown] 52 | # we will create a fairly complex model, with many preprocessing components and 53 | # many potential hyperparameters. Of course, the model can be as complex and as 54 | # easy as you want it to be 55 | 56 | 57 | # %% 58 | cat_imp = make_pipeline( 59 | OneHotEncoder(handle_unknown="ignore"), 60 | TruncatedSVD(), 61 | ) 62 | cont_imp = SimpleImputer(strategy="median") 63 | ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)]) 64 | model_original = Pipeline( 65 | steps=[ 66 | ("transform", ct), 67 | ("estimator", RandomForestClassifier()), 68 | ] 69 | ) 70 | 71 | # %% [markdown] 72 | # Let's change some hyperparameters. Of course, in any good application we 73 | # would tune them using, e.g., Random Search or Bayesian Optimization, but for 74 | # the purpose of this tutorial we set them to some specific values that might 75 | # or might not be optimal 76 | 77 | # %% 78 | hyperparameters_original = { 79 | "estimator__criterion": "gini", 80 | "estimator__n_estimators": 50, 81 | "estimator__max_depth": 10, 82 | "estimator__min_samples_leaf": 1, 83 | } 84 | model_original.set_params(**hyperparameters_original) 85 | 86 | # solve the task and upload the result (this implicitly creates the flow) 87 | run = openml.runs.run_model_on_task(model_original, task, avoid_duplicate_runs=False) 88 | run_original = run.publish() # this implicitly uploads the flow 89 | 90 | # %% [markdown] 91 | # ## 2) Download the flow and solve the same task again. 92 | 93 | # %% 94 | # obtain setup id (note that the setup id is assigned by the OpenML server - 95 | # therefore it was not yet available in our local copy of the run) 96 | run_downloaded = openml.runs.get_run(run_original.run_id) 97 | setup_id = run_downloaded.setup_id 98 | 99 | # after this, we can easily reinstantiate the model 100 | model_duplicate = openml.setups.initialize_model(setup_id) 101 | # it will automatically have all the hyperparameters set 102 | 103 | # and run the task again 104 | run_duplicate = openml.runs.run_model_on_task(model_duplicate, task, avoid_duplicate_runs=False) 105 | 106 | 107 | # %% [markdown] 108 | # ## 3) We will verify that the obtained results are exactly the same. 109 | 110 | # %% 111 | # the run has stored all predictions in the field data content 112 | np.testing.assert_array_equal(run_original.data_content, run_duplicate.data_content) 113 | 114 | 115 | # %% 116 | openml.config.stop_using_configuration_for_example() 117 | 118 | # By: Jan N. 
van Rijn 119 | # License: BSD 3-Clause 120 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 
5 | <!-- Centered header: OpenML logo, "OpenML-Python" title, Python logo (original <div>/<img> markup not recoverable) -->
14 | 15 | ## The Python API for a World of Data and More :dizzy: 16 | 17 | [![Latest Release](https://img.shields.io/github/v/release/openml/openml-python)](https://github.com/openml/openml-python/releases) 18 | [![Python Versions](https://img.shields.io/badge/python-3.8%20%7C%203.9%20%7C%203.10%20%7C%203.11%20%7C%203.12%20%7C%203.13-blue)](https://pypi.org/project/openml/) 19 | [![Downloads](https://static.pepy.tech/badge/openml)](https://pepy.tech/project/openml) 20 | [![License](https://img.shields.io/badge/License-BSD%203--Clause-blue.svg)](https://opensource.org/licenses/BSD-3-Clause) 21 | 22 | 23 | [Installation](https://openml.github.io/openml-python/main/#how-to-get-openml-for-python) | [Documentation](https://openml.github.io/openml-python) | [Contribution guidelines](https://github.com/openml/openml-python/blob/develop/CONTRIBUTING.md) 24 |
25 | 26 | OpenML-Python provides an easy-to-use and straightforward Python interface for [OpenML](http://openml.org), an online platform for open science collaboration in machine learning. 27 | It can download or upload data from OpenML, such as datasets and machine learning experiment results. 28 | 29 | ## :joystick: Minimal Example 30 | 31 | Use the following code to get the [credit-g](https://www.openml.org/search?type=data&sort=runs&status=active&id=31) [dataset](https://docs.openml.org/concepts/data/): 32 | 33 | ```python 34 | import openml 35 | 36 | dataset = openml.datasets.get_dataset("credit-g") # or by ID get_dataset(31) 37 | X, y, categorical_indicator, attribute_names = dataset.get_data(target="class") 38 | ``` 39 | 40 | Get a [task](https://docs.openml.org/concepts/tasks/) for [supervised classification on credit-g](https://www.openml.org/search?type=task&id=31&source_data.data_id=31): 41 | 42 | ```python 43 | import openml 44 | 45 | task = openml.tasks.get_task(31) 46 | dataset = task.get_dataset() 47 | X, y, categorical_indicator, attribute_names = dataset.get_data(target=task.target_name) 48 | # get splits for the first fold of 10-fold cross-validation 49 | train_indices, test_indices = task.get_train_test_split_indices(fold=0) 50 | ``` 51 | 52 | Use an [OpenML benchmarking suite](https://docs.openml.org/concepts/benchmarking/) to get a curated list of machine-learning tasks: 53 | ```python 54 | import openml 55 | 56 | suite = openml.study.get_suite("amlb-classification-all") # Get a curated list of tasks for classification 57 | for task_id in suite.tasks: 58 | task = openml.tasks.get_task(task_id) 59 | ``` 60 | 61 | ## :magic_wand: Installation 62 | 63 | OpenML-Python is supported on Python 3.8 - 3.13 and is available on Linux, MacOS, and Windows. 64 | 65 | You can install OpenML-Python with: 66 | 67 | ```bash 68 | pip install openml 69 | ``` 70 | 71 | ## :page_facing_up: Citing OpenML-Python 72 | 73 | If you use OpenML-Python in a scientific publication, we would appreciate a reference to the following paper: 74 | 75 | [Matthias Feurer, Jan N. van Rijn, Arlind Kadra, Pieter Gijsbers, Neeratyoy Mallik, Sahithya Ravi, Andreas Müller, Joaquin Vanschoren, Frank Hutter
76 | **OpenML-Python: an extensible Python API for OpenML**
77 | Journal of Machine Learning Research, 22(100):1−5, 2021](https://www.jmlr.org/papers/v22/19-920.html) 78 | 79 | Bibtex entry: 80 | ```bibtex 81 | @article{JMLR:v22:19-920, 82 | author = {Matthias Feurer and Jan N. van Rijn and Arlind Kadra and Pieter Gijsbers and Neeratyoy Mallik and Sahithya Ravi and Andreas Müller and Joaquin Vanschoren and Frank Hutter}, 83 | title = {OpenML-Python: an extensible Python API for OpenML}, 84 | journal = {Journal of Machine Learning Research}, 85 | year = {2021}, 86 | volume = {22}, 87 | number = {100}, 88 | pages = {1--5}, 89 | url = {http://jmlr.org/papers/v22/19-920.html} 90 | } 91 | ``` 92 | ## :handshake: Contributing 93 | 94 | We welcome contributions from both new and experienced developers! 95 | 96 | If you would like to contribute to OpenML-Python, please read our 97 | [Contribution Guidelines](https://github.com/openml/openml-python/blob/develop/CONTRIBUTING.md). 98 | 99 | If you are new to open-source development, a great way to get started is by 100 | looking at issues labeled **"good first issue"** in our GitHub issue tracker. 101 | These tasks are beginner-friendly and help you understand the project structure, 102 | development workflow, and how to submit a pull request. 103 | -------------------------------------------------------------------------------- /openml/evaluations/evaluation.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | from __future__ import annotations 3 | 4 | import openml.config 5 | import openml.datasets 6 | import openml.flows 7 | import openml.runs 8 | import openml.tasks 9 | 10 | 11 | # TODO(eddiebergman): A lot of this class is automatically 12 | # handled by a dataclass 13 | class OpenMLEvaluation: 14 | """ 15 | Contains all meta-information about a run / evaluation combination, 16 | according to the evaluation/list function 17 | 18 | Parameters 19 | ---------- 20 | run_id : int 21 | Refers to the run. 22 | task_id : int 23 | Refers to the task. 24 | setup_id : int 25 | Refers to the setup. 26 | flow_id : int 27 | Refers to the flow. 28 | flow_name : str 29 | Name of the referred flow. 30 | data_id : int 31 | Refers to the dataset. 32 | data_name : str 33 | The name of the dataset. 34 | function : str 35 | The evaluation metric of this item (e.g., accuracy). 36 | upload_time : str 37 | The time of evaluation. 38 | uploader: int 39 | Uploader ID (user ID) 40 | upload_name : str 41 | Name of the uploader of this evaluation 42 | value : float 43 | The value (score) of this evaluation. 44 | values : List[float] 45 | The values (scores) per repeat and fold (if requested) 46 | array_data : str 47 | list of information per class. 
48 |         (e.g., in case of precision, auroc, recall)
49 |     """
50 | 
51 |     def __init__(  # noqa: PLR0913
52 |         self,
53 |         run_id: int,
54 |         task_id: int,
55 |         setup_id: int,
56 |         flow_id: int,
57 |         flow_name: str,
58 |         data_id: int,
59 |         data_name: str,
60 |         function: str,
61 |         upload_time: str,
62 |         uploader: int,
63 |         uploader_name: str,
64 |         value: float | None,
65 |         values: list[float] | None,
66 |         array_data: str | None = None,
67 |     ):
68 |         self.run_id = run_id
69 |         self.task_id = task_id
70 |         self.setup_id = setup_id
71 |         self.flow_id = flow_id
72 |         self.flow_name = flow_name
73 |         self.data_id = data_id
74 |         self.data_name = data_name
75 |         self.function = function
76 |         self.upload_time = upload_time
77 |         self.uploader = uploader
78 |         self.uploader_name = uploader_name
79 |         self.value = value
80 |         self.values = values
81 |         self.array_data = array_data
82 | 
83 |     def _to_dict(self) -> dict:
84 |         return {
85 |             "run_id": self.run_id,
86 |             "task_id": self.task_id,
87 |             "setup_id": self.setup_id,
88 |             "flow_id": self.flow_id,
89 |             "flow_name": self.flow_name,
90 |             "data_id": self.data_id,
91 |             "data_name": self.data_name,
92 |             "function": self.function,
93 |             "upload_time": self.upload_time,
94 |             "uploader": self.uploader,
95 |             "uploader_name": self.uploader_name,
96 |             "value": self.value,
97 |             "values": self.values,
98 |             "array_data": self.array_data,
99 |         }
100 | 
101 |     def __repr__(self) -> str:
102 |         header = "OpenML Evaluation"
103 |         header = f"{header}\n{'=' * len(header)}\n"
104 | 
105 |         fields = {
106 |             "Upload Date": self.upload_time,
107 |             "Run ID": self.run_id,
108 |             "OpenML Run URL": openml.runs.OpenMLRun.url_for_id(self.run_id),
109 |             "Task ID": self.task_id,
110 |             "OpenML Task URL": openml.tasks.OpenMLTask.url_for_id(self.task_id),
111 |             "Flow ID": self.flow_id,
112 |             "OpenML Flow URL": openml.flows.OpenMLFlow.url_for_id(self.flow_id),
113 |             "Setup ID": self.setup_id,
114 |             "Data ID": self.data_id,
115 |             "Data Name": self.data_name,
116 |             "OpenML Data URL": openml.datasets.OpenMLDataset.url_for_id(self.data_id),
117 |             "Metric Used": self.function,
118 |             "Result": self.value,
119 |         }
120 | 
121 |         order = [
122 |             "Upload Date",
123 |             "Run ID",
124 |             "OpenML Run URL",
125 |             "Task ID",
126 |             "OpenML Task URL", "Flow ID",
127 |             "OpenML Flow URL",
128 |             "Setup ID",
129 |             "Data ID",
130 |             "Data Name",
131 |             "OpenML Data URL",
132 |             "Metric Used",
133 |             "Result",
134 |         ]
135 |         _fields = [(key, fields[key]) for key in order if key in fields]
136 | 
137 |         longest_field_name_length = max(len(name) for name, _ in _fields)
138 |         field_line_format = f"{{:.<{longest_field_name_length}}}: {{}}"
139 |         body = "\n".join(field_line_format.format(name, value) for name, value in _fields)
140 |         return header + body
141 | 
-------------------------------------------------------------------------------- /tests/test_openml/test_api_calls.py: --------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | 
3 | import unittest.mock
4 | from pathlib import Path
5 | from typing import NamedTuple, Iterable, Iterator
6 | from unittest import mock
7 | 
8 | import minio
9 | import pytest
10 | 
11 | import openml
12 | from openml.config import ConfigurationForExamples
13 | import openml.testing
14 | from openml._api_calls import _download_minio_bucket, API_TOKEN_HELP_LINK
15 | 
16 | 
17 | class TestConfig(openml.testing.TestBase):
18 |     def test_too_long_uri(self):
19 |         with pytest.raises(openml.exceptions.OpenMLServerError, match="URI too long!"):
20 | 
openml.datasets.list_datasets(data_id=list(range(10000))) 21 | 22 | @unittest.mock.patch("time.sleep") 23 | @unittest.mock.patch("requests.Session") 24 | def test_retry_on_database_error(self, Session_class_mock, _): 25 | response_mock = unittest.mock.Mock() 26 | response_mock.text = ( 27 | "\n" 28 | "107" 29 | "Database connection error. " 30 | "Usually due to high server load. " 31 | "Please wait for N seconds and try again.\n" 32 | "" 33 | ) 34 | Session_class_mock.return_value.__enter__.return_value.get.return_value = response_mock 35 | with pytest.raises(openml.exceptions.OpenMLServerException, match="/abc returned code 107"): 36 | openml._api_calls._send_request("get", "/abc", {}) 37 | 38 | assert Session_class_mock.return_value.__enter__.return_value.get.call_count == 20 39 | 40 | 41 | class FakeObject(NamedTuple): 42 | object_name: str 43 | etag: str 44 | """We use the etag of a Minio object as the name of a marker if we already downloaded it.""" 45 | 46 | 47 | class FakeMinio: 48 | def __init__(self, objects: Iterable[FakeObject] | None = None): 49 | self._objects = objects or [] 50 | 51 | def list_objects(self, *args, **kwargs) -> Iterator[FakeObject]: 52 | yield from self._objects 53 | 54 | def fget_object(self, object_name: str, file_path: str, *args, **kwargs) -> None: 55 | if object_name in [obj.object_name for obj in self._objects]: 56 | Path(file_path).write_text("foo") 57 | return 58 | raise FileNotFoundError 59 | 60 | 61 | @mock.patch.object(minio, "Minio") 62 | def test_download_all_files_observes_cache(mock_minio, tmp_path: Path) -> None: 63 | some_prefix, some_filename = "some/prefix", "dataset.arff" 64 | some_object_path = f"{some_prefix}/{some_filename}" 65 | some_url = f"https://not.real.com/bucket/{some_object_path}" 66 | mock_minio.return_value = FakeMinio( 67 | objects=[ 68 | FakeObject(object_name=some_object_path, etag=str(hash(some_object_path))), 69 | ], 70 | ) 71 | 72 | _download_minio_bucket(source=some_url, destination=tmp_path) 73 | time_created = (tmp_path / "dataset.arff").stat().st_ctime 74 | 75 | _download_minio_bucket(source=some_url, destination=tmp_path) 76 | time_modified = (tmp_path / some_filename).stat().st_mtime 77 | 78 | assert time_created == time_modified 79 | 80 | 81 | @mock.patch.object(minio, "Minio") 82 | def test_download_minio_failure(mock_minio, tmp_path: Path) -> None: 83 | some_prefix, some_filename = "some/prefix", "dataset.arff" 84 | some_object_path = f"{some_prefix}/{some_filename}" 85 | some_url = f"https://not.real.com/bucket/{some_object_path}" 86 | mock_minio.return_value = FakeMinio( 87 | objects=[ 88 | FakeObject(object_name=None, etag="tmp"), 89 | ], 90 | ) 91 | 92 | with pytest.raises(ValueError): 93 | _download_minio_bucket(source=some_url, destination=tmp_path) 94 | 95 | mock_minio.return_value = FakeMinio( 96 | objects=[ 97 | FakeObject(object_name="tmp", etag=None), 98 | ], 99 | ) 100 | 101 | with pytest.raises(ValueError): 102 | _download_minio_bucket(source=some_url, destination=tmp_path) 103 | 104 | 105 | @pytest.mark.parametrize( 106 | "endpoint, method", 107 | [ 108 | # https://github.com/openml/OpenML/blob/develop/openml_OS/views/pages/api_new/v1/xml/pre.php 109 | ("flow/exists", "post"), # 102 110 | ("dataset", "post"), # 137 111 | ("dataset/42", "delete"), # 350 112 | # ("flow/owned", "post"), # 310 - Couldn't find what would trigger this 113 | ("flow/42", "delete"), # 320 114 | ("run/42", "delete"), # 400 115 | ("task/42", "delete"), # 460 116 | ], 117 | ) 118 | def 
test_authentication_endpoints_requiring_api_key_show_relevant_help_link( 119 | endpoint: str, 120 | method: str, 121 | ) -> None: 122 | # We need to temporarily disable the API key to test the error message 123 | with openml.config.overwrite_config_context({"apikey": None}): 124 | with pytest.raises(openml.exceptions.OpenMLNotAuthorizedError, match=API_TOKEN_HELP_LINK): 125 | openml._api_calls._perform_api_call(call=endpoint, request_method=method, data=None) 126 | -------------------------------------------------------------------------------- /examples/_external_or_deprecated/2018_ida_strang_example.py: -------------------------------------------------------------------------------- 1 | """ 2 | Strang et al. (2018) 3 | ==================== 4 | 5 | A tutorial on how to reproduce the analysis conducted for *Don't Rule Out Simple Models 6 | Prematurely: A Large Scale Benchmark Comparing Linear and Non-linear Classifiers in OpenML*. 7 | 8 | Publication 9 | ~~~~~~~~~~~ 10 | 11 | | Don't Rule Out Simple Models Prematurely: A Large Scale Benchmark Comparing Linear and Non-linear Classifiers in OpenML 12 | | Benjamin Strang, Peter van der Putten, Jan N. van Rijn and Frank Hutter 13 | | In *Advances in Intelligent Data Analysis XVII 17th International Symposium*, 2018 14 | | Available at https://link.springer.com/chapter/10.1007%2F978-3-030-01768-2_25 15 | """ 16 | 17 | # License: BSD 3-Clause 18 | 19 | import matplotlib.pyplot as plt 20 | import openml 21 | import pandas as pd 22 | 23 | ############################################################################## 24 | # A basic step for each data-mining or machine learning task is to determine 25 | # which model to choose based on the problem and the data at hand. In this 26 | # work we investigate when non-linear classifiers outperform linear 27 | # classifiers by means of a large scale experiment. 28 | # 29 | # The paper is accompanied with a study object, containing all relevant tasks 30 | # and runs (``study_id=123``). The paper features three experiment classes: 31 | # Support Vector Machines (SVM), Neural Networks (NN) and Decision Trees (DT). 32 | # This example demonstrates how to reproduce the plots, comparing two 33 | # classifiers given the OpenML flow ids. Note that this allows us to reproduce 34 | # the SVM and NN experiment, but not the DT experiment, as this requires a bit 35 | # more effort to distinguish the same flow with different hyperparameter 36 | # values. 
37 | 38 | study_id = 123 39 | # for comparing svms: flow_ids = [7754, 7756] 40 | # for comparing nns: flow_ids = [7722, 7729] 41 | # for comparing dts: flow_ids = [7725], differentiate on hyper-parameter value 42 | classifier_family = "SVM" 43 | flow_ids = [7754, 7756] 44 | measure = "predictive_accuracy" 45 | meta_features = ["NumberOfInstances", "NumberOfFeatures"] 46 | class_values = ["non-linear better", "linear better", "equal"] 47 | 48 | # Downloads all evaluation records related to this study 49 | evaluations = openml.evaluations.list_evaluations( 50 | measure, size=None, flows=flow_ids, study=study_id, output_format="dataframe" 51 | ) 52 | # gives us a table with columns data_id, flow1_value, flow2_value 53 | evaluations = evaluations.pivot(index="data_id", columns="flow_id", values="value").dropna() 54 | # downloads all data qualities (for scatter plot) 55 | data_qualities = openml.datasets.list_datasets( 56 | data_id=list(evaluations.index.values), output_format="dataframe" 57 | ) 58 | # removes irrelevant data qualities 59 | data_qualities = data_qualities[meta_features] 60 | # makes a join between evaluation table and data qualities table, 61 | # now we have columns data_id, flow1_value, flow2_value, meta_feature_1, 62 | # meta_feature_2 63 | evaluations = evaluations.join(data_qualities, how="inner") 64 | 65 | # adds column that indicates the difference between the two classifiers 66 | evaluations["diff"] = evaluations[flow_ids[0]] - evaluations[flow_ids[1]] 67 | 68 | 69 | ############################################################################## 70 | # makes the s-plot 71 | 72 | fig_splot, ax_splot = plt.subplots() 73 | ax_splot.plot(range(len(evaluations)), sorted(evaluations["diff"])) 74 | ax_splot.set_title(classifier_family) 75 | ax_splot.set_xlabel("Dataset (sorted)") 76 | ax_splot.set_ylabel("difference between linear and non-linear classifier") 77 | ax_splot.grid(linestyle="--", axis="y") 78 | plt.show() 79 | 80 | 81 | ############################################################################## 82 | # adds column that indicates the difference between the two classifiers, 83 | # needed for the scatter plot 84 | 85 | 86 | def determine_class(val_lin, val_nonlin): 87 | if val_lin < val_nonlin: 88 | return class_values[0] 89 | elif val_nonlin < val_lin: 90 | return class_values[1] 91 | else: 92 | return class_values[2] 93 | 94 | 95 | evaluations["class"] = evaluations.apply( 96 | lambda row: determine_class(row[flow_ids[0]], row[flow_ids[1]]), axis=1 97 | ) 98 | 99 | # does the plotting and formatting 100 | fig_scatter, ax_scatter = plt.subplots() 101 | for class_val in class_values: 102 | df_class = evaluations[evaluations["class"] == class_val] 103 | plt.scatter(df_class[meta_features[0]], df_class[meta_features[1]], label=class_val) 104 | ax_scatter.set_title(classifier_family) 105 | ax_scatter.set_xlabel(meta_features[0]) 106 | ax_scatter.set_ylabel(meta_features[1]) 107 | ax_scatter.legend() 108 | ax_scatter.set_xscale("log") 109 | ax_scatter.set_yscale("log") 110 | plt.show() 111 | 112 | ############################################################################## 113 | # makes a scatter plot where each data point represents the performance of the 114 | # two algorithms on various axis (not in the paper) 115 | 116 | fig_diagplot, ax_diagplot = plt.subplots() 117 | ax_diagplot.grid(linestyle="--") 118 | ax_diagplot.plot([0, 1], ls="-", color="black") 119 | ax_diagplot.plot([0.2, 1.2], ls="--", color="black") 120 | ax_diagplot.plot([-0.2, 0.8], ls="--", 
color="black") 121 | ax_diagplot.scatter(evaluations[flow_ids[0]], evaluations[flow_ids[1]]) 122 | ax_diagplot.set_xlabel(measure) 123 | ax_diagplot.set_ylabel(measure) 124 | plt.show() 125 | -------------------------------------------------------------------------------- /examples/_external_or_deprecated/benchmark_with_optunahub.py: -------------------------------------------------------------------------------- 1 | """ 2 | ==================================================== 3 | Hyperparameter Optimization Benchmark with OptunaHub 4 | ==================================================== 5 | 6 | In this tutorial, we walk through how to conduct hyperparameter optimization experiments using OpenML and OptunaHub. 7 | """ 8 | ############################################################################ 9 | # Please make sure to install the dependencies with: 10 | # ``pip install "openml>=0.15.1" plotly`` 11 | # Then we import all the necessary modules. 12 | 13 | # License: BSD 3-Clause 14 | 15 | import logging 16 | 17 | import optuna 18 | from sklearn.compose import ColumnTransformer 19 | from sklearn.ensemble import RandomForestClassifier 20 | from sklearn.impute import SimpleImputer 21 | from sklearn.pipeline import Pipeline 22 | from sklearn.preprocessing import OneHotEncoder 23 | 24 | import openml 25 | 26 | logger = logging.Logger(name="Experiment Logger", level=1) 27 | 28 | #
29 | # <div class="admonition warning"><p class="admonition-title">Warning</p>
30 | # <p>
31 | # For the rest of this tutorial, we will require the `openml-sklearn` package.
32 | # Install it with `pip install openml-sklearn`.
33 | # </p>
34 | # </div>
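The `openml-sklearn` package plugs into the extension registry shown in `openml/extensions/functions.py` earlier in this document. As a minimal sketch, and assuming (as is conventional for OpenML extensions) that importing the package registers its extension class, model objects then resolve to the extension automatically:

```python
import openml.extensions
from openml_sklearn import SklearnExtension  # assumed to register itself on import

from sklearn.ensemble import RandomForestClassifier

# Defensive sketch: register manually only if the import did not already do so.
if SklearnExtension not in openml.extensions.extensions:
    openml.extensions.register_extension(SklearnExtension)

ext = openml.extensions.get_extension_by_model(
    RandomForestClassifier(), raise_if_no_extension=True
)
print(type(ext).__name__)  # SklearnExtension
```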
35 | 
36 | # %%
37 | # Get the sklearn extension to run sklearn models easily on OpenML tasks.
38 | from openml_sklearn import SklearnExtension, cat, cont
39 | 
40 | extension = SklearnExtension()
41 | 
42 | # Set your OpenML API key if you want to upload your results to OpenML (e.g.:
43 | # https://openml.org/search?type=run&sort=date). To get one, simply make an
44 | # account (you don't need one for anything else, just to upload your results),
45 | # go to your profile and select the API-KEY.
46 | # Or log in, and navigate to https://www.openml.org/auth/api-key
47 | openml.config.apikey = ""
48 | ############################################################################
49 | # Prepare the preprocessors and an OpenML task
50 | # ============================================
51 | 
52 | # OpenML contains several key concepts which it needs to make machine learning research shareable.
53 | # A machine learning experiment consists of one or several runs, which describe the performance of
54 | # an algorithm (called a flow in OpenML) with its hyperparameter settings (called a setup) on a task.
55 | # A task is the combination of a dataset, a split, and an evaluation metric. We choose a dataset from
56 | # OpenML (https://www.openml.org/d/1464) and a subsequent task (https://www.openml.org/t/10101). To
57 | # make your own dataset and task, please refer to
58 | # https://openml.github.io/openml-python/main/examples/30_extended/create_upload_tutorial.html
59 | 
60 | # https://www.openml.org/search?type=study&study_type=task&id=218
61 | task_id = 10101
62 | seed = 42
63 | categorical_preproc = (
64 |     "categorical",
65 |     OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
66 |     cat,
67 | )
68 | numerical_preproc = ("numerical", SimpleImputer(strategy="median"), cont)
69 | preproc = ColumnTransformer([categorical_preproc, numerical_preproc])
70 | 
71 | ############################################################################
72 | # Define a pipeline for the hyperparameter optimization (this is standard for Optuna)
73 | # =====================================================
74 | 
75 | # Optuna explanation
76 | # We follow the `Optuna `__ search space design.
77 | 
78 | # OpenML runs
79 | # We can simply pass the parametrized classifier to `run_model_on_task` to obtain the performance
80 | # of the pipeline
81 | # on the specified OpenML task.
82 | # If you want to share your results along with an easily reproducible pipeline, you can set an API
83 | # key and just upload your results.
84 | # You can find more examples on https://www.openml.org/ 85 | 86 | 87 | def objective(trial: optuna.Trial) -> Pipeline: 88 | clf = RandomForestClassifier( 89 | max_depth=trial.suggest_int("max_depth", 2, 32, log=True), 90 | min_samples_leaf=trial.suggest_float("min_samples_leaf", 0.0, 1.0), 91 | random_state=seed, 92 | ) 93 | pipe = Pipeline(steps=[("preproc", preproc), ("model", clf)]) 94 | logger.log(1, f"Running pipeline - {pipe}") 95 | run = openml.runs.run_model_on_task(pipe, task=task_id, avoid_duplicate_runs=False) 96 | 97 | logger.log(1, f"Model has been trained - {run}") 98 | if openml.config.apikey != "": 99 | try: 100 | run.publish() 101 | 102 | logger.log(1, f"Run was uploaded to - {run.openml_url}") 103 | except Exception as e: 104 | logger.log(1, f"Could not publish run - {e}") 105 | else: 106 | logger.log( 107 | 0, 108 | "If you want to publish your results to OpenML, please set an apikey", 109 | ) 110 | accuracy = max(run.fold_evaluations["predictive_accuracy"][0].values()) 111 | logger.log(0, f"Accuracy {accuracy}") 112 | 113 | return accuracy 114 | 115 | 116 | ############################################################################ 117 | # Optimize the pipeline 118 | # ===================== 119 | study = optuna.create_study(direction="maximize") 120 | logger.log(0, f"Study {study}") 121 | study.optimize(objective, n_trials=15) 122 | 123 | ############################################################################ 124 | # Visualize the optimization history 125 | # ================================== 126 | fig = optuna.visualization.plot_optimization_history(study) 127 | fig.show() 128 | -------------------------------------------------------------------------------- /examples/Advanced/datasets_tutorial.py: -------------------------------------------------------------------------------- 1 | # %% [markdown] 2 | # How to list and download datasets. 3 | 4 | # %% 5 | import pandas as pd 6 | 7 | import openml 8 | from openml.datasets import edit_dataset, fork_dataset, get_dataset 9 | 10 | # %% [markdown] 11 | # ## Exercise 0 12 | # 13 | # * List datasets and return a dataframe 14 | 15 | # %% 16 | datalist = openml.datasets.list_datasets() 17 | datalist = datalist[["did", "name", "NumberOfInstances", "NumberOfFeatures", "NumberOfClasses"]] 18 | 19 | print(f"First 10 of {len(datalist)} datasets...") 20 | datalist.head(n=10) 21 | 22 | # The same can be done with lesser lines of code 23 | openml_df = openml.datasets.list_datasets() 24 | openml_df.head(n=10) 25 | 26 | # %% [markdown] 27 | # ## Exercise 1 28 | # 29 | # * Find datasets with more than 10000 examples. 30 | # * Find a dataset called 'eeg_eye_state'. 31 | # * Find all datasets with more than 50 classes. 32 | 33 | # %% 34 | datalist[datalist.NumberOfInstances > 10000].sort_values(["NumberOfInstances"]).head(n=20) 35 | 36 | # %% 37 | datalist.query('name == "eeg-eye-state"') 38 | 39 | # %% 40 | datalist.query("NumberOfClasses > 50") 41 | 42 | # %% [markdown] 43 | # ## Download datasets 44 | 45 | # %% 46 | # This is done based on the dataset ID. 47 | dataset = openml.datasets.get_dataset(dataset_id="eeg-eye-state", version=1) 48 | 49 | # Print a summary 50 | print( 51 | f"This is dataset '{dataset.name}', the target feature is '{dataset.default_target_attribute}'" 52 | ) 53 | print(f"URL: {dataset.url}") 54 | print(dataset.description[:500]) 55 | 56 | # %% [markdown] 57 | # Get the actual data. 
58 | # 59 | # openml-python returns data as pandas dataframes (stored in the `eeg` variable below), 60 | # and also some additional metadata that we don't care about right now. 61 | 62 | # %% 63 | eeg, *_ = dataset.get_data() 64 | 65 | # %% [markdown] 66 | # You can optionally choose to have openml separate out a column from the 67 | # dataset. In particular, many datasets for supervised problems have a set 68 | # `default_target_attribute` which may help identify the target variable. 69 | 70 | # %% 71 | X, y, categorical_indicator, attribute_names = dataset.get_data( 72 | target=dataset.default_target_attribute 73 | ) 74 | print(X.head()) 75 | print(X.info()) 76 | 77 | # %% [markdown] 78 | # Sometimes you only need access to a dataset's metadata. 79 | # In those cases, you can download the dataset without downloading the 80 | # data file. The dataset object can be used as normal. 81 | # Whenever you use any functionality that requires the data, 82 | # such as `get_data`, the data will be downloaded. 83 | # Starting from 0.15, not downloading data will be the default behavior instead. 84 | # The data will be downloading automatically when you try to access it through 85 | # openml objects, e.g., using `dataset.features`. 86 | 87 | # %% 88 | dataset = openml.datasets.get_dataset(1471) 89 | 90 | # %% [markdown] 91 | # ## Exercise 2 92 | # * Explore the data visually. 93 | 94 | # %% 95 | eegs = eeg.sample(n=1000) 96 | _ = pd.plotting.scatter_matrix( 97 | X.iloc[:100, :4], 98 | c=y[:100], 99 | figsize=(10, 10), 100 | marker="o", 101 | hist_kwds={"bins": 20}, 102 | alpha=0.8, 103 | cmap="plasma", 104 | ) 105 | 106 | 107 | # %% [markdown] 108 | # ## Edit a created dataset 109 | # This example uses the test server, to avoid editing a dataset on the main server. 110 | 111 | # %% 112 | openml.config.start_using_configuration_for_example() 113 | # %% [markdown] 114 | # Edit non-critical fields, allowed for all authorized users: 115 | # description, creator, contributor, collection_date, language, citation, 116 | # original_data_url, paper_url 117 | 118 | # %% 119 | desc = ( 120 | "This data sets consists of 3 different types of irises' " 121 | "(Setosa, Versicolour, and Virginica) petal and sepal length," 122 | " stored in a 150x4 numpy.ndarray" 123 | ) 124 | did = 128 125 | data_id = edit_dataset( 126 | did, 127 | description=desc, 128 | creator="R.A.Fisher", 129 | collection_date="1937", 130 | citation="The use of multiple measurements in taxonomic problems", 131 | language="English", 132 | ) 133 | edited_dataset = get_dataset(data_id) 134 | print(f"Edited dataset ID: {data_id}") 135 | 136 | 137 | # %% [markdown] 138 | # Editing critical fields (default_target_attribute, row_id_attribute, ignore_attribute) is allowed 139 | # only for the dataset owner. Further, critical fields cannot be edited if the dataset has any 140 | # tasks associated with it. To edit critical fields of a dataset (without tasks) owned by you, 141 | # configure the API key: 142 | # openml.config.apikey = 'FILL_IN_OPENML_API_KEY' 143 | # This example here only shows a failure when trying to work on a dataset not owned by you: 144 | 145 | # %% 146 | try: 147 | data_id = edit_dataset(1, default_target_attribute="shape") 148 | except openml.exceptions.OpenMLServerException as e: 149 | print(e) 150 | 151 | # %% [markdown] 152 | # ## Fork dataset 153 | # Used to create a copy of the dataset with you as the owner. 
154 | # Use this API only if you are unable to edit the critical fields (default_target_attribute, 155 | # ignore_attribute, row_id_attribute) of a dataset through the edit_dataset API. 156 | # After the dataset is forked, you can edit the new version of the dataset using edit_dataset. 157 | 158 | # %% 159 | data_id = fork_dataset(1) 160 | print(data_id) 161 | data_id = edit_dataset(data_id, default_target_attribute="shape") 162 | print(f"Forked dataset ID: {data_id}") 163 | 164 | # %% 165 | openml.config.stop_using_configuration_for_example() 166 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: openml-python 2 | repo_url: https://github.com/openml/openml-python 3 | repo_name: openml/openml-python 4 | theme: 5 | logo: images/openml_icon.png 6 | favicon: images/openml_icon.png 7 | name: material 8 | features: 9 | - content.code.annotate 10 | - content.code.copy 11 | - navigation.footer 12 | - navigation.sections 13 | - toc.follow 14 | - toc.integrate 15 | - navigation.tabs 16 | - navigation.tabs.sticky 17 | - header.autohide 18 | - header.social 19 | - search.suggest 20 | - search.highlight 21 | - search.share 22 | palette: 23 | - scheme: slate 24 | media: "(prefers-color-scheme: dark)" 25 | primary: indigo 26 | accent: deep purple 27 | toggle: 28 | icon: material/eye-outline 29 | name: Switch to light mode 30 | 31 | # Palette toggle for light mode 32 | - scheme: default 33 | media: "(prefers-color-scheme: light)" 34 | primary: indigo 35 | accent: deep purple 36 | toggle: 37 | icon: material/eye 38 | name: Switch to dark mode 39 | 40 | extra_css: 41 | - stylesheets/extra.css 42 | 43 | nav: 44 | - index.md 45 | - Examples: 46 | - Overview: examples/introduction.py 47 | - Basics: 48 | - Setup: examples/Basics/introduction_tutorial.py 49 | - Datasets: examples/Basics/simple_datasets_tutorial.py 50 | - Tasks: examples/Basics/simple_tasks_tutorial.py 51 | - Flows and Runs: examples/Basics/simple_flows_and_runs_tutorial.py 52 | - Suites: examples/Basics/simple_suites_tutorial.py 53 | - Advanced: 54 | - Dataset Splits from Tasks: examples/Advanced/task_manual_iteration_tutorial.py 55 | - Creating and Uploading Datasets: examples/Advanced/create_upload_tutorial.py 56 | - Searching and Editing Datasets: examples/Advanced/datasets_tutorial.py 57 | - Searching and Creating Tasks: examples/Advanced/tasks_tutorial.py 58 | - List, Download, and Upload Suites: examples/Advanced/suites_tutorial.py 59 | - List, Download, and Upload Studies: examples/Advanced/study_tutorial.py 60 | - Downloading Evaluation Results: examples/Advanced/fetch_evaluations_tutorial.py 61 | - Configuring Logging: examples/Advanced/configure_logging.py 62 | 63 | 64 | - Extensions: extensions.md 65 | - Advanced User Guide: details.md 66 | - API: reference/ 67 | - Contributing: contributing.md 68 | 69 | markdown_extensions: 70 | - pymdownx.highlight: 71 | anchor_linenums: true 72 | - pymdownx.superfences 73 | - attr_list 74 | - admonition 75 | - tables 76 | - attr_list 77 | - md_in_html 78 | - toc: 79 | permalink: "#" 80 | - pymdownx.highlight: 81 | anchor_linenums: true 82 | - pymdownx.magiclink: 83 | hide_protocol: true 84 | repo_url_shortener: true 85 | repo_url_shorthand: true 86 | user: openml 87 | repo: openml-python 88 | - pymdownx.highlight 89 | - pymdownx.inlinehilite 90 | - pymdownx.snippets 91 | - pymdownx.details 92 | - pymdownx.tabbed: 93 | alternate_style: true 94 | - 
pymdownx.superfences: 95 | custom_fences: 96 | - name: mermaid 97 | class: mermaid 98 | format: !!python/name:pymdownx.superfences.fence_code_format 99 | - pymdownx.emoji: 100 | emoji_index: !!python/name:material.extensions.emoji.twemoji 101 | emoji_generator: !!python/name:material.extensions.emoji.to_svg 102 | - pymdownx.tabbed: 103 | alternate_style: true 104 | 105 | extra: 106 | version: 107 | provider: mike 108 | social: 109 | - icon: fontawesome/brands/github 110 | link: https://github.com/openml 111 | - icon: fontawesome/brands/twitter 112 | link: https://x.com/open_ml 113 | 114 | plugins: 115 | - search 116 | - autorefs 117 | - section-index 118 | # - mkdocstrings: 119 | - mkdocstrings: 120 | default_handler: python 121 | enable_inventory: true 122 | handlers: 123 | python: 124 | # paths: [openml] 125 | options: # https://mkdocstrings.github.io/python/usage/ 126 | docstring_section_style: spacy 127 | docstring_options: 128 | ignore_init_summary: true 129 | trim_doctest_flags: true 130 | returns_multiple_items: false 131 | show_docstring_attributes: true 132 | show_docstring_description: true 133 | show_root_heading: true 134 | show_root_toc_entry: true 135 | show_object_full_path: false 136 | show_root_members_full_path: false 137 | signature_crossrefs: true 138 | merge_init_into_class: true 139 | show_symbol_type_heading: true 140 | show_symbol_type_toc: true 141 | docstring_style: google 142 | inherited_members: true 143 | show_if_no_docstring: false 144 | show_bases: true 145 | show_source: true 146 | members_order: "alphabetical" 147 | group_by_category: true 148 | show_signature: true 149 | separate_signature: true 150 | show_signature_annotations: true 151 | filters: 152 | - "!^_[^_]" 153 | 154 | - gen-files: 155 | scripts: 156 | - scripts/gen_ref_pages.py 157 | - literate-nav: 158 | nav_file: SUMMARY.md 159 | - mkdocs-jupyter: 160 | theme: light 161 | - mike: 162 | version_selector: true 163 | css_dir: css 164 | javascript_dir: js 165 | canonical_version: latest 166 | -------------------------------------------------------------------------------- /docker/readme.md: -------------------------------------------------------------------------------- 1 | # OpenML Python Container 2 | 3 | This docker container has the latest version of openml-python downloaded and pre-installed. 4 | It can also be used by developers to run unit tests or build the docs in 5 | a fresh and/or isolated unix environment. 6 | This document contains information about: 7 | 8 | 1. [Usage](#usage): how to use the image and its main modes. 9 | 2. [Using local or remote code](#using-local-or-remote-code): useful when testing your own latest changes. 10 | 3. [Versions](#versions): identify which image to use. 11 | 4. [Development](#for-developers): information about the Docker image for developers. 12 | 13 | *note:* each docker image is shipped with a readme, which you can read with: 14 | `docker run --entrypoint=/bin/cat openml/openml-python:TAG readme.md` 15 | 16 | ## Usage 17 | 18 | There are three main ways to use the image: running a pre-installed Python environment, 19 | running tests, and building documentation. 
20 | 21 | ### Running `Python` with pre-installed `OpenML-Python` (default): 22 | 23 | To run `Python` with a pre-installed `OpenML-Python` environment run: 24 | 25 | ```text 26 | docker run -it openml/openml-python 27 | ``` 28 | 29 | This accepts the normal `Python` arguments, e.g.: 30 | 31 | ```text 32 | docker run openml/openml-python -c "import openml; print(openml.__version__)" 33 | ``` 34 | 35 | If you want to run a local script, it needs to be mounted first. Mount it into the 36 | `openml` folder: 37 | 38 | ```text 39 | docker run -v PATH/TO/FILE:/openml/MY_SCRIPT.py openml/openml-python MY_SCRIPT.py 40 | ``` 41 | 42 | ### Running unit tests 43 | 44 | You can run the unit tests by passing `test` as the first argument. 45 | It also requires a local or remote repository to be specified, which is explained 46 | [below](#using-local-or-remote-code). For this example, we specify to test the 47 | `develop` branch: 48 | 49 | ```text 50 | docker run openml/openml-python test develop 51 | ``` 52 | 53 | ### Building documentation 54 | 55 | You can build the documentation by passing `doc` as the first argument. 56 | You should [mount](https://docs.docker.com/storage/bind-mounts/#start-a-container-with-a-bind-mount) 57 | an output directory in which the docs will be stored. You also need to provide a remote 58 | or local repository as explained in [the section below](#using-local-or-remote-code). 59 | In this example, we build documentation for the `develop` branch. 60 | On Windows: 61 | 62 | ```text 63 | docker run --mount type=bind,source="E:\\files/output",destination="/output" openml/openml-python doc develop 64 | ``` 65 | 66 | On Linux: 67 | ```text 68 | docker run --mount type=bind,source="./output",destination="/output" openml/openml-python doc develop 69 | ``` 70 | 71 | See [the section below](#using-local-or-remote-code) for running against local changes 72 | or a remote branch. 73 | 74 | *Note: you can forgo mounting an output directory to test whether the docs build successfully, 75 | but the result will only be available within the docker container under `/openml/docs/build`.* 76 | 77 | ## Using local or remote code 78 | 79 | You can build docs or run tests against your local repository or a GitHub repository. 80 | In the examples below, change the `source` to match the location of your local repository. 81 | 82 | ### Using a local repository 83 | 84 | To use a local directory, mount it in the `/code` directory. On Windows: 85 | 86 | ```text 87 | docker run --mount type=bind,source="E:\\repositories/openml-python",destination="/code" openml/openml-python test 88 | ``` 89 | 90 | On Linux: 91 | ```text 92 | docker run --mount type=bind,source="/Users/pietergijsbers/repositories/openml-python",destination="/code" openml/openml-python test 93 | ``` 94 | 95 | When building docs, you also need to mount an output directory as shown above, so add both: 96 | 97 | ```text 98 | docker run --mount type=bind,source="./output",destination="/output" --mount type=bind,source="/Users/pietergijsbers/repositories/openml-python",destination="/code" openml/openml-python doc 99 | ``` 100 | 101 | ### Using a GitHub repository 102 | Building from a remote repository requires you to specify a branch.
103 | The branch may be specified by name directly if it exists on the original repository (https://github.com/openml/openml-python/): 104 | 105 | docker run --mount type=bind,source=PATH_TO_OUTPUT,destination=/output openml/openml-python [test,doc] BRANCH 106 | 107 | where `BRANCH` is the name of the branch to test or to generate documentation for. 108 | It is also possible to build the documentation from a branch on a fork; 109 | in this case, `BRANCH` should be specified as `GITHUB_NAME#BRANCH` (e.g. 110 | `PGijsbers#my_feature_branch`) and the name of the forked repository should be `openml-python`. 111 | 112 | ## For developers 113 | This section contains some notes about the structure of the image, 114 | intended for those who want to work on it. 115 | 116 | ### Added Directories 117 | The `openml/openml-python` image is built on a vanilla `python:3` image. 118 | Additionally, it contains the following files and directories: 119 | 120 | - `/openml`: contains the openml-python repository in the state with which the image 121 | was built by default. If working with a `BRANCH`, this repository will be set to 122 | the `HEAD` of `BRANCH`. 123 | - `/openml/venv/`: contains the virtual environment used for `doc` and `test`. It has 124 | `openml-python` dependencies pre-installed. When invoked with `doc` or `test`, the 125 | dependencies will be updated based on the `setup.py` of the `BRANCH` or mounted `/code`. 126 | - `/scripts/startup.sh`: the entrypoint of the image. Takes care of the automated features (e.g. `doc` and `test`). 127 | 128 | ## Building the image 129 | To build the image yourself, execute `docker build -f Dockerfile .` from the `docker` 130 | directory of the `openml-python` repository. It will use the `startup.sh` as is, so any 131 | local changes will be present in the image. 132 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | push: 7 | branches: 8 | - main 9 | - develop 10 | tags: 11 | - "v*.*.*" 12 | 13 | pull_request: 14 | branches: 15 | - main 16 | - develop 17 | 18 | concurrency: 19 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} 20 | cancel-in-progress: true 21 | 22 | jobs: 23 | test: 24 | name: (${{ matrix.os }}, Py${{ matrix.python-version }}, sk${{ matrix.scikit-learn }}, sk-only:${{ matrix.sklearn-only }}) 25 | runs-on: ${{ matrix.os }} 26 | strategy: 27 | matrix: 28 | python-version: ["3.9"] 29 | scikit-learn: ["1.0.*", "1.1.*", "1.2.*", "1.3.*", "1.4.*", "1.5.*"] 30 | os: [ubuntu-latest] 31 | sklearn-only: ["true"] 32 | include: 33 | - os: ubuntu-latest 34 | python-version: "3.8" # no scikit-learn 0.23 release for Python 3.9 35 | scikit-learn: "0.23.1" 36 | sklearn-only: "true" 37 | # scikit-learn 0.24 relies on scipy defaults, so we need to fix the version 38 | # c.f.
https://github.com/openml/openml-python/pull/1267 39 | - os: ubuntu-latest 40 | python-version: "3.9" 41 | scikit-learn: "0.24" 42 | scipy: "1.10.0" 43 | sklearn-only: "true" 44 | # Do a Windows and Ubuntu test for _all_ openml functionality 45 | # I am not sure why these are on 3.8 and older scikit-learn 46 | - os: windows-latest 47 | python-version: "3.8" 48 | scikit-learn: 0.24.* 49 | scipy: "1.10.0" 50 | sklearn-only: 'false' 51 | # Include a code cov version 52 | - os: ubuntu-latest 53 | code-cov: true 54 | python-version: "3.8" 55 | scikit-learn: 0.23.1 56 | sklearn-only: 'false' 57 | fail-fast: false 58 | 59 | steps: 60 | - uses: actions/checkout@v4 61 | with: 62 | fetch-depth: 2 63 | - name: Setup Python ${{ matrix.python-version }} 64 | if: matrix.os != 'windows-latest' # windows-latest only uses preinstalled Python (3.9.13) 65 | uses: actions/setup-python@v5 66 | with: 67 | python-version: ${{ matrix.python-version }} 68 | - name: Install test dependencies 69 | run: | 70 | python -m pip install --upgrade pip 71 | pip install -e .[test] 72 | - name: Install scikit-learn ${{ matrix.scikit-learn }} 73 | run: | 74 | pip install scikit-learn==${{ matrix.scikit-learn }} 75 | - name: Install numpy for Python 3.8 76 | # Python 3.8 & scikit-learn<0.24 requires numpy<=1.23.5 77 | if: ${{ matrix.python-version == '3.8' && matrix.scikit-learn == '0.23.1' }} 78 | run: | 79 | pip install numpy==1.23.5 80 | - name: "Install NumPy 1.x and SciPy <1.11 for scikit-learn < 1.4" 81 | if: ${{ contains(fromJSON('["1.0.*", "1.1.*", "1.2.*", "1.3.*"]'), matrix.scikit-learn) }} 82 | run: | 83 | # scipy has a change to the 'mode' behavior which breaks scikit-learn < 1.4 84 | # numpy 2.0 has several breaking changes 85 | pip install "numpy<2.0" "scipy<1.11" 86 | - name: Install scipy ${{ matrix.scipy }} 87 | if: ${{ matrix.scipy }} 88 | run: | 89 | pip install scipy==${{ matrix.scipy }} 90 | - name: Store repository status 91 | id: status-before 92 | if: matrix.os != 'windows-latest' 93 | run: | 94 | git_status=$(git status --porcelain -b) 95 | echo "BEFORE=$git_status" >> $GITHUB_ENV 96 | echo "Repository status before tests: $git_status" 97 | - name: Show installed dependencies 98 | run: python -m pip list 99 | - name: Run tests on Ubuntu Test 100 | if: matrix.os == 'ubuntu-latest' 101 | run: | 102 | if [ ${{ matrix.code-cov }} ]; then codecov='--cov=openml --long --cov-report=xml'; fi 103 | # Most of the time, running only the scikit-learn tests is sufficient 104 | if [ ${{ matrix.sklearn-only }} = 'true' ]; then marks='sklearn and not production'; else marks='not production'; fi 105 | echo pytest -n 4 --durations=20 --dist load -sv $codecov -o log_cli=true -m "$marks" 106 | pytest -n 4 --durations=20 --dist load -sv $codecov -o log_cli=true -m "$marks" 107 | - name: Run tests on Ubuntu Production 108 | if: matrix.os == 'ubuntu-latest' 109 | run: | 110 | if [ ${{ matrix.code-cov }} ]; then codecov='--cov=openml --long --cov-report=xml'; fi 111 | # Most of the time, running only the scikit-learn tests is sufficient 112 | if [ ${{ matrix.sklearn-only }} = 'true' ]; then marks='sklearn and production'; else marks='production'; fi 113 | echo pytest -n 4 --durations=20 --dist load -sv $codecov -o log_cli=true -m "$marks" 114 | pytest -n 4 --durations=20 --dist load -sv $codecov -o log_cli=true -m "$marks" 115 | - name: Run tests on Windows 116 | if: matrix.os == 'windows-latest' 117 | run: | # we need a separate step because of the bash-specific if-statement in the previous one. 
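# Note: the --reruns/--reruns-delay flags below come from the pytest-rerunfailures
# plugin (assumed here to be installed via the test dependencies); they retry flaky tests.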
118 | pytest -n 4 --durations=20 --dist load -sv --reruns 5 --reruns-delay 1 119 | - name: Check for files left behind by test 120 | if: matrix.os != 'windows-latest' && always() 121 | run: | 122 | before="${{ env.BEFORE }}" 123 | after="$(git status --porcelain -b)" 124 | if [[ "$before" != "$after" ]]; then 125 | echo "git status from before: $before" 126 | echo "git status from after: $after" 127 | echo "Not all generated files have been deleted!" 128 | exit 1 129 | fi 130 | - name: Upload coverage 131 | if: matrix.code-cov && always() 132 | uses: codecov/codecov-action@v4 133 | with: 134 | files: coverage.xml 135 | token: ${{ secrets.CODECOV_TOKEN }} 136 | fail_ci_if_error: true 137 | verbose: true 138 | -------------------------------------------------------------------------------- /openml/setups/setup.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | from __future__ import annotations 3 | 4 | from typing import Any 5 | 6 | import openml.config 7 | import openml.flows 8 | 9 | 10 | class OpenMLSetup: 11 | """Setup object (a.k.a. Configuration). 12 | 13 | Parameters 14 | ---------- 15 | setup_id : int 16 | The OpenML setup id 17 | flow_id : int 18 | The flow it is built upon 19 | parameters : dict 20 | The setting of the parameters 21 | """ 22 | 23 | def __init__(self, setup_id: int, flow_id: int, parameters: dict[int, Any] | None): 24 | if not isinstance(setup_id, int): 25 | raise ValueError("setup id should be int") 26 | 27 | if not isinstance(flow_id, int): 28 | raise ValueError("flow id should be int") 29 | 30 | if parameters is not None and not isinstance(parameters, dict): 31 | raise ValueError("parameters should be dict") 32 | 33 | self.setup_id = setup_id 34 | self.flow_id = flow_id 35 | self.parameters = parameters 36 | 37 | def _to_dict(self) -> dict[str, Any]: 38 | return { 39 | "setup_id": self.setup_id, 40 | "flow_id": self.flow_id, 41 | "parameters": {p.id: p._to_dict() for p in self.parameters.values()} 42 | if self.parameters is not None 43 | else None, 44 | } 45 | 46 | def __repr__(self) -> str: 47 | header = "OpenML Setup" 48 | header = f"{header}\n{'=' * len(header)}\n" 49 | 50 | fields = { 51 | "Setup ID": self.setup_id, 52 | "Flow ID": self.flow_id, 53 | "Flow URL": openml.flows.OpenMLFlow.url_for_id(self.flow_id), 54 | "# of Parameters": ( 55 | len(self.parameters) if self.parameters is not None else float("nan") 56 | ), 57 | } 58 | 59 | # determines the order in which the information will be printed 60 | order = ["Setup ID", "Flow ID", "Flow URL", "# of Parameters"] 61 | _fields = [(key, fields[key]) for key in order if key in fields] 62 | 63 | longest_field_name_length = max(len(name) for name, _ in _fields) 64 | field_line_format = f"{{:.<{longest_field_name_length}}}: {{}}" 65 | body = "\n".join(field_line_format.format(name, value) for name, value in _fields) 66 | return header + body 67 | 68 | 69 | class OpenMLParameter: 70 | """Parameter object (used in setup). 71 | 72 | Parameters 73 | ---------- 74 | input_id : int 75 | The input id from the openml database 76 | flow_id : int 77 | The flow with which this parameter is associated 78 | flow_name : str 79 | The name of the flow (no version number) with which this parameter 80 | is associated 81 | full_name : str 82 | The name of the flow and parameter combined 83 | parameter_name : str 84 | The name of the parameter 85 | data_type : str 86 | The datatype of the parameter.
Generally unused for sklearn flows 87 | default_value : str 88 | The default value. For sklearn parameters, this is unknown and a 89 | default value is selected arbitrarily 90 | value : str 91 | If the parameter was set, the value that it was set to. 92 | """ 93 | 94 | def __init__( # noqa: PLR0913 95 | self, 96 | input_id: int, 97 | flow_id: int, 98 | flow_name: str, 99 | full_name: str, 100 | parameter_name: str, 101 | data_type: str, 102 | default_value: str, 103 | value: str, 104 | ): 105 | self.id = input_id 106 | self.flow_id = flow_id 107 | self.flow_name = flow_name 108 | self.full_name = full_name 109 | self.parameter_name = parameter_name 110 | self.data_type = data_type 111 | self.default_value = default_value 112 | self.value = value 113 | 114 | def _to_dict(self) -> dict[str, Any]: 115 | return { 116 | "id": self.id, 117 | "flow_id": self.flow_id, 118 | "flow_name": self.flow_name, 119 | "full_name": self.full_name, 120 | "parameter_name": self.parameter_name, 121 | "data_type": self.data_type, 122 | "default_value": self.default_value, 123 | "value": self.value, 124 | } 125 | 126 | def __repr__(self) -> str: 127 | header = "OpenML Parameter" 128 | header = f"{header}\n{'=' * len(header)}\n" 129 | 130 | fields = { 131 | "ID": self.id, 132 | "Flow ID": self.flow_id, 133 | # "Flow Name": self.flow_name, 134 | "Flow Name": self.full_name, 135 | "Flow URL": openml.flows.OpenMLFlow.url_for_id(self.flow_id), 136 | "Parameter Name": self.parameter_name, 137 | } 138 | # indented prints for parameter attributes 139 | # indentation = 2 spaces + 1 | + 2 underscores 140 | indent = f"{' ' * 2}|{'_' * 2}" 141 | parameter_data_type = f"{indent}Data Type" 142 | fields[parameter_data_type] = self.data_type 143 | parameter_default = f"{indent}Default" 144 | fields[parameter_default] = self.default_value 145 | parameter_value = f"{indent}Value" 146 | fields[parameter_value] = self.value 147 | 148 | # determines the order in which the information will be printed 149 | order = [ 150 | "ID", 151 | "Flow ID", 152 | "Flow Name", 153 | "Flow URL", 154 | "Parameter Name", 155 | parameter_data_type, 156 | parameter_default, 157 | parameter_value, 158 | ] 159 | _fields = [(key, fields[key]) for key in order if key in fields] 160 | 161 | longest_field_name_length = max(len(name) for name, _ in _fields) 162 | field_line_format = f"{{:.<{longest_field_name_length}}}: {{}}" 163 | body = "\n".join(field_line_format.format(name, value) for name, value in _fields) 164 | return header + body 165 | -------------------------------------------------------------------------------- /tests/test_utils/test_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | import unittest.mock 5 | import pytest 6 | import openml 7 | from openml.testing import _check_dataset 8 | 9 | 10 | @pytest.fixture() 11 | def min_number_tasks_on_test_server() -> int: 12 | """After a reset at least 1068 tasks are on the test server""" 13 | return 1068 14 | 15 | 16 | @pytest.fixture() 17 | def min_number_datasets_on_test_server() -> int: 18 | """After a reset at least 127 datasets are on the test server""" 19 | return 127 20 | 21 | 22 | @pytest.fixture() 23 | def min_number_flows_on_test_server() -> int: 24 | """After a reset at least 15 flows are on the test server""" 25 | return 15 26 | 27 | 28 | @pytest.fixture() 29 | def min_number_setups_on_test_server() -> int: 30 | """After a reset at least 50 setups are on the test server""" 31 | return 50 32 |
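# NOTE: the fixtures in this module are lower bounds measured right after a
# test-server reset; other tests may create additional entities while the suite
# runs, which is why several listing tests below are marked as flaky.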
33 | 34 | @pytest.fixture() 35 | def min_number_runs_on_test_server() -> int: 36 | """After a reset at least 21 runs are on the test server""" 37 | return 21 38 | 39 | 40 | @pytest.fixture() 41 | def min_number_evaluations_on_test_server() -> int: 42 | """After a reset at least 8 evaluations are on the test server""" 43 | return 8 44 | 45 | 46 | def _mocked_perform_api_call(call, request_method): 47 | url = openml.config.server + "/" + call 48 | return openml._api_calls._download_text_file(url) 49 | 50 | 51 | @pytest.mark.server() 52 | def test_list_all(): 53 | openml.utils._list_all(listing_call=openml.tasks.functions._list_tasks) 54 | 55 | 56 | @pytest.mark.server() 57 | def test_list_all_for_tasks(min_number_tasks_on_test_server): 58 | tasks = openml.tasks.list_tasks(size=min_number_tasks_on_test_server) 59 | assert min_number_tasks_on_test_server == len(tasks) 60 | 61 | 62 | @pytest.mark.server() 63 | def test_list_all_with_multiple_batches(min_number_tasks_on_test_server): 64 | # By setting the batch size one lower than the minimum we guarantee at least two 65 | # batches and at the same time do as few batches (roundtrips) as possible. 66 | batch_size = min_number_tasks_on_test_server - 1 67 | batches = openml.utils._list_all( 68 | listing_call=openml.tasks.functions._list_tasks, 69 | batch_size=batch_size, 70 | ) 71 | assert len(batches) >= 2 72 | assert min_number_tasks_on_test_server <= sum(len(batch) for batch in batches) 73 | 74 | 75 | @pytest.mark.server() 76 | def test_list_all_for_datasets(min_number_datasets_on_test_server): 77 | datasets = openml.datasets.list_datasets( 78 | size=min_number_datasets_on_test_server, 79 | ) 80 | 81 | assert min_number_datasets_on_test_server == len(datasets) 82 | for dataset in datasets.to_dict(orient="index").values(): 83 | _check_dataset(dataset) 84 | 85 | 86 | @pytest.mark.server() 87 | def test_list_all_for_flows(min_number_flows_on_test_server): 88 | flows = openml.flows.list_flows(size=min_number_flows_on_test_server) 89 | assert min_number_flows_on_test_server == len(flows) 90 | 91 | 92 | @pytest.mark.server() 93 | @pytest.mark.flaky() # Other tests might need to upload runs first 94 | def test_list_all_for_setups(min_number_setups_on_test_server): 95 | # TODO apparently list_setups function does not support kwargs 96 | setups = openml.setups.list_setups(size=min_number_setups_on_test_server) 97 | assert min_number_setups_on_test_server == len(setups) 98 | 99 | 100 | @pytest.mark.server() 101 | @pytest.mark.flaky() # Other tests might need to upload runs first 102 | def test_list_all_for_runs(min_number_runs_on_test_server): 103 | runs = openml.runs.list_runs(size=min_number_runs_on_test_server) 104 | assert min_number_runs_on_test_server == len(runs) 105 | 106 | 107 | @pytest.mark.server() 108 | @pytest.mark.flaky() # Other tests might need to upload runs first 109 | def test_list_all_for_evaluations(min_number_evaluations_on_test_server): 110 | # TODO apparently list_evaluations function does not support kwargs 111 | evaluations = openml.evaluations.list_evaluations( 112 | function="predictive_accuracy", 113 | size=min_number_evaluations_on_test_server, 114 | ) 115 | assert min_number_evaluations_on_test_server == len(evaluations) 116 | 117 | 118 | @pytest.mark.server() 119 | @unittest.mock.patch("openml._api_calls._perform_api_call", side_effect=_mocked_perform_api_call) 120 | def test_list_all_few_results_available(_perform_api_call): 121 | datasets = openml.datasets.list_datasets(size=1000, data_name="iris", data_version=1) 122 | 
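# Only one dataset matches these filters, so a single (mocked) listing call should suffice.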
assert len(datasets) == 1, "only one iris dataset version 1 should be present" 123 | assert _perform_api_call.call_count == 1, "expect just one call to get one dataset" 124 | 125 | 126 | @unittest.skipIf(os.name == "nt", "https://github.com/openml/openml-python/issues/1033") 127 | @unittest.mock.patch("openml.config.get_cache_directory") 128 | def test__create_cache_directory(config_mock, tmp_path): 129 | config_mock.return_value = tmp_path 130 | openml.utils._create_cache_directory("abc") 131 | assert (tmp_path / "abc").exists() 132 | 133 | subdir = tmp_path / "def" 134 | subdir.mkdir() 135 | subdir.chmod(0o444) 136 | config_mock.return_value = subdir 137 | with pytest.raises( 138 | openml.exceptions.OpenMLCacheException, 139 | match="Cannot create cache directory", 140 | ): 141 | openml.utils._create_cache_directory("ghi") 142 | 143 | 144 | @pytest.mark.server() 145 | def test_correct_test_server_download_state(): 146 | """This test verifies that the test server downloads the data from the correct source. 147 | 148 | If this tests fails, it is highly likely that the test server is not configured correctly. 149 | Usually, this means that the test server is serving data from the task with the same ID from the production server. 150 | That is, it serves parquet files wrongly associated with the test server's task. 151 | """ 152 | task = openml.tasks.get_task(119) 153 | dataset = task.get_dataset() 154 | assert len(dataset.features) == dataset.get_data()[0].shape[1] 155 | -------------------------------------------------------------------------------- /examples/Advanced/task_manual_iteration_tutorial.py: -------------------------------------------------------------------------------- 1 | # %% [markdown] 2 | # Tasks define a target and a train/test split, which we can use for benchmarking. 3 | 4 | # %% 5 | import openml 6 | 7 | # %% [markdown] 8 | # For this tutorial we will use the famous King+Rook versus King+Pawn on A7 dataset, which has 9 | # the dataset ID 3 ([dataset on OpenML](https://www.openml.org/d/3)), and for which there exist 10 | # tasks with all important estimation procedures. It is small enough (less than 5000 samples) to 11 | # efficiently use it in an example. 12 | # 13 | # We will first start with ([task 233](https://www.openml.org/t/233)), which is a task with a 14 | # holdout estimation procedure. 15 | 16 | # %% 17 | task_id = 233 18 | task = openml.tasks.get_task(task_id) 19 | 20 | # %% [markdown] 21 | # Now that we have a task object we can obtain the number of repetitions, folds and samples as 22 | # defined by the task: 23 | 24 | # %% 25 | n_repeats, n_folds, n_samples = task.get_split_dimensions() 26 | 27 | # %% [markdown] 28 | # * ``n_repeats``: Number of times the model quality estimation is performed 29 | # * ``n_folds``: Number of folds per repeat 30 | # * ``n_samples``: How many data points to use. This is only relevant for learning curve tasks 31 | # 32 | # A list of all available estimation procedures is available 33 | # [here](https://www.openml.org/search?q=%2520measure_type%3Aestimation_procedure&type=measure). 34 | # 35 | # Task ``233`` is a simple task using the holdout estimation procedure and therefore has only a 36 | # single repeat, a single fold and a single sample size: 37 | 38 | # %% 39 | print( 40 | f"Task {task_id}: number of repeats: {n_repeats}, number of folds: {n_folds}, number of samples {n_samples}." 
41 | ) 42 | 43 | # %% [markdown] 44 | # We can now retrieve the train/test split for this combination of repeats, folds and number of 45 | # samples (indexing is zero-based). Usually, one would loop over all repeats, folds and sample 46 | # sizes, but we can neglect this here as there is only a single repetition. 47 | 48 | # %% 49 | train_indices, test_indices = task.get_train_test_split_indices( 50 | repeat=0, 51 | fold=0, 52 | sample=0, 53 | ) 54 | 55 | print(train_indices.shape, train_indices.dtype) 56 | print(test_indices.shape, test_indices.dtype) 57 | 58 | # %% [markdown] 59 | # And then split the data based on this: 60 | 61 | # %% 62 | X, y = task.get_X_and_y() 63 | X_train = X.iloc[train_indices] 64 | y_train = y.iloc[train_indices] 65 | X_test = X.iloc[test_indices] 66 | y_test = y.iloc[test_indices] 67 | 68 | print( 69 | f"X_train.shape: {X_train.shape}, y_train.shape: {y_train.shape}, X_test.shape: {X_test.shape}, y_test.shape: {y_test.shape}" 70 | ) 71 | 72 | # %% [markdown] 73 | # Obviously, we can also retrieve cross-validation versions of the dataset used in task ``233``: 74 | 75 | # %% 76 | task_id = 3 77 | task = openml.tasks.get_task(task_id) 78 | X, y = task.get_X_and_y() 79 | n_repeats, n_folds, n_samples = task.get_split_dimensions() 80 | print( 81 | f"Task {task_id}: number of repeats: {n_repeats}, number of folds: {n_folds}, number of samples {n_samples}." 82 | ) 83 | 84 | # %% [markdown] 85 | # And then perform the aforementioned iteration over all splits: 86 | 87 | # %% 88 | for repeat_idx in range(n_repeats): 89 | for fold_idx in range(n_folds): 90 | for sample_idx in range(n_samples): 91 | train_indices, test_indices = task.get_train_test_split_indices( 92 | repeat=repeat_idx, 93 | fold=fold_idx, 94 | sample=sample_idx, 95 | ) 96 | X_train = X.iloc[train_indices] 97 | y_train = y.iloc[train_indices] 98 | X_test = X.iloc[test_indices] 99 | y_test = y.iloc[test_indices] 100 | 101 | print( 102 | f"Repeat #{repeat_idx}, fold #{fold_idx}, samples {sample_idx}: X_train.shape: {X_train.shape}, " 103 | f"y_train.shape {y_train.shape}, X_test.shape {X_test.shape}, y_test.shape {y_test.shape}" 104 | ) 105 | 106 | # %% [markdown] 107 | # And also versions with multiple repeats: 108 | 109 | # %% 110 | task_id = 1767 111 | task = openml.tasks.get_task(task_id) 112 | X, y = task.get_X_and_y() 113 | n_repeats, n_folds, n_samples = task.get_split_dimensions() 114 | print( 115 | f"Task {task_id}: number of repeats: {n_repeats}, number of folds: {n_folds}, number of samples {n_samples}." 
116 | ) 117 | 118 | # %% [markdown] 119 | # And then again perform the aforementioned iteration over all splits: 120 | 121 | # %% 122 | for repeat_idx in range(n_repeats): 123 | for fold_idx in range(n_folds): 124 | for sample_idx in range(n_samples): 125 | train_indices, test_indices = task.get_train_test_split_indices( 126 | repeat=repeat_idx, 127 | fold=fold_idx, 128 | sample=sample_idx, 129 | ) 130 | X_train = X.iloc[train_indices] 131 | y_train = y.iloc[train_indices] 132 | X_test = X.iloc[test_indices] 133 | y_test = y.iloc[test_indices] 134 | 135 | print( 136 | f"Repeat #{repeat_idx}, fold #{fold_idx}, samples {sample_idx}: X_train.shape: {X_train.shape}, " 137 | f"y_train.shape {y_train.shape}, X_test.shape {X_test.shape}, y_test.shape {y_test.shape}" 138 | ) 139 | 140 | # %% [markdown] 141 | # And finally a task based on learning curves: 142 | 143 | # %% 144 | task_id = 1702 145 | task = openml.tasks.get_task(task_id) 146 | X, y = task.get_X_and_y() 147 | n_repeats, n_folds, n_samples = task.get_split_dimensions() 148 | print( 149 | f"Task {task_id}: number of repeats: {n_repeats}, number of folds: {n_folds}, number of samples {n_samples}." 150 | ) 151 | 152 | # %% [markdown] 153 | # And then again perform the aforementioned iteration over all splits: 154 | 155 | # %% 156 | for repeat_idx in range(n_repeats): 157 | for fold_idx in range(n_folds): 158 | for sample_idx in range(n_samples): 159 | train_indices, test_indices = task.get_train_test_split_indices( 160 | repeat=repeat_idx, 161 | fold=fold_idx, 162 | sample=sample_idx, 163 | ) 164 | X_train = X.iloc[train_indices] 165 | y_train = y.iloc[train_indices] 166 | X_test = X.iloc[test_indices] 167 | y_test = y.iloc[test_indices] 168 | 169 | print( 170 | f"Repeat #{repeat_idx}, fold #{fold_idx}, samples {sample_idx}: X_train.shape: {X_train.shape}, " 171 | f"y_train.shape {y_train.shape}, X_test.shape {X_test.shape}, y_test.shape {y_test.shape}" 172 | ) 173 | -------------------------------------------------------------------------------- /openml/base.py: -------------------------------------------------------------------------------- 1 | # License: BSD 3-Clause 2 | from __future__ import annotations 3 | 4 | import re 5 | import webbrowser 6 | from abc import ABC, abstractmethod 7 | from typing import Iterable, Sequence 8 | 9 | import xmltodict 10 | 11 | import openml._api_calls 12 | import openml.config 13 | 14 | from .utils import _get_rest_api_type_alias, _tag_openml_base 15 | 16 | 17 | class OpenMLBase(ABC): 18 | """Base object for functionality that is shared across entities.""" 19 | 20 | def __repr__(self) -> str: 21 | body_fields = self._get_repr_body_fields() 22 | return self._apply_repr_template(body_fields) 23 | 24 | @property 25 | @abstractmethod 26 | def id(self) -> int | None: 27 | """The id of the entity, it is unique for its entity type.""" 28 | 29 | @property 30 | def openml_url(self) -> str | None: 31 | """The URL of the object on the server, if it was uploaded, else None.""" 32 | if self.id is None: 33 | return None 34 | return self.__class__.url_for_id(self.id) 35 | 36 | @classmethod 37 | def url_for_id(cls, id_: int) -> str: 38 | """Return the OpenML URL for the object of the class entity with the given id.""" 39 | # Sample url for a flow: openml.org/f/123 40 | return f"{openml.config.get_server_base_url()}/{cls._entity_letter()}/{id_}" 41 | 42 | @classmethod 43 | def _entity_letter(cls) -> str: 44 | """Return the letter which represents the entity type in urls, e.g. 
'f' for flow.""" 45 | # We take advantage of the class naming convention (OpenMLX), 46 | # which holds for all entities except studies and tasks, which overwrite this method. 47 | return cls.__name__.lower()[len("OpenML") :][0] 48 | 49 | # TODO(eddiebergman): This would be much cleaner as an iterator... 50 | @abstractmethod 51 | def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str] | None]]: 52 | """Collect all information to display in the __repr__ body. 53 | 54 | Returns 55 | ------- 56 | body_fields : List[Tuple[str, Union[str, int, List[str]]]] 57 | A list of (name, value) pairs to display in the body of the __repr__. 58 | E.g.: [('metric', 'accuracy'), ('dataset', 'iris')] 59 | If value is a List of str, then each item of the list will appear in a separate row. 60 | """ 61 | # Should be implemented in the child class. 62 | 63 | def _apply_repr_template( 64 | self, 65 | body_fields: Iterable[tuple[str, str | int | list[str] | None]], 66 | ) -> str: 67 | """Generates the header and formats the body for string representation of the object. 68 | 69 | Parameters 70 | ---------- 71 | body_fields: List[Tuple[str, str]] 72 | A list of (name, value) pairs to display in the body of the __repr__. 73 | """ 74 | # We add spaces between capitals, e.g. ClassificationTask -> Classification Task 75 | name_with_spaces = re.sub( 76 | r"(\w)([A-Z])", 77 | r"\1 \2", 78 | self.__class__.__name__[len("OpenML") :], 79 | ) 80 | header_text = f"OpenML {name_with_spaces}" 81 | header = f"{header_text}\n{'=' * len(header_text)}\n" 82 | 83 | _body_fields: list[tuple[str, str | int | list[str]]] = [ 84 | (k, "None" if v is None else v) for k, v in body_fields 85 | ] 86 | longest_field_name_length = max(len(name) for name, _ in _body_fields) 87 | field_line_format = f"{{:.<{longest_field_name_length}}}: {{}}" 88 | body = "\n".join(field_line_format.format(name, value) for name, value in _body_fields) 89 | return header + body 90 | 91 | @abstractmethod 92 | def _to_dict(self) -> dict[str, dict]: 93 | """Creates a dictionary representation of self. 94 | 95 | The return value will be used to create the upload xml file. 96 | The xml file must have the tags in exactly the order of the object's xsd. 97 | (see https://github.com/openml/OpenML/blob/master/openml_OS/views/pages/api_new/v1/xsd/). 98 | 99 | Returns 100 | ------- 101 | Thing represented as dict. 102 | """ 103 | # Should be implemented in the child class. 104 | 105 | def _to_xml(self) -> str: 106 | """Generate xml representation of self for upload to server.""" 107 | dict_representation = self._to_dict() 108 | xml_representation = xmltodict.unparse(dict_representation, pretty=True) 109 | 110 | # A task may not be uploaded with the xml encoding specification: 111 | # <?xml version="1.0" encoding="utf-8"?> 112 | _encoding_specification, xml_body = xml_representation.split("\n", 1) 113 | return str(xml_body) 114 | 115 | def _get_file_elements(self) -> openml._api_calls.FILE_ELEMENTS_TYPE: 116 | """Get file_elements to upload to the server, called during Publish. 117 | 118 | Derived child classes should overwrite this method as necessary. 119 | The description field will be populated automatically if not provided.
120 | """ 121 | return {} 122 | 123 | @abstractmethod 124 | def _parse_publish_response(self, xml_response: dict[str, str]) -> None: 125 | """Parse the id from the xml_response and assign it to self.""" 126 | 127 | def publish(self) -> OpenMLBase: 128 | """Publish the object on the OpenML server.""" 129 | file_elements = self._get_file_elements() 130 | 131 | if "description" not in file_elements: 132 | file_elements["description"] = self._to_xml() 133 | 134 | call = f"{_get_rest_api_type_alias(self)}/" 135 | response_text = openml._api_calls._perform_api_call( 136 | call, 137 | "post", 138 | file_elements=file_elements, 139 | ) 140 | xml_response = xmltodict.parse(response_text) 141 | 142 | self._parse_publish_response(xml_response) 143 | return self 144 | 145 | def open_in_browser(self) -> None: 146 | """Opens the OpenML web page corresponding to this object in your default browser.""" 147 | if self.openml_url is None: 148 | raise ValueError( 149 | "Cannot open element on OpenML.org when attribute `openml_url` is `None`", 150 | ) 151 | 152 | webbrowser.open(self.openml_url) 153 | 154 | def push_tag(self, tag: str) -> None: 155 | """Annotates this entity with a tag on the server. 156 | 157 | Parameters 158 | ---------- 159 | tag : str 160 | Tag to attach to the flow. 161 | """ 162 | _tag_openml_base(self, tag) 163 | 164 | def remove_tag(self, tag: str) -> None: 165 | """Removes a tag from this entity on the server. 166 | 167 | Parameters 168 | ---------- 169 | tag : str 170 | Tag to attach to the flow. 171 | """ 172 | _tag_openml_base(self, tag, untag=True) 173 | --------------------------------------------------------------------------------