├── .circleci
└── config.yml
├── .gitignore
├── README.md
├── docs
├── benchmark
│ ├── automl
│ │ ├── AutoML-Benchmark.md
│ │ ├── basic_example.md
│ │ ├── benchmark_on_openml.md
│ │ ├── important_params.md
│ │ └── specific_task_fold_example.md
│ └── index.md
├── concepts
│ ├── authentication.md
│ ├── benchmarking.md
│ ├── data.md
│ ├── flows.md
│ ├── index.md
│ ├── runs.md
│ ├── sharing.md
│ ├── tagging.md
│ └── tasks.md
├── contributing
│ ├── OpenML-Docs.md
│ ├── Style.md
│ ├── backend
│ │ ├── API-development.md
│ │ ├── Datasets.md
│ │ ├── Java-App.md
│ │ └── Local-Installation.md
│ ├── clients
│ │ ├── Client-API-Standards.md
│ │ ├── Rest.md
│ │ └── metadata_definition.md
│ ├── index.md
│ ├── resources.md
│ └── website
│ │ ├── Dash.md
│ │ ├── Flask.md
│ │ ├── React.md
│ │ └── Website.md
├── css
│ └── extra.css
├── data
│ ├── index.md
│ ├── specs.md
│ └── use.md
├── ecosystem
│ ├── Java.md
│ ├── MOA.md
│ ├── Python_extensions.md
│ ├── Rest.md
│ ├── Scikit-learn
│ │ ├── basic_tutorial.ipynb
│ │ ├── datasets_tutorial.ipynb
│ │ └── index.md
│ ├── Weka.md
│ ├── index.md
│ ├── mlr.md
│ └── showcase.md
├── examples
│ ├── 20_basic
│ │ ├── README.txt
│ │ ├── introduction_tutorial.py
│ │ ├── simple_datasets_tutorial.py
│ │ ├── simple_flows_and_runs_tutorial.py
│ │ └── simple_suites_tutorial.py
│ ├── 30_extended
│ │ ├── README.txt
│ │ ├── configure_logging.py
│ │ ├── create_upload_tutorial.py
│ │ ├── custom_flow_.py
│ │ ├── datasets_tutorial.py
│ │ ├── fetch_evaluations_tutorial.py
│ │ ├── fetch_runtimes_tutorial.py
│ │ ├── flow_id_tutorial.py
│ │ ├── flows_and_runs_tutorial.py
│ │ ├── plot_svm_hyperparameters_tutorial.py
│ │ ├── run_setup_tutorial.py
│ │ ├── study_tutorial.py
│ │ ├── suites_tutorial.py
│ │ ├── task_manual_iteration_tutorial.py
│ │ └── tasks_tutorial.py
│ ├── 40_paper
│ │ ├── 2015_neurips_feurer_example.py
│ │ ├── 2018_ida_strang_example.py
│ │ ├── 2018_kdd_rijn_example.py
│ │ ├── 2018_neurips_perrone_example.py
│ │ └── README.txt
│ ├── README.txt
│ └── SUMMARY.md
├── help
│ └── index.md
├── img
│ ├── OpenML-governance.png
│ ├── OpenML-logo2.png
│ ├── R.png
│ ├── api_get_dataset.png
│ ├── api_get_implementation.png
│ ├── api_get_task.png
│ ├── api_upload_data.png
│ ├── api_upload_implementation.png
│ ├── api_upload_run.png
│ ├── benchmark_cartoon.webp
│ ├── cover-results.png
│ ├── data-ss.png
│ ├── data-ss1.png
│ ├── data-ss2.png
│ ├── data_cartoon.webp
│ ├── data_version.png
│ ├── dots-sq.png
│ ├── dots.png
│ ├── dotsicon.png
│ ├── editdata.png
│ ├── editdatagui.png
│ ├── expdb_run.png
│ ├── expdblogo.png
│ ├── expdblogo2.png
│ ├── expdbschema.png
│ ├── expdbschema2.png
│ ├── favicon.ico
│ ├── flow-ss1.png
│ ├── flow_top.png
│ ├── icon-sprite.png
│ ├── intro-brain.png
│ ├── intro-code.png
│ ├── intro-data.png
│ ├── intro-run.png
│ ├── intro-task.png
│ ├── java.png
│ ├── logo-github.svg
│ ├── logo_kuleuven.png
│ ├── logo_tue.gif
│ ├── logo_uleiden.png
│ ├── mlr.png
│ ├── moa.jpeg
│ ├── openml-black.png
│ ├── openml-gray.png
│ ├── openml-logo.png
│ ├── openml.png
│ ├── openml_black.png
│ ├── openmldiagram-alpha.png
│ ├── openmldiagram-alpha.pxm
│ ├── openmldiagram.png
│ ├── openmlicon.png
│ ├── openmllogo-beta.png
│ ├── openmllogo.png
│ ├── openmlmoa.png
│ ├── openmlweka.png
│ ├── osr.png
│ ├── plugins-ss1.png
│ ├── python.png
│ ├── r-ss1.png
│ ├── rapidminer.png
│ ├── react-components.png
│ ├── rest.png
│ ├── run-ss1.png
│ ├── run_study.png
│ ├── structure.png
│ ├── studies.png
│ ├── task-ss1.png
│ ├── task-ss2.png
│ ├── task_leaderboard.png
│ ├── task_top_flows.png
│ ├── webui.png
│ └── webui2.png
├── index.md
├── intro
│ ├── Governance.md
│ ├── showcase.md
│ └── terms.md
├── js
│ └── extra.js
├── notebooks
│ └── getting_started.ipynb
├── old
│ ├── Data-collections.md
│ └── altmetrics.md
└── python
│ ├── contributing.md
│ ├── extensions.md
│ ├── index.md
│ ├── progress.md
│ └── usage.md
├── mkdocs-local.yml
├── mkdocs.yml
├── requirements.txt
└── scripts
├── gen_python_ref_pages.py
├── github_scraper.py
└── showcase_urls.txt
/.circleci/config.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 |
3 | jobs:
4 | deploydocs:
5 | docker:
6 | - image: python:3.10
7 | steps:
8 | - checkout
9 | - run:
10 | name: Install dependencies
11 | command: pip install -r requirements.txt
12 | - run:
13 | name: Deploy
14 | command: mkdocs gh-deploy -m "Deployment of commit {sha} [ci skip]"
15 | - run:
16 | name: Deploy2
17 | command: mkdocs gh-deploy -m "Deployment of commit {sha} [ci skip]"
18 |
19 |
20 | workflows:
21 | version: 2
22 | build:
23 | jobs:
24 | - deploydocs:
25 | filters:
26 | branches:
27 | only:
28 | - master
29 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | site/*
2 | openml/*
3 | docs/example/*
4 | docs/python/*
5 | temp_dir/*
6 | .cache/*
7 | .DS_store
8 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # OpenML documentation
2 |
3 | ## Documentation structure
4 | The OpenML documentation is written in Markdown. The site is generated by [MkDocs](http://www.mkdocs.org/), using the [Material theme](https://squidfunk.github.io/mkdocs-material/). Check these links to see what is possible in terms of styling.
5 |
6 | The overall structure (navigation) of the docs is configured in the `mkdocs.yml` file.
7 |
8 | Some of the APIs use other documentation generators, such as [Sphinx](https://restcoder.readthedocs.io/en/latest/sphinx-docgen.html) in openml-python. This documentation is pulled in via iframes to gather all docs in the same place, but it needs to be edited in its own GitHub repos.
9 |
10 | ## Editing documentation
11 | Documentation can be edited by simply editing the markdown files in the `docs` folder and creating a pull request.
12 |
13 | End users can edit the docs by simply clicking the edit button (the pencil icon) on the top of every documentation page. It will open up an editing page on [GitHub](https://github.com/) (you do need to be logged in on GitHub). When you are done, add a small message explaining the change and click 'commit changes'. On the next page, just launch the pull request. We will then review it and approve the changes, or discuss them if necessary.
14 |
15 | ## Deployment
16 | The documentation is hosted on GitHub pages.
17 |
18 | To deploy the documentation, you need to have MkDocs and MkDocs-Material installed, and then run `mkdocs gh-deploy` in the top directory (with the `mkdocs.yml` file). This will build the HTML files and push them to the gh-pages branch of openml/docs. `https://docs.openml.org` is just a reverse proxy for `https://openml.github.io/docs/`.
19 |
20 | MkDocs and MkDocs-Material can be installed as follows:
21 | ```
22 | pip install mkdocs
23 | pip install mkdocs-material
24 | pip install -U fontawesome_markdown
25 | ```
26 |
27 |
--------------------------------------------------------------------------------
/docs/benchmark/automl/AutoML-Benchmark.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Getting Started
3 | description: A short tutorial on installing the software and running a simple benchmark.
4 | ---
5 |
6 | # Getting Started
7 |
8 | The [AutoML Benchmark](https://openml.github.io/automlbenchmark/index.html) is a tool for benchmarking AutoML frameworks on tabular data.
9 | It automates the installation of AutoML frameworks, passing it data, and evaluating
10 | their predictions.
11 | [Our paper](https://arxiv.org/pdf/2207.12560.pdf) describes the design and showcases
12 | results from an evaluation using the benchmark.
13 | This guide goes over the minimum steps needed to evaluate an
14 | AutoML framework on a toy dataset.
15 |
16 | Full instructions can be found in the [API Documentation](https://openml.github.io/automlbenchmark/docs/).
17 |
18 | ## Installation
19 | These instructions assume that [Python 3.9 (or higher)](https://www.python.org/downloads/)
20 | and [git](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git) are installed,
21 | and are available under the aliases `python` and `git`, respectively. We recommend
22 | [Pyenv](https://github.com/pyenv/pyenv) for managing multiple Python installations,
23 | if applicable. We support Ubuntu 22.04, but many Linux and MacOS versions likely work
24 | (for MacOS, it may be necessary to have [`brew`](https://brew.sh) installed).
25 |
26 | First, clone the repository:
27 |
28 | ```bash
29 | git clone https://github.com/openml/automlbenchmark.git --branch stable --depth 1
30 | cd automlbenchmark
31 | ```
32 |
33 | Create a virtual environment to install the dependencies in:
34 |
35 | ### Linux
36 |
37 | ```bash
38 | python -m venv venv
39 | source venv/bin/activate
40 | ```
41 |
42 | ### MacOS
43 |
44 | ```bash
45 | python -m venv venv
46 | source venv/bin/activate
47 | ```
48 |
49 | ### Windows
50 |
51 | ```bash
52 | python -m venv ./venv
53 | venv/Scripts/activate
54 | ```
55 |
56 | Then install the dependencies:
57 |
58 | ```bash
59 | python -m pip install --upgrade pip
60 | python -m pip install -r requirements.txt
61 | ```
62 |
63 |
64 | ??? windows "Note for Windows users"
65 |
66 | The automated installation of AutoML frameworks is done using shell scripts,
67 | which doesn't work on Windows. We recommend you use
68 | [Docker](https://docs.docker.com/desktop/install/windows-install/) to run the
69 | examples below. First, install and run `docker`.
70 | Then, whenever there is a `python runbenchmark.py ...`
71 | command in the tutorial, add `-m docker` to it (`python runbenchmark.py ... -m docker`).
72 |
73 | ??? question "Problem with the installation?"
74 |
75 | On some platforms, we need to ensure that requirements are installed sequentially.
76 | Use `xargs -L 1 python -m pip install < requirements.txt` to do so. If problems
77 | persist, [open an issue](https://github.com/openml/automlbenchmark/issues/new) with
78 | the error and information about your environment (OS, Python version, pip version).
79 |
80 |
81 | ## Running the Benchmark
82 |
83 | To run a benchmark, call the `runbenchmark.py` script, specifying the framework to evaluate.
84 |
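For example, the following evaluates the `randomforest` baseline on the default `test` benchmark with the default `test` constraint (a quick sanity check; the defaults and parameters are described in the next pages):

```bash
python runbenchmark.py randomforest
```
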
85 | See the [API Documentation](https://openml.github.io/automlbenchmark/docs/) for more information on the available parameters.
86 |
87 |
--------------------------------------------------------------------------------
/docs/benchmark/automl/benchmark_on_openml.md:
--------------------------------------------------------------------------------
1 | # Example: Benchmarks on OpenML
2 |
3 | In the previous examples, we used benchmarks which were defined in a local file
4 | ([test.yaml](https://github.com/openml/automlbenchmark/resources/benchmarks/test.yaml) and
5 | [validation.yaml](https://github.com/openml/automlbenchmark/resources/benchmarks/validation.yaml), respectively).
6 | However, we can also use tasks and
7 | benchmarking suites defined on OpenML directly from the command line. When referencing
8 | an OpenML task or suite, we can use `openml/t/ID` or `openml/s/ID` respectively as
9 | argument for the benchmark parameter. Running on the [iris task](https://openml.org/t/59):
10 |
11 | ```
12 | python runbenchmark.py randomforest openml/t/59
13 | ```
14 |
15 | or on the entire [AutoML benchmark classification suite](https://openml.org/s/271) (this will take hours!):
16 |
17 | ```
18 | python runbenchmark.py randomforest openml/s/271
19 | ```
20 |
21 | !!! info "Large-scale Benchmarking"
22 |
23 | For large scale benchmarking it is advised to parallelize your experiments,
24 | as otherwise it may take months to run the experiments.
25 | The benchmark currently only supports native parallelization in `aws` mode
26 | (by using the `--parallel` parameter), but using the `--task` and `--fold` parameters
27 | it is easy to generate scripts that invoke individual jobs on e.g., a SLURM cluster.
28 | When you run in any parallelized fashion, it is advised to run each process on
29 | separate hardware to ensure experiments cannot interfere with each other. A minimal sketch of submitting such per-task/per-fold jobs is shown below.
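
The sketch below assumes a SLURM cluster; the task names, folds, and resource flags are placeholders to adapt to your own benchmark and setup:

```bash
# Illustrative only: submit one SLURM job per (task, fold) pair of the validation benchmark
for task in eucalyptus vehicle; do      # placeholder task names
  for fold in 0 1; do
    sbatch --time=02:00:00 --cpus-per-task=8 \
      --wrap "python runbenchmark.py randomforest validation test -t ${task} -f ${fold}"
  done
done
```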
--------------------------------------------------------------------------------
/docs/benchmark/automl/important_params.md:
--------------------------------------------------------------------------------
1 | # Important Parameters
2 |
3 | As you can see from the results above, the default behavior is to execute a short test
4 | benchmark. However, we can specify a different benchmark, provide different constraints,
5 | and even run the experiment in a container or on AWS. There are many parameters
6 | for the `runbenchmark.py` script, but the most important ones are:
7 |
8 | `Framework (required)`
9 |
10 | - The AutoML framework or baseline to evaluate; the name is not case-sensitive. See
11 | [integrated frameworks](https://openml.github.io/automlbenchmark/frameworks.html) for a list of supported frameworks.
12 | In the above example, the benchmarked framework was `randomforest`.
13 |
14 | `Benchmark (optional, default='test')`
15 |
16 | - The benchmark suite is the dataset or set of datasets to evaluate the framework on.
17 | These can be defined on [OpenML](https://www.openml.org) as a [study or task](https://openml.github.io/automlbenchmark/docs/extending/benchmark.md#defining-a-benchmark-on-openml)
18 | (formatted as `openml/s/X` or `openml/t/Y` respectively) or in a [local file](https://openml.github.io/automlbenchmark/docs/extending/benchmark.md#defining-a-benchmark-with-a-file).
19 | The default is a short evaluation on two folds of `iris`, `kc2`, and `cholesterol`.
20 |
21 | `Constraints (optional, default='test')`
22 |
23 | - The constraints applied to the benchmark as defined by default in [constraints.yaml](https://openml.github.io/automlbenchmark/resources/constraints.yaml).
24 | These include time constraints, memory constraints, the number of available CPU cores, and more.
25 | The default constraint is `test` (2 folds of 10 minutes each).
26 |
27 | !!! warning "Constraints are not enforced!"
28 | These constraints are forwarded to the AutoML framework if possible but, except for
29 | runtime constraints, are generally not enforced. It is advised when benchmarking
30 | to use an environment that mimics the given constraints.
31 |
32 | ??? info "Constraints can be overridden by `benchmark`"
33 | A benchmark definition can override constraints on a task level.
34 | This is useful if you want to define a benchmark which has different constraints
35 | for different tasks. The default "test" benchmark does this to limit runtime to
36 | 60 seconds instead of 600 seconds, which is useful to get quick results for its
37 | small datasets. For more information, see [defining a benchmark](../benchmark.md).
38 |
39 | `Mode (optional, default='local')`
40 |
41 | - The benchmark can be run in four modes:
42 |
43 | * `local`: install a local virtual environment and run the benchmark on your machine.
44 | * `docker`: create a docker image with the virtual environment and run the benchmark in a container on your machine.
45 | If a local or remote image already exists, that will be used instead. Requires [Docker](https://docs.docker.com/desktop/).
46 | * `singularity`: create a singularity image with the virtual environment and run the benchmark in a container on your machine. Requires [Singularity](https://docs.sylabs.io/guides/3.5/user-guide/introduction.html).
47 | * `aws`: run the benchmark on [AWS EC2](https://aws.amazon.com/ec2/) instances.
48 | It is possible to run directly on the instance or have the EC2 instance run in `docker` mode.
49 | Requires valid AWS credentials to be configured; for more information, see [Running on AWS](#ADD-link-to-aws-guide).
50 |
51 |
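Putting these together, an illustrative call that evaluates the `randomforest` baseline on the OpenML iris task with the `test` constraint, inside a Docker container, would be:

```
python runbenchmark.py randomforest openml/t/59 test -m docker
```
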
52 | For a full list of parameters available, run:
53 |
54 | ```
55 | python runbenchmark.py --help
56 | ```
--------------------------------------------------------------------------------
/docs/benchmark/automl/specific_task_fold_example.md:
--------------------------------------------------------------------------------
1 | # Example: AutoML on a specific task and fold
2 |
3 | The defaults are very useful for performing a quick test, as the datasets are small
4 | and cover different task types (binary classification, multiclass classification, and
5 | regression). We also have a ["validation" benchmark](https://openml.github.io/automlbenchmark/resources/benchmarks/validation.yaml)
6 | suite for more elaborate testing that also includes missing data, categorical data,
7 | wide data, and more. The benchmark defines 9 tasks, and evaluating two folds with a
8 | 10-minute time constraint would take roughly 3 hours (=9 tasks * 2 folds * 10 minutes,
9 | plus overhead). Let's instead use the `--task` and `--fold` parameters to run only a
10 | specific task and fold in the `benchmark` when evaluating the
11 | [flaml](https://microsoft.github.io/FLAML/) AutoML framework:
12 |
13 | ```
14 | python runbenchmark.py flaml validation test -t eucalyptus -f 0
15 | ```
16 |
17 | This should take about 10 minutes plus the time it takes to install `flaml`.
18 | Results should look roughly like this:
19 |
20 | ```
21 | Processing results for flaml.validation.test.local.20230711T122823
22 | Summing up scores for current run:
23 | id task fold framework constraint result metric duration seed
24 | openml.org/t/2079 eucalyptus 0 flaml test -0.702976 neg_logloss 611.0 1385946458
25 | ```
26 |
27 | Similarly to the test run, you will find additional files in the `results` directory.
28 |
--------------------------------------------------------------------------------
/docs/concepts/authentication.md:
--------------------------------------------------------------------------------
1 | # Authentication
2 | OpenML is as open as possible. You can download and inspect all datasets, tasks, flows and runs through the website or the API without creating an account.
3 |
4 | However, if you want to upload datasets or experiments, you need to create an account, sign in, and find your API key on your profile page. This key can then be used with any of the [OpenML APIs](https://www.openml.org/apis).
5 |
6 | ## API keys
7 | If you don’t have an account yet, sign up now.
8 | You will receive an API key, which will authenticate you to the server
9 | and allow you to download and upload datasets, tasks, runs and flows.
10 |
11 | * Create an OpenML account (free) on https://www.openml.org.
12 | * After logging in, open your profile page. Click on the avatar on the top right, and choose 'Your Profile'.
13 | * Click on 'API key' to find your API key. You can also reset it if needed.
14 |
15 | To store your API key locally (to permanently authenticate), create a plain text file **~/.openml/config** with the line
16 | **'apikey=MYKEY'**, replacing **MYKEY** with your API key. The config
17 | file must be located at ~/.openml/config and exist prior to
18 | importing the openml module.
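
In the Python client, the key can also be set programmatically for the current session only (a minimal sketch; replace the placeholder with your own key):

``` python
import openml

# Set the API key for this session; storing 'apikey=MYKEY' in ~/.openml/config
# makes the authentication permanent instead.
openml.config.apikey = "MYKEY"  # placeholder, use your own key
```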
--------------------------------------------------------------------------------
/docs/concepts/benchmarking.md:
--------------------------------------------------------------------------------
1 | # Collections and benchmarks
2 | You can combine tasks and runs into collections, to run experiments across many tasks at once and collect all results. Each collection gets its own page, which can be linked to publications so that others can find all the details online.
3 |
4 | ## Benchmarking suites
5 | Collections of tasks can be published as _benchmarking suites_. Seamlessly integrated into the OpenML platform, benchmark suites standardize the setup, execution, analysis, and reporting of benchmarks. Moreover, they make benchmarking a whole lot easier:
6 | - all datasets are uniformly formatted in standardized data formats
7 | - they can be easily downloaded programmatically through APIs and client libraries
8 | - they come with machine-readable meta-information, such as the occurrence of missing values, to train algorithms correctly
9 | - standardized train-test splits are provided to ensure that results can be objectively compared
- results can be shared in a reproducible way through the APIs
10 | - results from other users can be easily downloaded and reused
11 |
12 | You can search for all existing benchmarking suites or create your own. For all further details, see the [benchmarking guide](../benchmark/benchmark.md).
13 |
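For example, with the Python client an existing suite can be fetched in one line (a minimal sketch; the OpenML-CC18 suite is used as an example):

``` python
import openml

# Fetch a benchmarking suite (a curated collection of tasks) and list its task ids
suite = openml.study.get_suite("OpenML-CC18")
print(suite.tasks)
```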
14 |
15 |
16 | ## Benchmark studies
17 | Collections of runs can be published as _benchmarking studies_. They contain the results of all runs (possibly millions) executed on a specific benchmarking suite. OpenML allows you to easily download all such results at once via the APIs, but also visualize them online in the Analysis tab (next to the complete list of included tasks and runs). Below is an example of a benchmark study for AutoML algorithms.
18 |
19 |
--------------------------------------------------------------------------------
/docs/concepts/data.md:
--------------------------------------------------------------------------------
1 | # Data
2 | ## Discovery
3 | OpenML allows fine-grained search over thousands of machine learning datasets. Via the website, you can filter by many dataset properties, such as size, type, format, and many more. Via the [APIs](https://www.openml.org/apis) you have access to many more filters, and you can download a complete table with statistics of all datasets. Via the APIs you can also load datasets directly into your preferred data structures such as numpy ([example in Python](https://openml.github.io/openml-python/main/examples/20_basic/simple_datasets_tutorial.html#sphx-glr-examples-20-basic-simple-datasets-tutorial-py)). We are also working on better organization of all datasets by topic.
4 |
5 |
6 | 
7 |
8 | ## Sharing
9 | You can upload and download datasets through the website or through the [APIs](https://www.openml.org/apis) (recommended). You can share data directly from common data science libraries, e.g. from Python or R dataframes, in a few lines of code. The OpenML APIs will automatically extract lots of meta-data and store all datasets in a uniform format.
10 |
11 | ``` python
12 | import pandas as pd
13 | import openml as oml
14 |
15 | # Create an OpenML dataset from a pandas dataframe
16 | df = pd.DataFrame(data, columns=attribute_names)
17 | my_data = oml.datasets.functions.create_dataset(
18 | name="covertype", description="Predicting forest cover ...",
19 | licence="CC0", data=df
20 | )
21 |
22 | # Share the dataset on OpenML
23 | my_data.publish()
24 | ```
25 |
26 | Every dataset gets a dedicated page on OpenML with all known information, and can be edited further online.
27 |
28 |
29 | 
30 |
31 | Data hosted elsewhere can be referenced by URL. We are also working on interconnecting OpenML with other machine learning dataset repositories.
32 |
33 | ## Automated analysis
34 | OpenML will automatically analyze the data and compute a range of data quality characteristics. These include simple statistics such as the number of examples and features, but also potential quality issues (e.g. missing values) and more advanced statistics (e.g. the mutual information in the features and benchmark performances of simple models). These can be useful to find, filter and compare datasets, or to automate data preprocessing. We are also working on simple metrics and automated dataset quality reports.
35 |
36 | The Analysis tab (see image below, or try it live) also shows an automated and interactive analysis of all datasets. This runs on open-source Python code via Dash, and we welcome all contributions.
37 |
38 |
39 | 
40 |
41 | The third tab, 'Tasks', lists all tasks created on the dataset. More on that below.
42 |
43 | ## Dataset ID and versions
44 | A dataset can be uniquely identified by its dataset ID, which is shown on the website and returned by the API. It's `1596` in the `covertype` example above. Datasets can also be referenced by name and version. OpenML assigns incremental version numbers per upload with the same name. You can also add a free-form `version_label` with every upload.
45 |
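A minimal sketch of fetching a dataset by ID with the Python client (using the `covertype` ID from the example above):

``` python
import openml

# Download the covertype dataset by its dataset ID
dataset = openml.datasets.get_dataset(1596)

# Load the data, using the author-defined default target as labels
X, y, categorical, attribute_names = dataset.get_data(
    target=dataset.default_target_attribute
)
```
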
46 | ## Dataset status
47 | When you upload a dataset, it will be marked `in_preparation` until it is (automatically) verified. Once approved, the dataset will become `active` (or `verified`). If a severe issue has been found with a dataset, it can become `deactivated` (or `deprecated`) signaling that it should not be used. By default, dataset search only returns verified datasets, but you can access and download datasets with any status.
48 |
49 | ## Special attributes
50 | Machine learning datasets often have special attributes that require special handling in order to build useful models. OpenML marks these as special attributes.
51 |
52 | A `target` attribute is the column that is to be predicted, also known as the dependent variable. Datasets can have a default target attribute set by the author, but OpenML tasks can also overrule this. Example: the default target for the MNIST dataset is the class column, to be predicted from the pixel values, and most supervised tasks will have the class as their target. However, one can also create a task aimed at predicting the value of pixel257 given all the other pixel values and the class column.
53 |
54 | `Row id` attributes indicate externally defined row IDs (e.g. `instance` in dataset 164). `Ignore` attributes are other columns that should not be included in training data (e.g. `Player` in dataset 185). OpenML will clearly mark these, and will (by default) drop these columns when constructing training sets.
--------------------------------------------------------------------------------
/docs/concepts/flows.md:
--------------------------------------------------------------------------------
1 | # Flows
2 |
3 | Flows are machine learning pipelines, models, or scripts. They are typically uploaded directly from machine learning libraries (e.g. scikit-learn, pyTorch, TensorFlow, MLR, WEKA,...) via the corresponding [APIs](https://www.openml.org/apis). Associated code (e.g., on GitHub) can be referenced by URL.
4 |
5 | ## Analysing algorithm performance
6 |
7 | Every flow gets a dedicated page with all known information. The Analysis tab shows an automated interactive analysis of all collected results. For instance, below are the results of a scikit-learn pipeline including missing value imputation, feature encoding, and a RandomForest model. It shows the results across multiple tasks, and how the AUC score is affected by certain hyperparameters.
8 |
9 |
10 | 
11 |
12 | This helps to better understand specific models, as well as their strengths and weaknesses.
13 |
14 | ## Automated sharing
15 |
16 | When you evaluate algorithms and share the results, OpenML will automatically extract all the details of the algorithm (dependencies, structure, and all hyperparameters), and upload them in the background.
17 |
18 | ``` python
19 | from sklearn import ensemble
20 | from openml import tasks, runs
21 |
22 | # Build any model you like.
23 | clf = ensemble.RandomForestClassifier()
24 |
25 | # Evaluate the model on a task
26 | run = runs.run_model_on_task(clf, task)
27 |
28 | # Share the results, including the flow and all its details.
29 | run.publish()
30 | ```
31 |
32 | ## Reproducing algorithms and experiments
33 |
34 | Given an OpenML run, the exact same algorithm or model, with exactly the same hyperparameters, can be reconstructed within the same machine learning library to easily reproduce earlier results.
35 |
36 | ``` python
37 | from openml import runs
38 |
39 | # Rebuild the (scikit-learn) pipeline from run 9864498
40 | model = runs.initialize_model_from_run(9864498)
41 | ```
42 |
43 | !!! note
44 | You may need the exact same library version to reconstruct flows. The API will always state the required version. We aim to add support for VMs so that flows can be easily (re)run in any environment.
--------------------------------------------------------------------------------
/docs/concepts/index.md:
--------------------------------------------------------------------------------
1 | # Concepts
2 |
3 | ## OpenML concepts
4 | OpenML operates on a number of core concepts which are important to understand:
5 |
6 | **:fontawesome-solid-database: Datasets**
7 | Datasets are pretty straightforward. Tabular datasets are self-contained, consisting of a number of rows (_instances_) and columns (features), including their data types. Other
8 | modalities (e.g. images) are included via paths to files stored within the same folder.
9 | Datasets are uniformly formatted ([S3](https://min.io/product/s3-compatibility) buckets with [Parquet](https://parquet.apache.org/) tables, [JSON](https://developer.mozilla.org/en-US/docs/Learn/JavaScript/Objects/JSON) metadata, and media files), and are auto-converted and auto-loaded in your desired format by the [APIs](https://www.openml.org/apis) (e.g. in [Python](https://openml.github.io/openml-python/main/)) in a single line of code.
10 | _Example: The Iris dataset or the Plankton dataset_
11 |
12 |
13 | **:fontawesome-solid-trophy: Tasks**
14 | A task consists of a dataset, together with a machine learning task to perform, such as classification or clustering, and an evaluation method. For
15 | supervised tasks, this also specifies the target column in the data.
16 | _Example: Classifying different iris species from other attributes and evaluating with 10-fold cross-validation._
17 |
18 | **:material-cogs: Flows**
19 | A flow identifies a particular machine learning algorithm (a pipeline or untrained model) from a particular library or framework, such as scikit-learn, pyTorch, or MLR. It contains details about the structure of the model/pipeline, dependencies (e.g. the library and its version) and a list of settable hyperparameters. In short, it is a serialized description of the algorithm that in many cases can also be deserialized to reinstantiate the exact same algorithm in a particular library.
20 | _Example: scikit-learn's RandomForest or a simple TensorFlow model_
21 |
22 | **:fontawesome-solid-star: Runs**
23 | A run is an experiment - it evaluates a particular flow (pipeline/model) with particular hyperparameter settings, on a particular task. Depending on the task it will include certain results, such as model evaluations (e.g. accuracies), model predictions, and other output files (e.g. the trained model).
24 | _Example: Classifying Gamma rays with scikit-learn's RandomForest_
--------------------------------------------------------------------------------
/docs/concepts/runs.md:
--------------------------------------------------------------------------------
1 | # Runs
2 |
3 | ## Automated reproducible evaluations
4 | Runs are experiments (benchmarks) evaluating a specific flow on a specific task. As shown above, they are typically submitted automatically by machine learning
5 | libraries through the OpenML [APIs](https://www.openml.org/apis), including lots of automatically extracted meta-data, to create reproducible experiments. With a few for-loops you can easily run (and share) millions of experiments.
6 |
7 | ## Online organization
8 | OpenML organizes all runs online, linked to the underlying data, flows, parameter settings, people, and other details. See the many examples above, where every dot in the scatterplots is a single OpenML run.
9 |
10 | ## Independent (server-side) evaluation
11 | OpenML runs include all information needed to independently evaluate models. For most tasks, this includes all predictions, for all train-test splits, for all instances in the dataset, including all class confidences. When a run is uploaded, OpenML automatically evaluates every run using a wide array of evaluation metrics. This makes them directly comparable with all other runs shared on OpenML. For completeness, OpenML will also upload locally computed evaluation metrics and runtimes.
12 |
13 | New metrics can also be added to OpenML's evaluation engine, and computed for all runs afterwards. Or, you can download OpenML runs and analyse the results any way you like.
14 |
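A minimal sketch of downloading shared evaluations with the Python client (the metric and task ID below are illustrative):

``` python
import openml

# Download server-side evaluations (here: predictive accuracy) of all runs
# shared for a given task, as a dataframe
evals = openml.evaluations.list_evaluations(
    function="predictive_accuracy", tasks=[3954], output_format="dataframe"
)
print(evals.head())
```
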
15 | !!! note
16 | Please note that while OpenML tries to maximise reproducibility, exactly reproducing all results may not always be possible because of changes in numeric libraries, operating systems, and hardware.
--------------------------------------------------------------------------------
/docs/concepts/sharing.md:
--------------------------------------------------------------------------------
1 | # Sharing (under construction)
2 | Currently, anything on OpenML can be shared publicly or kept private to a single user. We are working on sharing features that allow you to share your materials with other users without making them entirely public. Watch this space.
3 |
--------------------------------------------------------------------------------
/docs/concepts/tagging.md:
--------------------------------------------------------------------------------
1 | # Tagging
2 | Datasets, tasks, runs and flows can be assigned tags, either via the web
3 | interface or the API. These tags can be used to search and annotate datasets, or simply to better organize your own datasets and experiments.
4 |
5 | For example, the tag OpenML-CC18 refers to all tasks included in the OpenML-CC18 benchmarking suite.
7 |
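A minimal sketch of working with tags in the Python client (the tag and dataset ID are illustrative; tagging requires authentication):

``` python
import openml

# List all tasks carrying a given tag, e.g. the OpenML-CC18 suite tag
cc18_tasks = openml.tasks.list_tasks(tag="OpenML-CC18", output_format="dataframe")

# Tag a dataset to organize your own experiments
dataset = openml.datasets.get_dataset(1596)
dataset.push_tag("my-project")  # 'my-project' is an arbitrary example tag
```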
--------------------------------------------------------------------------------
/docs/concepts/tasks.md:
--------------------------------------------------------------------------------
1 | # Tasks
2 | Tasks describe what to do with the data. OpenML covers several task types, such as classification and clustering. Tasks are containers including the data and other information such as train/test splits, and define what needs to be returned. They are machine-readable so that you can automate machine learning experiments, and easily compare algorithm evaluations (using the exact same train-test splits) against all other benchmarks shared by others on OpenML.
4 |
5 | ## Collaborative benchmarks
6 |
7 | Tasks are real-time, collaborative benchmarks (e.g. see
8 | MNIST below). In the Analysis tab, you can view timelines and leaderboards, and learn from all prior submissions to design even better algorithms.
9 |
10 |
11 | 
12 |
13 | ## Discover the best algorithms
14 | All algorithms evaluated on the same task (with the same train-test splits) can be directly compared to each other, so you can easily look up which algorithms perform best overall, and download their exact configurations. Likewise, you can look up the best algorithms for _similar_ tasks to know what to try first.
15 |
16 |
17 | 
18 |
19 | ## Automating benchmarks
20 | You can search and download existing tasks, evaluate your algorithms, and automatically share the results (which are stored in a _run_). Here's what this looks like in the Python API. You can do the same across hundreds of tasks at once.
21 |
22 | ``` python
23 | from sklearn import ensemble
24 | from openml import tasks, runs
25 |
26 | # Build any model you like
27 | clf = ensemble.RandomForestClassifier()
28 |
29 | # Download any OpenML task (includes the datasets)
30 | task = tasks.get_task(3954)
31 |
32 | # Automatically evaluate your model on the task
33 | run = runs.run_model_on_task(clf, task)
34 |
35 | # Share the results on OpenML.
36 | run.publish()
37 | ```
38 |
39 | You can create new tasks via the website or [via the APIs](https://www.openml.org/apis) as well.
--------------------------------------------------------------------------------
/docs/contributing/OpenML-Docs.md:
--------------------------------------------------------------------------------
1 | ## General Documentation
2 | High-quality and up-to-date documentation is crucial. If you notice any mistake in these documentation pages, click the :material-pencil: button (at the top right). It will open up an editing page on [GitHub](https://github.com/) (you do need to be logged in). When you are done, add a small message explaining the change and click 'commit changes'. On the next page, just launch the pull request. We will then review it and approve the changes, or discuss them if necessary.
3 |
4 | The sources are generated by [MkDocs](http://www.mkdocs.org/), using the [Material theme](https://squidfunk.github.io/mkdocs-material/).
5 | Check these docs to see what is possible in terms of styling.
6 |
7 | OpenML is a big project with multiple repositories. To keep the documentation close to the code, it will always be kept in the relevant repositories (see below), and
8 | combined into these documentation pages using [MkDocs multirepo](https://github.com/jdoiro3/mkdocs-multirepo-plugin/issues/3).
9 |
10 | !!! note "Developer note"
11 | To work on the documentation locally, do the following:
12 | ```
13 | git clone https://github.com/openml/docs.git
14 | pip install -r requirements.txt
15 | ```
16 | To build the documentation, run `mkdocs serve` in the top directory (with the `mkdocs.yml` file). Any changes made after that will be hot-loaded.
17 |
18 | The documentation will be auto-deployed with every push or merge with the master branch of `https://www.github.com/openml/docs/`. In the background, a CI job
19 | will run `mkdocs gh-deploy`, which will build the HTML files and push them to the gh-pages branch of openml/docs. `https://docs.openml.org` is just a reverse proxy for `https://openml.github.io/docs/`.
20 |
21 |
22 | ## Python API
23 | To edit the tutorial, you have to edit the `reStructuredText` files on [openml-python/doc](https://github.com/openml/openml-python/tree/master/doc). When done, you can do a pull request.
24 |
25 | To edit the documentation of the python functions, edit the docstrings in the [Python code](https://github.com/openml/openml-python/openml). When done, you can do a pull request.
26 |
27 | !!! note "Developer note"
28 | A CircleCI job will automatically render the documentation on every GitHub commit, using [Sphinx](http://www.sphinx-doc.org/en/stable/).
29 | For inclusion in these documentation pages, it will also be rendered in markdown and imported.
30 |
31 | ## R API
32 | To edit the tutorial, you have to edit the `Rmarkdown` files on [openml-r/vignettes](https://github.com/openml/openml-r/tree/master/vignettes).
33 |
34 | To edit the documentation of the R functions, edit the Roxygen documentation next to the functions in the [R code](https://github.com/openml/openml-r/R).
35 |
36 | !!! note "Developer note"
37 | A Travis job will automatically render the documentation on every GitHub commit, using [knitr](https://yihui.name/knitr/). The Roxygen documentation is updated every time a new version is released on CRAN.
38 |
39 | ## Java API
40 | The Java Tutorial is written in markdown and can be edited the usual way (see above).
41 |
42 | To edit the documentation of the Java functions, edit the documentation next to the functions in the [Java code](https://github.com/openml/java/apiconnector).
43 |
44 | - Javadocs: https://www.openml.org/docs/
45 |
46 | !!! note "Developer note"
47 | A Travis job will automatically render the documentation on every GitHub commit, using [Javadoc](http://www.oracle.com/technetwork/java/javase/tech/index-137868.html).
48 |
49 | ## REST API
50 | The REST API is documented using Swagger.io, in YAML. This generates a nice web interface that also allows trying out the API calls using your own API key (when you are logged in).
51 |
52 | You can edit the sources on [SwaggerHub](https://app.swaggerhub.com/apis/openml/openml/1.0.0). When you are done, export to json and replace the [downloads/swagger.json](https://github.com/openml/OpenML/blob/master/downloads/swagger.json) file in the OpenML main GitHub repository. You need to do a pull request that is then reviewed by us. When we merge the new file the changes are immediately available.
53 |
54 | The [data API](https://app.swaggerhub.com/apis/openml/openml_file/1.0.0) can be edited in the same way.
55 |
--------------------------------------------------------------------------------
/docs/contributing/Style.md:
--------------------------------------------------------------------------------
1 | # Style guide
2 |
3 | These are some (non-mandatory) style guidelines to make the OpenML experience more pleasant and consistent for everyone.
4 |
5 | ## Logos
6 |
7 |
8 |
9 | (SVG)
10 |
11 | ## Colors
12 |
13 | We use the [Material Design](https://m2.material.io/design/color/the-color-system.html#color-usage-and-palettes) color system,
14 | and especially the colors green[400], yellow[800], blue[800], red[400], green[400], yellow[800], pink[400], and purple[400].
15 |
16 | Primary colors are #1E88E5 (general), #000482 (dark), and #b5b7ff (light).
17 |
18 |
19 |
--------------------------------------------------------------------------------
/docs/contributing/backend/Java-App.md:
--------------------------------------------------------------------------------
1 | !!! tip "Phasing out"
2 | This documentation is about the older Java-based version of the OpenML evaluation engine, which will be phased out. These parts are being rewritten as a set of independent services in Python.
3 |
4 | When you submit datasets or experiments (runs) to OpenML, they will be processed by a set of server-side processes, combined in the 'Evaluation Engine':
5 |
6 | - It extracts the features in tabular datasets and their statistical types
7 | - It computes a set of dataset characteristics (meta-features), e.g. the number of features and classes, that help with search and filtering, or to compute dataset similarity measures
8 | - It evaluates experiments using a set of server-side evaluation metrics that are computed uniformly for all experiments so that they are comparable
9 | - It creates consistent train-test splits based on task characteristics.
10 |
11 | The application that implements the evaluation engine was originally implemented in Java because it builds on the Weka API. It is invoked from the OpenML API by means of a CLI interface. Typically, a call looks like this:
12 |
13 | `java -jar webapplication.jar -config "api_key=S3CR3T_AP1_K3Y" -f evaluate_run -r 500`
14 |
15 | This executes the webapplication jar, invokes the function "evaluate_run", and gives it run id 500 as a parameter. The config parameter can be used to set some config items; in this case, the api_key is mandatory. Every OpenML user has an api_key, which can be obtained from their [OpenML profile page](http://www.openml.org/u). The response of this function is a call to the OpenML API uploading evaluation results to the OpenML database. Note that in this case the PHP website invokes the Java webapplication, which makes a call to the PHP website again, albeit to another endpoint.
16 |
17 | The webapplication does not have direct write access to the database. All communication with the database goes by means of the [OpenML Connector](http://search.maven.org/#search|ga|1|g%3A%22org.openml%22), which communicates with the OpenML API. As a consequence, the webapplication could run on any system, i.e., there is no formal need for the webapplication to be on the same server as the website code. This is important, since it creates modularity, and not all servers provide a command line interface to PHP scripts.
18 |
19 | Another example is the following:
20 |
21 | `java -jar webapplication.jar -config "api_key=S3CR3T_AP1_K3Y" -f all_wrong -r 81,161 -t 59`
22 |
23 | This takes a comma-separated list of run ids (no spaces) and a task id as input, and outputs the test examples of the dataset on which all algorithms used in the runs produced wrong predictions (in this case, weka.BayesNet_K2 and weka.SMO, respectively). An error will be displayed if any of the runs are not consistent with the given task id.
24 |
25 | ## Extending the Java App
26 |
27 | The bootstrap class of the webapplication is
28 |
29 | `org.openml.webapplication.Main`
30 |
31 | It automatically checks authentication settings (such as api_key) and then determines which function to invoke.
32 |
33 | It uses a switch-like if-else construction to dispatch to the various functions. Additional functions can be added to this freely; from there on, it is easy to add functionality to the webapplication.
34 |
35 | Parameters are handled using the Apache Commons CommandLineParser class, which makes sure that the passed parameters are available to the program.
36 |
37 | In order to make new functionality available to the website, an interface to the function also needs to be implemented somewhere in the website. The next section gives details.
38 |
39 | ## Interfacing from the OpenML API
40 | By design, the REST API is not allowed to communicate with the Java App. All interfaces with the Java webapplication should go through other controllers of the PHP CodeIgniter framework, for example api_splits. Currently, the website features two main APIs, each represented by a Controller. Controllers can be found in the folder openml_OS/controllers. Here we see:
41 | * api_new.php, representing the REST API
42 | * api_splits.php, representing an API interfacing to the Java webapplication.
43 |
44 | ## Helper functions
45 | The Java code is available in the 'OpenML' repository: https://github.com/openml/OpenML/tree/master/Java
46 |
47 | ### Components
48 | Support for tasks:
49 |
50 | - *foldgeneration*: Java code for generating cross-validation folds. Can be used from command line.
51 | - *splitgeneration*: Split generator for cross validation and holdout. (Unclear how this differs from the previous.)
52 | - *generate_predictions*: Helper class to build prediction files based on WEKA output. Move to WEKA repository?
53 | - *evaluate_predictions*: The evaluation engine computing evaluation scores based on submitted predictions
54 |
55 |
56 |
57 |
--------------------------------------------------------------------------------
/docs/contributing/backend/Local-Installation.md:
--------------------------------------------------------------------------------
1 | !!! tip "Test server"
2 | OpenML has a fully functional test server accessible at `test.openml.org` that you can use to develop against.
3 | For many cases, this is sufficient for development, and a full local installation is not required.
4 |
5 | !!! warning "Backend evolution"
6 | OpenML has grown organically, and predates the current ecosystem of Python tools for platform building.
7 | We are currently rewriting the entire backend using state-of-the-art Python tools (e.g. [FastAPI](https://github.com/openml/server-api)) so that the entire platform
8 | can be easily installed locally in one go. We plan this to be available early/mid 2025. Please get in touch
9 | if you want to know more or want to contribute.
10 |
11 | ## Using Docker Compose
12 | The easiest way to set up a local version of OpenML is to use Docker Compose following the instructions here (thanks to Jos van der Velde!):
13 | https://github.com/openml/services.
14 |
15 | If you run into problems, please post an issue in the same github repo.
16 |
17 | ## Installation from scratch
18 | If you want to install a local version of OpenML from scratch, please follow the steps below.
19 | Note that this does not include the Kubernetes and S3 Object storage components that we use in production.
20 |
21 | ### Requirements
22 | You'll need to have the following software running:
23 | * Apache web server (with the rewrite module enabled; it is installed by default but not enabled)
25 | * MySQL Server.
26 | * PHP 5.5 or higher (also comes with Apache)
27 | Or just a MAMP (Mac), LAMP (Linux), or WAMP (Windows) package, which conveniently contains all these applications.
28 |
29 | ### Databases
30 | Next, OpenML runs on two databases: a public database with all experiment information, and a private database with information such as user accounts. The latest version of both databases can be downloaded here: https://docs.openml.org/resources
31 |
32 | Obviously, the private database does not include any actual user account info.
33 |
34 | ### Backend
35 | The source code is available in the 'OpenML' repository: https://github.com/openml/OpenML
36 |
37 | OpenML is written in PHP, and can be 'installed' by copying all files into the 'www' or 'public_html' directory of Apache.
38 |
39 | After that, you need to provide your local paths and database accounts and passwords using the config file in:
40 | 'APACHE_WWW_DIR'/openml_OS/config/BASE_CONFIG.php.
41 |
42 | If everything is configured correctly, OpenML should now be running.
43 |
44 | ### Search Indices
45 | If you want to run your own (separate) OpenML instance, and store your own data, you'll also want to build your own search indices to show all data on the website. The OpenML website is based on the ElasticSearch stack. To install it, follow the instructions here: http://knowm.org/how-to-set-up-the-elk-stack-elasticsearch-logstash-and-kibana/
46 |
47 | ### Initialization
48 | This script wipes all OpenML server data and rebuilds the database and search index. Replace 'openmldir' with the directory where you want OpenML to store files.
49 |
50 | ```
51 | # delete data from server
52 | sudo rm -rf /openmldir/*
53 | mkdir /openmldir/log
54 |
55 | # delete database
56 | mysqladmin -u "root" -p"yourpassword" DROP openml_expdb
57 | mysql -h localhost -u root -p"yourpassword" -e "TRUNCATE openml.file;"
58 |
59 | # reset ES search index
60 | echo "Deleting and recreating the ES index: "
61 | curl -XDELETE http://localhost:9200/openml
62 | curl -XPUT 'localhost:9200/openml?pretty' -H 'Content-Type: application/json' -d'
63 | {
64 | "settings" : {
65 | "index" : {
66 | "number_of_shards" : 3,
67 | "number_of_replicas" : 2
68 | }
69 | }
70 | }
71 | '
72 |
73 | # go to directory with the website source code
74 | cd /var/www/openml.org/public_html/
75 |
76 | # reinitiate the database
77 | mysql -u root -p"yourpassword" < downloads/openml_expdb.sql
78 |
79 | # fill important columns
80 | sudo php index.php cron install_database
81 |
82 | # rebuild search index
83 | sudo php index.php cron initialize_es_indices
84 | sudo php index.php cron build_es_indices
85 |
86 | sudo chown apache:apache /openmldir/log
87 | sudo chown apache:apache /openmldir/log/*
88 | ```
89 |
--------------------------------------------------------------------------------
/docs/contributing/clients/Client-API-Standards.md:
--------------------------------------------------------------------------------
1 | ## Building clients ##
2 | You can access OpenML datasets, pipelines, benchmarks, and much more, through a range of client APIs.
3 | Well-developed clients exist in Python, R, Java, and several other languages. Please see their documentation (in the other tabs)
4 | for more guidance on how to contribute to them.
5 |
6 | If you want to develop your own client (e.g. for a new language), please check out the following resources:
7 |
8 | * [REST API](./Rest.md): all endpoints to GET, POST, or DELETE resources
9 | * [Metadata Standard](./metadata_definition.md): how we describe datasets and all other OpenML resources
10 | * Minimal standards (below) for client configuration and caching mechanisms, to make client behavior more uniform across languages.
11 |
12 | !!! info "Integrating tools"
13 | If you want to integrate OpenML into machine learning and data science tools, it's often easier to build on one of the existing clients,
14 | which often can be used as is or extended. For instance, see how to [extend the Python API](../../ecosystem/Python_extensions.md) to integrate OpenML into Python tools.
15 |
16 |
17 | ## Minimal standards
18 |
19 | ### Configuration file
20 |
21 | The configuration file resides in a directory `.openml` in the home directory of the user and is called `config`. It consists of `key = value` pairs which are separated by newlines. The following keys are defined:
22 |
23 | * apikey:
24 | * required to access the server
25 | * server:
26 | * default: `http://www.openml.org`
27 | * verbosity:
28 | * 0: normal output
29 | * 1: info output
30 | * 2: debug output
31 | * cachedir:
32 | * if not given, will default to `file.path(tempdir(), "cache")`.
33 |
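A minimal example config file (all values are illustrative):

```
apikey = 0123456789abcdef0123456789abcdef
server = http://www.openml.org
verbosity = 1
cachedir = /home/example/.openml/cache
```
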
34 | ### Caching
35 |
36 | #### Cache invalidation
37 |
38 | All parts of the entities which affect experiments are immutable. The entities dataset and task have a flag `status` which tells the user whether they can be used safely.
39 |
40 | #### File structure
41 |
42 | Caching should be implemented for
43 |
44 | * datasets
45 | * tasks
46 | * splits
47 | * predictions
48 |
49 | and further entities might follow in the future. The cache directory `$cache` should be specified by the user when invoking the API. The structure in the cache directory should be as follows:
50 |
51 | * One directory for the following entities:
52 | * `$cache/datasets`
53 | * `$cache/tasks`
54 | * `$cache/runs`
55 | * For every dataset there is an extra directory for which the name is the dataset ID, e.g. `$cache/datasets/2` for the dataset with OpenML ID 2.
56 | * The dataset should be called `dataset.pq` or `dataset.arff`
57 | * Every other file should be named by the API call which was used to obtain it. The XML returned by invoking `openml.data.qualities` should therefore be called qualities.xml.
58 | * For every task there is an extra directory for which the name is the task ID, e.g. `$cache/tasks/1`
59 | * The task file should be called `task.xml`.
60 | * The splits accompanying a task are stored in a file `datasplits.arff`.
61 | * For every run there is an extra directory for which the name is the run ID, e.g. `$cache/runs/1`
62 | * The predictions should be called `predictions.arff`.
--------------------------------------------------------------------------------
/docs/contributing/clients/Rest.md:
--------------------------------------------------------------------------------
1 | # REST API
2 |
3 | OpenML offers a RESTful Web API, with predictable URLs, for uploading and downloading machine learning resources. Try the REST API Documentation to see examples of all calls, and test them right in your browser.
4 |
5 | ## Getting started
6 |
7 | * REST services can be called using simple HTTP GET or POST actions.
8 | * The REST Endpoint URL is https://www.openml.org/api/v1/
9 | * The default endpoint returns data in XML. If you prefer JSON, use the endpoint https://www.openml.org/api/v1/json/. Note that, to upload content, you still need to use XML (at least for now).
10 |
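For example, a dataset description can be fetched with a plain HTTP GET (a quick sketch using `curl`; dataset id 61 is just an example):

```
curl https://www.openml.org/api/v1/data/61        # XML (default)
curl https://www.openml.org/api/v1/json/data/61   # JSON
```
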
11 | ## Testing
12 | For continuous integration and testing purposes, we have a test server offering the same API, but which does not affect the production server.
13 |
14 | * The test server REST Endpoint URL is https://test.openml.org/api/v1/
15 |
16 | ## Error messages
17 | Error messages will look like this:
18 |
19 | ```xml
20 | <oml:error xmlns:oml="http://openml.org/openml">
21 |   <oml:code>100</oml:code>
22 |   <oml:message>Please invoke legal function</oml:message>
23 |   <oml:additional_information>Additional information, not always available.</oml:additional_information>
24 | </oml:error>
25 | ```
26 |
27 | All error messages are listed in the API documentation. E.g. try to get a non-existing dataset:
28 |
29 | * in XML: https://www.openml.org/api_new/v1/data/99999
30 | * in JSON: https://www.openml.org/api_new/v1/json/data/99999
31 |
32 | ## Examples
33 | You need to be logged in for these examples to work.
34 |
35 | ### Download a dataset
36 | 
37 |
38 | * User asks for a dataset using the /data/{id} service. The dataset id is typically part of a task, or can be found on OpenML.org.
39 | * OpenML returns a description of the dataset as an XML file (or JSON). Try it now
40 | * The dataset description contains the URL where the dataset can be downloaded. The user calls that URL to download the dataset.
41 | * The dataset is returned by the server hosting the dataset. This can be OpenML, but also any other data repository. Try it now
42 |
43 | ### Download a flow
44 | 
45 |
46 | * User asks for a flow using the /flow/{id} service and a flow id. The flow id can be found on OpenML.org.
47 | * OpenML returns a description of the flow as an XML file (or JSON). Try it now
48 | * The flow description contains the URL where the flow can be downloaded (e.g. GitHub), either as source, binary or both, as well as additional information on history, dependencies and licence. The user calls the right URL to download it.
49 | * The flow is returned by the server hosting it. This can be OpenML, but also any other code repository. Try it now
50 |
51 | ### Download a task
52 | 
53 |
54 | * User asks for a task using the /task/{id} service and a task id. The task id is typically returned when searching for tasks.
55 | * OpenML returns a description of the task as an XML file (or JSON). Try it now
56 | * The task description contains the dataset id(s) of the datasets involved in this task. The user asks for the dataset using the /data/{id} service and the dataset id.
57 | * OpenML returns a description of the dataset as an XML file (or JSON). Try it now
58 | * The dataset description contains the URL where the dataset can be downloaded. The user calls that URL to download the dataset.
59 | * The dataset is returned by the server hosting it. This can be OpenML, but also any other data repository. Try it now
60 | * The task description may also contain links to other resources, such as the train-test splits to be used in cross-validation. The user calls that URL to download the train-test splits.
61 | * The train-test splits are returned by OpenML. Try it now
--------------------------------------------------------------------------------
/docs/contributing/clients/metadata_definition.md:
--------------------------------------------------------------------------------
1 | OpenML is at its core a meta-database, from which datasets, pipelines (flows), experiments (runs) and other entities can be downloaded and uploaded,
2 | all described using a clearly defined meta-data standard. In this document, we describe the standard for uploading entities to OpenML and what the resulting database state will be.
3 |
4 | !!! tip ":croissant: Croissant"
5 | OpenML has partnered with MLCommons, Google, Kaggle, HuggingFace, and a consortium of other partners to define a new metadata standard for machine
6 | learning datasets: :croissant: [Croissant](https://mlcommons.org/working-groups/data/croissant/)!
7 | You can already download all OpenML datasets in the Croissant format, and we're working on further supporting and extending Croissant.
8 |
9 | Below is the OpenML metadata standard for version 1 of the API.
10 |
11 | ## Data
12 |
13 | Data is uploaded through the function [post data](https://www.openml.org/api_docs#!/data/post_data). The following files are needed:
14 |
15 | - `description`: An XML adhering to the [XSD schema](https://www.openml.org/api_new/v1/xsd/openml.data.upload).
16 | - `dataset`: An [ARFF file](https://www.cs.waikato.ac.nz/ml/weka/arff.html) containing the data (optional; if not set, there should be a URL in the description pointing to this file).
17 | Uploading any other files will result in an error.
18 |
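As a rough sketch (not an official client), the upload is a plain multipart HTTP POST with these two files. The file paths and the API key below are purely illustrative, and the test server is used so as not to pollute the production database.

``` python
# Rough sketch of uploading a dataset via the REST API using the third-party
# `requests` package. File paths and the API key are illustrative.
import requests

with open("description.xml", "rb") as description, open("dataset.arff", "rb") as dataset:
    response = requests.post(
        "https://test.openml.org/api/v1/data",   # test server, to avoid polluting production
        params={"api_key": "YOUR_API_KEY"},
        files={"description": description, "dataset": dataset},
    )
print(response.status_code, response.text[:200])
```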
19 | ## Tasks
20 |
21 | Tasks are uploaded through the function [post task](https://www.openml.org/api_docs#!/task/post_task). The following files are needed:
22 |
23 | - `description`: An XML adhering to the [XSD schema](https://www.openml.org/api_new/v1/xsd/openml.task.upload).
24 | Uploading any other files will result in an error.
25 |
26 | The task file should contain several input fields. These are a name and value combination of fields that are marked to be relevant by the task type definition. There are several task type definitions, e.g.:
27 |
28 | - [Supervised Classification](https://www.openml.org/api/v1/tasktype/1)
29 | - [Supervised Regression](https://www.openml.org/api/v1/tasktype/2)
30 | - [Learning Curve](https://www.openml.org/api/v1/tasktype/3)
31 | - [Data Stream Classification](https://www.openml.org/api/v1/tasktype/4)
32 |
33 | Note that the task types themselves are flexible content (ideally, users can contribute task types), and therefore these documents are not part of the OpenML definition. The task types define which input fields should be set when creating a task.
34 |
35 | Duplicate tasks (i.e., same value for `task_type_id` and all `input` fields equal) will be rejected.
36 |
37 | When creating a task, the API checks whether each of the input fields is legitimate. (Todo: describe the checks and what they depend on.)
38 |
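With the Python client, creating a task roughly looks as follows. This is a sketch; the dataset ID, target name and estimation procedure ID are illustrative, and the duplicate check described above still applies.

``` python
# Sketch: creating a supervised classification task with the Python client.
# The dataset ID, target name and estimation procedure ID are illustrative.
import openml
from openml.tasks import TaskType

new_task = openml.tasks.create_task(
    task_type=TaskType.SUPERVISED_CLASSIFICATION,
    dataset_id=61,
    target_name="class",
    estimation_procedure_id=1,  # e.g. 10-fold cross-validation on the main server
)
new_task.publish()
```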
39 | ## Flow
40 |
41 | Flows are uploaded through the function [post flow](https://www.openml.org/api_docs#!/flow/post_flow). The following file is needed:
42 |
43 | - `description`: An XML adhering to the [XSD schema](https://www.openml.org/api_new/v1/xsd/openml.implementation.upload).
44 | Uploading any other files will result in an error.
45 |
46 | Duplicate flows (i.e., same values for `name` and `external_version`) will be rejected.
47 |
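For scikit-learn models, the Python client can generate and upload this flow description automatically; a minimal sketch:

``` python
# Sketch: converting a scikit-learn model into an OpenML flow and publishing it.
import openml
from openml.extensions.sklearn import SklearnExtension
from sklearn import tree

openml.config.apikey = "YOUR_API_KEY"  # required for publishing

clf = tree.DecisionTreeClassifier()
flow = SklearnExtension().model_to_flow(clf)
flow.publish()
print(flow.flow_id)
```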
48 | ## Runs
49 |
50 | Runs are uploaded through the function [post run](https://www.openml.org/api_docs#!/run/post_run). The following files are needed:
51 |
52 | - `description`: An XML adhering to the [XSD schema](https://www.openml.org/api_new/v1/xsd/openml.run.upload).
53 | - `predictions`: An [ARFF file](https://www.cs.waikato.ac.nz/ml/weka/arff.html) containing the predictions (optional, depending on the task).
54 | - `trace`: An [ARFF file](https://www.cs.waikato.ac.nz/ml/weka/arff.html) containing the run trace (optional, depending on the flow).
55 | Uploading any other files will result in an error.
56 |
57 | ### Predictions
58 |
59 | The contents of the prediction file depend on the task type.
60 |
61 | #### Task type: Supervised classification
62 |
63 | [Example predictions file](https://www.openml.org/api/v1/arff_example/predictions)
64 |
65 | - repeat NUMERIC
66 | - fold NUMERIC
67 | - row_id NUMERIC
68 | - confidence.{\$classname}: optional. Various columns describing the confidence per class. The values of these columns should sum to 1 (within a precision of 1e-6).
69 | - (proposal) decision_function.{\$classname}: optional. Various columns describing the decision function per class.
70 | - prediction {\$classname}
71 | Runs that have a different set of columns will be rejected.
72 |
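To illustrate the expected columns, here is a hypothetical pair of rows for a two-class task (the class and attribute names are illustrative; the exact format is shown in the example file linked above):

``` python
# Hypothetical rows of a predictions file for a two-class task, illustrating
# the columns listed above. Confidences per row must sum to 1 (precision 1e-6).
prediction_rows = [
    {"repeat": 0, "fold": 0, "row_id": 0,
     "confidence.positive": 0.9, "confidence.negative": 0.1, "prediction": "positive"},
    {"repeat": 0, "fold": 0, "row_id": 1,
     "confidence.positive": 0.2, "confidence.negative": 0.8, "prediction": "negative"},
]
for row in prediction_rows:
    assert abs(row["confidence.positive"] + row["confidence.negative"] - 1.0) < 1e-6
```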
73 | ### Trace
74 |
75 | [Example trace file](https://www.openml.org/api/v1/arff_example/trace)
76 |
77 | - repeat: cross-validation repeat
78 | - fold: cross-validation fold
79 | - iteration: the index order within this repeat/fold combination
80 | - evaluation (float): the evaluation score that was attached based on the validation set
81 | - selected {True, False}: Whether this was the selected hyperparameter configuration in this repeat/fold combination (exactly one per combination should be tagged with True)
82 | - Per optimized parameter, a column that has the name of the parameter with the prefix "parameter_"
83 | - setup_string: accepted for legacy reasons, but ignored by the default evaluation engine
84 |
85 | Traces that have a different set of columns will be rejected.
--------------------------------------------------------------------------------
/docs/contributing/index.md:
--------------------------------------------------------------------------------
1 | ---
2 | icon: fontawesome/solid/laptop-code
3 | ---
4 |
5 | OpenML is an open source project, hosted on GitHub. We welcome everybody to help improve OpenML, and make it more useful for everyone.
6 |
7 | !!! tip "Mission"
8 | We want to make machine learning **open** and **accessible** for the benefit of all of humanity.
9 | OpenML offers an **entirely open online platform** for machine learning datasets, models, and experiments,
10 | making them **easy to use and share** to facilitate global collaboration and extensive automation.
11 |
12 | ## Want to get involved?
13 |
14 | Awesome, we're happy to have you! :tada:
15 |
16 | ### Who are we?
17 |
18 | We are a group of friendly people who are excited about open science and machine learning.
19 |
20 | [Read more about who we are, what we stand for, and how to get in touch](https://www.openml.org/about).
21 |
22 | ### We need help!
23 |
24 | We are currently looking for help with:
25 |
26 | :octicons-comment-discussion-16: User feedback (best via [GitHub issues](https://github.com/openml), but email or Slack is also fine)
27 |
28 | - Frontend / UX / Design of the website
29 | - Backend / API
30 | - Outreach / making OpenML better known (especially in non-ML communities, where people have data but no analysis expertise)
31 | - Helping with the interfaces (Python,R,Julia,Java) and tool integrations
32 | - Helping with documenting the interfaces or the API
33 | - What could we do better to get new users started? Help us to figure out what is difficult to understand about OpenML. If you _are_ a new user, you are the perfect person for this!
34 |
35 | ### Beginner issues
36 |
37 | Check out the issues labeled [Good first issue](https://github.com/issues?q=is%3Aopen+is%3Aissue+user%3Aopenml++label%3A%22Good+first+issue%22+) or [help wanted](https://github.com/issues?q=is%3Aopen+is%3Aissue+user%3Aopenml++label%3A%22help+wanted%22+) (you need to be logged into GitHub to see these).
38 |
39 | ### Change the world
40 |
41 | If you have your own ideas on how you want to contribute, please get in touch! We are very friendly and open to new ideas :wink:
42 |
43 | ## Communication channels
44 |
45 | We have several communication channels set up for different purposes:
46 |
47 | ### GitHub
48 |
49 | https://github.com/openml
50 |
51 | - Issues (members and users can complain)
52 | - Request new features
53 |
54 | Anyone with a GitHub account can write issues. We are happy if people get involved by writing issues, so don't be shy :smiley:
55 |
56 | Please post issues in the relevant issue tracker.
57 |
58 | - :simple-github: OpenML Core - Web services and API
59 | - :simple-github: Website - The (new) OpenML website
60 | - :simple-github: Docs - The documentation pages
61 | - :simple-github: Python API - The Python API
62 | - :simple-github: R API - The OpenML R package
63 | - :simple-github: Java API - The Java API and Java-based plugins
64 | - :simple-github: Datasets - For issues about datasets
65 | - :simple-github: Blog - The OpenML Blog
66 |
67 | ### Slack
68 |
69 | https://openml.slack.com
70 |
71 | - Informal communication
72 |
73 | We use Slack for day-to-day discussions and news. If you want to join the OpenML Slack chat, please message us (openmlHQ@googlegroups.com).
74 |
75 | ### Twitter (@open_ml)
76 |
77 | https://twitter.com/open_ml
78 |
79 | - News
80 | - Publicly relevant information
81 |
82 | ### Blog
83 |
84 | https://blog.openml.org
85 |
86 | - Tutorials
87 | - News
88 | - Open discussions
89 |
90 | ## Contributors bot
91 |
92 | We use the all-contributors bot to add contributors to the repository README. You can check how to use it here. You can contribute in many ways, including code, blogs, content, design and talks. You can find the emoji key here.
93 |
--------------------------------------------------------------------------------
/docs/contributing/resources.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # Resources
4 |
5 | ## Database snapshots
6 |
7 | Everything uploaded to OpenML is available to the community. The nightly snapshot of the public database contains all experiment runs, evaluations and links to datasets, implementations and result files, in SQL format (gzipped). You can also download the database schema.
8 |
9 | Nightly database SNAPSHOT
10 |
11 | If you want to work on the website locally, you'll also need the schema for the 'private' database with non-public information.
12 |
13 | Private database schema
14 |
15 | ## Legacy Resources
16 |
17 | OpenML is always evolving, but we keep hosting the resources that were used in prior publications so that others may still build on them.
18 |
19 | :material-database: The experiment database used in Vanschoren et al. (2012) Experiment databases. Machine Learning 87(2), pp 127-158. You'll need to import this database (we used MySQL) to run queries. The database structure is described in the paper. Note that most of the experiments in this database have been rerun using OpenML, using newer algorithm implementations and stored in much more detail.
20 |
21 | :fontawesome-solid-share-nodes: The Exposé ontology used in the same paper, and described in more detail here and here. Exposé is used in designing our databases, and we aim to use it to export all OpenML data as Linked Open Data.
22 |
23 | ## Other dataset repositories
24 |
25 | We keep a list of [other dataset repositories all over the world](./backend/Datasets.md)
--------------------------------------------------------------------------------
/docs/contributing/website/Dash.md:
--------------------------------------------------------------------------------
1 | # Dash visualization
2 |
3 | Dash is a Python framework for building data visualization dashboards in pure Python. Dash is written on top of Plotly, React and Flask, and the graphs are defined using Plotly's Python library. The Dash application is composed of two major parts:
4 |
5 | - `Layout` - Describes how the dashboard looks
6 | - `Callbacks` - Update graphs and tables in the layout and make the dashboard interactive.
7 |
8 | ## Files
9 |
10 | The dash application is organized as follows:
11 |
12 | - `dashapp.py`
13 |
14 | - Creates the dash application
15 | - The dash app is embedded in the flask app passed to `create_dash_app` function
16 | - This file need not be modified to create a new plot
17 |
18 | - `layouts.py`
19 |
20 | - contains the layout for all the pages
21 | - `get_layout_from_data`- returns layout of data visualization
22 | - `get_layout_from_task`- returns layout of task visualization
23 | - `get_layout_from_flow`- returns layout of flow visualization
24 | - `get_layout_from_run` - returns layout of run visualization
25 | - This file needs to be modified to add a new plot (data, task, flow, run)
26 |
27 | - `callbacks.py`
28 | - Registers all the callbacks for the dash application
29 | - This file needs to be modified to add a new plot, especially if the plot needs to be interactive
30 |
31 | ## How the dashboard works
32 |
33 | In this dash application, we need to create the layout of the page dynamically based on the entered URL.
34 | For example, [http://127.0.0.1:5000/dashboard/data/5] needs to return the layout for dataset id #5 whereas
35 | [http://127.0.0.1:5000/dashboard/run/5] needs to return the layout for run id #5.
36 |
37 | Hence, the Dash app is initially created with a dummy `app.layout` by dashapp.py and
38 | the callbacks are registered for the app using the `register_callbacks` function.
39 |
40 | - **render_layout** is the callback which dynamically renders the layout. Once the Dash app is running, the first callback which is fired is `render_layout`.
41 | This is the main callback invoked when a URL with a data, task, run or flow ID is entered.
42 | Based on the information in the URL, this method returns the layout.
43 |
44 | - Based on the URL, get_layout_from_data, get_layout_from_task, get_layout_from_flow, get_layout_from_run are called.
45 | These functions define the layout of the page - tables, html Divs, tabs, graphs etc.
46 |
47 | - The callbacks corresponding to each component in the layout are invoked to update the components dynamically and
48 | make the graphs interactive. For example, **update_scatter_plot** in `data_callbacks.py` updates the scatter plot
49 | component in the data visualization dashboard.
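As a rough illustration of this pattern, here is a minimal, self-contained sketch (not the actual OpenML dashboard code; the real app builds its layouts with the `get_layout_from_*` functions and registers callbacks via `register_callbacks`):

``` python
# Minimal sketch of the URL-driven layout + callback pattern described above.
from dash import Dash, dcc, html
from dash.dependencies import Input, Output

app = Dash(__name__)
# Dummy layout: the real content is filled in by the render_layout callback.
app.layout = html.Div([dcc.Location(id="url"), html.Div(id="page-content")])


@app.callback(Output("page-content", "children"), [Input("url", "pathname")])
def render_layout(pathname):
    # Dispatch on the entered URL, e.g. /dashboard/data/5 or /dashboard/run/5.
    if pathname and pathname.startswith("/dashboard/data/"):
        return html.Div(f"Layout for dataset {pathname.rsplit('/', 1)[-1]}")
    if pathname and pathname.startswith("/dashboard/run/"):
        return html.Div(f"Layout for run {pathname.rsplit('/', 1)[-1]}")
    return html.Div("Unknown page")
```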
50 |
--------------------------------------------------------------------------------
/docs/contributing/website/Flask.md:
--------------------------------------------------------------------------------
1 | We use [Flask](http://flask.pocoo.org/) as our web framework. It handles user
2 | authentication, dataset upload, task creation, and other aspects that require
3 | server-side interaction. It is designed to be _independent_ from the OpenML API.
4 | This means that you can use it to create your own personal frontend for OpenML,
5 | using the main OpenML server to provide the data. Of course, you can also link
6 | it to your own [local OpenML setup](../backend/Local-Installation.md).
7 |
8 | ### Design
9 | Our Flask app follows the [application factories design pattern](https://flask.palletsprojects.com/en/1.1.x/patterns/appfactories/).
10 | A new app instance can be created by:
11 | ``` python
12 | from autoapp import create_app
13 | app = create_app(config_object)
14 | ```
15 |
16 | The backend is designed in a modular fashion with Flask [Blueprints](https://flask.palletsprojects.com/en/1.0.x/blueprints/). Currently,
17 | the Flask app consists of two blueprints, public and user:
18 |
19 | - Public blueprint: contains routes that do not require user authentication or authorization, like signup and forgot password.
20 | - User blueprint: contains routes that require user authentication, like login, profile changes, and fetching the API key.
21 |
22 | New blueprints can be registered in `server/app.py` with the `register_blueprints` function:
23 | ``` python
24 | def register_blueprints(app):
25 | app.register_blueprint(new_blueprint)
26 | ```
27 |
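A new blueprint itself could look roughly like this (an illustrative sketch; the blueprint name and route are hypothetical):

``` python
# Illustrative sketch of a new blueprint; the name and route are hypothetical.
from flask import Blueprint, jsonify

new_blueprint = Blueprint("new", __name__, url_prefix="/new")


@new_blueprint.route("/ping")
def ping():
    return jsonify(status="ok")
```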
28 |
29 | ### Database setup
30 | If you want to set up a local user database similar to OpenML, follow these steps:
31 |
32 | 1. Install MySQL.
33 | 2. Create a new database 'openml'.
34 | 3. Set the current database to 'openml' via the `USE` statement.
35 | 4. Download the users.sql file from the openml.org GitHub repo and load it into the openml database via `mysql -u root -p openml < users.sql`.
36 | 5. Edit the database path in `server/extensions.py` and `server/config.py`.
37 |
38 | Note: Remember to add passwords and the socket extension address (if any) in both `server/extensions.py` and `server/config.py`.
39 |
40 |
41 | ### Security
42 | The Flask backend uses [JSON web tokens](https://jwt.io/) for all user handling tasks. The [Flask JWT extended](https://flask-jwt-extended.readthedocs.io/en/stable/) library is used to bind JWT to the Flask app.
43 | The current mechanism is:
44 |
45 | 1. The user logs in.
46 | 2. A JWT token is assigned to the user and sent along with every request.
47 | 3. User information, such as profile edits and the API key, can only be accessed with a valid JWT token.
48 | 4. The JWT token is stored in the local memory of the browser.
49 | 5. The token expires after 2 hours or is blacklisted after logout.
50 |
51 | JWT is registered as an extension in `server/extensions.py`.
52 | All user password hashes are saved in Argon2 format with the new backend.
53 |
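As a minimal sketch (not the actual OpenML backend code) of how flask_jwt_extended enforces this mechanism on a route:

``` python
# Minimal sketch of protecting a route with flask-jwt-extended, following the
# mechanism above. Configuration values are illustrative; the real values live
# in server/config.py and server/extensions.py.
from flask import Flask, jsonify
from flask_jwt_extended import JWTManager, jwt_required

app = Flask(__name__)
app.config["JWT_SECRET_KEY"] = "change-me"          # illustrative
app.config["JWT_ACCESS_TOKEN_EXPIRES"] = 2 * 3600   # tokens expire after 2 hours
jwt = JWTManager(app)


@app.route("/api-key")
@jwt_required()
def api_key():
    # Only reachable with a valid, non-expired JWT token.
    return jsonify(api_key="...")
```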
54 | ### Registering Extensions
55 | To register a new extension with the Flask backend, the extension has to be added in `server/extensions.py` and initialized in `server/app.py`.
56 | Current extensions are: flask_argon2, flask_bcrypt, flask_jwt_extended and flask_sqlalchemy.
57 |
58 | ### Configuring App
59 | Configuration variables like secret keys, Database URI and extension configurations are specified in
60 | `server/config.py` with Config object, which is supplied to the flask app during initialization.
61 |
62 | ### Creating a new route
63 | To create a new route in the backend, add the route in `server/public/views.py` or in `server/user/views.py` (if it requires user authorisation or JWT usage in any way).
64 |
65 | ### Bindings to OpenML server
66 | You can specify which OpenML server to connect to.
67 | This is stored in the `.env` file in the main directory. It is set to the main OpenML server by default:
68 |
69 | ``` bash
70 | ELASTICSEARCH_SERVER=https://www.openml.org/es
71 | OPENML_SERVER=https://www.openml.org
72 | ```
73 |
74 | The ElasticSearch server is used to download information about datasets, tasks, flows and runs, as well as to power the frontend search. The OpenML server is used for uploading datasets, tasks, and anything else that requires calls to the OpenML API.
75 |
76 | ### Bindings to frontend
77 | The frontend is generated by [React](https://reactjs.org/). See below for more information. The React app is loaded as a static website; this is done in the Flask setup in `server.py`.
78 |
79 | ``` python
80 | app = Flask(__name__, static_url_path='', static_folder='src/client/app/build')
81 | ```
82 |
83 | It will find the React app there and load it.
84 |
85 | ### Email Server
86 | OpenML uses its own mail server, but you can use basically any mail server compatible with Python's SMTP library. We suggest using mailtrap.io for local testing. You can configure the email server settings in the `.env` file. Currently we only send emails for account confirmation and forgotten passwords.
87 |
--------------------------------------------------------------------------------
/docs/contributing/website/Website.md:
--------------------------------------------------------------------------------
1 | ## Installation
2 | The OpenML website runs on [Flask](http://flask.pocoo.org/), [React](https://reactjs.org/), and [Dash](https://dash.plot.ly/). You need to install these first.
3 |
4 | * Download or clone the source code for the OpenML website from [GitHub](https://github.com/openml/openml.org).
5 | Then, go into that folder (it should have the `requirements.txt` and `package.json` files).
6 | ``` bash
7 | git clone https://github.com/openml/openml.org.git
8 | cd openml.org
9 | ```
10 |
11 | * Install Flask, Dash, and dependencies using [PIP](https://pip.pypa.io/en/stable/installing/)
12 | ``` bash
13 | pip install -r requirements.txt
14 | ```
15 |
16 | * Install React and dependencies using [NPM (8 or higher)](https://nodejs.org/en/download/)
17 | ``` bash
18 | cd server/src/client/app/
19 | npm install
20 | ```
21 |
22 | ## Building and running
23 |
24 | Go back to the home directory. Build a production version of the website with:
25 |
26 | ``` bash
27 | npm run build --prefix server/src/client/app/
28 | ```
29 |
30 | Start the server by running:
31 |
32 | ``` bash
33 | flask run
34 | ```
35 |
36 | You should now see the app running in your browser at `localhost:5000`
37 |
38 | Note: If you run the app using HTTPS, add an SSL context: either use 'adhoc' for on-the-fly certificates, or specify your own certificates.
39 |
40 | ``` bash
41 | flask run --cert='adhoc'
42 | ```
43 |
44 | As the Flask development server is not suitable for production, we recommend using another server if you want to deploy
45 | your OpenML installation in production. We currently use gunicorn as the production server. You can install gunicorn and run it:
46 | ```
47 | gunicorn --certfile cert.pem --keyfile key.pem -b localhost:5000 autoapp:app
48 | ```
49 |
50 | ## Development
51 |
52 | To start the React frontend in developer mode, go to `server/src/client/app` and run:
53 |
54 | ``` bash
55 | npm run start
56 | ```
57 |
58 | The app should automatically open at `localhost:3000` and any changes made to
59 | the code will automatically reload the website (hot reloading).
60 |
61 | For the new Next.js frontend, install and run like this:
62 | ``` bash
63 | cd app
64 | npm install
65 | npm run dev
66 | ```
67 |
68 | ## Structure
69 |
70 |
71 |
72 | The website is built on the following components:
73 |
74 | * A [Flask backend](Flask.md). Written in Python, the backend takes care of all communication with the OpenML server. It builds on top of the OpenML Python API. It also takes care of user authentication and keeps the search engine (ElasticSearch) up to date with the latest information from the server. Files are located in the `server` folder.
75 | * A [React frontend](React.md). Written in JavaScript, this takes care of rendering the website. It pulls in information from the search engine, and shows plots rendered by Dash. It also contains forms (e.g. for logging in or uploading new datasets), which will be sent off to the backend for processing. Files are located in `server/src/client/app`.
76 | * [Dash dashboards](Dash.md). Written in Python, Dash is used for writing interactive plots. It pulls in data from the Python API, and renders the plots as React components. Files are located in `server/src/dashboard`.
77 |
--------------------------------------------------------------------------------
/docs/css/extra.css:
--------------------------------------------------------------------------------
1 | .md-footer-meta {
2 | display: none !important;
3 | }
4 |
5 | .md-header-nav__source {
6 | display: none !important;
7 | }
8 |
9 | .md-header__source {
10 | width: 0px;
11 | }
12 |
13 | .md-source__icon {
14 | display: none !important;
15 | }
16 |
17 | .md-grid {
18 | max-width: 122rem !important;
19 | }
20 |
21 | .md-header__button.md-logo img, .md-header__button.md-logo svg {
22 | height: 2rem;
23 | }
24 |
25 | .md-source__repository {
26 | display: none;
27 | }
28 |
29 | .framed-python {
30 | margin-top: -70px;
31 | overflow: hidden;
32 | }
33 |
34 | .framed-r {
35 | margin-top: 0px;
36 | overflow: hidden;
37 | }
38 |
39 | .framed-r-api {
40 | margin-top: -50px;
41 | overflow: hidden;
42 | }
43 |
44 | .framed-github {
45 | height: 100vh !important;
46 | width: 100% !important;
47 | }
48 |
49 | img[alt="icon"] {
50 | width: 50px;
51 | }
52 |
53 | @media only screen and (min-width: 76.25em) {
54 | .framed-python {
55 | margin-left: -45px;
56 | }
57 | .framed-r-api {
58 | margin-left: -45px;
59 | }
60 | }
61 | table {
62 | display: block;
63 | max-width: -moz-fit-content;
64 | max-width: fit-content;
65 | margin: 0 auto;
66 | overflow-x: auto;
67 | white-space: nowrap;
68 | }
69 |
70 | :root {
71 | --md-primary-fg-color: #1E88E5;
72 | --md-primary-fg-color--light: #000482;
73 | --md-primary-fg-color--dark: #b5b7ff;
74 | }
75 |
76 | .card-container {
77 | display: flex;
78 | flex-wrap: wrap;
79 | gap: 20px;
80 | justify-content: center;
81 | }
82 |
83 | .card {
84 | border: 1px solid #ccc;
85 | border-radius: 5px;
86 | padding: 20px;
87 | width: 300px;
88 | box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
89 | }
90 |
91 | .card h2 {
92 | margin-top: 0;
93 | }
94 |
95 | .card p {
96 | margin-bottom: 0;}
97 |
98 | .github-logo {
99 | height: 15px;
100 | width: 13px;
101 | margin-left: 10px;
102 | }
103 |
104 | iframe[seamless] {
105 | border: none;
106 | }
107 |
108 | .green{
109 | color: #4caf50
110 | }
111 | .red{
112 | color: #f44336
113 | }
114 | .yellow{
115 | color: #ffc107
116 | }
117 | .blue{
118 | color: #2196f3
119 | }
120 | .purple{
121 | color: #4caf50
122 | }
123 | .pink{
124 | color: #4caf50
125 | }
--------------------------------------------------------------------------------
/docs/data/specs.md:
--------------------------------------------------------------------------------
1 | # Technical specifications
2 |
3 | ## Data formatting
4 | OpenML converts datasets to a uniform format based on Parquet. Read [this blog post](https://blog.openml.org/openml/data/2020/03/23/Finding-a-standard-dataset-format-for-machine-learning.html) for a detailed explanation of this approach. You will usually not notice this, since OpenML libraries take care of transferring data from Parquet to your favorite data structures. See the [using datasets](use.md) page for details.
5 |
6 | Datasets that depend on included files (e.g. a dataset of images) are defined by creating a dataframe with all the dataset information and columns with paths to the local files, as well as a folder with all the local files (e.g. images, video, audio) matching the paths in the main dataframe.
7 |
8 | In the backend, datasets are stored in an S3 object store, with one bucket per dataset. We currently allow datasets to be up to 200GB in size.
9 |
10 | ## Dataset ID and versions
11 | A dataset can be uniquely identified by its dataset ID, which is shown on the website and returned by the API. It's `1596` in the `covertype` example above. Datasets can also be referenced by name. OpenML assigns incremental version numbers to uploads with the same name. You can also add a free-form `version_label` with every upload.
12 |
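For instance, with the Python client a dataset can be fetched by ID or by name (a small sketch; `covertype`/`1596` follow the example above):

``` python
# Sketch: referencing a dataset by ID or by name with the Python client.
import openml

by_id = openml.datasets.get_dataset(1596)           # covertype
by_name = openml.datasets.get_dataset("covertype")  # resolves the name to a version
print(by_id.name, by_id.version, by_id.dataset_id)
```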
13 | ## Dataset status
14 | When you upload a dataset, it will be marked `in_preparation` until it is (automatically) verified. Once approved, the dataset will become `active` (or `verified`). If a severe issue has been found with a dataset, it can become `deactivated` (or `deprecated`) signaling that it should not be used. By default, dataset search only returns verified datasets, but you can access and download datasets with any status.
15 |
16 | ## Caching
17 | When downloading datasets, tasks, runs and flows, OpenML will automatically cache them locally. By default, OpenML will use `~/.openml/cache` as the cache directory.
18 |
19 | The cache directory can be specified through the OpenML config file. To do this, add the line `cachedir = 'MYDIR'` to the config file, replacing 'MYDIR' with the path to the cache directory.
20 |
21 | You can also set the cache dir temporarily via the Python API:
22 |
23 | ``` python
24 | import os
25 | import openml
26 |
27 | openml.config.cache_directory = os.path.expanduser('YOURDIR')
28 | ```
29 |
30 |
31 |
--------------------------------------------------------------------------------
/docs/ecosystem/MOA.md:
--------------------------------------------------------------------------------
1 | OpenML features extensive support for MOA. However, this is currently implemented as a stand-alone MOA compilation, using the latest version (as of May 2014).
2 |
3 | [Download MOA for OpenML](https://www.openml.org/downloads/openmlmoa.beta.jar)
4 |
5 | ## Quick Start
6 | 
7 |
8 | * Download the standalone MOA environment above.
9 | * Find your [API key](https://www.openml.org/u#!api) in your profile (log in first). Create a config file called openml.conf in a .openml directory in your home dir. It should contain the following line:
10 | >api_key = YOUR_KEY
11 | * Launch the JAR file by double clicking on it, or launch from command-line using the following command:
12 | > java -cp openmlmoa.beta.jar moa.gui.GUI
13 | * Select the task moa.tasks.openml.OpenmlDataStreamClassification to evaluate a classifier on an OpenML task, and send the results to OpenML.
14 | * Optionally, you can generate new streams using the Bayesian Network Generator: select the moa.tasks.WriteStreamToArff task, with moa.streams.generators.BayesianNetworkGenerator.
15 |
--------------------------------------------------------------------------------
/docs/ecosystem/Rest.md:
--------------------------------------------------------------------------------
1 | # REST tutorial
2 |
3 | OpenML offers a RESTful Web API, with predictive URLs, for uploading and downloading machine learning resources. Try the API Documentation to see examples of all calls, and test them right in your browser.
4 |
5 | ## Getting started
6 |
7 | * REST services can be called using simple HTTP GET or POST actions.
8 | * The REST Endpoint URL is https://www.openml.org/api/v1/
9 | * The default endpoint returns data in XML. If you prefer JSON, use the endpoint https://www.openml.org/api/v1/json/. Note that, to upload content, you still need to use XML (at least for now).
10 |
11 | ## Testing
12 | For continuous integration and testing purposes, we have a test server that offers the same API but does not affect the production server.
13 |
14 | * The test server REST Endpoint URL is https://test.openml.org/api/v1/
15 |
16 | ## Error messages
17 | Error messages will look like this:
18 |
19 | ```xml
20 | <oml:error xmlns:oml="http://openml.org/openml">
21 |   <oml:code>100</oml:code>
22 |   <oml:message>Please invoke legal function</oml:message>
23 |   <oml:additional_information>Additional information, not always available.</oml:additional_information>
24 | </oml:error>
25 | ```
26 |
27 | All error messages are listed in the API documentation. E.g. try to get a non-existing dataset:
28 |
29 | * in XML: https://www.openml.org/api_new/v1/data/99999
30 | * in JSON: https://www.openml.org/api_new/v1/json/data/99999
31 |
32 | ## Examples
33 | You need to be logged in for these examples to work.
34 |
35 | ### Download a dataset
36 | 
37 |
38 | * User asks for a dataset using the /data/{id} service. The dataset id is typically part of a task, or can be found on OpenML.org.
39 | * OpenML returns a description of the dataset as an XML file (or JSON). Try it now
40 | * The dataset description contains the URL where the dataset can be downloaded. The user calls that URL to download the dataset.
41 | * The dataset is returned by the server hosting the dataset. This can be OpenML, but also any other data repository. Try it now
42 |
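Put together, these steps look roughly as follows. This is a sketch using the third-party `requests` package; dataset ID 61 is only an illustration, and the JSON field names follow the description returned by the JSON endpoint.

``` python
# Sketch of the steps above, using the third-party `requests` package.
# Dataset ID 61 is only an illustration.
import requests

# 1) Ask for the dataset description (JSON endpoint used here).
description = requests.get("https://www.openml.org/api/v1/json/data/61").json()

# 2) The description contains the URL where the data file itself is hosted.
data_url = description["data_set_description"]["url"]

# 3) Download the data file from whichever server hosts it.
data_file = requests.get(data_url)
print(data_url, len(data_file.content), "bytes")
```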
43 | ### Download a flow
44 | 
45 |
46 | * User asks for a flow using the /flow/{id} service and a flow id. The flow id can be found on OpenML.org.
47 | * OpenML returns a description of the flow as an XML file (or JSON). Try it now
48 | * The flow description contains the URL where the flow can be downloaded (e.g. GitHub), either as source, binary or both, as well as additional information on history, dependencies and licence. The user calls the right URL to download it.
49 | * The flow is returned by the server hosting it. This can be OpenML, but also any other code repository. Try it now
50 |
51 | ### Download a task
52 | 
53 |
54 | * User asks for a task using the /task/{id} service and a task id. The task id is typically returned when searching for tasks.
55 | * OpenML returns a description of the task as an XML file (or JSON). Try it now
56 | * The task description contains the dataset id(s) of the datasets involved in this task. The user asks for the dataset using the /data/{id} service and the dataset id.
57 | * OpenML returns a description of the dataset as an XML file (or JSON). Try it now
58 | * The dataset description contains the URL where the dataset can be downloaded. The user calls that URL to download the dataset.
59 | * The dataset is returned by the server hosting it. This can be OpenML, but also any other data repository. Try it now
60 | * The task description may also contain links to other resources, such as the train-test splits to be used in cross-validation. The user calls that URL to download the train-test splits.
61 | * The train-test splits are returned by OpenML. Try it now
62 |
--------------------------------------------------------------------------------
/docs/ecosystem/Scikit-learn/basic_tutorial.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 12,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "data": {
10 | "text/html": [
11 | "\n",
12 | " \n",
13 | ""
14 | ],
15 | "text/plain": [
16 | ""
17 | ]
18 | },
19 | "metadata": {},
20 | "output_type": "display_data"
21 | },
22 | {
23 | "data": {
24 | "text/markdown": [
25 | "[](https://mybinder.org/v2/gh/SubhadityaMukherjee/openml_docs/HEAD?labpath=Scikit-learn%2Fdatasets_tutorial)"
26 | ],
27 | "text/plain": [
28 | ""
29 | ]
30 | },
31 | "metadata": {},
32 | "output_type": "display_data"
33 | }
34 | ],
35 | "source": [
36 | "from IPython.display import display, HTML, Markdown\n",
37 | "import os\n",
38 | "import yaml\n",
39 | "with open(\"../../../mkdocs.yml\", \"r\") as f:\n",
40 | " load_config = yaml.safe_load(f)\n",
41 | "repo_url = load_config[\"repo_url\"].replace(\"https://github.com/\", \"\")\n",
42 | "binder_url = load_config[\"binder_url\"]\n",
43 | "relative_file_path = \"integrations/Scikit-learn/basic_tutorial.ipynb\"\n",
44 | "display(HTML(f\"\"\"\n",
45 | " \n",
46 | "\"\"\"))\n",
47 | "display(Markdown(\"[](https://mybinder.org/v2/gh/SubhadityaMukherjee/openml_docs/HEAD?labpath=Scikit-learn%2Fdatasets_tutorial)\"))"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": null,
53 | "metadata": {},
54 | "outputs": [],
55 | "source": [
56 | "!pip install openml"
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": 2,
62 | "metadata": {},
63 | "outputs": [],
64 | "source": [
65 | "import openml\n",
66 | "from sklearn import impute, tree, pipeline"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": 7,
72 | "metadata": {},
73 | "outputs": [
74 | {
75 | "name": "stderr",
76 | "output_type": "stream",
77 | "text": [
78 | "/Users/eragon/.pyenv/versions/3.9.19/envs/openml/lib/python3.9/site-packages/openml/config.py:184: UserWarning: Switching to the test server https://test.openml.org/api/v1/xml to not upload results to the live server. Using the test server may result in reduced performance of the API!\n",
79 | " warnings.warn(\n"
80 | ]
81 | }
82 | ],
83 | "source": [
84 | "openml.config.start_using_configuration_for_example()"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": 8,
90 | "metadata": {},
91 | "outputs": [],
92 | "source": [
93 | "\n",
94 | "# Define a scikit-learn classifier or pipeline\n",
95 | "clf = pipeline.Pipeline(\n",
96 | " steps=[\n",
97 | " ('imputer', impute.SimpleImputer()),\n",
98 | " ('estimator', tree.DecisionTreeClassifier())\n",
99 | " ]\n",
100 | ")\n"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": 9,
106 | "metadata": {},
107 | "outputs": [
108 | {
109 | "data": {
110 | "text/plain": [
111 | "OpenML Classification Task\n",
112 | "==========================\n",
113 | "Task Type Description: https://test.openml.org/tt/TaskType.SUPERVISED_CLASSIFICATION\n",
114 | "Task ID..............: 32\n",
115 | "Task URL.............: https://test.openml.org/t/32\n",
116 | "Estimation Procedure.: crossvalidation\n",
117 | "Target Feature.......: class\n",
118 | "# of Classes.........: 10\n",
119 | "Cost Matrix..........: Available"
120 | ]
121 | },
122 | "execution_count": 9,
123 | "metadata": {},
124 | "output_type": "execute_result"
125 | }
126 | ],
127 | "source": [
128 | "\n",
129 | "# Download the OpenML task for the pendigits dataset with 10-fold\n",
130 | "# cross-validation.\n",
131 | "task = openml.tasks.get_task(32)\n",
132 | "task"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": 11,
138 | "metadata": {},
139 | "outputs": [],
140 | "source": [
141 | "# Run the scikit-learn model on the task.\n",
142 | "run = openml.runs.run_model_on_task(clf, task)\n",
143 | "# Publish the experiment on OpenML (optional, requires an API key.\n",
144 | "# You can get your own API key by signing up to OpenML.org)\n"
145 | ]
146 | },
147 | {
148 | "cell_type": "code",
149 | "execution_count": null,
150 | "metadata": {},
151 | "outputs": [],
152 | "source": [
153 | "\n",
154 | "run.publish()\n",
155 | "print(f'View the run online: {run.openml_url}')"
156 | ]
157 | }
158 | ],
159 | "metadata": {
160 | "kernelspec": {
161 | "display_name": "openml",
162 | "language": "python",
163 | "name": "python3"
164 | },
165 | "language_info": {
166 | "codemirror_mode": {
167 | "name": "ipython",
168 | "version": 3
169 | },
170 | "file_extension": ".py",
171 | "mimetype": "text/x-python",
172 | "name": "python",
173 | "nbconvert_exporter": "python",
174 | "pygments_lexer": "ipython3",
175 | "version": "3.9.19"
176 | }
177 | },
178 | "nbformat": 4,
179 | "nbformat_minor": 2
180 | }
181 |
--------------------------------------------------------------------------------
/docs/ecosystem/Scikit-learn/index.md:
--------------------------------------------------------------------------------
1 | # scikit-learn
2 |
3 | OpenML is readily integrated with scikit-learn through the [Python API](https://openml.github.io/openml-python/main/api.html).
4 | This page provides a brief overview of the key features and installation instructions. For more detailed API documentation, please refer to the [official documentation](https://openml.github.io/openml-python/main/api.html).
5 |
6 | ## Key features:
7 |
8 | - Query and download OpenML datasets and use them however you like
9 | - Build any sklearn estimator or pipeline and convert to OpenML flows
10 | - Run any flow on any task and save the experiment as run objects
11 | - Upload your runs for collaboration or publishing
12 | - Query, download and reuse all shared runs
13 |
14 | ## Installation
15 |
16 | ```bash
17 | pip install openml
18 | ```
19 |
20 | ## Query and download data
21 | ```python
22 | import openml
23 |
24 | # List all datasets and their properties
25 | openml.datasets.list_datasets(output_format="dataframe")
26 |
27 | # Get dataset by ID
28 | dataset = openml.datasets.get_dataset(61)
29 |
30 | # Get dataset by name
31 | dataset = openml.datasets.get_dataset('Fashion-MNIST')
32 |
33 | # Get the data itself as a dataframe (or otherwise)
34 | X, y, _, _ = dataset.get_data(dataset_format="dataframe")
35 | ```
36 |
37 | ## Download tasks, run models locally, publish results (with scikit-learn)
38 | ```python
39 | from sklearn import ensemble
40 | from openml import tasks, runs
41 |
42 | # Build any model you like
43 | clf = ensemble.RandomForestClassifier()
44 |
45 | # Download any OpenML task
46 | task = tasks.get_task(3954)
47 |
48 | # Run and evaluate your model on the task
49 | run = runs.run_model_on_task(clf, task)
50 |
51 | # Share the results on OpenML. Your API key can be found in your account.
52 | # openml.config.apikey = 'YOUR_KEY'
53 | run.publish()
54 | ```
55 |
56 | ## OpenML Benchmarks
57 | ```python
58 | # List all tasks in a benchmark
59 | benchmark = openml.study.get_suite('OpenML-CC18')
60 | tasks.list_tasks(output_format="dataframe", task_id=benchmark.tasks)
61 |
62 | # Return benchmark results
63 | openml.evaluations.list_evaluations(
64 | function="area_under_roc_curve",
65 | tasks=benchmark.tasks,
66 | output_format="dataframe"
67 | )
68 | ```
69 |
--------------------------------------------------------------------------------
/docs/ecosystem/Weka.md:
--------------------------------------------------------------------------------
1 | OpenML is integrated in the Weka (Waikato Environment for Knowledge Analysis) Experimenter and the Command Line Interface.
2 |
3 | ## Installation
4 | OpenML is available as a weka extension in the package manager:
5 |
6 | * [Download the latest version](http://www.cs.waikato.ac.nz/ml/weka/downloading.html) (3.7.13 or higher).
7 | * Launch Weka, or start from commandline:
8 | > java -jar weka.jar
9 | * If you need more memory (e.g. 1GB), start as follows:
10 | > java -Xmx1G -jar weka.jar
11 | * Open the package manager (Under 'Tools')
12 | * Select package **OpenmlWeka** and click install. Afterwards, restart WEKA.
13 | * From the Tools menu, open the 'OpenML Experimenter'.
14 |
15 | ## Graphical Interface
16 | 
17 |
18 | You can solve OpenML Tasks in the Weka Experimenter, and automatically upload your experiments to OpenML (or store them locally).
19 |
20 | * From the Tools menu, open the 'OpenML Experimenter'.
21 | * Enter your [API key](https://www.openml.org/u#!api) in the top field (log in first). You can also store this in a config file (see below).
22 | * In the 'Tasks' panel, click the 'Add New' button to add new tasks. Insert the task id's as comma-separated values (e.g., '1,2,3,4,5'). Use the search function on OpenML to find interesting tasks and click the ID icon to list the ID's. In the future this search will also be integrated in WEKA.
23 | * Add algorithms in the "Algorithm" panel.
24 | * Go to the "Run" tab, and click on the "Start" button.
25 | * The experiment will be executed and sent to OpenML.org.
26 | * The runs will now appear on OpenML.org. You can follow their progress and check for errors on your profile page under 'Runs'.
27 |
28 | ## CommandLine Interface
29 | The Command Line interface is useful for running experiments automatically on a server, without using a GUI.
30 |
31 | * Create a config file called openml.conf in a new directory called .openml in your home dir. It should contain the following line:
32 | > api_key = YOUR_KEY
33 | * Execute the following command:
34 | > java -cp weka.jar openml.experiment.TaskBasedExperiment -T <task_id> -C <classifier_classname> -- <classifier_options>
35 | * For example, the following command will run Weka's J48 algorithm on Task 1:
36 | > java -cp OpenWeka.beta.jar openml.experiment.TaskBasedExperiment -T 1 -C weka.classifiers.trees.J48
37 | * The following suffix will set some parameters of this classifier:
38 | > -- -C 0.25 -M 2
39 |
40 | ## API reference
41 | Check the [Weka integration Java Docs](https://openml.github.io/openml-weka/) for more details about the possibilities.
42 |
43 | ## Issues
44 | Please report any bugs that you may encounter in the issue tracker: https://github.com/openml/openml-weka,
45 | or email j.n.van.rijn@liacs.leidenuniv.nl
46 |
--------------------------------------------------------------------------------
/docs/ecosystem/index.md:
--------------------------------------------------------------------------------
1 | ---
2 | icon: fontawesome/solid/seedling
3 | ---
4 |
5 | # Ecosystem
6 |
7 | OpenML has a rich ecosystem of tools and projects that seamlessly integrate OpenML in various ways.
8 |
9 | !!! tip "Add your library"
10 | Did you use OpenML in your work and want to share it with the community? We would love to have you!
11 | Simply create a pull request with the necessary information (click the :material-pencil: icon) and we will add it to this page.
12 |
13 | !!! info "Integrate OpenML in your libraries"
14 | If you want to integrate OpenML into machine learning and data science tools, it's easiest to build on one of the existing clients,
15 | which often can be used as is or extended. For instance, see how to [extend the Python API](./Python_extensions.md) to integrate OpenML into Python tools.
16 |
17 | {!ecosystem/showcase.md!}
--------------------------------------------------------------------------------
/docs/ecosystem/mlr.md:
--------------------------------------------------------------------------------
1 | # Machine Learning in R (mlr)
2 |
3 | OpenML is readily integrated with mlr through the [mlr3oml](https://mlr3oml.mlr-org.com/index.html) package.
4 |
5 | !!! example
6 | ```r
7 | library(mlr3oml)
8 | library(mlr3)
9 |
10 | # Search for specific datasets
11 | odatasets = list_oml_data(
12 | number_features = c(10, 20),
13 | number_instances = c(45000, 50000),
14 | number_classes = 2
15 | )
16 |
17 | # Get dataset
18 | odata = odt(id = 1590)
19 | # Access the actual data
20 | odata$data
21 |
22 | # Convert to an mlr3::Task
23 | tsk_adult = as_task(odata, target = "class")
24 | ```
25 |
26 | Key features:
27 |
28 | * Query and download OpenML datasets and use them however you like
29 | * Build any mlr learner, run it on any task and save the experiment as run objects
30 | * Upload your runs for collaboration or publishing
31 | * Query, download and reuse all shared runs
32 |
33 | There is also an older (deprecated) [OpenML R package](http://openml.github.io/openml-r/).
34 |
--------------------------------------------------------------------------------
/docs/ecosystem/showcase.md:
--------------------------------------------------------------------------------
1 | Tool to convert openml flows to ONNX and visualize them via Netron
--------------------------------------------------------------------------------
/docs/examples/20_basic/README.txt:
--------------------------------------------------------------------------------
1 | Introductory Examples
2 | =====================
3 |
4 | Introductory examples for the usage of the OpenML python connector.
5 |
--------------------------------------------------------------------------------
/docs/examples/20_basic/introduction_tutorial.py:
--------------------------------------------------------------------------------
1 | # %% [markdown]
2 | # # Introduction tutorial & Setup
3 | # An example of how to set up OpenML-Python, followed by a simple example.
4 |
5 | # %% [markdown]
6 | # OpenML is an online collaboration platform for machine learning which allows
7 | # you to:
8 | #
9 | # * Find or share interesting, well-documented datasets
10 | # * Define research / modelling goals (tasks)
11 | # * Explore large amounts of machine learning algorithms, with APIs in Java, R, Python
12 | # * Log and share reproducible experiments, models, results
13 | # * Works seamlessly with scikit-learn and other libraries
14 | # * Large scale benchmarking, compare to state of the art
15 | #
16 |
17 | # %% [markdown]
18 | # # Installation
19 | # Installation is done via ``pip``:
20 | #
21 | # ```bash
22 | # pip install openml
23 | # ```
24 |
25 | # %% [markdown]
26 | # # Authentication
27 | #
28 | # The OpenML server can only be accessed by users who have signed up on the
29 | # OpenML platform. If you don’t have an account yet, sign up now.
30 | # You will receive an API key, which will authenticate you to the server
31 | # and allow you to download and upload datasets, tasks, runs and flows.
32 | #
33 | # * Create an OpenML account (free) on https://www.openml.org.
34 | # * After logging in, open your account page (avatar on the top right)
35 | # * Open 'Account Settings', then 'API authentication' to find your API key.
36 | #
37 | # There are two ways to permanently authenticate:
38 | #
39 | # * Use the ``openml`` CLI tool with ``openml configure apikey MYKEY``,
40 | # replacing **MYKEY** with your API key.
41 | # * Create a plain text file **~/.openml/config** with the line
42 | # **'apikey=MYKEY'**, replacing **MYKEY** with your API key. The config
43 | # file must be in the directory **~/.openml** and exist prior to
44 | # importing the openml module.
45 | #
46 | # Alternatively, by running the code below and replacing 'YOURKEY' with your API key,
47 | # you authenticate for the duration of the python process.
48 |
49 |
50 | # %%
51 |
52 | import openml
53 | from sklearn import neighbors
54 |
55 | # %% [markdown]
56 | # .. warning::
57 | # .. include:: ../../test_server_usage_warning.txt
58 |
59 | # %%
60 | openml.config.start_using_configuration_for_example()
61 |
62 | # %% [markdown]
63 | # When using the main server instead, make sure your apikey is configured.
64 | # This can be done with the following line of code (uncomment it!).
65 | # Never share your apikey with others.
66 |
67 | # %%
68 | # openml.config.apikey = 'YOURKEY'
69 |
70 | # %% [markdown]
71 | # # Caching
72 | # When downloading datasets, tasks, runs and flows, they will be cached to
73 | # retrieve them without calling the server later. As with the API key,
74 | # the cache directory can be either specified through the config file or
75 | # through the API:
76 | #
77 | # * Add the line **cachedir = 'MYDIR'** to the config file, replacing
78 | # 'MYDIR' with the path to the cache directory. By default, OpenML
79 | # will use **~/.openml/cache** as the cache directory.
80 | # * Run the code below, replacing 'YOURDIR' with the path to the cache directory.
81 |
82 | # %%
83 | # Uncomment and set your OpenML cache directory
84 | # import os
85 | # openml.config.cache_directory = os.path.expanduser('YOURDIR')
86 |
87 | # %% [markdown]
88 | # # Simple Example
89 | # Download the OpenML task for the eeg-eye-state dataset.
90 |
91 | # %%
92 | task = openml.tasks.get_task(403)
93 | data = openml.datasets.get_dataset(task.dataset_id)
94 | clf = neighbors.KNeighborsClassifier(n_neighbors=5)
95 | run = openml.runs.run_model_on_task(clf, task, avoid_duplicate_runs=False)
96 | # Publish the experiment on OpenML (optional, requires an API key).
97 | # For this tutorial, our configuration publishes to the test server
98 | # as to not crowd the main server with runs created by examples.
99 | myrun = run.publish()
100 | print(f"kNN on {data.name}: {myrun.openml_url}")
101 |
102 | # %%
103 | openml.config.stop_using_configuration_for_example()
104 | # License: BSD 3-Clause
105 |
--------------------------------------------------------------------------------
/docs/examples/20_basic/simple_datasets_tutorial.py:
--------------------------------------------------------------------------------
1 | # %% [markdown]
2 | # # Datasets
3 | # A basic tutorial on how to list, load and visualize datasets.
4 | #
5 | # In general, we recommend working with tasks, so that the results can
6 | # be easily reproduced. Furthermore, the results can be compared to existing results
7 | # at OpenML. However, for the purposes of this tutorial, we are going to work with
8 | # the datasets directly.
9 |
10 | # %%
11 |
12 | import openml
13 |
14 | # %% [markdown]
15 | # ## List datasets
16 |
17 | # %%
18 | datasets_df = openml.datasets.list_datasets(output_format="dataframe")
19 | print(datasets_df.head(n=10))
20 |
21 | # %% [markdown]
22 | # ## Download a dataset
23 |
24 | # %%
25 | # Iris dataset https://www.openml.org/d/61
26 | dataset = openml.datasets.get_dataset(61)
27 |
28 | # Print a summary
29 | print(
30 | f"This is dataset '{dataset.name}', the target feature is "
31 | f"'{dataset.default_target_attribute}'"
32 | )
33 | print(f"URL: {dataset.url}")
34 | print(dataset.description[:500])
35 |
36 | # %% [markdown]
37 | # ## Load a dataset
38 | # X - An array/dataframe where each row represents one example with
39 | # the corresponding feature values.
40 | #
41 | # y - the classes for each example
42 | #
43 | # categorical_indicator - an array that indicates which feature is categorical
44 | #
45 | # attribute_names - the names of the features for the examples (X) and
46 | # target feature (y)
47 |
48 | # %%
49 | X, y, categorical_indicator, attribute_names = dataset.get_data(
50 | dataset_format="dataframe", target=dataset.default_target_attribute
51 | )
52 |
53 | # %% [markdown]
54 | # Visualize the dataset
55 |
56 | # %%
57 | import pandas as pd
58 | import seaborn as sns
59 | import matplotlib.pyplot as plt
60 |
61 | sns.set_style("darkgrid")
62 |
63 |
64 | def hide_current_axis(*args, **kwds):
65 | plt.gca().set_visible(False)
66 |
67 |
68 | # We combine all the data so that we can map the different
69 | # examples to different colors according to the classes.
70 | combined_data = pd.concat([X, y], axis=1)
71 | iris_plot = sns.pairplot(combined_data, hue="class")
72 | iris_plot.map_upper(hide_current_axis)
73 | plt.show()
74 |
75 | # License: BSD 3-Clause
76 |
--------------------------------------------------------------------------------
/docs/examples/20_basic/simple_flows_and_runs_tutorial.py:
--------------------------------------------------------------------------------
1 | # %% [markdown]
2 | # # Flows and Runs
3 | # A simple tutorial on how to train/run a model and how to upload the results.
4 |
5 | # %%
6 | import openml
7 | from sklearn import ensemble, neighbors
8 |
9 |
10 | # %% [markdown]
11 | # .. warning::
12 | # .. include:: ../../test_server_usage_warning.txt
13 |
14 | # %%
15 | openml.config.start_using_configuration_for_example()
16 |
17 | # %% [markdown]
18 | # ## Train a machine learning model
19 |
20 | # NOTE: We are using dataset 20 from the test server: https://test.openml.org/d/20
21 |
22 | # %%
23 | dataset = openml.datasets.get_dataset(20)
24 | X, y, categorical_indicator, attribute_names = dataset.get_data(
25 | target=dataset.default_target_attribute
26 | )
27 | clf = neighbors.KNeighborsClassifier(n_neighbors=3)
28 | clf.fit(X, y)
29 |
30 | # %% [markdown]
31 | # ## Running a model on a task
32 |
33 | # %%
34 | task = openml.tasks.get_task(119)
35 | clf = ensemble.RandomForestClassifier()
36 | run = openml.runs.run_model_on_task(clf, task)
37 | print(run)
38 |
39 | # %% [markdown]
40 | # ## Publishing the run
41 |
42 | # %%
43 | myrun = run.publish()
44 | print(f"Run was uploaded to {myrun.openml_url}")
45 | print(f"The flow can be found at {myrun.flow.openml_url}")
46 |
47 | # %%
48 | openml.config.stop_using_configuration_for_example()
49 | # License: BSD 3-Clause
50 |
--------------------------------------------------------------------------------
/docs/examples/20_basic/simple_suites_tutorial.py:
--------------------------------------------------------------------------------
1 | # %% [markdown]
2 | # # Benchmark suites
3 | # This is a brief showcase of OpenML benchmark suites, which were introduced by
4 | # [Bischl et al. (2019)](https://arxiv.org/abs/1708.03731v2). Benchmark suites standardize the
5 | # datasets and splits to be used in an experiment or paper. They are fully integrated into OpenML
6 | # and simplify both the sharing of the setup and the results.
7 |
8 | # %%
9 | import openml
10 |
11 | # %% [markdown]
12 | # OpenML-CC18
13 | # ===========
14 | #
15 | # As an example we have a look at the OpenML-CC18, which is a suite of 72 classification datasets
16 | # from OpenML which were carefully selected to be usable by many algorithms and also represent
17 | # datasets commonly used in machine learning research. These are all datasets from mid-2018 that
18 | # satisfy a large set of clear requirements for thorough yet practical benchmarking:
19 | #
20 | # 1. the number of observations is between 500 and 100,000 to focus on medium-sized datasets,
21 | # 2. the number of features does not exceed 5,000 to keep the runtime of the algorithms
22 | # low,
23 | # 3. the target attribute has at least two classes, with no class having fewer than 20 observations,
24 | # 4. the ratio of the minority class to the majority class is above 0.05 (to eliminate highly
25 | # imbalanced datasets which require special treatment for both algorithms and evaluation
26 | # measures).
27 | #
28 | # A full description can be found in the
29 | # [OpenML benchmarking docs](https://docs.openml.org/benchmark/#openml-cc18).
30 | #
31 | # In this example we'll focus on how to use benchmark suites in practice.
32 |
33 | # %% [markdown]
34 | # Downloading benchmark suites
35 | # ============================
36 |
37 | # %%
38 | suite = openml.study.get_suite(99)
39 | print(suite)
40 |
41 | # %% [markdown]
42 | # The benchmark suite object does not download the included tasks and datasets itself; it only
43 | # contains a list of the tasks that constitute the suite.
44 | #
45 | # Tasks can then be accessed via
46 |
47 | # %%
48 | tasks = suite.tasks
49 | print(tasks)
50 |
51 | # %% [markdown]
52 | # and iterated over for benchmarking. For speed reasons we only iterate over the first three tasks:
53 |
54 | # %%
55 | for task_id in tasks[:3]:
56 | task = openml.tasks.get_task(task_id)
57 | print(task)
58 |
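# %% [markdown]
# A minimal sketch of how such a loop could drive an actual benchmark. The classifier and the
# restriction to a single task below are illustrative assumptions, not part of the original
# example; the run stays local unless you call `run.publish()`.

# %%
from sklearn import tree

clf = tree.DecisionTreeClassifier()
for task_id in tasks[:1]:  # a single task, just to keep this sketch fast
    task = openml.tasks.get_task(task_id)
    # avoid_duplicate_runs=False skips the server-side check for identical runs
    run = openml.runs.run_model_on_task(clf, task, avoid_duplicate_runs=False)
    print(run)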
59 | # %% [markdown]
60 | # Further examples
61 | # ================
62 | #
63 | # * [Suites Tutorial](../../30_extended/suites_tutorial)
64 | # * [Study Tutorial](../../30_extended/study_tutorial)
65 | # * [Paper example: Strang et al.](../../40_paper/2018_ida_strang_example.py)
66 |
67 | # License: BSD 3-Clause
68 |
--------------------------------------------------------------------------------
/docs/examples/30_extended/README.txt:
--------------------------------------------------------------------------------
1 | In-Depth Examples
2 | =================
3 |
4 | Extended examples for using the OpenML Python connector.
--------------------------------------------------------------------------------
/docs/examples/30_extended/configure_logging.py:
--------------------------------------------------------------------------------
1 | # %% [markdown]
2 | # # Logging
3 | # This tutorial explains openml-python logging, and shows how to configure it.
4 | # Openml-python uses the [Python logging module](https://docs.python.org/3/library/logging.html)
5 | # to provide users with log messages. Each log message is assigned a level of importance, see
6 | # the table in Python's logging tutorial
7 | # [here](https://docs.python.org/3/howto/logging.html#when-to-use-logging).
8 | #
9 | # By default, openml-python will print log messages of level `WARNING` and above to console.
10 | # All log messages (including `DEBUG` and `INFO`) are also saved in a file, which can be
11 | # found in your cache directory (see also the
12 | # [introduction tutorial](../20_basic/introduction_tutorial)).
13 | # These file logs are automatically deleted if needed, and use at most 2MB of space.
14 | #
15 | # It is possible to configure what log levels to send to console and file.
16 | # When downloading a dataset from OpenML, a `DEBUG`-level message is written:
17 |
18 | # %%
19 | import openml
20 |
21 | openml.datasets.get_dataset("iris")
22 |
23 | # %% [markdown]
24 | # With default configuration, the above example will show no output to console.
25 | # However, in your cache directory you should find a file named 'openml_python.log',
26 | # which has a DEBUG message written to it. It should be either like
27 | # "[DEBUG] [10:46:19:openml.datasets.dataset] Saved dataset 61: iris to file ..."
28 | # or like
29 | # "[DEBUG] [10:49:38:openml.datasets.dataset] Data pickle file already exists and is up to date."
30 | # , depending on whether or not you had downloaded iris before.
31 | # The processed log levels can be configured programmatically:
32 |
33 | # %%
34 | import logging
35 |
36 | openml.config.set_console_log_level(logging.DEBUG)
37 | openml.config.set_file_log_level(logging.WARNING)
38 | openml.datasets.get_dataset("iris")
39 |
40 | # %% [markdown]
41 | # Now the log level that was previously written to file should also be shown in the console.
42 | # The message is now no longer written to file as the `file_log` was set to level `WARNING`.
43 | #
44 | # It is also possible to specify the desired log levels through the configuration file.
45 | # This way you will not need to set them on each script separately.
46 | # Add the line **verbosity = NUMBER** and/or **file_verbosity = NUMBER** to the config file,
47 | # where 'NUMBER' should be one of:
48 | #
49 | # * 0: `logging.WARNING` and up.
50 | # * 1: `logging.INFO` and up.
51 | # * 2: `logging.DEBUG` and up (i.e. all messages).
52 | #
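# For example, to print `INFO` messages to the console while still writing all `DEBUG`
# messages to the log file, the configuration file could contain the following two lines
# (a sketch of the relevant entries only):
#
# ```
# verbosity = 1
# file_verbosity = 2
# ```
#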
53 | # License: BSD 3-Clause
54 |
--------------------------------------------------------------------------------
/docs/examples/30_extended/datasets_tutorial.py:
--------------------------------------------------------------------------------
1 | # %% [markdown]
2 | # # Datasets
3 | # How to list and download datasets.
4 |
5 | # %%
6 | import openml
7 | import pandas as pd
8 | from openml.datasets import edit_dataset, fork_dataset, get_dataset
9 |
10 | # %% [markdown]
11 | # ## Exercise 0
12 | #
13 | # * List datasets
14 |
15 | # %%
16 | datalist = openml.datasets.list_datasets()
17 | datalist = datalist[["did", "name", "NumberOfInstances", "NumberOfFeatures", "NumberOfClasses"]]
18 |
19 | print(f"First 10 of {len(datalist)} datasets...")
20 | datalist.head(n=10)
21 |
22 | # The same can be done with fewer lines of code
23 | openml_df = openml.datasets.list_datasets()
24 | openml_df.head(n=10)
25 |
26 | # %% [markdown]
27 | # ## Exercise 1
28 | #
29 | # * Find datasets with more than 10000 examples.
30 | # * Find a dataset called 'eeg_eye_state'.
31 | # * Find all datasets with more than 50 classes.
32 |
33 | # %%
34 | datalist[datalist.NumberOfInstances > 10000].sort_values(["NumberOfInstances"]).head(n=20)
35 |
36 | # %%
37 | datalist.query('name == "eeg-eye-state"')
38 |
39 | # %%
40 | datalist.query("NumberOfClasses > 50")
41 |
42 | # %% [markdown]
43 | # ## Download datasets
44 |
45 | # %%
46 | # This is done based on the dataset ID.
47 | dataset = openml.datasets.get_dataset(1471)
48 |
49 | # Print a summary
50 | print(
51 | f"This is dataset '{dataset.name}', the target feature is "
52 | f"'{dataset.default_target_attribute}'"
53 | )
54 | print(f"URL: {dataset.url}")
55 | print(dataset.description[:500])
56 |
57 | # %% [markdown]
58 | # Get the actual data.
59 | #
60 | # openml-python returns data as pandas dataframes (stored in the `eeg` variable below),
61 | # and also some additional metadata that we don't care about right now.
62 |
63 | # %%
64 | eeg, *_ = dataset.get_data()
65 |
66 | # %% [markdown]
67 | # You can optionally choose to have openml separate out a column from the
68 | # dataset. In particular, many datasets for supervised problems have a set
69 | # `default_target_attribute` which may help identify the target variable.
70 |
71 | # %%
72 | X, y, categorical_indicator, attribute_names = dataset.get_data(
73 | target=dataset.default_target_attribute
74 | )
75 | print(X.head())
76 | print(X.info())
77 |
78 | # %% [markdown]
79 | # Sometimes you only need access to a dataset's metadata.
80 | # In those cases, you can download the dataset without downloading the
81 | # data file. The dataset object can be used as normal.
82 | # Whenever you use any functionality that requires the data,
83 | # such as `get_data`, the data will be downloaded.
84 | # Starting from 0.15, not downloading data will be the default behavior instead.
85 | # The data will be downloaded automatically when you try to access it through
86 | # openml objects, e.g., using `dataset.features`.
87 |
88 | # %%
89 | dataset = openml.datasets.get_dataset(1471)
90 |
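# %% [markdown]
# A minimal sketch of the metadata-only pattern described above. It assumes the
# `download_data` flag of `get_dataset`; the feature data is then fetched lazily the
# first time it is actually needed.

# %%
dataset_meta_only = openml.datasets.get_dataset(1471, download_data=False)
print(dataset_meta_only.name)  # metadata such as the name is available immediately
eeg_lazy, *_ = dataset_meta_only.get_data()  # this call triggers the actual data download
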
91 | # %% [markdown]
92 | # ## Exercise 2
93 | # * Explore the data visually.
94 |
95 | # %%
96 | eegs = eeg.sample(n=1000)
97 | _ = pd.plotting.scatter_matrix(
98 | X.iloc[:100, :4],
99 | c=y[:100],
100 | figsize=(10, 10),
101 | marker="o",
102 | hist_kwds={"bins": 20},
103 | alpha=0.8,
104 | cmap="plasma",
105 | )
106 |
107 |
108 | # %% [markdown]
109 | # ## Edit a created dataset
110 | # This example uses the test server, to avoid editing a dataset on the main server.
111 | #
112 | # .. warning::
113 | # .. include:: ../../test_server_usage_warning.txt
114 |
115 | # %%
116 | openml.config.start_using_configuration_for_example()
117 | # %% [markdown]
118 | # Edit non-critical fields, allowed for all authorized users:
119 | # description, creator, contributor, collection_date, language, citation,
120 | # original_data_url, paper_url
121 |
122 | # %%
123 | desc = (
124 | "This data sets consists of 3 different types of irises' "
125 | "(Setosa, Versicolour, and Virginica) petal and sepal length,"
126 | " stored in a 150x4 numpy.ndarray"
127 | )
128 | did = 128
129 | data_id = edit_dataset(
130 | did,
131 | description=desc,
132 | creator="R.A.Fisher",
133 | collection_date="1937",
134 | citation="The use of multiple measurements in taxonomic problems",
135 | language="English",
136 | )
137 | edited_dataset = get_dataset(data_id)
138 | print(f"Edited dataset ID: {data_id}")
139 |
140 |
141 | # %% [markdown]
142 | # Editing critical fields (default_target_attribute, row_id_attribute, ignore_attribute) is allowed
143 | # only for the dataset owner. Further, critical fields cannot be edited if the dataset has any
144 | # tasks associated with it. To edit critical fields of a dataset (without tasks) owned by you,
145 | # configure the API key:
146 | # openml.config.apikey = 'FILL_IN_OPENML_API_KEY'
147 | # This example here only shows a failure when trying to work on a dataset not owned by you:
148 |
149 | # %%
150 | try:
151 | data_id = edit_dataset(1, default_target_attribute="shape")
152 | except openml.exceptions.OpenMLServerException as e:
153 | print(e)
154 |
155 | # %% [markdown]
156 | # ## Fork dataset
157 | # Used to create a copy of the dataset with you as the owner.
158 | # Use this API only if you are unable to edit the critical fields (default_target_attribute,
159 | # ignore_attribute, row_id_attribute) of a dataset through the edit_dataset API.
160 | # After the dataset is forked, you can edit the new version of the dataset using edit_dataset.
161 |
162 | # %%
163 | data_id = fork_dataset(1)
164 | print(data_id)
165 | data_id = edit_dataset(data_id, default_target_attribute="shape")
166 | print(f"Forked dataset ID: {data_id}")
167 |
168 | # %%
169 | openml.config.stop_using_configuration_for_example()
170 | # License: BSD 3-Clause
171 |
--------------------------------------------------------------------------------
/docs/examples/30_extended/fetch_evaluations_tutorial.py:
--------------------------------------------------------------------------------
1 | # %% [markdown]
2 | # # Fetching Evaluations
3 |
4 | # Evaluations contain a concise summary of the results of all runs made. Each evaluation
5 | # provides information on the dataset used, the flow applied, the setup used, the metric
6 | # evaluated, and the result obtained on that metric for the run in question. This collection
7 | # of results can be used for efficient benchmarking of an algorithm and also allows transparent
8 | # reuse of results from previous experiments with similar parameters.
9 | #
10 | # In this example, we shall do the following:
11 | #
12 | # * Retrieve evaluations based on different metrics
13 | # * Fetch evaluations pertaining to a specific task
14 | # * Sort the obtained results in descending order of the metric
15 | # * Plot a cumulative distribution function for the evaluations
16 | # * Compare the top 10 performing flows based on the evaluation performance
17 | # * Retrieve evaluations with hyperparameter settings
18 |
19 | # %%
20 | import openml
21 |
22 | # %% [markdown]
23 | # ## Listing evaluations
24 | # Evaluations can be retrieved from the database in the chosen output format.
25 | # Required filters can be applied to retrieve results from runs as required.
26 |
27 | # We shall retrieve a small set (only 10 entries) to test the listing function for evaluations
28 |
29 | # %%
30 | openml.evaluations.list_evaluations(
31 | function="predictive_accuracy", size=10
32 | )
33 |
34 | # Using other evaluation metrics, 'precision' in this case
35 | evals = openml.evaluations.list_evaluations(
36 | function="precision", size=10
37 | )
38 |
39 | # Querying the returned results for precision above 0.98
40 | print(evals[evals.value > 0.98])
41 |
42 | # %% [markdown]
43 | # ## Viewing a sample task
44 | # Over here we shall briefly take a look at the details of the task.
45 | # We will start by displaying a simple *supervised classification* task:
46 |
47 | # %%
48 | task_id = 167140 # https://www.openml.org/t/167140
49 | task = openml.tasks.get_task(task_id)
50 | print(task)
51 |
52 | # %% [markdown]
53 | # ## Obtaining all the evaluations for the task
54 | # We'll now obtain all the evaluations that were uploaded for the task
55 | # we displayed previously.
56 | # Note that we now filter the evaluations based on another parameter 'task'.
57 |
58 | # %%
59 | metric = "predictive_accuracy"
60 | evals = openml.evaluations.list_evaluations(
61 | function=metric, tasks=[task_id], output_format="dataframe"
62 | )
63 | # Displaying the first 10 rows
64 | print(evals.head(n=10))
65 | # Sorting the evaluations in decreasing order of the metric chosen
66 | evals = evals.sort_values(by="value", ascending=False)
67 | print("\nDisplaying head of sorted dataframe: ")
68 | print(evals.head())
69 |
70 | # %% [markdown]
71 | # ## Obtaining CDF of metric for chosen task
72 | # We shall now analyse how various flows have performed on this task by looking at the
73 | # distribution of the accuracies obtained across all runs.
74 | # To do so, we plot a cumulative distribution function (CDF) of the accuracies obtained.
75 |
76 | # %%
77 | from matplotlib import pyplot as plt
78 |
79 |
80 | def plot_cdf(values, metric="predictive_accuracy"):
81 | max_val = max(values)
82 | n, bins, patches = plt.hist(values, density=True, histtype="step", cumulative=True, linewidth=3)
83 | patches[0].set_xy(patches[0].get_xy()[:-1])
84 | plt.xlim(max(0, min(values) - 0.1), 1)
85 | plt.title("CDF")
86 | plt.xlabel(metric)
87 | plt.ylabel("Likelihood")
88 | plt.grid(visible=True, which="major", linestyle="-")
89 | plt.minorticks_on()
90 | plt.grid(visible=True, which="minor", linestyle="--")
91 | plt.axvline(max_val, linestyle="--", color="gray")
92 | plt.text(max_val, 0, "%.3f" % max_val, fontsize=9)
93 | plt.show()
94 |
95 |
96 | plot_cdf(evals.value, metric)
97 |
98 | # %% [markdown]
99 | # This CDF plot shows that, for the given task and based on the runs uploaded so far,
100 | # virtually all runs achieve an accuracy above 52%, while the maximum accuracy
101 | # observed so far is 96.5%.
102 |
103 | # %% [markdown]
104 | # ## Comparing top 10 performing flows
105 | # Let us now try to see which flows generally performed the best for this task.
106 | # For this, we shall compare the top performing flows.
107 |
108 | # %%
109 | import numpy as np
110 | import pandas as pd
111 |
112 |
113 | def plot_flow_compare(evaluations, top_n=10, metric="predictive_accuracy"):
114 | # Collecting the top 10 performing unique flow_id
115 | flow_ids = evaluations.flow_id.unique()[:top_n]
116 |
117 | df = pd.DataFrame()
118 | # Creating a data frame containing only the metric values of the selected flows
119 | # assuming evaluations is sorted in decreasing order of metric
120 | for i in range(len(flow_ids)):
121 | flow_values = evaluations[evaluations.flow_id == flow_ids[i]].value
122 | df = pd.concat([df, flow_values], ignore_index=True, axis=1)
123 | fig, axs = plt.subplots()
124 | df.boxplot()
125 | axs.set_title("Boxplot comparing " + metric + " for different flows")
126 | axs.set_ylabel(metric)
127 | axs.set_xlabel("Flow ID")
128 | axs.set_xticklabels(flow_ids)
129 | axs.grid(which="major", linestyle="-", linewidth="0.5", color="gray", axis="y")
130 | axs.minorticks_on()
131 | axs.grid(which="minor", linestyle="--", linewidth="0.5", color="gray", axis="y")
132 | # Counting the number of entries for each flow in the data frame
133 | # which gives the number of runs for each flow
134 | flow_freq = list(df.count(axis=0, numeric_only=True))
135 | for i in range(len(flow_ids)):
136 | axs.text(i + 1.05, np.nanmin(df.values), str(flow_freq[i]) + "\nrun(s)", fontsize=7)
137 | plt.show()
138 |
139 |
140 | plot_flow_compare(evals, metric=metric, top_n=10)
141 |
142 | # %% [markdown]
143 | # The boxplots above show how the flows perform across multiple runs on the chosen
144 | # task. The green horizontal lines represent the median accuracy of all the runs for
145 | # that flow (the number of runs is denoted at the bottom of each boxplot). The higher the
146 | # green line, the better the flow is for the task at hand. The flows are ordered in
147 | # descending order of the highest accuracy value seen under that flow.
148 |
149 | # Printing the corresponding flow names for the top 10 performing flow IDs
150 |
151 | # %%
152 | top_n = 10
153 | flow_ids = evals.flow_id.unique()[:top_n]
154 | flow_names = evals.flow_name.unique()[:top_n]
155 | for i in range(top_n):
156 | print((flow_ids[i], flow_names[i]))
157 |
158 | # %% [markdown]
159 | # ## Obtaining evaluations with hyperparameter settings
160 | # We'll now obtain the evaluations for a task and a flow together with their hyperparameter settings.
161 |
162 | # List evaluations in descending order based on predictive_accuracy with
163 | # hyperparameters
164 |
165 | # %%
166 | evals_setups = openml.evaluations.list_evaluations_setups(
167 | function="predictive_accuracy", tasks=[31], size=100, sort_order="desc"
168 | )
169 |
170 | print(evals_setups.head())
171 |
172 | # %% [markdown]
173 | # Return evaluations for flow_id in descending order based on predictive_accuracy
174 | # with hyperparameters. parameters_in_separate_columns returns parameters in
175 | # separate columns
176 |
177 | # %%
178 | evals_setups = openml.evaluations.list_evaluations_setups(
179 | function="predictive_accuracy", flows=[6767], size=100, parameters_in_separate_columns=True
180 | )
181 |
182 | print(evals_setups.head(10))
183 |
184 | # License: BSD 3-Clause
185 |
--------------------------------------------------------------------------------
/docs/examples/30_extended/flow_id_tutorial.py:
--------------------------------------------------------------------------------
1 | # %% [markdown]
2 | # # Obtaining Flow IDs
3 | # This tutorial discusses different ways to obtain the ID of a flow in order to perform further
4 | # analysis.
5 |
6 |
7 | # %%
8 | import sklearn.tree
9 |
10 | import openml
11 |
12 |
13 | # %% [markdown]
14 | # .. warning::
15 | # .. include:: ../../test_server_usage_warning.txt
16 |
17 | # %%
18 | openml.config.start_using_configuration_for_example()
19 |
20 | # %%
21 | # Defining a classifier
22 | clf = sklearn.tree.DecisionTreeClassifier()
23 |
24 | # %% [markdown]
25 | # ## 1. Obtaining a flow given a classifier
26 |
27 | # %%
28 | flow = openml.extensions.get_extension_by_model(clf).model_to_flow(clf).publish()
29 | flow_id = flow.flow_id
30 | print(flow_id)
31 |
32 | # %% [markdown]
33 | # This piece of code is rather involved. First, it retrieves a
34 | # `openml.extensions.Extension` which is registered and can handle the given model;
35 | # in our case it is `openml.extensions.sklearn.SklearnExtension`. Second, the extension
36 | # converts the classifier into an instance of `openml.OpenMLFlow`. Third and finally,
37 | # the publish method checks whether the current flow is already present on OpenML. If not,
38 | # it uploads the flow; otherwise, it updates the current instance with all information computed
39 | # by the server (which is obviously also done when uploading/publishing a flow).
40 | #
41 | # To simplify the usage we have created a helper function which automates all these steps:
42 |
43 | # %%
44 | flow_id = openml.flows.get_flow_id(model=clf)
45 | print(flow_id)
46 |
47 | # %% [markdown]
48 | # ## 2. Obtaining a flow given its name
49 | # The schema of a flow is given in XSD (
50 | # [here](https://github.com/openml/OpenML/blob/master/openml_OS/views/pages/api_new/v1/xsd/openml.implementation.upload.xsd)). # noqa E501
51 | # Only two fields are required: a unique name and an external version. While it should be pretty
52 | # obvious why we need a name, the need for the additional external version information might not
53 | # be immediately clear. However, this information is very important as it makes it possible to have
54 | # multiple flows with the same name for different versions of a software package. This might be
55 | # necessary if an algorithm or implementation introduces, renames or drops hyperparameters over time.
56 |
57 | # %%
58 | print(flow.name, flow.external_version)
59 |
60 | # %% [markdown]
61 | # The name and external version are automatically added to a flow when constructing it from a
62 | # model. We can then use them to retrieve the flow id as follows:
63 |
64 | # %%
65 | flow_id = openml.flows.flow_exists(name=flow.name, external_version=flow.external_version)
66 | print(flow_id)
67 |
68 | # %% [markdown]
69 | # We can also retrieve all flows for a given name:
70 |
71 | # %%
72 | flow_ids = openml.flows.get_flow_id(name=flow.name)
73 | print(flow_ids)
74 |
75 | # %% [markdown]
76 | # This also works with the actual model (generalizing the first part of this example):
77 |
78 | # %%
79 | flow_ids = openml.flows.get_flow_id(model=clf, exact_version=False)
80 | print(flow_ids)
81 |
82 | # %%
83 | # Deactivating test configuration
84 | openml.config.stop_using_configuration_for_example()
85 | # License: BSD 3-Clause
86 |
--------------------------------------------------------------------------------
/docs/examples/30_extended/plot_svm_hyperparameters_tutorial.py:
--------------------------------------------------------------------------------
1 | # %% [markdown]
2 | # # Plotting hyperparameter surfaces
3 |
4 | # %%
5 | import openml
6 | import numpy as np
7 |
8 | # %% [markdown]
9 | # ## First step - obtaining the data
10 | # First, we need to choose an SVM flow, for example 8353, and a task. Finding their IDs is
11 | # not part of this tutorial; this could for example be done via the website.
12 | #
13 | # For this we use the function ``list_evaluations_setups`` which can automatically join
14 | # evaluations conducted by the server with the hyperparameter settings extracted from the
15 | # uploaded runs (called *setup*).
16 |
17 | # %%
18 | df = openml.evaluations.list_evaluations_setups(
19 | function="predictive_accuracy",
20 | flows=[8353],
21 | tasks=[6],
22 | output_format="dataframe",
23 | # Using this flag incorporates the hyperparameters into the returned dataframe. Otherwise,
24 | # the dataframe would contain a field ``parameters`` containing an unparsed dictionary.
25 | parameters_in_separate_columns=True,
26 | )
27 | print(df.head(n=10))
28 |
29 | # %% [markdown]
30 | # We can see all the hyperparameter names in the columns of the dataframe:
31 |
32 | # %%
33 | for name in df.columns:
34 | print(name)
35 |
36 | # %% [markdown]
37 | # Next, we cast and transform the hyperparameters of interest (``C`` and ``gamma``) so that we
38 | # can nicely plot them.
39 |
40 | # %%
41 | hyperparameters = ["sklearn.svm.classes.SVC(16)_C", "sklearn.svm.classes.SVC(16)_gamma"]
42 | df[hyperparameters] = df[hyperparameters].astype(float).apply(np.log10)
43 |
44 | # %% [markdown]
45 | # ## Option 1 - plotting via the pandas helper functions
46 |
47 | # %%
48 | df.plot.hexbin(
49 | x="sklearn.svm.classes.SVC(16)_C",
50 | y="sklearn.svm.classes.SVC(16)_gamma",
51 | C="value",
52 | reduce_C_function=np.mean,
53 | gridsize=25,
54 | title="SVM performance landscape",
55 | )
56 |
57 | # %% [markdown]
58 | # ## Option 2 - plotting via matplotlib
59 |
60 | # %%
61 | import matplotlib.pyplot as plt
62 |
63 | fig, ax = plt.subplots()
64 |
65 | C = df["sklearn.svm.classes.SVC(16)_C"]
66 | gamma = df["sklearn.svm.classes.SVC(16)_gamma"]
67 | score = df["value"]
68 |
69 | # Plotting all evaluations:
70 | ax.plot(C, gamma, "ko", ms=1)
71 | # Create a contour plot
72 | cntr = ax.tricontourf(C, gamma, score, levels=12, cmap="RdBu_r")
73 | # Adjusting the colorbar
74 | fig.colorbar(cntr, ax=ax, label="accuracy")
75 | # Adjusting the axis limits
76 | ax.set(
77 | xlim=(min(C), max(C)),
78 | ylim=(min(gamma), max(gamma)),
79 | xlabel="C (log10)",
80 | ylabel="gamma (log10)",
81 | )
82 | ax.set_title("SVM performance landscape")
83 | # License: BSD 3-Clause
84 |
--------------------------------------------------------------------------------
/docs/examples/30_extended/run_setup_tutorial.py:
--------------------------------------------------------------------------------
1 | # %% [markdown]
2 | # # Run Setup
3 | # One of the key features of the openml-python library is that it allows you to
4 | # reinstantiate flows with hyperparameter settings that were uploaded before.
5 | # This tutorial uses the concept of setups. Although setups are not extensively
6 | # described in the OpenML documentation (because most users will not directly
7 | # use them), they form an important concept within OpenML, distinguishing between
8 | # hyperparameter configurations.
9 | # A setup is the combination of a flow with all its hyperparameters set.
10 | #
11 | # A key requirement for reinstantiating a flow is to have the same scikit-learn
12 | # version as the flow that was uploaded. However, this tutorial will upload the
13 | # flow (that will later be reinstantiated) itself, so it can be run with any
14 | # scikit-learn version that is supported by this library. In this case, the
15 | # requirement of the corresponding scikit-learn versions is automatically met.
16 | #
17 | # In this tutorial we will
18 | # 1) Create a flow and use it to solve a task;
19 | # 2) Download the flow, reinstantiate the model with same hyperparameters,
20 | # and solve the same task again;
21 | # 3) We will verify that the obtained results are exactly the same.
22 |
23 | # %%
24 |
25 | import numpy as np
26 | import openml
27 | from openml.extensions.sklearn import cat, cont
28 |
29 | from sklearn.pipeline import make_pipeline, Pipeline
30 | from sklearn.compose import ColumnTransformer
31 | from sklearn.impute import SimpleImputer
32 | from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
33 | from sklearn.ensemble import RandomForestClassifier
34 | from sklearn.decomposition import TruncatedSVD
35 |
36 | # %% [markdown]
37 | # .. warning::
38 | # .. include:: ../../test_server_usage_warning.txt
39 |
40 | # %%
41 | openml.config.start_using_configuration_for_example()
42 |
43 | # %% [markdown]
44 | # ## 1) Create a flow and use it to solve a task
45 |
46 | # First, let's download the task that we are interested in
47 |
48 | # %%
49 | task = openml.tasks.get_task(6)
50 |
51 | # %% [markdown]
52 | # We will create a fairly complex model, with many preprocessing components and
53 | # many potential hyperparameters. Of course, the model can be as complex or as
54 | # simple as you want it to be.
55 |
56 |
57 | # %%
58 | cat_imp = make_pipeline(
59 | OneHotEncoder(handle_unknown="ignore", sparse=False),
60 | TruncatedSVD(),
61 | )
62 | cont_imp = SimpleImputer(strategy="median")
63 | ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)])
64 | model_original = Pipeline(
65 | steps=[
66 | ("transform", ct),
67 | ("estimator", RandomForestClassifier()),
68 | ]
69 | )
70 |
71 | # %% [markdown]
72 | # Let's change some hyperparameters. Of course, in any good application we
73 | # would tune them using, e.g., Random Search or Bayesian Optimization, but for
74 | # the purpose of this tutorial we set them to some specific values that might
75 | # or might not be optimal
76 |
77 | # %%
78 | hyperparameters_original = {
79 | "estimator__criterion": "gini",
80 | "estimator__n_estimators": 50,
81 | "estimator__max_depth": 10,
82 | "estimator__min_samples_leaf": 1,
83 | }
84 | model_original.set_params(**hyperparameters_original)
85 |
86 | # solve the task and upload the result (this implicitly creates the flow)
87 | run = openml.runs.run_model_on_task(model_original, task, avoid_duplicate_runs=False)
88 | run_original = run.publish() # this implicitly uploads the flow
89 |
90 | # %% [markdown]
91 | # ## 2) Download the flow and solve the same task again.
92 |
93 | # %%
94 | # obtain setup id (note that the setup id is assigned by the OpenML server -
95 | # therefore it was not yet available in our local copy of the run)
96 | run_downloaded = openml.runs.get_run(run_original.run_id)
97 | setup_id = run_downloaded.setup_id
98 |
99 | # after this, we can easily reinstantiate the model
100 | model_duplicate = openml.setups.initialize_model(setup_id)
101 | # it will automatically have all the hyperparameters set
102 |
103 | # and run the task again
104 | run_duplicate = openml.runs.run_model_on_task(model_duplicate, task, avoid_duplicate_runs=False)
105 |
106 |
107 | # %% [markdown]
108 | # ## 3) We will verify that the obtained results are exactly the same.
109 |
110 | # %%
111 | # the run has stored all predictions in the field data content
112 | np.testing.assert_array_equal(run_original.data_content, run_duplicate.data_content)
113 |
114 |
115 | # %%
116 | openml.config.stop_using_configuration_for_example()
117 |
118 | # By: Jan N. van Rijn
119 | # License: BSD 3-Clause
120 |
--------------------------------------------------------------------------------
/docs/examples/30_extended/study_tutorial.py:
--------------------------------------------------------------------------------
1 | # %% [markdown]
2 | # # Benchmark studies
3 | # How to list, download and upload benchmark studies.
4 | # In contrast to
5 | # [benchmark suites](https://docs.openml.org/benchmark/#benchmarking-suites) which
6 | # hold a list of tasks, studies hold a list of runs. As runs contain all information on flows and
7 | # tasks, all required information about a study can be retrieved.
8 |
9 | # %%
10 | import uuid
11 |
12 | from sklearn.ensemble import RandomForestClassifier
13 |
14 | import openml
15 |
16 |
17 | # %% [markdown]
18 | # ## Listing studies
19 | #
20 | # * Use the output_format parameter to select output type
21 | # * Default gives ``dict``, but we'll use ``dataframe`` to obtain an
22 | # easier-to-work-with data structure
23 |
24 | # %%
25 | studies = openml.study.list_studies(output_format="dataframe", status="all")
26 | print(studies.head(n=10))
27 |
28 |
29 | # %% [markdown]
30 | # ## Downloading studies
31 |
32 | # %% [markdown]
33 | # This is done based on the study ID.
34 |
35 | # %%
36 | study = openml.study.get_study(123)
37 | print(study)
38 |
39 | # %% [markdown]
40 | # Studies also feature a description:
41 |
42 | # %%
43 | print(study.description)
44 |
45 | # %% [markdown]
46 | # Studies are a container for runs:
47 |
48 | # %%
49 | print(study.runs)
50 |
51 | # %% [markdown]
52 | # And we can use the evaluation listing functionality to learn more about
53 | # the evaluations available for the conducted runs:
54 |
55 | # %%
56 | evaluations = openml.evaluations.list_evaluations(
57 | function="predictive_accuracy",
58 | output_format="dataframe",
59 | study=study.study_id,
60 | )
61 | print(evaluations.head())
62 |
63 | # %% [markdown]
64 | # We'll use the test server for the rest of this tutorial.
65 | #
66 | # .. warning::
67 | # .. include:: ../../test_server_usage_warning.txt
68 |
69 | # %%
70 | openml.config.start_using_configuration_for_example()
71 |
72 | # %% [markdown]
73 | # ## Uploading studies
74 | #
75 | # Creating a study is as simple as creating any other kind of OpenML entity.
76 | # In this example we'll create a few runs for the OpenML-100 benchmark
77 | # suite which is available on the OpenML test server.
78 |
79 | # %%
80 | # Model to be used
81 | clf = RandomForestClassifier()
82 |
83 | # We'll create a study with one run on 3 datasets present in the suite
84 | tasks = [115, 259, 307]
85 |
86 | # To verify
87 | suite = openml.study.get_suite(1)
88 | print(all([t_id in suite.tasks for t_id in tasks]))
89 |
90 | run_ids = []
91 | for task_id in tasks:
92 | task = openml.tasks.get_task(task_id)
93 | run = openml.runs.run_model_on_task(clf, task)
94 | run.publish()
95 | run_ids.append(run.run_id)
96 |
97 | # The study needs a machine-readable and unique alias. To obtain this,
98 | # we simply generate a random uuid.
99 | alias = uuid.uuid4().hex
100 |
101 | new_study = openml.study.create_study(
102 | name="Test-Study",
103 | description="Test study for the Python tutorial on studies",
104 | run_ids=run_ids,
105 | alias=alias,
106 | benchmark_suite=suite.study_id,
107 | )
108 | new_study.publish()
109 | print(new_study)
110 |
111 |
112 | # %%
113 | openml.config.stop_using_configuration_for_example()
114 | # License: BSD 3-Clause
115 |
--------------------------------------------------------------------------------
/docs/examples/30_extended/suites_tutorial.py:
--------------------------------------------------------------------------------
1 | # %% [markdown]
2 | # # Benchmark suites
3 | #
4 | # How to list, download and upload benchmark suites.
5 | #
6 | # If you want to learn more about benchmark suites, check out our
7 | # brief introductory tutorial ["Simple suites tutorial"](../20_basic/simple_suites_tutorial) or the
8 | # [OpenML benchmark docs](https://docs.openml.org/benchmark/#benchmarking-suites).
9 |
10 | # %%
11 | import uuid
12 |
13 | import numpy as np
14 |
15 | import openml
16 |
17 |
18 | # %% [markdown]
19 | # ## Listing suites
20 | #
21 | # * Use the output_format parameter to select output type
22 | # * Default gives ``dict``, but we'll use ``dataframe`` to obtain an
23 | # easier-to-work-with data structure
24 |
25 | # %%
26 | suites = openml.study.list_suites(output_format="dataframe", status="all")
27 | print(suites.head(n=10))
28 |
29 | # %% [markdown]
30 | # ## Downloading suites
31 |
32 | # %% [markdown]
33 | # This is done based on the suite ID.
34 |
35 | # %%
36 | suite = openml.study.get_suite(99)
37 | print(suite)
38 |
39 | # %% [markdown]
40 | # Suites also feature a description:
41 |
42 | # %%
43 | print(suite.description)
44 |
45 | # %% [markdown]
46 | # Suites are a container for tasks:
47 |
48 | # %%
49 | print(suite.tasks)
50 |
51 | # %% [markdown]
52 | # And we can use the task listing functionality to learn more about them:
53 |
54 | # %%
55 | tasks = openml.tasks.list_tasks(output_format="dataframe")
56 |
57 | # %% [markdown]
58 | # Using ``@`` in
59 | # [pd.DataFrame.query](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html)
60 | # accesses variables outside of the current dataframe.
61 |
62 | # %%
63 | tasks = tasks.query("tid in @suite.tasks")
64 | print(tasks.describe().transpose())
65 |
66 | # %% [markdown]
67 | # We'll use the test server for the rest of this tutorial.
68 | #
69 | # .. warning::
70 | # .. include:: ../../test_server_usage_warning.txt
71 |
72 | # %%
73 | openml.config.start_using_configuration_for_example()
74 |
75 | # %% [markdown]
76 | # ## Uploading suites
77 | #
78 | # Uploading suites is as simple as uploading any other kind of OpenML
79 | # entity - the only reason why we need so much code in this example is
80 | # because we upload some random data.
81 |
82 | # We'll take a random subset of 20 tasks from all the available tasks on
83 | # the test server:
84 |
85 | # %%
86 | all_tasks = list(openml.tasks.list_tasks(output_format="dataframe")["tid"])
87 | task_ids_for_suite = sorted(np.random.choice(all_tasks, replace=False, size=20))
88 |
89 | # The study needs a machine-readable and unique alias. To obtain this,
90 | # we simply generate a random uuid.
91 |
92 | alias = uuid.uuid4().hex
93 |
94 | new_suite = openml.study.create_benchmark_suite(
95 | name="Test-Suite",
96 | description="Test suite for the Python tutorial on benchmark suites",
97 | task_ids=task_ids_for_suite,
98 | alias=alias,
99 | )
100 | new_suite.publish()
101 | print(new_suite)
102 |
103 | # %%
104 | openml.config.stop_using_configuration_for_example()
105 | # License: BSD 3-Clause
106 |
--------------------------------------------------------------------------------
/docs/examples/40_paper/2015_neurips_feurer_example.py:
--------------------------------------------------------------------------------
1 | # %% [markdown]
2 | # # Feurer et al. (2015)
3 |
4 | # A tutorial on how to get the datasets used in the paper introducing *Auto-sklearn* by Feurer et al.
5 | #
6 | # Auto-sklearn website: https://automl.github.io/auto-sklearn/
7 | #
8 | # ## Publication
9 | #
10 | # | Efficient and Robust Automated Machine Learning
11 | # | Matthias Feurer, Aaron Klein, Katharina Eggensperger, Jost Springenberg, Manuel Blum and Frank Hutter
12 | # | In *Advances in Neural Information Processing Systems 28*, 2015
13 | # | Available at https://papers.nips.cc/paper/5872-efficient-and-robust-automated-machine-learning.pdf
14 |
15 | # %%
16 | import pandas as pd
17 |
18 | import openml
19 |
20 | # %% [markdown]
21 | # List of dataset IDs given in the supplementary material of Feurer et al.:
22 | # https://papers.nips.cc/paper/5872-efficient-and-robust-automated-machine-learning-supplemental.zip
23 |
24 | # %%
25 | dataset_ids = [
26 | 3, 6, 12, 14, 16, 18, 21, 22, 23, 24, 26, 28, 30, 31, 32, 36, 38, 44, 46,
27 | 57, 60, 179, 180, 181, 182, 184, 185, 273, 293, 300, 351, 354, 357, 389,
28 | 390, 391, 392, 393, 395, 396, 398, 399, 401, 554, 679, 715, 718, 720, 722,
29 | 723, 727, 728, 734, 735, 737, 740, 741, 743, 751, 752, 761, 772, 797, 799,
30 | 803, 806, 807, 813, 816, 819, 821, 822, 823, 833, 837, 843, 845, 846, 847,
31 | 849, 866, 871, 881, 897, 901, 903, 904, 910, 912, 913, 914, 917, 923, 930,
32 | 934, 953, 958, 959, 962, 966, 971, 976, 977, 978, 979, 980, 991, 993, 995,
33 | 1000, 1002, 1018, 1019, 1020, 1021, 1036, 1040, 1041, 1049, 1050, 1053,
34 | 1056, 1067, 1068, 1069, 1111, 1112, 1114, 1116, 1119, 1120, 1128, 1130,
35 | 1134, 1138, 1139, 1142, 1146, 1161, 1166,
36 | ]
37 |
38 | # %% [markdown]
39 | # The dataset IDs could be used directly to load the dataset and split the data into a training set
40 | # and a test set. However, to be reproducible, we will first obtain the respective tasks from
41 | # OpenML, which define both the target feature and the train/test split.
42 | #
43 | # .. note::
44 | # It is discouraged to work directly on datasets and to only provide dataset IDs in a paper, as
45 | # this does not allow reproducibility (unclear splitting). Please use the respective tasks, not the
46 | # datasets, as the basis for a paper and publish the task IDs. This example is only given to
47 | # showcase the use of OpenML-Python for a published paper and as a warning on how not to do it.
48 | # Please check the OpenML documentation on tasks if you
49 | # want to learn more about them.
50 |
51 | # %% [markdown]
52 | # This lists both active and inactive tasks (because of ``status='all'``). Unfortunately,
53 | # this is necessary as some of the datasets contain issues found after the publication and became
54 | # deactivated, which also deactivated the tasks on them. More information on active or inactive
55 | # datasets can be found in the [online docs](https://docs.openml.org/#dataset-status).
56 |
57 | # %%
58 | tasks = openml.tasks.list_tasks(
59 | task_type=openml.tasks.TaskType.SUPERVISED_CLASSIFICATION,
60 | status="all",
61 | output_format="dataframe",
62 | )
63 |
64 | # Query only those with holdout as the resampling strategy.
65 | tasks = tasks.query('estimation_procedure == "33% Holdout set"')
66 |
67 | task_ids = []
68 | for did in dataset_ids:
69 | tasks_ = list(tasks.query("did == {}".format(did)).tid)
70 | if len(tasks_) >= 1:  # if there are multiple tasks, take the one with the lowest ID (oldest).
71 | task_id = min(tasks_)
72 | else:
73 | raise ValueError(did)
74 |
75 | # Optional - Check that the task has the same target attribute as the
76 | # dataset default target attribute
77 | # (disabled for this example as it needs to run fast to be rendered online)
78 | # task = openml.tasks.get_task(task_id)
79 | # dataset = task.get_dataset()
80 | # if task.target_name != dataset.default_target_attribute:
81 | # raise ValueError(
82 | # (task.target_name, dataset.default_target_attribute)
83 | # )
84 |
85 | task_ids.append(task_id)
86 |
87 | assert len(task_ids) == 140
88 | task_ids.sort()
89 |
90 | # These are the tasks to work with:
91 | print(task_ids)
92 |
93 | # License: BSD 3-Clause
94 |
--------------------------------------------------------------------------------
/docs/examples/40_paper/2018_ida_strang_example.py:
--------------------------------------------------------------------------------
1 | # %% [markdown]
2 | # # Strang et al. (2018)
3 | #
4 | # A tutorial on how to reproduce the analysis conducted for *Don't Rule Out Simple Models
5 | # Prematurely: A Large Scale Benchmark Comparing Linear and Non-linear Classifiers in OpenML*.
6 | #
7 | # ## Publication
8 | #
9 | # | Don't Rule Out Simple Models Prematurely: A Large Scale Benchmark Comparing Linear and Non-linear Classifiers in OpenML
10 | # | Benjamin Strang, Peter van der Putten, Jan N. van Rijn and Frank Hutter
11 | # | In *Advances in Intelligent Data Analysis XVII 17th International Symposium*, 2018
12 | # | Available at https://link.springer.com/chapter/10.1007%2F978-3-030-01768-2_25
13 |
14 | # %%
15 | import matplotlib.pyplot as plt
16 | import openml
17 | import pandas as pd
18 |
19 | # %% [markdown]
20 | # A basic step for each data-mining or machine learning task is to determine
21 | # which model to choose based on the problem and the data at hand. In this
22 | # work we investigate when non-linear classifiers outperform linear
23 | # classifiers by means of a large scale experiment.
24 | #
25 | # The paper is accompanied by a study object, containing all relevant tasks
26 | # and runs (``study_id=123``). The paper features three experiment classes:
27 | # Support Vector Machines (SVM), Neural Networks (NN) and Decision Trees (DT).
28 | # This example demonstrates how to reproduce the plots, comparing two
29 | # classifiers given the OpenML flow ids. Note that this allows us to reproduce
30 | # the SVM and NN experiment, but not the DT experiment, as this requires a bit
31 | # more effort to distinguish the same flow with different hyperparameter
32 | # values.
33 |
34 | # %%
35 | study_id = 123
36 | # for comparing svms: flow_ids = [7754, 7756]
37 | # for comparing nns: flow_ids = [7722, 7729]
38 | # for comparing dts: flow_ids = [7725], differentiate on hyper-parameter value
39 | classifier_family = "SVM"
40 | flow_ids = [7754, 7756]
41 | measure = "predictive_accuracy"
42 | meta_features = ["NumberOfInstances", "NumberOfFeatures"]
43 | class_values = ["non-linear better", "linear better", "equal"]
44 |
45 | # Downloads all evaluation records related to this study
46 | evaluations = openml.evaluations.list_evaluations(
47 | measure, size=None, flows=flow_ids, study=study_id, output_format="dataframe"
48 | )
49 | # gives us a table with columns data_id, flow1_value, flow2_value
50 | evaluations = evaluations.pivot(index="data_id", columns="flow_id", values="value").dropna()
51 | # downloads all data qualities (for scatter plot)
52 | data_qualities = openml.datasets.list_datasets(
53 | data_id=list(evaluations.index.values), output_format="dataframe"
54 | )
55 | # removes irrelevant data qualities
56 | data_qualities = data_qualities[meta_features]
57 | # makes a join between evaluation table and data qualities table,
58 | # now we have columns data_id, flow1_value, flow2_value, meta_feature_1,
59 | # meta_feature_2
60 | evaluations = evaluations.join(data_qualities, how="inner")
61 |
62 | # adds column that indicates the difference between the two classifiers
63 | evaluations["diff"] = evaluations[flow_ids[0]] - evaluations[flow_ids[1]]
64 |
65 | # %% [markdown]
66 | # makes the s-plot
67 |
68 | # %%
69 | fig_splot, ax_splot = plt.subplots()
70 | ax_splot.plot(range(len(evaluations)), sorted(evaluations["diff"]))
71 | ax_splot.set_title(classifier_family)
72 | ax_splot.set_xlabel("Dataset (sorted)")
73 | ax_splot.set_ylabel("difference between linear and non-linear classifier")
74 | ax_splot.grid(linestyle="--", axis="y")
75 | plt.show()
76 |
77 |
78 | # %% [markdown]
79 | # adds column that indicates the difference between the two classifiers,
80 | # needed for the scatter plot
81 |
82 |
83 | # %%
84 | def determine_class(val_lin, val_nonlin):
85 | if val_lin < val_nonlin:
86 | return class_values[0]
87 | elif val_nonlin < val_lin:
88 | return class_values[1]
89 | else:
90 | return class_values[2]
91 |
92 |
93 | evaluations["class"] = evaluations.apply(
94 | lambda row: determine_class(row[flow_ids[0]], row[flow_ids[1]]), axis=1
95 | )
96 |
97 | # does the plotting and formatting
98 | fig_scatter, ax_scatter = plt.subplots()
99 | for class_val in class_values:
100 | df_class = evaluations[evaluations["class"] == class_val]
101 | plt.scatter(df_class[meta_features[0]], df_class[meta_features[1]], label=class_val)
102 | ax_scatter.set_title(classifier_family)
103 | ax_scatter.set_xlabel(meta_features[0])
104 | ax_scatter.set_ylabel(meta_features[1])
105 | ax_scatter.legend()
106 | ax_scatter.set_xscale("log")
107 | ax_scatter.set_yscale("log")
108 | plt.show()
109 |
110 | # %% [markdown]
111 | # makes a scatter plot where each data point represents the performance of the
112 | # two algorithms, one on each axis (not in the paper)
113 |
114 | # %%
115 | fig_diagplot, ax_diagplot = plt.subplots()
116 | ax_diagplot.grid(linestyle="--")
117 | ax_diagplot.plot([0, 1], ls="-", color="black")
118 | ax_diagplot.plot([0.2, 1.2], ls="--", color="black")
119 | ax_diagplot.plot([-0.2, 0.8], ls="--", color="black")
120 | ax_diagplot.scatter(evaluations[flow_ids[0]], evaluations[flow_ids[1]])
121 | ax_diagplot.set_xlabel(measure)
122 | ax_diagplot.set_ylabel(measure)
123 | plt.show()
124 | # License: BSD 3-Clause
125 |
--------------------------------------------------------------------------------
/docs/examples/40_paper/README.txt:
--------------------------------------------------------------------------------
1 | Usage in research papers
2 | ========================
3 |
4 | These examples demonstrate how OpenML-Python can be used for research purposes by re-implementing
5 | its use in recent publications.
6 |
--------------------------------------------------------------------------------
/docs/examples/README.txt:
--------------------------------------------------------------------------------
1 | .. _examples-index:
2 |
3 | ================
4 | Examples Gallery
5 | ================
6 |
--------------------------------------------------------------------------------
/docs/examples/SUMMARY.md:
--------------------------------------------------------------------------------
1 | * Basic
2 | * [introduction_tutorial.py](20_basic/introduction_tutorial.py)
3 | * [simple_datasets_tutorial.py](20_basic/simple_datasets_tutorial.py)
4 | * [simple_flows_and_runs_tutorial.py](20_basic/simple_flows_and_runs_tutorial.py)
5 | * [simple_suites_tutorial.py](20_basic/simple_suites_tutorial.py)
6 | * Extended
7 | * [configure_logging.py](30_extended/configure_logging.py)
8 | * [create_upload_tutorial.py](30_extended/create_upload_tutorial.py)
9 | * [custom_flow_.py](30_extended/custom_flow_.py)
10 | * [datasets_tutorial.py](30_extended/datasets_tutorial.py)
11 | * [fetch_evaluations_tutorial.py](30_extended/fetch_evaluations_tutorial.py)
12 | * [fetch_runtimes_tutorial.py](30_extended/fetch_runtimes_tutorial.py)
13 | * [flow_id_tutorial.py](30_extended/flow_id_tutorial.py)
14 | * [flows_and_runs_tutorial.py](30_extended/flows_and_runs_tutorial.py)
15 | * [plot_svm_hyperparameters_tutorial.py](30_extended/plot_svm_hyperparameters_tutorial.py)
16 | * [run_setup_tutorial.py](30_extended/run_setup_tutorial.py)
17 | * [study_tutorial.py](30_extended/study_tutorial.py)
18 | * [suites_tutorial.py](30_extended/suites_tutorial.py)
19 | * [task_manual_iteration_tutorial.py](30_extended/task_manual_iteration_tutorial.py)
20 | * [tasks_tutorial.py](30_extended/tasks_tutorial.py)
21 | * Paper
22 | * [2015_neurips_feurer_example.py](40_paper/2015_neurips_feurer_example.py)
23 | * [2018_ida_strang_example.py](40_paper/2018_ida_strang_example.py)
24 | * [2018_kdd_rijn_example.py](40_paper/2018_kdd_rijn_example.py)
25 | * [2018_neurips_perrone_example.py](40_paper/2018_neurips_perrone_example.py)
26 |
--------------------------------------------------------------------------------
/docs/help/index.md:
--------------------------------------------------------------------------------
1 | ---
2 | icon: material/help-circle
3 | ---
4 |
5 | If you have questions, you can contact us in various ways.
6 | Here are the best ways to reach out depending on your questions.
7 |
8 | ## General questions
9 |
10 | For all non-technical questions, such as how to get involved, collaboration ideas, and interesting proposals, please use the communication channel of your choice.
11 | We believe in making the world better, together, so we're always happy to hear from you :fontawesome-solid-hand-peace:
12 |
13 |
14 |
15 |
16 | - :material-email:{ .lg .middle } Contact us via email
17 |
18 | ---
19 |
20 | Old school. We love it :). This email reaches all the main OpenML developers.
21 |
22 | [:octicons-arrow-right-24: Write an email](mailto:openmlhq@googlegroups.com)
23 |
24 | - :simple-slack:{ .lg .middle } Chat with us on Slack
25 |
26 | ---
27 |
28 | Get involved in current discussions or start your own.
29 |
30 | [:octicons-arrow-right-24: Join us on Slack](https://join.slack.com/t/openml/shared_invite/enQtODg4NjgzNTE4NjU3LTYwZDFhNzQ5NmE0NjIyNmM3NDMyMjFkZDQ0YWZkYWYxMTIxODFmMDhhMTUzMGYzMmM4NjIzYTZlYjBkOGE5MTQ)
31 |
32 | - :simple-x:{ .lg .middle } Social media
33 |
34 | ---
35 |
36 | You can reach us on X (formerly Twitter)
37 |
38 | [:octicons-arrow-right-24: Post something now](https://x.com/intent/post?screen_name=open_ml&text=%23openml.org)
39 |
40 |
41 |
42 | ## Technical questions
43 |
44 | For technical questions, the best way is to open an issue on GitHub.
45 | We have several issue trackers which are closely monitored by the people who can best answer your questions. We are happy to help, so don't be shy :smiley:
46 |
47 | ### OpenML client libraries
48 |
49 |
50 |
51 | - :simple-python:{ .lg .middle } Using OpenML in Python
52 |
53 | ---
54 |
55 | For all questions on using OpenML via the Python API
56 |
57 | [:octicons-arrow-right-24: Post your question](https://github.com/openml/openml-python/issues)
58 |
59 | - :material-language-r:{ .lg .middle } Using OpenML in R
60 |
61 | ---
62 |
63 | For all questions on using OpenML via the R API (via mlr3)
64 |
65 | [:octicons-arrow-right-24: Post your question](https://github.com/mlr-org/mlr3oml/issues)
66 |
67 | - :simple-julia:{ .lg .middle } Using OpenML in Julia
68 |
69 | ---
70 |
71 | For all questions on using OpenML via the Julia API
72 |
73 | [:octicons-arrow-right-24: Post your question](https://github.com/JuliaAI/OpenML.jl/issues)
74 |
75 | - :fontawesome-brands-java:{ .lg .middle } Using OpenML in Java
76 |
77 | ---
78 |
79 | For all questions on using OpenML via the Java API
80 |
81 |
82 | [:octicons-arrow-right-24: Post your question](https://github.com/openml/openml-java/issues)
83 |
84 |
85 |
86 |
87 | ### OpenML platform
88 |
89 |
90 |
91 | - :fontawesome-solid-layer-group:{ .lg .middle } OpenML Server
92 |
93 | ---
94 |
95 | For all questions on the OpenML server and REST API
96 |
97 | [:octicons-arrow-right-24: Version 1 of the API (PHP)](https://github.com/openml/OpenML/issues)
98 | [:octicons-arrow-right-24: Version 2 of the API (Python)](https://github.com/openml/server-api/issues)
99 |
100 | - :material-react:{ .lg .middle } OpenML Website
101 |
102 | ---
103 |
104 | For all questions about the OpenML website
105 |
106 | [:octicons-arrow-right-24: Post your question](https://github.com/openml/openml.org/issues)
107 |
108 | - :simple-docker:{ .lg .middle } OpenML deployment
109 |
110 | ---
111 |
112 | For all questions on deploying OpenML locally or how OpenML runs in production
113 |
114 | [:octicons-arrow-right-24: Post your question](https://github.com/openml/services/issues)
115 |
116 | - :simple-readthedocs:{ .lg .middle } OpenML documentation
117 |
118 | ---
119 |
120 | For all questions and suggestions for these documentation pages
121 |
122 |
123 | [:octicons-arrow-right-24: Post your question](https://github.com/openml/openml-docs/issues)
124 |
125 |
126 |
127 | ### OpenML content
128 |
129 |
130 |
131 | - :material-database:{ .lg .middle } Datasets
132 |
133 | ---
134 |
135 | For all questions about OpenML datasets
136 |
137 | [:octicons-arrow-right-24: Post your question](https://github.com/openml/data/issues)
138 |
139 | - :croissant:{ .lg .middle } Croissant
140 |
141 | ---
142 |
143 | For all questions about Croissant, the metadata standard we use for describing machine learning datasets
144 |
145 | [:octicons-arrow-right-24: Post your question](https://github.com/mlcommons/croissant/issues)
146 |
147 |
14 | Stand on the shoulders of giants, and collaborate in real time
15 | Make your work more visible and reusable
16 | Built for automation: streamline your experiments and model building
17 |
18 | ## Installation
19 |
20 | The OpenML package is available in many languages and across many libraries. For more information about them, see the [Integrations](./ecosystem/index.md) page.
21 |
22 | === "Python/sklearn"
23 |
24 | - [Python/sklearn repository](https://github.com/openml/openml-python)
25 | - `pip install openml`
26 |
27 | === "Pytorch"
28 |
29 | - [Pytorch repository](https://github.com/openml/openml-pytorch)
30 | - `pip install openml-pytorch`
31 |
32 | === "Keras"
33 |
34 | - [Keras repository](https://github.com/openml/openml-keras)
35 | - `pip install openml-keras`
36 |
37 | === "TensorFlow"
38 |
39 | - [TensorFlow repository](https://github.com/openml/openml-tensorflow)
40 | - `pip install openml-tensorflow`
41 |
42 | === "R"
43 |
44 | - [R repository](https://github.com/openml/openml-R)
45 | - `install.packages("mlr3oml")`
46 | === "Julia"
47 |
48 | - [Julia repository](https://github.com/JuliaAI/OpenML.jl/tree/master)
49 | - `using Pkg;Pkg.add("OpenML")`
50 |
51 | === "RUST"
52 |
53 | - [RUST repository](https://github.com/mbillingr/openml-rust)
54 | - Install from source
55 |
56 | === ".Net"
57 |
58 | - [.Net repository](https://github.com/openml/openml-dotnet)
59 | - `Install-Package openMl`
60 |
61 |
62 | You might also need to set up the API key. For more information, see [Authentication](./concepts/authentication.md).
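
For the Python package, a minimal sketch of setting the key programmatically looks like this (the key can be found on your OpenML profile page; the value below is only a placeholder):

```python
import openml

# Placeholder key, shown for illustration only.
openml.config.apikey = "YOUR_API_KEY"
```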
63 |
64 | ## Learning OpenML
65 |
66 | Aside from the documentation of the individual packages, you can learn more about OpenML through the following resources:
67 | The core concepts of OpenML are explained on the [Concepts](./concepts/index.md) page. These include the principles behind Datasets, Runs, Tasks, Flows, Benchmarking and much more. Going through them will help you leverage OpenML even better in your work.
68 |
69 | ## Contributing to OpenML
70 |
71 | OpenML is an open source project, hosted on GitHub. We welcome everybody to help improve OpenML, and make it more useful for everyone. For more information on how to contribute, see the [Contributing](./contributing/index.md) page.
72 |
73 | We want to make machine learning and data analysis **simple**, **accessible**, **collaborative** and **open** with an optimal **division of labour** between computers and humans.
74 |
75 | ## Want to get involved?
76 |
77 | Awesome, we're happy to have you! :tada:
78 |
79 | OpenML is dependent on the community. If you want to help, please email us (openmlHQ@googlegroups.com). If you already feel comfortable, you can help by opening issues or making pull requests on GitHub. We also have regular workshops you can join (they are announced on openml.org).
80 |
--------------------------------------------------------------------------------
/docs/intro/showcase.md:
--------------------------------------------------------------------------------
1 | # Research using OpenML
2 |
3 | This page will have a list of interesting research papers that have used OpenML. If you have used OpenML in your research and would like to have your paper listed here, please drop a PR with the relevant information (click the :material-pencil: icon above).
--------------------------------------------------------------------------------
/docs/intro/terms.md:
--------------------------------------------------------------------------------
1 | ## Honor Code
2 |
By joining OpenML, you join a special worldwide community of data scientists building on each other's results and connecting their minds as efficiently as possible. This community depends on your motivation to share data, tools and ideas, and to do so with honesty. In return, you will gain trust, visibility and reputation, igniting online collaborations and studies that otherwise may not have happened.
3 |
4 |
5 |
6 |
By using any part of OpenML, you agree to:
7 |
8 |
- Give credit where credit is due. Cite the authors whose work you are building on, or build collaborations where appropriate.
9 |
- Give back to the community by sharing your own data as openly and as soon as possible, or by helping the community in other ways. In doing so, you gain visibility and impact (citations).
10 |
- Share data according to your best efforts. Everybody makes mistakes, but we trust you to correct them as soon as possible. Remove or flag data that cannot be trusted.
11 |
- Be polite and constructive in all discussions. Criticism of methods is welcome, but personal criticism should be avoided.
12 |
- Respect circles of trust. OpenML allows you to collaborate in 'circles' of trusted people to share unpublished results. Be considerate in sharing data with people outside this circle.
13 |
- Do not steal the work of people who openly share it. OpenML makes it easy to find all shared data (and when it was shared), so everybody will know if you do.
14 |
15 |
16 | ## Terms of Use
17 |
You agree that you are responsible for your own use of OpenML.org and all content submitted by you, in accordance with the Honor Code and all applicable local, state, national and international laws.
18 |
By submitting or distributing content from OpenML.org, you affirm that you have the necessary rights, licenses, consents and/or permissions to reproduce and publish this content. You cannot upload sensitive or confidential data. You, and not the developers of OpenML.org, are solely responsible for your submissions.
19 |
By submitting content to OpenML.org, you grant OpenML.org the right to host, transfer, display and use this content, in accordance with your sharing settings and any licences granted by you. You also grant to each user a non-exclusive license to access and use this content for their own research purposes, in accordance with any licences granted by you.
20 |
You may maintain only one user account and may not let anyone else use your username and/or password. You may not impersonate other persons.
21 |
You may not attempt to damage, disable, or impair any OpenML server or interfere with any other party's use and enjoyment of the service. You may not attempt to gain unauthorized access to the Site, other accounts, computer systems or networks connected to any OpenML server. You may not obtain or attempt to obtain any materials or information not intentionally made available through OpenML.
22 |
Strictly prohibited is content that defames, harasses or threatens others, content that infringes another's intellectual property, as well as indecent or unlawful content, advertising, or intentionally inaccurate information posted with the intent of misleading others. It is also prohibited to post code containing viruses, malware, spyware or any other similar software that may damage the operation of another's computer or property.
23 |
--------------------------------------------------------------------------------
/docs/js/extra.js:
--------------------------------------------------------------------------------
1 | var element = document.getElementsByClassName("md-header-nav__topic")[0];
2 | element.innerHTML = "OpenML Documentation";
3 |
4 | (function() {
5 | var startingTime = new Date().getTime();
6 | // Load the script
7 | var script = document.createElement("SCRIPT");
8 | script.src = 'https://code.jquery.com/jquery-3.3.1.slim.min.js';
9 | script.type = 'text/javascript';
10 | script.onload = function() {
11 | var $ = window.jQuery;
12 | $(function() {
13 | $(document).ready(function() {
14 | if($('.framed-content').length == 1){
15 | $(".md-content").css('margin-right',0);
16 | $(".md-content__inner").css('margin-left',0);
17 | $(".md-content__inner").css('margin-right',0);
18 | $(".md-content__inner:before").css('display','none');
19 | $("article").find("h1").css('display','none');
20 | $("article > a").attr('target','_blank');
21 | }
22 | if($('.framed-python-guide').length == 1){
23 | $("article > a").attr('href','https://github.com/openml/openml-python/edit/master/doc/usage.rst');
24 | }
25 | if($('.framed-python-api').length == 1){
26 | $("article > a").attr('href','https://github.com/openml/openml-python/edit/master/doc/api.rst');
27 | }
28 | if($('.framed-python-start').length == 1){
29 | $("article > a").attr('href','https://github.com/openml/openml-python/edit/master/doc/index.rst');
30 | }
31 | if($('.framed-r').length == 1){
32 | $("article > a").attr('href','https://github.com/openml/openml-r/edit/master/vignettes/OpenML.Rmd');
33 | }
34 | if($('.framed-r-api').length == 1){
35 | $("article > a").css('display','none');
36 | }
37 | if($('.framed-java-api').length == 1){
38 | $("article > a").css('display','none');
39 | }
40 | });
41 | });
42 | };
43 | document.getElementsByTagName("head")[0].appendChild(script);
44 | })();
45 |
--------------------------------------------------------------------------------
/docs/old/altmetrics.md:
--------------------------------------------------------------------------------
1 | To encourage open science, OpenML now includes a score system to track and reward scientific activity, reach and impact, and in the future will include further gamification features such as badges. Because the system is still experimental and very much in development, the details are subject to change. Below, the score system is described in more detail, followed by our rationale for this system for those interested. If anything is unclear or you have any feedback on the system, do not hesitate to let us know.
2 |
3 | ## The scores
4 |
5 | All scores are awarded to users and involve datasets, flows, tasks and runs, or 'knowledge pieces' for short.
6 |
7 |
8 |
**Activity**
9 |
Activity score is awarded to users for contributing to the knowledge base of OpenML. This includes uploading knowledge pieces, leaving likes and downloading new knowledge pieces. Uploads are rewarded most strongly, with 3 activity, followed by likes, with 2 activity, and downloads are rewarded the least, with 1 activity.
10 |
11 |
12 |
**Reach**
13 |
Reach score is awarded to knowledge pieces, and by extension their uploaders, for the expressed interest of other users. It is increased by 2 for every user that leaves a like on a knowledge piece and by 1 for every user that downloads it for the first time.
14 |
15 |
16 |
**Impact**
17 |
Impact score is awarded to knowledge pieces, and by extension their uploaders, for the reuse of these knowledge pieces. A dataset is reused when it is used as input in a task, while flows and tasks are reused in runs. 1 impact is awarded for every reuse by a user that is not the uploader. The impact of a reused knowledge piece is further increased by half of the acquired reach and half of the acquired impact of the reuse, usually rounded down. So the impact of a dataset that is used in a single task with reach 10 and impact 5 is 8 (⌊1 + 0.5*10 + 0.5*5⌋).
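As a rough illustration of the rule above, here is a minimal sketch (the function name and signature are made up for illustration; the actual scoring runs on the OpenML server):

```python
import math

def impact_from_reuse(reach_of_reuse: int, impact_of_reuse: int) -> int:
    """One reuse contributes 1 point, plus half of the reuse's reach and
    half of its impact, with the total rounded down, as described above."""
    return math.floor(1 + 0.5 * reach_of_reuse + 0.5 * impact_of_reuse)

# Worked example from the text: a dataset used in a single task
# that itself has reach 10 and impact 5.
print(impact_from_reuse(10, 5))  # prints 8
```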
18 |
19 |
20 |
21 | ## The rationale
22 |
23 | One of OpenML's core ideas is to create an open science environment for sharing and exploring knowledge while getting credit for your work. The activity score encourages sharing and exploration. Reach makes exploration easier (by surfacing well-liked and/or often-downloaded knowledge pieces), while also providing a form of credit to the user. Impact is another form of credit that is closer in concept to citation scores.
24 |
25 | ## Where to find it
26 | The number of likes and downloads, as well as the reach and impact of knowledge pieces, can be found at the top of their respective pages, for example the Iris data set. In the top right you will also find the new Like button next to the already familiar download button.
27 |
28 | When searching for knowledge pieces on the search page, you will now be able to see the statistics mentioned above as well. In addition, you can sort the search results by their downloads, likes, reach or impact.
29 |
30 | On user profiles you will find all statistics relevant to that user, as well as graphs of their progress on the three scores.
31 |
32 | ## Badges
33 | Badges are intended to provide discrete goals for users to aim for. They are only in a conceptual phase; depending on the community's reaction, they will be developed further.
34 |
35 | The badges a user has acquired can be found on their user profile below the score graphs. The currently implemented badges are:
36 |
37 |
38 |
- **Clockwork Scientist**: for being active every day for a period of time.
39 |
- **Team Player**: for collaborating with other users, i.e. reusing a knowledge piece of someone who has reused a knowledge piece of yours.
40 |
- **Good News Everyone**: for achieving a high reach on a single knowledge piece you uploaded.
41 |
42 |
43 | ## Downvotes
44 | Although not part of the scores, downvotes have also been introduced. They are intended to indicate a flaw in a data set, flow, task or run that can be fixed, for example a missing description.
45 |
46 | If you want to indicate that something is wrong with a knowledge piece, click the number-of-issues statistic at the top of the page. A panel will open where you can either agree with an already raised issue anonymously or submit your own issue (not anonymously).
47 |
48 | You can also sort search results on the search page by the number of downvotes or issues.
49 |
50 | ## Opting out
51 | If you really do not like the altmetrics, you can opt out by changing the setting on your profile. This hides your scores and badges from other users and hides their scores and badges from you. You will still be able to see the number of likes, downloads and downvotes on knowledge pieces, and your likes, downloads and downvotes will still be counted.
52 |
--------------------------------------------------------------------------------
/docs/python/contributing.md:
--------------------------------------------------------------------------------
1 | # Contributing
2 |
3 | Contributions to the OpenML package are highly appreciated in all forms.
4 | In particular, a few ways to contribute to openml-python are:
5 |
6 | - A direct contribution to the package, by means of improving the
7 | code, documentation or examples. To get started, see [this
8 | file](https://github.com/openml/openml-python/blob/main/CONTRIBUTING.md)
9 | with details on how to set up your environment to develop for
10 | openml-python.
11 | - A contribution to an openml-python extension. An extension package
12 | allows OpenML to interface with a machine learning package (such
13 | as scikit-learn or keras). These extensions are hosted in separate
14 | repositories and may have their own guidelines. For more
15 | information, see also [extensions](extensions.md).
16 | - Bug reports. If something doesn't work for you or is cumbersome,
17 | please open a new issue to let us know about the problem. See
18 | [this
19 | section](https://github.com/openml/openml-python/blob/main/CONTRIBUTING.md).
20 | - [Cite OpenML](https://www.openml.org/cite) if you use it in a
21 | scientific publication.
22 | - Visit one of our [hackathons](https://www.openml.org/meet).
23 | - Contribute to another OpenML project, such as [the main OpenML
24 | project](https://github.com/openml/OpenML/blob/master/CONTRIBUTING.md).
25 |
--------------------------------------------------------------------------------
/docs/python/index.md:
--------------------------------------------------------------------------------
1 | ---
2 | icon: material/bookshelf
3 | ---
4 |
5 | # OpenML
6 |
7 | **Collaborative Machine Learning in Python**
8 |
9 | Welcome to the documentation of the OpenML Python API, a connector to
10 | the collaborative machine learning platform
11 | [OpenML.org](https://www.openml.org). The OpenML Python package allows
12 | you to use datasets and tasks from OpenML together with scikit-learn
13 | and to share the results online.
14 |
15 | ## Example
16 |
17 | ```python
18 | import openml
19 | from sklearn import impute, tree, pipeline
20 |
21 | # Define a scikit-learn classifier or pipeline
22 | clf = pipeline.Pipeline(
23 | steps=[
24 | ('imputer', impute.SimpleImputer()),
25 | ('estimator', tree.DecisionTreeClassifier())
26 | ]
27 | )
28 | # Download the OpenML task for the pendigits dataset with 10-fold
29 | # cross-validation.
30 | task = openml.tasks.get_task(32)
31 | # Run the scikit-learn model on the task.
32 | run = openml.runs.run_model_on_task(clf, task)
33 | # Publish the experiment on OpenML (optional, requires an API key.
34 | # You can get your own API key by signing up to OpenML.org)
35 | run.publish()
36 | print(f'View the run online: {run.openml_url}')
37 | ```
38 |
39 | Find more examples in the sidebar on the left.
40 |
41 | ## How to get OpenML for python
42 |
43 | You can install the OpenML package via `pip` (we recommend using a virtual environment):
44 |
45 | ```bash
46 | python -m pip install openml
47 | ```
48 |
49 | For more advanced installation information, please see the
50 | ["Introduction"](../examples/20_basic/introduction_tutorial.py) example.
51 |
52 |
53 | ## Further information
54 |
55 | - [OpenML documentation](https://docs.openml.org/)
56 | - [OpenML client APIs](https://docs.openml.org/APIs/)
57 | - [OpenML developer guide](https://docs.openml.org/contributing/)
58 | - [Contact information](https://www.openml.org/contact)
59 | - [Citation request](https://www.openml.org/cite)
60 | - [OpenML blog](https://medium.com/open-machine-learning)
61 | - [OpenML twitter account](https://twitter.com/open_ml)
62 |
63 | ## Contributing
64 |
65 | Contribution to the OpenML package is highly appreciated. Please see the
66 | ["Contributing"][contributing] page for more information.
67 |
68 | ## Citing OpenML-Python
69 |
70 | If you use OpenML-Python in a scientific publication, we would
71 | appreciate a reference to our JMLR-MLOSS paper
72 | ["OpenML-Python: an extensible Python API for OpenML"](https://www.jmlr.org/papers/v22/19-920.html):
73 |
74 | === "Bibtex"
75 |
76 | ```bibtex
77 | @article{JMLR:v22:19-920,
78 | author = {Matthias Feurer and Jan N. van Rijn and Arlind Kadra and Pieter Gijsbers and Neeratyoy Mallik and Sahithya Ravi and Andreas Müller and Joaquin Vanschoren and Frank Hutter},
79 | title = {OpenML-Python: an extensible Python API for OpenML},
80 | journal = {Journal of Machine Learning Research},
81 | year = {2021},
82 | volume = {22},
83 | number = {100},
84 | pages = {1--5},
85 | url = {http://jmlr.org/papers/v22/19-920.html}
86 | }
87 | ```
88 |
89 | === "MLA"
90 |
91 | Feurer, Matthias, et al.
92 | "OpenML-Python: an extensible Python API for OpenML."
93 | _Journal of Machine Learning Research_ 22.100 (2021):1−5.
94 |
--------------------------------------------------------------------------------
/mkdocs-local.yml:
--------------------------------------------------------------------------------
1 | # Short version of the docs (without any external doc imports) for faster editing and previewing
2 | # Run with `mkdocs serve -f mkdocs-local.yml`
3 |
4 | site_name: Open Machine Learning
5 | repo_url: https://github.com/openml/docs/
6 | repo_name: 'openml/docs'
7 | edit_uri: edit/master/docs/
8 | site_url: https://docs.openml.org/
9 | theme:
10 | name: "material"
11 | language: "en"
12 | features:
13 | - content.code.copy
14 | - content.action.edit
15 | - content.action.view
16 | - content.tabs.link
17 | - navigation.indexes
18 | - navigation.tabs
19 | - navigation.tabs.sticky
20 | - navigation.footer
21 | - search.highlight
22 | - search.suggest
23 | - toc.follow
24 | palette:
25 | # Palette toggle for automatic mode
26 | - media: "(prefers-color-scheme)"
27 | toggle:
28 | icon: material/brightness-auto
29 |
30 | # Palette toggle for light mode
31 | - media: "(prefers-color-scheme: light)"
32 | scheme: default
33 | toggle:
34 | icon: material/brightness-7
35 |
36 | # Palette toggle for dark mode
37 | - media: "(prefers-color-scheme: dark)"
38 | scheme: slate
39 | toggle:
40 | icon: material/brightness-4
41 | font:
42 | text: "Roboto"
43 | code: "Roboto Mono"
44 | logo: img/openml.png
45 | icon:
46 | edit: material/pencil
47 | view: material/eye
48 | markdown_extensions:
49 | - admonition
50 | - codehilite
51 | - tables
52 | - attr_list
53 | - md_in_html
54 | - toc:
55 | permalink: true
56 | - markdown_include.include:
57 | base_path: docs
58 | - pymdownx.arithmatex
59 | - pymdownx.betterem:
60 | smart_enable: all
61 | - pymdownx.caret
62 | - pymdownx.critic
63 | - pymdownx.details
64 | - pymdownx.inlinehilite
65 | - pymdownx.magiclink
66 | - pymdownx.mark
67 | - pymdownx.smartsymbols
68 | - pymdownx.superfences
69 | - pymdownx.tasklist:
70 | custom_checkbox: true
71 | - pymdownx.tilde
72 | - pymdownx.tabbed:
73 | alternate_style: true
75 | - pymdownx.emoji:
76 | emoji_index: !!python/name:material.extensions.emoji.twemoji
77 | emoji_generator: !!python/name:material.extensions.emoji.to_svg
78 | - pymdownx.highlight:
79 | anchor_linenums: true
80 | linenums: true
81 |
82 | plugins:
83 | - autorefs
84 | - section-index
85 | - redirects:
86 | redirect_maps:
87 | 'APIs.md': 'https://www.openml.org/apis'
88 | 'REST-API.md': 'https://www.openml.org/apis'
89 | - search:
90 | separator: '[\s\-\.]+'
91 | lang:
92 | - en
93 | - literate-nav:
94 | nav_file: SUMMARY.md
95 | - git-revision-date-localized:
96 | enable_creation_date: true
97 | fallback_to_build_date: true
98 | - git-committers:
99 | repository: openml/docs
100 | nav:
101 | - OpenML:
102 | - Introduction: index.md
103 | - Getting Started: notebooks/getting_started.ipynb
104 | - Concepts:
105 | - Main concepts: concepts/index.md
106 | - Data: concepts/data.md
107 | - Tasks: concepts/tasks.md
108 | - Flows: concepts/flows.md
109 | - Runs: concepts/runs.md
110 | - Collections & Benchmarks: concepts/benchmarking.md
111 | - Tagging: concepts/tagging.md
112 | - Authentication: concepts/authentication.md
113 | - Sharing: concepts/sharing.md
114 | - Showcase : intro/showcase.md
115 | - Governance: intro/Governance.md
116 | - Terms : intro/terms.md
117 | - Datasets:
118 | - Creating datasets: data/index.md
119 | - Using datasets: data/use.md
120 | - Technical specs: data/specs.md
121 | - Benchmarking:
122 | - Benchmarking Suites: benchmark/index.md
123 | - AutoML Benchmark:
124 | - AutoML Benchmark: benchmark/automl/AutoML-Benchmark.md
125 | - Important Parameters: benchmark/automl/important_params.md
126 | - Benchmark on OpenML: benchmark/automl/benchmark_on_openml.md
127 | - Tutorials:
128 | - Basic Example - Random Forest: benchmark/automl/basic_example.md
129 | - Specific Task and Fold: benchmark/automl/specific_task_fold_example.md
130 | - Ecosystem:
131 | - Overview: ecosystem/index.md
132 | - Python Integrations: ecosystem/Python_extensions.md
133 | - Scikit-learn:
134 | - Introduction: ecosystem/Scikit-learn/index.md
135 | - Tutorials:
136 | - Basic tutorial: ecosystem/Scikit-learn/basic_tutorial.ipynb
137 | - Loading a dataset: ecosystem/Scikit-learn/datasets_tutorial.ipynb
138 | - MLR: ecosystem/mlr.md
139 | - Weka: ecosystem/Weka.md
140 | - MOA: ecosystem/MOA.md
141 | - Java: ecosystem/Java.md
142 | - REST API: ecosystem/Rest.md
143 | - Contributing:
144 | - How to Contribute: contributing/index.md
145 | - Documentation: contributing/OpenML-Docs.md
146 | - Backend:
147 | - Local Installation: contributing/backend/Local-Installation.md
148 | - Services: services/README.md
149 | - Datasets: contributing/backend/Datasets.md
150 | - Code structure: contributing/backend/API-development.md
151 | - Evaluation Engine: contributing/backend/Java-App.md
152 | - Frontend:
153 | - Getting started: contributing/website/Website.md
154 | - Flask backend: contributing/website/Flask.md
155 | - React frontend: contributing/website/React.md
156 | - Dash visualizations: contributing/website/Dash.md
157 | - Clients:
158 | - Client development: contributing/clients/Client-API-Standards.md
159 | - Metadata definition: contributing/clients/metadata_definition.md
160 | - REST API: contributing/clients/Rest.md
161 | - Style guide : contributing/Style.md
162 | - Resources: contributing/resources.md
163 | - Questions: help/index.md
164 |
165 | extra_css:
166 | - css/extra.css
167 | extra_javascript:
168 | - js/extra.js
169 | exclude_docs: |
170 | scripts/
171 | old/
172 | python/openml/
173 | python/examples/
174 | pytorch/openml_pytorch/
175 | pytorch/Examples/index.md
176 | ecosystem/showcase.md
177 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | mkdocs==1.6.1
2 | mkdocs-material==9.5.40
3 | mkdocs-material-extensions==1.3.1
4 | mkdocs-redirects==1.2.1
5 | mkdocs-jupyter==0.25.0
6 | mkdocs-awesome-pages-plugin==2.9.3
7 | mkdocs-multirepo-plugin==0.8.3
8 | mkdocs-autorefs
9 | mkdocs-section-index
10 | mkdocs-gen-files
11 | mkdocs-literate-nav
12 | mkdocs-git-committers-plugin-2
13 | mkdocs-git-revision-date-localized-plugin
14 | mkdocstrings
15 | mkdocstrings-python
16 | markdown-include
17 | notebook==6.4.12
18 | tqdm
--------------------------------------------------------------------------------
/scripts/gen_python_ref_pages.py:
--------------------------------------------------------------------------------
1 | """
2 | Generate the code reference pages.
3 |
4 | based on https://github.com/mkdocstrings/mkdocstrings/blob/33aa573efb17b13e7b9da77e29aeccb3fbddd8e8/docs/recipes.md
5 | but modified for lack of "src/" file structure.
6 |
7 | """
8 |
9 | from pathlib import Path
10 | import shutil
11 | import mkdocs_gen_files
12 | import os
14 |
15 | # Move the python code and example folders into the root folder. This is necessary because the literate-nav has very strong
16 | # opinions on where the files should be located. It refuses to work from the temp_dir directory.
17 | def copy_folders_to_destinations(source_folders:list[str], destination_folders:list[str]):
18 | """
19 | Copies folders from source to specified destinations and overwrites if they already exist.
20 |
21 | Parameters:
22 | - source_folders (list of str): List of paths to the source folders.
23 | - destination_folders (list of str): List of full paths to the target directories, including the new folder names.
24 | """
25 | if len(source_folders) != len(destination_folders):
26 | return
27 |
28 | # Copy each folder to its specified destination
29 | for src, dest in zip(source_folders, destination_folders):
30 | # Ensure the parent directory of the destination path exists
31 | os.makedirs(os.path.dirname(dest), exist_ok=True)
32 |
33 | # Remove the folder if it already exists
34 | if os.path.exists(dest):
35 | shutil.rmtree(dest)
36 |
37 | # Copy the folder
38 | shutil.copytree(src, dest)
39 |
40 | temp_dir = Path(__file__).parent.parent / "temp_dir" / "python"
41 | source_folders = [
42 | temp_dir / "docs",
43 | temp_dir / "openml",
44 | temp_dir / "examples",
45 | ]
46 | destination_folders = [
47 | Path(__file__).parent.parent / "docs" / "python",
48 | Path(__file__).parent.parent / "openml",
49 | Path(__file__).parent.parent / "docs" / "examples" # Move them straight here to avoid duplication. mkdocs-jupyter will handle them.
50 | ]
51 | copy_folders_to_destinations(source_folders, destination_folders)
52 |
53 | # Generate the reference page docs
54 | nav = mkdocs_gen_files.Nav()
55 | root = Path(__file__).parent.parent
56 | src = root / "openml"
57 | edit_path_root = "/openml/openml-python/blob/docs/mkdoc/"
58 |
59 | for path in sorted(src.rglob("*.py")):
60 | module_path = path.relative_to(root).with_suffix("")
61 | doc_path = path.relative_to(src).with_suffix(".md")
62 | full_doc_path = Path("reference", doc_path)
63 |
64 | parts = tuple(module_path.parts)
65 |
66 | if parts[-1] == "__init__":
67 | parts = parts[:-1]
68 | doc_path = doc_path.with_name("index.md")
69 | full_doc_path = full_doc_path.with_name("index.md")
70 | elif parts[-1] == "__main__":
71 | continue
72 |
73 | if len(parts) > 1 and not parts[1].startswith("_"):
74 | nav[parts[1:]] = doc_path.as_posix()
75 |
76 | with mkdocs_gen_files.open(full_doc_path, "w") as fd:
77 | identifier = ".".join(parts)
78 | print("::: " + identifier, file=fd)
79 |
80 | mkdocs_gen_files.set_edit_path(full_doc_path, Path(edit_path_root) / path.relative_to(root))
81 |
82 | with mkdocs_gen_files.open("reference/SUMMARY.md", "w") as nav_file:
83 | nav_file.writelines(nav.build_literate_nav())
84 |
85 | # Generate the example page index
86 | nav = mkdocs_gen_files.Nav()
87 | examples_src = root / "docs" / "examples"
88 | for path in sorted(examples_src.rglob("*.py")):
89 | dest_path = path.relative_to(examples_src)
90 |
91 | # Temporary. Renames the ugly folder names
92 | parts = list(dest_path.parts)
93 | parts[0] = parts[0].split("_", 1)[-1].capitalize()
94 | parts = tuple(parts)
95 |
96 | if len(parts) > 1:
97 | nav[parts] = dest_path.as_posix()
98 | with open(examples_src / "SUMMARY.md", "w") as nav_file:
99 | nav_file.writelines(nav.build_literate_nav())
100 |
101 | # Add icon to the reference pages
102 | content_to_add = "---\nicon: material/bookshelf\n---\n\n"
103 | index_file = root / "docs" / "python" / "index.md"
104 | with open(index_file, "r+") as file:
105 | original_content = file.read()
106 | file.seek(0)
107 | file.write(content_to_add + original_content)
108 |
--------------------------------------------------------------------------------
/scripts/github_scraper.py:
--------------------------------------------------------------------------------
1 | """
2 | Script to scrape the github repositories of the projects in the showcase_urls.txt file and generate a markdown file with a grid of cards with the information of the repositories.
3 |
4 | Does not rely on the GitHub API, so it is limited to the information that can be scraped from the GitHub website.
5 |
6 | Inspired in part from https://brightdata.com/blog/how-tos/how-to-scrape-github-repositories-in-python
7 | """
8 |
9 | import requests
10 | from bs4 import BeautifulSoup
11 | from tqdm import tqdm
12 |
13 | with open("showcase_urls.txt", "r") as file:
14 | target_urls = file.readlines()
15 | target_urls = [url.strip() for url in target_urls]
16 |
17 | def get_github_info(target_url:str)->tuple[str, str, str]:
18 | """
19 | Get the name, description and number of stars of a GitHub repository from its URL.
20 | """
21 | print(target_url)
22 | page = requests.get(target_url)
23 | soup = BeautifulSoup(page.text, "html.parser")
24 | name_html_element = soup.select_one('[itemprop="name"]')
25 | name = name_html_element.text.strip()
26 |
27 | bordergrid_html_element = soup.select_one(".BorderGrid")
28 | about_html_element = bordergrid_html_element.select_one("h2")
29 | description_html_element = about_html_element.find_next_sibling("p")
30 | description = description_html_element.get_text().strip()
31 |
32 | star_icon_html_element = bordergrid_html_element.select_one(".octicon-star")
33 | stars_html_element = star_icon_html_element.find_next_sibling("strong")
34 | stars = stars_html_element.get_text().strip().replace(",", "")
35 |
36 | return name, description, stars
37 |
38 |
39 | def return_details(target_urls:list[str])->dict[str, dict[str, str]]:
40 | """
41 | For a list of GitHub URLs, return a dictionary with the name, description and number of stars of the repositories.
42 | """
43 | target_urls = set(target_urls) # remove duplicates
44 | urls = {}
45 | for target_url in target_urls:
46 | name, description, stars = get_github_info(target_url)
47 | if len(name) > 0:
48 | urls[target_url] = {
49 | "name": name,
50 | "description": description,
51 | "stars": stars,
52 | }
53 | # sort by stars
54 | urls = dict(
55 | sorted(urls.items(), key=lambda item: int(item[1]["stars"]), reverse=True)
56 | )
57 | return urls
58 |
59 |
60 | def return_div(url:str, urls:dict[str, dict[str, str]]):
61 | """
62 | Return a div element with the information of a GitHub repository. Creates a card with the name, description and number of stars of the repository.
63 |
64 | Example CSS
65 |
66 | .card-container {
67 | display: flex;
68 | flex-wrap: wrap;
69 | gap: 20px;
70 | justify-content: center;
71 | }
72 |
73 | .card {
74 | border: 1px solid #ccc;
75 | border-radius: 5px;
76 | padding: 20px;
77 | width: 300px;
78 | box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
79 | }
80 |
81 | .card h2 {
82 | margin-top: 0;
83 | }
84 |
85 | .card p {
86 | margin-bottom: 0;}
87 |
88 | .github-logo {
89 | height: 15px;
90 | width: 13px;
91 | margin-left: 10px;
92 | }
93 |
94 | iframe[seamless] {
95 | border: none;
96 | }
97 | """
98 | info = urls[url]
99 | return f"""
100 | \n
\n
104 | """
105 |
106 |
107 | def generate_page(info:dict[str,str]):
108 | """
109 | Generate a page with a grid of cards with the information of the repositories.
110 | """
111 |
112 | page = """
\n"""
113 | for target_url in tqdm(info.keys(), total=len(info)):
114 | page += return_div(target_url, info)
115 | page += "