├── .gitignore
├── conda.yml
├── tower.yml
├── modules
│   ├── fetch_dataset
│   │   ├── main.nf
│   │   └── resources
│   │       └── usr
│   │           └── bin
│   │               └── fetch-dataset.py
│   ├── visualize
│   │   ├── main.nf
│   │   └── resources
│   │       └── usr
│   │           └── bin
│   │               └── visualize.py
│   ├── split_train_test
│   │   ├── main.nf
│   │   └── resources
│   │       └── usr
│   │           └── bin
│   │               └── split-train-test.py
│   ├── predict
│   │   ├── main.nf
│   │   └── resources
│   │       └── usr
│   │           └── bin
│   │               └── predict.py
│   └── train
│       ├── main.nf
│       └── resources
│           └── usr
│               └── bin
│                   └── train.py
├── .github
│   └── workflows
│       └── build.yml
├── nextflow.config
├── README.md
├── nextflow_schema.json
├── main.nf
└── LICENSE

/.gitignore:
--------------------------------------------------------------------------------
.nextflow*
.idea
data
results
work
--------------------------------------------------------------------------------

/conda.yml:
--------------------------------------------------------------------------------
name: hyperopt
channels:
  - defaults
dependencies:
  - matplotlib
  - numpy
  - pandas
  - python=3.10
  - scikit-learn
--------------------------------------------------------------------------------

/tower.yml:
--------------------------------------------------------------------------------
reports:
  "*.data.txt":
    display: "Input datasets"
  "*.train.txt":
    display: "Training data"
  "*.test.txt":
    display: "Test data"
  "*.png":
    display: "t-SNE plots"
  "*.score.json":
    display: "Evaluation scores"
--------------------------------------------------------------------------------

/modules/fetch_dataset/main.nf:
--------------------------------------------------------------------------------
process fetch_dataset {
    publishDir params.outdir, mode: 'copy', saveAs: { file -> "${dataset_name}.${file}" }
    tag "${dataset_name}"

    input:
    val(dataset_name)

    output:
    tuple val(dataset_name), path('data.txt'), path('meta.json'), emit: datasets

    script:
    """
    fetch-dataset.py --name ${dataset_name}
    """
}
--------------------------------------------------------------------------------

/modules/visualize/main.nf:
--------------------------------------------------------------------------------
process visualize {
    publishDir params.outdir, mode: 'copy', saveAs: { file -> "${dataset_name}.${file}" }

    input:
    tuple val(dataset_name), path(data_file), path(meta_file)

    output:
    tuple val(dataset_name), path('*.png'), emit: plots

    script:
    """
    visualize.py \
        --data ${data_file} \
        --meta ${meta_file} \
        --outfile `basename ${data_file} .txt`.png
    """
}
--------------------------------------------------------------------------------

/modules/split_train_test/main.nf:
--------------------------------------------------------------------------------
process split_train_test {
    publishDir params.outdir, mode: 'copy', saveAs: { file -> "${dataset_name}.${file}" }
    tag "${dataset_name}"

    input:
    tuple val(dataset_name), path(data_file), path(meta_file)

    output:
    tuple val(dataset_name), path('train.txt'), path(meta_file), emit: train_datasets
    tuple val(dataset_name), path('test.txt'), path(meta_file), emit: test_datasets

    script:
    """
    split-train-test.py --data ${data_file}
    """
}
--------------------------------------------------------------------------------
/modules/predict/main.nf:
--------------------------------------------------------------------------------
process predict {
    publishDir params.outdir, mode: 'copy', saveAs: { file -> "${dataset_name}.${model_type}.${file}" }
    tag "${dataset_name}/${model_type}"

    input:
    tuple val(dataset_name), val(model_type), path(model_file), path(data_file), path(meta_file)

    output:
    tuple val(dataset_name), val(model_type), path('score.json'), emit: scores
    tuple val(dataset_name), val(model_type), stdout, emit: logs

    script:
    """
    predict.py \
        --model ${model_file} \
        --data ${data_file} \
        --meta ${meta_file}
    """
}
--------------------------------------------------------------------------------

/modules/train/main.nf:
--------------------------------------------------------------------------------
process train {
    publishDir params.outdir, mode: 'copy', saveAs: { file -> "${dataset_name}.${model_type}.${file}" }
    tag "${dataset_name}/${model_type}"

    input:
    tuple val(dataset_name), path(data_file), path(meta_file)
    each model_type

    output:
    tuple val(dataset_name), val(model_type), path('model.pkl'), emit: models
    tuple val(dataset_name), val(model_type), stdout, emit: logs

    script:
    """
    train.py \
        --data ${data_file} \
        --meta ${meta_file} \
        --scaler standard \
        --model-type ${model_type}
    """
}
--------------------------------------------------------------------------------

/modules/split_train_test/resources/usr/bin/split-train-test.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

import argparse
import pandas as pd
from sklearn.model_selection import train_test_split


if __name__ == '__main__':
    # parse command-line arguments
    parser = argparse.ArgumentParser(description='Split a dataset into train/test sets')
    parser.add_argument('--data', help='data file', required=True)
    parser.add_argument('--train-size', help='training set proportion', type=float, default=0.8)
    parser.add_argument('--train-data', help='training data file', default='train.txt')
    parser.add_argument('--test-data', help='test data file', default='test.txt')

    args = parser.parse_args()

    # load dataset
    df = pd.read_csv(args.data, index_col=0, sep='\t')

    # split dataset into train/test sets
    df_train, df_test = train_test_split(df, test_size=1 - args.train_size)

    # save datasets
    df_train.to_csv(args.train_data, sep='\t')
    df_test.to_csv(args.test_data, sep='\t')
--------------------------------------------------------------------------------
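Note: `split-train-test.py` samples rows uniformly at random, so a rare class can end up under-represented in one of the two sets. A stratified variant is a one-line change with scikit-learn; the sketch below assumes you know the target column (the `target` argument is hypothetical and not a parameter of the script above):

```python
import pandas as pd
from sklearn.model_selection import train_test_split


def split_stratified(data_file, target, train_size=0.8):
    # sketch: preserve class proportions in both splits (assumes `target`
    # names a categorical column; not a feature of split-train-test.py)
    df = pd.read_csv(data_file, index_col=0, sep='\t')
    return train_test_split(df, train_size=train_size, stratify=df[target])
```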
/.github/workflows/build.yml:
--------------------------------------------------------------------------------
name: hyperopt
on:
  push:
    branches:
      - 'master'
  schedule:
    - cron: '5 5 * * *'

jobs:
  build:
    name: hyperopt ci
    runs-on: ubuntu-latest
    timeout-minutes: 10
    strategy:
      fail-fast: false
      matrix:
        java_version: ['17']

    steps:
      - name: Environment
        run: env | sort

      - name: Checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 1
          submodules: true

      - name: Setup Java
        uses: actions/setup-java@v4
        with:
          java-version: ${{ matrix.java_version }}
          distribution: 'temurin'
          architecture: x64

      - name: Tests
        run: |
          curl -fsSL get.nextflow.io | bash
          export NXF_EDGE=1
          ./nextflow -self-update
          ./nextflow run . -profile wave
        env:
          NXF_ANSI_LOG: false
          NXF_TRACE: nextflow.processor

      - name: Upload logs
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: nextflow-logs-${{ matrix.java_version }}
          path: |
            .nextflow.*
--------------------------------------------------------------------------------

/modules/fetch_dataset/resources/usr/bin/fetch-dataset.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

import argparse
import json
from sklearn.datasets import fetch_openml


def is_categorical(y):
    return y.dtype.kind in 'OSUV'


def get_categories(df):
    result = {}
    for c in df.columns:
        if is_categorical(df[c]):
            values = df[c].unique().tolist()

            # fix bug with numerical categories
            if sum(v.isdigit() for v in values) == len(values):
                values = [int(v) for v in values]

            result[c] = values

    return result


if __name__ == '__main__':
    # parse command-line arguments
    parser = argparse.ArgumentParser(description='Download an OpenML dataset')
    parser.add_argument('--name', help='dataset name', required=True)
    parser.add_argument('--data', help='data file', default='data.txt')
    parser.add_argument('--meta', help='metadata file', default='meta.json')

    args = parser.parse_args()

    # download dataset from openml
    dataset = fetch_openml(args.name, as_frame=True)

    # save data
    dataset.frame.to_csv(args.data, sep='\t')

    # save metadata
    meta = {
        'name': args.name,
        'feature_names': dataset.feature_names,
        'target_names': dataset.target_names,
        'categories': get_categories(dataset.frame)
    }

    with open(args.meta, 'w') as f:
        json.dump(meta, f)
--------------------------------------------------------------------------------
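The `meta.json` written above is the contract that every downstream script relies on. A minimal sketch of reading a dataset/metadata pair the same way `visualize.py`, `train.py`, and `predict.py` do (file names are this script's defaults):

```python
import json
import pandas as pd

# load the pair of files produced by fetch-dataset.py
df = pd.read_csv('data.txt', index_col=0, sep='\t')
with open('meta.json') as f:
    meta = json.load(f)

x = df[meta['feature_names']]        # input features
target = meta['target_names'][0]     # name of the target column
y = df[target]
# classification if the target appears in the category map, else regression
is_classification = target in meta['categories']
```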
/nextflow.config:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2022, Seqera Labs.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * This Source Code Form is "Incompatible With Secondary Licenses", as
 * defined by the Mozilla Public License, v. 2.0.
 *
 */
manifest {
    description = 'Proof-of-concept pipeline for performing hyperparameter optimization of machine learning models with Nextflow'
    author = 'Ben Sherman'
    nextflowVersion = '>=23.10.0'
}

nextflow {
    enable.moduleBinaries = true
}

/*
 * Default pipeline parameters. They can be overridden on the command line,
 * e.g. given `params.foo`, specify `--foo some_value` on the run command line.
 */
params {
    fetch_dataset = true
    dataset_name = 'wdbc'

    visualize = true

    datadir = 'data'

    train = true
    train_data = "${params.datadir}/*.train.txt"
    train_meta = "${params.datadir}/*.meta.json"
    train_models = ['dummy', 'gb', 'lr', 'mlp', 'rf']

    predict = true
    predict_models = "${params.datadir}/*.pkl"
    predict_data = "${params.datadir}/*.predict.txt"
    predict_meta = "${params.datadir}/*.meta.json"

    outdir = 'results'
}


/*
 * Execution profiles for different environments.
 */
profiles {
    slurm {
        process.executor = 'slurm'
    }

    conda {
        process.conda = "$baseDir/conda.yml"
        conda.enabled = true
    }

    wave {
        process.conda = "$baseDir/conda.yml"
        docker.enabled = true
        wave.enabled = true
        wave.strategy = 'conda'
    }
}
--------------------------------------------------------------------------------

/modules/visualize/resources/usr/bin/visualize.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

import argparse
import json
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.manifold import TSNE


def encode_onehot(x, categories):
    # copy to avoid mutating a slice of the original dataframe
    x = x.copy()

    for column, values in categories.items():
        if column in x:
            for v in values:
                x['%s_%s' % (column, v)] = (x[column] == v)
            x = x.drop(columns=[column])

    return x


if __name__ == '__main__':
    # parse command-line arguments
    parser = argparse.ArgumentParser(description='Visualize a dataset with t-SNE')
    parser.add_argument('--data', help='data file', required=True)
    parser.add_argument('--meta', help='metadata file', required=True)
    parser.add_argument('--outfile', help='output plot file', required=True)

    args = parser.parse_args()

    # load dataset
    df = pd.read_csv(args.data, index_col=0, sep='\t')

    with open(args.meta, 'r') as f:
        meta = json.load(f)

    # extract input features
    x = df[meta['feature_names']]
    x = encode_onehot(x, meta['categories'])

    # extract target column
    target = meta['target_names'][0]
    y = df[target]

    # compute t-SNE embedding
    x_tsne = TSNE().fit_transform(x)

    # plot t-SNE embedding with class labels or colorbar
    plt.axis('off')

    if target in meta['categories']:
        classes = meta['categories'][target]

        for c in classes:
            indices = (y == c)
            plt.scatter(x_tsne[indices, 0], x_tsne[indices, 1], label=c, edgecolors='w')

        plt.subplots_adjust(right=0.70)
        plt.legend(loc='upper left', bbox_to_anchor=(1, 1))

    else:
        plt.scatter(x_tsne[:, 0], x_tsne[:, 1], c=y, edgecolors='w')
        plt.colorbar()

    plt.savefig(args.outfile)
    plt.close()
--------------------------------------------------------------------------------
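Note: `encode_onehot` above is duplicated verbatim in `predict.py` and `train.py`. It keys the dummy columns off the category lists recorded in `meta.json`, which guarantees that train and test splits produce identical columns even when one split is missing a category. A sketch of the same idea expressed with `pd.get_dummies` (an alternative, not what the pipeline ships):

```python
import pandas as pd


def encode_onehot_dummies(x: pd.DataFrame, categories: dict) -> pd.DataFrame:
    # pin each categorical column to the category list from meta.json so
    # every split yields the same '<column>_<value>' dummy columns
    x = x.copy()
    for column, values in categories.items():
        if column in x:
            x[column] = pd.Categorical(x[column], categories=values)
    return pd.get_dummies(x)
```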
/modules/predict/resources/usr/bin/predict.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

import argparse
import json
import pandas as pd
import pickle
from sklearn.metrics import accuracy_score, r2_score


def encode_onehot(x, categories):
    # copy to avoid mutating a slice of the original dataframe
    x = x.copy()

    for column, values in categories.items():
        if column in x:
            for v in values:
                x['%s_%s' % (column, v)] = (x[column] == v)
            x = x.drop(columns=[column])

    return x


if __name__ == '__main__':
    # parse command-line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', help='trained model file', required=True)
    parser.add_argument('--data', help='data file', required=True)
    parser.add_argument('--meta', help='metadata file', required=True)
    parser.add_argument('--outfile', help='score file', default='score.json')

    args = parser.parse_args()

    # load model
    print('loading model')

    with open(args.model, 'rb') as f:
        model = pickle.load(f)

    # load dataset
    print('loading dataset')

    df = pd.read_csv(args.data, index_col=0, sep='\t')

    with open(args.meta, 'r') as f:
        meta = json.load(f)

    # extract input features
    x = df[meta['feature_names']]
    x = encode_onehot(x, meta['categories'])

    # extract target column
    target = meta['target_names'][0]
    y_true = df[target]
    is_categorical = target in meta['categories']

    # perform inference
    print('performing inference')

    y_pred = model.predict(x)

    if is_categorical:
        classes = meta['categories'][target]
        y_pred = [classes[v] for v in y_pred]

    for sample_name, v_pred, v_true in zip(df.index, y_pred, y_true):
        print('%8s: %8s (%8s)' % (sample_name, v_pred, v_true))

    # save score
    if is_categorical:
        score = {
            'name': 'accuracy',
            'value': accuracy_score(y_true, y_pred)
        }

    else:
        score = {
            'name': 'r2',
            'value': r2_score(y_true, y_pred)
        }

    print('%s: %0.3f' % (score['name'], score['value']))

    with open(args.outfile, 'w') as f:
        json.dump(score, f)
--------------------------------------------------------------------------------
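Because the `train` step publishes plain pickled scikit-learn pipelines, a saved model can also be reused outside the workflow. A minimal sketch (the `results/wdbc.rf.model.pkl` path is illustrative; actual names follow the `<dataset>.<model>.model.pkl` pattern from the `train` module, and the input must be one-hot encoded exactly as during training):

```python
import pickle
import pandas as pd

# load a model published by the train step (illustrative file name)
with open('results/wdbc.rf.model.pkl', 'rb') as f:
    model = pickle.load(f)

# the Pipeline object applies its scaler before the estimator, so the
# (already one-hot encoded) feature columns can be passed in directly
x = pd.read_csv('new-samples.txt', index_col=0, sep='\t')
print(model.predict(x))
```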
/README.md:
--------------------------------------------------------------------------------
# hyperopt

A proof-of-concept pipeline for performing hyperparameter optimization of machine learning models with Nextflow.


## Requirements

* Unix-like operating system (Linux, macOS, etc.)
* Java >=11
* [Conda](https://docs.conda.io/en/latest/) or [Docker](https://docs.docker.com/)


## Quickstart

1. Install Nextflow (version 23.10.x or higher):

    ```bash
    curl -s https://get.nextflow.io | bash
    ```

2. Launch the pipeline:

    ```bash
    # use conda natively (requires Conda)
    ./nextflow run nextflow-io/hyperopt -profile conda

    # use Wave containers (requires Docker)
    ./nextflow run nextflow-io/hyperopt -profile wave
    ```

3. When the pipeline completes, you can view the training and prediction results in the `results` folder.

Note: the first time you execute the pipeline, Nextflow will take a few minutes to download the pipeline code from this GitHub repository along with any related software dependencies (e.g. conda packages or Docker images).


## Configuration

The hyperopt pipeline consists of the following steps:

1. Download a dataset
2. Split the dataset into train/test sets
3. Visualize the train/test sets
4. Train a variety of models on the training set
5. Evaluate each model on the test set
6. Select the best model based on evaluation score

You can control many aspects of this workflow with the pipeline parameters, including:

* Enable/disable each individual step
* Download a different dataset (the default is `wdbc`; see [OpenML.org](https://www.openml.org/search?type=data&status=active) for the available datasets)
* Provide your own training data instead of downloading it
* Provide your own pre-trained model and test data
* Select different models (see the `train` module for all available options)

See the `nextflow.config` file for the full list of pipeline parameters.


## Cluster support

Since [Nextflow](http://www.nextflow.io) provides an abstraction between the pipeline logic and the underlying execution environment, the hyperopt pipeline can be executed on a single computer or an HPC cluster without any modifications.

Visit the [Nextflow documentation](https://www.nextflow.io/docs/latest/executor.html) to see which HPC schedulers are supported and how to use them.


## Components

The hyperopt pipeline uses Python (>=3.10) and several Python packages for machine learning and data science. These dependencies are defined in the `conda.yml` file.
--------------------------------------------------------------------------------
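For example, the evaluation scores published to the `results` folder can be ranked with a few lines of Python (a sketch; the file names follow the `<dataset>.<model>.score.json` pattern used by the `predict` module):

```python
import glob
import json

# collect the scores written by the predict step and rank the models
scores = []
for path in glob.glob('results/*.score.json'):
    with open(path) as f:
        scores.append((path, json.load(f)['value']))

for path, value in sorted(scores, key=lambda s: s[1], reverse=True):
    print('%s: %0.3f' % (path, value))
```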
/nextflow_schema.json:
--------------------------------------------------------------------------------
{
    "$schema": "http://json-schema.org/draft-07/schema",
    "$id": "https://raw.githubusercontent.com/nextflow-io/hyperopt/master/nextflow_schema.json",
    "title": "hyperopt pipeline parameters",
    "description": "Proof-of-concept pipeline for training and evaluating machine learning models with Nextflow. Not all parameters are included in this schema; refer to nextflow.config for more details.",
    "type": "object",
    "definitions": {
        "pipeline_options": {
            "title": "Pipeline options",
            "type": "object",
            "description": "",
            "default": "",
            "fa_icon": "fas fa-terminal",
            "properties": {
                "fetch_dataset": {
                    "type": "boolean",
                    "description": "Fetch a dataset by name from OpenML",
                    "default": true,
                    "fa_icon": "fas fa-download"
                },
                "dataset_name": {
                    "type": "string",
                    "description": "Name of dataset to fetch from OpenML",
                    "default": "wdbc",
                    "fa_icon": "fas fa-file-csv"
                },
                "visualize": {
                    "type": "boolean",
                    "description": "Visualize the training and test sets",
                    "default": true,
                    "fa_icon": "fas fa-chart-line"
                },
                "datadir": {
                    "type": "string",
                    "description": "Directory that contains any input datasets and pre-trained models",
                    "default": "data",
                    "fa_icon": "fas fa-folder-open"
                },
                "train": {
                    "type": "boolean",
                    "description": "Train a set of models on the given training set(s)",
                    "default": true,
                    "fa_icon": "fas fa-dumbbell"
                },
                "predict": {
                    "type": "boolean",
                    "description": "Evaluate a set of trained models on the given test set(s)",
                    "default": true,
                    "fa_icon": "fas fa-balance-scale-right"
                },
                "outdir": {
                    "type": "string",
                    "description": "Directory to publish output data",
                    "default": "results",
                    "fa_icon": "fas fa-folder-open"
                }
            }
        }
    },
    "allOf": [
        {
            "$ref": "#/definitions/pipeline_options"
        }
    ]
}
--------------------------------------------------------------------------------
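Since the schema declares JSON Schema draft-07, a parameter set can also be validated outside Nextflow. A sketch using the third-party `jsonschema` package (an assumption — it is not among this pipeline's dependencies):

```python
import json
from jsonschema import validate  # third-party: pip install jsonschema

with open('nextflow_schema.json') as f:
    schema = json.load(f)

# raises jsonschema.exceptions.ValidationError on a type mismatch
validate(instance={'dataset_name': 'wdbc', 'train': True}, schema=schema)
```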
/main.nf:
--------------------------------------------------------------------------------
#!/usr/bin/env nextflow

/*
 * Copyright (c) 2022, Seqera Labs.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * This Source Code Form is "Incompatible With Secondary Licenses", as
 * defined by the Mozilla Public License, v. 2.0.
 *
 */
import groovy.json.JsonSlurper

include { fetch_dataset } from './modules/fetch_dataset'
include { split_train_test } from './modules/split_train_test'
include { visualize } from './modules/visualize'
include { train } from './modules/train'
include { predict } from './modules/predict'


log.info """
    M L - H Y P E R O P T   P I P E L I N E
    =======================================
    fetch_dataset  : ${params.fetch_dataset}
    dataset_name   : ${params.dataset_name}

    visualize      : ${params.visualize}

    train          : ${params.train}
    train_data     : ${params.train_data}
    train_meta     : ${params.train_meta}
    train_models   : ${params.train_models}

    predict        : ${params.predict}
    predict_models : ${params.predict_models}
    predict_data   : ${params.predict_data}
    predict_meta   : ${params.predict_meta}

    outdir         : ${params.outdir}
    """


/*
 * main script flow
 */
workflow {
    // fetch dataset if specified
    if ( params.fetch_dataset == true ) {
        ch_datasets = fetch_dataset(params.dataset_name)

        (ch_train_datasets, ch_predict_datasets) = split_train_test(ch_datasets)
    }

    // otherwise load input files
    else {
        ch_train_data = Channel.fromFilePairs(params.train_data, size: 1, flat: true)
        ch_train_meta = Channel.fromFilePairs(params.train_meta, size: 1, flat: true)
        ch_train_datasets = ch_train_data.join(ch_train_meta)

        ch_predict_data = Channel.fromFilePairs(params.predict_data, size: 1, flat: true)
        ch_predict_meta = Channel.fromFilePairs(params.predict_meta, size: 1, flat: true)
        ch_predict_datasets = ch_predict_data.join(ch_predict_meta)
    }

    // visualize train/test sets
    if ( params.visualize == true ) {
        visualize(ch_train_datasets.concat(ch_predict_datasets))
    }

    // print warning if both training and pre-trained model are enabled
    if ( params.train == true && params.predict_models != null ) {
        log.warn 'Training is enabled but pre-trained model(s) are also provided, pre-trained models will be ignored'
    }

    // perform training if specified
    if ( params.train == true ) {
        (ch_models, ch_train_logs) = train(ch_train_datasets, params.train_models)
    }

    // otherwise load trained model if specified
    else if ( params.predict_models != null ) {
        ch_models = Channel.fromFilePairs(params.predict_models, size: 1, flat: true)
            | map { [it[0], 'pretrained', it[1]] }
    }

    // perform inference if specified
    if ( params.predict == true ) {
        ch_predict_inputs = ch_models.combine(ch_predict_datasets, by: 0)
        (ch_scores, ch_predict_logs) = predict(ch_predict_inputs)

        // select the best model based on inference score
        ch_scores
            | max {
                new JsonSlurper().parse(it[2])['value']
            }
            | subscribe { dataset_name, model_type, score_file ->
                def score = new JsonSlurper().parse(score_file)
                println "The best model for \'${dataset_name}\' was \'${model_type}\', with ${score.name} = ${String.format('%.3f', score.value)}"
            }
    }
}


/*
 * completion handler
 */
workflow.onComplete {
    log.info ( workflow.success ? '\nDone!' : '\nOops .. something went wrong' )
}
--------------------------------------------------------------------------------
/modules/train/resources/usr/bin/train.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

import argparse
import json
import pandas as pd
import pickle
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor, RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_predict
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, StandardScaler


def encode_onehot(x, categories):
    # copy to avoid mutating a slice of the original dataframe
    x = x.copy()

    for column, values in categories.items():
        if column in x:
            for v in values:
                x['%s_%s' % (column, v)] = (x[column] == v)
            x = x.drop(columns=[column])

    return x


if __name__ == '__main__':
    # parse command-line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--data', help='training data file', required=True)
    parser.add_argument('--meta', help='training metadata file', required=True)
    parser.add_argument('--scaler', help='preprocessing transform to apply to inputs', choices=['maxabs', 'minmax', 'standard'], default='standard')
    parser.add_argument('--model-type', help='which model to train', choices=['dummy', 'gb', 'lr', 'mlp', 'rf'], default='dummy')
    parser.add_argument('--model-name', help='name of trained model file', default='model.pkl')

    args = parser.parse_args()

    # load dataset
    print('loading dataset')

    df = pd.read_csv(args.data, index_col=0, sep='\t')

    with open(args.meta, 'r') as f:
        meta = json.load(f)

    # extract input features
    x = df[meta['feature_names']]
    x = encode_onehot(x, meta['categories'])

    # extract target column
    target = meta['target_names'][0]
    is_categorical = target in meta['categories']

    if is_categorical:
        classes = {v: i for i, v in enumerate(meta['categories'][target])}
        y = df[target].apply(lambda v: classes[v])

    else:
        y = df[target]

    # select scaler
    Scaler = {
        'maxabs': MaxAbsScaler,
        'minmax': MinMaxScaler,
        'standard': StandardScaler
    }[args.scaler]

    # select estimator
    Estimator = {
        True: {
            'dummy': DummyClassifier,
            'gb': GradientBoostingClassifier,
            'lr': LogisticRegression,
            'mlp': MLPClassifier,
            'rf': RandomForestClassifier
        },
        False: {
            'dummy': DummyRegressor,
            'gb': GradientBoostingRegressor,
            'lr': LinearRegression,
            'mlp': MLPRegressor,
            'rf': RandomForestRegressor
        }
    }[is_categorical][args.model_type]

    # create model pipeline
    model = Pipeline([
        ('scaler', Scaler()),
        ('estimator', Estimator())
    ])

    # train and evaluate model
    print('training model')

    y_pred = cross_val_predict(model, x, y, cv=5)

    scorers = {
        True: [
            ('mse', mean_squared_error),
            ('mae', mean_absolute_error),
            ('acc', accuracy_score)
        ],
        False: [
            ('mse', mean_squared_error),
            ('mae', mean_absolute_error),
            ('r2', r2_score)
        ]
    }[is_categorical]

    for name, score_fn in scorers:
        print('%s: %0.3f' % (name, score_fn(y, y_pred)))

    # train model on full dataset
    model.fit(x, y)

    # save trained model to file
    print('saving model')

    with open(args.model_name, 'wb') as f:
        pickle.dump(model, f)
--------------------------------------------------------------------------------
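Note: `train.py` fits each estimator with scikit-learn defaults; the "hyperparameter optimization" in this proof of concept is the workflow-level fan-out over model types, followed by the best-model selection in `main.nf`. A per-model search could be layered onto the same `Pipeline` with `GridSearchCV`; the grid below is purely illustrative:

```python
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# illustrative grid; keys use scikit-learn's '<step>__<param>' convention
param_grid = {
    'estimator__n_estimators': [100, 200, 500],
    'estimator__max_depth': [None, 5, 10],
}

model = Pipeline([
    ('scaler', StandardScaler()),
    ('estimator', RandomForestClassifier()),
])

search = GridSearchCV(model, param_grid, cv=5)
# search.fit(x, y) would then pick the best combination by cross-validation
# and expose it as search.best_estimator_
```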
/LICENSE:
--------------------------------------------------------------------------------
Mozilla Public License Version 2.0
==================================

1. Definitions
--------------

1.1. "Contributor"
    means each individual or legal entity that creates, contributes to
    the creation of, or owns Covered Software.

1.2. "Contributor Version"
    means the combination of the Contributions of others (if any) used
    by a Contributor and that particular Contributor's Contribution.

1.3. "Contribution"
    means Covered Software of a particular Contributor.

1.4. "Covered Software"
    means Source Code Form to which the initial Contributor has attached
    the notice in Exhibit A, the Executable Form of such Source Code
    Form, and Modifications of such Source Code Form, in each case
    including portions thereof.

1.5. "Incompatible With Secondary Licenses"
    means

    (a) that the initial Contributor has attached the notice described
        in Exhibit B to the Covered Software; or

    (b) that the Covered Software was made available under the terms of
        version 1.1 or earlier of the License, but not also under the
        terms of a Secondary License.

1.6. "Executable Form"
    means any form of the work other than Source Code Form.

1.7. "Larger Work"
    means a work that combines Covered Software with other material, in
    a separate file or files, that is not Covered Software.

1.8. "License"
    means this document.

1.9. "Licensable"
    means having the right to grant, to the maximum extent possible,
    whether at the time of the initial grant or subsequently, any and
    all of the rights conveyed by this License.

1.10. "Modifications"
    means any of the following:

    (a) any file in Source Code Form that results from an addition to,
        deletion from, or modification of the contents of Covered
        Software; or

    (b) any new file in Source Code Form that contains any Covered
        Software.

1.11. "Patent Claims" of a Contributor
    means any patent claim(s), including without limitation, method,
    process, and apparatus claims, in any patent Licensable by such
    Contributor that would be infringed, but for the grant of the
    License, by the making, using, selling, offering for sale, having
    made, import, or transfer of either its Contributions or its
    Contributor Version.

1.12. "Secondary License"
    means either the GNU General Public License, Version 2.0, the GNU
    Lesser General Public License, Version 2.1, the GNU Affero General
    Public License, Version 3.0, or any later versions of those
    licenses.

1.13. "Source Code Form"
    means the form of the work preferred for making modifications.

1.14. "You" (or "Your")
    means an individual or a legal entity exercising rights under this
    License. For legal entities, "You" includes any entity that
    controls, is controlled by, or is under common control with You. For
    purposes of this definition, "control" means (a) the power, direct
    or indirect, to cause the direction or management of such entity,
    whether by contract or otherwise, or (b) ownership of more than
    fifty percent (50%) of the outstanding shares or beneficial
    ownership of such entity.
2. License Grants and Conditions
--------------------------------

2.1. Grants

Each Contributor hereby grants You a world-wide, royalty-free,
non-exclusive license:

(a) under intellectual property rights (other than patent or trademark)
    Licensable by such Contributor to use, reproduce, make available,
    modify, display, perform, distribute, and otherwise exploit its
    Contributions, either on an unmodified basis, with Modifications, or
    as part of a Larger Work; and

(b) under Patent Claims of such Contributor to make, use, sell, offer
    for sale, have made, import, and otherwise transfer either its
    Contributions or its Contributor Version.

2.2. Effective Date

The licenses granted in Section 2.1 with respect to any Contribution
become effective for each Contribution on the date the Contributor first
distributes such Contribution.

2.3. Limitations on Grant Scope

The licenses granted in this Section 2 are the only rights granted under
this License. No additional rights or licenses will be implied from the
distribution or licensing of Covered Software under this License.
Notwithstanding Section 2.1(b) above, no patent license is granted by a
Contributor:

(a) for any code that a Contributor has removed from Covered Software;
    or

(b) for infringements caused by: (i) Your and any other third party's
    modifications of Covered Software, or (ii) the combination of its
    Contributions with other software (except as part of its Contributor
    Version); or

(c) under Patent Claims infringed by Covered Software in the absence of
    its Contributions.

This License does not grant any rights in the trademarks, service marks,
or logos of any Contributor (except as may be necessary to comply with
the notice requirements in Section 3.4).

2.4. Subsequent Licenses

No Contributor makes additional grants as a result of Your choice to
distribute the Covered Software under a subsequent version of this
License (see Section 10.2) or under the terms of a Secondary License (if
permitted under the terms of Section 3.3).

2.5. Representation

Each Contributor represents that the Contributor believes its
Contributions are its original creation(s) or it has sufficient rights
to grant the rights to its Contributions conveyed by this License.

2.6. Fair Use

This License is not intended to limit any rights You have under
applicable copyright doctrines of fair use, fair dealing, or other
equivalents.

2.7. Conditions

Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted
in Section 2.1.

3. Responsibilities
-------------------

3.1. Distribution of Source Form
All distribution of Covered Software in Source Code Form, including any
Modifications that You create or to which You contribute, must be under
the terms of this License. You must inform recipients that the Source
Code Form of the Covered Software is governed by the terms of this
License, and how they can obtain a copy of this License. You may not
attempt to alter or restrict the recipients' rights in the Source Code
Form.

3.2. Distribution of Executable Form

If You distribute Covered Software in Executable Form then:

(a) such Covered Software must also be made available in Source Code
    Form, as described in Section 3.1, and You must inform recipients of
    the Executable Form how they can obtain a copy of such Source Code
    Form by reasonable means in a timely manner, at a charge no more
    than the cost of distribution to the recipient; and

(b) You may distribute such Executable Form under the terms of this
    License, or sublicense it under different terms, provided that the
    license for the Executable Form does not attempt to limit or alter
    the recipients' rights in the Source Code Form under this License.

3.3. Distribution of a Larger Work

You may create and distribute a Larger Work under terms of Your choice,
provided that You also comply with the requirements of this License for
the Covered Software. If the Larger Work is a combination of Covered
Software with a work governed by one or more Secondary Licenses, and the
Covered Software is not Incompatible With Secondary Licenses, this
License permits You to additionally distribute such Covered Software
under the terms of such Secondary License(s), so that the recipient of
the Larger Work may, at their option, further distribute the Covered
Software under the terms of either this License or such Secondary
License(s).

3.4. Notices

You may not remove or alter the substance of any license notices
(including copyright notices, patent notices, disclaimers of warranty,
or limitations of liability) contained within the Source Code Form of
the Covered Software, except that You may alter any license notices to
the extent required to remedy known factual inaccuracies.

3.5. Application of Additional Terms

You may choose to offer, and to charge a fee for, warranty, support,
indemnity or liability obligations to one or more recipients of Covered
Software. However, You may do so only on Your own behalf, and not on
behalf of any Contributor. You must make it absolutely clear that any
such warranty, support, indemnity, or liability obligation is offered by
You alone, and You hereby agree to indemnify every Contributor for any
liability incurred by such Contributor as a result of warranty, support,
indemnity or liability terms You offer. You may include additional
disclaimers of warranty and limitations of liability specific to any
jurisdiction.

4. Inability to Comply Due to Statute or Regulation
---------------------------------------------------
If it is impossible for You to comply with any of the terms of this
License with respect to some or all of the Covered Software due to
statute, judicial order, or regulation then You must: (a) comply with
the terms of this License to the maximum extent possible; and (b)
describe the limitations and the code they affect. Such description must
be placed in a text file included with all distributions of the Covered
Software under this License. Except to the extent prohibited by statute
or regulation, such description must be sufficiently detailed for a
recipient of ordinary skill to be able to understand it.

5. Termination
--------------

5.1. The rights granted under this License will terminate automatically
if You fail to comply with any of its terms. However, if You become
compliant, then the rights granted under this License from a particular
Contributor are reinstated (a) provisionally, unless and until such
Contributor explicitly and finally terminates Your grants, and (b) on an
ongoing basis, if such Contributor fails to notify You of the
non-compliance by some reasonable means prior to 60 days after You have
come back into compliance. Moreover, Your grants from a particular
Contributor are reinstated on an ongoing basis if such Contributor
notifies You of the non-compliance by some reasonable means, this is the
first time You have received notice of non-compliance with this License
from such Contributor, and You become compliant prior to 30 days after
Your receipt of the notice.

5.2. If You initiate litigation against any entity by asserting a patent
infringement claim (excluding declaratory judgment actions,
counter-claims, and cross-claims) alleging that a Contributor Version
directly or indirectly infringes any patent, then the rights granted to
You by any and all Contributors for the Covered Software under Section
2.1 of this License shall terminate.

5.3. In the event of termination under Sections 5.1 or 5.2 above, all
end user license agreements (excluding distributors and resellers) which
have been validly granted by You or Your distributors under this License
prior to termination shall survive termination.

************************************************************************

6. Disclaimer of Warranty
-------------------------

Covered Software is provided under this License on an "as is" basis,
without warranty of any kind, either expressed, implied, or statutory,
including, without limitation, warranties that the Covered Software is
free of defects, merchantable, fit for a particular purpose or
non-infringing. The entire risk as to the quality and performance of
the Covered Software is with You. Should any Covered Software prove
defective in any respect, You (not any Contributor) assume the cost of
any necessary servicing, repair, or correction. This disclaimer of
warranty constitutes an essential part of this License. No use of any
Covered Software is authorized under this License except under this
disclaimer.
************************************************************************

************************************************************************

7. Limitation of Liability
--------------------------

Under no circumstances and under no legal theory, whether tort
(including negligence), contract, or otherwise, shall any Contributor,
or anyone who distributes Covered Software as permitted above, be
liable to You for any direct, indirect, special, incidental, or
consequential damages of any character including, without limitation,
damages for lost profits, loss of goodwill, work stoppage, computer
failure or malfunction, or any and all other commercial damages or
losses, even if such party shall have been informed of the possibility
of such damages. This limitation of liability shall not apply to
liability for death or personal injury resulting from such party's
negligence to the extent applicable law prohibits such limitation.
Some jurisdictions do not allow the exclusion or limitation of
incidental or consequential damages, so this exclusion and limitation
may not apply to You.

************************************************************************

8. Litigation
-------------

Any litigation relating to this License may be brought only in the
courts of a jurisdiction where the defendant maintains its principal
place of business and such litigation shall be governed by laws of that
jurisdiction, without reference to its conflict-of-law provisions.
Nothing in this Section shall prevent a party's ability to bring
cross-claims or counter-claims.

9. Miscellaneous
----------------

This License represents the complete agreement concerning the subject
matter hereof. If any provision of this License is held to be
unenforceable, such provision shall be reformed only to the extent
necessary to make it enforceable. Any law or regulation which provides
that the language of a contract shall be construed against the drafter
shall not be used to construe this License against a Contributor.

10. Versions of the License
---------------------------

10.1. New Versions

Mozilla Foundation is the license steward. Except as provided in Section
10.3, no one other than the license steward has the right to modify or
publish new versions of this License. Each version will be given a
distinguishing version number.

10.2. Effect of New Versions

You may distribute the Covered Software under the terms of the version
of the License under which You originally received the Covered Software,
or under the terms of any subsequent version published by the license
steward.

10.3. Modified Versions

If you create software not governed by this License, and you want to
create a new license for such software, you may create and use a
modified version of this License if you rename the license and remove
any references to the name of the license steward (except to note that
such modified license differs from this License).

10.4. Distributing Source Code Form that is Incompatible With Secondary
Licenses
If You choose to distribute Source Code Form that is Incompatible With
Secondary Licenses under the terms of this version of the License, the
notice described in Exhibit B of this License must be attached.

Exhibit A - Source Code Form License Notice
-------------------------------------------

This Source Code Form is subject to the terms of the Mozilla Public
License, v. 2.0. If a copy of the MPL was not distributed with this
file, You can obtain one at http://mozilla.org/MPL/2.0/.

If it is not possible or desirable to put the notice in a particular
file, then You may include the notice in a location (such as a LICENSE
file in a relevant directory) where a recipient would be likely to look
for such a notice.

You may add additional accurate notices of copyright ownership.

Exhibit B - "Incompatible With Secondary Licenses" Notice
---------------------------------------------------------

This Source Code Form is "Incompatible With Secondary Licenses", as
defined by the Mozilla Public License, v. 2.0.
--------------------------------------------------------------------------------