├── .gitignore
├── conda.yml
├── tower.yml
├── modules
│   ├── fetch_dataset
│   │   ├── main.nf
│   │   └── resources
│   │       └── usr
│   │           └── bin
│   │               └── fetch-dataset.py
│   ├── visualize
│   │   ├── main.nf
│   │   └── resources
│   │       └── usr
│   │           └── bin
│   │               └── visualize.py
│   ├── split_train_test
│   │   ├── main.nf
│   │   └── resources
│   │       └── usr
│   │           └── bin
│   │               └── split-train-test.py
│   ├── predict
│   │   ├── main.nf
│   │   └── resources
│   │       └── usr
│   │           └── bin
│   │               └── predict.py
│   └── train
│       ├── main.nf
│       └── resources
│           └── usr
│               └── bin
│                   └── train.py
├── .github
│   └── workflows
│       └── build.yml
├── nextflow.config
├── README.md
├── nextflow_schema.json
├── main.nf
└── LICENSE

/.gitignore:
--------------------------------------------------------------------------------
.nextflow*
.idea
data
results
work
--------------------------------------------------------------------------------

/conda.yml:
--------------------------------------------------------------------------------
name: hyperopt
channels:
  - defaults
dependencies:
  - matplotlib
  - numpy
  - pandas
  - python=3.10
  - scikit-learn
--------------------------------------------------------------------------------

/tower.yml:
--------------------------------------------------------------------------------
reports:
  "*.data.txt":
    display: "Input datasets"
  "*.train.txt":
    display: "Training data"
  "*.test.txt":
    display: "Test data"
  "*.png":
    display: "t-SNE plots"
  "*.score.json":
    display: "Evaluation scores"
--------------------------------------------------------------------------------

/modules/fetch_dataset/main.nf:
--------------------------------------------------------------------------------
process fetch_dataset {
    publishDir params.outdir, mode: 'copy', saveAs: { file -> "${dataset_name}.${file}" }
    tag "${dataset_name}"

    input:
    val(dataset_name)

    output:
    tuple val(dataset_name), path('data.txt'), path('meta.json'), emit: datasets

    script:
    """
    fetch-dataset.py --name ${dataset_name}
    """
}
--------------------------------------------------------------------------------

/modules/visualize/main.nf:
--------------------------------------------------------------------------------
process visualize {
    publishDir params.outdir, mode: 'copy', saveAs: { file -> "${dataset_name}.${file}" }

    input:
    tuple val(dataset_name), path(data_file), path(meta_file)

    output:
    tuple val(dataset_name), path('*.png'), emit: plots

    script:
    """
    visualize.py \
        --data ${data_file} \
        --meta ${meta_file} \
        --outfile `basename ${data_file} .txt`.png
    """
}
--------------------------------------------------------------------------------

/modules/split_train_test/main.nf:
--------------------------------------------------------------------------------
process split_train_test {
    publishDir params.outdir, mode: 'copy', saveAs: { file -> "${dataset_name}.${file}" }
    tag "${dataset_name}"

    input:
    tuple val(dataset_name), path(data_file), path(meta_file)

    output:
    tuple val(dataset_name), path('train.txt'), path(meta_file), emit: train_datasets
    tuple val(dataset_name), path('test.txt'), path(meta_file), emit: test_datasets

    script:
    """
    split-train-test.py --data ${data_file}
    """
}
--------------------------------------------------------------------------------
/modules/predict/main.nf:
--------------------------------------------------------------------------------
process predict {
    publishDir params.outdir, mode: 'copy', saveAs: { file -> "${dataset_name}.${model_type}.${file}" }
    tag "${dataset_name}/${model_type}"

    input:
    tuple val(dataset_name), val(model_type), path(model_file), path(data_file), path(meta_file)

    output:
    tuple val(dataset_name), val(model_type), path('score.json'), emit: scores
    tuple val(dataset_name), val(model_type), stdout, emit: logs

    script:
    """
    predict.py \
        --model ${model_file} \
        --data ${data_file} \
        --meta ${meta_file}
    """
}
--------------------------------------------------------------------------------

/modules/train/main.nf:
--------------------------------------------------------------------------------
process train {
    publishDir params.outdir, mode: 'copy', saveAs: { file -> "${dataset_name}.${model_type}.${file}" }
    tag "${dataset_name}/${model_type}"

    input:
    tuple val(dataset_name), path(data_file), path(meta_file)
    each model_type

    output:
    tuple val(dataset_name), val(model_type), path('model.pkl'), emit: models
    tuple val(dataset_name), val(model_type), stdout, emit: logs

    script:
    """
    train.py \
        --data ${data_file} \
        --meta ${meta_file} \
        --scaler standard \
        --model-type ${model_type}
    """
}
--------------------------------------------------------------------------------

/modules/split_train_test/resources/usr/bin/split-train-test.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

import argparse
import pandas as pd
from sklearn.model_selection import train_test_split


if __name__ == '__main__':
    # parse command-line arguments
    parser = argparse.ArgumentParser(description='Split a dataset into train/test sets')
    parser.add_argument('--data', help='data file', required=True)
    parser.add_argument('--train-size', help='training set proportion', type=float, default=0.8)
    parser.add_argument('--train-data', help='training data file', default='train.txt')
    parser.add_argument('--test-data', help='test data file', default='test.txt')

    args = parser.parse_args()

    # load dataset
    df = pd.read_csv(args.data, index_col=0, sep='\t')

    # split dataset into train/test sets
    df_train, df_test = train_test_split(df, test_size=1 - args.train_size)

    # save datasets
    df_train.to_csv(args.train_data, sep='\t')
    df_test.to_csv(args.test_data, sep='\t')
--------------------------------------------------------------------------------
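Note: `split-train-test.py` samples rows uniformly at random, so a rare class can end up under-represented in one of the two sets. A stratified variant is a one-line change with scikit-learn; the sketch below assumes you know the target column (the `target` argument is hypothetical and not a parameter of the script above):

```python
import pandas as pd
from sklearn.model_selection import train_test_split


def split_stratified(data_file, target, train_size=0.8):
    # sketch: preserve class proportions in both splits (assumes `target`
    # names a categorical column; not a feature of split-train-test.py)
    df = pd.read_csv(data_file, index_col=0, sep='\t')
    return train_test_split(df, train_size=train_size, stratify=df[target])
```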
/.github/workflows/build.yml:
--------------------------------------------------------------------------------
name: hyperopt
on:
  push:
    branches:
      - 'master'
  schedule:
    - cron: '5 5 * * *'

jobs:
  build:
    name: hyperopt ci
    runs-on: ubuntu-latest
    timeout-minutes: 10
    strategy:
      fail-fast: false
      matrix:
        java_version: ['17']

    steps:
      - name: Environment
        run: env | sort

      - name: Checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 1
          submodules: true

      - name: Setup Java
        uses: actions/setup-java@v4
        with:
          java-version: ${{ matrix.java_version }}
          distribution: 'temurin'
          architecture: x64

      - name: Tests
        run: |
          curl -fsSL get.nextflow.io | bash
          export NXF_EDGE=1
          ./nextflow -self-update
          ./nextflow run . -profile wave
        env:
          NXF_ANSI_LOG: false
          NXF_TRACE: nextflow.processor

      - name: Upload logs
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: nextflow-logs-${{ matrix.java_version }}
          path: |
            .nextflow.*
--------------------------------------------------------------------------------

/modules/fetch_dataset/resources/usr/bin/fetch-dataset.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

import argparse
import json
from sklearn.datasets import fetch_openml


def is_categorical(y):
    return y.dtype.kind in 'OSUV'


def get_categories(df):
    result = {}
    for c in df.columns:
        if is_categorical(df[c]):
            values = df[c].unique().tolist()

            # fix bug with numerical categories
            if sum(v.isdigit() for v in values) == len(values):
                values = [int(v) for v in values]

            result[c] = values

    return result


if __name__ == '__main__':
    # parse command-line arguments
    parser = argparse.ArgumentParser(description='Download an OpenML dataset')
    parser.add_argument('--name', help='dataset name', required=True)
    parser.add_argument('--data', help='data file', default='data.txt')
    parser.add_argument('--meta', help='metadata file', default='meta.json')

    args = parser.parse_args()

    # download dataset from openml
    dataset = fetch_openml(args.name, as_frame=True)

    # save data
    dataset.frame.to_csv(args.data, sep='\t')

    # save metadata
    meta = {
        'name': args.name,
        'feature_names': dataset.feature_names,
        'target_names': dataset.target_names,
        'categories': get_categories(dataset.frame)
    }

    with open(args.meta, 'w') as f:
        json.dump(meta, f)
--------------------------------------------------------------------------------
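The `meta.json` written above is the contract that every downstream script relies on. A minimal sketch of reading a dataset/metadata pair the same way `visualize.py`, `train.py`, and `predict.py` do (file names are this script's defaults):

```python
import json
import pandas as pd

# load the pair of files produced by fetch-dataset.py
df = pd.read_csv('data.txt', index_col=0, sep='\t')
with open('meta.json') as f:
    meta = json.load(f)

x = df[meta['feature_names']]        # input features
target = meta['target_names'][0]     # name of the target column
y = df[target]
# classification if the target appears in the category map, else regression
is_classification = target in meta['categories']
```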
/nextflow.config:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2022, Seqera Labs.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * This Source Code Form is "Incompatible With Secondary Licenses", as
 * defined by the Mozilla Public License, v. 2.0.
 *
 */
manifest {
    description = 'Proof-of-concept pipeline for performing hyperparameter optimization of machine learning models with Nextflow'
    author = 'Ben Sherman'
    nextflowVersion = '>=23.10.0'
}

nextflow {
    enable.moduleBinaries = true
}

/*
 * Default pipeline parameters. They can be overridden on the command line,
 * e.g. given `params.foo`, specify `--foo some_value` on the run command line.
 */
params {
    fetch_dataset = true
    dataset_name = 'wdbc'

    visualize = true

    datadir = 'data'

    train = true
    train_data = "${params.datadir}/*.train.txt"
    train_meta = "${params.datadir}/*.meta.json"
    train_models = ['dummy', 'gb', 'lr', 'mlp', 'rf']

    predict = true
    predict_models = "${params.datadir}/*.pkl"
    predict_data = "${params.datadir}/*.predict.txt"
    predict_meta = "${params.datadir}/*.meta.json"

    outdir = 'results'
}


/*
 * Execution profiles for different environments.
 */
profiles {
    slurm {
        process.executor = 'slurm'
    }

    conda {
        process.conda = "$baseDir/conda.yml"
        conda.enabled = true
    }

    wave {
        process.conda = "$baseDir/conda.yml"
        docker.enabled = true
        wave.enabled = true
        wave.strategy = 'conda'
    }
}
--------------------------------------------------------------------------------

/modules/visualize/resources/usr/bin/visualize.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

import argparse
import json
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.manifold import TSNE


def encode_onehot(x, categories):
    # copy to avoid mutating a slice of the original dataframe
    x = x.copy()

    for column, values in categories.items():
        if column in x:
            for v in values:
                x['%s_%s' % (column, v)] = (x[column] == v)
            x = x.drop(columns=[column])

    return x


if __name__ == '__main__':
    # parse command-line arguments
    parser = argparse.ArgumentParser(description='Visualize a dataset with t-SNE')
    parser.add_argument('--data', help='data file', required=True)
    parser.add_argument('--meta', help='metadata file', required=True)
    parser.add_argument('--outfile', help='output plot file', required=True)

    args = parser.parse_args()

    # load dataset
    df = pd.read_csv(args.data, index_col=0, sep='\t')

    with open(args.meta, 'r') as f:
        meta = json.load(f)

    # extract input features
    x = df[meta['feature_names']]
    x = encode_onehot(x, meta['categories'])

    # extract target column
    target = meta['target_names'][0]
    y = df[target]

    # compute t-SNE embedding
    x_tsne = TSNE().fit_transform(x)

    # plot t-SNE embedding with class labels or colorbar
    plt.axis('off')

    if target in meta['categories']:
        classes = meta['categories'][target]

        for c in classes:
            indices = (y == c)
            plt.scatter(x_tsne[indices, 0], x_tsne[indices, 1], label=c, edgecolors='w')

        plt.subplots_adjust(right=0.70)
        plt.legend(loc='upper left', bbox_to_anchor=(1, 1))

    else:
        plt.scatter(x_tsne[:, 0], x_tsne[:, 1], c=y, edgecolors='w')
        plt.colorbar()

    plt.savefig(args.outfile)
    plt.close()
--------------------------------------------------------------------------------
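Note: `encode_onehot` above is duplicated verbatim in `predict.py` and `train.py`. It keys the dummy columns off the category lists recorded in `meta.json`, which guarantees that train and test splits produce identical columns even when one split is missing a category. A sketch of the same idea expressed with `pd.get_dummies` (an alternative, not what the pipeline ships):

```python
import pandas as pd


def encode_onehot_dummies(x: pd.DataFrame, categories: dict) -> pd.DataFrame:
    # pin each categorical column to the category list from meta.json so
    # every split yields the same '<column>_<value>' dummy columns
    x = x.copy()
    for column, values in categories.items():
        if column in x:
            x[column] = pd.Categorical(x[column], categories=values)
    return pd.get_dummies(x)
```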
/modules/predict/resources/usr/bin/predict.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

import argparse
import json
import pandas as pd
import pickle
from sklearn.metrics import accuracy_score, r2_score


def encode_onehot(x, categories):
    # copy to avoid mutating a slice of the original dataframe
    x = x.copy()

    for column, values in categories.items():
        if column in x:
            for v in values:
                x['%s_%s' % (column, v)] = (x[column] == v)
            x = x.drop(columns=[column])

    return x


if __name__ == '__main__':
    # parse command-line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', help='trained model file', required=True)
    parser.add_argument('--data', help='data file', required=True)
    parser.add_argument('--meta', help='metadata file', required=True)
    parser.add_argument('--outfile', help='score file', default='score.json')

    args = parser.parse_args()

    # load model
    print('loading model')

    with open(args.model, 'rb') as f:
        model = pickle.load(f)

    # load dataset
    print('loading dataset')

    df = pd.read_csv(args.data, index_col=0, sep='\t')

    with open(args.meta, 'r') as f:
        meta = json.load(f)

    # extract input features
    x = df[meta['feature_names']]
    x = encode_onehot(x, meta['categories'])

    # extract target column
    target = meta['target_names'][0]
    y_true = df[target]
    is_categorical = target in meta['categories']

    # perform inference
    print('performing inference')

    y_pred = model.predict(x)

    if is_categorical:
        classes = meta['categories'][target]
        y_pred = [classes[v] for v in y_pred]

    for sample_name, v_pred, v_true in zip(df.index, y_pred, y_true):
        print('%8s: %8s (%8s)' % (sample_name, v_pred, v_true))

    # save score
    if is_categorical:
        score = {
            'name': 'accuracy',
            'value': accuracy_score(y_true, y_pred)
        }

    else:
        score = {
            'name': 'r2',
            'value': r2_score(y_true, y_pred)
        }

    print('%s: %0.3f' % (score['name'], score['value']))

    with open(args.outfile, 'w') as f:
        json.dump(score, f)
--------------------------------------------------------------------------------
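Because the `train` step publishes plain pickled scikit-learn pipelines, a saved model can also be reused outside the workflow. A minimal sketch (the `results/wdbc.rf.model.pkl` path is illustrative; actual names follow the `<dataset>.<model>.model.pkl` pattern from the `train` module, and the input must be one-hot encoded exactly as during training):

```python
import pickle
import pandas as pd

# load a model published by the train step (illustrative file name)
with open('results/wdbc.rf.model.pkl', 'rb') as f:
    model = pickle.load(f)

# the Pipeline object applies its scaler before the estimator, so the
# (already one-hot encoded) feature columns can be passed in directly
x = pd.read_csv('new-samples.txt', index_col=0, sep='\t')
print(model.predict(x))
```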
/README.md:
--------------------------------------------------------------------------------
# hyperopt

A proof-of-concept pipeline for performing hyperparameter optimization of machine learning models with Nextflow.


## Requirements

* Unix-like operating system (Linux, macOS, etc.)
* Java >=11
* [Conda](https://docs.conda.io/en/latest/) or [Docker](https://docs.docker.com/)


## Quickstart

1. Install Nextflow (version 23.10.x or higher):

    ```bash
    curl -s https://get.nextflow.io | bash
    ```

2. Launch the pipeline:

    ```bash
    # use conda natively (requires Conda)
    ./nextflow run nextflow-io/hyperopt -profile conda

    # use Wave containers (requires Docker)
    ./nextflow run nextflow-io/hyperopt -profile wave
    ```

3. When the pipeline completes, you can view the training and prediction results in the `results` folder.

Note: the first time you execute the pipeline, Nextflow will take a few minutes to download the pipeline code from this GitHub repository along with any related software dependencies (e.g. conda packages or Docker images).


## Configuration

The hyperopt pipeline consists of the following steps:

1. Download a dataset
2. Split the dataset into train/test sets
3. Visualize the train/test sets
4. Train a variety of models on the training set
5. Evaluate each model on the test set
6. Select the best model based on evaluation score

You can control many aspects of this workflow with the pipeline parameters, including:

* Enable/disable each individual step
* Download a different dataset (the default is `wdbc`; see [OpenML.org](https://www.openml.org/search?type=data&status=active) for the available datasets)
* Provide your own training data instead of downloading it
* Provide your own pre-trained model and test data
* Select different models (see the `train` module for all available options)

See the `nextflow.config` file for the full list of pipeline parameters.


## Cluster support

Since [Nextflow](http://www.nextflow.io) provides an abstraction between the pipeline logic and the underlying execution environment, the hyperopt pipeline can be executed on a single computer or an HPC cluster without any modifications.

Visit the [Nextflow documentation](https://www.nextflow.io/docs/latest/executor.html) to see which HPC schedulers are supported and how to use them.


## Components

The hyperopt pipeline uses Python (>=3.10) and several Python packages for machine learning and data science. These dependencies are defined in the `conda.yml` file.
--------------------------------------------------------------------------------
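For example, the evaluation scores published to the `results` folder can be ranked with a few lines of Python (a sketch; the file names follow the `<dataset>.<model>.score.json` pattern used by the `predict` module):

```python
import glob
import json

# collect the scores written by the predict step and rank the models
scores = []
for path in glob.glob('results/*.score.json'):
    with open(path) as f:
        scores.append((path, json.load(f)['value']))

for path, value in sorted(scores, key=lambda s: s[1], reverse=True):
    print('%s: %0.3f' % (path, value))
```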
/nextflow_schema.json:
--------------------------------------------------------------------------------
{
    "$schema": "http://json-schema.org/draft-07/schema",
    "$id": "https://raw.githubusercontent.com/nextflow-io/hyperopt/master/nextflow_schema.json",
    "title": "hyperopt pipeline parameters",
    "description": "Proof-of-concept pipeline for training and evaluating machine learning models with Nextflow. Not all parameters are included in this schema; refer to nextflow.config for more details.",
    "type": "object",
    "definitions": {
        "pipeline_options": {
            "title": "Pipeline options",
            "type": "object",
            "description": "",
            "default": "",
            "fa_icon": "fas fa-terminal",
            "properties": {
                "fetch_dataset": {
                    "type": "boolean",
                    "description": "Fetch a dataset by name from OpenML",
                    "default": true,
                    "fa_icon": "fas fa-download"
                },
                "dataset_name": {
                    "type": "string",
                    "description": "Name of dataset to fetch from OpenML",
                    "default": "wdbc",
                    "fa_icon": "fas fa-file-csv"
                },
                "visualize": {
                    "type": "boolean",
                    "description": "Visualize the training and test sets",
                    "default": true,
                    "fa_icon": "fas fa-chart-line"
                },
                "datadir": {
                    "type": "string",
                    "description": "Directory that contains any input datasets and pre-trained models",
                    "default": "data",
                    "fa_icon": "fas fa-folder-open"
                },
                "train": {
                    "type": "boolean",
                    "description": "Train a set of models on the given training set(s)",
                    "default": true,
                    "fa_icon": "fas fa-dumbbell"
                },
                "predict": {
                    "type": "boolean",
                    "description": "Evaluate a set of trained models on the given test set(s)",
                    "default": true,
                    "fa_icon": "fas fa-balance-scale-right"
                },
                "outdir": {
                    "type": "string",
                    "description": "Directory to publish output data",
                    "default": "results",
                    "fa_icon": "fas fa-folder-open"
                }
            }
        }
    },
    "allOf": [
        {
            "$ref": "#/definitions/pipeline_options"
        }
    ]
}
--------------------------------------------------------------------------------
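Since the schema declares JSON Schema draft-07, a parameter set can also be validated outside Nextflow. A sketch using the third-party `jsonschema` package (an assumption — it is not among this pipeline's dependencies):

```python
import json
from jsonschema import validate  # third-party: pip install jsonschema

with open('nextflow_schema.json') as f:
    schema = json.load(f)

# raises jsonschema.exceptions.ValidationError on a type mismatch
validate(instance={'dataset_name': 'wdbc', 'train': True}, schema=schema)
```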
/main.nf:
--------------------------------------------------------------------------------
#!/usr/bin/env nextflow

/*
 * Copyright (c) 2022, Seqera Labs.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * This Source Code Form is "Incompatible With Secondary Licenses", as
 * defined by the Mozilla Public License, v. 2.0.
 *
 */
import groovy.json.JsonSlurper

include { fetch_dataset } from './modules/fetch_dataset'
include { split_train_test } from './modules/split_train_test'
include { visualize } from './modules/visualize'
include { train } from './modules/train'
include { predict } from './modules/predict'


log.info """
    M L - H Y P E R O P T   P I P E L I N E
    =======================================
    fetch_dataset  : ${params.fetch_dataset}
    dataset_name   : ${params.dataset_name}

    visualize      : ${params.visualize}

    train          : ${params.train}
    train_data     : ${params.train_data}
    train_meta     : ${params.train_meta}
    train_models   : ${params.train_models}

    predict        : ${params.predict}
    predict_models : ${params.predict_models}
    predict_data   : ${params.predict_data}
    predict_meta   : ${params.predict_meta}

    outdir         : ${params.outdir}
    """


/*
 * main script flow
 */
workflow {
    // fetch dataset if specified
    if ( params.fetch_dataset == true ) {
        ch_datasets = fetch_dataset(params.dataset_name)

        (ch_train_datasets, ch_predict_datasets) = split_train_test(ch_datasets)
    }

    // otherwise load input files
    else {
        ch_train_data = Channel.fromFilePairs(params.train_data, size: 1, flat: true)
        ch_train_meta = Channel.fromFilePairs(params.train_meta, size: 1, flat: true)
        ch_train_datasets = ch_train_data.join(ch_train_meta)

        ch_predict_data = Channel.fromFilePairs(params.predict_data, size: 1, flat: true)
        ch_predict_meta = Channel.fromFilePairs(params.predict_meta, size: 1, flat: true)
        ch_predict_datasets = ch_predict_data.join(ch_predict_meta)
    }

    // visualize train/test sets
    if ( params.visualize == true ) {
        visualize(ch_train_datasets.concat(ch_predict_datasets))
    }

    // print warning if both training and pre-trained model are enabled
    if ( params.train == true && params.predict_models != null ) {
        log.warn 'Training is enabled but pre-trained model(s) are also provided, pre-trained models will be ignored'
    }

    // perform training if specified
    if ( params.train == true ) {
        (ch_models, ch_train_logs) = train(ch_train_datasets, params.train_models)
    }

    // otherwise load trained model if specified
    else if ( params.predict_models != null ) {
        ch_models = Channel.fromFilePairs(params.predict_models, size: 1, flat: true)
            | map { [it[0], 'pretrained', it[1]] }
    }

    // perform inference if specified
    if ( params.predict == true ) {
        ch_predict_inputs = ch_models.combine(ch_predict_datasets, by: 0)
        (ch_scores, ch_predict_logs) = predict(ch_predict_inputs)

        // select the best model based on inference score
        ch_scores
            | max {
                new JsonSlurper().parse(it[2])['value']
            }
            | subscribe { dataset_name, model_type, score_file ->
                def score = new JsonSlurper().parse(score_file)
                println "The best model for \'${dataset_name}\' was \'${model_type}\', with ${score.name} = ${String.format('%.3f', score.value)}"
            }
    }
}


/*
 * completion handler
 */
workflow.onComplete {
    log.info ( workflow.success ? '\nDone!' : '\nOops .. something went wrong' )
}
--------------------------------------------------------------------------------
/modules/train/resources/usr/bin/train.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

import argparse
import json
import pandas as pd
import pickle
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor, RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_predict
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, StandardScaler


def encode_onehot(x, categories):
    # copy to avoid mutating a slice of the original dataframe
    x = x.copy()

    for column, values in categories.items():
        if column in x:
            for v in values:
                x['%s_%s' % (column, v)] = (x[column] == v)
            x = x.drop(columns=[column])

    return x


if __name__ == '__main__':
    # parse command-line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--data', help='training data file', required=True)
    parser.add_argument('--meta', help='training metadata file', required=True)
    parser.add_argument('--scaler', help='preprocessing transform to apply to inputs', choices=['maxabs', 'minmax', 'standard'], default='standard')
    parser.add_argument('--model-type', help='which model to train', choices=['dummy', 'gb', 'lr', 'mlp', 'rf'], default='dummy')
    parser.add_argument('--model-name', help='name of trained model file', default='model.pkl')

    args = parser.parse_args()

    # load dataset
    print('loading dataset')

    df = pd.read_csv(args.data, index_col=0, sep='\t')

    with open(args.meta, 'r') as f:
        meta = json.load(f)

    # extract input features
    x = df[meta['feature_names']]
    x = encode_onehot(x, meta['categories'])

    # extract target column
    target = meta['target_names'][0]
    is_categorical = target in meta['categories']

    if is_categorical:
        classes = {v: i for i, v in enumerate(meta['categories'][target])}
        y = df[target].apply(lambda v: classes[v])

    else:
        y = df[target]

    # select scaler
    Scaler = {
        'maxabs': MaxAbsScaler,
        'minmax': MinMaxScaler,
        'standard': StandardScaler
    }[args.scaler]

    # select estimator
    Estimator = {
        True: {
            'dummy': DummyClassifier,
            'gb': GradientBoostingClassifier,
            'lr': LogisticRegression,
            'mlp': MLPClassifier,
            'rf': RandomForestClassifier
        },
        False: {
            'dummy': DummyRegressor,
            'gb': GradientBoostingRegressor,
            'lr': LinearRegression,
            'mlp': MLPRegressor,
            'rf': RandomForestRegressor
        }
    }[is_categorical][args.model_type]

    # create model pipeline
    model = Pipeline([
        ('scaler', Scaler()),
        ('estimator', Estimator())
    ])

    # train and evaluate model
    print('training model')

    y_pred = cross_val_predict(model, x, y, cv=5)

    scorers = {
        True: [
            ('mse', mean_squared_error),
            ('mae', mean_absolute_error),
            ('acc', accuracy_score)
        ],
        False: [
            ('mse', mean_squared_error),
            ('mae', mean_absolute_error),
            ('r2', r2_score)
        ]
    }[is_categorical]

    for name, score_fn in scorers:
        print('%s: %0.3f' % (name, score_fn(y, y_pred)))

    # train model on full dataset
    model.fit(x, y)

    # save trained model to file
    print('saving model')

    with open(args.model_name, 'wb') as f:
        pickle.dump(model, f)
--------------------------------------------------------------------------------
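Note: `train.py` fits each estimator with scikit-learn defaults; the "hyperparameter optimization" in this proof of concept is the workflow-level fan-out over model types, followed by the best-model selection in `main.nf`. A per-model search could be layered onto the same `Pipeline` with `GridSearchCV`; the grid below is purely illustrative:

```python
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# illustrative grid; keys use scikit-learn's '<step>__<param>' convention
param_grid = {
    'estimator__n_estimators': [100, 200, 500],
    'estimator__max_depth': [None, 5, 10],
}

model = Pipeline([
    ('scaler', StandardScaler()),
    ('estimator', RandomForestClassifier()),
])

search = GridSearchCV(model, param_grid, cv=5)
# search.fit(x, y) would then pick the best combination by cross-validation
# and expose it as search.best_estimator_
```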
/LICENSE:
--------------------------------------------------------------------------------
Mozilla Public License Version 2.0
==================================

1. Definitions
--------------

1.1. "Contributor"
    means each individual or legal entity that creates, contributes to
    the creation of, or owns Covered Software.

1.2. "Contributor Version"
    means the combination of the Contributions of others (if any) used
    by a Contributor and that particular Contributor's Contribution.

1.3. "Contribution"
    means Covered Software of a particular Contributor.

1.4. "Covered Software"
    means Source Code Form to which the initial Contributor has attached
    the notice in Exhibit A, the Executable Form of such Source Code
    Form, and Modifications of such Source Code Form, in each case
    including portions thereof.

1.5. "Incompatible With Secondary Licenses"
    means

    (a) that the initial Contributor has attached the notice described
        in Exhibit B to the Covered Software; or

    (b) that the Covered Software was made available under the terms of
        version 1.1 or earlier of the License, but not also under the
        terms of a Secondary License.

1.6. "Executable Form"
    means any form of the work other than Source Code Form.

1.7. "Larger Work"
    means a work that combines Covered Software with other material, in
    a separate file or files, that is not Covered Software.

1.8. "License"
    means this document.

1.9. "Licensable"
    means having the right to grant, to the maximum extent possible,
    whether at the time of the initial grant or subsequently, any and
    all of the rights conveyed by this License.

1.10. "Modifications"
    means any of the following:

    (a) any file in Source Code Form that results from an addition to,
        deletion from, or modification of the contents of Covered
        Software; or

    (b) any new file in Source Code Form that contains any Covered
        Software.

1.11. "Patent Claims" of a Contributor
    means any patent claim(s), including without limitation, method,
    process, and apparatus claims, in any patent Licensable by such
    Contributor that would be infringed, but for the grant of the
    License, by the making, using, selling, offering for sale, having
    made, import, or transfer of either its Contributions or its
    Contributor Version.

1.12. "Secondary License"
    means either the GNU General Public License, Version 2.0, the GNU
    Lesser General Public License, Version 2.1, the GNU Affero General
    Public License, Version 3.0, or any later versions of those
    licenses.

1.13. "Source Code Form"
    means the form of the work preferred for making modifications.

1.14. "You" (or "Your")
    means an individual or a legal entity exercising rights under this
    License. For legal entities, "You" includes any entity that
    controls, is controlled by, or is under common control with You. For
    purposes of this definition, "control" means (a) the power, direct
    or indirect, to cause the direction or management of such entity,
    whether by contract or otherwise, or (b) ownership of more than
    fifty percent (50%) of the outstanding shares or beneficial
    ownership of such entity.
2. License Grants and Conditions
--------------------------------

2.1. Grants

Each Contributor hereby grants You a world-wide, royalty-free,
non-exclusive license:

(a) under intellectual property rights (other than patent or trademark)
    Licensable by such Contributor to use, reproduce, make available,
    modify, display, perform, distribute, and otherwise exploit its
    Contributions, either on an unmodified basis, with Modifications, or
    as part of a Larger Work; and

(b) under Patent Claims of such Contributor to make, use, sell, offer
    for sale, have made, import, and otherwise transfer either its
    Contributions or its Contributor Version.

2.2. Effective Date

The licenses granted in Section 2.1 with respect to any Contribution
become effective for each Contribution on the date the Contributor first
distributes such Contribution.

2.3. Limitations on Grant Scope

The licenses granted in this Section 2 are the only rights granted under
this License. No additional rights or licenses will be implied from the
distribution or licensing of Covered Software under this License.
Notwithstanding Section 2.1(b) above, no patent license is granted by a
Contributor:

(a) for any code that a Contributor has removed from Covered Software;
    or

(b) for infringements caused by: (i) Your and any other third party's
    modifications of Covered Software, or (ii) the combination of its
    Contributions with other software (except as part of its Contributor
    Version); or

(c) under Patent Claims infringed by Covered Software in the absence of
    its Contributions.

This License does not grant any rights in the trademarks, service marks,
or logos of any Contributor (except as may be necessary to comply with
the notice requirements in Section 3.4).

2.4. Subsequent Licenses

No Contributor makes additional grants as a result of Your choice to
distribute the Covered Software under a subsequent version of this
License (see Section 10.2) or under the terms of a Secondary License (if
permitted under the terms of Section 3.3).

2.5. Representation

Each Contributor represents that the Contributor believes its
Contributions are its original creation(s) or it has sufficient rights
to grant the rights to its Contributions conveyed by this License.

2.6. Fair Use

This License is not intended to limit any rights You have under
applicable copyright doctrines of fair use, fair dealing, or other
equivalents.

2.7. Conditions

Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted
in Section 2.1.

3. Responsibilities
-------------------

3.1. Distribution of Source Form
All distribution of Covered Software in Source Code Form, including any
Modifications that You create or to which You contribute, must be under
the terms of this License. You must inform recipients that the Source
Code Form of the Covered Software is governed by the terms of this
License, and how they can obtain a copy of this License. You may not
attempt to alter or restrict the recipients' rights in the Source Code
Form.

3.2. Distribution of Executable Form

If You distribute Covered Software in Executable Form then:

(a) such Covered Software must also be made available in Source Code
    Form, as described in Section 3.1, and You must inform recipients of
    the Executable Form how they can obtain a copy of such Source Code
    Form by reasonable means in a timely manner, at a charge no more
    than the cost of distribution to the recipient; and

(b) You may distribute such Executable Form under the terms of this
    License, or sublicense it under different terms, provided that the
    license for the Executable Form does not attempt to limit or alter
    the recipients' rights in the Source Code Form under this License.

3.3. Distribution of a Larger Work

You may create and distribute a Larger Work under terms of Your choice,
provided that You also comply with the requirements of this License for
the Covered Software. If the Larger Work is a combination of Covered
Software with a work governed by one or more Secondary Licenses, and the
Covered Software is not Incompatible With Secondary Licenses, this
License permits You to additionally distribute such Covered Software
under the terms of such Secondary License(s), so that the recipient of
the Larger Work may, at their option, further distribute the Covered
Software under the terms of either this License or such Secondary
License(s).

3.4. Notices

You may not remove or alter the substance of any license notices
(including copyright notices, patent notices, disclaimers of warranty,
or limitations of liability) contained within the Source Code Form of
the Covered Software, except that You may alter any license notices to
the extent required to remedy known factual inaccuracies.

3.5. Application of Additional Terms

You may choose to offer, and to charge a fee for, warranty, support,
indemnity or liability obligations to one or more recipients of Covered
Software. However, You may do so only on Your own behalf, and not on
behalf of any Contributor. You must make it absolutely clear that any
such warranty, support, indemnity, or liability obligation is offered by
You alone, and You hereby agree to indemnify every Contributor for any
liability incurred by such Contributor as a result of warranty, support,
indemnity or liability terms You offer. You may include additional
disclaimers of warranty and limitations of liability specific to any
jurisdiction.

4. Inability to Comply Due to Statute or Regulation
---------------------------------------------------
If it is impossible for You to comply with any of the terms of this
License with respect to some or all of the Covered Software due to
statute, judicial order, or regulation then You must: (a) comply with
the terms of this License to the maximum extent possible; and (b)
describe the limitations and the code they affect. Such description must
be placed in a text file included with all distributions of the Covered
Software under this License. Except to the extent prohibited by statute
or regulation, such description must be sufficiently detailed for a
recipient of ordinary skill to be able to understand it.

5. Termination
--------------

5.1. The rights granted under this License will terminate automatically
if You fail to comply with any of its terms. However, if You become
compliant, then the rights granted under this License from a particular
Contributor are reinstated (a) provisionally, unless and until such
Contributor explicitly and finally terminates Your grants, and (b) on an
ongoing basis, if such Contributor fails to notify You of the
non-compliance by some reasonable means prior to 60 days after You have
come back into compliance. Moreover, Your grants from a particular
Contributor are reinstated on an ongoing basis if such Contributor
notifies You of the non-compliance by some reasonable means, this is the
first time You have received notice of non-compliance with this License
from such Contributor, and You become compliant prior to 30 days after
Your receipt of the notice.

5.2. If You initiate litigation against any entity by asserting a patent
infringement claim (excluding declaratory judgment actions,
counter-claims, and cross-claims) alleging that a Contributor Version
directly or indirectly infringes any patent, then the rights granted to
You by any and all Contributors for the Covered Software under Section
2.1 of this License shall terminate.

5.3. In the event of termination under Sections 5.1 or 5.2 above, all
end user license agreements (excluding distributors and resellers) which
have been validly granted by You or Your distributors under this License
prior to termination shall survive termination.

************************************************************************

6. Disclaimer of Warranty
-------------------------

Covered Software is provided under this License on an "as is" basis,
without warranty of any kind, either expressed, implied, or statutory,
including, without limitation, warranties that the Covered Software is
free of defects, merchantable, fit for a particular purpose or
non-infringing. The entire risk as to the quality and performance of
the Covered Software is with You. Should any Covered Software prove
defective in any respect, You (not any Contributor) assume the cost of
any necessary servicing, repair, or correction. This disclaimer of
warranty constitutes an essential part of this License. No use of any
Covered Software is authorized under this License except under this
disclaimer.
************************************************************************

************************************************************************

7. Limitation of Liability
--------------------------

Under no circumstances and under no legal theory, whether tort
(including negligence), contract, or otherwise, shall any Contributor,
or anyone who distributes Covered Software as permitted above, be
liable to You for any direct, indirect, special, incidental, or
consequential damages of any character including, without limitation,
damages for lost profits, loss of goodwill, work stoppage, computer
failure or malfunction, or any and all other commercial damages or
losses, even if such party shall have been informed of the possibility
of such damages. This limitation of liability shall not apply to
liability for death or personal injury resulting from such party's
negligence to the extent applicable law prohibits such limitation.
Some jurisdictions do not allow the exclusion or limitation of
incidental or consequential damages, so this exclusion and limitation
may not apply to You.

************************************************************************

8. Litigation
-------------

Any litigation relating to this License may be brought only in the
courts of a jurisdiction where the defendant maintains its principal
place of business and such litigation shall be governed by laws of that
jurisdiction, without reference to its conflict-of-law provisions.
Nothing in this Section shall prevent a party's ability to bring
cross-claims or counter-claims.

9. Miscellaneous
----------------

This License represents the complete agreement concerning the subject
matter hereof. If any provision of this License is held to be
unenforceable, such provision shall be reformed only to the extent
necessary to make it enforceable. Any law or regulation which provides
that the language of a contract shall be construed against the drafter
shall not be used to construe this License against a Contributor.

10. Versions of the License
---------------------------

10.1. New Versions

Mozilla Foundation is the license steward. Except as provided in Section
10.3, no one other than the license steward has the right to modify or
publish new versions of this License. Each version will be given a
distinguishing version number.

10.2. Effect of New Versions

You may distribute the Covered Software under the terms of the version
of the License under which You originally received the Covered Software,
or under the terms of any subsequent version published by the license
steward.

10.3. Modified Versions

If you create software not governed by this License, and you want to
create a new license for such software, you may create and use a
modified version of this License if you rename the license and remove
any references to the name of the license steward (except to note that
such modified license differs from this License).

10.4. Distributing Source Code Form that is Incompatible With Secondary
Licenses
If You choose to distribute Source Code Form that is Incompatible With
Secondary Licenses under the terms of this version of the License, the
notice described in Exhibit B of this License must be attached.

Exhibit A - Source Code Form License Notice
-------------------------------------------

This Source Code Form is subject to the terms of the Mozilla Public
License, v. 2.0. If a copy of the MPL was not distributed with this
file, You can obtain one at http://mozilla.org/MPL/2.0/.

If it is not possible or desirable to put the notice in a particular
file, then You may include the notice in a location (such as a LICENSE
file in a relevant directory) where a recipient would be likely to look
for such a notice.

You may add additional accurate notices of copyright ownership.

Exhibit B - "Incompatible With Secondary Licenses" Notice
---------------------------------------------------------

This Source Code Form is "Incompatible With Secondary Licenses", as
defined by the Mozilla Public License, v. 2.0.
--------------------------------------------------------------------------------