├── .gitignore
├── .travis.yml
├── CONTRIBUTING.md
├── Dockerfile
├── LICENSE
├── MANIFEST.in
├── README.md
├── docs
│   ├── CONFIG.md
│   └── HPTUNE_CONFIG.md
├── examples
│   ├── experimental
│   │   └── kfp-2
│   │       ├── config.yaml.example
│   │       ├── demo.py
│   │       ├── pipeline_from_config_demo.ipynb
│   │       └── user-input
│   │           └── preprocess
│   │               ├── Dockerfile
│   │               ├── build.sh
│   │               ├── component.yaml
│   │               └── split_train_eval.py
│   ├── getting_started_notebook.ipynb
│   ├── kfp
│   │   ├── bin
│   │   │   └── wi_setup.sh
│   │   ├── config.yaml.example
│   │   ├── demo.py
│   │   ├── hptuning_config.yaml
│   │   └── model
│   │       ├── __init__.py
│   │       ├── census_preprocess.py
│   │       └── tf_model.py
│   ├── sklearn
│   │   ├── config.yaml.example
│   │   ├── demo.py
│   │   ├── hptuning_config.yaml
│   │   └── model
│   │       ├── __init__.py
│   │       ├── census_preprocess.py
│   │       └── sklearn_model.py
│   ├── taxi
│   │   ├── sklearn
│   │   │   ├── config.yaml.example
│   │   │   ├── demo.py
│   │   │   ├── hptuning_config.yaml
│   │   │   └── model
│   │   │       ├── __init__.py
│   │   │       ├── sklearn_model.py
│   │   │       └── taxi_preprocess.py
│   │   ├── tf
│   │   │   ├── config.yaml.example
│   │   │   ├── demo.py
│   │   │   ├── hptuning_config.yaml
│   │   │   └── model
│   │   │       ├── __init__.py
│   │   │       ├── taxi_preprocess.py
│   │   │       └── tf_model.py
│   │   └── xgb
│   │       ├── config.yaml.example
│   │       ├── demo.py
│   │       ├── hptuning_config.yaml
│   │       └── model
│   │           ├── __init__.py
│   │           ├── taxi_preprocess.py
│   │           └── xgb_model.py
│   ├── tf
│   │   ├── config.yaml.example
│   │   ├── demo.py
│   │   ├── hptuning_config.yaml
│   │   └── model
│   │       ├── __init__.py
│   │       ├── census_preprocess.py
│   │       └── tf_model.py
│   └── xgboost
│       ├── config.yaml.example
│       ├── demo.py
│       ├── hptuning_config.yaml
│       └── model
│           ├── __init__.py
│           ├── census_preprocess.py
│           └── xgboost_model.py
├── ml_pipeline_gen
│   ├── __init__.py
│   ├── experimental
│   │   ├── component_lib.py
│   │   └── component_spec.yaml
│   ├── models.py
│   ├── parsers.py
│   ├── pipelines.py
│   ├── static
│   │   ├── bin
│   │   │   ├── cleanup.sh
│   │   │   └── run.local_train.sh
│   │   ├── orchestration
│   │   │   ├── __init__.py
│   │   │   └── components
│   │   │       └── list_blobs.yaml
│   │   └── trainer
│   │       ├── __init__.py
│   │       └── utils.py
│   └── templates
│       ├── experimental
│       │   ├── example_pipeline.ipynb
│       │   ├── get_tuned_params
│       │   │   ├── Dockerfile
│       │   │   ├── build.sh
│       │   │   ├── component.yaml
│       │   │   └── get_tuned_params.py
│       │   ├── hptune
│       │   │   ├── Dockerfile
│       │   │   ├── build.sh
│       │   │   ├── component.yaml
│       │   │   └── hptune.sh
│       │   ├── hptuning_config.yaml
│       │   └── kfp_pipeline_from_config.py
│       ├── kfp_pipeline.py
│       ├── setup.py
│       ├── sklearn_inputs.py
│       ├── sklearn_model.py
│       ├── sklearn_task.py
│       ├── tf_inputs.py
│       ├── tf_model.py
│       ├── tf_task.py
│       ├── xgboost_inputs.py
│       ├── xgboost_model.py
│       └── xgboost_task.py
├── setup.py
└── tests
    ├── __init__.py
    ├── integration
    │   ├── fixtures
    │   │   └── test_config.yaml
    │   └── src
    │       ├── __init__.py
    │       └── test_models.py
    ├── test_utils.py
    └── unit
        ├── __init__.py
        ├── examples
        │   ├── __init__.py
        │   ├── sklearn
        │   │   ├── __init__.py
        │   │   └── test_sklearn_model.py
        │   └── tensorflow
        │       ├── __init__.py
        │       └── test_tf_model.py
        └── src
            ├── __init__.py
            └── test_models.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Virtual envs
2 | venv/*
3 | testenv/
4 |
5 | # Generated files
6 | *.pyc
7 | models/*
8 | *.egg-info/
9 | dist/*
10 | build/*
11 | *.tar.gz
12 | config.yaml
13 | trainer/model.py
14 | trainer/task.py
15 | trainer/inputs.py
16 | orchestration/pipeline.py
17 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | branches:
2 | only:
3 | - master
4 | language: python
5 | python:
6 | - "3.6"
7 | - "3.7"
8 | # Tensorflow 1.x does not support python 3.8+
9 | install:
10 | - pip install -e .
11 | script:
12 | - python -m unittest discover -s tests/unit
13 |
--------------------------------------------------------------------------------
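The Travis config above maps to a simple local check; assuming a Python 3.6 or 3.7 environment, the same steps should run as:

```bash
# Mirror the Travis install and script phases locally.
pip install -e .
python -m unittest discover -s tests/unit
```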
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # How to Contribute
2 |
3 | We'd love to accept your patches and contributions to this project. There are
4 | just a few small guidelines you need to follow.
5 |
6 | ## Contributor License Agreement
7 |
8 | Contributions to this project must be accompanied by a Contributor License
9 | Agreement. You (or your employer) retain the copyright to your contribution;
10 | this simply gives us permission to use and redistribute your contributions as
11 | part of the project. Head over to <https://cla.developers.google.com/> to see
12 | your current agreements on file or to sign a new one.
--------------------------------------------------------------------------------
/docs/CONFIG.md:
--------------------------------------------------------------------------------
1 | ### config.yaml schema
2 | 
3 | Below schema should be used when preparing a `config.yaml` file for models using the tool.
4 | 
5 | ```
6 | project_id: [project ID]
7 | bucket_id: [GCS bucket ID]
8 | region: [GCP region to train ML Pipeline Generator models in, on AI Platform]
9 | cluster_name: [Name of GKE cluster hosting Kubeflow Pipelines]
10 | cluster_zone: [Zone in which GKE cluster is deployed]
11 | scale_tier: [compute specifications for training the model on AI Platform]
12 | runtime_version: [AI Platform Training runtime version]
13 | python_version: [Python version used in the model code for training]
14 | package_name: [name for the source distribution to be uploaded to GCS]
15 | machine_type_pred: [type of virtual machine that AI Platform Prediction uses for the nodes that serve predictions, defaults to mls1-c1-m2]
16 | 
17 | data:
18 |   schema:
19 |     - [schema for input & target features in the training data]
20 |   train: [GCS location url to upload preprocessed training data]
21 |   evaluation: [GCS location url to upload preprocessed eval data]
22 |   prediction:
23 |     input_data_paths:
24 |       - [GCS location urls for prediction input data]
25 |     input_format: [prediction input format]
26 |     output_format: [prediction output format]
27 | 
28 | model:
29 |   name: [unique model name, must start with a letter and only contain letters, numbers, and underscores]
30 |   path: [local dir path to the model.py file]
31 |   target: [target feature in training data]
32 |   metrics: [metrics to evaluate model training on, such as “accuracy”]
33 | 
34 | model_params:
35 |   input_args: [Any input params to be submitted with the job]
36 |     arg_name:
37 |       type: [data type of the arg, such as int]
38 |       help: [short description of the arg]
39 |       default: [default value of the arg]
40 |   hyperparam_config: [optional; local path to hyperparam tuning config yaml. See schema here for this config file.]
41 |   explanation: [optional; explainability features for the training job]
42 | 
43 | orchestration:
44 |   kubeflow_url: [for KFP backend; URL of preconfigured Kubeflow instance]
45 | ```
46 | 
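For concreteness, here is a minimal filled-in sketch of the schema above. The values are illustrative only (they condense the `examples/sklearn/config.yaml.example` that appears later in this dump), not defaults:

```yaml
project_id: my-project          # illustrative value, not a default
bucket_id: my-bucket
region: "us-central1"
scale_tier: "STANDARD_1"
runtime_version: "1.15"
python_version: "3.7"
package_name: "ml_pipeline_gen"
machine_type_pred: "mls1-c4-m2"

data:
  schema:
    - "age"
    - "income_bracket"
  train: "gs://my-bucket/census/data/adult.data.csv"
  evaluation: "gs://my-bucket/census/data/adult.test.csv"
  prediction:
    input_data_paths:
      - "gs://my-bucket/census/inputs/*"
    input_format: "JSON"
    output_format: "JSON"

model:
  name: census_model
  path: "model.sklearn_model"
  target: "income_bracket"

model_params:
  hyperparam_config: "hptuning_config.yaml"
```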
--------------------------------------------------------------------------------
/docs/HPTUNE_CONFIG.md:
--------------------------------------------------------------------------------
1 | ### hptune_config.yaml schema
2 | 
3 | Below schema should be used when preparing a `hptune_config.yaml` file for models using the tool. The parameters follow the Cloud AI Platform [HyperparameterSpec](https://cloud.google.com/ai-platform/training/docs/reference/rest/v1/projects.jobs#HyperparameterSpec), some of which are optional and marked as such.
4 | 
5 | ```
6 | trainingInput:
7 |   hyperparameters:
8 |     goal: [the type of goal to use for tuning, MAXIMIZE or MINIMIZE]
9 |     params: [the set of parameters to tune]
10 |       - parameterName: [unique parameter name, e.g. “learning_rate”]
11 |         type: [parameter type]
12 |         minValue: [min value for the parameter, if DOUBLE or INTEGER type]
13 |         maxValue: [max value for the parameter, if DOUBLE or INTEGER type]
14 |         scaleType: [optional; how the parameter should be scaled]
15 |     maxTrials: [optional; how many training trials should be attempted to optimize the specified hyperparameters]
16 |     maxParallelTrials: [optional; the number of training trials to run concurrently]
17 |     maxFailedTrials: [optional; the number of failed trials that need to be seen before failing the hyperparameter tuning job]
18 |     hyperparameterMetricTag: [optional; TensorFlow summary tag name to use for optimizing trials]
19 |     resumePreviousJobId: [optional; the prior hyperparameter tuning job id that users hope to continue with]
20 |     enableTrialEarlyStopping: [optional; indicates if the hyperparameter tuning job enables auto trial early stopping]
21 |     algorithm: [optional; search algorithm to be used]
22 | ```
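A filled-in instance of this schema, mirroring the `examples/sklearn/hptuning_config.yaml` that appears later in this repo (tuning a single regularization parameter `C`):

```yaml
trainingInput:
  hyperparameters:
    goal: MAXIMIZE
    hyperparameterMetricTag: score
    maxTrials: 2
    maxParallelTrials: 2
    enableTrialEarlyStopping: TRUE
    params:
      - parameterName: C
        type: DOUBLE
        minValue: .001
        maxValue: 10
        scaleType: UNIT_LOG_SCALE
```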
--------------------------------------------------------------------------------
/examples/experimental/kfp-2/config.yaml.example:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | # Config file for AI Pipeline.
16 | 
17 | output_package: ./caipa-output
18 | project_id: gcp-demo-2-262319
19 | bucket_id: poc-bucket-0120
20 | region: us-central1
21 | runtime_version: "1.10"
22 | python_version: 3.6
23 | 
24 | model:
25 |   name: loan_delinq_v1
26 |   path:
27 | 
28 | # Search Path for pre-built components
29 | github_component_url: https://raw.githubusercontent.com/kubeflow/pipelines/3f4b80127f35e40760eeb1813ce1d3f641502222/components/gcp/
30 | kfp_deployment_url: https://54f49491f869f31e-dot-us-central2.pipelines.googleusercontent.com
31 | 
32 | preprocess:
33 |   component: user-input/preprocess
34 |   component_args:
35 |     - name: project_id
36 |       default: ''
37 |     - name: dataset_bucket
38 |       default: poc-bucket-0120
39 | 
40 | hptune:
41 |   component: AUTO
42 |   config: gs://poc-bucket-0120/hpconfig.yaml
43 |   args:
44 |     - name: output_dir
45 |       default: gs://poc-bucket-0120/hptune
46 |     - name: input_bucket
47 |       default: gs://poc-bucket-0120
48 |     - name: eval_steps
49 |       default: 10
50 |     - name: train_examples
51 |       default: 200
52 | 
53 | get_tuned_params:
54 |   component: AUTO
55 | 
56 | train:
57 |   python_module: trainer.task
58 |   python_package: gs://poc-bucket-0120/trainer.tar.gz
59 |   model_out_prefix: /export/exporter
60 |   component: ml_engine/train
61 |   args:
62 |     - name: output_dir
63 |       default: gs://poc-bucket-0120/train
64 |     - name: input_bucket
65 |       default: gs://poc-bucket-0120
66 |     - name: eval_steps
67 |       default: 10
68 |     - name: train_examples
69 |       default: 2000
70 | 
71 | deploy:
72 |   component: ml_engine/deploy
73 |   model_id: Loand_Delinq
74 |   version_id: v1.0
75 | 
--------------------------------------------------------------------------------
/examples/experimental/kfp-2/demo.py:
--------------------------------------------------------------------------------
1 | # python3
2 | # Copyright 2020 Google Inc. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Demo for KubeFlow Pipelines."""
16 | from ml_pipeline_gen.pipelines import KfpPipeline
17 | 
18 | 
19 | def main():
20 |     config = './config.yaml'
21 |     pipeline = KfpPipeline(config=config)
22 |     # Review the components
23 |     pipeline.list_components()
24 |     # define pipeline structure
25 |     preprocess = pipeline.add_component('preprocess')
26 |     hptune = pipeline.add_component('hptune', parent=preprocess)
27 |     get_best_params = pipeline.add_component('get_tuned_params', parent=hptune)
28 |     train = pipeline.add_component('train', parent=get_best_params)
29 |     deploy = pipeline.add_component('deploy', parent=train)
30 | 
31 |     pipeline.print_structure()
32 |     pipeline.generate_pipeline_from_config()
33 | 
34 | 
35 | if __name__ == '__main__':
36 |     main()
37 | 
--------------------------------------------------------------------------------
/examples/experimental/kfp-2/pipeline_from_config_demo.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "code",
5 |    "execution_count": 3,
6 |    "metadata": {},
7 |    "outputs": [],
8 |    "source": [
9 |     "from ml_pipeline_gen.pipelines import KfpPipeline"
10 |    ]
11 |   },
12 |   {
13 |    "cell_type": "code",
14 |    "execution_count": 4,
15 |    "metadata": {},
16 |    "outputs": [],
17 |    "source": [
18 |     "config = \"./config.yaml\"\n",
19 |     "pipeline = KfpPipeline(config=config)\n",
20 |     "#pipeline.print_structure()"
21 |    ]
22 |   },
23 |   {
24 |    "cell_type": "code",
25 |    "execution_count": 5,
26 |    "metadata": {},
27 |    "outputs": [
28 |     {
29 |      "name": "stdout",
30 |      "output_type": "stream",
31 |      "text": [
32 |       "['preprocess', 'hptune', 'get_tuned_params', 'train', 'deploy']\n"
33 |      ]
34 |     }
35 |    ],
36 |    "source": [
37 |     "# Review the components\n",
38 |     "pipeline.list_components()"
39 |    ]
40 |   },
41 |   {
42 |    "cell_type": "code",
43 |    "execution_count": 6,
44 |    "metadata": {},
45 |    "outputs": [],
46 |    "source": [
47 |     "# define pipeline structure\n",
48 |     "preprocess = pipeline.add_component('preprocess')\n",
49 |     "hptune = pipeline.add_component('hptune', parent=preprocess)\n",
50 |     "get_best_params= pipeline.add_component('get_tuned_params', parent=hptune)\n",
51 |     "train = pipeline.add_component('train', parent=get_best_params)\n",
52 |     "deploy = pipeline.add_component('deploy', parent=train)"
53 |    ]
54 |   },
55 |   {
56 |    "cell_type": "code",
57 |    "execution_count": 7,
58 |    "metadata": {},
59 |    "outputs": [],
60 |    "source": [
61 |     "# Generate kubeflow pipeline\n",
62 |     "pipeline.generate_pipeline_from_config()"
63 |    ]
64 |   },
65 |   {
66 |    "cell_type": "code",
67 |    "execution_count": null,
68 |    "metadata": {},
69 |    "outputs": [],
70 |    "source": []
71 |   }
72 |  ],
73 |  "metadata": {
74 |   "kernelspec": {
75 |    "display_name": "Python 3",
76 |    "language": "python",
77 |    "name": "python3"
78 |   },
79 |   "language_info": {
80 |    "codemirror_mode": {
81 |     "name": "ipython",
82 |     "version": 3
83 |    },
84 |    "file_extension": ".py",
85 |    "mimetype": "text/x-python",
86 |    "name": "python",
87 |    "nbconvert_exporter": "python",
88 |    "pygments_lexer": "ipython3",
89 |    "version": "3.6.10"
90 |   }
91 |  },
92 |  "nbformat": 4,
93 |  "nbformat_minor": 4
94 | }
95 | 
--------------------------------------------------------------------------------
/examples/experimental/kfp-2/user-input/preprocess/Dockerfile:
--------------------------------------------------------------------------------
1 | # Copyright 2018 The Kubeflow Authors
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | FROM gcr.io/ml-pipeline/ml-pipeline-dataflow-tft:latest
16 | 
17 | RUN pip install -U scipy
18 | 
19 | RUN pip install -U numpy
20 | 
21 | RUN pip install -U scikit-learn
22 | 
23 | RUN pip install pandas
24 | 
25 | RUN pip install --upgrade google-cloud-storage
26 | 
27 | COPY . /
28 | 
--------------------------------------------------------------------------------
/examples/experimental/kfp-2/user-input/preprocess/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash -e
2 | # Copyright 2018 Google LLC
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | 
17 | if [ -z "$1" ]; then
18 |   PROJECT_ID=$(gcloud config config-helper --format "value(configuration.properties.core.project)")
19 | else
20 |   PROJECT_ID=$1
21 | fi
22 | 
23 | if [ -z "$2" ]; then
24 |   TAG_NAME="latest"
25 | else
26 |   TAG_NAME="$2"
27 | fi
28 | 
29 | CONTAINER_NAME=loan-pipeline-trainevalsplit
30 | 
31 | docker build -t ${CONTAINER_NAME} .
32 | docker tag ${CONTAINER_NAME} gcr.io/${PROJECT_ID}/${CONTAINER_NAME}:${TAG_NAME}
33 | docker push gcr.io/${PROJECT_ID}/${CONTAINER_NAME}:${TAG_NAME}
34 | 
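Invocation follows from the argument handling above; both arguments are optional and the values here are placeholders:

```bash
# Build and push with an explicit project and tag.
./build.sh my-gcp-project v1

# Or rely on the gcloud default project and the "latest" tag.
./build.sh
```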
--------------------------------------------------------------------------------
/examples/experimental/kfp-2/user-input/preprocess/component.yaml:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | # Component Descriptor for Split-train-eval
16 | name: Split-train-Eval - Preprocess
17 | description: Splits a given input.csv into train and eval csv files
18 | 
19 | inputs:
20 | - {name: project_id, type: String}
21 | - {name: dataset_bucket, type: String}
22 | 
23 | #outputs:
24 | #- {name: train, type: XGBoost model, help: Trained XGBoost model}
25 | 
26 | implementation:
27 |   container:
28 |     image: gcr.io/gcp-demo-2-262319/loan-pipeline-trainevalsplit:latest
29 |     command: [
30 |       python, /split_train_eval.py,
31 |       --project_id, {inputValue: project_id},
32 |       --dataset_bucket, {inputValue: dataset_bucket},
33 |     ]
34 | 
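A sketch of how a spec like this is typically consumed with the KFP v1 SDK; the pipeline name and the compile step are illustrative assumptions, not part of this repo:

```python
import kfp
import kfp.components as comp

# Turn the component spec into a reusable op factory.
preprocess_op = comp.load_component_from_file("component.yaml")


@kfp.dsl.pipeline(name="preprocess-demo")
def demo_pipeline(project_id: str, dataset_bucket: str):
    # Inputs declared in the spec become keyword arguments.
    preprocess_op(project_id=project_id, dataset_bucket=dataset_bucket)


# Compile to an archive that can be uploaded to a KFP deployment.
kfp.compiler.Compiler().compile(demo_pipeline, "preprocess_demo.tar.gz")
```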
--------------------------------------------------------------------------------
/examples/experimental/kfp-2/user-input/preprocess/split_train_eval.py:
--------------------------------------------------------------------------------
1 | # python3
2 | # Copyright 2020 Google Inc. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Splits an input CSV in GCS into train and eval CSV files."""
16 | import pandas as pd
17 | from sklearn.model_selection import train_test_split
18 | from sklearn.utils import shuffle
19 | from io import BytesIO
20 | from google.cloud import storage
21 | import argparse
22 | 
23 | 
24 | def obtain_train_eval(project_id, bucket_name):
25 |     """Reads the source CSV from GCS, splits it, and writes train/eval files."""
26 |     # # All of the data is in a file called Step10_Final_dataset.csv
27 |     print('reading the data file from gcs...')
28 |     print('Project-ID: %s ' %(project_id))
29 |     print('Bucket-ID: %s ' %(bucket_name))
30 | 
31 | 
32 |     # The following was derived from the contents of this reply:
33 |     # https://stackoverflow.com/a/50201179
34 |     storage_client = storage.Client(project=project_id, credentials=None)
35 |     bucket = storage_client.get_bucket(bucket_name)
36 |     blob = bucket.blob('input/Step10_Final_dataset.csv')
37 | 
38 |     byte_stream = BytesIO()
39 |     blob.download_to_file(byte_stream)
40 |     byte_stream.seek(0)
41 |     df = pd.read_csv(byte_stream)
42 | 
43 |     # Rearrange the columns below into the order
44 |     # expected by the estimator.
45 |     print('rearranging data...')
46 |     key_column = 'LOAN_SEQUENCE_NUMBER'
47 |     label_column = 'TARGET'
48 |     bool_cols = []
49 |     int_cols = ['credit_score',
50 |                 'mortgage_insurance_percentage',
51 |                 'Number_of_units',
52 |                 'cltv',
53 |                 'original_upb',
54 |                 'ltv',
55 |                 'original_loan_term',
56 |                 'number_of_borrowers',
57 |                 'min_CURRENT_DEFERRED_UPB']
58 |     str_cols = ['first_time_home_buyer_flag',
59 |                 'occupancy_status',
60 |                 'channel',
61 |                 'property_state',
62 |                 'property_type',
63 |                 'loan_purpose',
64 |                 'seller_name',
65 |                 'service_name']
66 |     # str_nuniques = [2, 3, 3, 52, 5, 2, 20, 24]
67 |     float_cols = ['metropolitan_division',
68 |                   'original_interest_rate',
69 |                   'min_CURRENT_ACTUAL_UPB',
70 |                   'max_CURRENT_ACTUAL_UPB',
71 |                   'Range_CURRENT_ACTUAL_UPB',
72 |                   'stdev_CURRENT_ACTUAL_UPB',
73 |                   'mode_CURRENT_ACTUAL_UPB',
74 |                   'average_CURRENT_ACTUAL_UPB',
75 |                   'max_CURRENT_DEFERRED_UPB',
76 |                   'Range_CURRENT_DEFERRED_UPB',
77 |                   'mode_CURRENT_DEFERRED_UPB',
78 |                   'average_CURRENT_DEFERRED_UPB',
79 |                   'stdev_CURRENT_DEFERRED_UPB',
80 |                   'min_CURRENT_INTEREST_RATE',
81 |                   'max_CURRENT_INTEREST_RATE',
82 |                   'Range_CURRENT_INTEREST_RATE',
83 |                   'mode_CURRENT_INTEREST_RATE',
84 |                   'stdev_CURRENT_INTEREST_RATE',
85 |                   'average_CURRENT_INTEREST_RATE',
86 |                   'PREFINAL_LOAN_DELINQUENCY_STATUS',
87 |                   'frequency_0',
88 |                   'frequency_1',
89 |                   'frequency_2',
90 |                   'frequency_3',
91 |                   'Recency_0',
92 |                   'Recency_1',
93 |                   'Recency_2',
94 |                   'Recency_3']
95 |     # DEFAULTS = [[''] for col in bool_cols] + \
96 |     #            [[0] for col in int_cols] + \
97 |     #            [[0.0] for col in float_cols] + \
98 |     #            [[''] for col in str_cols] + [[''], [0]]
99 |     csv_columns = bool_cols + int_cols + float_cols + \
100 |         str_cols + [key_column, label_column]
101 |     traindata = df[csv_columns]
102 | 
103 |     # Here, we'll split with a small test size so as to
104 |     # allow our model to train on more data
105 |     print('splitting...')
106 |     x_train, x_test, y_train, y_test = train_test_split(
107 |         traindata.drop(label_column, axis=1), traindata[label_column],
108 |         stratify=traindata[label_column], shuffle=True, test_size=0.1)
109 |     traindf = pd.concat([x_train, y_train], axis=1)
110 |     evaldf = pd.concat([x_test, y_test], axis=1)
111 | 
112 |     alld = pd.concat([traindf, evaldf])
113 |     strcols = [col for col in alld.columns if alld[col].dtype == 'object']
114 |     if key_column in strcols:
115 |         strcols.remove(key_column)
116 |     alld = pd.get_dummies(alld, columns=strcols)
117 | 
118 |     divline = traindf.shape[0]
119 |     traindf_wdummies = alld.iloc[:divline, :]
120 |     # not necessary only cmle but can be used to
121 |     # test performance if so desired
122 |     evaldf_wdummies = alld.iloc[divline:, :]
123 |     del alld
124 | 
125 |     print('Undersample for XG Boost....')
126 | 
127 |     traindfu_wdummies = pd.concat([
128 |         traindf_wdummies[traindf_wdummies[label_column] == 0].sample(
129 |             frac=0.01),
130 |         traindf_wdummies[traindf_wdummies[label_column] == 1].sample(
131 |             frac=0.55),
132 |         traindf_wdummies[traindf_wdummies[label_column] > 1]])
133 |     traindfu_wdummies = shuffle(traindfu_wdummies)
134 | 
135 |     # traindfu_wdummies.drop(key_column, axis=1)
136 |     #     .to_csv('xgb_train.csv', index=False)
137 |     # evaldf_wdummies.drop([key_column,label_column], axis=1)
138 |     #     .to_csv('xgb_eval.csv', index=False)
139 | 
140 |     # Since the results are small enough to fit in a single
141 |     # well-provisioned VM, we'll write the results to csv files locally
142 |     # then move them to gcs so we have two copies to work
143 |     # with as we please
144 | 
145 |     print('writing tf model files...')
146 |     write_file(
147 |         storage_client,
148 |         traindf[csv_columns],
149 |         bucket_name,
150 |         'train.csv',
151 |         header=False)
152 |     write_file(
153 |         storage_client,
154 |         evaldf[csv_columns],
155 |         bucket_name,
156 |         'eval.csv',
157 |         header=False)
158 | 
159 |     # traindf[csv_columns].to_csv('train.csv', index=False, header=False)
160 |     # evaldf[csv_columns].to_csv('eval.csv', index=False, header=False)
161 | 
162 |     print('writing XG Boost model files...')
163 |     write_file(
164 |         storage_client,
165 |         traindfu_wdummies.drop(key_column, axis=1),
166 |         bucket_name,
167 |         'xgb_train.csv',
168 |         header=True)
169 |     write_file(
170 |         storage_client,
171 |         evaldf_wdummies.drop([key_column, label_column], axis=1),
172 |         bucket_name,
173 |         'xgb_eval.csv',
174 |         header=True)
175 | 
176 |     with open('./output.txt', 'w') as output_file:
177 |         output_file.write(bucket_name)
178 |     print('Done!')
179 | 
180 | 
181 | def write_file(storage_client,
182 |                df,
183 |                bucket_name,
184 |                destination_file_name,
185 |                header):
186 |     """Writes a dataframe as a CSV blob to the bucket."""
187 |     df_str = df.to_csv(index=False, header=header)
188 |     # storage_client = storage.Client()
189 |     bucket = storage_client.get_bucket(bucket_name)
190 |     blob = bucket.blob('output/' + destination_file_name)
191 |     blob.upload_from_string(df_str)
192 | 
193 | 
194 | if __name__ == '__main__':
195 |     parser = argparse.ArgumentParser()
196 |     parser.add_argument('--project_id',
197 |                         type=str,
198 |                         required=True,
199 |                         help='The GCP project_id containing the source file')
200 |     parser.add_argument('--dataset_bucket',
201 |                         type=str,
202 |                         required=True,
203 |                         help='Bucket to store outputs.')
204 |     args = parser.parse_args()
205 | 
206 |     obtain_train_eval(args.project_id, args.dataset_bucket)
207 | 
--------------------------------------------------------------------------------
/examples/kfp/bin/wi_setup.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Copyright 2020 Google LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 | # Script to set up Google service accounts and workload identity bindings for a
18 | # Kubeflow Pipelines (KFP) standalone deployment.
19 | #
20 | # The script checks if the GKE cluster has Workload Identity enabled and
21 | # configured with a custom label, and if not, enables it and updates the label.
22 | #
23 | # Adapted for ML Pipeline Generator from https://github.com/kubeflow/pipelines/blob/master/manifests/kustomize/gcp-workload-identity-setup.sh
24 | #
25 | # What the script configures:
26 | # 1. Workload Identity for the cluster.
27 | # 2. Google service accounts (GSAs): $SYSTEM_GSA and $USER_GSA.
28 | # 3. Service account IAM policy bindings.
29 | # 4. Kubernetes service account annotations.
30 | #
31 | # Note: Since the node pool is updated with WI, a new KFP hostname is generated.
32 | #
33 | # Requirements:
34 | # 1. gcloud set up in the environment calling the script
35 | # 2. KFP is deployed on a GKE cluster
36 | set -e
37 | 
38 | # Cluster vars
39 | PROJECT_ID=$1
40 | CLUSTER_NAME=$2
41 | ZONE=$3
42 | NAMESPACE=$4
43 | 
44 | echo "Workload Identity has not been provisioned for "${CLUSTER_NAME}" ("${ZONE}"), enabling it now..."
45 | 
46 | # Google service Account (GSA)
47 | SYSTEM_GSA=$CLUSTER_NAME-kfp-system
48 | USER_GSA=$CLUSTER_NAME-kfp-user
49 | 
50 | # Kubernetes Service Account (KSA)
51 | SYSTEM_KSA=(ml-pipeline-ui ml-pipeline-visualizationserver)
52 | USER_KSA=(pipeline-runner default)
53 | 
54 | gcloud container clusters get-credentials $CLUSTER_NAME \
55 |   --zone=$ZONE
56 | 
57 | gcloud container clusters update $CLUSTER_NAME \
58 |   --zone=$ZONE \
59 |   --workload-pool="${PROJECT_ID}".svc.id.goog
60 | 
61 | gcloud beta container node-pools update default-pool \
62 |   --cluster=$CLUSTER_NAME \
63 |   --zone=$ZONE \
64 |   --max-surge-upgrade=3 \
65 |   --max-unavailable-upgrade=0
66 | 
67 | gcloud container node-pools update default-pool \
68 |   --cluster=$CLUSTER_NAME \
69 |   --zone=$ZONE \
70 |   --workload-metadata=GKE_METADATA
71 | 
72 | echo "Creating Google Service Accounts..."
73 | function create_gsa_if_not_present {
74 |   local name=${1}
75 |   local already_present=$(gcloud iam service-accounts list --filter='name:'$name'' --format='value(name)')
76 |   if [ -n "$already_present" ]; then
77 |     echo "Service account $name already exists"
78 |   else
79 |     gcloud iam service-accounts create $name
80 |   fi
81 | }
82 | 
83 | create_gsa_if_not_present $SYSTEM_GSA
84 | create_gsa_if_not_present $USER_GSA
85 | 
86 | # Add iam policy bindings to grant project permissions to these GSAs.
87 | gcloud projects add-iam-policy-binding $PROJECT_ID \
88 |   --member="serviceAccount:$SYSTEM_GSA@$PROJECT_ID.iam.gserviceaccount.com" \
89 |   --role="roles/editor"
90 | gcloud projects add-iam-policy-binding $PROJECT_ID \
91 |   --member="serviceAccount:$USER_GSA@$PROJECT_ID.iam.gserviceaccount.com" \
92 |   --role="roles/editor"
93 | 
94 | # Bind KSA to GSA through workload identity.
95 | function bind_gsa_and_ksa {
96 |   local gsa=${1}
97 |   local ksa=${2}
98 | 
99 |   gcloud iam service-accounts add-iam-policy-binding $gsa@$PROJECT_ID.iam.gserviceaccount.com \
100 |     --member="serviceAccount:$PROJECT_ID.svc.id.goog[$NAMESPACE/$ksa]" \
101 |     --role="roles/iam.workloadIdentityUser" \
102 |     > /dev/null
103 | 
104 |   kubectl annotate serviceaccount \
105 |     --namespace $NAMESPACE \
106 |     --overwrite \
107 |     $ksa iam.gke.io/gcp-service-account=$gsa@$PROJECT_ID.iam.gserviceaccount.com
108 | 
109 |   echo "* Bound KSA $ksa to GSA $gsa"
110 | }
111 | 
112 | echo "Binding each kfp system KSA to $SYSTEM_GSA"
113 | for ksa in ${SYSTEM_KSA[@]}; do
114 |   bind_gsa_and_ksa $SYSTEM_GSA $ksa
115 | done
116 | 
117 | echo "Binding each kfp user KSA to $USER_GSA"
118 | for ksa in ${USER_KSA[@]}; do
119 |   bind_gsa_and_ksa $USER_GSA $ksa
120 | done
121 | 
122 | gcloud container clusters update $CLUSTER_NAME \
123 |   --zone=$ZONE \
124 |   --update-labels mlpg_wi_auth=true
125 | 
126 | RED='\033[0;31m'
127 | COLOR_RESET='\033[0m'
128 | echo -e "${RED}Workload Identity has been enabled, and KFP dashboard URL has been updated. Please update the hostname in config.yaml for future runs.${COLOR_RESET}"
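The script takes four positional arguments (project, cluster, zone, and the KFP namespace). A typical invocation, with placeholder values ("kubeflow" is a common namespace for a KFP standalone install, but check your deployment):

```bash
bash wi_setup.sh my-project my-kfp-cluster us-central1-a kubeflow
```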
--------------------------------------------------------------------------------
/examples/kfp/config.yaml.example:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | # Config file for ML Pipeline Generator.
16 | 
17 | project_id: [PROJECT ID]
18 | bucket_id: [BUCKET ID]
19 | region: "us-central1"
20 | cluster_name: [GKE CLUSTER NAME]
21 | cluster_zone: [GKE CLUSTER ZONE]
22 | scale_tier: "STANDARD_1"
23 | runtime_version: "1.15"
24 | python_version: "3.7"
25 | package_name: "ml_pipeline_gen"
26 | machine_type_pred: "n1-standard-4"
27 | 
28 | data:
29 |   schema:
30 |     - "age"
31 |     - "workclass"
32 |     - "education_num"
33 |     - "marital_status"
34 |     - "occupation"
35 |     - "relationship"
36 |     - "race"
37 |     - "capital_gain"
38 |     - "capital_loss"
39 |     - "hours_per_week"
40 |     - "native_country"
41 |     - "income_bracket"
42 |   train: "gs://[BUCKET ID]/[MODEL NAME]/data/adult.data.csv"
43 |   evaluation: "gs://[BUCKET ID]/[MODEL NAME]/data/adult.test.csv"
44 |   prediction:
45 |     input_data_paths:
46 |       - "gs://[BUCKET ID]/[MODEL NAME]/inputs/*"
47 |     input_format: "JSON"
48 |     output_format: "JSON"
49 | 
50 | model:
51 |   # Name must start with a letter and only contain letters, numbers, and
52 |   # underscores.
53 |   name: [MODEL NAME]
54 |   path: "model.tf_model"
55 |   target: "income_bracket"
56 |   metrics:
57 |     - "accuracy"
58 | 
59 | model_params:
60 |   input_args:
61 |     first_layer_size:
62 |       type: "int"
63 |       help: "Size of the NN first layer."
64 |       default: 50
65 |     num_layers:
66 |       type: "int"
67 |       help: "Number of layers in the NN."
68 |       default: 5
69 |     max_steps:
70 |       default: 1000
71 |   # Relative path.
72 |   hyperparam_config: "hptuning_config.yaml"
73 |   explain_output:
74 |     explain_type: "sampledShapleyAttribution"
75 |     explain_param:
76 |       name: "numPaths"
77 |       value: 40
78 | 
79 | orchestration:
80 |   host: [KFP DASHBOARD URL]
81 | 
--------------------------------------------------------------------------------
/examples/kfp/demo.py:
--------------------------------------------------------------------------------
1 | # python3
2 | # Copyright 2020 Google Inc. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Demo for KubeFlow Pipelines."""
16 | import json
17 | import os
18 | 
19 | from ml_pipeline_gen.models import TFModel
20 | from ml_pipeline_gen.pipelines import KfpPipeline
21 | from model.census_preprocess import load_data
22 | 
23 | 
24 | def _upload_data_to_gcs(model):
25 |     """Calls the preprocessing fn which uploads train/eval data to GCS."""
26 |     load_data(model.data["train"], model.data["evaluation"])
27 | 
28 | 
29 | # TODO(humichael): See if there's a way to support csv batch predicts.
30 | def _upload_input_data_to_gcs(model, data):
31 |     input_path = "tf_input_data.json"
32 |     with open(input_path, "w+") as f:
33 |         for features in data:
34 |             f.write(json.dumps(features) + "\n")
35 |     model.upload_pred_input_data(input_path)
36 |     os.remove(input_path)
37 | 
38 | 
39 | # pylint: disable=g-import-not-at-top
40 | def main():
41 |     config = "config.yaml"
42 |     model = TFModel(config)
43 |     model.generate_files()
44 |     _upload_data_to_gcs(model)
45 |     pipeline = KfpPipeline(model)
46 | 
47 |     # preprocess and upload dataset to expected location.
48 |     load_data(model.data["train"], model.data["evaluation"])
49 | 
50 |     # define pipeline structure
51 |     p = pipeline.add_train_component()
52 |     pipeline.add_deploy_component(parent=p)
53 |     pipeline.add_predict_component(parent=p)
54 |     pipeline.print_structure()
55 | 
56 |     pipeline.generate_pipeline()
57 | 
58 |     # Create batch prediction data in GCS.
59 |     pred_input = [{
60 |         "age": 0.02599666,
61 |         "workclass": 6,
62 |         "education_num": 1.1365801,
63 |         "marital_status": 4,
64 |         "occupation": 0,
65 |         "relationship": 1,
66 |         "race": 4,
67 |         "capital_gain": 0.14693314,
68 |         "capital_loss": -0.21713187,
69 |         "hours_per_week": -0.034039237,
70 |         "native_country": 38,
71 |         "income_bracket": 0,
72 |     }]
73 |     _upload_input_data_to_gcs(model, pred_input)
74 | 
75 |     # Run the pipeline.
76 |     # pylint: disable=import-outside-toplevel
77 |     from orchestration import pipeline as kfp_pipeline
78 |     kfp_pipeline.main()
79 | 
80 | 
81 | if __name__ == "__main__":
82 |     main()
83 | 
--------------------------------------------------------------------------------
/examples/kfp/hptuning_config.yaml:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | trainingInput:
15 |   hyperparameters:
16 |     hyperparameterMetricTag: accuracy
17 |     goal: MAXIMIZE
18 |     maxTrials: 4
19 |     maxParallelTrials: 2
20 |     enableTrialEarlyStopping: True
21 |     params:
22 |       - parameterName: first_layer_size
23 |         type: INTEGER
24 |         minValue: 50
25 |         maxValue: 500
26 |         scaleType: UNIT_LINEAR_SCALE
27 |       - parameterName: num_layers
28 |         type: INTEGER
29 |         minValue: 1
30 |         maxValue: 15
31 |         scaleType: UNIT_LINEAR_SCALE
32 | 
--------------------------------------------------------------------------------
/examples/kfp/model/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ml-pipeline-generator-python/b0508160c1752a149bdc5c862560387fd9c65d77/examples/kfp/model/__init__.py
--------------------------------------------------------------------------------
/examples/kfp/model/census_preprocess.py:
--------------------------------------------------------------------------------
1 | # python3
2 | # Copyright 2019 Google Inc. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Downloads and preprocesses the census dataset.
16 | 
17 | This example comes from the cloudml-samples keras demo.
18 | github.com/GoogleCloudPlatform/cloudml-samples/blob/master/census/tf-keras
19 | """
20 | from __future__ import absolute_import
21 | from __future__ import division
22 | from __future__ import print_function
23 | 
24 | import os
25 | from six.moves import urllib
26 | import tempfile
27 | 
28 | import numpy as np
29 | import pandas as pd
30 | import tensorflow.compat.v1 as tf
31 | 
32 | 
33 | DATA_DIR = os.path.join(tempfile.gettempdir(), "census_data")
34 | DATA_URL = ("https://storage.googleapis.com/cloud-samples-data/ai-platform"
35 |             + "/census/data/")
36 | TRAINING_FILE = "adult.data.csv"
37 | EVAL_FILE = "adult.test.csv"
38 | TRAINING_URL = os.path.join(DATA_URL, TRAINING_FILE)
39 | EVAL_URL = os.path.join(DATA_URL, EVAL_FILE)
40 | 
41 | _CSV_COLUMNS = [
42 |     "age", "workclass", "fnlwgt", "education", "education_num",
43 |     "marital_status", "occupation", "relationship", "race", "gender",
44 |     "capital_gain", "capital_loss", "hours_per_week", "native_country",
45 |     "income_bracket",
46 | ]
47 | _LABEL_COLUMN = "income_bracket"
48 | UNUSED_COLUMNS = ["fnlwgt", "education", "gender"]
49 | 
50 | _CATEGORICAL_TYPES = {
51 |     "workclass": pd.api.types.CategoricalDtype(categories=[
52 |         "Federal-gov", "Local-gov", "Never-worked", "Private", "Self-emp-inc",
53 |         "Self-emp-not-inc", "State-gov", "Without-pay"
54 |     ]),
55 |     "marital_status": pd.api.types.CategoricalDtype(categories=[
56 |         "Divorced", "Married-AF-spouse", "Married-civ-spouse",
57 |         "Married-spouse-absent", "Never-married", "Separated", "Widowed"
58 |     ]),
59 |     "occupation": pd.api.types.CategoricalDtype([
60 |         "Adm-clerical", "Armed-Forces", "Craft-repair", "Exec-managerial",
61 |         "Farming-fishing", "Handlers-cleaners", "Machine-op-inspct",
62 |         "Other-service", "Priv-house-serv", "Prof-specialty", "Protective-serv",
63 |         "Sales", "Tech-support", "Transport-moving"
64 |     ]),
65 |     "relationship": pd.api.types.CategoricalDtype(categories=[
66 |         "Husband", "Not-in-family", "Other-relative", "Own-child", "Unmarried",
67 |         "Wife"
68 |     ]),
69 |     "race": pd.api.types.CategoricalDtype(categories=[
70 |         "Amer-Indian-Eskimo", "Asian-Pac-Islander", "Black", "Other", "White"
71 |     ]),
72 |     "native_country": pd.api.types.CategoricalDtype(categories=[
73 |         "Cambodia", "Canada", "China", "Columbia", "Cuba", "Dominican-Republic",
74 |         "Ecuador", "El-Salvador", "England", "France", "Germany", "Greece",
75 |         "Guatemala", "Haiti", "Holand-Netherlands", "Honduras", "Hong",
76 |         "Hungary", "India", "Iran", "Ireland", "Italy", "Jamaica", "Japan",
77 |         "Laos", "Mexico", "Nicaragua", "Outlying-US(Guam-USVI-etc)", "Peru",
78 |         "Philippines", "Poland", "Portugal", "Puerto-Rico", "Scotland", "South",
79 |         "Taiwan", "Thailand", "Trinadad&Tobago", "United-States", "Vietnam",
80 |         "Yugoslavia"
81 |     ]),
82 |     "income_bracket": pd.api.types.CategoricalDtype(categories=[
83 |         "<=50K", ">50K"
84 |     ])
85 | }
86 | 
87 | 
88 | def _download_and_clean_file(filename, url):
89 |     """Downloads data from url, and makes changes to match the CSV format.
90 | 
91 |     The CSVs may use spaces after the comma delimiters (non-standard) or include
92 |     rows which do not represent well-formed examples. This function strips out
93 |     some of these problems.
94 | 
95 |     Args:
96 |       filename: filename to save url to
97 |       url: URL of resource to download
98 |     """
99 |     temp_file, _ = urllib.request.urlretrieve(url)
100 |     with tf.io.gfile.GFile(temp_file, "r") as temp_file_object:
101 |         with tf.io.gfile.GFile(filename, "w") as file_object:
102 |             for line in temp_file_object:
103 |                 line = line.strip()
104 |                 line = line.replace(", ", ",")
105 |                 if not line or "," not in line:
106 |                     continue
107 |                 if line[-1] == ".":
108 |                     line = line[:-1]
109 |                 line += "\n"
110 |                 file_object.write(line)
111 |     tf.io.gfile.remove(temp_file)
112 | 
113 | 
114 | def download(data_dir):
115 |     """Downloads census data if it is not already present.
116 | 
117 |     Args:
118 |       data_dir: directory where we will access/save the census data
119 | 
120 |     Returns:
121 |       A tuple (training_file_path, eval_file_path) of local file paths.
122 |     """
123 |     tf.io.gfile.makedirs(data_dir)
124 | 
125 |     training_file_path = os.path.join(data_dir, TRAINING_FILE)
126 |     if not tf.io.gfile.exists(training_file_path):
127 |         _download_and_clean_file(training_file_path, TRAINING_URL)
128 | 
129 |     eval_file_path = os.path.join(data_dir, EVAL_FILE)
130 |     if not tf.io.gfile.exists(eval_file_path):
131 |         _download_and_clean_file(eval_file_path, EVAL_URL)
132 | 
133 |     return training_file_path, eval_file_path
134 | 
135 | 
136 | def upload(train_df, eval_df, train_path, eval_path):
137 |     train_df.to_csv(os.path.join(os.path.dirname(train_path), TRAINING_FILE),
138 |                     index=False, header=False)
139 |     eval_df.to_csv(os.path.join(os.path.dirname(eval_path), EVAL_FILE),
140 |                    index=False, header=False)
141 | 
142 | 
143 | def preprocess(dataframe):
144 |     """Converts categorical features to numeric. Removes unused columns.
145 | 
146 |     Args:
147 |       dataframe: Pandas dataframe with raw data
148 | 
149 |     Returns:
150 |       Dataframe with preprocessed data
151 |     """
152 |     dataframe = dataframe.drop(columns=UNUSED_COLUMNS)
153 | 
154 |     # Convert integer valued (numeric) columns to floating point
155 |     numeric_columns = dataframe.select_dtypes(["int64"]).columns
156 |     dataframe[numeric_columns] = dataframe[numeric_columns].astype("float32")
157 | 
158 |     # Convert categorical columns to numeric
159 |     cat_columns = dataframe.select_dtypes(["object"]).columns
160 |     dataframe[cat_columns] = dataframe[cat_columns].apply(
161 |         lambda x: x.astype(_CATEGORICAL_TYPES[x.name]))
162 |     dataframe[cat_columns] = dataframe[cat_columns].apply(
163 |         lambda x: x.cat.codes)
164 |     return dataframe
165 | 
166 | 
167 | def standardize(dataframe):
168 |     """Scales numerical columns using their means and standard deviation.
169 | 
170 |     Args:
171 |       dataframe: Pandas dataframe
172 | 
173 |     Returns:
174 |       Input dataframe with the numerical columns scaled to z-scores
175 |     """
176 |     dtypes = list(zip(dataframe.dtypes.index, map(str, dataframe.dtypes)))
177 |     for column, dtype in dtypes:
178 |         if dtype == "float32":
179 |             dataframe[column] -= dataframe[column].mean()
180 |             dataframe[column] /= dataframe[column].std()
181 |     return dataframe
182 | 
183 | 
184 | def load_data(train_path="", eval_path=""):
185 |     """Loads data into preprocessed (train_x, train_y, eval_x, eval_y) dataframes.
186 | 
187 |     Args:
188 |       train_path: Local or GCS path to upload train data to.
189 |       eval_path: Local or GCS path to upload eval data to.
190 | 
191 |     Returns:
192 |       A tuple (train_x, train_y, eval_x, eval_y), where train_x and eval_x are
193 |       Pandas dataframes with features for training and train_y and eval_y are
194 |       numpy arrays with the corresponding labels.
195 |     """
196 |     # Download Census dataset: Training and eval csv files.
197 |     training_file_path, eval_file_path = download(DATA_DIR)
198 | 
199 |     train_df = pd.read_csv(
200 |         training_file_path, names=_CSV_COLUMNS, na_values="?")
201 |     eval_df = pd.read_csv(eval_file_path, names=_CSV_COLUMNS, na_values="?")
202 | 
203 |     train_df = preprocess(train_df)
204 |     eval_df = preprocess(eval_df)
205 | 
206 |     # Split train and eval data with labels. The pop method copies and removes
207 |     # the label column from the dataframe.
208 |     train_x, train_y = train_df, train_df.pop(_LABEL_COLUMN)
209 |     eval_x, eval_y = eval_df, eval_df.pop(_LABEL_COLUMN)
210 | 
211 |     # Join train_x and eval_x to normalize on overall means and standard
212 |     # deviations. Then separate them again.
213 |     all_x = pd.concat([train_x, eval_x], keys=["train", "eval"])
214 |     all_x = standardize(all_x)
215 |     train_x, eval_x = all_x.xs("train"), all_x.xs("eval")
216 | 
217 |     # Rejoin features and labels and upload to GCS.
218 |     if train_path and eval_path:
219 |         train_df = train_x.copy()
220 |         train_df[_LABEL_COLUMN] = train_y
221 |         eval_df = eval_x.copy()
222 |         eval_df[_LABEL_COLUMN] = eval_y
223 |         upload(train_df, eval_df, train_path, eval_path)
224 | 
225 |     # Reshape label columns for use with tf.data.Dataset
226 |     train_y = np.asarray(train_y).astype("float32").reshape((-1, 1))
227 |     eval_y = np.asarray(eval_y).astype("float32").reshape((-1, 1))
228 | 
229 |     return train_x, train_y, eval_x, eval_y
230 | 
231 | 
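The module above lends itself to a quick local smoke test; with no paths supplied, `load_data()` only downloads and preprocesses, skipping the GCS upload (run from the example directory so the `model` package resolves):

```python
from model.census_preprocess import load_data

# No paths given: download + preprocess only, nothing is uploaded.
train_x, train_y, eval_x, eval_y = load_data()
print(train_x.shape, train_y.shape, eval_x.shape, eval_y.shape)
```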
--------------------------------------------------------------------------------
/examples/kfp/model/tf_model.py:
--------------------------------------------------------------------------------
1 | # python3
2 | # Copyright 2019 Google Inc. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Train a simple TF classifier for census dataset."""
16 | from __future__ import absolute_import
17 | from __future__ import division
18 | from __future__ import print_function
19 | 
20 | import argparse
21 | 
22 | import tensorflow.compat.v1 as tf
23 | 
24 | from model.census_preprocess import load_data
25 | 
26 | 
27 | def get_model(inputs, params):
28 |     """Builds a feed-forward binary classifier for the census data."""
29 |     dense = tf.keras.layers.Dense
30 |     nn = dense(params.first_layer_size, activation="relu",
31 |                kernel_initializer="uniform")(inputs)
32 |     for i in reversed(range(1, params.num_layers)):
33 |         layer_size = int(params.first_layer_size * (i / params.num_layers))
34 |         nn = dense(max(1, layer_size), activation="relu")(nn)
35 |     logits = dense(1, activation="sigmoid")(nn)
36 | 
37 |     return logits
38 | 
39 | 
40 | # TODO(humichael): create get_prediction and get_evaluation instead.
41 | def get_loss():
42 |     """The loss function to use."""
43 |     return tf.losses.sigmoid_cross_entropy
44 | 
45 | 
46 | def main():
47 |     """Trains a model locally to test get_model() and get_loss()."""
48 |     train_x, train_y, _, _ = load_data()
49 |     input_layer = tf.keras.layers.Input(shape=(train_x.shape[1],))
50 |     params = argparse.Namespace(first_layer_size=50, num_layers=5)
51 |     predictions = get_model(input_layer, params)
52 |     model = tf.keras.models.Model(inputs=input_layer, outputs=predictions)
53 |     model.compile(optimizer="adam", loss=get_loss(),
54 |                   metrics=["accuracy"])
55 |     model.fit(train_x, train_y, epochs=1)
56 | 
57 | 
58 | if __name__ == "__main__":
59 |     main()
60 | 
--------------------------------------------------------------------------------
/examples/sklearn/config.yaml.example:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | # Config file for ML Pipeline Generator.
16 | 
17 | project_id: [PROJECT ID]
18 | bucket_id: [BUCKET ID]
19 | region: "us-central1"
20 | scale_tier: "STANDARD_1"
21 | runtime_version: "1.15"
22 | python_version: "3.7"
23 | package_name: "ml_pipeline_gen"
24 | machine_type_pred: "mls1-c4-m2"
25 | 
26 | data:
27 |   schema:
28 |     - "age"
29 |     - "workclass"
30 |     - "education_num"
31 |     - "marital_status"
32 |     - "occupation"
33 |     - "relationship"
34 |     - "race"
35 |     - "capital_gain"
36 |     - "capital_loss"
37 |     - "hours_per_week"
38 |     - "native_country"
39 |     - "income_bracket"
40 |   train: "gs://[BUCKET ID]/[MODEL NAME]/data/adult.data.csv"
41 |   evaluation: "gs://[BUCKET ID]/[MODEL NAME]/data/adult.test.csv"
42 |   prediction:
43 |     input_data_paths:
44 |       - "gs://[BUCKET ID]/[MODEL NAME]/inputs/*"
45 |     input_format: "JSON"
46 |     output_format: "JSON"
47 | 
48 | model:
49 |   # Name must start with a letter and only contain letters, numbers, and
50 |   # underscores.
51 |   name: [MODEL NAME]
52 |   path: "model.sklearn_model"
53 |   target: "income_bracket"
54 | 
55 | model_params:
56 |   input_args:
57 |     C:
58 |       type: "float"
59 |       help: "Regularization parameter, must be positive."
60 |       default: 1.0
61 |   # Relative path.
62 |   hyperparam_config: "hptuning_config.yaml"
63 | 
--------------------------------------------------------------------------------
/examples/sklearn/demo.py:
--------------------------------------------------------------------------------
1 | # python3
2 | # Copyright 2020 Google Inc. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Demo for scikit-learn ML Pipeline Generator."""
16 | from ml_pipeline_gen.models import SklearnModel
17 | from model.census_preprocess import load_data
18 | 
19 | 
20 | def _upload_data_to_gcs(model):
21 |     load_data(model.data["train"], model.data["evaluation"])
22 | 
23 | 
24 | def main():
25 |     config = "config.yaml"
26 |     pred_input = [
27 |         [0.02599666, 6, 1.1365801, 4, 0, 1, 4, 0.14693314, -0.21713187,
28 |          -0.034039237, 38],
29 |     ]
30 |     model = SklearnModel(config)
31 |     model.generate_files()
32 |     _upload_data_to_gcs(model)
33 | 
34 |     job_id = model.train(tune=True)
35 |     version = model.deploy(job_id=job_id)
36 |     preds = model.online_predict(pred_input, version=version)
37 | 
38 |     print("Features: {}".format(pred_input))
39 |     print("Predictions: {}".format(preds))
40 | 
41 | 
42 | if __name__ == "__main__":
43 |     main()
44 | 
--------------------------------------------------------------------------------
/examples/sklearn/hptuning_config.yaml:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | trainingInput:
15 |   scaleTier: STANDARD_1
16 |   hyperparameters:
17 |     goal: MAXIMIZE
18 |     maxTrials: 2
19 |     maxParallelTrials: 2
20 |     hyperparameterMetricTag: score
21 |     enableTrialEarlyStopping: TRUE
22 |     params:
23 |       - parameterName: C
24 |         type: DOUBLE
25 |         minValue: .001
26 |         maxValue: 10
27 |         scaleType: UNIT_LOG_SCALE
28 | 
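The generator submits this spec itself when the demo above calls `model.train(tune=True)`; for reference, the same file can also be handed straight to AI Platform Training. A hypothetical manual submission (job name, bucket, and trainer package are placeholders, not part of this repo):

```bash
gcloud ai-platform jobs submit training census_hptune_001 \
  --config hptuning_config.yaml \
  --module-name trainer.task \
  --package-path trainer/ \
  --staging-bucket gs://my-bucket \
  --region us-central1 \
  --runtime-version 1.15 \
  --python-version 3.7
```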
"relationship": pd.api.types.CategoricalDtype(categories=[ 66 | "Husband", "Not-in-family", "Other-relative", "Own-child", "Unmarried", 67 | "Wife" 68 | ]), 69 | "race": pd.api.types.CategoricalDtype(categories=[ 70 | "Amer-Indian-Eskimo", "Asian-Pac-Islander", "Black", "Other", "White" 71 | ]), 72 | "native_country": pd.api.types.CategoricalDtype(categories=[ 73 | "Cambodia", "Canada", "China", "Columbia", "Cuba", "Dominican-Republic", 74 | "Ecuador", "El-Salvador", "England", "France", "Germany", "Greece", 75 | "Guatemala", "Haiti", "Holand-Netherlands", "Honduras", "Hong", 76 | "Hungary", "India", "Iran", "Ireland", "Italy", "Jamaica", "Japan", 77 | "Laos", "Mexico", "Nicaragua", "Outlying-US(Guam-USVI-etc)", "Peru", 78 | "Philippines", "Poland", "Portugal", "Puerto-Rico", "Scotland", "South", 79 | "Taiwan", "Thailand", "Trinadad&Tobago", "United-States", "Vietnam", 80 | "Yugoslavia" 81 | ]), 82 | "income_bracket": pd.api.types.CategoricalDtype(categories=[ 83 | "<=50K", ">50K" 84 | ]) 85 | } 86 | 87 | 88 | def _download_and_clean_file(filename, url): 89 | """Downloads data from url, and makes changes to match the CSV format. 90 | 91 | The CSVs may use spaces after the comma delimters (non-standard) or include 92 | rows which do not represent well-formed examples. This function strips out 93 | some of these problems. 94 | 95 | Args: 96 | filename: filename to save url to 97 | url: URL of resource to download 98 | """ 99 | temp_file, _ = urllib.request.urlretrieve(url) 100 | with tf.io.gfile.GFile(temp_file, "r") as temp_file_object: 101 | with tf.io.gfile.GFile(filename, "w") as file_object: 102 | for line in temp_file_object: 103 | line = line.strip() 104 | line = line.replace(", ", ",") 105 | if not line or "," not in line: 106 | continue 107 | if line[-1] == ".": 108 | line = line[:-1] 109 | line += "\n" 110 | file_object.write(line) 111 | tf.io.gfile.remove(temp_file) 112 | 113 | 114 | def download(data_dir): 115 | """Downloads census data if it is not already present. 116 | 117 | Args: 118 | data_dir: directory where we will access/save the census data 119 | 120 | Returns: 121 | foo 122 | """ 123 | tf.io.gfile.makedirs(data_dir) 124 | 125 | training_file_path = os.path.join(data_dir, TRAINING_FILE) 126 | if not tf.io.gfile.exists(training_file_path): 127 | _download_and_clean_file(training_file_path, TRAINING_URL) 128 | 129 | eval_file_path = os.path.join(data_dir, EVAL_FILE) 130 | if not tf.io.gfile.exists(eval_file_path): 131 | _download_and_clean_file(eval_file_path, EVAL_URL) 132 | 133 | return training_file_path, eval_file_path 134 | 135 | 136 | def upload(train_df, eval_df, train_path, eval_path): 137 | train_df.to_csv(os.path.join(os.path.dirname(train_path), TRAINING_FILE), 138 | index=False, header=False) 139 | eval_df.to_csv(os.path.join(os.path.dirname(eval_path), EVAL_FILE), 140 | index=False, header=False) 141 | 142 | 143 | def preprocess(dataframe): 144 | """Converts categorical features to numeric. Removes unused columns. 
145 | 146 | Args: 147 | dataframe: Pandas dataframe with raw data 148 | 149 | Returns: 150 | Dataframe with preprocessed data 151 | """ 152 | dataframe = dataframe.drop(columns=UNUSED_COLUMNS) 153 | 154 | # Convert integer valued (numeric) columns to floating point 155 | numeric_columns = dataframe.select_dtypes(["int64"]).columns 156 | dataframe[numeric_columns] = dataframe[numeric_columns].astype("float32") 157 | 158 | # Convert categorical columns to numeric 159 | cat_columns = dataframe.select_dtypes(["object"]).columns 160 | dataframe[cat_columns] = dataframe[cat_columns].apply( 161 | lambda x: x.astype(_CATEGORICAL_TYPES[x.name])) 162 | dataframe[cat_columns] = dataframe[cat_columns].apply( 163 | lambda x: x.cat.codes) 164 | return dataframe 165 | 166 | 167 | def standardize(dataframe): 168 | """Scales numerical columns using their means and standard deviation. 169 | 170 | Args: 171 | dataframe: Pandas dataframe 172 | 173 | Returns: 174 | Input dataframe with the numerical columns scaled to z-scores 175 | """ 176 | dtypes = list(zip(dataframe.dtypes.index, map(str, dataframe.dtypes))) 177 | for column, dtype in dtypes: 178 | if dtype == "float32": 179 | dataframe[column] -= dataframe[column].mean() 180 | dataframe[column] /= dataframe[column].std() 181 | return dataframe 182 | 183 | 184 | def load_data(train_path="", eval_path=""): 185 | """Loads data into preprocessed (train_x, train_y, eval_x, eval_y) dataframes. 186 | 187 | Args: 188 | train_path: Local or GCS path to upload train data to. 189 | eval_path: Local or GCS path to upload eval data to. 190 | 191 | Returns: 192 | A tuple (train_x, train_y, eval_x, eval_y), where train_x and eval_x are 193 | Pandas dataframes with features for training and train_y and eval_y are 194 | numpy arrays with the corresponding labels. 195 | """ 196 | # Download Census dataset: Training and eval csv files. 197 | training_file_path, eval_file_path = download(DATA_DIR) 198 | 199 | train_df = pd.read_csv( 200 | training_file_path, names=_CSV_COLUMNS, na_values="?") 201 | eval_df = pd.read_csv(eval_file_path, names=_CSV_COLUMNS, na_values="?") 202 | 203 | train_df = preprocess(train_df) 204 | eval_df = preprocess(eval_df) 205 | 206 | # Split train and eval data with labels. The pop method copies and removes 207 | # the label column from the dataframe. 208 | train_x, train_y = train_df, train_df.pop(_LABEL_COLUMN) 209 | eval_x, eval_y = eval_df, eval_df.pop(_LABEL_COLUMN) 210 | 211 | # Join train_x and eval_x to normalize on overall means and standard 212 | # deviations. Then separate them again. 213 | all_x = pd.concat([train_x, eval_x], keys=["train", "eval"]) 214 | all_x = standardize(all_x) 215 | train_x, eval_x = all_x.xs("train"), all_x.xs("eval") 216 | 217 | # Rejoin features and labels and upload to GCS. 
218 | if train_path and eval_path: 219 | train_df = train_x.copy() 220 | train_df[_LABEL_COLUMN] = train_y 221 | eval_df = eval_x.copy() 222 | eval_df[_LABEL_COLUMN] = eval_y 223 | upload(train_df, eval_df, train_path, eval_path) 224 | 225 | # Reshape label columns for use with tf.data.Dataset 226 | train_y = np.asarray(train_y).astype("float32").reshape((-1, 1)) 227 | eval_y = np.asarray(eval_y).astype("float32").reshape((-1, 1)) 228 | 229 | return train_x, train_y, eval_x, eval_y 230 | 231 | -------------------------------------------------------------------------------- /examples/sklearn/model/sklearn_model.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2019 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Train a simple SVM classifier.""" 16 | 17 | import argparse 18 | import numpy as np 19 | from sklearn import svm 20 | 21 | from model.census_preprocess import load_data 22 | 23 | 24 | def get_model(params): 25 | """Trains a classifier.""" 26 | classifier = svm.SVC(C=params.C) 27 | return classifier 28 | 29 | 30 | def main(): 31 | """Trains a model locally to test get_model().""" 32 | train_x, train_y, eval_x, eval_y = load_data() 33 | train_y, eval_y = [np.ravel(x) for x in [train_y, eval_y]] 34 | params = argparse.Namespace(C=1.0) 35 | model = get_model(params) 36 | model.fit(train_x, train_y) 37 | score = model.score(eval_x, eval_y) 38 | print(score) 39 | 40 | 41 | if __name__ == "__main__": 42 | main() 43 | -------------------------------------------------------------------------------- /examples/taxi/sklearn/config.yaml.example: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Config file for ML Pipeline Generator. 
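The sklearn example above relies on a convention worth making explicit: get_model(params) reads hyperparameters as attributes of a parsed-arguments object, and the flag name has to line up with both parameterName: C in hptuning_config.yaml and the input_args key in config.yaml. A minimal sketch of that wiring follows; it is hypothetical, since the real entry point is generated from the package templates, and the module path is illustrative.

```python
# Hypothetical sketch only: the generated trainer task does something
# equivalent; names here are illustrative, not copied from the templates.
import argparse

from model.sklearn_model import get_model


def parse_args():
    parser = argparse.ArgumentParser()
    # Must match parameterName in hptuning_config.yaml and the
    # input_args key in config.yaml.
    parser.add_argument("--C", type=float, default=1.0,
                        help="Regularization parameter, must be positive.")
    return parser.parse_args()


if __name__ == "__main__":
    params = parse_args()           # e.g. python -m trainer.task --C 0.37
    classifier = get_model(params)  # reads params.C, as in main() above
```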
16 | 17 | project_id: [PROJECT ID] 18 | bucket_id: [BUCKET ID] 19 | region: "us-central1" 20 | scale_tier: "STANDARD_1" 21 | runtime_version: "1.15" 22 | python_version: "3.7" 23 | package_name: "ml_pipeline_gen" 24 | machine_type_pred: "mls1-c4-m2" 25 | 26 | data: 27 | schema: 28 | - "trip_miles" 29 | - "trip_seconds" 30 | - "fare" 31 | - "trip_start_month" 32 | - "trip_start_hour" 33 | - "trip_start_day" 34 | - "pickup_community_area" 35 | - "dropoff_community_area" 36 | - "pickup_census_tract" 37 | - "dropoff_census_tract" 38 | - "pickup_latitude" 39 | - "pickup_longitude" 40 | - "dropoff_latitude" 41 | - "dropoff_longitude" 42 | - "payment_type" 43 | - "company" 44 | - "tip" 45 | train: "gs://[BUCKET ID]/[MODEL NAME]/data/taxi_trips_train.csv" 46 | evaluation: "gs://[BUCKET ID]/[MODEL NAME]/data/taxi_trips_eval.csv" 47 | prediction: 48 | input_data_paths: 49 | - "gs://[BUCKET ID]/[MODEL NAME]/inputs/*" 50 | input_format: "JSON" 51 | output_format: "JSON" 52 | 53 | model: 54 | # Name must start with a letter and only contain letters, numbers, and 55 | # underscores. 56 | name: [MODEL NAME] 57 | path: "model.sklearn_model" 58 | target: "tip" 59 | 60 | model_params: 61 | input_args: 62 | C: 63 | type: "float" 64 | help: "Regularization parameter, must be positive." 65 | default: 1.0 66 | hyperparam_config: "hptuning_config.yaml" 67 | -------------------------------------------------------------------------------- /examples/taxi/sklearn/demo.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2020 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Demo for scikit-learn ML Pipeline Generator.""" 16 | from ml_pipeline_gen.models import SklearnModel 17 | from model.taxi_preprocess import load_data 18 | 19 | 20 | def _upload_data_to_gcs(model): 21 | load_data(model.data["train"], model.data["evaluation"]) 22 | 23 | 24 | def main(): 25 | config = "config.yaml" 26 | pred_input = [ 27 | [1.0, -0.56447923, -0.5502175, -1.00234, -0.60791147, 28 | 0.38163432, 0.5846407, 0.6274534, 1.4543412, -0.09238409, 29 | 41.881, -87.633, 41.885, -87.62100000000001, 1, 3], 30 | ] 31 | model = SklearnModel(config) 32 | model.generate_files() 33 | _upload_data_to_gcs(model) 34 | 35 | job_id = model.train(tune=True) 36 | version = model.deploy(job_id=job_id) 37 | preds = model.online_predict(pred_input, version=version) 38 | 39 | print("Features: {}".format(pred_input)) 40 | print("Predictions: {}".format(preds)) 41 | 42 | 43 | if __name__ == "__main__": 44 | main() 45 | -------------------------------------------------------------------------------- /examples/taxi/sklearn/hptuning_config.yaml: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2020 Google Inc. All Rights Reserved. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | trainingInput: 16 | scaleTier: STANDARD_1 17 | hyperparameters: 18 | goal: MAXIMIZE 19 | maxTrials: 2 20 | maxParallelTrials: 2 21 | hyperparameterMetricTag: score 22 | enableTrialEarlyStopping: TRUE 23 | params: 24 | - parameterName: C 25 | type: DOUBLE 26 | minValue: .001 27 | maxValue: 10 28 | scaleType: UNIT_LOG_SCALE 29 | -------------------------------------------------------------------------------- /examples/taxi/sklearn/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ml-pipeline-generator-python/b0508160c1752a149bdc5c862560387fd9c65d77/examples/taxi/sklearn/model/__init__.py -------------------------------------------------------------------------------- /examples/taxi/sklearn/model/sklearn_model.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2019 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # python3 14 | # Copyright 2020 Google Inc. All Rights Reserved. 15 | # 16 | # Licensed under the Apache License, Version 2.0 (the "License"); 17 | # you may not use this file except in compliance with the License. 18 | # You may obtain a copy of the License at 19 | # 20 | # http://www.apache.org/licenses/LICENSE-2.0 21 | # 22 | # Unless required by applicable law or agreed to in writing, software 23 | # distributed under the License is distributed on an "AS IS" BASIS, 24 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 25 | # See the License for the specific language governing permissions and 26 | # limitations under the License. 
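A note on scaleType: UNIT_LOG_SCALE in the tuning config above: assuming the service's documented log-scale behavior, a unit sample in [0, 1] is mapped onto [minValue, maxValue] so that trials for C spread evenly across orders of magnitude rather than linearly. A small illustrative sketch, not from the repo:

```python
# Illustrative only; this mirrors (by assumption) how UNIT_LOG_SCALE spreads
# trials for C between minValue=0.001 and maxValue=10 in the config above.
import math


def from_unit_log_scale(u, min_value=0.001, max_value=10.0):
    """Maps u in [0, 1] onto [min_value, max_value] logarithmically."""
    return min_value * math.exp(u * math.log(max_value / min_value))


print([round(from_unit_log_scale(u), 3) for u in (0.0, 0.25, 0.5, 0.75, 1.0)])
# [0.001, 0.01, 0.1, 1.0, 10.0]
```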
27 | """Train a simple SVM classifier.""" 28 | 29 | import argparse 30 | import numpy as np 31 | from sklearn import svm 32 | 33 | from model.taxi_preprocess import load_data 34 | 35 | 36 | def get_model(params): 37 | """Trains a classifier.""" 38 | classifier = svm.SVC(C=params.C) 39 | return classifier 40 | 41 | 42 | def main(): 43 | """Trains a model locally to test get_model().""" 44 | train_x, train_y, eval_x, eval_y = load_data() 45 | train_y, eval_y = [np.ravel(x) for x in [train_y, eval_y]] 46 | params = argparse.Namespace(C=1.0) 47 | model = get_model(params) 48 | model.fit(train_x, train_y) 49 | score = model.score(eval_x, eval_y) 50 | print(score) 51 | 52 | 53 | if __name__ == "__main__": 54 | main() 55 | -------------------------------------------------------------------------------- /examples/taxi/tf/config.yaml.example: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Config file for ML Pipeline Generator. 16 | 17 | project_id: [PROJECT ID] 18 | bucket_id: [BUCKET ID] 19 | region: "us-central1" 20 | scale_tier: "STANDARD_1" 21 | runtime_version: "1.15" 22 | python_version: "3.7" 23 | package_name: "ml_pipeline_gen" 24 | machine_type_pred: "n1-standard-4" 25 | 26 | data: 27 | schema: 28 | - "trip_miles" 29 | - "trip_seconds" 30 | - "fare" 31 | - "trip_start_month" 32 | - "trip_start_hour" 33 | - "trip_start_day" 34 | - "pickup_community_area" 35 | - "dropoff_community_area" 36 | - "pickup_census_tract" 37 | - "dropoff_census_tract" 38 | - "pickup_latitude" 39 | - "pickup_longitude" 40 | - "dropoff_latitude" 41 | - "dropoff_longitude" 42 | - "payment_type" 43 | - "company" 44 | - "tip" 45 | train: "gs://[BUCKET ID]/[MODEL NAME]/data/taxi_trips_train.csv" 46 | evaluation: "gs://[BUCKET ID]/[MODEL NAME]/data/taxi_trips_eval.csv" 47 | prediction: 48 | input_data_paths: 49 | - "gs://[BUCKET ID]/[MODEL NAME]/inputs/*" 50 | input_format: "JSON" 51 | output_format: "JSON" 52 | 53 | model: 54 | # Name must start with a letter and only contain letters, numbers, and 55 | # underscores. 56 | name: [MODEL NAME] 57 | path: "model.tf_model" 58 | target: "tip" 59 | metrics: 60 | - "accuracy" 61 | 62 | model_params: 63 | input_args: 64 | first_layer_size: 65 | type: "int" 66 | help: "Size of the NN first layer." 67 | default: 50 68 | num_layers: 69 | type: "int" 70 | help: "Number of layers in the NN." 71 | default: 5 72 | max_steps: 73 | default: 1000 74 | hyperparam_config: "hptuning_config.yaml" 75 | explain_output: 76 | explain_type: "sampledShapleyAttribution" 77 | explain_param: 78 | name: "numPaths" 79 | value: 40 80 | -------------------------------------------------------------------------------- /examples/taxi/tf/demo.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2020 Google Inc. All Rights Reserved. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Demo for TF ML Pipeline Generator.""" 16 | import json 17 | import os 18 | 19 | from ml_pipeline_gen.models import TFModel 20 | from model.taxi_preprocess import load_data 21 | 22 | 23 | def _upload_data_to_gcs(model): 24 | load_data(model.data["train"], model.data["evaluation"]) 25 | 26 | 27 | # TODO(humichael): See if there"s a way to support csv batch predicts. 28 | def _upload_input_data_to_gcs(model, data): 29 | input_path = "./tf_input_data.json" 30 | with open(input_path, "w+") as f: 31 | for features in data: 32 | f.write(json.dumps(features) + "\n") 33 | model.upload_pred_input_data(input_path) 34 | os.remove(input_path) 35 | 36 | 37 | def main(): 38 | explanations = True 39 | config = "config.yaml" 40 | pred_input = [{ 41 | "trip_miles": 1.0, 42 | "trip_seconds": -0.56447923, 43 | "fare": -0.5502175, 44 | "trip_start_month": -1.00234, 45 | "trip_start_hour": -0.60791147, 46 | "trip_start_day": 0.38163432, 47 | "pickup_community_area": 0.5846407, 48 | "dropoff_community_area": 0.6274534, 49 | "pickup_census_tract": 1.4543412, 50 | "dropoff_census_tract": -0.09238409, 51 | "pickup_latitude": 41.881, 52 | "pickup_longitude": -87.633, 53 | "dropoff_latitude": 41.885, 54 | "dropoff_longitude": -87.62100000000001, 55 | "payment_type": 1, 56 | "company": 3 57 | }] 58 | model = TFModel(config) 59 | model.generate_files() 60 | _upload_data_to_gcs(model) 61 | 62 | job_id = model.train(tune=True) 63 | version = model.deploy(job_id=job_id, explanations=explanations) 64 | if explanations: 65 | explanations = model.online_explanations(pred_input, 66 | version=version) 67 | print("Online Explanations") 68 | print("Explanations: {}".format(explanations)) 69 | preds = model.online_predict(pred_input, version=version) 70 | 71 | print("Online Predictions") 72 | print("Features: {}".format(pred_input)) 73 | print("Predictions: {}".format(preds)) 74 | 75 | if not explanations: 76 | _upload_input_data_to_gcs(model, pred_input) 77 | model.batch_predict(version=version) 78 | print("Batch predictions written to", 79 | model.get_pred_output_path()) 80 | 81 | 82 | if __name__ == "__main__": 83 | main() 84 | -------------------------------------------------------------------------------- /examples/taxi/tf/hptuning_config.yaml: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2020 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | trainingInput: 16 | hyperparameters: 17 | hyperparameterMetricTag: accuracy 18 | goal: MAXIMIZE 19 | maxTrials: 4 20 | maxParallelTrials: 2 21 | enableTrialEarlyStopping: True 22 | params: 23 | - parameterName: first_layer_size 24 | type: INTEGER 25 | minValue: 50 26 | maxValue: 500 27 | scaleType: UNIT_LINEAR_SCALE 28 | - parameterName: num_layers 29 | type: INTEGER 30 | minValue: 1 31 | maxValue: 15 32 | scaleType: UNIT_LINEAR_SCALE 33 | -------------------------------------------------------------------------------- /examples/taxi/tf/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ml-pipeline-generator-python/b0508160c1752a149bdc5c862560387fd9c65d77/examples/taxi/tf/model/__init__.py -------------------------------------------------------------------------------- /examples/taxi/tf/model/tf_model.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2020 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Train a simple TF classifier for the taxi dataset.""" 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | 20 | import argparse 21 | 22 | import tensorflow.compat.v1 as tf 23 | 24 | from model.taxi_preprocess import load_data 25 | 26 | 27 | def get_model(inputs, params): 28 | """Trains a classifier on taxi data.""" 29 | dense = tf.keras.layers.Dense 30 | nn = dense(params.first_layer_size, activation="relu", 31 | kernel_initializer="uniform")(inputs) 32 | for i in reversed(range(1, params.num_layers)): 33 | layer_size = int(params.first_layer_size * (i / params.num_layers)) 34 | nn = dense(max(1, layer_size), activation="relu")(nn) 35 | logits = dense(1, activation="sigmoid")(nn) 36 | 37 | return logits 38 | 39 | 40 | # TODO(humichael): create get_prediction and get_evaluation instead. 
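A quick standalone check, not part of the repo, of the layer-size schedule in get_model above: the hidden layers shrink linearly from first_layer_size toward a floor of one unit, so the defaults first_layer_size=50, num_layers=5 produce hidden sizes 50, 40, 30, 20, 10 ahead of the single sigmoid output.

```python
# Reproduces the schedule in get_model: int(first_layer_size * i / num_layers)
# for i = num_layers - 1, ..., 1, clamped to at least one unit.
first_layer_size, num_layers = 50, 5

sizes = [first_layer_size] + [
    max(1, int(first_layer_size * (i / num_layers)))
    for i in reversed(range(1, num_layers))
]
print(sizes)  # [50, 40, 30, 20, 10], then the single sigmoid output unit
```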
41 | def get_loss(): 42 | """The loss function to use.""" 43 | return tf.losses.sigmoid_cross_entropy 44 | 45 | 46 | def main(): 47 | """Trains a model locally to test get_model() and get_loss().""" 48 | train_x, train_y, _, _ = load_data() 49 | input_layer = tf.keras.layers.Input(shape=(train_x.shape[1],)) 50 | params = argparse.Namespace(first_layer_size=50, num_layers=5) 51 | predictions = get_model(input_layer, params) 52 | model = tf.keras.models.Model(inputs=input_layer, outputs=predictions) 53 | model.compile(optimizer="adam", loss=get_loss(), 54 | metrics=["accuracy"]) 55 | model.fit(train_x, train_y, epochs=1) 56 | 57 | 58 | if __name__ == "__main__": 59 | main() 60 | -------------------------------------------------------------------------------- /examples/taxi/xgb/config.yaml.example: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Config file for ML Pipeline Generator. 16 | 17 | project_id: [PROJECT ID] 18 | bucket_id: [BUCKET ID] 19 | region: "us-central1" 20 | scale_tier: "STANDARD_1" 21 | runtime_version: "1.15" 22 | python_version: "3.7" 23 | package_name: "ml_pipeline_gen" 24 | machine_type_pred: "mls1-c4-m2" 25 | 26 | data: 27 | schema: 28 | - "trip_miles" 29 | - "trip_seconds" 30 | - "fare" 31 | - "trip_start_month" 32 | - "trip_start_hour" 33 | - "trip_start_day" 34 | - "pickup_community_area" 35 | - "dropoff_community_area" 36 | - "pickup_census_tract" 37 | - "dropoff_census_tract" 38 | - "pickup_latitude" 39 | - "pickup_longitude" 40 | - "dropoff_latitude" 41 | - "dropoff_longitude" 42 | - "payment_type" 43 | - "company" 44 | - "tip" 45 | train: "gs://[BUCKET ID]/[MODEL NAME]/data/taxi_trips_train.csv" 46 | evaluation: "gs://[BUCKET ID]/[MODEL NAME]/data/taxi_trips_eval.csv" 47 | prediction: 48 | input_data_paths: 49 | - "gs://[BUCKET ID]/[MODEL NAME]/inputs/*" 50 | input_format: "JSON" 51 | output_format: "JSON" 52 | 53 | model: 54 | # Name must start with a letter and only contain letters, numbers, and 55 | # underscores. 56 | name: [MODEL NAME] 57 | path: "model.xgb_model" 58 | target: "tip" 59 | 60 | model_params: 61 | input_args: 62 | n_estimators: 63 | type: "int" 64 | help: "Number of boosted trees to fit." 65 | default: 10 66 | hyperparam_config: "hptuning_config.yaml" 67 | -------------------------------------------------------------------------------- /examples/taxi/xgb/demo.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2020 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Demo for XGBoost ML Pipeline Generator.""" 16 | from ml_pipeline_gen.models import XGBoostModel 17 | from model.taxi_preprocess import load_data 18 | 19 | 20 | def _upload_data_to_gcs(model): 21 | load_data(model.data["train"], model.data["evaluation"]) 22 | 23 | 24 | def main(): 25 | config = "config.yaml" 26 | pred_input = [[ 27 | 1.0, -0.56447923, -0.5502175, -1.00234, -0.60791147, 28 | 0.38163432, 0.5846407, 0.6274534, 1.4543412, -0.09238409, 29 | 41.881, -87.633, 41.885, -87.62100000000001, 1, 3 30 | ]] 31 | 32 | model = XGBoostModel(config) 33 | model.generate_files() 34 | _upload_data_to_gcs(model) 35 | 36 | job_id = model.train(tune=True) 37 | version = model.deploy(job_id=job_id) 38 | preds = model.online_predict(pred_input, version=version) 39 | 40 | print("Features: {}".format(pred_input)) 41 | print("Predictions: {}".format(preds)) 42 | 43 | if __name__ == "__main__": 44 | main() 45 | -------------------------------------------------------------------------------- /examples/taxi/xgb/hptuning_config.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | trainingInput: 15 | hyperparameters: 16 | goal: MAXIMIZE 17 | maxTrials: 4 18 | maxParallelTrials: 2 19 | hyperparameterMetricTag: roc_auc 20 | enableTrialEarlyStopping: TRUE 21 | params: 22 | - parameterName: max_depth 23 | type: INTEGER 24 | minValue: 3 25 | maxValue: 8 26 | scaleType: UNIT_LINEAR_SCALE 27 | - parameterName: n_estimators 28 | type: INTEGER 29 | minValue: 1 30 | maxValue: 20 31 | scaleType: UNIT_LINEAR_SCALE 32 | - parameterName: booster 33 | type: CATEGORICAL 34 | categoricalValues: [ 35 | "gbtree", 36 | "gblinear", 37 | "dart" 38 | ] 39 | -------------------------------------------------------------------------------- /examples/taxi/xgb/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ml-pipeline-generator-python/b0508160c1752a149bdc5c862560387fd9c65d77/examples/taxi/xgb/model/__init__.py -------------------------------------------------------------------------------- /examples/taxi/xgb/model/xgb_model.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2019 Google Inc. All Rights Reserved. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # python3 14 | # Copyright 2020 Google Inc. All Rights Reserved. 15 | # 16 | # Licensed under the Apache License, Version 2.0 (the "License"); 17 | # you may not use this file except in compliance with the License. 18 | # You may obtain a copy of the License at 19 | # 20 | # http://www.apache.org/licenses/LICENSE-2.0 21 | # 22 | # Unless required by applicable law or agreed to in writing, software 23 | # distributed under the License is distributed on an "AS IS" BASIS, 24 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 25 | # See the License for the specific language governing permissions and 26 | # limitations under the License. 27 | """Train a simple XGBoost classifier.""" 28 | 29 | import argparse 30 | import numpy as np 31 | 32 | from sklearn import metrics 33 | from xgboost import XGBClassifier 34 | 35 | from model.taxi_preprocess import load_data 36 | 37 | TARGET_COLUMN = "TARGET" 38 | 39 | 40 | def get_model(args): 41 | """Returns an XGBoost model.""" 42 | params = { 43 | "n_estimators": args.n_estimators, 44 | "max_depth": args.max_depth, 45 | "booster": args.booster, 46 | "min_child_weight": args.min_child_weight, 47 | "learning_rate": args.learning_rate, 48 | "gamma": args.gamma, 49 | "subsample": args.subsample, 50 | "colsample_bytree": args.colsample_bytree, 51 | "reg_alpha": args.reg_alpha, 52 | "num_class": args.num_classes 53 | } 54 | xgb_model = XGBClassifier(**params) 55 | return xgb_model 56 | 57 | 58 | def main(): 59 | """Trains a model locally to test get_model().""" 60 | train_x, train_y, eval_x, eval_y = load_data() 61 | train_y, eval_y = [np.ravel(x) for x in [train_y, eval_y]] 62 | params = argparse.Namespace( 63 | n_estimators=2, 64 | max_depth=3, 65 | booster="gbtree", 66 | min_child_weight=1, 67 | learning_rate=0.3, 68 | gamma=0, 69 | subsample=1, 70 | colsample_bytree=1, 71 | reg_alpha=0, 72 | num_classes=1) 73 | model = get_model(params) 74 | model.fit(train_x, train_y) 75 | y_pred = model.predict(eval_x) 76 | score = metrics.roc_auc_score(eval_y, y_pred, average="macro") 77 | print("ROC: {}".format(score)) 78 | 79 | 80 | if __name__ == "__main__": 81 | main() 82 | -------------------------------------------------------------------------------- /examples/tf/config.yaml.example: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
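The ROC AUC printed by main() above is the same quantity that hyperparameterMetricTag: roc_auc names in the taxi XGBoost tuning config. One common way for a trainer to surface such a metric to the tuning service is the cloudml-hypertune helper, sketched below; this is an assumption about the wiring, and the generated trainer may report it differently.

```python
# Sketch, not the repo's generated code: report the evaluation metric so the
# hyperparameter tuning service can compare trials by its metric tag.
import hypertune

hpt = hypertune.HyperTune()
hpt.report_hyperparameter_tuning_metric(
    hyperparameter_metric_tag="roc_auc",  # must match hptuning_config.yaml
    metric_value=0.87,  # e.g. metrics.roc_auc_score(eval_y, y_pred)
    global_step=1)
```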
14 | # 15 | # Config file for ML Pipeline Generator. 16 | 17 | project_id: [PROJECT ID] 18 | bucket_id: [BUCKET ID] 19 | region: "us-central1" 20 | scale_tier: "STANDARD_1" 21 | runtime_version: "1.15" 22 | python_version: "3.7" 23 | package_name: "ml_pipeline_gen" 24 | machine_type_pred: "n1-standard-4" 25 | 26 | data: 27 | schema: 28 | - "age" 29 | - "workclass" 30 | - "education_num" 31 | - "marital_status" 32 | - "occupation" 33 | - "relationship" 34 | - "race" 35 | - "capital_gain" 36 | - "capital_loss" 37 | - "hours_per_week" 38 | - "native_country" 39 | - "income_bracket" 40 | train: "gs://[BUCKET ID]/[MODEL NAME]/data/adult.data.csv" 41 | evaluation: "gs://[BUCKET ID]/[MODEL NAME]/data/adult.test.csv" 42 | prediction: 43 | input_data_paths: 44 | - "gs://[BUCKET ID]/[MODEL NAME]/inputs/*" 45 | input_format: "JSON" 46 | output_format: "JSON" 47 | 48 | model: 49 | # Name must start with a letter and only contain letters, numbers, and 50 | # underscores. 51 | name: [MODEL NAME] 52 | path: "model.tf_model" 53 | target: "income_bracket" 54 | metrics: 55 | - "accuracy" 56 | 57 | model_params: 58 | input_args: 59 | first_layer_size: 60 | type: "int" 61 | help: "Size of the NN first layer." 62 | default: 50 63 | num_layers: 64 | type: "int" 65 | help: "Number of layers in the NN." 66 | default: 5 67 | max_steps: 68 | default: 1000 69 | # Relative path. 70 | hyperparam_config: "hptuning_config.yaml" 71 | explain_output: 72 | explain_type: "sampledShapleyAttribution" 73 | explain_param: 74 | name: "numPaths" 75 | value: 40 76 | -------------------------------------------------------------------------------- /examples/tf/demo.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2020 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Demo for TF ML Pipeline Generator.""" 16 | import json 17 | import os 18 | 19 | from ml_pipeline_gen.models import TFModel 20 | from model.census_preprocess import load_data 21 | 22 | 23 | def _upload_data_to_gcs(model): 24 | """Calls the preprocessing fn which uploads train/eval data to GCS.""" 25 | load_data(model.data["train"], model.data["evaluation"]) 26 | 27 | 28 | # TODO(humichael): See if there's a way to support csv batch predicts. 
29 | def _upload_input_data_to_gcs(model, data): 30 | input_path = "tf_input_data.json" 31 | with open(input_path, "w+") as f: 32 | for features in data: 33 | f.write(json.dumps(features) + "\n") 34 | model.upload_pred_input_data(input_path) 35 | os.remove(input_path) 36 | 37 | 38 | def main(): 39 | explanations = True 40 | config = "config.yaml" 41 | pred_input = [{ 42 | "age": 0.02599666, 43 | "workclass": 6, 44 | "education_num": 1.1365801, 45 | "marital_status": 4, 46 | "occupation": 0, 47 | "relationship": 1, 48 | "race": 4, 49 | "capital_gain": 0.14693314, 50 | "capital_loss": -0.21713187, 51 | "hours_per_week": -0.034039237, 52 | "native_country": 38, 53 | "income_bracket": 0, 54 | }] 55 | model = TFModel(config) 56 | model.generate_files() 57 | _upload_data_to_gcs(model) 58 | 59 | job_id = model.train(tune=True) 60 | version = model.deploy(job_id=job_id, explanations=explanations) 61 | if explanations: 62 | explanations = model.online_explanations(pred_input, 63 | version=version) 64 | print("Online Explanations") 65 | print("Explanations: {}".format(explanations)) 66 | preds = model.online_predict(pred_input, version=version) 67 | 68 | print("Online Predictions") 69 | print("Features: {}".format(pred_input)) 70 | print("Predictions: {}".format(preds)) 71 | 72 | if not explanations: 73 | _upload_input_data_to_gcs(model, pred_input) 74 | model.batch_predict(version=version) 75 | print("Batch predictions written to", 76 | model.get_pred_output_path()) 77 | 78 | 79 | if __name__ == "__main__": 80 | main() 81 | -------------------------------------------------------------------------------- /examples/tf/hptuning_config.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | trainingInput: 15 | hyperparameters: 16 | hyperparameterMetricTag: accuracy 17 | goal: MAXIMIZE 18 | maxTrials: 4 19 | maxParallelTrials: 2 20 | enableTrialEarlyStopping: True 21 | params: 22 | - parameterName: first_layer_size 23 | type: INTEGER 24 | minValue: 50 25 | maxValue: 500 26 | scaleType: UNIT_LINEAR_SCALE 27 | - parameterName: num_layers 28 | type: INTEGER 29 | minValue: 1 30 | maxValue: 15 31 | scaleType: UNIT_LINEAR_SCALE 32 | -------------------------------------------------------------------------------- /examples/tf/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ml-pipeline-generator-python/b0508160c1752a149bdc5c862560387fd9c65d77/examples/tf/model/__init__.py -------------------------------------------------------------------------------- /examples/tf/model/census_preprocess.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2019 Google Inc. All Rights Reserved. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Downloads and preprocesses the census dataset for a simple classifier. 16 | 17 | This example comes from the cloudml-samples keras demo. 18 | github.com/GoogleCloudPlatform/cloudml-samples/blob/master/census/tf-keras 19 | """ 20 | from __future__ import absolute_import 21 | from __future__ import division 22 | from __future__ import print_function 23 | 24 | import os 25 | from six.moves import urllib 26 | import tempfile 27 | 28 | import numpy as np 29 | import pandas as pd 30 | import tensorflow.compat.v1 as tf 31 | 32 | 33 | DATA_DIR = os.path.join(tempfile.gettempdir(), "census_data") 34 | DATA_URL = ("https://storage.googleapis.com/cloud-samples-data/ai-platform" 35 | + "/census/data/") 36 | TRAINING_FILE = "adult.data.csv" 37 | EVAL_FILE = "adult.test.csv" 38 | TRAINING_URL = os.path.join(DATA_URL, TRAINING_FILE) 39 | EVAL_URL = os.path.join(DATA_URL, EVAL_FILE) 40 | 41 | _CSV_COLUMNS = [ 42 | "age", "workclass", "fnlwgt", "education", "education_num", 43 | "marital_status", "occupation", "relationship", "race", "gender", 44 | "capital_gain", "capital_loss", "hours_per_week", "native_country", 45 | "income_bracket", 46 | ] 47 | _LABEL_COLUMN = "income_bracket" 48 | UNUSED_COLUMNS = ["fnlwgt", "education", "gender"] 49 | 50 | _CATEGORICAL_TYPES = { 51 | "workclass": pd.api.types.CategoricalDtype(categories=[ 52 | "Federal-gov", "Local-gov", "Never-worked", "Private", "Self-emp-inc", 53 | "Self-emp-not-inc", "State-gov", "Without-pay" 54 | ]), 55 | "marital_status": pd.api.types.CategoricalDtype(categories=[ 56 | "Divorced", "Married-AF-spouse", "Married-civ-spouse", 57 | "Married-spouse-absent", "Never-married", "Separated", "Widowed" 58 | ]), 59 | "occupation": pd.api.types.CategoricalDtype([ 60 | "Adm-clerical", "Armed-Forces", "Craft-repair", "Exec-managerial", 61 | "Farming-fishing", "Handlers-cleaners", "Machine-op-inspct", 62 | "Other-service", "Priv-house-serv", "Prof-specialty", "Protective-serv", 63 | "Sales", "Tech-support", "Transport-moving" 64 | ]), 65 | "relationship": pd.api.types.CategoricalDtype(categories=[ 66 | "Husband", "Not-in-family", "Other-relative", "Own-child", "Unmarried", 67 | "Wife" 68 | ]), 69 | "race": pd.api.types.CategoricalDtype(categories=[ 70 | "Amer-Indian-Eskimo", "Asian-Pac-Islander", "Black", "Other", "White" 71 | ]), 72 | "native_country": pd.api.types.CategoricalDtype(categories=[ 73 | "Cambodia", "Canada", "China", "Columbia", "Cuba", "Dominican-Republic", 74 | "Ecuador", "El-Salvador", "England", "France", "Germany", "Greece", 75 | "Guatemala", "Haiti", "Holand-Netherlands", "Honduras", "Hong", 76 | "Hungary", "India", "Iran", "Ireland", "Italy", "Jamaica", "Japan", 77 | "Laos", "Mexico", "Nicaragua", "Outlying-US(Guam-USVI-etc)", "Peru", 78 | "Philippines", "Poland", "Portugal", "Puerto-Rico", "Scotland", "South", 79 | "Taiwan", "Thailand", "Trinadad&Tobago", "United-States", "Vietnam", 80 | "Yugoslavia" 81 | ]), 82 | "income_bracket": 
pd.api.types.CategoricalDtype(categories=[ 83 | "<=50K", ">50K" 84 | ]) 85 | } 86 | 87 | 88 | def _download_and_clean_file(filename, url): 89 | """Downloads data from url, and makes changes to match the CSV format. 90 | 91 | The CSVs may use spaces after the comma delimiters (non-standard) or include 92 | rows which do not represent well-formed examples. This function strips out 93 | some of these problems. 94 | 95 | Args: 96 | filename: filename to save url to 97 | url: URL of resource to download 98 | """ 99 | temp_file, _ = urllib.request.urlretrieve(url) 100 | with tf.io.gfile.GFile(temp_file, "r") as temp_file_object: 101 | with tf.io.gfile.GFile(filename, "w") as file_object: 102 | for line in temp_file_object: 103 | line = line.strip() 104 | line = line.replace(", ", ",") 105 | if not line or "," not in line: 106 | continue 107 | if line[-1] == ".": 108 | line = line[:-1] 109 | line += "\n" 110 | file_object.write(line) 111 | tf.io.gfile.remove(temp_file) 112 | 113 | 114 | def download(data_dir): 115 | """Downloads census data if it is not already present. 116 | 117 | Args: 118 | data_dir: directory where we will access/save the census data 119 | 120 | Returns: 121 | A (training_file_path, eval_file_path) tuple of local file paths. 122 | """ 123 | tf.io.gfile.makedirs(data_dir) 124 | 125 | training_file_path = os.path.join(data_dir, TRAINING_FILE) 126 | if not tf.io.gfile.exists(training_file_path): 127 | _download_and_clean_file(training_file_path, TRAINING_URL) 128 | 129 | eval_file_path = os.path.join(data_dir, EVAL_FILE) 130 | if not tf.io.gfile.exists(eval_file_path): 131 | _download_and_clean_file(eval_file_path, EVAL_URL) 132 | 133 | return training_file_path, eval_file_path 134 | 135 | 136 | def upload(train_df, eval_df, train_path, eval_path): 137 | train_df.to_csv(os.path.join(os.path.dirname(train_path), TRAINING_FILE), 138 | index=False, header=False) 139 | eval_df.to_csv(os.path.join(os.path.dirname(eval_path), EVAL_FILE), 140 | index=False, header=False) 141 | 142 | 143 | def preprocess(dataframe): 144 | """Converts categorical features to numeric. Removes unused columns. 145 | 146 | Args: 147 | dataframe: Pandas dataframe with raw data 148 | 149 | Returns: 150 | Dataframe with preprocessed data 151 | """ 152 | dataframe = dataframe.drop(columns=UNUSED_COLUMNS) 153 | 154 | # Convert integer valued (numeric) columns to floating point 155 | numeric_columns = dataframe.select_dtypes(["int64"]).columns 156 | dataframe[numeric_columns] = dataframe[numeric_columns].astype("float32") 157 | 158 | # Convert categorical columns to numeric 159 | cat_columns = dataframe.select_dtypes(["object"]).columns 160 | dataframe[cat_columns] = dataframe[cat_columns].apply( 161 | lambda x: x.astype(_CATEGORICAL_TYPES[x.name])) 162 | dataframe[cat_columns] = dataframe[cat_columns].apply( 163 | lambda x: x.cat.codes) 164 | return dataframe 165 | 166 | 167 | def standardize(dataframe): 168 | """Scales numerical columns using their means and standard deviation. 169 | 170 | Args: 171 | dataframe: Pandas dataframe 172 | 173 | Returns: 174 | Input dataframe with the numerical columns scaled to z-scores 175 | """ 176 | dtypes = list(zip(dataframe.dtypes.index, map(str, dataframe.dtypes))) 177 | for column, dtype in dtypes: 178 | if dtype == "float32": 179 | dataframe[column] -= dataframe[column].mean() 180 | dataframe[column] /= dataframe[column].std() 181 | return dataframe 182 | 183 | 184 | def load_data(train_path="", eval_path=""): 185 | """Loads data into preprocessed (train_x, train_y, eval_x, eval_y) dataframes. 
186 | 187 | Args: 188 | train_path: Local or GCS path to upload train data to. 189 | eval_path: Local or GCS path to upload eval data to. 190 | 191 | Returns: 192 | A tuple (train_x, train_y, eval_x, eval_y), where train_x and eval_x are 193 | Pandas dataframes with features for training and train_y and eval_y are 194 | numpy arrays with the corresponding labels. 195 | """ 196 | # Download Census dataset: Training and eval csv files. 197 | training_file_path, eval_file_path = download(DATA_DIR) 198 | 199 | train_df = pd.read_csv( 200 | training_file_path, names=_CSV_COLUMNS, na_values="?") 201 | eval_df = pd.read_csv(eval_file_path, names=_CSV_COLUMNS, na_values="?") 202 | 203 | train_df = preprocess(train_df) 204 | eval_df = preprocess(eval_df) 205 | 206 | # Split train and eval data with labels. The pop method copies and removes 207 | # the label column from the dataframe. 208 | train_x, train_y = train_df, train_df.pop(_LABEL_COLUMN) 209 | eval_x, eval_y = eval_df, eval_df.pop(_LABEL_COLUMN) 210 | 211 | # Join train_x and eval_x to normalize on overall means and standard 212 | # deviations. Then separate them again. 213 | all_x = pd.concat([train_x, eval_x], keys=["train", "eval"]) 214 | all_x = standardize(all_x) 215 | train_x, eval_x = all_x.xs("train"), all_x.xs("eval") 216 | 217 | # Rejoin features and labels and upload to GCS. 218 | if train_path and eval_path: 219 | train_df = train_x.copy() 220 | train_df[_LABEL_COLUMN] = train_y 221 | eval_df = eval_x.copy() 222 | eval_df[_LABEL_COLUMN] = eval_y 223 | upload(train_df, eval_df, train_path, eval_path) 224 | 225 | # Reshape label columns for use with tf.data.Dataset 226 | train_y = np.asarray(train_y).astype("float32").reshape((-1, 1)) 227 | eval_y = np.asarray(eval_y).astype("float32").reshape((-1, 1)) 228 | 229 | return train_x, train_y, eval_x, eval_y 230 | 231 | -------------------------------------------------------------------------------- /examples/tf/model/tf_model.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2019 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
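With census_preprocess.py complete, a local smoke test can make the shapes concrete. The sketch below is not part of the repo and assumes network access to the public sample CSVs; called with no arguments, load_data() downloads and preprocesses without uploading anything to GCS.

```python
# Standalone sanity check for the preprocessing module above.
from model.census_preprocess import load_data

train_x, train_y, eval_x, eval_y = load_data()
# 15 CSV columns - 3 unused - 1 label should leave 11 feature columns.
print(train_x.shape, train_y.shape)  # (num_train_rows, 11) (num_train_rows, 1)
```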
15 | """Train a simple TF classifier for census dataset.""" 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | 20 | import argparse 21 | 22 | import tensorflow.compat.v1 as tf 23 | 24 | from model.census_preprocess import load_data 25 | 26 | 27 | def get_model(inputs, params): 28 | """Trains a classifier on iris data.""" 29 | dense = tf.keras.layers.Dense 30 | nn = dense(params.first_layer_size, activation="relu", 31 | kernel_initializer="uniform")(inputs) 32 | for i in reversed(range(1, params.num_layers)): 33 | layer_size = int(params.first_layer_size * (i / params.num_layers)) 34 | nn = dense(max(1, layer_size), activation="relu")(nn) 35 | logits = dense(1, activation="sigmoid")(nn) 36 | 37 | return logits 38 | 39 | 40 | # TODO(humichael): create get_predicition and get_evaluation instead. 41 | def get_loss(): 42 | """The loss function to use.""" 43 | return tf.losses.sigmoid_cross_entropy 44 | 45 | 46 | def main(): 47 | """Trains a model locally to test get_model() and get_loss().""" 48 | train_x, train_y, _, _ = load_data() 49 | input_layer = tf.keras.layers.Input(shape=(train_x.shape[1],)) 50 | params = argparse.Namespace(first_layer_size=50, num_layers=5) 51 | predictions = get_model(input_layer, params) 52 | model = tf.keras.models.Model(inputs=input_layer, outputs=predictions) 53 | model.compile(optimizer="adam", loss=get_loss(), 54 | metrics=["accuracy"]) 55 | model.fit(train_x, train_y, epochs=1) 56 | 57 | 58 | if __name__ == "__main__": 59 | main() 60 | -------------------------------------------------------------------------------- /examples/xgboost/config.yaml.example: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Config file for ML Pipeline Generator. 16 | 17 | project_id: [PROJECT ID] 18 | bucket_id: [BUCKET ID] 19 | region: "us-central1" 20 | scale_tier: "STANDARD_1" 21 | runtime_version: "1.15" 22 | python_version: "3.7" 23 | package_name: "ml_pipeline_gen" 24 | machine_type_pred: "mls1-c4-m2" 25 | 26 | data: 27 | schema: 28 | - "age" 29 | - "workclass" 30 | - "education_num" 31 | - "marital_status" 32 | - "occupation" 33 | - "relationship" 34 | - "race" 35 | - "capital_gain" 36 | - "capital_loss" 37 | - "hours_per_week" 38 | - "native_country" 39 | - "income_bracket" 40 | train: "gs://[BUCKET ID]/[MODEL NAME]/data/adult.data.csv" 41 | evaluation: "gs://[BUCKET ID]/[MODEL NAME]/data/adult.test.csv" 42 | prediction: 43 | input_data_paths: 44 | - "gs://[BUCKET ID]/[MODEL NAME]/inputs/*" 45 | input_format: "JSON" 46 | output_format: "JSON" 47 | 48 | model: 49 | # Name must start with a letter and only contain letters, numbers, and 50 | # underscores. 
51 | name: [MODEL NAME] 52 | path: "model.xgboost_model" 53 | target: "income_bracket" 54 | 55 | model_params: 56 | input_args: 57 | n_estimators: 58 | type: "int" 59 | help: "Number of output categories." 60 | default: 10 61 | # Relative path. 62 | hyperparam_config: "hptuning_config.yaml" 63 | -------------------------------------------------------------------------------- /examples/xgboost/demo.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2020 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Demo for XGBoost ML Pipeline Generator.""" 16 | from ml_pipeline_gen.models import XGBoostModel 17 | from model.census_preprocess import load_data 18 | 19 | 20 | def _upload_data_to_gcs(model): 21 | load_data(model.data["train"], model.data["evaluation"]) 22 | 23 | 24 | def main(): 25 | config = "config.yaml" 26 | pred_input = [[ 27 | 7.65000000e+02, 2.81400000e+04, 0.00000000e+00, 1.00000000e+00, 28 | 8.30000000e+01, 3.26000000e+05, 8.30000000e+01, 4.87500000e+00, 29 | 3.60000000e+02, 1.00000000e+00, 3.09730330e+05, 3.25000000e+05, 30 | 1.52696700e+04, 4.67629611e+03, 0.00000000e+00, 3.17866362e+05, 31 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 32 | 0.00000000e+00, 0.00000000e+00, 4.87500000e+00, 4.87500000e+00, 33 | 0.00000000e+00, 4.87500000e+00, 0.00000000e+00, 4.87500000e+00, 34 | 0.00000000e+00, 5.95836265e-06, 0.00000000e+00, 0.00000000e+00, 35 | 0.00000000e+00, 2.63157895e-02, 9.99000000e+02, 9.99000000e+02, 36 | 9.99000000e+02, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 37 | 1.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 38 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 39 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 40 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 41 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 42 | 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 43 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 44 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 45 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 46 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 47 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 48 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 49 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 50 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 51 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 52 | 0.00000000e+00, 1.00000000e+00, 1.00000000e+00, 0.00000000e+00, 53 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 54 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 55 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 56 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 
57 |         0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
58 |         0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
59 |         0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
60 |         0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
61 |         0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
62 |         0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
63 |         0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00
64 |     ]]
65 | 
66 |     model = XGBoostModel(config)
67 |     model.generate_files()
68 |     _upload_data_to_gcs(model)
69 | 
70 |     job_id = model.train()
71 |     version = model.deploy(job_id=job_id)
72 |     preds = model.online_predict(pred_input, version=version)
73 | 
74 |     print("Features: {}".format(pred_input))
75 |     print("Predictions: {}".format(preds))
76 | 
77 | 
78 | if __name__ == "__main__":
79 |     main()
80 | 
--------------------------------------------------------------------------------
/examples/xgboost/hptuning_config.yaml:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | trainingInput:
15 |   hyperparameters:
16 |     goal: MAXIMIZE
17 |     maxTrials: 4
18 |     maxParallelTrials: 2
19 |     hyperparameterMetricTag: roc_auc
20 |     enableTrialEarlyStopping: TRUE
21 |     params:
22 |       - parameterName: max_depth
23 |         type: INTEGER
24 |         minValue: 3
25 |         maxValue: 8
26 |         scaleType: UNIT_LINEAR_SCALE
27 |       - parameterName: n_estimators
28 |         type: INTEGER
29 |         minValue: 1
30 |         maxValue: 20
31 |         scaleType: UNIT_LINEAR_SCALE
32 |       - parameterName: booster
33 |         type: CATEGORICAL
34 |         categoricalValues: [
35 |           "gbtree",
36 |           "gblinear",
37 |           "dart"
38 |         ]
39 | 
--------------------------------------------------------------------------------
/examples/xgboost/model/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ml-pipeline-generator-python/b0508160c1752a149bdc5c862560387fd9c65d77/examples/xgboost/model/__init__.py
--------------------------------------------------------------------------------
/examples/xgboost/model/xgboost_model.py:
--------------------------------------------------------------------------------
1 | # python3
2 | # Copyright 2020 Google Inc. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Train a simple XGBoost classifier."""
16 | 
17 | import argparse
18 | import numpy as np
19 | from xgboost import XGBClassifier
20 | 
21 | from model.census_preprocess import load_data
22 | 
23 | TARGET_COLUMN = 'TARGET'
24 | 
25 | 
26 | def get_model(args):
27 |     """Returns an XGBoost model."""
28 |     params = {
29 |         'n_estimators': args.n_estimators,
30 |         'max_depth': args.max_depth,
31 |         'booster': args.booster,
32 |         'min_child_weight': args.min_child_weight,
33 |         'learning_rate': args.learning_rate,
34 |         'gamma': args.gamma,
35 |         'subsample': args.subsample,
36 |         'colsample_bytree': args.colsample_bytree,
37 |         'reg_alpha': args.reg_alpha,
38 |         'num_class': args.num_classes
39 |     }
40 |     xgb_model = XGBClassifier(**params)
41 |     return xgb_model
42 | 
43 | 
44 | def main():
45 |     """Trains a model locally to test get_model()."""
46 |     train_x, train_y, eval_x, eval_y = load_data()
47 |     train_y, eval_y = [np.ravel(x) for x in [train_y, eval_y]]
48 |     # Illustrative (assumed) defaults for every argument get_model() reads.
49 |     params = argparse.Namespace(
50 |         n_estimators=10, max_depth=6, booster='gbtree', min_child_weight=1,
51 |         learning_rate=0.3, gamma=0, subsample=1, colsample_bytree=1,
52 |         reg_alpha=0, num_classes=2)
53 |     model = get_model(params)
54 |     model.fit(train_x, train_y)
55 |     score = model.score(eval_x, eval_y)
56 |     print(score)
57 | 
58 | 
59 | if __name__ == '__main__':
60 |     main()
61 | 
--------------------------------------------------------------------------------
/ml_pipeline_gen/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = "Michael Hu, Stefan Hosein"
2 | __license__ = "Apache 2.0"
3 | __copyright__ = """
4 | Copyright 2020 Google Inc. All Rights Reserved.
5 | 
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 | 
10 |     http://www.apache.org/licenses/LICENSE-2.0
11 | 
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | """
18 | __version__ = "0.0.5"
19 | 
--------------------------------------------------------------------------------
/ml_pipeline_gen/experimental/component_lib.py:
--------------------------------------------------------------------------------
1 | # python3
2 | # Copyright 2020 Google Inc. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Method for generating component files from their respective templates."""
16 | from os import path
17 | import pathlib
18 | 
19 | from ml_pipeline_gen.parsers import parse_yaml
20 | import jinja2 as jinja
21 | 
22 | 
23 | def generate_component(config, name, template_spec='./component_spec.yaml'):
24 |     """Generates the component files from the templates."""
25 |     template_spec_path = path.join(path.dirname(__file__), template_spec)
26 |     output_spec = parse_yaml(template_spec_path)
27 |     current_spec = output_spec[name]
28 | 
29 |     loader = jinja.PackageLoader('ml_pipeline_gen', current_spec['template_dir'])
30 |     env = jinja.Environment(loader=loader, trim_blocks=True,
31 |                             lstrip_blocks=True)
32 |     template_file_list = current_spec['files']
33 |     for template in template_file_list:
34 |         template_in = env.get_template(template['input'])
35 |         template_out = template_in.render(config=config)
36 |         output_file = path.join(config.output_package, template['output'])
37 |         pathlib.Path(output_file).parent.mkdir(parents=True, exist_ok=True)
38 |         with open(output_file, 'w') as f:
39 |             f.write(template_out)
40 | 
--------------------------------------------------------------------------------
/ml_pipeline_gen/experimental/component_spec.yaml:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | # Reference for template-generated pipeline steps.
16 | hptune:
17 |   template_dir: "./templates/hptune"
18 |   files:
19 |     - input: 'component.yaml'
20 |       output: 'hptune/component.yaml'
21 |     - input: 'Dockerfile'
22 |       output: 'hptune/Dockerfile'
23 |     - input: 'build.sh'
24 |       output: 'hptune/build.sh'
25 |     - input: 'hptune.sh'
26 |       output: 'hptune/hptune.sh'
27 | 
28 | 
29 | get_tuned_params:
30 |   template_dir: "./templates/get_tuned_params"
31 |   files:
32 |     - input: 'component.yaml'
33 |       output: 'get_tuned_params/component.yaml'
34 |     - input: 'Dockerfile'
35 |       output: 'get_tuned_params/Dockerfile'
36 |     - input: 'build.sh'
37 |       output: 'get_tuned_params/build.sh'
38 |     - input: 'get_tuned_params.py'
39 |       output: 'get_tuned_params/get_tuned_params.py'
40 | 
41 | 
--------------------------------------------------------------------------------
/ml_pipeline_gen/parsers.py:
--------------------------------------------------------------------------------
1 | # python3
2 | # Copyright 2020 Google Inc. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Functions for parsing data sources."""
16 | import types
17 | import yaml
18 | 
19 | 
20 | # TODO(humichael): Replace with gfile to support GCS.
21 | def parse_yaml(path):
22 |     """Parses the given config file."""
23 |     with open(path, "r") as f:
24 |         doc = f.read()
25 |     return yaml.load(doc, Loader=yaml.FullLoader)
26 | 
27 | 
28 | class NestedNamespace(types.SimpleNamespace):
29 |     """Parses a nested dictionary to create a nested namespace object."""
30 | 
31 |     def __init__(self, dictionary, **kwargs):
32 |         super(NestedNamespace, self).__init__(**kwargs)
33 |         for key, value in dictionary.items():
34 |             if isinstance(value, dict):
35 |                 self.__setattr__(key, NestedNamespace(value))
36 |             elif isinstance(value, list):
37 |                 self.__setattr__(key,
38 |                                  [NestedNamespace(i)
39 |                                   if isinstance(i, dict)
40 |                                   else i for i in value])
41 |             else:
42 |                 self.__setattr__(key, value)
43 | 
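44 | 
45 | # Example (hypothetical usage; assumes a config.yaml with the keys shown in
46 | # the example configs above):
47 | #   config = NestedNamespace(parse_yaml("config.yaml"))
48 | #   print(config.model.name, config.data.train)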
--------------------------------------------------------------------------------
/ml_pipeline_gen/static/bin/cleanup.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # Copyright 2020 Google Inc. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | rm trainer/task.py 2> /dev/null
17 | rm trainer/model.py 2> /dev/null
18 | rm trainer/inputs.py 2> /dev/null
19 | rm orchestration/pipeline.py 2> /dev/null
20 | rm *.tar.gz 2> /dev/null
21 | rm -rf dist/ 2> /dev/null
22 | rm -rf *.egg-info/ 2> /dev/null
23 | rm -rf models/ 2> /dev/null
--------------------------------------------------------------------------------
/ml_pipeline_gen/static/bin/run.local_train.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Copyright 2019 Google Inc. All Rights Reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 | # Convenience script for training model locally.
18 | PACKAGE_PATH=trainer
19 | MODULE_NAME=trainer.task
20 | 
21 | gcloud ai-platform local train \
22 |     --package-path "${PACKAGE_PATH}" \
23 |     --module-name "${MODULE_NAME}" \
24 |     --
25 | 
--------------------------------------------------------------------------------
/ml_pipeline_gen/static/orchestration/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ml-pipeline-generator-python/b0508160c1752a149bdc5c862560387fd9c65d77/ml_pipeline_gen/static/orchestration/__init__.py
--------------------------------------------------------------------------------
/ml_pipeline_gen/static/orchestration/components/list_blobs.yaml:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | # Modified version of https://github.com/kubeflow/pipelines/blob/master/components/google-cloud/storage/list/component.yaml
16 | 
17 | name: List blobs
18 | inputs:
19 | - {name: GCS path, type: String, description: 'GCS path for listing. For recursive listing use the "gs://bucket/path/**" syntax.'}
20 | outputs:
21 | - {name: Paths}
22 | implementation:
23 |   container:
24 |     image: google/cloud-sdk
25 |     command:
26 |     - sh
27 |     - -ex
28 |     - -c
29 |     - |
30 |       if [ -n "${GOOGLE_APPLICATION_CREDENTIALS}" ]; then
31 |         gcloud auth activate-service-account --key-file="${GOOGLE_APPLICATION_CREDENTIALS}"
32 |       fi
33 |       mkdir -p "$(dirname "$1")"
34 |       gsutil ls "$0" | tail -n1 > "$1"
35 |     - inputValue: GCS path
36 |     - outputPath: Paths
37 | 
--------------------------------------------------------------------------------
/ml_pipeline_gen/static/trainer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ml-pipeline-generator-python/b0508160c1752a149bdc5c862560387fd9c65d77/ml_pipeline_gen/static/trainer/__init__.py
--------------------------------------------------------------------------------
/ml_pipeline_gen/static/trainer/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Utility functions."""
15 | import os
16 | 
17 | import tensorflow as tf
18 | 
19 | from sklearn.externals import joblib
20 | from google.cloud import storage
21 | 
22 | 
23 | def dump_object(obj, output_path, model_type=""):
24 |     """Pickles the given object and writes it to output_path.
25 | 
26 |     Args:
27 |         obj: object to pickle.
28 |         output_path: a local or GCS path.
29 |         model_type: whether we are saving a TF model or sklearn/xgboost.
30 |     """
31 |     if not tf.io.gfile.exists(output_path):
32 |         tf.io.gfile.makedirs(os.path.dirname(output_path))
33 |     if model_type == "tf":
34 |         tf.saved_model.save(obj, output_path)
35 |     else:
36 |         # joblib pickles binary data, so open the file in binary mode.
37 |         with tf.io.gfile.GFile(output_path, "wb+") as f:
38 |             joblib.dump(obj, f)
39 | 
40 | 
41 | def upload_blob(bucket_name, source_file_name, destination_blob_name):
42 |     """Uploads a file to the bucket."""
43 |     # bucket_name = "your-bucket-name"
44 |     # source_file_name = "local/path/to/file"
45 |     # destination_blob_name = "storage-object-name"
46 | 
47 |     storage_client = storage.Client()
48 |     bucket = storage_client.bucket(bucket_name)
49 |     blob = bucket.blob(destination_blob_name)
50 | 
51 |     blob.upload_from_filename(source_file_name)
52 | 
53 |     print(
54 |         "File {} uploaded to {}.".format(
55 |             source_file_name, destination_blob_name
56 |         )
57 |     )
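58 | 
59 | # Example (hypothetical paths; assumes GCS credentials are configured):
60 | #   dump_object(model, "gs://my-bucket/models/model.joblib")
61 | #   upload_blob("my-bucket", "local/model.joblib", "models/model.joblib")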
--------------------------------------------------------------------------------
/ml_pipeline_gen/templates/experimental/example_pipeline.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "code",
5 |    "execution_count": null,
6 |    "metadata": {},
7 |    "outputs": [],
8 |    "source": [
9 |     "import kfp\n",
10 |     "import kfp.components as comp\n",
11 |     "import kfp.dsl as dsl\n",
12 |     "from kfp.gcp import use_gcp_secret\n",
13 |     "from kfp.components import ComponentStore\n",
14 |     "from os import path\n",
15 |     "import json"
16 |    ]
17 |   },
18 |   {
19 |    "cell_type": "code",
20 |    "execution_count": null,
21 |    "metadata": {},
22 |    "outputs": [],
23 |    "source": [
24 |     "cs = ComponentStore(local_search_paths=['.', '{{config.output_package}}'],\n",
25 |     "                    url_search_prefixes=['{{config.github_component_url}}'])"
26 |    ]
27 |   },
28 |   {
29 |    "cell_type": "code",
30 |    "execution_count": null,
31 |    "metadata": {},
32 |    "outputs": [],
33 |    "source": [
34 |     "pre_process_op = cs.load_component('{{config.preprocess.component}}')\n",
35 |     "hpt_op = cs.load_component('hptune')\n",
36 |     "param_comp = cs.load_component('get_tuned_params')\n",
37 |     "train_op = cs.load_component('{{config.train.component}}')\n",
38 |     "deploy_op = cs.load_component('{{config.deploy.component}}')\n"
39 |    ]
40 |   },
41 |   {
42 |    "cell_type": "code",
43 |    "execution_count": null,
44 |    "metadata": {},
45 |    "outputs": [],
46 |    "source": [
47 |     "@dsl.pipeline(\n",
48 |     "    name='KFP-Pipelines Example',\n",
49 |     "    description='Kubeflow pipeline generated from ai-pipeline asset'\n",
50 |     ")\n",
51 |     "def pipeline_sample(\n",
52 |     "    project_id='{{config.project_id}}',\n",
53 |     "    region = '{{config.region}}',\n",
54 |     "    python_module = '{{config.train.python_module}}',\n",
55 |     "    package_uri = '{{config.train.python_package}}',\n",
56 |     "    dataset_bucket = '{{config.bucket_id}}',\n",
57 |     "    staging_bucket = 'gs://{{config.bucket_id}}',\n",
58 |     "    job_dir_hptune = 'gs://{{config.bucket_id}}/hptune',\n",
59 |     "    job_dir_train = 'gs://{{config.bucket_id}}/train',\n",
60 |     "    runtime_version_train = '{{config.runtime_version}}',\n",
61 |     "    runtime_version_deploy = '{{config.runtime_version}}',\n",
62 |     "    hptune_config='{{config.hptune.config}}',\n",
63 |     "    model_id='{{config.deploy.model_id}}',\n",
64 |     "    version_id='{{config.deploy.version_id}}',\n",
65 |     "    common_args_hpt=json.dumps([\n",
66 |     "    {% for arg in config.hptune.args %}",
67 |     "    {% set name = arg.name %}",
68 |     "    {% set value = arg.default %}",
69 |     "        '--{{name}}', '{{value}}',\n",
70 |     "    {% endfor %}",
71 |     "    ]),\n",
72 |     "    common_args_train=json.dumps([\n",
73 |     "    {% for arg in config.train.args %}",
74 |     "    {% set name = arg.name %}",
75 |     "    {% set value = arg.default %}",
76 |     "        '--{{name}}', '{{value}}',\n",
77 |     "    {% endfor %}",
78 |     "    ]),\n",
79 |     "    replace_existing_version=True\n",
80 |     "):\n",
81 |     "\n",
82 |     "    # Preprocess Task\n",
83 |     "    pre_process_task = pre_process_op(\n",
84 |     "    {% for arg in config.preprocess.component_args %}\n",
85 |     "    {% set name = arg.name %}\n",
86 |     "        {{name}}={{name}},\n",
87 |     "    {% endfor %}\n",
88 |     "    )\n",
89 |     "\n",
90 |     "    # HP tune Task\n",
91 |     "    hpt_task = hpt_op(\n",
92 |     "        region = region,\n",
93 |     "        python_module = python_module,\n",
94 |     "        package_uri = package_uri,\n",
95 |     "        staging_bucket = staging_bucket,\n",
96 |     "        job_dir = job_dir_hptune,\n",
97 |     "        config=hptune_config,\n",
98 |     "        runtime_version = runtime_version_train,\n",
99 |     "        args = common_args_hpt,\n",
100 |     "    )\n",
101 |     "    hpt_task.after(pre_process_task)\n",
102 |     "\n",
103 |     "    # Get the best hyperparameters\n",
104 |     "    param_task = param_comp(\n",
105 |     "        project_id=project_id,\n",
106 |     "        hptune_job_id=hpt_task.outputs['job_id'].to_struct(),\n",
107 |     "        common_args=common_args_train,\n",
108 |     "    )\n",
109 |     "\n",
110 |     "    # Train Task\n",
111 |     "    train_task = train_op(\n",
112 |     "        project_id = project_id,\n",
113 |     "        python_module = python_module,\n",
114 |     "        package_uris = json.dumps([package_uri.to_struct()]),\n",
115 |     "        region = region,\n",
116 |     "        args = str(param_task.outputs['tuned_parameters_out']),\n",
117 |     "        job_dir = job_dir_train,\n",
118 |     "        python_version = '',\n",
119 |     "        runtime_version = runtime_version_train,\n",
120 |     "        master_image_uri = '',\n",
121 |     "        worker_image_uri = '',\n",
122 |     "        training_input = '',\n",
123 |     "        job_id_prefix = '',\n",
124 |     "        wait_interval = '30'\n",
125 |     "    )\n",
126 |     "\n",
127 |     "    # model_uri=train_task.outputs['job_dir'],\n",
128 |     "    # model_uri='gs://poc-bucket-0120/train/out/export/exporter',\n",
129 |     "    deploy_model = deploy_op(\n",
130 |     "        model_uri=train_task.outputs['job_dir'].to_struct()+'{{config.train.model_out_prefix}}',\n",
131 |     "        project_id=project_id,\n",
132 |     "        model_id=model_id,\n",
133 |     "        version_id=version_id,\n",
134 |     "        runtime_version=runtime_version_deploy,\n",
135 |     "        replace_existing_version=replace_existing_version\n",
136 |     "    )\n",
137 |     "    kfp.dsl.get_pipeline_conf().add_op_transformer(use_gcp_secret('user-gcp-sa'))\n",
138 |     "\n",
139 |     "\n"
140 |    ]
141 |   },
142 |   {
143 |    "cell_type": "code",
144 |    "execution_count": null,
145 |    "metadata": {},
146 |    "outputs": [],
147 |    "source": [
148 |     "client = kfp.Client(host='{{config.kfp_deployment_url}}')\n",
149 |     "\n",
150 |     "client.create_run_from_pipeline_func(pipeline_sample, arguments={})"
151 |    ]
152 |   }
153 |  ],
154 |  "metadata": {
155 |   "kernelspec": {
156 |    "display_name": "Python 3",
157 |    "language": "python",
158 |    "name": "python3"
159 |   },
160 |   "language_info": {
161 |    "codemirror_mode": {
162 |     "name": "ipython",
163 |     "version": 3
164 |    },
165 |    "file_extension": ".py",
166 |    "mimetype": "text/x-python",
167 |    "name": "python",
168 |    "nbconvert_exporter": "python",
| "pygments_lexer": "ipython3", 170 | "version": "3.6.10" 171 | } 172 | }, 173 | "nbformat": 4, 174 | "nbformat_minor": 4 175 | } 176 | -------------------------------------------------------------------------------- /ml_pipeline_gen/templates/experimental/get_tuned_params/Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright 2018 The Kubeflow Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | FROM python:3.6 16 | RUN pip install -U google-api-python-client==1.7.11 17 | RUN pip install -U oauth2client==4.1.3 18 | COPY . / 19 | ENTRYPOINT ["python", "get_tuned_params.py" ] 20 | 21 | -------------------------------------------------------------------------------- /ml_pipeline_gen/templates/experimental/get_tuned_params/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | # Copyright 2018 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | if [ -z "$1" ]; then 18 | PROJECT_ID=$(gcloud config config-helper --format "value(configuration.properties.core.project)") 19 | else 20 | PROJECT_ID=$1 21 | fi 22 | 23 | if [ -z "$2" ]; then 24 | TAG_NAME="latest" 25 | else 26 | TAG_NAME="$2" 27 | fi 28 | 29 | CONTAINER_NAME=ml-pipeline-get-tuned-params 30 | 31 | docker build -t ${CONTAINER_NAME} . 32 | docker tag ${CONTAINER_NAME} gcr.io/${PROJECT_ID}/${CONTAINER_NAME}:${TAG_NAME} 33 | docker push gcr.io/${PROJECT_ID}/${CONTAINER_NAME}:${TAG_NAME} 34 | -------------------------------------------------------------------------------- /ml_pipeline_gen/templates/experimental/get_tuned_params/component.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
19 | name: Get Best Hparam
20 | description: |
21 |   A Kubeflow Pipeline component to extract the best hyperparameters from a
22 |   given hyperparameter tuning job ID for a given project.
23 | inputs:
24 |   - name: project_id
25 |     description: 'Required. The ID of the parent project of the job.'
26 |     type: String
27 |   - name: hptune_job_id
28 |     description: 'Hyperparameter tuning job ID.'
29 |     type: String
30 |   - name: common_args
31 |     description: 'Common (non-tunable) args.'
32 |     type: String
33 | outputs:
34 |   - name: tuned_parameters_out
35 |     description: 'Tuned parameters from the given job.'
36 |     type: String
37 | implementation:
38 |   container:
39 |     image: gcr.io/gcp-demo-2-262319/ml-pipeline-get-tuned-params:latest
40 |     args: [
41 |       --project_id, {inputValue: project_id},
42 |       --hptune_job_id, {inputValue: hptune_job_id},
43 |       --common_args, {inputValue: common_args},
44 |       --tuned_parameters_out, {outputPath: tuned_parameters_out}
45 |     ]
46 | 
--------------------------------------------------------------------------------
/ml_pipeline_gen/templates/experimental/get_tuned_params/get_tuned_params.py:
--------------------------------------------------------------------------------
1 | # python3
2 | # Copyright 2020 Google Inc. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Script to extract the best hyperparameters from a given job ID."""
16 | import argparse
17 | import ast
18 | from pathlib import Path
19 | from types import SimpleNamespace
20 | 
21 | from googleapiclient import discovery
22 | from googleapiclient import errors
23 | 
24 | 
25 | # Modified from: https://stackoverflow.com/a/54332748
26 | class NestedNamespace(SimpleNamespace):
27 |     """Parses a nested dictionary to create a nested namespace object."""
28 | 
29 |     def __init__(self, dictionary, **kwargs):
30 |         super(NestedNamespace, self).__init__(**kwargs)
31 |         for key, value in dictionary.items():
32 |             if isinstance(value, dict):
33 |                 self.__setattr__(key, NestedNamespace(value))
34 |             elif isinstance(value, list):
35 |                 self.__setattr__(key,
36 |                                  [NestedNamespace(i)
37 |                                   if isinstance(i, dict)
38 |                                   else i for i in value])
39 |             else:
40 |                 self.__setattr__(key, value)
41 | 
42 | 
43 | def print_best_parameters(project_id,
44 |                           hp_tune_job,
45 |                           filename='tuned_params',
46 |                           common_args='[]'):
47 |     """Writes the best hyperparameter set from the given job to a file."""
48 |     job_id = 'projects/{}/jobs/{}'.format(project_id, hp_tune_job)
49 | 
50 |     # Build a representation of the Cloud ML API.
51 |     ml = discovery.build('ml', 'v1')
52 | 
53 |     # Create a request to call projects.jobs.get.
54 |     request = ml.projects().jobs().get(name=job_id)
55 |     # Make the call.
56 |     try:
57 |         response = request.execute()
58 |     except errors.HttpError as err:
59 |         # Something went wrong, print out some information.
60 |         print('There was an error getting the job info. Check the details:')
61 |         print(err._get_reason())
62 |         raise
63 | 
64 |     job_info = NestedNamespace(response)
65 |     param_list = ast.literal_eval(common_args)
66 |     for key, value in job_info.trainingOutput.trials[0].hyperparameters.__dict__.items():
67 |         param_list.append('--' + key)
68 |         param_list.append(value)
69 |     # Create the output file's parent directory if it does not already exist.
70 |     Path(filename).parent.mkdir(parents=True, exist_ok=True)
71 |     with open(filename, 'w') as f:
72 |         f.write(str(param_list))
73 | 
74 | 
75 | if __name__ == '__main__':
76 |     parser = argparse.ArgumentParser()
77 |     parser.add_argument('--hptune_job_id',
78 |                         type=str,
79 |                         required=True,
80 |                         help='ID of the hyperparameter search job.')
81 |     parser.add_argument('--project_id',
82 |                         type=str,
83 |                         required=True,
84 |                         help='GCP project ID.')
85 |     parser.add_argument('--common_args',
86 |                         type=str,
87 |                         required=True,
88 |                         help='Common (non-tunable) arguments for the training application.')
89 |     parser.add_argument('--tuned_parameters_out',
90 |                         type=str,
91 |                         required=True,
92 |                         help='Path to the output file containing the tuned parameters array.')
93 |     args = parser.parse_args()
94 |     print_best_parameters(args.project_id, args.hptune_job_id,
95 |                           args.tuned_parameters_out, args.common_args)
96 | 
--------------------------------------------------------------------------------
/ml_pipeline_gen/templates/experimental/hptune/Dockerfile:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | FROM google/cloud-sdk:latest
16 | 
17 | COPY . /
18 | 
19 | ENTRYPOINT ["bash", "/hptune.sh"]
20 | 
--------------------------------------------------------------------------------
/ml_pipeline_gen/templates/experimental/hptune/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash -e
2 | # Copyright 2020 Google Inc. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | 
17 | if [ -z "$1" ]; then
18 |     PROJECT_ID=$(gcloud config config-helper --format "value(configuration.properties.core.project)")
19 | else
20 |     PROJECT_ID=$1
21 | fi
22 | 
23 | if [ -z "$2" ]; then
24 |     TAG_NAME="latest"
25 | else
26 |     TAG_NAME="$2"
27 | fi
28 | 
29 | 
30 | CONTAINER_NAME=ml-pipeline-hptune
31 | 
32 | docker build -t ${CONTAINER_NAME} .
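33 | # Tag the image for Container Registry and push it (PROJECT_ID and TAG_NAME
34 | # are resolved above; both may be overridden via the script arguments).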
35 | docker tag ${CONTAINER_NAME} gcr.io/${PROJECT_ID}/${CONTAINER_NAME}:${TAG_NAME}
36 | docker push gcr.io/${PROJECT_ID}/${CONTAINER_NAME}:${TAG_NAME}
37 | 
--------------------------------------------------------------------------------
/ml_pipeline_gen/templates/experimental/hptune/component.yaml:
--------------------------------------------------------------------------------
1 | # Copyright 2018 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | name: Submitting a Cloud ML Hyperparameter Search job as a pipeline step
16 | description: |
17 |   A Kubeflow Pipeline component to submit a Cloud Machine Learning (Cloud ML)
18 |   Engine Hyperparameter search job as a step in a pipeline.
19 | inputs:
20 |   - name: python_module
21 |     description: 'The Python module name to run after installing the packages.'
22 |     default: ''
23 |     type: String
24 |   - name: staging_bucket
25 |     description: 'The GCS bucket for staging.'
26 |     default: ''
27 |     type: GCSPath
28 |   - name: job_dir
29 |     description: 'The GCS directory where the hyperparameter search run files are created.'
30 |     default: ''
31 |     type: GCSPath
32 |   - name: package_uri
33 |     description: 'The Cloud Storage location of the training package.'
34 |     default: ''
35 |     type: GCSPath
36 |   - name: region
37 |     description: 'The Compute Engine region in which the training job is run.'
38 |     default: ''
39 |     type: GCPRegion
40 |   - name: args
41 |     description: 'The command line arguments to pass to the program.'
42 |     default: ''
43 |     type: List
44 |   - name: runtime_version
45 |     description: 'The Cloud ML Engine runtime version to use for training.'
46 |     default: ''
47 |     type: String
48 |   - name: config
49 |     description: 'The hyperparameter tuning config YAML.'
50 |     default: ''
51 |     type: String
52 | 
53 | outputs:
54 |   - name: job_id
55 |     description: 'The ID of the created job.'
56 |     type: String
57 | implementation:
58 |   container:
59 |     image: gcr.io/gcp-demo-2-262319/ml-pipeline-hptune:latest
60 |     args: [
61 |       --python_module, {inputValue: python_module},
62 |       --package_uri, {inputValue: package_uri},
63 |       --region, {inputValue: region},
64 |       --args, {inputValue: args},
65 |       --staging_bucket, {inputValue: staging_bucket},
66 |       --runtime_version, {inputValue: runtime_version},
67 |       --config, {inputValue: config},
68 |       --job_dir, {inputValue: job_dir},
69 |       --job_id, {outputPath: job_id}
70 |     ]
71 | 
--------------------------------------------------------------------------------
/ml_pipeline_gen/templates/experimental/hptune/hptune.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Copyright 2019 Google LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | #     https://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | while [ $# -ne 0 ]; do
18 |     case "$1" in
19 |         -h|--help) echo "Usage: ./hptune.sh \\"
20 |                    echo "--region=