├── .gitignore ├── .travis.yml ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE ├── MANIFEST.in ├── README.md ├── docs ├── CONFIG.md └── HPTUNE_CONFIG.md ├── examples ├── experimental │ └── kfp-2 │ │ ├── config.yaml.example │ │ ├── demo.py │ │ ├── pipeline_from_config_demo.ipynb │ │ └── user-input │ │ └── preprocess │ │ ├── Dockerfile │ │ ├── build.sh │ │ ├── component.yaml │ │ └── split_train_eval.py ├── getting_started_notebook.ipynb ├── kfp │ ├── bin │ │ └── wi_setup.sh │ ├── config.yaml.example │ ├── demo.py │ ├── hptuning_config.yaml │ └── model │ │ ├── __init__.py │ │ ├── census_preprocess.py │ │ └── tf_model.py ├── sklearn │ ├── config.yaml.example │ ├── demo.py │ ├── hptuning_config.yaml │ └── model │ │ ├── __init__.py │ │ ├── census_preprocess.py │ │ └── sklearn_model.py ├── taxi │ ├── sklearn │ │ ├── config.yaml.example │ │ ├── demo.py │ │ ├── hptuning_config.yaml │ │ └── model │ │ │ ├── __init__.py │ │ │ ├── sklearn_model.py │ │ │ └── taxi_preprocess.py │ ├── tf │ │ ├── config.yaml.example │ │ ├── demo.py │ │ ├── hptuning_config.yaml │ │ └── model │ │ │ ├── __init__.py │ │ │ ├── taxi_preprocess.py │ │ │ └── tf_model.py │ └── xgb │ │ ├── config.yaml.example │ │ ├── demo.py │ │ ├── hptuning_config.yaml │ │ └── model │ │ ├── __init__.py │ │ ├── taxi_preprocess.py │ │ └── xgb_model.py ├── tf │ ├── config.yaml.example │ ├── demo.py │ ├── hptuning_config.yaml │ └── model │ │ ├── __init__.py │ │ ├── census_preprocess.py │ │ └── tf_model.py └── xgboost │ ├── config.yaml.example │ ├── demo.py │ ├── hptuning_config.yaml │ └── model │ ├── __init__.py │ ├── census_preprocess.py │ └── xgboost_model.py ├── ml_pipeline_gen ├── __init__.py ├── experimental │ ├── component_lib.py │ └── component_spec.yaml ├── models.py ├── parsers.py ├── pipelines.py ├── static │ ├── bin │ │ ├── cleanup.sh │ │ └── run.local_train.sh │ ├── orchestration │ │ ├── __init__.py │ │ └── components │ │ │ └── list_blobs.yaml │ └── trainer │ │ ├── __init__.py │ │ └── utils.py └── templates │ ├── experimental │ ├── example_pipeline.ipynb │ ├── get_tuned_params │ │ ├── Dockerfile │ │ ├── build.sh │ │ ├── component.yaml │ │ └── get_tuned_params.py │ ├── hptune │ │ ├── Dockerfile │ │ ├── build.sh │ │ ├── component.yaml │ │ └── hptune.sh │ ├── hptuning_config.yaml │ └── kfp_pipeline_from_config.py │ ├── kfp_pipeline.py │ ├── setup.py │ ├── sklearn_inputs.py │ ├── sklearn_model.py │ ├── sklearn_task.py │ ├── tf_inputs.py │ ├── tf_model.py │ ├── tf_task.py │ ├── xgboost_inputs.py │ ├── xgboost_model.py │ └── xgboost_task.py ├── setup.py └── tests ├── __init__.py ├── integration ├── fixtures │ └── test_config.yaml └── src │ ├── __init__.py │ └── test_models.py ├── test_utils.py └── unit ├── __init__.py ├── examples ├── __init__.py ├── sklearn │ ├── __init__.py │ └── test_sklearn_model.py └── tensorflow │ ├── __init__.py │ └── test_tf_model.py └── src ├── __init__.py └── test_models.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Virtual envs 2 | venv/* 3 | testenv/ 4 | 5 | # Generated files 6 | *.pyc 7 | models/* 8 | *.egg-info/ 9 | dist/* 10 | build/* 11 | *.tar.gz 12 | config.yaml 13 | trainer/model.py 14 | trainer/task.py 15 | trainer/inputs.py 16 | orchestration/pipeline.py 17 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | branches: 2 | only: 3 | - master 4 | language: python 5 | python: 6 | - "3.6" 7 | - "3.7" 8 | # Tensorflow 1.x does not 
support python 3.8+ 9 | install: 10 | - pip install -e . 11 | script: 12 | - python -m unittest discover -s tests/unit 13 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | We'd love to accept your patches and contributions to this project. There are 4 | just a few small guidelines you need to follow. 5 | 6 | ## Contributor License Agreement 7 | 8 | Contributions to this project must be accompanied by a Contributor License 9 | Agreement. You (or your employer) retain the copyright to your contribution; 10 | this simply gives us permission to use and redistribute your contributions as 11 | part of the project. Head over to <https://cla.developers.google.com/> to see 12 | your current agreements on file or to sign a new one. 13 | 14 | You generally only need to submit a CLA once, so if you've already submitted one 15 | (even if it was for a different project), you probably don't need to do it 16 | again. 17 | 18 | ## Code reviews 19 | 20 | All submissions, including submissions by project members, require review. We 21 | use GitHub pull requests for this purpose. Consult 22 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 23 | information on using pull requests. 24 | 25 | ## Community Guidelines 26 | 27 | This project follows [Google's Open Source Community 28 | Guidelines](https://opensource.google/conduct/). 29 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | FROM python:3.6 16 | COPY . 
/app 17 | WORKDIR /app 18 | RUN pip install -e ".[dev]" 19 | RUN python -m unittest discover -s tests/unit 20 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | graft ml_pipeline_gen/templates 2 | graft ml_pipeline_gen/static 3 | 4 | prune **/experimental 5 | global-exclude *.py[cod] 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ML Pipeline Generator 2 | ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/ml-pipeline-gen) 3 | [![PyPI version](https://badge.fury.io/py/ml-pipeline-gen.svg)](https://badge.fury.io/py/ml-pipeline-gen) 4 | [![Build 5 | Status](https://travis-ci.com/GoogleCloudPlatform/ml-pipeline-generator-python.svg?branch=master)](https://travis-ci.com/GoogleCloudPlatform/ml-pipeline-generator-python) 6 | 7 | ML Pipeline Generator is a tool for generating end-to-end pipelines composed of GCP components, so that users can easily migrate their local ML models onto GCP and quickly start realizing the benefits of the Cloud. 8 | 9 | The following ML frameworks will be supported: 10 | 1. TensorFlow (TF) 11 | 1. Scikit-learn (SKL) 12 | 1. XGBoost (XGB) 13 | 14 | The following backends are currently supported for model training: 15 | 1. [Google Cloud AI Platform](https://cloud.google.com/ai-platform) 16 | 1. [AI Platform Pipelines](https://cloud.google.com/ai-platform/pipelines/docs) (managed Kubeflow Pipelines) 17 | 18 | ## Setup 19 | ### GCP credentials 20 | ```bash 21 | gcloud auth login 22 | gcloud auth application-default login 23 | gcloud config set project [PROJECT_ID] 24 | ``` 25 | 26 | ### Enabling required APIs 27 | 28 | The tool requires the following Google Cloud APIs to be enabled: 29 | 1. [Compute Engine](https://console.cloud.google.com/apis/api/compute.googleapis.com) 30 | 1. [AI Platform Training and Prediction](https://console.cloud.google.com/apis/api/ml.googleapis.com) 31 | 1. [Cloud Storage](https://console.cloud.google.com/apis/api/storage-component.googleapis.com) 32 | 33 | Enable the above APIs by following the links, or run the command below to 34 | enable the APIs for your project. 35 | 36 | ```bash 37 | gcloud services enable ml.googleapis.com \ 38 | compute.googleapis.com \ 39 | storage-component.googleapis.com 40 | ``` 41 | 42 | ### Python environment 43 | ```bash 44 | python3 -m venv venv 45 | source ./venv/bin/activate 46 | pip install ml-pipeline-gen 47 | ``` 48 | 49 | ### Kubeflow Pipelines 50 | Create a Kubeflow Pipelines instance on [AI Platform Pipelines](https://console.cloud.google.com/ai-platform/pipelines). 51 | Once the instance is provisioned, note down the hostname (Dashboard URL). 52 | 53 | ## End-to-end tutorial notebook 54 | The tutorial notebook [here](https://github.com/GoogleCloudPlatform/ml-pipeline-generator-python/blob/master/examples/getting_started_notebook.ipynb) 55 | walks through how a typical user would leverage this solution; it can be run 56 | in a local Jupyter environment, on Cloud AI Platform, or in Colab. 57 | 58 | ## Cloud AI Platform Demo 59 | This demo uses the scikit-learn model in 60 | `examples/sklearn/model/sklearn_model.py` to create a training module to run on 61 | Cloud AI Platform (CAIP). First, make a copy of the `sklearn` example directory. 
62 | 63 | ```bash 64 | cp -r examples/sklearn sklearn-demo 65 | cd sklearn-demo 66 | ``` 67 | 68 | Create a `config.yaml` by using the `config.yaml.example` template. See the 69 | [docs](docs/CONFIG.md) for details on the config parameters. Once the 70 | config file is filled out, run the demo. 71 | 72 | ```bash 73 | python demo.py 74 | ``` 75 | 76 | Running this demo uses the config file to generate a `trainer/` module that is 77 | compatible with CAIP. 78 | 79 | ## Kubeflow Pipelines Demo 80 | This demo orchestrates training and prediction using a TensorFlow model in 81 | `examples/kfp/model/tf_model.py` over Kubeflow Pipelines (hosted on AI Platform 82 | Pipelines). First, make a copy of the `kfp/` example directory. 83 | 84 | ```bash 85 | cp -r examples/kfp kfp-demo 86 | cd kfp-demo 87 | ``` 88 | 89 | Create a `config.yaml` by using the `config.yaml.example` template. See the 90 | [docs](docs/CONFIG.md) for details on the config parameters. Once the 91 | config file is filled out, run the demo. 92 | 93 | ```bash 94 | python demo.py 95 | ``` 96 | 97 | Running this demo uses the config file to generate a `trainer/` module that is 98 | compatible with CAIP. It also generates `orchestration/pipeline.py`, which 99 | compiles a Kubeflow Pipelines pipeline. 100 | 101 | _Note: If you're using a GKE cluster without Workload Identity configured, the 102 | tool provisions Workload Identity for the GKE cluster, which modifies the 103 | dashboard URL. If this occurs, you will need to update your config.yaml with 104 | the new Kubeflow Pipelines URL and rerun the demo._ 105 | 106 | ## Tests 107 | The tests use `unittest`, Python's built-in unit testing framework. By running 108 | `python -m unittest`, the framework performs test discovery to find all tests 109 | within this project. Tests can be run on a more granular level by passing a 110 | directory to test discovery. Read more about `unittest` 111 | [here](https://docs.python.org/3/library/unittest.html). 112 | 113 | Unit tests: 114 | ```bash 115 | python -m unittest discover -s tests/unit 116 | ``` 117 | 118 | Integration tests: 119 | ```bash 120 | python -m unittest discover -s tests/integration 121 | ``` 122 | 123 | ## Input args 124 | The following input args are included by default. Override them by adding them 125 | as inputs in the config file. 126 | 127 | | Arg | Description | 128 | | ------------- | ----- | 129 | | train_path | Dir or bucket containing train data.| 130 | | eval_path | Dir or bucket containing eval data.| 131 | | model_dir | Dir or bucket to save model files. | 132 | | batch_size | Number of rows of data to be fed into the model each iteration. | 133 | | max_steps | The maximum number of iterations to train the model for. | 134 | | learning_rate | Multiplier that controls how much the weights of our network are adjusted with respect to the loss gradient.| 135 | | export_format | File format expected by the exported model at inference time. | 136 | | save_checkpoints_steps | Number of steps to run before saving a model checkpoint. | 137 | | keep_checkpoint_max | Number of model checkpoints to keep. | 138 | | log_step_count_steps | Number of steps to run before logging training performance. | 139 | | eval_steps | Number of steps to use to evaluate the model. | 140 | | early_stopping_steps | Number of steps with no loss decrease before stopping early. 
| 141 | 142 | ## Contribute 143 | To modify the behavior of the library, install `ml-pipeline-gen` in editable mode with dev dependencies: 144 | 145 | ```bash 146 | pip install -e ".[dev]" 147 | ``` 148 | -------------------------------------------------------------------------------- /docs/CONFIG.md: -------------------------------------------------------------------------------- 1 | ### config.yaml schema 2 | 3 | The schema below should be used when preparing a `config.yaml` file for models using the tool. Some parameters are optional and marked as such. 4 | 5 |
 6 | project_id: [project ID]
 7 | bucket_id: [GCS bucket ID]
 8 | region: [GCP region in which to train models on AI Platform]
 9 | cluster_name: [Name of GKE cluster hosting Kubeflow Pipelines]
10 | cluster_zone: [Zone in which GKE cluster is deployed]
11 | scale_tier: [compute specifications for training the model on AI Platform]
12 | runtime_version: [AI Platform Training runtime version]
13 | python_version: [Python version used in the model code for training]
14 | package_name: [name for the source distribution to be uploaded to GCS]
15 | machine_type_pred: [type of virtual machine that AI Platform Prediction uses for the nodes that serve predictions, defaults to mls1-c1-m2]
16 | 
17 | data:
18 | 	schema:
19 | 		- [schema for input & target features in the training data]
20 | 	train: [GCS URL to upload preprocessed training data to]
21 | 	evaluation: [GCS URL to upload preprocessed eval data to]
22 | 	prediction:
23 | 		input_data_paths:
24 | 			- [GCS URLs for prediction input data]
25 | 		input_format: [prediction input format]
26 | 		output_format: [prediction output format]
27 | 
28 | model:
29 | 	name: [unique model name, must start with a letter and only contain letters, numbers, and underscores]
30 | 	path: [local dir path to the model.py file]
31 | 	target: [target feature in training data]
32 | 	metrics: [metrics to evaluate model training on, such as "accuracy"]
33 | 
34 | model_params:
35 | 	input_args: [Any input params to be submitted with the job]
36 | 		arg_name:
37 | 			type: [data type of the arg, such as int]
38 | 			help: [short description of the arg]
39 | 			default: [default value of the arg]
40 | 	hyperparam_config: [optional; local path to the hyperparameter tuning config yaml. See HPTUNE_CONFIG.md for the schema of this config file.]
41 | 	explanation: [optional; explainability features for the training job]
42 | 
43 | orchestration:
44 | 	kubeflow_url: [for KFP backend; URL of preconfigured Kubeflow instance]
45 | 
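For reference, a minimal filled-in config following this schema is sketched below. It is drawn from the `examples/sklearn/config.yaml.example` file in this repo; the project, bucket, and model names are placeholder values to substitute with your own.

```yaml
project_id: my-project            # placeholder
bucket_id: my-bucket              # placeholder
region: "us-central1"
scale_tier: "STANDARD_1"
runtime_version: "1.15"
python_version: "3.7"
package_name: "ml_pipeline_gen"
machine_type_pred: "mls1-c4-m2"

data:
  schema:
    - "age"
    - "income_bracket"
  train: "gs://my-bucket/census_model/data/adult.data.csv"
  evaluation: "gs://my-bucket/census_model/data/adult.test.csv"
  prediction:
    input_data_paths:
      - "gs://my-bucket/census_model/inputs/*"
    input_format: "JSON"
    output_format: "JSON"

model:
  name: census_model               # placeholder
  path: "model.sklearn_model"
  target: "income_bracket"

model_params:
  input_args:
    C:
      type: "float"
      help: "Regularization parameter, must be positive."
      default: 1.0
  hyperparam_config: "hptuning_config.yaml"
```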
 46 | -------------------------------------------------------------------------------- /docs/HPTUNE_CONFIG.md: -------------------------------------------------------------------------------- 1 | ### hptune_config.yaml schema 2 | 3 | The schema below should be used when preparing a `hptune_config.yaml` file for models using the tool. The parameters follow the Cloud AI Platform [HyperparameterSpec](https://cloud.google.com/ai-platform/training/docs/reference/rest/v1/projects.jobs#HyperparameterSpec); some are optional and marked as such. 4 | 5 |
 6 | trainingInput:
 7 | 	hyperparameters:
 8 | 		goal: [the type of goal to use for tuning, MAXIMIZE or MINIMIZE]
 9 | 		params: [the set of parameters to tune]
10 | 			- parameterName: [unique parameter name, e.g. "learning_rate"]
11 | 			  type: [parameter type]
12 | 			  minValue: [min value for the parameter, if DOUBLE or INTEGER type]
13 | 			  maxValue: [max value for the parameter, if DOUBLE or INTEGER type]
14 | 			  scaleType: [optional; how the parameter should be scaled]
15 | 		maxTrials: [optional; how many training trials should be attempted to optimize the specified hyperparameters]
16 | 		maxParallelTrials: [optional; the number of training trials to run concurrently]
17 | 		maxFailedTrials: [optional; the number of failed trials that need to be seen before failing the hyperparameter tuning job]
18 | 		hyperparameterMetricTag: [optional; TensorFlow summary tag name to use for optimizing trials]
19 | 		resumePreviousJobId: [optional; the ID of a previous hyperparameter tuning job to resume]
20 | 		enableTrialEarlyStopping: [optional; whether automatic trial early stopping is enabled for the tuning job]
21 | 		algorithm: [optional; search algorithm to be used]
22 | 
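For reference, the `examples/kfp/hptuning_config.yaml` file in this repo instantiates this schema; a minimal sketch looks like the following.

```yaml
trainingInput:
  hyperparameters:
    hyperparameterMetricTag: accuracy
    goal: MAXIMIZE
    maxTrials: 4
    maxParallelTrials: 2
    enableTrialEarlyStopping: True
    params:
      - parameterName: first_layer_size
        type: INTEGER
        minValue: 50
        maxValue: 500
        scaleType: UNIT_LINEAR_SCALE
```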
23 | -------------------------------------------------------------------------------- /examples/experimental/kfp-2/config.yaml.example: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Config file for AI Pipeline. 16 | 17 | output_package: ./caipa-output 18 | project_id: gcp-demo-2-262319 19 | bucket_id: poc-bucket-0120 20 | region: us-central1 21 | runtime_version: "1.10" 22 | python_version: 3.6 23 | 24 | model: 25 | name: loan_delinq_v1 26 | path: 27 | 28 | # Search Path for pre-built components 29 | github_component_url: https://raw.githubusercontent.com/kubeflow/pipelines/3f4b80127f35e40760eeb1813ce1d3f641502222/components/gcp/ 30 | kfp_deployment_url: https://54f49491f869f31e-dot-us-central2.pipelines.googleusercontent.com 31 | 32 | preprocess: 33 | component: user-input/preprocess 34 | component_args: 35 | - name: project_id 36 | default: '' 37 | - name: dataset_bucket 38 | default: poc-bucket-0120 39 | 40 | hptune: 41 | component: AUTO 42 | config: gs://poc-bucket-0120/hpconfig.yaml 43 | args: 44 | - name: output_dir 45 | default: gs://poc-bucket-0120/hptune 46 | - name: input_bucket 47 | default: gs://poc-bucket-0120 48 | - name: eval_steps 49 | default: 10 50 | - name: train_examples 51 | default: 200 52 | 53 | get_tuned_params: 54 | component: AUTO 55 | 56 | train: 57 | python_module: trainer.task 58 | python_package: gs://poc-bucket-0120/trainer.tar.gz 59 | model_out_prefix: /export/exporter 60 | component: ml_engine/train 61 | args: 62 | - name: output_dir 63 | default: gs://poc-bucket-0120/train 64 | - name: input_bucket 65 | default: gs://poc-bucket-0120 66 | - name: eval_steps 67 | default: 10 68 | - name: train_examples 69 | default: 2000 70 | 71 | deploy: 72 | component: ml_engine/deploy 73 | model_id: Loand_Delinq 74 | version_id: v1.0 75 | 76 | -------------------------------------------------------------------------------- /examples/experimental/kfp-2/demo.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2020 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Demo for KubeFlow Pipelines.""" 16 | from ml_pipeline_gen.pipelines import KfpPipeline 17 | 18 | 19 | def main(): 20 | config = './config.yaml' 21 | pipeline = KfpPipeline(config=config) 22 | # Review the components 23 | pipeline.list_components() 24 | # define pipeline structure 25 | preprocess = pipeline.add_component('preprocess') 26 | hptune = pipeline.add_component('hptune', parent=preprocess) 27 | get_best_params = pipeline.add_component('get_tuned_params', parent=hptune) 28 | train = pipeline.add_component('train', parent=get_best_params) 29 | deploy = pipeline.add_component('deploy', parent=train) 30 | 31 | pipeline.print_structure() 32 | pipeline.generate_pipeline_from_config() 33 | 34 | 35 | if __name__ == '__main__': 36 | main() 37 | -------------------------------------------------------------------------------- /examples/experimental/kfp-2/pipeline_from_config_demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from ml_pipeline_gen.pipelines import KfpPipeline" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 4, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "config = \"./config.yaml\"\n", 19 | "pipeline = KfpPipeline(config=config)\n", 20 | "#pipeline.print_structure()" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 5, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "name": "stdout", 30 | "output_type": "stream", 31 | "text": [ 32 | "['preprocess', 'hptune', 'get_tuned_params', 'train', 'deploy']\n" 33 | ] 34 | } 35 | ], 36 | "source": [ 37 | "# Review the components\n", 38 | "pipeline.list_components()" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 6, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "# define pipeline structure\n", 48 | "preprocess = pipeline.add_component('preprocess')\n", 49 | "hptune = pipeline.add_component('hptune', parent=preprocess)\n", 50 | "get_best_params= pipeline.add_component('get_tuned_params', parent=hptune)\n", 51 | "train = pipeline.add_component('train', parent=get_best_params)\n", 52 | "deploy = pipeline.add_component('deploy', parent=train)" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 7, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "# Generate kubeflow pipeline\n", 62 | "pipeline.generate_pipeline_from_config()" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [] 71 | } 72 | ], 73 | "metadata": { 74 | "kernelspec": { 75 | "display_name": "Python 3", 76 | "language": "python", 77 | "name": "python3" 78 | }, 79 | "language_info": { 80 | "codemirror_mode": { 81 | "name": "ipython", 82 | "version": 3 83 | }, 84 | "file_extension": ".py", 85 | "mimetype": "text/x-python", 86 | "name": "python", 87 | "nbconvert_exporter": "python", 88 | "pygments_lexer": "ipython3", 89 | "version": "3.6.10" 90 | } 91 | }, 92 | "nbformat": 4, 93 | "nbformat_minor": 4 94 | } 95 | -------------------------------------------------------------------------------- /examples/experimental/kfp-2/user-input/preprocess/Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright 2018 The Kubeflow Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this 
file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | FROM gcr.io/ml-pipeline/ml-pipeline-dataflow-tft:latest 16 | 17 | RUN pip install -U scipy 18 | 19 | RUN pip install -U numpy 20 | 21 | RUN pip install -U scikit-learn 22 | 23 | RUN pip install pandas 24 | 25 | RUN pip install --upgrade google-cloud-storage 26 | 27 | COPY . / 28 | -------------------------------------------------------------------------------- /examples/experimental/kfp-2/user-input/preprocess/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | # Copyright 2018 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | if [ -z "$1" ]; then 18 | PROJECT_ID=$(gcloud config config-helper --format "value(configuration.properties.core.project)") 19 | else 20 | PROJECT_ID=$1 21 | fi 22 | 23 | if [ -z "$2" ]; then 24 | TAG_NAME="latest" 25 | else 26 | TAG_NAME="$2" 27 | fi 28 | 29 | CONTAINER_NAME=loan-pipeline-trainevalsplit 30 | 31 | docker build -t ${CONTAINER_NAME} . 32 | docker tag ${CONTAINER_NAME} gcr.io/${PROJECT_ID}/${CONTAINER_NAME}:${TAG_NAME} 33 | docker push gcr.io/${PROJECT_ID}/${CONTAINER_NAME}:${TAG_NAME} 34 | -------------------------------------------------------------------------------- /examples/experimental/kfp-2/user-input/preprocess/component.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # 15 | # Component Descriptor for Split-train-eval 16 | name: Split-train-Eval - Preprocess 17 | description: Splits a given input.csv to train and eval csv files 18 | 19 | inputs: 20 | - {name: project_id, type: String} 21 | - {name: dataset_bucket, type: String} 22 | 23 | #outputs: 24 | #- {name: train, type: XGBoost model, help: Trained XGBoost model} 25 | 26 | implementation: 27 | container: 28 | image: gcr.io/gcp-demo-2-262319/loan-pipeline-trainevalsplit:latest 29 | command: [ 30 | python, /split_train_eval.py, 31 | --project_id, {inputValue: project_id}, 32 | --dataset_bucket, {inputValue: dataset_bucket}, 33 | ] 34 | -------------------------------------------------------------------------------- /examples/experimental/kfp-2/user-input/preprocess/split_train_eval.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2020 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """.""" 16 | import pandas as pd 17 | from sklearn.model_selection import train_test_split 18 | from sklearn.utils import shuffle 19 | from io import BytesIO 20 | from google.cloud import storage 21 | import argparse 22 | 23 | 24 | def obtain_train_eval(project_id, bucket_name): 25 | """.""" 26 | # # All of the data is in a file called Step10_Final_dataset.csv 27 | print('reading the data file from gcs...') 28 | print('Project-ID: %s ' %(project_id)) 29 | print('Bucket-ID: %s ' %(bucket_name)) 30 | 31 | 32 | # The following was derived from the contents of this reply: 33 | # https://stackoverflow.com/a/50201179 34 | storage_client = storage.Client(project=project_id, credentials=None) 35 | bucket = storage_client.get_bucket(bucket_name) 36 | blob = bucket.blob('input/Step10_Final_dataset.csv') 37 | 38 | byte_stream = BytesIO() 39 | blob.download_to_file(byte_stream) 40 | byte_stream.seek(0) 41 | df = pd.read_csv(byte_stream) 42 | 43 | # We need to rearrange the columns below just as they shall be 44 | # expected by the estimator 45 | print('rearranging data...') 46 | key_column = 'LOAN_SEQUENCE_NUMBER' 47 | label_column = 'TARGET' 48 | bool_cols = [] 49 | int_cols = ['credit_score', 50 | 'mortgage_insurance_percentage', 51 | 'Number_of_units', 52 | 'cltv', 53 | 'original_upb', 54 | 'ltv', 55 | 'original_loan_term', 56 | 'number_of_borrowers', 57 | 'min_CURRENT_DEFERRED_UPB'] 58 | str_cols = ['first_time_home_buyer_flag', 59 | 'occupancy_status', 60 | 'channel', 61 | 'property_state', 62 | 'property_type', 63 | 'loan_purpose', 64 | 'seller_name', 65 | 'service_name'] 66 | # str_nuniques = [2, 3, 3, 52, 5, 2, 20, 24] 67 | float_cols = ['metropolitan_division', 68 | 'original_interest_rate', 69 | 'min_CURRENT_ACTUAL_UPB', 70 | 'max_CURRENT_ACTUAL_UPB', 71 | 'Range_CURRENT_ACTUAL_UPB', 72 | 'stdev_CURRENT_ACTUAL_UPB', 73 | 'mode_CURRENT_ACTUAL_UPB', 74 | 'average_CURRENT_ACTUAL_UPB', 75 | 'max_CURRENT_DEFERRED_UPB', 76 | 'Range_CURRENT_DEFERRED_UPB', 77 | 'mode_CURRENT_DEFERRED_UPB', 
78 | 'average_CURRENT_DEFERRED_UPB', 79 | 'stdev_CURRENT_DEFERRED_UPB', 80 | 'min_CURRENT_INTEREST_RATE', 81 | 'max_CURRENT_INTEREST_RATE', 82 | 'Range_CURRENT_INTEREST_RATE', 83 | 'mode_CURRENT_INTEREST_RATE', 84 | 'stdev_CURRENT_INTEREST_RATE', 85 | 'average_CURRENT_INTEREST_RATE', 86 | 'PREFINAL_LOAN_DELINQUENCY_STATUS', 87 | 'frequency_0', 88 | 'frequency_1', 89 | 'frequency_2', 90 | 'frequency_3', 91 | 'Recency_0', 92 | 'Recency_1', 93 | 'Recency_2', 94 | 'Recency_3'] 95 | # DEFAULTS = [[''] for col in bool_cols] + \ 96 | # [[0] for col in int_cols] + \ 97 | # [[0.0] for col in float_cols] + \ 98 | # [[''] for col in str_cols] + [[''], [0]] 99 | csv_columns = bool_cols + int_cols + float_cols + \ 100 | str_cols + [key_column, label_column] 101 | traindata = df[csv_columns] 102 | 103 | # Here, we'll split with a small test size so as to 104 | # allow our model to train on more data 105 | print('splitting...') 106 | x_train, x_test, y_train, y_test = train_test_split( 107 | traindata.drop(label_column, axis=1), traindata[label_column], 108 | stratify=traindata[label_column], shuffle=True, test_size=0.1) 109 | traindf = pd.concat([x_train, y_train], axis=1) 110 | evaldf = pd.concat([x_test, y_test], axis=1) 111 | 112 | alld = pd.concat([traindf, evaldf]) 113 | strcols = [col for col in alld.columns if alld[col].dtype == 'object'] 114 | if key_column in strcols: 115 | strcols.remove(key_column) 116 | alld = pd.get_dummies(alld, columns=strcols) 117 | 118 | divline = traindf.shape[0] 119 | traindf_wdummies = alld.iloc[:divline, :] 120 | # not necessary only cmle but can be used to 121 | # test performance if so desired 122 | evaldf_wdummies = alld.iloc[divline:, :] 123 | del alld 124 | 125 | print('Undersample for XG Boost....') 126 | 127 | traindfu_wdummies = pd.concat([ 128 | traindf_wdummies[traindf_wdummies[label_column] == 0].sample( 129 | frac=0.01), 130 | traindf_wdummies[traindf_wdummies[label_column] == 1].sample( 131 | frac=0.55), 132 | traindf_wdummies[traindf_wdummies[label_column] > 1]]) 133 | traindfu_wdummies = shuffle(traindfu_wdummies) 134 | 135 | # traindfu_wdummies.drop(key_column, axis=1) 136 | # .to_csv('xgb_train.csv', index=False) 137 | # evaldf_wdummies.drop([key_column,label_column], axis=1) 138 | # .to_csv('xgb_eval.csv', index=False) 139 | 140 | # Since the results are small enough to fit in a single 141 | # well-provisioned VM, we'll write the results to csv files locally 142 | # then move them to gcs so we have two copies to work 143 | # with as we please 144 | 145 | print('writing tf model files...') 146 | write_file( 147 | storage_client, 148 | traindf[csv_columns], 149 | bucket_name, 150 | 'train.csv', 151 | header=False) 152 | write_file( 153 | storage_client, 154 | evaldf[csv_columns], 155 | bucket_name, 156 | 'eval.csv', 157 | header=False) 158 | 159 | # traindf[csv_columns].to_csv('train.csv', index=False, header=False) 160 | # evaldf[csv_columns].to_csv('eval.csv', index=False, header=False) 161 | 162 | print('writing XG Boost model files...') 163 | write_file( 164 | storage_client, 165 | traindfu_wdummies.drop(key_column, axis=1), 166 | bucket_name, 167 | 'xgb_train.csv', 168 | header=True) 169 | write_file( 170 | storage_client, 171 | evaldf_wdummies.drop([key_column, label_column], axis=1), 172 | bucket_name, 173 | 'xgb_eval.csv', 174 | header=True) 175 | 176 | with open('./output.txt', 'w') as output_file: 177 | output_file.write(bucket_name) 178 | print('Done!') 179 | 180 | 181 | def write_file(storage_client, 182 | df, 183 | bucket_name, 184 | 
destination_file_name, 185 | header): 186 | """Write a blob from the bucket.""" 187 | df_str = df.to_csv(index=False, header=header) 188 | # storage_client = storage.Client() 189 | bucket = storage_client.get_bucket(bucket_name) 190 | blob = bucket.blob('output/' + destination_file_name) 191 | blob.upload_from_string(df_str) 192 | 193 | 194 | if __name__ == '__main__': 195 | parser = argparse.ArgumentParser() 196 | parser.add_argument('--project_id', 197 | type=str, 198 | required=True, 199 | help='The GCP project_id containing the source file') 200 | parser.add_argument('--dataset_bucket', 201 | type=str, 202 | required=True, 203 | help='Bucket to store outputs.') 204 | args = parser.parse_args() 205 | 206 | obtain_train_eval(args.project_id, args.dataset_bucket) 207 | -------------------------------------------------------------------------------- /examples/kfp/bin/wi_setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright 2020 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | # Script to set up Google service accounts and workload identity bindings for a 18 | # Kubeflow Pipelines (KFP) standalone deployment. 19 | # 20 | # The script checks if the GKE cluster has Workload Identity enabled and 21 | # configured with a custom label, and if not, enables it and updates the label. 22 | # 23 | # Adapted for ML Pipeline Generator from https://github.com/kubeflow/pipelines/blob/master/manifests/kustomize/gcp-workload-identity-setup.sh 24 | # 25 | # What the script configures: 26 | # 1. Workload Identity for the cluster. 27 | # 2. Google service accounts (GSAs): $SYSTEM_GSA and $USER_GSA. 28 | # 3. Service account IAM policy bindings. 29 | # 4. Kubernetes service account annotations. 30 | # 31 | # Note: Since the node pool is updated with WI, a new KFP hostname is generated. 32 | # 33 | # Requirements: 34 | # 1. gcloud set up in the environment calling the script 35 | # 2. KFP is deployed on a GKE cluster 36 | set -e 37 | 38 | # Cluster vars 39 | PROJECT_ID=$1 40 | CLUSTER_NAME=$2 41 | ZONE=$3 42 | NAMESPACE=$4 43 | 44 | echo "Workload Identity has not been provisioned for "${CLUSTER_NAME}" ("${ZONE}"), enabling it now..." 
45 | 46 | # Google Service Account (GSA) 47 | SYSTEM_GSA=$CLUSTER_NAME-kfp-system 48 | USER_GSA=$CLUSTER_NAME-kfp-user 49 | 50 | # Kubernetes Service Account (KSA) 51 | SYSTEM_KSA=(ml-pipeline-ui ml-pipeline-visualizationserver) 52 | USER_KSA=(pipeline-runner default) 53 | 54 | gcloud container clusters get-credentials $CLUSTER_NAME \ 55 | --zone=$ZONE 56 | 57 | gcloud container clusters update $CLUSTER_NAME \ 58 | --zone=$ZONE \ 59 | --workload-pool="${PROJECT_ID}".svc.id.goog 60 | 61 | gcloud beta container node-pools update default-pool \ 62 | --cluster=$CLUSTER_NAME \ 63 | --zone=$ZONE \ 64 | --max-surge-upgrade=3 \ 65 | --max-unavailable-upgrade=0 66 | 67 | gcloud container node-pools update default-pool \ 68 | --cluster=$CLUSTER_NAME \ 69 | --zone=$ZONE \ 70 | --workload-metadata=GKE_METADATA 71 | 72 | echo "Creating Google Service Accounts..." 73 | function create_gsa_if_not_present { 74 | local name=${1} 75 | local already_present=$(gcloud iam service-accounts list --filter='name:'$name'' --format='value(name)') 76 | if [ -n "$already_present" ]; then 77 | echo "Service account $name already exists" 78 | else 79 | gcloud iam service-accounts create $name 80 | fi 81 | } 82 | 83 | create_gsa_if_not_present $SYSTEM_GSA 84 | create_gsa_if_not_present $USER_GSA 85 | 86 | # Add iam policy bindings to grant project permissions to these GSAs. 87 | gcloud projects add-iam-policy-binding $PROJECT_ID \ 88 | --member="serviceAccount:$SYSTEM_GSA@$PROJECT_ID.iam.gserviceaccount.com" \ 89 | --role="roles/editor" 90 | gcloud projects add-iam-policy-binding $PROJECT_ID \ 91 | --member="serviceAccount:$USER_GSA@$PROJECT_ID.iam.gserviceaccount.com" \ 92 | --role="roles/editor" 93 | 94 | # Bind KSA to GSA through workload identity. 95 | function bind_gsa_and_ksa { 96 | local gsa=${1} 97 | local ksa=${2} 98 | 99 | gcloud iam service-accounts add-iam-policy-binding $gsa@$PROJECT_ID.iam.gserviceaccount.com \ 100 | --member="serviceAccount:$PROJECT_ID.svc.id.goog[$NAMESPACE/$ksa]" \ 101 | --role="roles/iam.workloadIdentityUser" \ 102 | > /dev/null 103 | 104 | kubectl annotate serviceaccount \ 105 | --namespace $NAMESPACE \ 106 | --overwrite \ 107 | $ksa iam.gke.io/gcp-service-account=$gsa@$PROJECT_ID.iam.gserviceaccount.com 108 | 109 | echo "* Bound KSA $ksa to GSA $gsa" 110 | } 111 | 112 | echo "Binding each kfp system KSA to $SYSTEM_GSA" 113 | for ksa in ${SYSTEM_KSA[@]}; do 114 | bind_gsa_and_ksa $SYSTEM_GSA $ksa 115 | done 116 | 117 | echo "Binding each kfp user KSA to $USER_GSA" 118 | for ksa in ${USER_KSA[@]}; do 119 | bind_gsa_and_ksa $USER_GSA $ksa 120 | done 121 | 122 | gcloud container clusters update $CLUSTER_NAME \ 123 | --zone=$ZONE \ 124 | --update-labels mlpg_wi_auth=true 125 | 126 | RED='\033[0;31m' 127 | COLOR_RESET='\033[0m' 128 | echo -e "${RED}Workload Identity has been enabled, and KFP dashboard URL has been updated. Please update the hostname in config.yaml for future runs.${COLOR_RESET}" 129 | -------------------------------------------------------------------------------- /examples/kfp/config.yaml.example: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Config file for ML Pipeline Generator. 16 | 17 | project_id: [PROJECT ID] 18 | bucket_id: [BUCKET ID] 19 | region: "us-central1" 20 | cluster_name: [GKE CLUSTER NAME] 21 | cluster_zone: [GKE CLUSTER ZONE] 22 | scale_tier: "STANDARD_1" 23 | runtime_version: "1.15" 24 | python_version: "3.7" 25 | package_name: "ml_pipeline_gen" 26 | machine_type_pred: "n1-standard-4" 27 | 28 | data: 29 | schema: 30 | - "age" 31 | - "workclass" 32 | - "education_num" 33 | - "marital_status" 34 | - "occupation" 35 | - "relationship" 36 | - "race" 37 | - "capital_gain" 38 | - "capital_loss" 39 | - "hours_per_week" 40 | - "native_country" 41 | - "income_bracket" 42 | train: "gs://[BUCKET ID]/[MODEL NAME]/data/adult.data.csv" 43 | evaluation: "gs://[BUCKET ID]/[MODEL NAME]/data/adult.test.csv" 44 | prediction: 45 | input_data_paths: 46 | - "gs://[BUCKET ID]/[MODEL NAME]/inputs/*" 47 | input_format: "JSON" 48 | output_format: "JSON" 49 | 50 | model: 51 | # Name must start with a letter and only contain letters, numbers, and 52 | # underscores. 53 | name: [MODEL NAME] 54 | path: "model.tf_model" 55 | target: "income_bracket" 56 | metrics: 57 | - "accuracy" 58 | 59 | model_params: 60 | input_args: 61 | first_layer_size: 62 | type: "int" 63 | help: "Size of the NN first layer." 64 | default: 50 65 | num_layers: 66 | type: "int" 67 | help: "Number of layers in the NN." 68 | default: 5 69 | max_steps: 70 | default: 1000 71 | # Relative path. 72 | hyperparam_config: "hptuning_config.yaml" 73 | explain_output: 74 | explain_type: "sampledShapleyAttribution" 75 | explain_param: 76 | name: "numPaths" 77 | value: 40 78 | 79 | orchestration: 80 | host: [KFP DASHBOARD URL] 81 | -------------------------------------------------------------------------------- /examples/kfp/demo.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2020 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Demo for KubeFlow Pipelines.""" 16 | import json 17 | import os 18 | 19 | from ml_pipeline_gen.models import TFModel 20 | from ml_pipeline_gen.pipelines import KfpPipeline 21 | from model.census_preprocess import load_data 22 | 23 | 24 | def _upload_data_to_gcs(model): 25 | """Calls the preprocessing fn which uploads train/eval data to GCS.""" 26 | load_data(model.data["train"], model.data["evaluation"]) 27 | 28 | 29 | # TODO(humichael): See if there's a way to support csv batch predicts. 
30 | def _upload_input_data_to_gcs(model, data): 31 | input_path = "tf_input_data.json" 32 | with open(input_path, "w+") as f: 33 | for features in data: 34 | f.write(json.dumps(features) + "\n") 35 | model.upload_pred_input_data(input_path) 36 | os.remove(input_path) 37 | 38 | 39 | # pylint: disable=g-import-not-at-top 40 | def main(): 41 | config = "config.yaml" 42 | model = TFModel(config) 43 | model.generate_files() 44 | _upload_data_to_gcs(model) 45 | pipeline = KfpPipeline(model) 46 | 47 | # preprocess and upload dataset to expected location. 48 | load_data(model.data["train"], model.data["evaluation"]) 49 | 50 | # define pipeline structure 51 | p = pipeline.add_train_component() 52 | pipeline.add_deploy_component(parent=p) 53 | pipeline.add_predict_component(parent=p) 54 | pipeline.print_structure() 55 | 56 | pipeline.generate_pipeline() 57 | 58 | # Create batch prediction data in GCS. 59 | pred_input = [{ 60 | "age": 0.02599666, 61 | "workclass": 6, 62 | "education_num": 1.1365801, 63 | "marital_status": 4, 64 | "occupation": 0, 65 | "relationship": 1, 66 | "race": 4, 67 | "capital_gain": 0.14693314, 68 | "capital_loss": -0.21713187, 69 | "hours_per_week": -0.034039237, 70 | "native_country": 38, 71 | "income_bracket": 0, 72 | }] 73 | _upload_input_data_to_gcs(model, pred_input) 74 | 75 | # Run the pipeline. 76 | # pylint: disable=import-outside-toplevel 77 | from orchestration import pipeline as kfp_pipeline 78 | kfp_pipeline.main() 79 | 80 | 81 | if __name__ == "__main__": 82 | main() 83 | -------------------------------------------------------------------------------- /examples/kfp/hptuning_config.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | trainingInput: 15 | hyperparameters: 16 | hyperparameterMetricTag: accuracy 17 | goal: MAXIMIZE 18 | maxTrials: 4 19 | maxParallelTrials: 2 20 | enableTrialEarlyStopping: True 21 | params: 22 | - parameterName: first_layer_size 23 | type: INTEGER 24 | minValue: 50 25 | maxValue: 500 26 | scaleType: UNIT_LINEAR_SCALE 27 | - parameterName: num_layers 28 | type: INTEGER 29 | minValue: 1 30 | maxValue: 15 31 | scaleType: UNIT_LINEAR_SCALE 32 | -------------------------------------------------------------------------------- /examples/kfp/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ml-pipeline-generator-python/b0508160c1752a149bdc5c862560387fd9c65d77/examples/kfp/model/__init__.py -------------------------------------------------------------------------------- /examples/kfp/model/census_preprocess.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2019 Google Inc. All Rights Reserved. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Preprocesses the census dataset for a simple TF classifier. 16 | 17 | This example comes from the cloudml-samples keras demo. 18 | github.com/GoogleCloudPlatform/cloudml-samples/blob/master/census/tf-keras 19 | """ 20 | from __future__ import absolute_import 21 | from __future__ import division 22 | from __future__ import print_function 23 | 24 | import os 25 | from six.moves import urllib 26 | import tempfile 27 | 28 | import numpy as np 29 | import pandas as pd 30 | import tensorflow.compat.v1 as tf 31 | 32 | 33 | DATA_DIR = os.path.join(tempfile.gettempdir(), "census_data") 34 | DATA_URL = ("https://storage.googleapis.com/cloud-samples-data/ai-platform" 35 | + "/census/data/") 36 | TRAINING_FILE = "adult.data.csv" 37 | EVAL_FILE = "adult.test.csv" 38 | TRAINING_URL = os.path.join(DATA_URL, TRAINING_FILE) 39 | EVAL_URL = os.path.join(DATA_URL, EVAL_FILE) 40 | 41 | _CSV_COLUMNS = [ 42 | "age", "workclass", "fnlwgt", "education", "education_num", 43 | "marital_status", "occupation", "relationship", "race", "gender", 44 | "capital_gain", "capital_loss", "hours_per_week", "native_country", 45 | "income_bracket", 46 | ] 47 | _LABEL_COLUMN = "income_bracket" 48 | UNUSED_COLUMNS = ["fnlwgt", "education", "gender"] 49 | 50 | _CATEGORICAL_TYPES = { 51 | "workclass": pd.api.types.CategoricalDtype(categories=[ 52 | "Federal-gov", "Local-gov", "Never-worked", "Private", "Self-emp-inc", 53 | "Self-emp-not-inc", "State-gov", "Without-pay" 54 | ]), 55 | "marital_status": pd.api.types.CategoricalDtype(categories=[ 56 | "Divorced", "Married-AF-spouse", "Married-civ-spouse", 57 | "Married-spouse-absent", "Never-married", "Separated", "Widowed" 58 | ]), 59 | "occupation": pd.api.types.CategoricalDtype([ 60 | "Adm-clerical", "Armed-Forces", "Craft-repair", "Exec-managerial", 61 | "Farming-fishing", "Handlers-cleaners", "Machine-op-inspct", 62 | "Other-service", "Priv-house-serv", "Prof-specialty", "Protective-serv", 63 | "Sales", "Tech-support", "Transport-moving" 64 | ]), 65 | "relationship": pd.api.types.CategoricalDtype(categories=[ 66 | "Husband", "Not-in-family", "Other-relative", "Own-child", "Unmarried", 67 | "Wife" 68 | ]), 69 | "race": pd.api.types.CategoricalDtype(categories=[ 70 | "Amer-Indian-Eskimo", "Asian-Pac-Islander", "Black", "Other", "White" 71 | ]), 72 | "native_country": pd.api.types.CategoricalDtype(categories=[ 73 | "Cambodia", "Canada", "China", "Columbia", "Cuba", "Dominican-Republic", 74 | "Ecuador", "El-Salvador", "England", "France", "Germany", "Greece", 75 | "Guatemala", "Haiti", "Holand-Netherlands", "Honduras", "Hong", 76 | "Hungary", "India", "Iran", "Ireland", "Italy", "Jamaica", "Japan", 77 | "Laos", "Mexico", "Nicaragua", "Outlying-US(Guam-USVI-etc)", "Peru", 78 | "Philippines", "Poland", "Portugal", "Puerto-Rico", "Scotland", "South", 79 | "Taiwan", "Thailand", "Trinadad&Tobago", "United-States", "Vietnam", 80 | "Yugoslavia" 81 | ]), 82 | "income_bracket": 
pd.api.types.CategoricalDtype(categories=[ 83 | "<=50K", ">50K" 84 | ]) 85 | } 86 | 87 | 88 | def _download_and_clean_file(filename, url): 89 | """Downloads data from url, and makes changes to match the CSV format. 90 | 91 | The CSVs may use spaces after the comma delimiters (non-standard) or include 92 | rows which do not represent well-formed examples. This function strips out 93 | some of these problems. 94 | 95 | Args: 96 | filename: filename to save url to 97 | url: URL of resource to download 98 | """ 99 | temp_file, _ = urllib.request.urlretrieve(url) 100 | with tf.io.gfile.GFile(temp_file, "r") as temp_file_object: 101 | with tf.io.gfile.GFile(filename, "w") as file_object: 102 | for line in temp_file_object: 103 | line = line.strip() 104 | line = line.replace(", ", ",") 105 | if not line or "," not in line: 106 | continue 107 | if line[-1] == ".": 108 | line = line[:-1] 109 | line += "\n" 110 | file_object.write(line) 111 | tf.io.gfile.remove(temp_file) 112 | 113 | 114 | def download(data_dir): 115 | """Downloads census data if it is not already present. 116 | 117 | Args: 118 | data_dir: directory where we will access/save the census data 119 | 120 | Returns: 121 | A tuple (training_file_path, eval_file_path) of local file paths. 122 | """ 123 | tf.io.gfile.makedirs(data_dir) 124 | 125 | training_file_path = os.path.join(data_dir, TRAINING_FILE) 126 | if not tf.io.gfile.exists(training_file_path): 127 | _download_and_clean_file(training_file_path, TRAINING_URL) 128 | 129 | eval_file_path = os.path.join(data_dir, EVAL_FILE) 130 | if not tf.io.gfile.exists(eval_file_path): 131 | _download_and_clean_file(eval_file_path, EVAL_URL) 132 | 133 | return training_file_path, eval_file_path 134 | 135 | 136 | def upload(train_df, eval_df, train_path, eval_path): 137 | train_df.to_csv(os.path.join(os.path.dirname(train_path), TRAINING_FILE), 138 | index=False, header=False) 139 | eval_df.to_csv(os.path.join(os.path.dirname(eval_path), EVAL_FILE), 140 | index=False, header=False) 141 | 142 | 143 | def preprocess(dataframe): 144 | """Converts categorical features to numeric. Removes unused columns. 145 | 146 | Args: 147 | dataframe: Pandas dataframe with raw data 148 | 149 | Returns: 150 | Dataframe with preprocessed data 151 | """ 152 | dataframe = dataframe.drop(columns=UNUSED_COLUMNS) 153 | 154 | # Convert integer valued (numeric) columns to floating point 155 | numeric_columns = dataframe.select_dtypes(["int64"]).columns 156 | dataframe[numeric_columns] = dataframe[numeric_columns].astype("float32") 157 | 158 | # Convert categorical columns to numeric 159 | cat_columns = dataframe.select_dtypes(["object"]).columns 160 | dataframe[cat_columns] = dataframe[cat_columns].apply( 161 | lambda x: x.astype(_CATEGORICAL_TYPES[x.name])) 162 | dataframe[cat_columns] = dataframe[cat_columns].apply( 163 | lambda x: x.cat.codes) 164 | return dataframe 165 | 166 | 167 | def standardize(dataframe): 168 | """Scales numerical columns using their means and standard deviation. 169 | 170 | Args: 171 | dataframe: Pandas dataframe 172 | 173 | Returns: 174 | Input dataframe with the numerical columns scaled to z-scores 175 | """ 176 | dtypes = list(zip(dataframe.dtypes.index, map(str, dataframe.dtypes))) 177 | for column, dtype in dtypes: 178 | if dtype == "float32": 179 | dataframe[column] -= dataframe[column].mean() 180 | dataframe[column] /= dataframe[column].std() 181 | return dataframe 182 | 183 | 184 | def load_data(train_path="", eval_path=""): 185 | """Loads data into preprocessed (train_x, train_y, eval_x, eval_y) dataframes. 
186 | 187 | Args: 188 | train_path: Local or GCS path to upload train data to. 189 | eval_path: Local or GCS path to upload eval data to. 190 | 191 | Returns: 192 | A tuple (train_x, train_y, eval_x, eval_y), where train_x and eval_x are 193 | Pandas dataframes with features for training and train_y and eval_y are 194 | numpy arrays with the corresponding labels. 195 | """ 196 | # Download Census dataset: Training and eval csv files. 197 | training_file_path, eval_file_path = download(DATA_DIR) 198 | 199 | train_df = pd.read_csv( 200 | training_file_path, names=_CSV_COLUMNS, na_values="?") 201 | eval_df = pd.read_csv(eval_file_path, names=_CSV_COLUMNS, na_values="?") 202 | 203 | train_df = preprocess(train_df) 204 | eval_df = preprocess(eval_df) 205 | 206 | # Split train and eval data with labels. The pop method copies and removes 207 | # the label column from the dataframe. 208 | train_x, train_y = train_df, train_df.pop(_LABEL_COLUMN) 209 | eval_x, eval_y = eval_df, eval_df.pop(_LABEL_COLUMN) 210 | 211 | # Join train_x and eval_x to normalize on overall means and standard 212 | # deviations. Then separate them again. 213 | all_x = pd.concat([train_x, eval_x], keys=["train", "eval"]) 214 | all_x = standardize(all_x) 215 | train_x, eval_x = all_x.xs("train"), all_x.xs("eval") 216 | 217 | # Rejoin features and labels and upload to GCS. 218 | if train_path and eval_path: 219 | train_df = train_x.copy() 220 | train_df[_LABEL_COLUMN] = train_y 221 | eval_df = eval_x.copy() 222 | eval_df[_LABEL_COLUMN] = eval_y 223 | upload(train_df, eval_df, train_path, eval_path) 224 | 225 | # Reshape label columns for use with tf.data.Dataset 226 | train_y = np.asarray(train_y).astype("float32").reshape((-1, 1)) 227 | eval_y = np.asarray(eval_y).astype("float32").reshape((-1, 1)) 228 | 229 | return train_x, train_y, eval_x, eval_y 230 | 231 | -------------------------------------------------------------------------------- /examples/kfp/model/tf_model.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2019 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Train a simple TF classifier for census dataset.""" 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | 20 | import argparse 21 | 22 | import tensorflow.compat.v1 as tf 23 | 24 | from model.census_preprocess import load_data 25 | 26 | 27 | def get_model(inputs, params): 28 | """Trains a classifier on iris data.""" 29 | dense = tf.keras.layers.Dense 30 | nn = dense(params.first_layer_size, activation="relu", 31 | kernel_initializer="uniform")(inputs) 32 | for i in reversed(range(1, params.num_layers)): 33 | layer_size = int(params.first_layer_size * (i / params.num_layers)) 34 | nn = dense(max(1, layer_size), activation="relu")(nn) 35 | logits = dense(1, activation="sigmoid")(nn) 36 | 37 | return logits 38 | 39 | 40 | # TODO(humichael): create get_predicition and get_evaluation instead. 41 | def get_loss(): 42 | """The loss function to use.""" 43 | return tf.losses.sigmoid_cross_entropy 44 | 45 | 46 | def main(): 47 | """Trains a model locally to test get_model() and get_loss().""" 48 | train_x, train_y, _, _ = load_data() 49 | input_layer = tf.keras.layers.Input(shape=(train_x.shape[1],)) 50 | params = argparse.Namespace(first_layer_size=50, num_layers=5) 51 | predictions = get_model(input_layer, params) 52 | model = tf.keras.models.Model(inputs=input_layer, outputs=predictions) 53 | model.compile(optimizer="adam", loss=get_loss(), 54 | metrics=["accuracy"]) 55 | model.fit(train_x, train_y, epochs=1) 56 | 57 | 58 | if __name__ == "__main__": 59 | main() 60 | -------------------------------------------------------------------------------- /examples/sklearn/config.yaml.example: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Config file for ML Pipeline Generator. 16 | 17 | project_id: [PROJECT ID] 18 | bucket_id: [BUCKET ID] 19 | region: "us-central1" 20 | scale_tier: "STANDARD_1" 21 | runtime_version: "1.15" 22 | python_version: "3.7" 23 | package_name: "ml_pipeline_gen" 24 | machine_type_pred: "mls1-c4-m2" 25 | 26 | data: 27 | schema: 28 | - "age" 29 | - "workclass" 30 | - "education_num" 31 | - "marital_status" 32 | - "occupation" 33 | - "relationship" 34 | - "race" 35 | - "capital_gain" 36 | - "capital_loss" 37 | - "hours_per_week" 38 | - "native_country" 39 | - "income_bracket" 40 | train: "gs://[BUCKET ID]/[MODEL NAME]/data/adult.data.csv" 41 | evaluation: "gs://[BUCKET ID]/[MODEL NAME]/data/adult.test.csv" 42 | prediction: 43 | input_data_paths: 44 | - "gs://[BUCKET ID]/[MODEL NAME]/inputs/*" 45 | input_format: "JSON" 46 | output_format: "JSON" 47 | 48 | model: 49 | # Name must start with a letter and only contain letters, numbers, and 50 | # underscores. 
51 | name: [MODEL NAME] 52 | path: "model.sklearn_model" 53 | target: "income_bracket" 54 | 55 | model_params: 56 | input_args: 57 | C: 58 | type: "float" 59 | help: "Regularization parameter, must be positive." 60 | default: 1.0 61 | # Relative path. 62 | hyperparam_config: "hptuning_config.yaml" 63 | -------------------------------------------------------------------------------- /examples/sklearn/demo.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2020 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Demo for scikit-learn ML Pipeline Generator.""" 16 | from ml_pipeline_gen.models import SklearnModel 17 | from model.census_preprocess import load_data 18 | 19 | 20 | def _upload_data_to_gcs(model): 21 | load_data(model.data["train"], model.data["evaluation"]) 22 | 23 | 24 | def main(): 25 | config = "config.yaml" 26 | pred_input = [ 27 | [0.02599666, 6, 1.1365801, 4, 0, 1, 4, 0.14693314, -0.21713187, 28 | -0.034039237, 38], 29 | ] 30 | model = SklearnModel(config) 31 | model.generate_files() 32 | _upload_data_to_gcs(model) 33 | 34 | job_id = model.train(tune=True) 35 | version = model.deploy(job_id=job_id) 36 | preds = model.online_predict(pred_input, version=version) 37 | 38 | print("Features: {}".format(pred_input)) 39 | print("Predictions: {}".format(preds)) 40 | 41 | 42 | if __name__ == "__main__": 43 | main() 44 | -------------------------------------------------------------------------------- /examples/sklearn/hptuning_config.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
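The hyperparameter config that follows sets hyperparameterMetricTag: score, which only has an effect if each trial reports a metric under that exact tag. A minimal sketch of how a trainer could report it, assuming the cloudml-hypertune helper package (an assumption; this snapshot does not show which reporting mechanism the generated trainer uses):

import hypertune  # pip install cloudml-hypertune

score = 0.87  # e.g., model.score(eval_x, eval_y) for this trial's model
hpt = hypertune.HyperTune()
hpt.report_hyperparameter_tuning_metric(
    hyperparameter_metric_tag="score",  # must match hyperparameterMetricTag
    metric_value=score,
    global_step=0)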
14 | trainingInput:
15 | scaleTier: STANDARD_1
16 | hyperparameters:
17 | goal: MAXIMIZE
18 | maxTrials: 2
19 | maxParallelTrials: 2
20 | hyperparameterMetricTag: score
21 | enableTrialEarlyStopping: TRUE
22 | params:
23 | - parameterName: C
24 | type: DOUBLE
25 | minValue: .001
26 | maxValue: 10
27 | scaleType: UNIT_LOG_SCALE
28 |
--------------------------------------------------------------------------------
/examples/sklearn/model/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ml-pipeline-generator-python/b0508160c1752a149bdc5c862560387fd9c65d77/examples/sklearn/model/__init__.py
--------------------------------------------------------------------------------
/examples/sklearn/model/census_preprocess.py:
--------------------------------------------------------------------------------
1 | # python3
2 | # Copyright 2019 Google Inc. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Preprocesses the census dataset.
16 |
17 | This example comes from the cloudml-samples keras demo.
18 | github.com/GoogleCloudPlatform/cloudml-samples/blob/master/census/tf-keras
19 | """
20 | from __future__ import absolute_import
21 | from __future__ import division
22 | from __future__ import print_function
23 |
24 | import os
25 | from six.moves import urllib
26 | import tempfile
27 |
28 | import numpy as np
29 | import pandas as pd
30 | import tensorflow.compat.v1 as tf
31 |
32 |
33 | DATA_DIR = os.path.join(tempfile.gettempdir(), "census_data")
34 | DATA_URL = ("https://storage.googleapis.com/cloud-samples-data/ai-platform"
35 | + "/census/data/")
36 | TRAINING_FILE = "adult.data.csv"
37 | EVAL_FILE = "adult.test.csv"
38 | TRAINING_URL = os.path.join(DATA_URL, TRAINING_FILE)
39 | EVAL_URL = os.path.join(DATA_URL, EVAL_FILE)
40 |
41 | _CSV_COLUMNS = [
42 | "age", "workclass", "fnlwgt", "education", "education_num",
43 | "marital_status", "occupation", "relationship", "race", "gender",
44 | "capital_gain", "capital_loss", "hours_per_week", "native_country",
45 | "income_bracket",
46 | ]
47 | _LABEL_COLUMN = "income_bracket"
48 | UNUSED_COLUMNS = ["fnlwgt", "education", "gender"]
49 |
50 | _CATEGORICAL_TYPES = {
51 | "workclass": pd.api.types.CategoricalDtype(categories=[
52 | "Federal-gov", "Local-gov", "Never-worked", "Private", "Self-emp-inc",
53 | "Self-emp-not-inc", "State-gov", "Without-pay"
54 | ]),
55 | "marital_status": pd.api.types.CategoricalDtype(categories=[
56 | "Divorced", "Married-AF-spouse", "Married-civ-spouse",
57 | "Married-spouse-absent", "Never-married", "Separated", "Widowed"
58 | ]),
59 | "occupation": pd.api.types.CategoricalDtype([
60 | "Adm-clerical", "Armed-Forces", "Craft-repair", "Exec-managerial",
61 | "Farming-fishing", "Handlers-cleaners", "Machine-op-inspct",
62 | "Other-service", "Priv-house-serv", "Prof-specialty", "Protective-serv",
63 | "Sales", "Tech-support", "Transport-moving"
64 | ]),
65 | "relationship": pd.api.types.CategoricalDtype(categories=[
"relationship": pd.api.types.CategoricalDtype(categories=[ 66 | "Husband", "Not-in-family", "Other-relative", "Own-child", "Unmarried", 67 | "Wife" 68 | ]), 69 | "race": pd.api.types.CategoricalDtype(categories=[ 70 | "Amer-Indian-Eskimo", "Asian-Pac-Islander", "Black", "Other", "White" 71 | ]), 72 | "native_country": pd.api.types.CategoricalDtype(categories=[ 73 | "Cambodia", "Canada", "China", "Columbia", "Cuba", "Dominican-Republic", 74 | "Ecuador", "El-Salvador", "England", "France", "Germany", "Greece", 75 | "Guatemala", "Haiti", "Holand-Netherlands", "Honduras", "Hong", 76 | "Hungary", "India", "Iran", "Ireland", "Italy", "Jamaica", "Japan", 77 | "Laos", "Mexico", "Nicaragua", "Outlying-US(Guam-USVI-etc)", "Peru", 78 | "Philippines", "Poland", "Portugal", "Puerto-Rico", "Scotland", "South", 79 | "Taiwan", "Thailand", "Trinadad&Tobago", "United-States", "Vietnam", 80 | "Yugoslavia" 81 | ]), 82 | "income_bracket": pd.api.types.CategoricalDtype(categories=[ 83 | "<=50K", ">50K" 84 | ]) 85 | } 86 | 87 | 88 | def _download_and_clean_file(filename, url): 89 | """Downloads data from url, and makes changes to match the CSV format. 90 | 91 | The CSVs may use spaces after the comma delimters (non-standard) or include 92 | rows which do not represent well-formed examples. This function strips out 93 | some of these problems. 94 | 95 | Args: 96 | filename: filename to save url to 97 | url: URL of resource to download 98 | """ 99 | temp_file, _ = urllib.request.urlretrieve(url) 100 | with tf.io.gfile.GFile(temp_file, "r") as temp_file_object: 101 | with tf.io.gfile.GFile(filename, "w") as file_object: 102 | for line in temp_file_object: 103 | line = line.strip() 104 | line = line.replace(", ", ",") 105 | if not line or "," not in line: 106 | continue 107 | if line[-1] == ".": 108 | line = line[:-1] 109 | line += "\n" 110 | file_object.write(line) 111 | tf.io.gfile.remove(temp_file) 112 | 113 | 114 | def download(data_dir): 115 | """Downloads census data if it is not already present. 116 | 117 | Args: 118 | data_dir: directory where we will access/save the census data 119 | 120 | Returns: 121 | foo 122 | """ 123 | tf.io.gfile.makedirs(data_dir) 124 | 125 | training_file_path = os.path.join(data_dir, TRAINING_FILE) 126 | if not tf.io.gfile.exists(training_file_path): 127 | _download_and_clean_file(training_file_path, TRAINING_URL) 128 | 129 | eval_file_path = os.path.join(data_dir, EVAL_FILE) 130 | if not tf.io.gfile.exists(eval_file_path): 131 | _download_and_clean_file(eval_file_path, EVAL_URL) 132 | 133 | return training_file_path, eval_file_path 134 | 135 | 136 | def upload(train_df, eval_df, train_path, eval_path): 137 | train_df.to_csv(os.path.join(os.path.dirname(train_path), TRAINING_FILE), 138 | index=False, header=False) 139 | eval_df.to_csv(os.path.join(os.path.dirname(eval_path), EVAL_FILE), 140 | index=False, header=False) 141 | 142 | 143 | def preprocess(dataframe): 144 | """Converts categorical features to numeric. Removes unused columns. 
145 |
146 | Args:
147 | dataframe: Pandas dataframe with raw data
148 |
149 | Returns:
150 | Dataframe with preprocessed data
151 | """
152 | dataframe = dataframe.drop(columns=UNUSED_COLUMNS)
153 |
154 | # Convert integer valued (numeric) columns to floating point
155 | numeric_columns = dataframe.select_dtypes(["int64"]).columns
156 | dataframe[numeric_columns] = dataframe[numeric_columns].astype("float32")
157 |
158 | # Convert categorical columns to numeric
159 | cat_columns = dataframe.select_dtypes(["object"]).columns
160 | dataframe[cat_columns] = dataframe[cat_columns].apply(
161 | lambda x: x.astype(_CATEGORICAL_TYPES[x.name]))
162 | dataframe[cat_columns] = dataframe[cat_columns].apply(
163 | lambda x: x.cat.codes)
164 | return dataframe
165 |
166 |
167 | def standardize(dataframe):
168 | """Scales numerical columns using their means and standard deviations.
169 |
170 | Args:
171 | dataframe: Pandas dataframe
172 |
173 | Returns:
174 | Input dataframe with the numerical columns scaled to z-scores
175 | """
176 | dtypes = list(zip(dataframe.dtypes.index, map(str, dataframe.dtypes)))
177 | for column, dtype in dtypes:
178 | if dtype == "float32":
179 | dataframe[column] -= dataframe[column].mean()
180 | dataframe[column] /= dataframe[column].std()
181 | return dataframe
182 |
183 |
184 | def load_data(train_path="", eval_path=""):
185 | """Loads data into preprocessed (train_x, train_y, eval_x, eval_y) dataframes.
186 |
187 | Args:
188 | train_path: Local or GCS path to upload train data to.
189 | eval_path: Local or GCS path to upload eval data to.
190 |
191 | Returns:
192 | A tuple (train_x, train_y, eval_x, eval_y), where train_x and eval_x are
193 | Pandas dataframes with features for training and train_y and eval_y are
194 | numpy arrays with the corresponding labels.
195 | """
196 | # Download Census dataset: Training and eval csv files.
197 | training_file_path, eval_file_path = download(DATA_DIR)
198 |
199 | train_df = pd.read_csv(
200 | training_file_path, names=_CSV_COLUMNS, na_values="?")
201 | eval_df = pd.read_csv(eval_file_path, names=_CSV_COLUMNS, na_values="?")
202 |
203 | train_df = preprocess(train_df)
204 | eval_df = preprocess(eval_df)
205 |
206 | # Split train and eval data with labels. The pop method copies and removes
207 | # the label column from the dataframe.
208 | train_x, train_y = train_df, train_df.pop(_LABEL_COLUMN)
209 | eval_x, eval_y = eval_df, eval_df.pop(_LABEL_COLUMN)
210 |
211 | # Join train_x and eval_x to normalize on overall means and standard
212 | # deviations. Then separate them again.
213 | all_x = pd.concat([train_x, eval_x], keys=["train", "eval"])
214 | all_x = standardize(all_x)
215 | train_x, eval_x = all_x.xs("train"), all_x.xs("eval")
216 |
217 | # Rejoin features and labels and upload to GCS.
218 | if train_path and eval_path: 219 | train_df = train_x.copy() 220 | train_df[_LABEL_COLUMN] = train_y 221 | eval_df = eval_x.copy() 222 | eval_df[_LABEL_COLUMN] = eval_y 223 | upload(train_df, eval_df, train_path, eval_path) 224 | 225 | # Reshape label columns for use with tf.data.Dataset 226 | train_y = np.asarray(train_y).astype("float32").reshape((-1, 1)) 227 | eval_y = np.asarray(eval_y).astype("float32").reshape((-1, 1)) 228 | 229 | return train_x, train_y, eval_x, eval_y 230 | 231 | -------------------------------------------------------------------------------- /examples/sklearn/model/sklearn_model.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2019 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Train a simple SVM classifier.""" 16 | 17 | import argparse 18 | import numpy as np 19 | from sklearn import svm 20 | 21 | from model.census_preprocess import load_data 22 | 23 | 24 | def get_model(params): 25 | """Trains a classifier.""" 26 | classifier = svm.SVC(C=params.C) 27 | return classifier 28 | 29 | 30 | def main(): 31 | """Trains a model locally to test get_model().""" 32 | train_x, train_y, eval_x, eval_y = load_data() 33 | train_y, eval_y = [np.ravel(x) for x in [train_y, eval_y]] 34 | params = argparse.Namespace(C=1.0) 35 | model = get_model(params) 36 | model.fit(train_x, train_y) 37 | score = model.score(eval_x, eval_y) 38 | print(score) 39 | 40 | 41 | if __name__ == "__main__": 42 | main() 43 | -------------------------------------------------------------------------------- /examples/taxi/sklearn/config.yaml.example: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Config file for ML Pipeline Generator. 
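In the taxi config that follows, model.target ("tip") names one of the data.schema columns; the preprocessing modules in these examples pop the target column off the dataframe, leaving the rest as features. A small illustration of that split (hypothetical column subset, not the full schema):

schema = ["trip_miles", "trip_seconds", "fare", "payment_type", "tip"]
target = "tip"
features = [column for column in schema if column != target]
print(features)  # ['trip_miles', 'trip_seconds', 'fare', 'payment_type']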
16 | 17 | project_id: [PROJECT ID] 18 | bucket_id: [BUCKET ID] 19 | region: "us-central1" 20 | scale_tier: "STANDARD_1" 21 | runtime_version: "1.15" 22 | python_version: "3.7" 23 | package_name: "ml_pipeline_gen" 24 | machine_type_pred: "mls1-c4-m2" 25 | 26 | data: 27 | schema: 28 | - "trip_miles" 29 | - "trip_seconds" 30 | - "fare" 31 | - "trip_start_month" 32 | - "trip_start_hour" 33 | - "trip_start_day" 34 | - "pickup_community_area" 35 | - "dropoff_community_area" 36 | - "pickup_census_tract" 37 | - "dropoff_census_tract" 38 | - "pickup_latitude" 39 | - "pickup_longitude" 40 | - "dropoff_latitude" 41 | - "dropoff_longitude" 42 | - "payment_type" 43 | - "company" 44 | - "tip" 45 | train: "gs://[BUCKET ID]/[MODEL NAME]/data/taxi_trips_train.csv" 46 | evaluation: "gs://[BUCKET ID]/[MODEL NAME]/data/taxi_trips_eval.csv" 47 | prediction: 48 | input_data_paths: 49 | - "gs://[BUCKET ID]/[MODEL NAME]/inputs/*" 50 | input_format: "JSON" 51 | output_format: "JSON" 52 | 53 | model: 54 | # Name must start with a letter and only contain letters, numbers, and 55 | # underscores. 56 | name: [MODEL NAME] 57 | path: "model.sklearn_model" 58 | target: "tip" 59 | 60 | model_params: 61 | input_args: 62 | C: 63 | type: "float" 64 | help: "Regularization parameter, must be positive." 65 | default: 1.0 66 | hyperparam_config: "hptuning_config.yaml" 67 | -------------------------------------------------------------------------------- /examples/taxi/sklearn/demo.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2020 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Demo for scikit-learn ML Pipeline Generator.""" 16 | from ml_pipeline_gen.models import SklearnModel 17 | from model.taxi_preprocess import load_data 18 | 19 | 20 | def _upload_data_to_gcs(model): 21 | load_data(model.data["train"], model.data["evaluation"]) 22 | 23 | 24 | def main(): 25 | config = "config.yaml" 26 | pred_input = [ 27 | [1.0, -0.56447923, -0.5502175, -1.00234, -0.60791147, 28 | 0.38163432, 0.5846407, 0.6274534, 1.4543412, -0.09238409, 29 | 41.881, -87.633, 41.885, -87.62100000000001, 1, 3], 30 | ] 31 | model = SklearnModel(config) 32 | model.generate_files() 33 | _upload_data_to_gcs(model) 34 | 35 | job_id = model.train(tune=True) 36 | version = model.deploy(job_id=job_id) 37 | preds = model.online_predict(pred_input, version=version) 38 | 39 | print("Features: {}".format(pred_input)) 40 | print("Predictions: {}".format(preds)) 41 | 42 | 43 | if __name__ == "__main__": 44 | main() 45 | -------------------------------------------------------------------------------- /examples/taxi/sklearn/hptuning_config.yaml: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2020 Google Inc. All Rights Reserved. 
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | trainingInput:
16 | scaleTier: STANDARD_1
17 | hyperparameters:
18 | goal: MAXIMIZE
19 | maxTrials: 2
20 | maxParallelTrials: 2
21 | hyperparameterMetricTag: score
22 | enableTrialEarlyStopping: TRUE
23 | params:
24 | - parameterName: C
25 | type: DOUBLE
26 | minValue: .001
27 | maxValue: 10
28 | scaleType: UNIT_LOG_SCALE
29 |
--------------------------------------------------------------------------------
/examples/taxi/sklearn/model/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ml-pipeline-generator-python/b0508160c1752a149bdc5c862560387fd9c65d77/examples/taxi/sklearn/model/__init__.py
--------------------------------------------------------------------------------
/examples/taxi/sklearn/model/sklearn_model.py:
--------------------------------------------------------------------------------
1 | # python3
2 | # Copyright 2019 Google Inc. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
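The model below reads params.C, which is declared under model_params.input_args in config.yaml; the generated task.py presumably exposes it as a command-line flag along these lines (a hedged sketch of that mapping, not the generator's verbatim output):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--C", type=float, default=1.0,
                    help="Regularization parameter, must be positive.")
params = parser.parse_args(["--C", "0.5"])  # trial values arrive as flags
print(params.C)  # 0.5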
27 | """Train a simple SVM classifier.""" 28 | 29 | import argparse 30 | import numpy as np 31 | from sklearn import svm 32 | 33 | from model.taxi_preprocess import load_data 34 | 35 | 36 | def get_model(params): 37 | """Trains a classifier.""" 38 | classifier = svm.SVC(C=params.C) 39 | return classifier 40 | 41 | 42 | def main(): 43 | """Trains a model locally to test get_model().""" 44 | train_x, train_y, eval_x, eval_y = load_data() 45 | train_y, eval_y = [np.ravel(x) for x in [train_y, eval_y]] 46 | params = argparse.Namespace(C=1.0) 47 | model = get_model(params) 48 | model.fit(train_x, train_y) 49 | score = model.score(eval_x, eval_y) 50 | print(score) 51 | 52 | 53 | if __name__ == "__main__": 54 | main() 55 | -------------------------------------------------------------------------------- /examples/taxi/tf/config.yaml.example: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Config file for ML Pipeline Generator. 16 | 17 | project_id: [PROJECT ID] 18 | bucket_id: [BUCKET ID] 19 | region: "us-central1" 20 | scale_tier: "STANDARD_1" 21 | runtime_version: "1.15" 22 | python_version: "3.7" 23 | package_name: "ml_pipeline_gen" 24 | machine_type_pred: "n1-standard-4" 25 | 26 | data: 27 | schema: 28 | - "trip_miles" 29 | - "trip_seconds" 30 | - "fare" 31 | - "trip_start_month" 32 | - "trip_start_hour" 33 | - "trip_start_day" 34 | - "pickup_community_area" 35 | - "dropoff_community_area" 36 | - "pickup_census_tract" 37 | - "dropoff_census_tract" 38 | - "pickup_latitude" 39 | - "pickup_longitude" 40 | - "dropoff_latitude" 41 | - "dropoff_longitude" 42 | - "payment_type" 43 | - "company" 44 | - "tip" 45 | train: "gs://[BUCKET ID]/[MODEL NAME]/data/taxi_trips_train.csv" 46 | evaluation: "gs://[BUCKET ID]/[MODEL NAME]/data/taxi_trips_eval.csv" 47 | prediction: 48 | input_data_paths: 49 | - "gs://[BUCKET ID]/[MODEL NAME]/inputs/*" 50 | input_format: "JSON" 51 | output_format: "JSON" 52 | 53 | model: 54 | # Name must start with a letter and only contain letters, numbers, and 55 | # underscores. 56 | name: [MODEL NAME] 57 | path: "model.tf_model" 58 | target: "tip" 59 | metrics: 60 | - "accuracy" 61 | 62 | model_params: 63 | input_args: 64 | first_layer_size: 65 | type: "int" 66 | help: "Size of the NN first layer." 67 | default: 50 68 | num_layers: 69 | type: "int" 70 | help: "Number of layers in the NN." 71 | default: 5 72 | max_steps: 73 | default: 1000 74 | hyperparam_config: "hptuning_config.yaml" 75 | explain_output: 76 | explain_type: "sampledShapleyAttribution" 77 | explain_param: 78 | name: "numPaths" 79 | value: 40 80 | -------------------------------------------------------------------------------- /examples/taxi/tf/demo.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2020 Google Inc. All Rights Reserved. 
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Demo for TF ML Pipeline Generator."""
16 | import json
17 | import os
18 |
19 | from ml_pipeline_gen.models import TFModel
20 | from model.taxi_preprocess import load_data
21 |
22 |
23 | def _upload_data_to_gcs(model):
24 | load_data(model.data["train"], model.data["evaluation"])
25 |
26 |
27 | # TODO(humichael): See if there's a way to support csv batch predicts.
28 | def _upload_input_data_to_gcs(model, data):
29 | input_path = "./tf_input_data.json"
30 | with open(input_path, "w+") as f:
31 | for features in data:
32 | f.write(json.dumps(features) + "\n")
33 | model.upload_pred_input_data(input_path)
34 | os.remove(input_path)
35 |
36 |
37 | def main():
38 | explanations = True
39 | config = "config.yaml"
40 | pred_input = [{
41 | "trip_miles": 1.0,
42 | "trip_seconds": -0.56447923,
43 | "fare": -0.5502175,
44 | "trip_start_month": -1.00234,
45 | "trip_start_hour": -0.60791147,
46 | "trip_start_day": 0.38163432,
47 | "pickup_community_area": 0.5846407,
48 | "dropoff_community_area": 0.6274534,
49 | "pickup_census_tract": 1.4543412,
50 | "dropoff_census_tract": -0.09238409,
51 | "pickup_latitude": 41.881,
52 | "pickup_longitude": -87.633,
53 | "dropoff_latitude": 41.885,
54 | "dropoff_longitude": -87.62100000000001,
55 | "payment_type": 1,
56 | "company": 3
57 | }]
58 | model = TFModel(config)
59 | model.generate_files()
60 | _upload_data_to_gcs(model)
61 |
62 | job_id = model.train(tune=True)
63 | version = model.deploy(job_id=job_id, explanations=explanations)
64 | if explanations:
65 | explanations = model.online_explanations(pred_input,
66 | version=version)
67 | print("Online Explanations")
68 | print("Explanations: {}".format(explanations))
69 | preds = model.online_predict(pred_input, version=version)
70 |
71 | print("Online Predictions")
72 | print("Features: {}".format(pred_input))
73 | print("Predictions: {}".format(preds))
74 |
75 | if not explanations:
76 | _upload_input_data_to_gcs(model, pred_input)
77 | model.batch_predict(version=version)
78 | print("Batch predictions written to",
79 | model.get_pred_output_path())
80 |
81 |
82 | if __name__ == "__main__":
83 | main()
84 |
--------------------------------------------------------------------------------
/examples/taxi/tf/hptuning_config.yaml:
--------------------------------------------------------------------------------
1 | # python3
2 | # Copyright 2020 Google Inc. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | trainingInput:
16 | hyperparameters:
17 | hyperparameterMetricTag: accuracy
18 | goal: MAXIMIZE
19 | maxTrials: 4
20 | maxParallelTrials: 2
21 | enableTrialEarlyStopping: True
22 | params:
23 | - parameterName: first_layer_size
24 | type: INTEGER
25 | minValue: 50
26 | maxValue: 500
27 | scaleType: UNIT_LINEAR_SCALE
28 | - parameterName: num_layers
29 | type: INTEGER
30 | minValue: 1
31 | maxValue: 15
32 | scaleType: UNIT_LINEAR_SCALE
33 |
--------------------------------------------------------------------------------
/examples/taxi/tf/model/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ml-pipeline-generator-python/b0508160c1752a149bdc5c862560387fd9c65d77/examples/taxi/tf/model/__init__.py
--------------------------------------------------------------------------------
/examples/taxi/tf/model/tf_model.py:
--------------------------------------------------------------------------------
1 | # python3
2 | # Copyright 2020 Google Inc. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Train a simple TF classifier for the taxi dataset."""
16 | from __future__ import absolute_import
17 | from __future__ import division
18 | from __future__ import print_function
19 |
20 | import argparse
21 |
22 | import tensorflow.compat.v1 as tf
23 |
24 | from model.taxi_preprocess import load_data
25 |
26 |
27 | def get_model(inputs, params):
28 | """Trains a classifier on taxi data."""
29 | dense = tf.keras.layers.Dense
30 | nn = dense(params.first_layer_size, activation="relu",
31 | kernel_initializer="uniform")(inputs)
32 | for i in reversed(range(1, params.num_layers)):
33 | layer_size = int(params.first_layer_size * (i / params.num_layers))
34 | nn = dense(max(1, layer_size), activation="relu")(nn)
35 | logits = dense(1, activation="sigmoid")(nn)
36 |
37 | return logits
38 |
39 |
40 | # TODO(humichael): create get_prediction and get_evaluation instead.
41 | def get_loss():
42 | """The loss function to use."""
43 | return tf.losses.sigmoid_cross_entropy
44 |
45 |
46 | def main():
47 | """Trains a model locally to test get_model() and get_loss()."""
48 | train_x, train_y, _, _ = load_data()
49 | input_layer = tf.keras.layers.Input(shape=(train_x.shape[1],))
50 | params = argparse.Namespace(first_layer_size=50, num_layers=5)
51 | predictions = get_model(input_layer, params)
52 | model = tf.keras.models.Model(inputs=input_layer, outputs=predictions)
53 | model.compile(optimizer="adam", loss=get_loss(),
54 | metrics=["accuracy"])
55 | model.fit(train_x, train_y, epochs=1)
56 |
57 |
58 | if __name__ == "__main__":
59 | main()
60 |
--------------------------------------------------------------------------------
/examples/taxi/xgb/config.yaml.example:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | # Config file for ML Pipeline Generator.
16 |
17 | project_id: [PROJECT ID]
18 | bucket_id: [BUCKET ID]
19 | region: "us-central1"
20 | scale_tier: "STANDARD_1"
21 | runtime_version: "1.15"
22 | python_version: "3.7"
23 | package_name: "ml_pipeline_gen"
24 | machine_type_pred: "mls1-c4-m2"
25 |
26 | data:
27 | schema:
28 | - "trip_miles"
29 | - "trip_seconds"
30 | - "fare"
31 | - "trip_start_month"
32 | - "trip_start_hour"
33 | - "trip_start_day"
34 | - "pickup_community_area"
35 | - "dropoff_community_area"
36 | - "pickup_census_tract"
37 | - "dropoff_census_tract"
38 | - "pickup_latitude"
39 | - "pickup_longitude"
40 | - "dropoff_latitude"
41 | - "dropoff_longitude"
42 | - "payment_type"
43 | - "company"
44 | - "tip"
45 | train: "gs://[BUCKET ID]/[MODEL NAME]/data/taxi_trips_train.csv"
46 | evaluation: "gs://[BUCKET ID]/[MODEL NAME]/data/taxi_trips_eval.csv"
47 | prediction:
48 | input_data_paths:
49 | - "gs://[BUCKET ID]/[MODEL NAME]/inputs/*"
50 | input_format: "JSON"
51 | output_format: "JSON"
52 |
53 | model:
54 | # Name must start with a letter and only contain letters, numbers, and
55 | # underscores.
56 | name: [MODEL NAME]
57 | path: "model.xgb_model"
58 | target: "tip"
59 |
60 | model_params:
61 | input_args:
62 | n_estimators:
63 | type: "int"
64 | help: "Number of boosted trees to fit."
65 | default: 10
66 | hyperparam_config: "hptuning_config.yaml"
67 |
--------------------------------------------------------------------------------
/examples/taxi/xgb/demo.py:
--------------------------------------------------------------------------------
1 | # python3
2 | # Copyright 2020 Google Inc. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Demo for XGBoost ML Pipeline Generator.""" 16 | from ml_pipeline_gen.models import XGBoostModel 17 | from model.taxi_preprocess import load_data 18 | 19 | 20 | def _upload_data_to_gcs(model): 21 | load_data(model.data["train"], model.data["evaluation"]) 22 | 23 | 24 | def main(): 25 | config = "config.yaml" 26 | pred_input = [[ 27 | 1.0, -0.56447923, -0.5502175, -1.00234, -0.60791147, 28 | 0.38163432, 0.5846407, 0.6274534, 1.4543412, -0.09238409, 29 | 41.881, -87.633, 41.885, -87.62100000000001, 1, 3 30 | ]] 31 | 32 | model = XGBoostModel(config) 33 | model.generate_files() 34 | _upload_data_to_gcs(model) 35 | 36 | job_id = model.train(tune=True) 37 | version = model.deploy(job_id=job_id) 38 | preds = model.online_predict(pred_input, version=version) 39 | 40 | print("Features: {}".format(pred_input)) 41 | print("Predictions: {}".format(preds)) 42 | 43 | if __name__ == "__main__": 44 | main() 45 | -------------------------------------------------------------------------------- /examples/taxi/xgb/hptuning_config.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | trainingInput: 15 | hyperparameters: 16 | goal: MAXIMIZE 17 | maxTrials: 4 18 | maxParallelTrials: 2 19 | hyperparameterMetricTag: roc_auc 20 | enableTrialEarlyStopping: TRUE 21 | params: 22 | - parameterName: max_depth 23 | type: INTEGER 24 | minValue: 3 25 | maxValue: 8 26 | scaleType: UNIT_LINEAR_SCALE 27 | - parameterName: n_estimators 28 | type: INTEGER 29 | minValue: 1 30 | maxValue: 20 31 | scaleType: UNIT_LINEAR_SCALE 32 | - parameterName: booster 33 | type: CATEGORICAL 34 | categoricalValues: [ 35 | "gbtree", 36 | "gblinear", 37 | "dart" 38 | ] 39 | -------------------------------------------------------------------------------- /examples/taxi/xgb/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ml-pipeline-generator-python/b0508160c1752a149bdc5c862560387fd9c65d77/examples/taxi/xgb/model/__init__.py -------------------------------------------------------------------------------- /examples/taxi/xgb/model/xgb_model.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2019 Google Inc. All Rights Reserved. 
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Train a simple XGBoost classifier."""
16 |
17 | import argparse
18 | import numpy as np
19 |
20 | from sklearn import metrics
21 | from xgboost import XGBClassifier
22 |
23 | from model.taxi_preprocess import load_data
24 |
25 | TARGET_COLUMN = "TARGET"
26 |
27 |
28 | def get_model(args):
29 | """Returns an XGBoost model."""
30 | params = {
31 | "n_estimators": args.n_estimators,
32 | "max_depth": args.max_depth,
33 | "booster": args.booster,
34 | "min_child_weight": args.min_child_weight,
35 | "learning_rate": args.learning_rate,
36 | "gamma": args.gamma,
37 | "subsample": args.subsample,
38 | "colsample_bytree": args.colsample_bytree,
39 | "reg_alpha": args.reg_alpha,
40 | "num_class": args.num_classes
41 | }
42 | xgb_model = XGBClassifier(**params)
43 | return xgb_model
44 |
45 |
46 | def main():
47 | """Trains a model locally to test get_model()."""
48 | train_x, train_y, eval_x, eval_y = load_data()
49 | train_y, eval_y = [np.ravel(x) for x in [train_y, eval_y]]
50 | params = argparse.Namespace(
51 | n_estimators=2,
52 | max_depth=3,
53 | booster="gbtree",
54 | min_child_weight=1,
55 | learning_rate=0.3,
56 | gamma=0,
57 | subsample=1,
58 | colsample_bytree=1,
59 | reg_alpha=0,
60 | num_classes=1)
61 | model = get_model(params)
62 | model.fit(train_x, train_y)
63 | y_pred = model.predict(eval_x)
64 | score = metrics.roc_auc_score(eval_y, y_pred, average="macro")
65 | print("ROC: {}".format(score))
66 |
67 |
68 | if __name__ == "__main__":
69 | main()
70 |
--------------------------------------------------------------------------------
/examples/tf/config.yaml.example:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
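The explain_output block in the config that follows appears to map onto the explanationConfig field of an AI Platform Prediction version; roughly the structure below (a hedged sketch of the assumed mapping, not necessarily the generator's literal request body):

# Assumed shape of the version resource's explanation settings.
explanation_config = {
    "sampledShapleyAttribution": {
        "numPaths": 40,  # explain_param value from the config below
    },
}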
14 | # 15 | # Config file for ML Pipeline Generator. 16 | 17 | project_id: [PROJECT ID] 18 | bucket_id: [BUCKET ID] 19 | region: "us-central1" 20 | scale_tier: "STANDARD_1" 21 | runtime_version: "1.15" 22 | python_version: "3.7" 23 | package_name: "ml_pipeline_gen" 24 | machine_type_pred: "n1-standard-4" 25 | 26 | data: 27 | schema: 28 | - "age" 29 | - "workclass" 30 | - "education_num" 31 | - "marital_status" 32 | - "occupation" 33 | - "relationship" 34 | - "race" 35 | - "capital_gain" 36 | - "capital_loss" 37 | - "hours_per_week" 38 | - "native_country" 39 | - "income_bracket" 40 | train: "gs://[BUCKET ID]/[MODEL NAME]/data/adult.data.csv" 41 | evaluation: "gs://[BUCKET ID]/[MODEL NAME]/data/adult.test.csv" 42 | prediction: 43 | input_data_paths: 44 | - "gs://[BUCKET ID]/[MODEL NAME]/inputs/*" 45 | input_format: "JSON" 46 | output_format: "JSON" 47 | 48 | model: 49 | # Name must start with a letter and only contain letters, numbers, and 50 | # underscores. 51 | name: [MODEL NAME] 52 | path: "model.tf_model" 53 | target: "income_bracket" 54 | metrics: 55 | - "accuracy" 56 | 57 | model_params: 58 | input_args: 59 | first_layer_size: 60 | type: "int" 61 | help: "Size of the NN first layer." 62 | default: 50 63 | num_layers: 64 | type: "int" 65 | help: "Number of layers in the NN." 66 | default: 5 67 | max_steps: 68 | default: 1000 69 | # Relative path. 70 | hyperparam_config: "hptuning_config.yaml" 71 | explain_output: 72 | explain_type: "sampledShapleyAttribution" 73 | explain_param: 74 | name: "numPaths" 75 | value: 40 76 | -------------------------------------------------------------------------------- /examples/tf/demo.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2020 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Demo for TF ML Pipeline Generator.""" 16 | import json 17 | import os 18 | 19 | from ml_pipeline_gen.models import TFModel 20 | from model.census_preprocess import load_data 21 | 22 | 23 | def _upload_data_to_gcs(model): 24 | """Calls the preprocessing fn which uploads train/eval data to GCS.""" 25 | load_data(model.data["train"], model.data["evaluation"]) 26 | 27 | 28 | # TODO(humichael): See if there's a way to support csv batch predicts. 
29 | def _upload_input_data_to_gcs(model, data): 30 | input_path = "tf_input_data.json" 31 | with open(input_path, "w+") as f: 32 | for features in data: 33 | f.write(json.dumps(features) + "\n") 34 | model.upload_pred_input_data(input_path) 35 | os.remove(input_path) 36 | 37 | 38 | def main(): 39 | explanations = True 40 | config = "config.yaml" 41 | pred_input = [{ 42 | "age": 0.02599666, 43 | "workclass": 6, 44 | "education_num": 1.1365801, 45 | "marital_status": 4, 46 | "occupation": 0, 47 | "relationship": 1, 48 | "race": 4, 49 | "capital_gain": 0.14693314, 50 | "capital_loss": -0.21713187, 51 | "hours_per_week": -0.034039237, 52 | "native_country": 38, 53 | "income_bracket": 0, 54 | }] 55 | model = TFModel(config) 56 | model.generate_files() 57 | _upload_data_to_gcs(model) 58 | 59 | job_id = model.train(tune=True) 60 | version = model.deploy(job_id=job_id, explanations=explanations) 61 | if explanations: 62 | explanations = model.online_explanations(pred_input, 63 | version=version) 64 | print("Online Explanations") 65 | print("Explanations: {}".format(explanations)) 66 | preds = model.online_predict(pred_input, version=version) 67 | 68 | print("Online Predictions") 69 | print("Features: {}".format(pred_input)) 70 | print("Predictions: {}".format(preds)) 71 | 72 | if not explanations: 73 | _upload_input_data_to_gcs(model, pred_input) 74 | model.batch_predict(version=version) 75 | print("Batch predictions written to", 76 | model.get_pred_output_path()) 77 | 78 | 79 | if __name__ == "__main__": 80 | main() 81 | -------------------------------------------------------------------------------- /examples/tf/hptuning_config.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | trainingInput: 15 | hyperparameters: 16 | hyperparameterMetricTag: accuracy 17 | goal: MAXIMIZE 18 | maxTrials: 4 19 | maxParallelTrials: 2 20 | enableTrialEarlyStopping: True 21 | params: 22 | - parameterName: first_layer_size 23 | type: INTEGER 24 | minValue: 50 25 | maxValue: 500 26 | scaleType: UNIT_LINEAR_SCALE 27 | - parameterName: num_layers 28 | type: INTEGER 29 | minValue: 1 30 | maxValue: 15 31 | scaleType: UNIT_LINEAR_SCALE 32 | -------------------------------------------------------------------------------- /examples/tf/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ml-pipeline-generator-python/b0508160c1752a149bdc5c862560387fd9c65d77/examples/tf/model/__init__.py -------------------------------------------------------------------------------- /examples/tf/model/census_preprocess.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2019 Google Inc. All Rights Reserved. 
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Preprocesses the census dataset.
16 |
17 | This example comes from the cloudml-samples keras demo.
18 | github.com/GoogleCloudPlatform/cloudml-samples/blob/master/census/tf-keras
19 | """
20 | from __future__ import absolute_import
21 | from __future__ import division
22 | from __future__ import print_function
23 |
24 | import os
25 | from six.moves import urllib
26 | import tempfile
27 |
28 | import numpy as np
29 | import pandas as pd
30 | import tensorflow.compat.v1 as tf
31 |
32 |
33 | DATA_DIR = os.path.join(tempfile.gettempdir(), "census_data")
34 | DATA_URL = ("https://storage.googleapis.com/cloud-samples-data/ai-platform"
35 | + "/census/data/")
36 | TRAINING_FILE = "adult.data.csv"
37 | EVAL_FILE = "adult.test.csv"
38 | TRAINING_URL = os.path.join(DATA_URL, TRAINING_FILE)
39 | EVAL_URL = os.path.join(DATA_URL, EVAL_FILE)
40 |
41 | _CSV_COLUMNS = [
42 | "age", "workclass", "fnlwgt", "education", "education_num",
43 | "marital_status", "occupation", "relationship", "race", "gender",
44 | "capital_gain", "capital_loss", "hours_per_week", "native_country",
45 | "income_bracket",
46 | ]
47 | _LABEL_COLUMN = "income_bracket"
48 | UNUSED_COLUMNS = ["fnlwgt", "education", "gender"]
49 |
50 | _CATEGORICAL_TYPES = {
51 | "workclass": pd.api.types.CategoricalDtype(categories=[
52 | "Federal-gov", "Local-gov", "Never-worked", "Private", "Self-emp-inc",
53 | "Self-emp-not-inc", "State-gov", "Without-pay"
54 | ]),
55 | "marital_status": pd.api.types.CategoricalDtype(categories=[
56 | "Divorced", "Married-AF-spouse", "Married-civ-spouse",
57 | "Married-spouse-absent", "Never-married", "Separated", "Widowed"
58 | ]),
59 | "occupation": pd.api.types.CategoricalDtype([
60 | "Adm-clerical", "Armed-Forces", "Craft-repair", "Exec-managerial",
61 | "Farming-fishing", "Handlers-cleaners", "Machine-op-inspct",
62 | "Other-service", "Priv-house-serv", "Prof-specialty", "Protective-serv",
63 | "Sales", "Tech-support", "Transport-moving"
64 | ]),
65 | "relationship": pd.api.types.CategoricalDtype(categories=[
66 | "Husband", "Not-in-family", "Other-relative", "Own-child", "Unmarried",
67 | "Wife"
68 | ]),
69 | "race": pd.api.types.CategoricalDtype(categories=[
70 | "Amer-Indian-Eskimo", "Asian-Pac-Islander", "Black", "Other", "White"
71 | ]),
72 | "native_country": pd.api.types.CategoricalDtype(categories=[
73 | "Cambodia", "Canada", "China", "Columbia", "Cuba", "Dominican-Republic",
74 | "Ecuador", "El-Salvador", "England", "France", "Germany", "Greece",
75 | "Guatemala", "Haiti", "Holand-Netherlands", "Honduras", "Hong",
76 | "Hungary", "India", "Iran", "Ireland", "Italy", "Jamaica", "Japan",
77 | "Laos", "Mexico", "Nicaragua", "Outlying-US(Guam-USVI-etc)", "Peru",
78 | "Philippines", "Poland", "Portugal", "Puerto-Rico", "Scotland", "South",
79 | "Taiwan", "Thailand", "Trinadad&Tobago", "United-States", "Vietnam",
80 | "Yugoslavia"
81 | ]),
82 | "income_bracket": pd.api.types.CategoricalDtype(categories=[
83 | "<=50K", ">50K"
84 | ])
85 | }
86 |
87 |
88 | def _download_and_clean_file(filename, url):
89 | """Downloads data from url, and makes changes to match the CSV format.
90 |
91 | The CSVs may use spaces after the comma delimiters (non-standard) or include
92 | rows which do not represent well-formed examples. This function strips out
93 | some of these problems.
94 |
95 | Args:
96 | filename: local path to save the cleaned data to
97 | url: URL of resource to download
98 | """
99 | temp_file, _ = urllib.request.urlretrieve(url)
100 | with tf.io.gfile.GFile(temp_file, "r") as temp_file_object:
101 | with tf.io.gfile.GFile(filename, "w") as file_object:
102 | for line in temp_file_object:
103 | line = line.strip()
104 | line = line.replace(", ", ",")
105 | if not line or "," not in line:
106 | continue
107 | if line[-1] == ".":
108 | line = line[:-1]
109 | line += "\n"
110 | file_object.write(line)
111 | tf.io.gfile.remove(temp_file)
112 |
113 |
114 | def download(data_dir):
115 | """Downloads census data if it is not already present.
116 |
117 | Args:
118 | data_dir: directory where we will access/save the census data
119 |
120 | Returns:
121 | A tuple (training_file_path, eval_file_path) of paths to the CSVs.
122 | """
123 | tf.io.gfile.makedirs(data_dir)
124 |
125 | training_file_path = os.path.join(data_dir, TRAINING_FILE)
126 | if not tf.io.gfile.exists(training_file_path):
127 | _download_and_clean_file(training_file_path, TRAINING_URL)
128 |
129 | eval_file_path = os.path.join(data_dir, EVAL_FILE)
130 | if not tf.io.gfile.exists(eval_file_path):
131 | _download_and_clean_file(eval_file_path, EVAL_URL)
132 |
133 | return training_file_path, eval_file_path
134 |
135 |
136 | def upload(train_df, eval_df, train_path, eval_path):
137 | train_df.to_csv(os.path.join(os.path.dirname(train_path), TRAINING_FILE),
138 | index=False, header=False)
139 | eval_df.to_csv(os.path.join(os.path.dirname(eval_path), EVAL_FILE),
140 | index=False, header=False)
141 |
142 |
143 | def preprocess(dataframe):
144 | """Converts categorical features to numeric. Removes unused columns.
145 |
146 | Args:
147 | dataframe: Pandas dataframe with raw data
148 |
149 | Returns:
150 | Dataframe with preprocessed data
151 | """
152 | dataframe = dataframe.drop(columns=UNUSED_COLUMNS)
153 |
154 | # Convert integer valued (numeric) columns to floating point
155 | numeric_columns = dataframe.select_dtypes(["int64"]).columns
156 | dataframe[numeric_columns] = dataframe[numeric_columns].astype("float32")
157 |
158 | # Convert categorical columns to numeric
159 | cat_columns = dataframe.select_dtypes(["object"]).columns
160 | dataframe[cat_columns] = dataframe[cat_columns].apply(
161 | lambda x: x.astype(_CATEGORICAL_TYPES[x.name]))
162 | dataframe[cat_columns] = dataframe[cat_columns].apply(
163 | lambda x: x.cat.codes)
164 | return dataframe
165 |
166 |
167 | def standardize(dataframe):
168 | """Scales numerical columns using their means and standard deviations.
169 |
170 | Args:
171 | dataframe: Pandas dataframe
172 |
173 | Returns:
174 | Input dataframe with the numerical columns scaled to z-scores
175 | """
176 | dtypes = list(zip(dataframe.dtypes.index, map(str, dataframe.dtypes)))
177 | for column, dtype in dtypes:
178 | if dtype == "float32":
179 | dataframe[column] -= dataframe[column].mean()
180 | dataframe[column] /= dataframe[column].std()
181 | return dataframe
182 |
183 |
184 | def load_data(train_path="", eval_path=""):
185 | """Loads data into preprocessed (train_x, train_y, eval_x, eval_y) dataframes.
187 | 188 | Args: 189 | train_path: Local or GCS path to upload train data to. 190 | eval_path: Local or GCS path to upload eval data to. 191 | 192 | Returns: 193 | A tuple (train_x, train_y, eval_x, eval_y), where train_x and eval_x are 194 | Pandas dataframes with features for training and train_y and eval_y are 195 | numpy arrays with the corresponding labels. 196 | """ 197 | # Download Census dataset: Training and eval csv files. 198 | training_file_path, eval_file_path = download(DATA_DIR) 199 | 200 | train_df = pd.read_csv( 201 | training_file_path, names=_CSV_COLUMNS, na_values="?") 202 | eval_df = pd.read_csv(eval_file_path, names=_CSV_COLUMNS, na_values="?") 203 | 204 | train_df = preprocess(train_df) 205 | eval_df = preprocess(eval_df) 206 | 207 | # Split train and eval data with labels. The pop method copies and removes 208 | # the label column from the dataframe. 209 | train_x, train_y = train_df, train_df.pop(_LABEL_COLUMN) 210 | eval_x, eval_y = eval_df, eval_df.pop(_LABEL_COLUMN) 211 | 212 | # Join train_x and eval_x to normalize on overall means and standard 213 | # deviations. Then separate them again. 214 | all_x = pd.concat([train_x, eval_x], keys=["train", "eval"]) 215 | all_x = standardize(all_x) 216 | train_x, eval_x = all_x.xs("train"), all_x.xs("eval") 217 | 218 | # Rejoin features and labels and upload to GCS. 219 | if train_path and eval_path: 220 | train_df = train_x.copy() 221 | train_df[_LABEL_COLUMN] = train_y 222 | eval_df = eval_x.copy() 223 | eval_df[_LABEL_COLUMN] = eval_y 224 | upload(train_df, eval_df, train_path, eval_path) 225 | 226 | # Reshape label columns for use with tf.data.Dataset 227 | train_y = np.asarray(train_y).astype("float32").reshape((-1, 1)) 228 | eval_y = np.asarray(eval_y).astype("float32").reshape((-1, 1)) 229 | 230 | return train_x, train_y, eval_x, eval_y 231 | 232 | -------------------------------------------------------------------------------- /examples/tf/model/tf_model.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2019 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
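# A minimal usage sketch (an assumed interface summary, not part of the
# original sample) of model.census_preprocess.load_data, which this module
# imports below:
#
#   train_x, train_y, eval_x, eval_y = load_data()
#   # train_x / eval_x: pandas DataFrames of float32 census features.
#   # train_y / eval_y: numpy float32 arrays of shape (n, 1) holding 0/1
#   # income_bracket labels, ready to pass to model.fit().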
15 | """Train a simple TF classifier for census dataset.""" 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | 20 | import argparse 21 | 22 | import tensorflow.compat.v1 as tf 23 | 24 | from model.census_preprocess import load_data 25 | 26 | 27 | def get_model(inputs, params): 28 | """Trains a classifier on iris data.""" 29 | dense = tf.keras.layers.Dense 30 | nn = dense(params.first_layer_size, activation="relu", 31 | kernel_initializer="uniform")(inputs) 32 | for i in reversed(range(1, params.num_layers)): 33 | layer_size = int(params.first_layer_size * (i / params.num_layers)) 34 | nn = dense(max(1, layer_size), activation="relu")(nn) 35 | logits = dense(1, activation="sigmoid")(nn) 36 | 37 | return logits 38 | 39 | 40 | # TODO(humichael): create get_predicition and get_evaluation instead. 41 | def get_loss(): 42 | """The loss function to use.""" 43 | return tf.losses.sigmoid_cross_entropy 44 | 45 | 46 | def main(): 47 | """Trains a model locally to test get_model() and get_loss().""" 48 | train_x, train_y, _, _ = load_data() 49 | input_layer = tf.keras.layers.Input(shape=(train_x.shape[1],)) 50 | params = argparse.Namespace(first_layer_size=50, num_layers=5) 51 | predictions = get_model(input_layer, params) 52 | model = tf.keras.models.Model(inputs=input_layer, outputs=predictions) 53 | model.compile(optimizer="adam", loss=get_loss(), 54 | metrics=["accuracy"]) 55 | model.fit(train_x, train_y, epochs=1) 56 | 57 | 58 | if __name__ == "__main__": 59 | main() 60 | -------------------------------------------------------------------------------- /examples/xgboost/config.yaml.example: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Config file for ML Pipeline Generator. 16 | 17 | project_id: [PROJECT ID] 18 | bucket_id: [BUCKET ID] 19 | region: "us-central1" 20 | scale_tier: "STANDARD_1" 21 | runtime_version: "1.15" 22 | python_version: "3.7" 23 | package_name: "ml_pipeline_gen" 24 | machine_type_pred: "mls1-c4-m2" 25 | 26 | data: 27 | schema: 28 | - "age" 29 | - "workclass" 30 | - "education_num" 31 | - "marital_status" 32 | - "occupation" 33 | - "relationship" 34 | - "race" 35 | - "capital_gain" 36 | - "capital_loss" 37 | - "hours_per_week" 38 | - "native_country" 39 | - "income_bracket" 40 | train: "gs://[BUCKET ID]/[MODEL NAME]/data/adult.data.csv" 41 | evaluation: "gs://[BUCKET ID]/[MODEL NAME]/data/adult.test.csv" 42 | prediction: 43 | input_data_paths: 44 | - "gs://[BUCKET ID]/[MODEL NAME]/inputs/*" 45 | input_format: "JSON" 46 | output_format: "JSON" 47 | 48 | model: 49 | # Name must start with a letter and only contain letters, numbers, and 50 | # underscores. 
51 | name: [MODEL NAME] 52 | path: "model.xgboost_model" 53 | target: "income_bracket" 54 | 55 | model_params: 56 | input_args: 57 | n_estimators: 58 | type: "int" 59 | help: "Number of boosted trees in the ensemble." 60 | default: 10 61 | # Relative path. 62 | hyperparam_config: "hptuning_config.yaml" 63 | -------------------------------------------------------------------------------- /examples/xgboost/demo.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2020 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Demo for XGBoost ML Pipeline Generator.""" 16 | from ml_pipeline_gen.models import XGBoostModel 17 | from model.census_preprocess import load_data 18 | 19 | 20 | def _upload_data_to_gcs(model): 21 | load_data(model.data["train"], model.data["evaluation"]) 22 | 23 | 24 | def main(): 25 | config = "config.yaml" 26 | pred_input = [[ 27 | 7.65000000e+02, 2.81400000e+04, 0.00000000e+00, 1.00000000e+00, 28 | 8.30000000e+01, 3.26000000e+05, 8.30000000e+01, 4.87500000e+00, 29 | 3.60000000e+02, 1.00000000e+00, 3.09730330e+05, 3.25000000e+05, 30 | 1.52696700e+04, 4.67629611e+03, 0.00000000e+00, 3.17866362e+05, 31 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 32 | 0.00000000e+00, 0.00000000e+00, 4.87500000e+00, 4.87500000e+00, 33 | 0.00000000e+00, 4.87500000e+00, 0.00000000e+00, 4.87500000e+00, 34 | 0.00000000e+00, 5.95836265e-06, 0.00000000e+00, 0.00000000e+00, 35 | 0.00000000e+00, 2.63157895e-02, 9.99000000e+02, 9.99000000e+02, 36 | 9.99000000e+02, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 37 | 1.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 38 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 39 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 40 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 41 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 42 | 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 43 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 44 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 45 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 46 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 47 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 48 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 49 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 50 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 51 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 52 | 0.00000000e+00, 1.00000000e+00, 1.00000000e+00, 0.00000000e+00, 53 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 54 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 55 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 56 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 57 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 58 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 59 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 60 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 61 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 62 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 63 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00 64 | ]] 65 | 66 | model = XGBoostModel(config) 67 | model.generate_files() 68 | _upload_data_to_gcs(model) 69 | 70 | job_id = model.train() 71 | version = model.deploy(job_id=job_id) 72 | preds = model.online_predict(pred_input, version=version) 73 | 74 | print("Features: {}".format(pred_input)) 75 | print("Predictions: {}".format(preds)) 76 | 77 | if __name__ == "__main__": 78 | main() 79 | -------------------------------------------------------------------------------- /examples/xgboost/hptuning_config.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | trainingInput: 15 | hyperparameters: 16 | goal: MAXIMIZE 17 | maxTrials: 4 18 | maxParallelTrials: 2 19 | hyperparameterMetricTag: roc_auc 20 | enableTrialEarlyStopping: TRUE 21 | params: 22 | - parameterName: max_depth 23 | type: INTEGER 24 | minValue: 3 25 | maxValue: 8 26 | scaleType: UNIT_LINEAR_SCALE 27 | - parameterName: n_estimators 28 | type: INTEGER 29 | minValue: 1 30 | maxValue: 20 31 | scaleType: UNIT_LINEAR_SCALE 32 | - parameterName: booster 33 | type: CATEGORICAL 34 | categoricalValues: [ 35 | "gbtree", 36 | "gblinear", 37 | "dart" 38 | ] 39 | -------------------------------------------------------------------------------- /examples/xgboost/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ml-pipeline-generator-python/b0508160c1752a149bdc5c862560387fd9c65d77/examples/xgboost/model/__init__.py -------------------------------------------------------------------------------- /examples/xgboost/model/xgboost_model.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2019 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | """Train a simple XGBoost classifier.""" 16 | 17 | import argparse 18 | import numpy as np 19 | from xgboost import XGBClassifier 20 | 21 | from model.census_preprocess import load_data 22 | 23 | TARGET_COLUMN = 'TARGET' 24 | 25 | 26 | def get_model(args): 27 | """Returns an XGBoost model.""" 28 | params = { 29 | 'n_estimators': args.n_estimators, 30 | 'max_depth': args.max_depth, 31 | 'booster': args.booster, 32 | 'min_child_weight': args.min_child_weight, 33 | 'learning_rate': args.learning_rate, 34 | 'gamma': args.gamma, 35 | 'subsample': args.subsample, 36 | 'colsample_bytree': args.colsample_bytree, 37 | 'reg_alpha': args.reg_alpha, 38 | 'num_class': args.num_classes 39 | } 40 | xgb_model = XGBClassifier(**params) 41 | return xgb_model 42 | 43 | 44 | def main(): 45 | """Trains a model locally to test get_model().""" 46 | train_x, train_y, eval_x, eval_y = load_data() 47 | train_y, eval_y = [np.ravel(x) for x in [train_y, eval_y]] 48 | # Illustrative defaults covering every argument get_model() reads. 49 | params = argparse.Namespace( 50 | n_estimators=10, max_depth=6, booster='gbtree', 51 | min_child_weight=1, learning_rate=0.3, gamma=0, 52 | subsample=1.0, colsample_bytree=1.0, reg_alpha=0, 53 | num_classes=2) 54 | model = get_model(params) 55 | model.fit(train_x, train_y) 56 | score = model.score(eval_x, eval_y) 57 | print(score) 58 | 59 | 60 | if __name__ == '__main__': 61 | main() 62 | -------------------------------------------------------------------------------- /ml_pipeline_gen/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = "Michael Hu, Stefan Hosein" 2 | __license__ = "Apache 2.0" 3 | __copyright__ = """ 4 | Copyright 2020 Google Inc. All Rights Reserved. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | """ 18 | __version__ = "0.0.5" 19 | -------------------------------------------------------------------------------- /ml_pipeline_gen/experimental/component_lib.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2020 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Method for generating component files from respective templates.""" 16 | from os import path 17 | import pathlib 18 | 19 | import jinja2 as jinja 20 | from ml_pipeline_gen.parsers import parse_yaml 21 | 22 | 23 | def generate_component(config, name, template_spec='./component_spec.yaml'): 24 | """Generate the component files from the templates.""" 25 | template_spec_path = path.join(path.dirname(__file__), template_spec) 26 | output_spec = parse_yaml(template_spec_path) 27 | current_spec = output_spec[name] 28 | 29 | loader = jinja.PackageLoader('ml_pipeline_gen', current_spec['template_dir']) 30 | env = jinja.Environment(loader=loader, trim_blocks=True, 31 | lstrip_blocks=True) 32 | template_file_list = current_spec['files'] 33 | for template in template_file_list: 34 | template_in = env.get_template(template['input']) 35 | template_out = template_in.render(config=config) 36 | output_file = path.join(config.output_package, template['output']) 37 | pathlib.Path(output_file).parent.mkdir(parents=True, exist_ok=True) 38 | with open(output_file, 'w') as f: 39 | f.write(template_out) 40 | -------------------------------------------------------------------------------- /ml_pipeline_gen/experimental/component_spec.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Reference for template generated pipeline steps 16 | hptune: 17 | template_dir: "./templates/hptune" 18 | files: 19 | - input: 'component.yaml' 20 | output: 'hptune/component.yaml' 21 | - input: 'Dockerfile' 22 | output: 'hptune/Dockerfile' 23 | - input: 'build.sh' 24 | output: 'hptune/build.sh' 25 | - input: 'hptune.sh' 26 | output: 'hptune/hptune.sh' 27 | 28 | 29 | get_tuned_params: 30 | template_dir: "./templates/get_tuned_params" 31 | files: 32 | - input: 'component.yaml' 33 | output: 'get_tuned_params/component.yaml' 34 | - input: 'Dockerfile' 35 | output: 'get_tuned_params/Dockerfile' 36 | - input: 'build.sh' 37 | output: 'get_tuned_params/build.sh' 38 | - input: 'get_tuned_params.py' 39 | output: 'get_tuned_params/get_tuned_params.py' 40 | 41 | -------------------------------------------------------------------------------- /ml_pipeline_gen/parsers.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2020 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Functions for parsing data sources.""" 16 | import types 17 | import yaml 18 | 19 | 20 | # TODO(humichael): Replace with gfile to support GCS. 21 | def parse_yaml(path): 22 | """Parses the given config file.""" 23 | with open(path, "r") as f: 24 | doc = f.read() 25 | return yaml.load(doc, Loader=yaml.FullLoader) 26 | 27 | 28 | class NestedNamespace(types.SimpleNamespace): 29 | """Parses a nested dictionary into a nested namespace object.""" 30 | 31 | def __init__(self, dictionary, **kwargs): 32 | super(NestedNamespace, self).__init__(**kwargs) 33 | for key, value in dictionary.items(): 34 | if isinstance(value, dict): 35 | self.__setattr__(key, NestedNamespace(value)) 36 | elif isinstance(value, list): 37 | self.__setattr__(key, 38 | [NestedNamespace(i) 39 | if isinstance(i, dict) 40 | else i for i in value]) 41 | else: 42 | self.__setattr__(key, value) 43 | 44 | -------------------------------------------------------------------------------- /ml_pipeline_gen/static/bin/cleanup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Copyright 2020 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | rm trainer/task.py 2> /dev/null 17 | rm trainer/model.py 2> /dev/null 18 | rm trainer/inputs.py 2> /dev/null 19 | rm orchestration/pipeline.py 2> /dev/null 20 | rm *.tar.gz 2> /dev/null 21 | rm -rf dist/ 2> /dev/null 22 | rm -rf *.egg-info/ 2> /dev/null 23 | rm -rf models/ 2> /dev/null 24 | -------------------------------------------------------------------------------- /ml_pipeline_gen/static/bin/run.local_train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright 2019 Google Inc. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | # Convenience script for training model locally.
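#
# Example invocation (a sketch; anything after the bare "--" separator below
# is forwarded to trainer.task rather than parsed by gcloud):
#   bin/run.local_train.sh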
18 | PACKAGE_PATH=trainer 19 | MODULE_NAME=trainer.task 20 | 21 | gcloud ai-platform local train \ 22 | --package-path "${PACKAGE_PATH}" \ 23 | --module-name "${MODULE_NAME}" \ 24 | -- 25 | -------------------------------------------------------------------------------- /ml_pipeline_gen/static/orchestration/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ml-pipeline-generator-python/b0508160c1752a149bdc5c862560387fd9c65d77/ml_pipeline_gen/static/orchestration/__init__.py -------------------------------------------------------------------------------- /ml_pipeline_gen/static/orchestration/components/list_blobs.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Modified version of https://github.com/kubeflow/pipelines/blob/master/components/google-cloud/storage/list/component.yaml 16 | 17 | name: List blobs 18 | inputs: 19 | - {name: GCS path, type: String, description: 'GCS path for listing. For recursive listing use the "gs://bucket/path/**" syntax.'} 20 | outputs: 21 | - {name: Paths} 22 | implementation: 23 | container: 24 | image: google/cloud-sdk 25 | command: 26 | - sh 27 | - -ex 28 | - -c 29 | - | 30 | if [ -n "${GOOGLE_APPLICATION_CREDENTIALS}" ]; then 31 | gcloud auth activate-service-account --key-file="${GOOGLE_APPLICATION_CREDENTIALS}" 32 | fi 33 | mkdir -p "$(dirname "$1")" 34 | gsutil ls "$0" | tail -n1 > "$1" 35 | - inputValue: GCS path 36 | - outputPath: Paths 37 | -------------------------------------------------------------------------------- /ml_pipeline_gen/static/trainer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ml-pipeline-generator-python/b0508160c1752a149bdc5c862560387fd9c65d77/ml_pipeline_gen/static/trainer/__init__.py -------------------------------------------------------------------------------- /ml_pipeline_gen/static/trainer/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
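# A minimal usage sketch of dump_object below (assumed paths, for
# illustration only; "my-bucket" is a placeholder, not used elsewhere):
#
#   dump_object(sklearn_model, "gs://my-bucket/model/model.joblib")
#   dump_object(keras_model, "gs://my-bucket/model/export", model_type="tf")
#
# Non-TF objects are pickled with joblib through tf.io.gfile, so the same
# call works for local paths and GCS paths alike.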
14 | """Utility functions.""" 15 | import os 16 | 17 | import tensorflow as tf 18 | 19 | from sklearn.externals import joblib 20 | from google.cloud import storage 21 | 22 | 23 | 24 | def dump_object(obj, output_path, model_type=""): 25 | """Pickle the given object and write to output_path. 26 | 27 | Args: 28 | obj: object to pickle. 29 | output_path: a local or GCS path. 30 | model_type: whether we are saving a TF model or sklearn/xgboost 31 | """ 32 | if not tf.io.gfile.exists(output_path): 33 | tf.io.gfile.makedirs(os.path.dirname(output_path)) 34 | if model_type == "tf": 35 | tf.saved_model.save(obj, output_path) 36 | else: 37 | with tf.io.gfile.GFile(output_path, "w+") as f: 38 | joblib.dump(obj, f) 39 | 40 | 41 | def upload_blob(bucket_name, source_file_name, destination_blob_name): 42 | """Uploads a file to the bucket.""" 43 | # bucket_name = "your-bucket-name" 44 | # source_file_name = "local/path/to/file" 45 | # destination_blob_name = "storage-object-name" 46 | 47 | storage_client = storage.Client() 48 | bucket = storage_client.bucket(bucket_name) 49 | blob = bucket.blob(destination_blob_name) 50 | 51 | blob.upload_from_filename(source_file_name) 52 | 53 | print( 54 | "File {} uploaded to {}.".format( 55 | source_file_name, destination_blob_name 56 | ) 57 | ) -------------------------------------------------------------------------------- /ml_pipeline_gen/templates/experimental/example_pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import kfp\n", 10 | "import kfp.components as comp\n", 11 | "import kfp.dsl as dsl\n", 12 | "from kfp.gcp import use_gcp_secret\n", 13 | "from kfp.components import ComponentStore\n", 14 | "from os import path\n", 15 | "import json" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "cs = ComponentStore(local_search_paths=['.', '{{config.output_package}}'],\n", 25 | " url_search_prefixes=['{{config.github_component_url}}'])" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "pre_process_op = cs.load_component('{{config.preprocess.component}}')\n", 35 | "hpt_op = cs.load_component('hptune')\n", 36 | "param_comp = cs.load_component('get_tuned_params')\n", 37 | "train_op = cs.load_component('{{config.train.component}}')\n", 38 | "deploy_op = cs.load_component('{{config.deploy.component}}')\n" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "@dsl.pipeline(\n", 48 | " name='KFP-Pipelines Example',\n", 49 | " description='Kubeflow pipeline generated from ai-pipeline asset'\n", 50 | ")\n", 51 | "def pipeline_sample(\n", 52 | " project_id='{{config.project_id}}',\n", 53 | " region = '{{config.region}}',\n", 54 | " python_module = '{{config.train.python_module}}',\n", 55 | " package_uri = '{{config.train.python_package}}',\n", 56 | " dataset_bucket = '{{config.bucket_id}}',\n", 57 | " staging_bucket = 'gs://{{config.bucket_id}}',\n", 58 | " job_dir_hptune = 'gs://{{config.bucket_id}}/hptune',\n", 59 | " job_dir_train = 'gs://{{config.bucket_id}}/train',\n", 60 | " runtime_version_train = '{{config.runtime_version}}',\n", 61 | " runtime_version_deploy = '{{config.runtime_version}}',\n", 62 | " 
hptune_config='{{config.hptune.config}}',\n", 63 | " model_id='{{config.deploy.model_id}}',\n", 64 | " version_id='{{config.deploy.version_id}}',\n", 65 | " common_args_hpt=json.dumps([\n", 66 | " {% for arg in config.hptune.args %}", 67 | " {% set name = arg.name %}", 68 | " {% set value = arg.default %}", 69 | " '--{{name}}', '{{value}}' ,\n", 70 | " {% endfor %}", 71 | " ]),\n", 72 | " common_args_train=json.dumps([\n", 73 | " {% for arg in config.train.args %}", 74 | " {% set name = arg.name %}", 75 | " {% set value = arg.default%}", 76 | " '--{{name}}', '{{value}}' ,\n", 77 | " {% endfor %}", 78 | " ]),\n", 79 | " replace_existing_version=True\n", 80 | "):\n", 81 | "\n", 82 | " #Preprocess Task\n", 83 | " pre_process_task = pre_process_op(\n", 84 | " {% for arg in config.preprocess.component_args %}\n", 85 | " {% set name = arg.name %}\n", 86 | " {{name}}={{name}},\n", 87 | " {% endfor %}\n", 88 | " )\n", 89 | "\n", 90 | " # HP tune Task\n", 91 | " hpt_task = hpt_op (\n", 92 | " region = region,\n", 93 | " python_module = python_module,\n", 94 | " package_uri = package_uri,\n", 95 | " staging_bucket = staging_bucket,\n", 96 | " job_dir = job_dir_hptune,\n", 97 | " config=hptune_config,\n", 98 | " runtime_version = runtime_version_train,\n", 99 | " args = common_args_hpt ,\n", 100 | " )\n", 101 | " hpt_task.after(pre_process_task)\n", 102 | "\n", 103 | " # Get the best hyperparameters\n", 104 | " param_task = param_comp (\n", 105 | " project_id=project_id,\n", 106 | " hptune_job_id=hpt_task.outputs['job_id'].to_struct(),\n", 107 | " common_args=common_args_train,\n", 108 | " )\n", 109 | "\n", 110 | " # Train Task\n", 111 | " train_task = train_op (\n", 112 | " project_id = project_id,\n", 113 | " python_module = python_module,\n", 114 | " package_uris = json.dumps([package_uri.to_struct()]),\n", 115 | " region = region,\n", 116 | " args = str(param_task.outputs['tuned_parameters_out']) ,\n", 117 | " job_dir = job_dir_train,\n", 118 | " python_version = '',\n", 119 | " runtime_version = runtime_version_train,\n", 120 | " master_image_uri = '',\n", 121 | " worker_image_uri = '',\n", 122 | " training_input = '',\n", 123 | " job_id_prefix = '',\n", 124 | " wait_interval = '30'\n", 125 | " )\n", 126 | "\n", 127 | " #model_uri=train_task.outputs['job_dir'],\n", 128 | " #model_uri='gs://poc-bucket-0120/train/out/export/exporter',\n", 129 | " deploy_model = deploy_op(\n", 130 | " model_uri=train_task.outputs['job_dir'].to_struct()+'{{config.train.model_out_prefix}}',\n", 131 | " project_id=project_id,\n", 132 | " model_id=model_id,\n", 133 | " version_id=version_id,\n", 134 | " runtime_version=runtime_version_deploy,\n", 135 | " replace_existing_version=replace_existing_version\n", 136 | " )\n", 137 | " kfp.dsl.get_pipeline_conf().add_op_transformer(use_gcp_secret('user-gcp-sa'))\n", 138 | "\n", 139 | "\n" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "client = kfp.Client(host='{{config.kfp_deployment_url}}')\n", 149 | "\n", 150 | "client.create_run_from_pipeline_func(pipeline_sample, arguments={})" 151 | ] 152 | } 153 | ], 154 | "metadata": { 155 | "kernelspec": { 156 | "display_name": "Python 3", 157 | "language": "python", 158 | "name": "python3" 159 | }, 160 | "language_info": { 161 | "codemirror_mode": { 162 | "name": "ipython", 163 | "version": 3 164 | }, 165 | "file_extension": ".py", 166 | "mimetype": "text/x-python", 167 | "name": "python", 168 | "nbconvert_exporter": "python", 169 
| "pygments_lexer": "ipython3", 170 | "version": "3.6.10" 171 | } 172 | }, 173 | "nbformat": 4, 174 | "nbformat_minor": 4 175 | } 176 | -------------------------------------------------------------------------------- /ml_pipeline_gen/templates/experimental/get_tuned_params/Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright 2018 The Kubeflow Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | FROM python:3.6 16 | RUN pip install -U google-api-python-client==1.7.11 17 | RUN pip install -U oauth2client==4.1.3 18 | COPY . / 19 | ENTRYPOINT ["python", "get_tuned_params.py" ] 20 | 21 | -------------------------------------------------------------------------------- /ml_pipeline_gen/templates/experimental/get_tuned_params/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | # Copyright 2018 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | if [ -z "$1" ]; then 18 | PROJECT_ID=$(gcloud config config-helper --format "value(configuration.properties.core.project)") 19 | else 20 | PROJECT_ID=$1 21 | fi 22 | 23 | if [ -z "$2" ]; then 24 | TAG_NAME="latest" 25 | else 26 | TAG_NAME="$2" 27 | fi 28 | 29 | CONTAINER_NAME=ml-pipeline-get-tuned-params 30 | 31 | docker build -t ${CONTAINER_NAME} . 32 | docker tag ${CONTAINER_NAME} gcr.io/${PROJECT_ID}/${CONTAINER_NAME}:${TAG_NAME} 33 | docker push gcr.io/${PROJECT_ID}/${CONTAINER_NAME}:${TAG_NAME} 34 | -------------------------------------------------------------------------------- /ml_pipeline_gen/templates/experimental/get_tuned_params/component.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | name: Get Best Hparam 16 | description: | 17 | A Kubeflow Pipeline component to extract best hyperparameters from a given 18 | hyperparameter job ID for a given project. 19 | inputs: 20 | - name: project_id 21 | description: 'Required. The ID of the parent project of the job.' 22 | type: String 23 | - name: hptune_job_id 24 | description: 'hyperparameter tuning job ID' 25 | type: String 26 | - name: common_args 27 | description: 'Common (non-tunable) args' 28 | type: String 29 | outputs: 30 | - name: tuned_parameters_out 31 | description: 'Tuned parameters from the given job.' 32 | type: String 33 | implementation: 34 | container: 35 | image: gcr.io/gcp-demo-2-262319/ml-pipeline-get-tuned-params:latest 36 | args: [ 37 | --project_id, {inputValue: project_id}, 38 | --hptune_job_id, {inputValue: hptune_job_id}, 39 | --common_args, {inputValue: common_args}, 40 | --tuned_parameters_out, {outputPath: tuned_parameters_out} 41 | ] 42 | -------------------------------------------------------------------------------- /ml_pipeline_gen/templates/experimental/get_tuned_params/get_tuned_params.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2020 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Script to extract hyperparameters from the job ID.""" 16 | import argparse 17 | 18 | from pathlib import Path 19 | 20 | from googleapiclient import discovery 21 | from googleapiclient import errors 22 | from types import SimpleNamespace 23 | import ast 24 | 25 | 26 | # Modified from: https://stackoverflow.com/a/54332748 27 | class NestedNamespace(SimpleNamespace): 28 | """Parses a nested dictionary into a nested namespace object.""" 29 | 30 | def __init__(self, dictionary, **kwargs): 31 | super(NestedNamespace, self).__init__(**kwargs) 32 | for key, value in dictionary.items(): 33 | if isinstance(value, dict): 34 | self.__setattr__(key, NestedNamespace(value)) 35 | elif isinstance(value, list): 36 | self.__setattr__(key, 37 | [NestedNamespace(i) 38 | if isinstance(i, dict) 39 | else i for i in value]) 40 | else: 41 | self.__setattr__(key, value) 42 | 43 | 44 | def print_best_parameters(project_id, 45 | hp_tune_job, 46 | filename='tuned_params', 47 | common_args='[]'): 48 | """Writes the best trial's hyperparameters plus common args to a file.""" 49 | job_id = 'projects/{}/jobs/{}'.format(project_id, hp_tune_job) 50 | 51 | # Build a representation of the Cloud ML API. 52 | ml = discovery.build('ml', 'v1') 53 | 54 | # Create a request to call projects.jobs.get. 55 | request = ml.projects().jobs().get(name=job_id) 56 | # Make the call. 57 | try: 58 | response = request.execute() 59 | except errors.HttpError as err: 60 | # Something went wrong, print out some information.
61 | print('There was an error getting the job info. Check the details:') 62 | print(err._get_reason()) 63 | raise 64 | 65 | job_info = NestedNamespace(response) 66 | param_list = ast.literal_eval(common_args) 67 | for key, value in job_info.trainingOutput.trials[0].hyperparameters.__dict__.items(): 68 | param_list.append('--'+key) 69 | param_list.append(value) 70 | # Create the directory for the output file if it does not already exist. 71 | Path(filename).parent.mkdir(parents=True, exist_ok=True) 72 | with open(filename, 'w') as f: 73 | f.write(str(param_list)) 74 | 75 | if __name__ == '__main__': 76 | parser = argparse.ArgumentParser() 77 | parser.add_argument('--hptune_job_id', 78 | type=str, 79 | required=True, 80 | help='ID of hparam search job') 81 | parser.add_argument('--project_id', 82 | type=str, 83 | required=True, 84 | help='GCP project ID') 85 | parser.add_argument('--common_args', 86 | type=str, 87 | required=True, 88 | help='common (not tunable) arguments for training application') 89 | parser.add_argument('--tuned_parameters_out', 90 | type=str, 91 | required=True, 92 | help='Path to the file containing Tuned Parameters array') 93 | args = parser.parse_args() 94 | print_best_parameters(args.project_id, args.hptune_job_id, args.tuned_parameters_out, args.common_args) 95 | -------------------------------------------------------------------------------- /ml_pipeline_gen/templates/experimental/hptune/Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | FROM google/cloud-sdk:latest 16 | 17 | COPY . / 18 | 19 | ENTRYPOINT ["bash", "/hptune.sh" ] 20 | -------------------------------------------------------------------------------- /ml_pipeline_gen/templates/experimental/hptune/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | # Copyright 2020 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | if [ -z "$1" ]; then 18 | PROJECT_ID=$(gcloud config config-helper --format "value(configuration.properties.core.project)") 19 | else 20 | PROJECT_ID=$1 21 | fi 22 | 23 | if [ -z "$2" ]; then 24 | TAG_NAME="latest" 25 | else 26 | TAG_NAME="$2" 27 | fi 28 | 29 | 30 | CONTAINER_NAME=ml-pipeline-hptune 31 | 32 | docker build -t ${CONTAINER_NAME} .
33 | docker tag ${CONTAINER_NAME} gcr.io/${PROJECT_ID}/${CONTAINER_NAME}:${TAG_NAME} 34 | docker push gcr.io/${PROJECT_ID}/${CONTAINER_NAME}:${TAG_NAME} 35 | -------------------------------------------------------------------------------- /ml_pipeline_gen/templates/experimental/hptune/component.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | name: Submitting a Cloud ML Hyper Parameter Search job as a pipeline step 16 | description: | 17 | A Kubeflow Pipeline component to submit a Cloud Machine Learning (Cloud ML) 18 | Engine Hyperparameter search job as a step in a pipeline. 19 | inputs: 20 | - name: python_module 21 | description: 'The Python module name to run after installing the packages.' 22 | default: '' 23 | type: String 24 | - name: staging_bucket 25 | description: 'The GCS bucket for staging' 26 | default: '' 27 | type: GCSPath 28 | - name: job_dir 29 | description: 'The GCS bucket dir for where the hparam search run files are created.' 30 | default: '' 31 | type: GCSPath 32 | - name: package_uri 33 | description: 'The Cloud Storage location of the training package.' 34 | default: '' 35 | type: GCSPath 36 | - name: region 37 | description: 'The Compute Engine region in which the training job is run.' 38 | default: '' 39 | type: GCPRegion 40 | - name: args 41 | description: 'The command line arguments to pass to the program.' 42 | default: '' 43 | type: List 44 | - name: runtime_version 45 | description: 'The Cloud ML Engine runtime version to use for training' 46 | default: '' 47 | type: String 48 | - name: config 49 | description: 'Path to the hyperparameter tuning config YAML.' 50 | default: '' 51 | type: String 52 | 53 | outputs: 54 | - name: job_id 55 | description: 'The ID of the created job.' 56 | type: String 57 | implementation: 58 | container: 59 | image: gcr.io/gcp-demo-2-262319/ml-pipeline-hptune:latest 60 | args: [ 61 | --python_module, {inputValue: python_module}, 62 | --package_uri, {inputValue: package_uri}, 63 | --region, {inputValue: region}, 64 | --args, {inputValue: args}, 65 | --staging_bucket, {inputValue: staging_bucket}, 66 | --runtime_version, {inputValue: runtime_version}, 67 | --config, {inputValue: config}, 68 | --job_dir, {inputValue: job_dir}, 69 | --job_id, {outputPath: job_id} 70 | 71 | ] 72 | -------------------------------------------------------------------------------- /ml_pipeline_gen/templates/experimental/hptune/hptune.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2019 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at 8 | # 9 | # https://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | while [ $# -ne 0 ]; do 18 | case "$1" in 19 | -h|--help) echo "Usage: ./hptune.sh \\" 20 | echo "--region= \\" 21 | echo "--python_module= \\" 22 | echo "--package_uri= \\" 23 | echo "--job_dir= \\" 24 | echo "--staging_bucket= \\" 25 | echo "--config= \\" 26 | echo "--runtime_version= \\" 27 | echo "--args= \\" 28 | echo "--job_id=" 29 | exit 30 | shift 31 | ;; 32 | --region) REGION=$2 33 | shift 34 | ;; 35 | --python_module) MODULE_NAME=$2 36 | shift 37 | ;; 38 | --package_uri) PACKAGE_URI=$2 39 | shift 40 | ;; 41 | --job_dir) JOB_DIR=$2 42 | shift 43 | ;; 44 | --staging_bucket) STAGING_BUCKET=$2 45 | shift 46 | ;; 47 | --config) CONFIG=$2 48 | shift 49 | ;; 50 | --runtime_version) RUNTIME_VERSION=$2 51 | shift 52 | ;; 53 | --args) ARGS=$2 54 | shift 55 | ;; 56 | ### 57 | --job_id) JOB_ID=$2 58 | shift 59 | ;; 60 | *) shift 61 | ;; 62 | esac 63 | done 64 | echo "Executing $0 $@ . ...." 65 | COMMON_ARGS=`python -c "import ast; print(' '.join(ast.literal_eval('$ARGS')))"` 66 | COMMON_ARGS=`echo $COMMON_ARGS | sed 's/--\([^ ]*\) *\([^-]*\)/--\1=\2/g'` 67 | 68 | JOBNAME=wd_hcr_hptuning_$(date -u +%y%m%d_%H%M) 69 | 70 | gsutil -m rm -rf "$JOB_DIR" || echo "No object was deleted"  # clear any previous job output 71 | gsutil -m cp $CONFIG . 72 | config_file=`basename $CONFIG` 73 | 74 | eval `echo "gcloud ai-platform jobs submit training $JOBNAME \ 75 | --region=$REGION \ 76 | --module-name=$MODULE_NAME \ 77 | --packages=$PACKAGE_URI \ 78 | --job-dir=$JOB_DIR \ 79 | --staging-bucket=$STAGING_BUCKET \ 80 | --config=$config_file \ 81 | --runtime-version=$RUNTIME_VERSION \ 82 | --stream-logs \ 83 | -- \ 84 | $COMMON_ARGS 85 | 86 | "` 87 | 88 | mkdir -p `dirname $JOB_ID` 89 | 90 | echo "$JOBNAME" > $JOB_ID 91 | -------------------------------------------------------------------------------- /ml_pipeline_gen/templates/experimental/hptuning_config.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
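# Each parameterName below must match an argparse flag exposed by the trainer.
# As a rough sketch (illustrative, not literal CAIP output), a trial picking
# batch_size=64 and dnn_lr=0.01 is launched as:
#   python -m trainer.task --batch_size=64 --dnn_lr=0.01 ...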
14 | trainingInput: 15 | hyperparameters: 16 | hyperparameterMetricTag: accuracy 17 | goal: MAXIMIZE 18 | maxTrials: 4 19 | maxParallelTrials: 2 20 | enableTrialEarlyStopping: True 21 | params: 22 | - parameterName: batch_size 23 | type: INTEGER 24 | minValue: 8 25 | maxValue: 512 26 | scaleType: UNIT_LOG_SCALE 27 | - parameterName: dnn_lr 28 | type: DOUBLE 29 | minValue: 0.00001 30 | maxValue: 0.5 31 | scaleType: UNIT_LOG_SCALE 32 | - parameterName: lin_lr 33 | type: DOUBLE 34 | minValue: 0.00001 35 | maxValue: 0.5 36 | scaleType: UNIT_LOG_SCALE 37 | - parameterName: lin_lr_power 38 | type: DOUBLE 39 | minValue: -5 40 | maxValue: 0.0 41 | scaleType: UNIT_LINEAR_SCALE 42 | - parameterName: lin_l1 43 | type: DOUBLE 44 | minValue: 0.01 45 | maxValue: 100 46 | scaleType: UNIT_LOG_SCALE 47 | - parameterName: lin_l2 48 | type: DOUBLE 49 | minValue: 0.01 50 | maxValue: 100 51 | scaleType: UNIT_LOG_SCALE 52 | - parameterName: lin_shrinkage 53 | type: DOUBLE 54 | minValue: 0.00001 55 | maxValue: 100 56 | scaleType: UNIT_LOG_SCALE 57 | -------------------------------------------------------------------------------- /ml_pipeline_gen/templates/experimental/kfp_pipeline_from_config.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2020 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
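# This file is a Jinja2 template: the {{config.*}} placeholders are filled in
# when it is rendered with a parsed config object. A rough sketch of that
# rendering step (assuming a config.yaml providing the referenced keys):
#
#   import jinja2
#   from ml_pipeline_gen.parsers import NestedNamespace, parse_yaml
#   config = NestedNamespace(parse_yaml("config.yaml"))
#   with open("kfp_pipeline_from_config.py") as f:
#       pipeline_py = jinja2.Template(f.read()).render(config=config)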
15 | """Kubeflow Pipeline Example.""" 16 | import json 17 | import kfp 18 | import kfp.dsl as dsl 19 | from kfp.components import ComponentStore 20 | from kfp.gcp import use_gcp_secret 21 | 22 | cs = ComponentStore(local_search_paths=['.', '{{config.output_package}}'], 23 | url_search_prefixes=['{{config.github_component_url}}']) 24 | preprocess_op = cs.load_component('{{config.preprocess.component}}') 25 | hpt_op = cs.load_component('hptune') 26 | param_comp = cs.load_component('get_tuned_params') 27 | train_op = cs.load_component('{{config.train.component}}') 28 | deploy_op = cs.load_component('{{config.deploy.component}}') 29 | 30 | 31 | @dsl.pipeline( 32 | name='KFP-Pipelines Example', 33 | description='Kubeflow pipeline generated from ai-pipeline asset' 34 | ) 35 | def pipeline_sample( 36 | project_id='{{config.project_id}}', 37 | region='{{config.region}}', 38 | python_module='{{config.train.python_module}}', 39 | package_uri='{{config.train.python_package}}', 40 | dataset_bucket='{{config.bucket_id}}', 41 | staging_bucket='gs://{{config.bucket_id}}', 42 | job_dir_hptune='gs://{{config.bucket_id}}/hptune', 43 | job_dir_train='gs://{{config.bucket_id}}/train', 44 | runtime_version_train='{{config.runtime_version}}', 45 | runtime_version_deploy='{{config.runtime_version}}', 46 | hptune_config='{{config.hptune.config}}', 47 | model_id='{{config.deploy.model_id}}', 48 | version_id='{{config.deploy.version_id}}', 49 | common_args_hpt=json.dumps([ 50 | {% for arg in config.hptune.args %} 51 | {% set name = arg.name %} 52 | {% set value = arg.default %} 53 | '--{{name}}', '{{value}}', 54 | {% endfor %} 55 | ]), 56 | common_args_train=json.dumps([ 57 | {% for arg in config.train.args %} 58 | {% set name = arg.name %} 59 | {% set value = arg.default%} 60 | '--{{name}}', '{{value}}', 61 | {% endfor %} 62 | ]), 63 | replace_existing_version=True): 64 | """.""" 65 | preprocess_task = preprocess_op( 66 | {% for arg in config.preprocess.component_args %} 67 | {% set name = arg.name %} 68 | {{name}}={{name}}, 69 | {% endfor %} 70 | ) 71 | 72 | hpt_task = hpt_op( 73 | region=region, 74 | python_module=python_module, 75 | package_uri=package_uri, 76 | staging_bucket=staging_bucket, 77 | job_dir=job_dir_hptune, 78 | config=hptune_config, 79 | runtime_version=runtime_version_train, 80 | args=common_args_hpt 81 | ) 82 | hpt_task.after(preprocess_task) 83 | 84 | param_task = param_comp( 85 | project_id=project_id, 86 | hptune_job_id=hpt_task.outputs['job_id'].to_struct(), 87 | common_args=common_args_train 88 | ) 89 | 90 | train_task = train_op( 91 | project_id=project_id, 92 | python_module=python_module, 93 | package_uris=json.dumps([package_uri.to_struct()]), 94 | region=region, 95 | args=str(param_task.outputs['tuned_parameters_out']), 96 | job_dir=job_dir_train, 97 | python_version='', 98 | runtime_version=runtime_version_train, 99 | master_image_uri='', 100 | worker_image_uri='', 101 | training_input='', 102 | job_id_prefix='', 103 | wait_interval='30' 104 | ) 105 | 106 | deploy_model = deploy_op( # pylint: disable=unused-variable 107 | model_uri=train_task.outputs['job_dir'].to_struct()+'{{config.train.model_out_prefix}}', 108 | project_id=project_id, 109 | model_id=model_id, 110 | version_id=version_id, 111 | runtime_version=runtime_version_deploy, 112 | replace_existing_version=replace_existing_version 113 | ) 114 | 115 | kfp.dsl.get_pipeline_conf().add_op_transformer( 116 | use_gcp_secret('user-gcp-sa')) 117 | 118 | client = kfp.Client(host='{{config.kfp_deployment_url}}') 119 | 120 | 
client.create_run_from_pipeline_func(pipeline_sample, arguments={}) 121 | 122 | -------------------------------------------------------------------------------- /ml_pipeline_gen/templates/kfp_pipeline.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2020 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Defines a KubeFlow pipeline.""" 16 | 17 | import kfp 18 | import kfp.gcp as gcp 19 | from kfp_server_api.rest import ApiException 20 | from typing import NamedTuple 21 | 22 | 23 | # pylint: disable=redefined-outer-name 24 | # pylint: disable=g-import-not-at-top 25 | # pylint: disable=reimported 26 | def make_op_func(func): 27 | """Converts a self-contained python function into an op. 28 | 29 | Args: 30 | func: a python function with no outside dependencies. 31 | 32 | Returns: 33 | A function that ingests PipelineParams, parses them, and passes the 34 | results to the given function, all within a container. 35 | """ 36 | return kfp.components.func_to_container_op(func) 37 | 38 | 39 | def get_train_op(github_url, prev_op_id=""): 40 | """Returns an op for running AI Platform training jobs. 41 | 42 | Args: 43 | github_url: url to the github commit the component definition will be 44 | read from. 45 | prev_op_id: an output from a previous component to use to chain 46 | components together. 47 | 48 | Returns: 49 | A Kubeflow Pipelines component for running training. 50 | """ 51 | {% filter indent(width=4, indentfirst=False) %} 52 | params = {{train_params}} 53 | {% endfilter %} 54 | 55 | params["job_id_prefix"] += prev_op_id 56 | mlengine_train_op = kfp.components.load_component_from_url( 57 | "{}/ml_engine/train/component.yaml".format(github_url)) 58 | train_op = mlengine_train_op(**params) 59 | return train_op 60 | 61 | 62 | def get_model_path(prev_op_id="") -> NamedTuple("params", [ 63 | ("model_path", str), 64 | ("stub", str), 65 | ]): 66 | """Builds a model path prefix to use to search for the export dir.""" 67 | model_path = "{{ model_dir }}" 68 | return (model_path, prev_op_id) 69 | 70 | 71 | def get_model_path_op(prev_op_id): 72 | """Returns a component for getting the model path.""" 73 | model_path_op = make_op_func(get_model_path)(prev_op_id) 74 | list_blobs = kfp.components.load_component( 75 | "orchestration/components/list_blobs.yaml") 76 | gsutil_op = list_blobs(model_path_op.outputs["model_path"]) 77 | return gsutil_op 78 | 79 | 80 | def get_deploy_op(github_url, prev_op_id=""): 81 | """Returns an op for deploying models on CAIP. 82 | 83 | Args: 84 | github_url: url to the github commit the component definition will be 85 | read from. 86 | prev_op_id: an output from a previous component to use to chain 87 | components together. 88 | 89 | Returns: 90 | A Kubeflow Pipelines component for deploying models. 
91 | """ 92 | 93 | {% filter indent(width=4, indentfirst=False) %} 94 | params = {{deploy_params}} 95 | {% endfilter %} 96 | 97 | params["version_id"] = prev_op_id 98 | if "model_uri" not in params: 99 | gsutil_op = get_model_path_op(prev_op_id) 100 | params["model_uri"] = gsutil_op.output 101 | 102 | mlengine_deploy_op = kfp.components.load_component_from_url( 103 | "{}/ml_engine/deploy/component.yaml".format(github_url)) 104 | deploy_op = mlengine_deploy_op(**params) 105 | return deploy_op 106 | 107 | 108 | def get_predict_op(github_url, prev_op_id="", version_name=""): 109 | """Returns an op for running AI Platform batch prediction jobs. 110 | 111 | Args: 112 | github_url: url to the github commit the component definition will be 113 | read from. 114 | prev_op_id: an output from a previous component to use to chain 115 | components together. 116 | version_name: a version name of a deployed model to predict with. 117 | 118 | Returns: 119 | A Kubeflow Pipelines component for running batch predictions. 120 | """ 121 | 122 | {% filter indent(width=4, indentfirst=False) %} 123 | params = {{prediction_params}} 124 | {% endfilter %} 125 | 126 | if prev_op_id: 127 | gsutil_op = get_model_path_op(prev_op_id) 128 | params["model_path"] = gsutil_op.output 129 | elif version_name: 130 | params["model_path"] = version_name 131 | mlengine_batch_predict_op = kfp.components.load_component_from_url( 132 | "{}/ml_engine/batch_predict/component.yaml".format(github_url)) 133 | predict_op = mlengine_batch_predict_op(**params) 134 | return predict_op 135 | 136 | 137 | @kfp.dsl.pipeline( 138 | name="train_pipeline", 139 | description="Pipeline for training a model on CAIP.") 140 | def train_pipeline(): 141 | """Defines a Kubeflow Pipeline.""" 142 | github_url = ("https://raw.githubusercontent.com/kubeflow/pipelines/" 143 | + "02c991dd265054b040265b3dfa1903d5b49df859/components/gcp") 144 | 145 | # TODO(humichael): Add params. 146 | {% for p, c in relations %} 147 | {% set parent = components[p] %} 148 | {% set parent_name = "{}_{}_op".format(parent.role, parent.id) %} 149 | {% set parent_func = "get_{}_op".format(parent.role) %} 150 | {% set parent_out = "version_name" if parent.role == "deploy" else "job_id" %} 151 | {% set connection = "version_name" if parent.role == "deploy" and child.role == "predict" else "prev_op_id" %} 152 | {% set child = components[c] %} 153 | {% set child_name = "{}_{}_op".format(child.role, child.id) %} 154 | {% set child_func = "get_{}_op".format(child.role) %} 155 | 156 | {% if p == -1 %} 157 | {{ child_name }} = {{ child_func }}(github_url) 158 | {% else %} 159 | {{ child_name }} = {{ child_func }}( 160 | github_url, 161 | {{ connection }}={{ parent_name }}.outputs["{{ parent_out }}"], 162 | ) 163 | {% endif %} 164 | {% endfor %} 165 | 166 | 167 | def main(compile=False): 168 | """Compile the pipeline and also create a run.""" 169 | if compile: 170 | kfp.compiler.Compiler().compile(train_pipeline, "train_pipeline.tar.gz") 171 | 172 | try: 173 | client = kfp.Client(host="{{ host }}") 174 | client.create_run_from_pipeline_func(train_pipeline, arguments={}) 175 | except ApiException as e: 176 | print("{0}: KFP Dashboard unreachable. 
Please update config.yaml with the latest hostname.".format(e.reason)) 177 | 178 | 179 | if __name__ == "__main__": 180 | main() 181 | -------------------------------------------------------------------------------- /ml_pipeline_gen/templates/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Config for installing a Python module/package.""" 15 | 16 | from setuptools import find_packages 17 | from setuptools import setup 18 | 19 | NAME = "{{ package_name }}" 20 | VERSION = "1.0" 21 | REQUIRED_PACKAGES = ["gcsfs"] 22 | 23 | setup( 24 | name=NAME, 25 | version=VERSION, 26 | author="Author", 27 | author_email="author@example.com", 28 | packages=find_packages(), 29 | install_requires=REQUIRED_PACKAGES, 30 | url="www.example.com", 31 | ) 32 | -------------------------------------------------------------------------------- /ml_pipeline_gen/templates/sklearn_inputs.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Google Inc. All Rights Reserved. 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Input functions.""" 15 | 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | 20 | import numpy as np 21 | import pandas as pd 22 | 23 | SCHEMA = {{ schema }} 24 | TARGET = "{{ target }}" 25 | 26 | 27 | def download_data(train_path, eval_path): 28 | """Downloads train and eval datasets from GCP. 29 | 30 | Args: 31 | train_path: GCS path to training data. 32 | eval_path: GCS path to evaluation data. 33 | 34 | Returns: 35 | train_x: dataframe of training features. 36 | train_y: dataframe of training labels. 37 | eval_x: dataframe of eval features. 38 | eval_y: dataframe of eval labels. 39 | """ 40 | train_df = pd.read_csv(train_path, names=SCHEMA) 41 | eval_df = pd.read_csv(eval_path, names=SCHEMA) 42 | 43 | train_x, train_y = train_df, train_df.pop(TARGET) 44 | eval_x, eval_y = eval_df, eval_df.pop(TARGET) 45 | train_y, eval_y = [np.ravel(x) for x in [train_y, eval_y]] 46 | return train_x, train_y, eval_x, eval_y 47 | -------------------------------------------------------------------------------- /ml_pipeline_gen/templates/sklearn_model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. 
All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ML model definitions.""" 15 | from {{model_path}} import get_model 16 | 17 | def get_estimator(params): 18 | """Returns a SKLearn model.""" 19 | estimator = get_model(params) 20 | return estimator 21 | -------------------------------------------------------------------------------- /ml_pipeline_gen/templates/sklearn_task.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2020 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Executes model training and evaluation.""" 16 | 17 | import argparse 18 | import json 19 | import logging 20 | import os 21 | import sys 22 | 23 | import hypertune 24 | import numpy as np 25 | from sklearn import model_selection 26 | 27 | from trainer import inputs 28 | from trainer import model 29 | from trainer import utils 30 | 31 | 32 | def _parse_arguments(argv): 33 | """Parses execution arguments and replaces default values. 34 | 35 | Args: 36 | argv: Input arguments from sys. 37 | 38 | Returns: 39 | Dictionary of parsed arguments. 40 | """ 41 | parser = argparse.ArgumentParser() 42 | 43 | # TODO(humichael): Make this into modular template. 44 | {% for name, arg in input_args.items() %} 45 | parser.add_argument( 46 | "--{{name}}", 47 | help="{{arg.help}}", 48 | type={{arg.type}}, 49 | {% if arg.type == "str" and "default" in arg %} 50 | default="{{arg.default}}", 51 | {% elif "default" in arg %} 52 | default={{arg.default}}, 53 | {% endif %} 54 | ) 55 | {% endfor %} 56 | 57 | args, _ = parser.parse_known_args(args=argv[1:]) 58 | return args 59 | 60 | 61 | # TODO(humichael): Evaluate the results. 
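# The helper below fits the estimator, saves it to model_dir with joblib via
# utils.dump_object, cross-validates on the eval split, and reports the mean
# score to CAIP hyperparameter tuning through cloudml-hypertune. During a
# tuning job the service passes the trial number in TF_CONFIG, which
# _get_trial_id parses further down; an illustrative (not real) value:
#   TF_CONFIG='{"task": {"trial": "3"}}'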
62 | def _train_and_evaluate(estimator, dataset, model_dir, params): 63 | """Runs model training and evaluation.""" 64 | x_train, y_train, x_eval, y_eval = dataset 65 | estimator.fit(x_train, y_train) 66 | 67 | model_path = os.path.join(model_dir, "model.joblib") 68 | utils.dump_object(estimator, model_path) 69 | 70 | scores = model_selection.cross_val_score( 71 | estimator, x_eval, y_eval, cv=params.cross_validations) 72 | metric_path = os.path.join(model_dir, "eval_metrics.joblib") 73 | utils.dump_object(scores, metric_path) 74 | 75 | hpt = hypertune.HyperTune() 76 | hpt.report_hyperparameter_tuning_metric( 77 | hyperparameter_metric_tag="score", 78 | metric_value=np.mean(scores)) 79 | 80 | 81 | def _get_trial_id(): 82 | """Returns the trial id if it exists, else "1".""" 83 | trial_id = json.loads( 84 | os.environ.get("TF_CONFIG", "{}")).get("task", {}).get("trial", "") 85 | return trial_id if trial_id else "1" 86 | 87 | 88 | def run_experiment(params): 89 | """Testbed for running model training and evaluation.""" 90 | dataset = inputs.download_data(params.train_path, params.eval_path) 91 | estimator = model.get_estimator(params) 92 | trial_id = _get_trial_id() 93 | model_dir = os.path.join(params.model_dir, trial_id) 94 | _train_and_evaluate(estimator, dataset, model_dir, params) 95 | 96 | 97 | def main(): 98 | """Entry point.""" 99 | args = _parse_arguments(sys.argv) 100 | logging.basicConfig(level="INFO") 101 | run_experiment(args) 102 | 103 | 104 | if __name__ == "__main__": 105 | main() 106 | -------------------------------------------------------------------------------- /ml_pipeline_gen/templates/tf_inputs.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Google Inc. All Rights Reserved. 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Input functions.""" 15 | 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | 20 | import six 21 | import tensorflow as tf 22 | 23 | SCHEMA = {{ schema }} 24 | TARGET = "{{ target }}" 25 | 26 | 27 | def _decode_csv(line): 28 | """Takes the string input tensor and returns a dict of rank-2 tensors.""" 29 | columns = tf.decode_csv(line, record_defaults=[0.0] * len(SCHEMA)) 30 | features = dict(zip(SCHEMA, columns)) 31 | for key, _ in six.iteritems(features): 32 | features[key] = tf.expand_dims(features[key], -1) 33 | return features 34 | 35 | 36 | def get_input_fn(file_pattern, shuffle, batch_size, num_epochs=None, 37 | data_format="csv"): 38 | """Returns an input function. 39 | 40 | One input method is currently supported: 41 | CSV (local or on GCS): provide a file_pattern matching the 42 | input files. 43 | 44 | Args: 45 | file_pattern: pattern of the input files. 46 | shuffle: boolean for whether to shuffle the data or not (set True for 47 | training, False for evaluation). 48 | batch_size: batch size used to read data. 
49 | num_epochs: number of times to iterate over the dataset. 50 | data_format: format of input data. 51 | 52 | Returns: 53 | An input_fn. 54 | 55 | Raises: 56 | RuntimeError: the given data_format is not one of the supported 57 | formats. 58 | """ 59 | def _csv_input_fn(): 60 | """Parses csv input using tf.data.""" 61 | filenames = tf.io.gfile.glob(file_pattern) 62 | dataset = tf.data.TextLineDataset(filenames).map( 63 | _decode_csv, 64 | num_parallel_calls=tf.data.experimental.AUTOTUNE) 65 | if shuffle: 66 | dataset = dataset.shuffle(buffer_size=batch_size * 10) 67 | dataset = dataset.repeat(num_epochs) 68 | dataset = dataset.batch(batch_size) 69 | dataset = dataset.prefetch(buffer_size=10) 70 | features = dataset.make_one_shot_iterator().get_next() 71 | return features, features.pop(TARGET) 72 | 73 | data_formats = { 74 | "csv": _csv_input_fn, 75 | } 76 | if data_format in data_formats: 77 | return data_formats[data_format] 78 | raise RuntimeError("Invalid arguments") 79 | 80 | 81 | def get_serving_input_fn(data_format): 82 | """Returns a serving input function based on the given format. 83 | 84 | Args: 85 | data_format: format of input data. 86 | 87 | Returns: 88 | An input fn for serving. 89 | 90 | Raises: 91 | KeyError: the given data_format is invalid. 92 | """ 93 | 94 | def _csv_serving_input_fn(): 95 | """Build the serving inputs.""" 96 | csv_row = tf.placeholder(shape=[None], dtype=tf.string) 97 | features = _decode_csv(csv_row) 98 | return tf.estimator.export.ServingInputReceiver( 99 | features, {"csv_row": csv_row}) 100 | 101 | def _json_serving_input_fn(): 102 | """Build the serving inputs.""" 103 | inputs = {} 104 | for col in SCHEMA: 105 | if col != TARGET: 106 | inputs[col] = tf.placeholder(shape=[None], dtype=float) 107 | return tf.estimator.export.ServingInputReceiver(inputs, inputs) 108 | 109 | data_formats = { 110 | "csv": _csv_serving_input_fn, 111 | "json": _json_serving_input_fn, 112 | } 113 | if data_format in data_formats: 114 | return data_formats[data_format] 115 | raise KeyError("Invalid arguments") 116 | -------------------------------------------------------------------------------- /ml_pipeline_gen/templates/tf_model.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2020 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ML model definitions.""" 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | 20 | import json 21 | import os 22 | 23 | import tensorflow.compat.v1 as tf 24 | 25 | from trainer import inputs 26 | from {{ model_path }} import get_model 27 | from {{ model_path }} import get_loss 28 | 29 | 30 | # pylint: disable=unused-argument 31 | def _model_fn(features, labels, mode, params): 32 | """Builds an EstimatorSpec. 33 | 34 | Args: 35 | features: a dict mapping feature names to tensors. 36 | labels: a tensor of labels. 
37 | mode: a tf.estimator.ModeKey signifying the Estimator mode. 38 | params: hyperparameters for the model. 39 | 40 | Returns: 41 | an EstimatorSpec that defines the model to be run by an Estimator. 42 | """ 43 | schema = [x for x in inputs.SCHEMA if x != inputs.TARGET] 44 | feature_columns = [tf.feature_column.numeric_column( 45 | col, shape=(1,), dtype=tf.dtypes.float32) for col in schema] 46 | input_layer = tf.feature_column.input_layer(features, feature_columns) 47 | # TODO(humichael): support multiple outputs. 48 | predictions = get_model(input_layer, params) 49 | 50 | if mode == tf.estimator.ModeKeys.PREDICT: 51 | prediction_out = { 52 | "predictions": predictions, 53 | } 54 | return tf.estimator.EstimatorSpec(mode, predictions=prediction_out) 55 | 56 | loss = get_loss()(labels, predictions) 57 | metrics = {} 58 | 59 | {% for metric in metrics %} 60 | key = "{{ metric }}" 61 | # TODO(humichael): how to generate this from user? 62 | # may tie in with multiple outputs. Use logits for loss, preds for eval. 63 | predictions = tf.round(predictions) 64 | metric = tf.metrics.{{ metric }}(labels, predictions) 65 | metrics[key] = metric 66 | tf.summary.scalar(key, metric[1]) 67 | {% endfor %} 68 | 69 | tf.summary.merge_all() 70 | 71 | if mode == tf.estimator.ModeKeys.EVAL: 72 | return tf.estimator.EstimatorSpec( 73 | mode, loss=loss, eval_metric_ops=metrics) 74 | 75 | optimizer = tf.train.AdagradOptimizer(learning_rate=params.learning_rate) 76 | train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step()) 77 | 78 | hook = tf.estimator.LoggingTensorHook( 79 | [input_layer[:5], labels[:5], predictions[:5]], at_end=True) 80 | return tf.estimator.EstimatorSpec( 81 | mode, loss=loss, train_op=train_op, training_hooks=[hook]) 82 | 83 | 84 | def _get_trial_id(): 85 | """Returns the trial id if it exists, else "1".""" 86 | trial_id = json.loads( 87 | os.environ.get("TF_CONFIG", "{}")).get("task", {}).get("trial", "") 88 | return trial_id if trial_id else "1" 89 | 90 | 91 | def get_estimator(params): 92 | """Returns a tf.Estimator for training and evaluation. 93 | 94 | Args: 95 | params: a dict of hyperparameters for the model. 96 | 97 | Returns: 98 | A tf.Estimator. 99 | """ 100 | config = tf.estimator.RunConfig( 101 | save_checkpoints_steps=params.save_checkpoints_steps, 102 | keep_checkpoint_max=params.keep_checkpoint_max, 103 | log_step_count_steps=params.log_step_count_steps) 104 | trial_id = _get_trial_id() 105 | model_dir = os.path.join(params.model_dir, trial_id) 106 | 107 | estimator = tf.estimator.Estimator( 108 | model_fn=_model_fn, 109 | model_dir=model_dir, 110 | config=config, 111 | params=params) 112 | return estimator 113 | 114 | 115 | -------------------------------------------------------------------------------- /ml_pipeline_gen/templates/tf_task.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2020 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Main script to train the model.""" 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | 20 | import argparse 21 | import sys 22 | 23 | import tensorflow.compat.v1 as tf 24 | 25 | from trainer import inputs 26 | from trainer import model 27 | 28 | 29 | def _parse_arguments(argv): 30 | """Parses execution arguments and replaces default values. 31 | 32 | Args: 33 | argv: Input arguments from sys. 34 | 35 | Returns: 36 | Dictionary of parsed arguments. 37 | """ 38 | parser = argparse.ArgumentParser() 39 | 40 | {% for name, arg in input_args.items() %} 41 | parser.add_argument( 42 | "--{{name}}", 43 | help="{{arg.help}}", 44 | type={{arg.type}}, 45 | {% if arg.type == "str" and "default" in arg %} 46 | default="{{arg.default}}", 47 | {% elif "default" in arg %} 48 | default={{arg.default}}, 49 | {% endif %} 50 | ) 51 | {% endfor %} 52 | 53 | args, _ = parser.parse_known_args(args=argv[1:]) 54 | return args 55 | 56 | 57 | def run_training(params): 58 | """Initializes the estimator and runs train_and_evaluate.""" 59 | estimator = model.get_estimator(params) 60 | train_input_fn = inputs.get_input_fn( 61 | params.train_path, 62 | shuffle=True, 63 | batch_size=params.batch_size, 64 | num_epochs=params.num_epochs, 65 | ) 66 | train_spec = tf.estimator.TrainSpec( 67 | input_fn=train_input_fn, 68 | max_steps=params.max_steps, 69 | ) 70 | eval_input_fn = inputs.get_input_fn( 71 | params.eval_path, 72 | shuffle=False, 73 | batch_size=params.batch_size, 74 | ) 75 | exporter = tf.estimator.BestExporter( 76 | "export", inputs.get_serving_input_fn(params.export_format), 77 | exports_to_keep=1) 78 | eval_spec = tf.estimator.EvalSpec( 79 | input_fn=eval_input_fn, 80 | throttle_secs=1, 81 | steps=params.eval_steps, 82 | start_delay_secs=1, 83 | exporters=[exporter], 84 | ) 85 | tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec) 86 | 87 | 88 | def main(): 89 | """Trains a model.""" 90 | params = _parse_arguments(sys.argv) 91 | tf.logging.set_verbosity(tf.logging.INFO) 92 | run_training(params) 93 | 94 | 95 | if __name__ == "__main__": 96 | main() 97 | -------------------------------------------------------------------------------- /ml_pipeline_gen/templates/xgboost_inputs.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2020 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Input functions.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import numpy as np 22 | import pandas as pd 23 | 24 | SCHEMA = {{ schema }} 25 | TARGET = "{{ target }}" 26 | 27 | 28 | def download_data(train_path, eval_path): 29 | """Downloads train and eval datasets from GCP. 
30 | 31 | Args: 32 | train_path: GCS path to training data. 33 | eval_path: GCS path to evaluation data. 34 | 35 | Returns: 36 | train_x: dataframe of training features. 37 | train_y: dataframe of training labels. 38 | eval_x: dataframe of eval features. 39 | eval_y: dataframe of eval labels. 40 | """ 41 | 42 | train_df = pd.read_csv(train_path, names=SCHEMA) 43 | eval_df = pd.read_csv(eval_path, names=SCHEMA) 44 | train_x, train_y = train_df.drop(TARGET, axis=1), train_df[TARGET] 45 | eval_x, eval_y = eval_df.drop(TARGET, axis=1), eval_df[TARGET] 46 | train_y, eval_y = [np.ravel(x) for x in [train_y, eval_y]] 47 | 48 | return train_x, train_y, eval_x, eval_y 49 | -------------------------------------------------------------------------------- /ml_pipeline_gen/templates/xgboost_model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ML model definition.""" 15 | from {{model_path}} import get_model 16 | 17 | def get_estimator(params): 18 | """Returns an XGBoost model.""" 19 | estimator = get_model(params) 20 | return estimator 21 | -------------------------------------------------------------------------------- /ml_pipeline_gen/templates/xgboost_task.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Executes model training and evaluation.""" 15 | 16 | import argparse 17 | import json 18 | import logging 19 | import os 20 | import sys 21 | 22 | import hypertune 23 | from sklearn import metrics 24 | from sklearn import preprocessing 25 | 26 | from trainer import inputs 27 | from trainer import model 28 | from trainer import utils 29 | 30 | 31 | def _parse_arguments(argv): 32 | """Parses execution arguments and replaces default values. 33 | 34 | Args: 35 | argv: Input arguments from sys. 36 | 37 | Returns: 38 | Dictionary of parsed arguments. 39 | """ 40 | parser = argparse.ArgumentParser() 41 | 42 | # TODO(humichael): Make this into modular template. 
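# The Jinja loop below expands one parser.add_argument call per configured
# input arg. For a hypothetical config entry such as
#   input_args: {max_depth: {type: int, default: 6, help: "Max tree depth."}}
# the rendered code would read:
#   parser.add_argument("--max_depth", help="Max tree depth.", type=int,
#                       default=6)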
43 | {% for name, arg in input_args.items() %} 44 | parser.add_argument( 45 | "--{{name}}", 46 | help="{{arg.help}}", 47 | type={{arg.type}}, 48 | {% if arg.type == "str" and "default" in arg %} 49 | default="{{arg.default}}", 50 | {% elif "default" in arg %} 51 | default={{arg.default}}, 52 | {% endif %} 53 | ) 54 | {% endfor %} 55 | 56 | args, _ = parser.parse_known_args(args=argv[1:]) 57 | return args 58 | 59 | 60 | def _get_trial_id(): 61 | """Returns the trial id if it exists, else "1".""" 62 | trial_id = json.loads( 63 | os.environ.get("TF_CONFIG", "{}")).get("task", {}).get("trial", 64 | "") 65 | return trial_id if trial_id else "1" 66 | 67 | 68 | def _train_and_evaluate(estimator, dataset, model_dir): 69 | """Runs model training and evaluation.""" 70 | x_train, y_train, x_eval, y_eval = dataset 71 | estimator.fit(x_train, y_train) 72 | logging.info("Completed training XGBoost model") 73 | 74 | bst = estimator.get_booster() 75 | bst_filename = 'model.bst' 76 | bst.save_model(bst_filename) 77 | model_output_path = os.path.join(model_dir, bst_filename) 78 | utils.upload_blob(model_output_path.split("/")[2], bst_filename, 79 | "/".join(model_output_path.split("/")[3:])) 80 | logging.info("Successfully uploaded file to GCS at location %s", 81 | model_dir) 82 | y_pred = estimator.predict(x_eval) 83 | 84 | # Binarize multiclass labels 85 | lb = preprocessing.LabelBinarizer() 86 | lb.fit(y_eval) 87 | y_test = lb.transform(y_eval) 88 | y_pred = lb.transform(y_pred) 89 | 90 | score = metrics.roc_auc_score(y_test, y_pred, average='macro') 91 | logging.info("AUC Score: %s", str(score)) 92 | 93 | hpt = hypertune.HyperTune() 94 | hpt.report_hyperparameter_tuning_metric( 95 | hyperparameter_metric_tag='roc_auc', 96 | metric_value=score, 97 | global_step=1000 98 | ) 99 | 100 | 101 | def run_experiment(params): 102 | """Testbed for running model training and evaluation.""" 103 | dataset = inputs.download_data(params.train_path, params.eval_path) 104 | estimator = model.get_estimator(params) 105 | trial_id = _get_trial_id() 106 | model_dir = os.path.join(params.model_dir, trial_id) 107 | _train_and_evaluate(estimator, dataset, model_dir) 108 | 109 | 110 | def main(): 111 | """Entry point.""" 112 | args = _parse_arguments(sys.argv) 113 | logging.basicConfig(level="INFO") 114 | run_experiment(args) 115 | 116 | 117 | if __name__ == "__main__": 118 | main() 119 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the 'License'); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an 'AS IS' BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Config for installing a Python module/package.""" 15 | 16 | import setuptools 17 | import ml_pipeline_gen 18 | 19 | with open('README.md', 'r') as f: 20 | long_description = f.read() 21 | 22 | setuptools.setup( 23 | name='ml-pipeline-gen', 24 | version=ml_pipeline_gen.__version__, 25 | author='Michael Hu', 26 | author_email='author@example.com', 27 | description='A tool for generating end-to-end pipelines on GCP.', 28 | long_description=long_description, 29 | long_description_content_type='text/markdown', 30 | url='https://github.com/GoogleCloudPlatform/ml-pipeline-generator-python', 31 | packages=['ml_pipeline_gen'], 32 | install_requires=[ 33 | 'cloudml-hypertune>=0.1.0.dev6', 34 | 'gcsfs>=0.6.2', 35 | 'google-api-python-client>=1.9.3', 36 | 'google-cloud-container>=0.5.0', 37 | 'jinja2>=2.11.2', 38 | 'joblib>=0.15.1', 39 | 'kfp>=0.5.1', 40 | 'pandas>=1.0.4', 41 | 'pyyaml>=5.3.1', 42 | 'scikit-learn>=0.23.1', 43 | 'tensorflow>=1.14.0,<2.0.0', 44 | 'xgboost>=1.1.1', 45 | ], 46 | extras_require={ 47 | 'dev': [ 48 | 'mock', 49 | ] 50 | }, 51 | classifiers=[ 52 | 'Programming Language :: Python :: 3.6', 53 | 'Programming Language :: Python :: 3.7', 54 | 'License :: OSI Approved :: Apache Software License', 55 | 'Operating System :: OS Independent', 56 | ], 57 | python_requires='>=3.6', 58 | include_package_data=True, 59 | ) 60 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ml-pipeline-generator-python/b0508160c1752a149bdc5c862560387fd9c65d77/tests/__init__.py -------------------------------------------------------------------------------- /tests/integration/fixtures/test_config.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Config file for ML Pipeline Generator. 
16 | 17 | project_id: "ml-pipeline-gen" 18 | bucket_id: "ml-pipeline-gen-test" 19 | cluster_name: "cluster-test" 20 | region: "us-central1" 21 | zone: "us-central1-a" 22 | scale_tier: "STANDARD_1" 23 | runtime_version: "1.15" 24 | python_version: "3.7" 25 | package_name: "ml_pipeline_gen" 26 | machine_type_pred: "n1-standard-4" 27 | 28 | data: 29 | schema: 30 | - "age" 31 | - "workclass" 32 | - "education_num" 33 | - "marital_status" 34 | - "occupation" 35 | - "relationship" 36 | - "race" 37 | - "capital_gain" 38 | - "capital_loss" 39 | - "hours_per_week" 40 | - "native_country" 41 | - "income_bracket" 42 | train: "gs://ml-pipeline-gen-test/test_model/data/adult.data.csv" 43 | evaluation: "gs://ml-pipeline-gen-test/test_model/data/adult.test.csv" 44 | prediction: 45 | input_data_paths: 46 | - "gs://ml-pipeline-gen-test/test_model/inputs/*" 47 | input_format: "JSON" 48 | output_format: "JSON" 49 | 50 | model: 51 | # Name must start with a letter and only contain letters, numbers, and 52 | # underscores. 53 | name: "test_model" 54 | path: "model.test_model" 55 | target: "income_bracket" 56 | metrics: 57 | - "accuracy" 58 | 59 | model_params: 60 | # Relative path. 61 | hyperparam_config: "hptuning_config.yaml" 62 | explain_output: 63 | explain_type: "sampledShapleyAttribution" 64 | explain_param: 65 | name: "numPaths" 66 | value: 40 67 | 68 | orchestration: 69 | host: "https://5e892ccf4c09b627-dot-us-central2.pipelines.googleusercontent.com" 70 | -------------------------------------------------------------------------------- /tests/integration/src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ml-pipeline-generator-python/b0508160c1752a149bdc5c862560387fd9c65d77/tests/integration/src/__init__.py -------------------------------------------------------------------------------- /tests/integration/src/test_models.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2020 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Integration tests for models classes.""" 16 | import mock 17 | import os 18 | import shutil 19 | import tempfile 20 | import time 21 | import unittest 22 | 23 | from googleapiclient import discovery 24 | from tensorflow.io import gfile 25 | 26 | from ml_pipeline_gen.models import BaseModel 27 | from ml_pipeline_gen.models import SklearnModel 28 | 29 | 30 | class TestSklearnModel(unittest.TestCase): 31 | """Tests SklearnModel class.""" 32 | 33 | @classmethod 34 | def setUpClass(cls): 35 | """Copies a demo and instantiates a model.""" 36 | super(TestSklearnModel, cls).setUpClass() 37 | cls.cwd = os.getcwd() 38 | cls.test_dir = tempfile.mkdtemp() 39 | cls.demo_dir = os.path.join(cls.test_dir, 'demo') 40 | shutil.copytree('examples/sklearn', cls.demo_dir) 41 | shutil.copyfile('tests/integration/fixtures/test_config.yaml', 42 | os.path.join(cls.demo_dir, 'test_config.yaml')) 43 | os.chdir(cls.demo_dir) 44 | 45 | @classmethod 46 | def tearDownClass(cls): 47 | """Switch back to the original working dir and removes the demo.""" 48 | super(TestSklearnModel, cls).tearDownClass() 49 | os.chdir(cls.cwd) 50 | shutil.rmtree(cls.test_dir) 51 | 52 | def modify_config(self): 53 | self.model.model['name'] = 'test_model_{}'.format(self.now) 54 | self.model.model['path'] = 'model.sklearn_model' 55 | self.model.model_params['input_args']['C'] = { 56 | 'type': 'float', 57 | 'default': 1.0, 58 | } 59 | 60 | def setUp(self): 61 | super(TestSklearnModel, self).setUp() 62 | # Delete models if exists 63 | self.now = int(time.time()) 64 | self.model = SklearnModel('test_config.yaml') 65 | self.modify_config() 66 | 67 | self.gcs_path = 'gs://ml-pipeline-gen-test/test_model_{}'.format( 68 | self.now) 69 | self.model_dir = os.path.join(self.gcs_path, 'models') 70 | 71 | def tearDown(self): 72 | super(TestSklearnModel, self).tearDown() 73 | self.model.clean_up() 74 | if gfile.exists(self.gcs_path): 75 | gfile.rmtree(self.gcs_path) 76 | 77 | def test_cloud_train(self): 78 | """Tests training on CAIP.""" 79 | self.model.generate_files() 80 | self.model.train(tune=False) 81 | 82 | self.assertTrue(gfile.exists(self.model_dir)) 83 | export_path = os.path.join(self.model_dir, '1', 'model.joblib') 84 | self.assertTrue(gfile.exists(export_path)) 85 | 86 | 87 | if __name__ == '__main__': 88 | unittest.main() 89 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | # Lint as: python3 2 | """Utils for testing.""" 3 | import importlib.util 4 | import sys 5 | 6 | 7 | def load_module(name, path): 8 | spec = importlib.util.spec_from_file_location(name, path) 9 | module = importlib.util.module_from_spec(spec) 10 | sys.modules[name] = module 11 | spec.loader.exec_module(module) 12 | return module 13 | -------------------------------------------------------------------------------- /tests/unit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ml-pipeline-generator-python/b0508160c1752a149bdc5c862560387fd9c65d77/tests/unit/__init__.py -------------------------------------------------------------------------------- /tests/unit/examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ml-pipeline-generator-python/b0508160c1752a149bdc5c862560387fd9c65d77/tests/unit/examples/__init__.py 
-------------------------------------------------------------------------------- /tests/unit/examples/sklearn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ml-pipeline-generator-python/b0508160c1752a149bdc5c862560387fd9c65d77/tests/unit/examples/sklearn/__init__.py -------------------------------------------------------------------------------- /tests/unit/examples/sklearn/test_sklearn_model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Unit tests demo scikit-learn model.""" 15 | import argparse 16 | import os 17 | import shutil 18 | import sys 19 | import tempfile 20 | import unittest 21 | 22 | from tests import test_utils 23 | 24 | 25 | class TestModel(unittest.TestCase): 26 | """Tests demo model.""" 27 | 28 | @classmethod 29 | def setUpClass(cls): 30 | super(TestModel, cls).setUpClass() 31 | cls.test_dir = tempfile.mkdtemp() 32 | cls.demo_dir = os.path.join(cls.test_dir, 'demo') 33 | shutil.copytree('examples/sklearn', cls.demo_dir) 34 | 35 | # TODO(humichael) We can't import the model using __import__ because 36 | # several other examples are also adding their demo dirs to sys.path. 37 | # It's very likely the model module that is imported is not the one from 38 | # this test. All examples currently use the same census_preprocess. 39 | # These tests will break if any example uses a different preprocessing 40 | # script. 41 | # We should just mock this. 
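# A possible shape for that mock, assuming unittest.mock (sketch only, not
# wired into this test): stub out the preprocess module before loading the
# model, e.g.
#   fake_preprocess = mock.Mock()
#   fake_preprocess.load_data.return_value = (features, labels, None, None)
#   with mock.patch.dict(sys.modules, {'census_preprocess': fake_preprocess}):
#       ...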
42 | sys.path.append(cls.demo_dir) 43 | sklearn_model = test_utils.load_module( 44 | 'sklearn_model', os.path.join( 45 | cls.demo_dir, 'model', 'sklearn_model.py')) 46 | sklearn_preprocess = test_utils.load_module( 47 | 'sklearn_preprocess', os.path.join( 48 | cls.demo_dir, 'model', 'census_preprocess.py')) 49 | sys.path.remove(cls.demo_dir) 50 | params = argparse.Namespace(C=1.0) 51 | cls.model = sklearn_model.get_model(params) 52 | cls.features, cls.labels, _, _ = sklearn_preprocess.load_data() 53 | 54 | @classmethod 55 | def tearDownClass(cls): 56 | super(TestModel, cls).tearDownClass() 57 | shutil.rmtree(cls.test_dir) 58 | 59 | def setUp(self): 60 | super(TestModel, self).setUp() 61 | self.model = self.__class__.model 62 | self.features = self.__class__.features 63 | self.labels = self.__class__.labels 64 | 65 | def test_get_data(self): 66 | """Checks that there is a label for each feature.""" 67 | self.assertEqual(self.features.shape[0], self.labels.shape[0]) 68 | 69 | def test_get_model(self): 70 | """Checks that the model can be trained and used for predictions.""" 71 | self.model.fit(self.features, self.labels) 72 | preds = self.model.predict(self.features) 73 | self.assertEqual(preds.shape[0], self.labels.shape[0]) 74 | 75 | 76 | if __name__ == '__main__': 77 | unittest.main() 78 | -------------------------------------------------------------------------------- /tests/unit/examples/tensorflow/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ml-pipeline-generator-python/b0508160c1752a149bdc5c862560387fd9c65d77/tests/unit/examples/tensorflow/__init__.py -------------------------------------------------------------------------------- /tests/unit/examples/tensorflow/test_tf_model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Unit tests demo TF model.""" 15 | import argparse 16 | import os 17 | import shutil 18 | import sys 19 | import tempfile 20 | import unittest 21 | 22 | import tensorflow.compat.v1 as tf 23 | 24 | from tests import test_utils 25 | 26 | 27 | class TestModel(tf.test.TestCase): 28 | """Tests TF demo model.""" 29 | 30 | @classmethod 31 | def setUpClass(cls): 32 | super(TestModel, cls).setUpClass() 33 | cls.test_dir = tempfile.mkdtemp() 34 | cls.demo_dir = os.path.join(cls.test_dir, 'demo') 35 | shutil.copytree('examples/tf', cls.demo_dir) 36 | 37 | # TODO(humichael) We can't import the model using __import__ because 38 | # several other examples are also adding their demo dirs to sys.path. 39 | # It's very likely the model module that is imported is not the one from 40 | # this test. All examples currently use the same census_preprocess. 41 | # These tests will break if any example uses a different preprocessing 42 | # script. 43 | # We should just mock this. 
44 | sys.path.append(cls.demo_dir) 45 | tf_model = test_utils.load_module( 46 | 'tf_model', os.path.join(cls.demo_dir, 'model', 'tf_model.py')) 47 | tf_preprocess = test_utils.load_module( 48 | 'tf_preprocess', os.path.join( 49 | cls.demo_dir, 'model', 'census_preprocess.py')) 50 | sys.path.remove(cls.demo_dir) 51 | 52 | cls.features, cls.labels, _, _ = tf_preprocess.load_data() 53 | cls.model = tf_model 54 | 55 | @classmethod 56 | def tearDownClass(cls): 57 | super(TestModel, cls).tearDownClass() 58 | shutil.rmtree(cls.test_dir) 59 | 60 | # pylint: disable=g-import-not-at-top 61 | def setUp(self): 62 | super(TestModel, self).setUp() 63 | self.model = self.__class__.model 64 | self.features = self.__class__.features 65 | self.labels = self.__class__.labels 66 | 67 | def test_get_data(self): 68 | """Checks that there is a label for each feature.""" 69 | self.assertEqual(self.features.shape[0], self.labels.shape[0]) 70 | 71 | def test_get_model(self): 72 | """Checks that the model can be trained and used for predictions.""" 73 | input_layer = tf.keras.layers.Input(shape=(self.features.shape[1],)) 74 | params = argparse.Namespace(first_layer_size=50, num_layers=5) 75 | predictions = self.model.get_model(input_layer, params) 76 | 77 | model = tf.keras.models.Model(inputs=input_layer, outputs=predictions) 78 | model.compile(optimizer='adam', loss=tf.losses.sigmoid_cross_entropy, 79 | metrics=['accuracy']) 80 | model.fit(self.features, self.labels) 81 | preds = model.predict(self.features) 82 | self.assertEqual(preds.shape[0], self.labels.shape[0]) 83 | 84 | 85 | if __name__ == '__main__': 86 | unittest.main() 87 | -------------------------------------------------------------------------------- /tests/unit/src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ml-pipeline-generator-python/b0508160c1752a149bdc5c862560387fd9c65d77/tests/unit/src/__init__.py -------------------------------------------------------------------------------- /tests/unit/src/test_models.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2020 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Unit tests for models classes.""" 16 | import mock 17 | import os 18 | import shutil 19 | import tempfile 20 | import unittest 21 | 22 | from googleapiclient import discovery 23 | 24 | from ml_pipeline_gen.models import BaseModel 25 | from ml_pipeline_gen.models import SklearnModel 26 | 27 | 28 | class TestBaseModel(unittest.TestCase): 29 | """Tests BaseModel class.""" 30 | 31 | def test_init(self): 32 | """Ensure BaseModel is abstract.""" 33 | with self.assertRaises(TypeError): 34 | BaseModel() 35 | 36 | 37 | class TestSklearnModel(unittest.TestCase): 38 | """Tests SklearnModel class.""" 39 | 40 | @classmethod 41 | @mock.patch.object(discovery, 'build') 42 | def setUpClass(cls, build_mock): 43 | """Copies a demo and instantiates a model.""" 44 | super(TestSklearnModel, cls).setUpClass() 45 | build_mock.return_value = None 46 | cls.cwd = os.getcwd() 47 | cls.test_dir = tempfile.mkdtemp() 48 | cls.demo_dir = os.path.join(cls.test_dir, 'demo') 49 | shutil.copytree('examples/sklearn', cls.demo_dir) 50 | 51 | os.chdir(cls.demo_dir) 52 | cls.config = 'config.yaml.example' 53 | cls.model = SklearnModel(cls.config) 54 | 55 | @classmethod 56 | def tearDownClass(cls): 57 | """Switch back to the original working dir and removes the demo.""" 58 | super(TestSklearnModel, cls).tearDownClass() 59 | os.chdir(cls.cwd) 60 | shutil.rmtree(cls.test_dir) 61 | 62 | def setUp(self): 63 | super(TestSklearnModel, self).setUp() 64 | self.model = self.__class__.model 65 | 66 | def tearDown(self): 67 | super(TestSklearnModel, self).tearDown() 68 | try: 69 | self.__class__.model.clean_up() 70 | except FileNotFoundError: 71 | pass 72 | 73 | def test_generate_files(self): 74 | """Ensures task.py and model.py are created.""" 75 | self.assertFalse(os.path.exists('trainer')) 76 | self.model.generate_files() 77 | self.assertTrue(os.path.exists('trainer')) 78 | trainer_files = os.listdir('trainer') 79 | self.assertIn('task.py', trainer_files) 80 | self.assertIn('model.py', trainer_files) 81 | 82 | @unittest.skip('How to test without running training?') 83 | def test_local_train(self): 84 | """Tests local training.""" 85 | self.model.generate_files() 86 | self.model.train() 87 | model_files = os.listdir('models') 88 | self.assertIn('{}.joblib'.format(self.model.model['name']), model_files) 89 | 90 | # TODO(humichael): Need to spoof CAIP calls to test this. 91 | def test_cloud_train(self): 92 | """Tests training on CAIP.""" 93 | pass 94 | 95 | # TODO(humichael): Need to spoof CAIP calls to test this. 96 | def test_serve(self): 97 | """Tests serving.""" 98 | pass 99 | 100 | 101 | if __name__ == '__main__': 102 | unittest.main() 103 | --------------------------------------------------------------------------------