├── .gitignore ├── .travis.yml ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE ├── MANIFEST.in ├── README.md ├── docs ├── CONFIG.md └── HPTUNE_CONFIG.md ├── examples ├── experimental │ └── kfp-2 │ │ ├── config.yaml.example │ │ ├── demo.py │ │ ├── pipeline_from_config_demo.ipynb │ │ └── user-input │ │ └── preprocess │ │ ├── Dockerfile │ │ ├── build.sh │ │ ├── component.yaml │ │ └── split_train_eval.py ├── getting_started_notebook.ipynb ├── kfp │ ├── bin │ │ └── wi_setup.sh │ ├── config.yaml.example │ ├── demo.py │ ├── hptuning_config.yaml │ └── model │ │ ├── __init__.py │ │ ├── census_preprocess.py │ │ └── tf_model.py ├── sklearn │ ├── config.yaml.example │ ├── demo.py │ ├── hptuning_config.yaml │ └── model │ │ ├── __init__.py │ │ ├── census_preprocess.py │ │ └── sklearn_model.py ├── taxi │ ├── sklearn │ │ ├── config.yaml.example │ │ ├── demo.py │ │ ├── hptuning_config.yaml │ │ └── model │ │ │ ├── __init__.py │ │ │ ├── sklearn_model.py │ │ │ └── taxi_preprocess.py │ ├── tf │ │ ├── config.yaml.example │ │ ├── demo.py │ │ ├── hptuning_config.yaml │ │ └── model │ │ │ ├── __init__.py │ │ │ ├── taxi_preprocess.py │ │ │ └── tf_model.py │ └── xgb │ │ ├── config.yaml.example │ │ ├── demo.py │ │ ├── hptuning_config.yaml │ │ └── model │ │ ├── __init__.py │ │ ├── taxi_preprocess.py │ │ └── xgb_model.py ├── tf │ ├── config.yaml.example │ ├── demo.py │ ├── hptuning_config.yaml │ └── model │ │ ├── __init__.py │ │ ├── census_preprocess.py │ │ └── tf_model.py └── xgboost │ ├── config.yaml.example │ ├── demo.py │ ├── hptuning_config.yaml │ └── model │ ├── __init__.py │ ├── census_preprocess.py │ └── xgboost_model.py ├── ml_pipeline_gen ├── __init__.py ├── experimental │ ├── component_lib.py │ └── component_spec.yaml ├── models.py ├── parsers.py ├── pipelines.py ├── static │ ├── bin │ │ ├── cleanup.sh │ │ └── run.local_train.sh │ ├── orchestration │ │ ├── __init__.py │ │ └── components │ │ │ └── list_blobs.yaml │ └── trainer │ │ ├── __init__.py │ │ └── utils.py └── templates │ ├── experimental │ ├── example_pipeline.ipynb │ ├── get_tuned_params │ │ ├── Dockerfile │ │ ├── build.sh │ │ ├── component.yaml │ │ └── get_tuned_params.py │ ├── hptune │ │ ├── Dockerfile │ │ ├── build.sh │ │ ├── component.yaml │ │ └── hptune.sh │ ├── hptuning_config.yaml │ └── kfp_pipeline_from_config.py │ ├── kfp_pipeline.py │ ├── setup.py │ ├── sklearn_inputs.py │ ├── sklearn_model.py │ ├── sklearn_task.py │ ├── tf_inputs.py │ ├── tf_model.py │ ├── tf_task.py │ ├── xgboost_inputs.py │ ├── xgboost_model.py │ └── xgboost_task.py ├── setup.py └── tests ├── __init__.py ├── integration ├── fixtures │ └── test_config.yaml └── src │ ├── __init__.py │ └── test_models.py ├── test_utils.py └── unit ├── __init__.py ├── examples ├── __init__.py ├── sklearn │ ├── __init__.py │ └── test_sklearn_model.py └── tensorflow │ ├── __init__.py │ └── test_tf_model.py └── src ├── __init__.py └── test_models.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Virtual envs 2 | venv/* 3 | testenv/ 4 | 5 | # Generated files 6 | *.pyc 7 | models/* 8 | *.egg-info/ 9 | dist/* 10 | build/* 11 | *.tar.gz 12 | config.yaml 13 | trainer/model.py 14 | trainer/task.py 15 | trainer/inputs.py 16 | orchestration/pipeline.py 17 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | branches: 2 | only: 3 | - master 4 | language: python 5 | python: 6 | - "3.6" 7 | - "3.7" 8 | # Tensorflow 1.x does not 
support python 3.8+ 9 | install: 10 | - pip install -e . 11 | script: 12 | - python -m unittest discover -s tests/unit 13 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | We'd love to accept your patches and contributions to this project. There are 4 | just a few small guidelines you need to follow. 5 | 6 | ## Contributor License Agreement 7 | 8 | Contributions to this project must be accompanied by a Contributor License 9 | Agreement. You (or your employer) retain the copyright to your contribution; 10 | this simply gives us permission to use and redistribute your contributions as 11 | part of the project. Head over to <https://cla.developers.google.com/> to see 12 | your current agreements on file or to sign a new one. 13 | 14 | You generally only need to submit a CLA once, so if you've already submitted one 15 | (even if it was for a different project), you probably don't need to do it 16 | again. 17 | 18 | ## Code reviews 19 | 20 | All submissions, including submissions by project members, require review. We 21 | use GitHub pull requests for this purpose. Consult 22 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 23 | information on using pull requests. 24 | 25 | ## Community Guidelines 26 | 27 | This project follows [Google's Open Source Community 28 | Guidelines](https://opensource.google/conduct/). 29 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | FROM python:3.6 16 | COPY . 
/app 17 | WORKDIR /app 18 | RUN pip install -e ".[dev]" 19 | RUN python -m unittest discover -s tests/unit 20 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | graft ml_pipeline_gen/templates 2 | graft ml_pipeline_gen/static 3 | 4 | prune **/experimental 5 | global-exclude *.py[cod] 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ML Pipeline Generator 2 | ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/ml-pipeline-gen) 3 | [![PyPI version](https://badge.fury.io/py/ml-pipeline-gen.svg)](https://badge.fury.io/py/ml-pipeline-gen) 4 | [![Build 5 | Status](https://travis-ci.com/GoogleCloudPlatform/ml-pipeline-generator-python.svg?branch=master)](https://travis-ci.com/GoogleCloudPlatform/ml-pipeline-generator-python) 6 | 7 | ML Pipeline Generator is a tool for generating end-to-end pipelines composed of GCP components, so that users can easily migrate their local ML models onto GCP and quickly start realizing the benefits of the Cloud. 8 | 9 | The following ML frameworks will be supported: 10 | 1. TensorFlow (TF) 11 | 1. Scikit-learn (SKL) 12 | 1. XGBoost (XGB) 13 | 14 | The following backends are currently supported for model training: 15 | 1. [Google Cloud AI Platform](https://cloud.google.com/ai-platform) 16 | 1. [AI Platform Pipelines](https://cloud.google.com/ai-platform/pipelines/docs) (managed Kubeflow Pipelines) 17 | 18 | ## Setup 19 | ### GCP credentials 20 | ```bash 21 | gcloud auth login 22 | gcloud auth application-default login 23 | gcloud config set project [PROJECT_ID] 24 | ``` 25 | 26 | ### Enabling required APIs 27 | 28 | The tool requires the following Google Cloud APIs to be enabled: 29 | 1. [Compute Engine](https://console.cloud.google.com/apis/api/compute.googleapis.com) 30 | 1. [AI Platform Training and Prediction](https://console.cloud.google.com/apis/api/ml.googleapis.com) 31 | 1. [Cloud Storage](https://console.cloud.google.com/apis/api/storage-component.googleapis.com) 32 | 33 | Enable the above APIs by following the links, or run the command below to 34 | enable the APIs for your project. 35 | 36 | ```bash 37 | gcloud services enable ml.googleapis.com \ 38 | compute.googleapis.com \ 39 | storage-component.googleapis.com 40 | ``` 41 | 42 | ### Python environment 43 | ```bash 44 | python3 -m venv venv 45 | source ./venv/bin/activate 46 | pip install ml-pipeline-gen 47 | ``` 48 | 49 | ### Kubeflow Pipelines 50 | Create a Kubeflow Pipelines instance on [AI Platform Pipelines](https://console.cloud.google.com/ai-platform/pipelines). 51 | Once the instance is provisioned, note down the hostname (Dashboard URL). 52 | 53 | ## End-to-end tutorial notebook 54 | The tutorial notebook [here](https://github.com/GoogleCloudPlatform/ml-pipeline-generator-python/blob/master/examples/getting_started_notebook.ipynb) 55 | walks through how a typical user would leverage this solution; it can be run 56 | in a local Jupyter environment, on Cloud AI Platform, or in Colab. 57 | 58 | ## Cloud AI Platform Demo 59 | This demo uses the scikit-learn model in 60 | `examples/sklearn/model/sklearn_model.py` to create a training module to run on 61 | Cloud AI Platform (CAIP). First, make a copy of the `sklearn` example directory. 
62 | 63 | ```bash 64 | cp -r examples/sklearn sklearn-demo 65 | cd sklearn-demo 66 | ``` 67 | 68 | Create a `config.yaml` by using the `config.yaml.example` template. See the 69 | [docs](docs/CONFIG.md) for details on the config parameters. Once the 70 | config file is filled out, run the demo. 71 | 72 | ```bash 73 | python demo.py 74 | ``` 75 | 76 | Running this demo uses the config file to generate a `trainer/` module that is 77 | compatible with CAIP. 78 | 79 | ## Kubeflow Pipelines Demo 80 | This demo orchestrates training and prediction using a TensorFlow model in 81 | `examples/kfp/model/tf_model.py` over Kubeflow Pipelines (hosted on AI Platform 82 | Pipelines). First, make a copy of the `kfp/` example directory. 83 | 84 | ```bash 85 | cp -r examples/kfp kfp-demo 86 | cd kfp-demo 87 | ``` 88 | 89 | Create a `config.yaml` by using the `config.yaml.example` template. See the 90 | [docs](docs/CONFIG.md) for details on the config parameters. Once the 91 | config file is filled out, run the demo. 92 | 93 | ```bash 94 | python demo.py 95 | ``` 96 | 97 | Running this demo uses the config file to generate a `trainer/` module that is 98 | compatible with CAIP. It also generates `orchestration/pipeline.py`, which 99 | compiles a Kubeflow Pipelines pipeline. 100 | 101 | _Note: If you're using a GKE cluster without Workload Identity configured, the 102 | tool provisions Workload Identity for the GKE cluster, which modifies the 103 | dashboard URL. If this occurs, you will need to update your config.yaml with 104 | the new Kubeflow Pipelines URL and rerun the demo._ 105 | 106 | ## Tests 107 | The tests use `unittest`, Python's built-in unit testing framework. By running 108 | `python -m unittest`, the framework performs test discovery to find all tests 109 | within this project. Tests can be run on a more granular level by passing a 110 | directory to test discovery. Read more about `unittest` 111 | [here](https://docs.python.org/3/library/unittest.html). 112 | 113 | Unit tests: 114 | ```bash 115 | python -m unittest discover -s tests/unit 116 | ``` 117 | 118 | Integration tests: 119 | ```bash 120 | python -m unittest discover -s tests/integration 121 | ``` 122 | 123 | ## Input args 124 | The following input args are included by default. Override them by adding them 125 | as inputs in the config file. 126 | 127 | | Arg | Description | 128 | | ------------- | ----- | 129 | | train_path | Dir or bucket containing train data.| 130 | | eval_path | Dir or bucket containing eval data.| 131 | | model_dir | Dir or bucket to save model files. | 132 | | batch_size | Number of rows of data to be fed into the model each iteration. | 133 | | max_steps | The maximum number of iterations to train the model for. | 134 | | learning_rate | Multiplier that controls how much the weights of our network are adjusted with respect to the loss gradient.| 135 | | export_format | File format expected by the exported model at inference time. | 136 | | save_checkpoints_steps | Number of steps to run before saving a model checkpoint. | 137 | | keep_checkpoint_max | Number of model checkpoints to keep. | 138 | | log_step_count_steps | Number of steps to run before logging training performance. | 139 | | eval_steps | Number of steps to use to evaluate the model. | 140 | | early_stopping_steps | Number of steps with no loss decrease before stopping early. 
| 141 | 142 | ## Contribute 143 | To modify the behavior of the library, install `ml-pipeline-gen` in editable mode with dev dependencies: 144 | 145 | ```bash 146 | pip install -e ".[dev]" 147 | ``` 148 | -------------------------------------------------------------------------------- /docs/CONFIG.md: -------------------------------------------------------------------------------- 1 | ### config.yaml schema 2 | 3 | The schema below should be used when preparing a `config.yaml` file for models using the tool. Some parameters are optional and marked as such. 4 | 5 |
 6 | project_id: [project ID]
 7 | bucket_id: [GCS bucket ID]
 8 | region: [GCP region in which to train models on AI Platform]
 9 | cluster_name: [Name of GKE cluster hosting Kubeflow Pipelines]
10 | cluster_zone: [Zone in which GKE cluster is deployed]
11 | scale_tier: [compute specifications for training the model on AI Platform]
12 | runtime_version: [AI Platform Training runtime version]
13 | python_version: [Python version used in the model code for training]
14 | package_name: [name for the source distribution to be uploaded to GCS]
15 | machine_type_pred: [type of virtual machine that AI Platform Prediction uses for the nodes that serve predictions, defaults to mls1-c1-m2]
16 | 
17 | data:
18 | 	schema:
19 | 		- [schema for input & target features in the training data]
20 | 	train: [GCS URL to upload preprocessed training data to]
21 | 	evaluation: [GCS URL to upload preprocessed eval data to]
22 | 	prediction:
23 | 		input_data_paths:
24 | 			- [GCS URLs for prediction input data]
25 | 		input_format: [prediction input format]
26 | 		output_format: [prediction output format]
27 | 
28 | model:
29 | 	name: [unique model name, must start with a letter and only contain letters, numbers, and underscores]
30 | 	path: [local dir path to the model.py file]
31 | 	target: [target feature in training data]
32 | 	metrics: [metrics to evaluate model training on, such as "accuracy"]
33 | 
34 | model_params:
35 | 	input_args: [Any input params to be submitted with the job]
36 | 		arg_name:
37 | 			type: [data type of the arg, such as int]
38 | 			help: [short description of the arg]
39 | 			default: [default value of the arg]
40 | 	hyperparam_config: [optional; local path to the hyperparameter tuning config yaml. See HPTUNE_CONFIG.md for the schema of this config file.]
41 | 	explanation: [optional; explainability features for the training job]
42 | 
43 | orchestration:
44 | 	kubeflow_url: [for KFP backend; URL of preconfigured Kubeflow instance]
45 | 
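For reference, a minimal filled-in config following this schema is sketched below. It is drawn from the `examples/sklearn/config.yaml.example` file in this repo; the project, bucket, and model names are placeholder values to substitute with your own.

```yaml
project_id: my-project            # placeholder
bucket_id: my-bucket              # placeholder
region: "us-central1"
scale_tier: "STANDARD_1"
runtime_version: "1.15"
python_version: "3.7"
package_name: "ml_pipeline_gen"
machine_type_pred: "mls1-c4-m2"

data:
  schema:
    - "age"
    - "income_bracket"
  train: "gs://my-bucket/census_model/data/adult.data.csv"
  evaluation: "gs://my-bucket/census_model/data/adult.test.csv"
  prediction:
    input_data_paths:
      - "gs://my-bucket/census_model/inputs/*"
    input_format: "JSON"
    output_format: "JSON"

model:
  name: census_model               # placeholder
  path: "model.sklearn_model"
  target: "income_bracket"

model_params:
  input_args:
    C:
      type: "float"
      help: "Regularization parameter, must be positive."
      default: 1.0
  hyperparam_config: "hptuning_config.yaml"
```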
 46 | -------------------------------------------------------------------------------- /docs/HPTUNE_CONFIG.md: -------------------------------------------------------------------------------- 1 | ### hptune_config.yaml schema 2 | 3 | The schema below should be used when preparing a `hptune_config.yaml` file for models using the tool. The parameters follow the Cloud AI Platform [HyperparameterSpec](https://cloud.google.com/ai-platform/training/docs/reference/rest/v1/projects.jobs#HyperparameterSpec); some are optional and marked as such. 4 | 5 |
 6 | trainingInput:
 7 | 	hyperparameters:
 8 | 		goal: [the type of goal to use for tuning, MAXIMIZE or MINIMIZE]
 9 | 		params: [the set of parameters to tune]
10 | 			- parameterName: [unique parameter name, e.g. "learning_rate"]
11 | 			  type: [parameter type]
12 | 			  minValue: [min value for the parameter, if DOUBLE or INTEGER type]
13 | 			  maxValue: [max value for the parameter, if DOUBLE or INTEGER type]
14 | 			  scaleType: [optional; how the parameter should be scaled]
15 | 		maxTrials: [optional; how many training trials should be attempted to optimize the specified hyperparameters]
16 | 		maxParallelTrials: [optional; the number of training trials to run concurrently]
17 | 		maxFailedTrials: [optional; the number of failed trials that need to be seen before failing the hyperparameter tuning job]
18 | 		hyperparameterMetricTag: [optional; TensorFlow summary tag name to use for optimizing trials]
19 | 		resumePreviousJobId: [optional; the ID of a previous hyperparameter tuning job to resume]
20 | 		enableTrialEarlyStopping: [optional; whether automatic trial early stopping is enabled for the tuning job]
21 | 		algorithm: [optional; search algorithm to be used]
22 | 
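For reference, the `examples/kfp/hptuning_config.yaml` file in this repo instantiates this schema; a minimal sketch looks like the following.

```yaml
trainingInput:
  hyperparameters:
    hyperparameterMetricTag: accuracy
    goal: MAXIMIZE
    maxTrials: 4
    maxParallelTrials: 2
    enableTrialEarlyStopping: True
    params:
      - parameterName: first_layer_size
        type: INTEGER
        minValue: 50
        maxValue: 500
        scaleType: UNIT_LINEAR_SCALE
```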
23 | -------------------------------------------------------------------------------- /examples/experimental/kfp-2/config.yaml.example: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Config file for AI Pipeline. 16 | 17 | output_package: ./caipa-output 18 | project_id: gcp-demo-2-262319 19 | bucket_id: poc-bucket-0120 20 | region: us-central1 21 | runtime_version: "1.10" 22 | python_version: 3.6 23 | 24 | model: 25 | name: loan_delinq_v1 26 | path: 27 | 28 | # Search Path for pre-built components 29 | github_component_url: https://raw.githubusercontent.com/kubeflow/pipelines/3f4b80127f35e40760eeb1813ce1d3f641502222/components/gcp/ 30 | kfp_deployment_url: https://54f49491f869f31e-dot-us-central2.pipelines.googleusercontent.com 31 | 32 | preprocess: 33 | component: user-input/preprocess 34 | component_args: 35 | - name: project_id 36 | default: '' 37 | - name: dataset_bucket 38 | default: poc-bucket-0120 39 | 40 | hptune: 41 | component: AUTO 42 | config: gs://poc-bucket-0120/hpconfig.yaml 43 | args: 44 | - name: output_dir 45 | default: gs://poc-bucket-0120/hptune 46 | - name: input_bucket 47 | default: gs://poc-bucket-0120 48 | - name: eval_steps 49 | default: 10 50 | - name: train_examples 51 | default: 200 52 | 53 | get_tuned_params: 54 | component: AUTO 55 | 56 | train: 57 | python_module: trainer.task 58 | python_package: gs://poc-bucket-0120/trainer.tar.gz 59 | model_out_prefix: /export/exporter 60 | component: ml_engine/train 61 | args: 62 | - name: output_dir 63 | default: gs://poc-bucket-0120/train 64 | - name: input_bucket 65 | default: gs://poc-bucket-0120 66 | - name: eval_steps 67 | default: 10 68 | - name: train_examples 69 | default: 2000 70 | 71 | deploy: 72 | component: ml_engine/deploy 73 | model_id: Loand_Delinq 74 | version_id: v1.0 75 | 76 | -------------------------------------------------------------------------------- /examples/experimental/kfp-2/demo.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2020 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Demo for KubeFlow Pipelines.""" 16 | from ml_pipeline_gen.pipelines import KfpPipeline 17 | 18 | 19 | def main(): 20 | config = './config.yaml' 21 | pipeline = KfpPipeline(config=config) 22 | # Review the components 23 | pipeline.list_components() 24 | # define pipeline structure 25 | preprocess = pipeline.add_component('preprocess') 26 | hptune = pipeline.add_component('hptune', parent=preprocess) 27 | get_best_params = pipeline.add_component('get_tuned_params', parent=hptune) 28 | train = pipeline.add_component('train', parent=get_best_params) 29 | deploy = pipeline.add_component('deploy', parent=train) 30 | 31 | pipeline.print_structure() 32 | pipeline.generate_pipeline_from_config() 33 | 34 | 35 | if __name__ == '__main__': 36 | main() 37 | -------------------------------------------------------------------------------- /examples/experimental/kfp-2/pipeline_from_config_demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from ml_pipeline_gen.pipelines import KfpPipeline" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 4, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "config = \"./config.yaml\"\n", 19 | "pipeline = KfpPipeline(config=config)\n", 20 | "#pipeline.print_structure()" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 5, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "name": "stdout", 30 | "output_type": "stream", 31 | "text": [ 32 | "['preprocess', 'hptune', 'get_tuned_params', 'train', 'deploy']\n" 33 | ] 34 | } 35 | ], 36 | "source": [ 37 | "# Review the components\n", 38 | "pipeline.list_components()" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 6, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "# define pipeline structure\n", 48 | "preprocess = pipeline.add_component('preprocess')\n", 49 | "hptune = pipeline.add_component('hptune', parent=preprocess)\n", 50 | "get_best_params= pipeline.add_component('get_tuned_params', parent=hptune)\n", 51 | "train = pipeline.add_component('train', parent=get_best_params)\n", 52 | "deploy = pipeline.add_component('deploy', parent=train)" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 7, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "# Generate kubeflow pipeline\n", 62 | "pipeline.generate_pipeline_from_config()" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [] 71 | } 72 | ], 73 | "metadata": { 74 | "kernelspec": { 75 | "display_name": "Python 3", 76 | "language": "python", 77 | "name": "python3" 78 | }, 79 | "language_info": { 80 | "codemirror_mode": { 81 | "name": "ipython", 82 | "version": 3 83 | }, 84 | "file_extension": ".py", 85 | "mimetype": "text/x-python", 86 | "name": "python", 87 | "nbconvert_exporter": "python", 88 | "pygments_lexer": "ipython3", 89 | "version": "3.6.10" 90 | } 91 | }, 92 | "nbformat": 4, 93 | "nbformat_minor": 4 94 | } 95 | -------------------------------------------------------------------------------- /examples/experimental/kfp-2/user-input/preprocess/Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright 2018 The Kubeflow Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this 
file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | FROM gcr.io/ml-pipeline/ml-pipeline-dataflow-tft:latest 16 | 17 | RUN pip install -U scipy 18 | 19 | RUN pip install -U numpy 20 | 21 | RUN pip install -U scikit-learn 22 | 23 | RUN pip install pandas 24 | 25 | RUN pip install --upgrade google-cloud-storage 26 | 27 | COPY . / 28 | -------------------------------------------------------------------------------- /examples/experimental/kfp-2/user-input/preprocess/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | # Copyright 2018 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | if [ -z "$1" ]; then 18 | PROJECT_ID=$(gcloud config config-helper --format "value(configuration.properties.core.project)") 19 | else 20 | PROJECT_ID=$1 21 | fi 22 | 23 | if [ -z "$2" ]; then 24 | TAG_NAME="latest" 25 | else 26 | TAG_NAME="$2" 27 | fi 28 | 29 | CONTAINER_NAME=loan-pipeline-trainevalsplit 30 | 31 | docker build -t ${CONTAINER_NAME} . 32 | docker tag ${CONTAINER_NAME} gcr.io/${PROJECT_ID}/${CONTAINER_NAME}:${TAG_NAME} 33 | docker push gcr.io/${PROJECT_ID}/${CONTAINER_NAME}:${TAG_NAME} 34 | -------------------------------------------------------------------------------- /examples/experimental/kfp-2/user-input/preprocess/component.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # 15 | # Component Descriptor for Split-train-eval 16 | name: Split-train-Eval - Preprocess 17 | description: Splits a given input.csv to train and eval csv files 18 | 19 | inputs: 20 | - {name: project_id, type: String} 21 | - {name: dataset_bucket, type: String} 22 | 23 | #outputs: 24 | #- {name: train, type: XGBoost model, help: Trained XGBoost model} 25 | 26 | implementation: 27 | container: 28 | image: gcr.io/gcp-demo-2-262319/loan-pipeline-trainevalsplit:latest 29 | command: [ 30 | python, /split_train_eval.py, 31 | --project_id, {inputValue: project_id}, 32 | --dataset_bucket, {inputValue: dataset_bucket}, 33 | ] 34 | -------------------------------------------------------------------------------- /examples/experimental/kfp-2/user-input/preprocess/split_train_eval.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2020 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """.""" 16 | import pandas as pd 17 | from sklearn.model_selection import train_test_split 18 | from sklearn.utils import shuffle 19 | from io import BytesIO 20 | from google.cloud import storage 21 | import argparse 22 | 23 | 24 | def obtain_train_eval(project_id, bucket_name): 25 | """.""" 26 | # # All of the data is in a file called Step10_Final_dataset.csv 27 | print('reading the data file from gcs...') 28 | print('Project-ID: %s ' %(project_id)) 29 | print('Bucket-ID: %s ' %(bucket_name)) 30 | 31 | 32 | # The following was derived from the contents of this reply: 33 | # https://stackoverflow.com/a/50201179 34 | storage_client = storage.Client(project=project_id, credentials=None) 35 | bucket = storage_client.get_bucket(bucket_name) 36 | blob = bucket.blob('input/Step10_Final_dataset.csv') 37 | 38 | byte_stream = BytesIO() 39 | blob.download_to_file(byte_stream) 40 | byte_stream.seek(0) 41 | df = pd.read_csv(byte_stream) 42 | 43 | # We need to rearrange the columns below just as they shall be 44 | # expected by the estimator 45 | print('rearranging data...') 46 | key_column = 'LOAN_SEQUENCE_NUMBER' 47 | label_column = 'TARGET' 48 | bool_cols = [] 49 | int_cols = ['credit_score', 50 | 'mortgage_insurance_percentage', 51 | 'Number_of_units', 52 | 'cltv', 53 | 'original_upb', 54 | 'ltv', 55 | 'original_loan_term', 56 | 'number_of_borrowers', 57 | 'min_CURRENT_DEFERRED_UPB'] 58 | str_cols = ['first_time_home_buyer_flag', 59 | 'occupancy_status', 60 | 'channel', 61 | 'property_state', 62 | 'property_type', 63 | 'loan_purpose', 64 | 'seller_name', 65 | 'service_name'] 66 | # str_nuniques = [2, 3, 3, 52, 5, 2, 20, 24] 67 | float_cols = ['metropolitan_division', 68 | 'original_interest_rate', 69 | 'min_CURRENT_ACTUAL_UPB', 70 | 'max_CURRENT_ACTUAL_UPB', 71 | 'Range_CURRENT_ACTUAL_UPB', 72 | 'stdev_CURRENT_ACTUAL_UPB', 73 | 'mode_CURRENT_ACTUAL_UPB', 74 | 'average_CURRENT_ACTUAL_UPB', 75 | 'max_CURRENT_DEFERRED_UPB', 76 | 'Range_CURRENT_DEFERRED_UPB', 77 | 'mode_CURRENT_DEFERRED_UPB', 
78 | 'average_CURRENT_DEFERRED_UPB', 79 | 'stdev_CURRENT_DEFERRED_UPB', 80 | 'min_CURRENT_INTEREST_RATE', 81 | 'max_CURRENT_INTEREST_RATE', 82 | 'Range_CURRENT_INTEREST_RATE', 83 | 'mode_CURRENT_INTEREST_RATE', 84 | 'stdev_CURRENT_INTEREST_RATE', 85 | 'average_CURRENT_INTEREST_RATE', 86 | 'PREFINAL_LOAN_DELINQUENCY_STATUS', 87 | 'frequency_0', 88 | 'frequency_1', 89 | 'frequency_2', 90 | 'frequency_3', 91 | 'Recency_0', 92 | 'Recency_1', 93 | 'Recency_2', 94 | 'Recency_3'] 95 | # DEFAULTS = [[''] for col in bool_cols] + \ 96 | # [[0] for col in int_cols] + \ 97 | # [[0.0] for col in float_cols] + \ 98 | # [[''] for col in str_cols] + [[''], [0]] 99 | csv_columns = bool_cols + int_cols + float_cols + \ 100 | str_cols + [key_column, label_column] 101 | traindata = df[csv_columns] 102 | 103 | # Here, we'll split with a small test size so as to 104 | # allow our model to train on more data 105 | print('splitting...') 106 | x_train, x_test, y_train, y_test = train_test_split( 107 | traindata.drop(label_column, axis=1), traindata[label_column], 108 | stratify=traindata[label_column], shuffle=True, test_size=0.1) 109 | traindf = pd.concat([x_train, y_train], axis=1) 110 | evaldf = pd.concat([x_test, y_test], axis=1) 111 | 112 | alld = pd.concat([traindf, evaldf]) 113 | strcols = [col for col in alld.columns if alld[col].dtype == 'object'] 114 | if key_column in strcols: 115 | strcols.remove(key_column) 116 | alld = pd.get_dummies(alld, columns=strcols) 117 | 118 | divline = traindf.shape[0] 119 | traindf_wdummies = alld.iloc[:divline, :] 120 | # not necessary only cmle but can be used to 121 | # test performance if so desired 122 | evaldf_wdummies = alld.iloc[divline:, :] 123 | del alld 124 | 125 | print('Undersample for XG Boost....') 126 | 127 | traindfu_wdummies = pd.concat([ 128 | traindf_wdummies[traindf_wdummies[label_column] == 0].sample( 129 | frac=0.01), 130 | traindf_wdummies[traindf_wdummies[label_column] == 1].sample( 131 | frac=0.55), 132 | traindf_wdummies[traindf_wdummies[label_column] > 1]]) 133 | traindfu_wdummies = shuffle(traindfu_wdummies) 134 | 135 | # traindfu_wdummies.drop(key_column, axis=1) 136 | # .to_csv('xgb_train.csv', index=False) 137 | # evaldf_wdummies.drop([key_column,label_column], axis=1) 138 | # .to_csv('xgb_eval.csv', index=False) 139 | 140 | # Since the results are small enough to fit in a single 141 | # well-provisioned VM, we'll write the results to csv files locally 142 | # then move them to gcs so we have two copies to work 143 | # with as we please 144 | 145 | print('writing tf model files...') 146 | write_file( 147 | storage_client, 148 | traindf[csv_columns], 149 | bucket_name, 150 | 'train.csv', 151 | header=False) 152 | write_file( 153 | storage_client, 154 | evaldf[csv_columns], 155 | bucket_name, 156 | 'eval.csv', 157 | header=False) 158 | 159 | # traindf[csv_columns].to_csv('train.csv', index=False, header=False) 160 | # evaldf[csv_columns].to_csv('eval.csv', index=False, header=False) 161 | 162 | print('writing XG Boost model files...') 163 | write_file( 164 | storage_client, 165 | traindfu_wdummies.drop(key_column, axis=1), 166 | bucket_name, 167 | 'xgb_train.csv', 168 | header=True) 169 | write_file( 170 | storage_client, 171 | evaldf_wdummies.drop([key_column, label_column], axis=1), 172 | bucket_name, 173 | 'xgb_eval.csv', 174 | header=True) 175 | 176 | with open('./output.txt', 'w') as output_file: 177 | output_file.write(bucket_name) 178 | print('Done!') 179 | 180 | 181 | def write_file(storage_client, 182 | df, 183 | bucket_name, 184 | 
destination_file_name, 185 | header): 186 | """Write a blob from the bucket.""" 187 | df_str = df.to_csv(index=False, header=header) 188 | # storage_client = storage.Client() 189 | bucket = storage_client.get_bucket(bucket_name) 190 | blob = bucket.blob('output/' + destination_file_name) 191 | blob.upload_from_string(df_str) 192 | 193 | 194 | if __name__ == '__main__': 195 | parser = argparse.ArgumentParser() 196 | parser.add_argument('--project_id', 197 | type=str, 198 | required=True, 199 | help='The GCP project_id containing the source file') 200 | parser.add_argument('--dataset_bucket', 201 | type=str, 202 | required=True, 203 | help='Bucket to store outputs.') 204 | args = parser.parse_args() 205 | 206 | obtain_train_eval(args.project_id, args.dataset_bucket) 207 | -------------------------------------------------------------------------------- /examples/kfp/bin/wi_setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright 2020 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | # Script to set up Google service accounts and workload identity bindings for a 18 | # Kubeflow Pipelines (KFP) standalone deployment. 19 | # 20 | # The script checks if the GKE cluster has Workload Identity enabled and 21 | # configured with a custom label, and if not, enables it and updates the label. 22 | # 23 | # Adapted for ML Pipeline Generator from https://github.com/kubeflow/pipelines/blob/master/manifests/kustomize/gcp-workload-identity-setup.sh 24 | # 25 | # What the script configures: 26 | # 1. Workload Identity for the cluster. 27 | # 2. Google service accounts (GSAs): $SYSTEM_GSA and $USER_GSA. 28 | # 3. Service account IAM policy bindings. 29 | # 4. Kubernetes service account annotations. 30 | # 31 | # Note: Since the node pool is updated with WI, a new KFP hostname is generated. 32 | # 33 | # Requirements: 34 | # 1. gcloud set up in the environment calling the script 35 | # 2. KFP is deployed on a GKE cluster 36 | set -e 37 | 38 | # Cluster vars 39 | PROJECT_ID=$1 40 | CLUSTER_NAME=$2 41 | ZONE=$3 42 | NAMESPACE=$4 43 | 44 | echo "Workload Identity has not been provisioned for "${CLUSTER_NAME}" ("${ZONE}"), enabling it now..." 
45 | 46 | # Google Service Account (GSA) 47 | SYSTEM_GSA=$CLUSTER_NAME-kfp-system 48 | USER_GSA=$CLUSTER_NAME-kfp-user 49 | 50 | # Kubernetes Service Account (KSA) 51 | SYSTEM_KSA=(ml-pipeline-ui ml-pipeline-visualizationserver) 52 | USER_KSA=(pipeline-runner default) 53 | 54 | gcloud container clusters get-credentials $CLUSTER_NAME \ 55 | --zone=$ZONE 56 | 57 | gcloud container clusters update $CLUSTER_NAME \ 58 | --zone=$ZONE \ 59 | --workload-pool="${PROJECT_ID}".svc.id.goog 60 | 61 | gcloud beta container node-pools update default-pool \ 62 | --cluster=$CLUSTER_NAME \ 63 | --zone=$ZONE \ 64 | --max-surge-upgrade=3 \ 65 | --max-unavailable-upgrade=0 66 | 67 | gcloud container node-pools update default-pool \ 68 | --cluster=$CLUSTER_NAME \ 69 | --zone=$ZONE \ 70 | --workload-metadata=GKE_METADATA 71 | 72 | echo "Creating Google Service Accounts..." 73 | function create_gsa_if_not_present { 74 | local name=${1} 75 | local already_present=$(gcloud iam service-accounts list --filter='name:'$name'' --format='value(name)') 76 | if [ -n "$already_present" ]; then 77 | echo "Service account $name already exists" 78 | else 79 | gcloud iam service-accounts create $name 80 | fi 81 | } 82 | 83 | create_gsa_if_not_present $SYSTEM_GSA 84 | create_gsa_if_not_present $USER_GSA 85 | 86 | # Add iam policy bindings to grant project permissions to these GSAs. 87 | gcloud projects add-iam-policy-binding $PROJECT_ID \ 88 | --member="serviceAccount:$SYSTEM_GSA@$PROJECT_ID.iam.gserviceaccount.com" \ 89 | --role="roles/editor" 90 | gcloud projects add-iam-policy-binding $PROJECT_ID \ 91 | --member="serviceAccount:$USER_GSA@$PROJECT_ID.iam.gserviceaccount.com" \ 92 | --role="roles/editor" 93 | 94 | # Bind KSA to GSA through workload identity. 95 | function bind_gsa_and_ksa { 96 | local gsa=${1} 97 | local ksa=${2} 98 | 99 | gcloud iam service-accounts add-iam-policy-binding $gsa@$PROJECT_ID.iam.gserviceaccount.com \ 100 | --member="serviceAccount:$PROJECT_ID.svc.id.goog[$NAMESPACE/$ksa]" \ 101 | --role="roles/iam.workloadIdentityUser" \ 102 | > /dev/null 103 | 104 | kubectl annotate serviceaccount \ 105 | --namespace $NAMESPACE \ 106 | --overwrite \ 107 | $ksa iam.gke.io/gcp-service-account=$gsa@$PROJECT_ID.iam.gserviceaccount.com 108 | 109 | echo "* Bound KSA $ksa to GSA $gsa" 110 | } 111 | 112 | echo "Binding each kfp system KSA to $SYSTEM_GSA" 113 | for ksa in ${SYSTEM_KSA[@]}; do 114 | bind_gsa_and_ksa $SYSTEM_GSA $ksa 115 | done 116 | 117 | echo "Binding each kfp user KSA to $USER_GSA" 118 | for ksa in ${USER_KSA[@]}; do 119 | bind_gsa_and_ksa $USER_GSA $ksa 120 | done 121 | 122 | gcloud container clusters update $CLUSTER_NAME \ 123 | --zone=$ZONE \ 124 | --update-labels mlpg_wi_auth=true 125 | 126 | RED='\033[0;31m' 127 | COLOR_RESET='\033[0m' 128 | echo -e "${RED}Workload Identity has been enabled, and KFP dashboard URL has been updated. Please update the hostname in config.yaml for future runs.${COLOR_RESET}" 129 | -------------------------------------------------------------------------------- /examples/kfp/config.yaml.example: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Config file for ML Pipeline Generator. 16 | 17 | project_id: [PROJECT ID] 18 | bucket_id: [BUCKET ID] 19 | region: "us-central1" 20 | cluster_name: [GKE CLUSTER NAME] 21 | cluster_zone: [GKE CLUSTER ZONE] 22 | scale_tier: "STANDARD_1" 23 | runtime_version: "1.15" 24 | python_version: "3.7" 25 | package_name: "ml_pipeline_gen" 26 | machine_type_pred: "n1-standard-4" 27 | 28 | data: 29 | schema: 30 | - "age" 31 | - "workclass" 32 | - "education_num" 33 | - "marital_status" 34 | - "occupation" 35 | - "relationship" 36 | - "race" 37 | - "capital_gain" 38 | - "capital_loss" 39 | - "hours_per_week" 40 | - "native_country" 41 | - "income_bracket" 42 | train: "gs://[BUCKET ID]/[MODEL NAME]/data/adult.data.csv" 43 | evaluation: "gs://[BUCKET ID]/[MODEL NAME]/data/adult.test.csv" 44 | prediction: 45 | input_data_paths: 46 | - "gs://[BUCKET ID]/[MODEL NAME]/inputs/*" 47 | input_format: "JSON" 48 | output_format: "JSON" 49 | 50 | model: 51 | # Name must start with a letter and only contain letters, numbers, and 52 | # underscores. 53 | name: [MODEL NAME] 54 | path: "model.tf_model" 55 | target: "income_bracket" 56 | metrics: 57 | - "accuracy" 58 | 59 | model_params: 60 | input_args: 61 | first_layer_size: 62 | type: "int" 63 | help: "Size of the NN first layer." 64 | default: 50 65 | num_layers: 66 | type: "int" 67 | help: "Number of layers in the NN." 68 | default: 5 69 | max_steps: 70 | default: 1000 71 | # Relative path. 72 | hyperparam_config: "hptuning_config.yaml" 73 | explain_output: 74 | explain_type: "sampledShapleyAttribution" 75 | explain_param: 76 | name: "numPaths" 77 | value: 40 78 | 79 | orchestration: 80 | host: [KFP DASHBOARD URL] 81 | -------------------------------------------------------------------------------- /examples/kfp/demo.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2020 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Demo for KubeFlow Pipelines.""" 16 | import json 17 | import os 18 | 19 | from ml_pipeline_gen.models import TFModel 20 | from ml_pipeline_gen.pipelines import KfpPipeline 21 | from model.census_preprocess import load_data 22 | 23 | 24 | def _upload_data_to_gcs(model): 25 | """Calls the preprocessing fn which uploads train/eval data to GCS.""" 26 | load_data(model.data["train"], model.data["evaluation"]) 27 | 28 | 29 | # TODO(humichael): See if there's a way to support csv batch predicts. 
30 | def _upload_input_data_to_gcs(model, data): 31 | input_path = "tf_input_data.json" 32 | with open(input_path, "w+") as f: 33 | for features in data: 34 | f.write(json.dumps(features) + "\n") 35 | model.upload_pred_input_data(input_path) 36 | os.remove(input_path) 37 | 38 | 39 | # pylint: disable=g-import-not-at-top 40 | def main(): 41 | config = "config.yaml" 42 | model = TFModel(config) 43 | model.generate_files() 44 | _upload_data_to_gcs(model) 45 | pipeline = KfpPipeline(model) 46 | 47 | # preprocess and upload dataset to expected location. 48 | load_data(model.data["train"], model.data["evaluation"]) 49 | 50 | # define pipeline structure 51 | p = pipeline.add_train_component() 52 | pipeline.add_deploy_component(parent=p) 53 | pipeline.add_predict_component(parent=p) 54 | pipeline.print_structure() 55 | 56 | pipeline.generate_pipeline() 57 | 58 | # Create batch prediction data in GCS. 59 | pred_input = [{ 60 | "age": 0.02599666, 61 | "workclass": 6, 62 | "education_num": 1.1365801, 63 | "marital_status": 4, 64 | "occupation": 0, 65 | "relationship": 1, 66 | "race": 4, 67 | "capital_gain": 0.14693314, 68 | "capital_loss": -0.21713187, 69 | "hours_per_week": -0.034039237, 70 | "native_country": 38, 71 | "income_bracket": 0, 72 | }] 73 | _upload_input_data_to_gcs(model, pred_input) 74 | 75 | # Run the pipeline. 76 | # pylint: disable=import-outside-toplevel 77 | from orchestration import pipeline as kfp_pipeline 78 | kfp_pipeline.main() 79 | 80 | 81 | if __name__ == "__main__": 82 | main() 83 | -------------------------------------------------------------------------------- /examples/kfp/hptuning_config.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | trainingInput: 15 | hyperparameters: 16 | hyperparameterMetricTag: accuracy 17 | goal: MAXIMIZE 18 | maxTrials: 4 19 | maxParallelTrials: 2 20 | enableTrialEarlyStopping: True 21 | params: 22 | - parameterName: first_layer_size 23 | type: INTEGER 24 | minValue: 50 25 | maxValue: 500 26 | scaleType: UNIT_LINEAR_SCALE 27 | - parameterName: num_layers 28 | type: INTEGER 29 | minValue: 1 30 | maxValue: 15 31 | scaleType: UNIT_LINEAR_SCALE 32 | -------------------------------------------------------------------------------- /examples/kfp/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ml-pipeline-generator-python/b0508160c1752a149bdc5c862560387fd9c65d77/examples/kfp/model/__init__.py -------------------------------------------------------------------------------- /examples/kfp/model/census_preprocess.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2019 Google Inc. All Rights Reserved. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Preprocesses the census dataset for a simple TF classifier. 16 | 17 | This example comes from the cloudml-samples keras demo. 18 | github.com/GoogleCloudPlatform/cloudml-samples/blob/master/census/tf-keras 19 | """ 20 | from __future__ import absolute_import 21 | from __future__ import division 22 | from __future__ import print_function 23 | 24 | import os 25 | from six.moves import urllib 26 | import tempfile 27 | 28 | import numpy as np 29 | import pandas as pd 30 | import tensorflow.compat.v1 as tf 31 | 32 | 33 | DATA_DIR = os.path.join(tempfile.gettempdir(), "census_data") 34 | DATA_URL = ("https://storage.googleapis.com/cloud-samples-data/ai-platform" 35 | + "/census/data/") 36 | TRAINING_FILE = "adult.data.csv" 37 | EVAL_FILE = "adult.test.csv" 38 | TRAINING_URL = os.path.join(DATA_URL, TRAINING_FILE) 39 | EVAL_URL = os.path.join(DATA_URL, EVAL_FILE) 40 | 41 | _CSV_COLUMNS = [ 42 | "age", "workclass", "fnlwgt", "education", "education_num", 43 | "marital_status", "occupation", "relationship", "race", "gender", 44 | "capital_gain", "capital_loss", "hours_per_week", "native_country", 45 | "income_bracket", 46 | ] 47 | _LABEL_COLUMN = "income_bracket" 48 | UNUSED_COLUMNS = ["fnlwgt", "education", "gender"] 49 | 50 | _CATEGORICAL_TYPES = { 51 | "workclass": pd.api.types.CategoricalDtype(categories=[ 52 | "Federal-gov", "Local-gov", "Never-worked", "Private", "Self-emp-inc", 53 | "Self-emp-not-inc", "State-gov", "Without-pay" 54 | ]), 55 | "marital_status": pd.api.types.CategoricalDtype(categories=[ 56 | "Divorced", "Married-AF-spouse", "Married-civ-spouse", 57 | "Married-spouse-absent", "Never-married", "Separated", "Widowed" 58 | ]), 59 | "occupation": pd.api.types.CategoricalDtype([ 60 | "Adm-clerical", "Armed-Forces", "Craft-repair", "Exec-managerial", 61 | "Farming-fishing", "Handlers-cleaners", "Machine-op-inspct", 62 | "Other-service", "Priv-house-serv", "Prof-specialty", "Protective-serv", 63 | "Sales", "Tech-support", "Transport-moving" 64 | ]), 65 | "relationship": pd.api.types.CategoricalDtype(categories=[ 66 | "Husband", "Not-in-family", "Other-relative", "Own-child", "Unmarried", 67 | "Wife" 68 | ]), 69 | "race": pd.api.types.CategoricalDtype(categories=[ 70 | "Amer-Indian-Eskimo", "Asian-Pac-Islander", "Black", "Other", "White" 71 | ]), 72 | "native_country": pd.api.types.CategoricalDtype(categories=[ 73 | "Cambodia", "Canada", "China", "Columbia", "Cuba", "Dominican-Republic", 74 | "Ecuador", "El-Salvador", "England", "France", "Germany", "Greece", 75 | "Guatemala", "Haiti", "Holand-Netherlands", "Honduras", "Hong", 76 | "Hungary", "India", "Iran", "Ireland", "Italy", "Jamaica", "Japan", 77 | "Laos", "Mexico", "Nicaragua", "Outlying-US(Guam-USVI-etc)", "Peru", 78 | "Philippines", "Poland", "Portugal", "Puerto-Rico", "Scotland", "South", 79 | "Taiwan", "Thailand", "Trinadad&Tobago", "United-States", "Vietnam", 80 | "Yugoslavia" 81 | ]), 82 | "income_bracket": 
pd.api.types.CategoricalDtype(categories=[ 83 | "<=50K", ">50K" 84 | ]) 85 | } 86 | 87 | 88 | def _download_and_clean_file(filename, url): 89 | """Downloads data from url, and makes changes to match the CSV format. 90 | 91 | The CSVs may use spaces after the comma delimiters (non-standard) or include 92 | rows which do not represent well-formed examples. This function strips out 93 | some of these problems. 94 | 95 | Args: 96 | filename: filename to save url to 97 | url: URL of resource to download 98 | """ 99 | temp_file, _ = urllib.request.urlretrieve(url) 100 | with tf.io.gfile.GFile(temp_file, "r") as temp_file_object: 101 | with tf.io.gfile.GFile(filename, "w") as file_object: 102 | for line in temp_file_object: 103 | line = line.strip() 104 | line = line.replace(", ", ",") 105 | if not line or "," not in line: 106 | continue 107 | if line[-1] == ".": 108 | line = line[:-1] 109 | line += "\n" 110 | file_object.write(line) 111 | tf.io.gfile.remove(temp_file) 112 | 113 | 114 | def download(data_dir): 115 | """Downloads census data if it is not already present. 116 | 117 | Args: 118 | data_dir: directory where we will access/save the census data 119 | 120 | Returns: 121 | A tuple (training_file_path, eval_file_path) of local file paths. 122 | """ 123 | tf.io.gfile.makedirs(data_dir) 124 | 125 | training_file_path = os.path.join(data_dir, TRAINING_FILE) 126 | if not tf.io.gfile.exists(training_file_path): 127 | _download_and_clean_file(training_file_path, TRAINING_URL) 128 | 129 | eval_file_path = os.path.join(data_dir, EVAL_FILE) 130 | if not tf.io.gfile.exists(eval_file_path): 131 | _download_and_clean_file(eval_file_path, EVAL_URL) 132 | 133 | return training_file_path, eval_file_path 134 | 135 | 136 | def upload(train_df, eval_df, train_path, eval_path): 137 | train_df.to_csv(os.path.join(os.path.dirname(train_path), TRAINING_FILE), 138 | index=False, header=False) 139 | eval_df.to_csv(os.path.join(os.path.dirname(eval_path), EVAL_FILE), 140 | index=False, header=False) 141 | 142 | 143 | def preprocess(dataframe): 144 | """Converts categorical features to numeric. Removes unused columns. 145 | 146 | Args: 147 | dataframe: Pandas dataframe with raw data 148 | 149 | Returns: 150 | Dataframe with preprocessed data 151 | """ 152 | dataframe = dataframe.drop(columns=UNUSED_COLUMNS) 153 | 154 | # Convert integer valued (numeric) columns to floating point 155 | numeric_columns = dataframe.select_dtypes(["int64"]).columns 156 | dataframe[numeric_columns] = dataframe[numeric_columns].astype("float32") 157 | 158 | # Convert categorical columns to numeric 159 | cat_columns = dataframe.select_dtypes(["object"]).columns 160 | dataframe[cat_columns] = dataframe[cat_columns].apply( 161 | lambda x: x.astype(_CATEGORICAL_TYPES[x.name])) 162 | dataframe[cat_columns] = dataframe[cat_columns].apply( 163 | lambda x: x.cat.codes) 164 | return dataframe 165 | 166 | 167 | def standardize(dataframe): 168 | """Scales numerical columns using their means and standard deviation. 169 | 170 | Args: 171 | dataframe: Pandas dataframe 172 | 173 | Returns: 174 | Input dataframe with the numerical columns scaled to z-scores 175 | """ 176 | dtypes = list(zip(dataframe.dtypes.index, map(str, dataframe.dtypes))) 177 | for column, dtype in dtypes: 178 | if dtype == "float32": 179 | dataframe[column] -= dataframe[column].mean() 180 | dataframe[column] /= dataframe[column].std() 181 | return dataframe 182 | 183 | 184 | def load_data(train_path="", eval_path=""): 185 | """Loads data into preprocessed (train_x, train_y, eval_x, eval_y) dataframes. 
186 | 187 | Args: 188 | train_path: Local or GCS path to upload train data to. 189 | eval_path: Local or GCS path to upload eval data to. 190 | 191 | Returns: 192 | A tuple (train_x, train_y, eval_x, eval_y), where train_x and eval_x are 193 | Pandas dataframes with features for training and train_y and eval_y are 194 | numpy arrays with the corresponding labels. 195 | """ 196 | # Download Census dataset: Training and eval csv files. 197 | training_file_path, eval_file_path = download(DATA_DIR) 198 | 199 | train_df = pd.read_csv( 200 | training_file_path, names=_CSV_COLUMNS, na_values="?") 201 | eval_df = pd.read_csv(eval_file_path, names=_CSV_COLUMNS, na_values="?") 202 | 203 | train_df = preprocess(train_df) 204 | eval_df = preprocess(eval_df) 205 | 206 | # Split train and eval data with labels. The pop method copies and removes 207 | # the label column from the dataframe. 208 | train_x, train_y = train_df, train_df.pop(_LABEL_COLUMN) 209 | eval_x, eval_y = eval_df, eval_df.pop(_LABEL_COLUMN) 210 | 211 | # Join train_x and eval_x to normalize on overall means and standard 212 | # deviations. Then separate them again. 213 | all_x = pd.concat([train_x, eval_x], keys=["train", "eval"]) 214 | all_x = standardize(all_x) 215 | train_x, eval_x = all_x.xs("train"), all_x.xs("eval") 216 | 217 | # Rejoin features and labels and upload to GCS. 218 | if train_path and eval_path: 219 | train_df = train_x.copy() 220 | train_df[_LABEL_COLUMN] = train_y 221 | eval_df = eval_x.copy() 222 | eval_df[_LABEL_COLUMN] = eval_y 223 | upload(train_df, eval_df, train_path, eval_path) 224 | 225 | # Reshape label columns for use with tf.data.Dataset 226 | train_y = np.asarray(train_y).astype("float32").reshape((-1, 1)) 227 | eval_y = np.asarray(eval_y).astype("float32").reshape((-1, 1)) 228 | 229 | return train_x, train_y, eval_x, eval_y 230 | 231 | -------------------------------------------------------------------------------- /examples/kfp/model/tf_model.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2019 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Train a simple TF classifier for census dataset.""" 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | 20 | import argparse 21 | 22 | import tensorflow.compat.v1 as tf 23 | 24 | from model.census_preprocess import load_data 25 | 26 | 27 | def get_model(inputs, params): 28 | """Trains a classifier on iris data.""" 29 | dense = tf.keras.layers.Dense 30 | nn = dense(params.first_layer_size, activation="relu", 31 | kernel_initializer="uniform")(inputs) 32 | for i in reversed(range(1, params.num_layers)): 33 | layer_size = int(params.first_layer_size * (i / params.num_layers)) 34 | nn = dense(max(1, layer_size), activation="relu")(nn) 35 | logits = dense(1, activation="sigmoid")(nn) 36 | 37 | return logits 38 | 39 | 40 | # TODO(humichael): create get_predicition and get_evaluation instead. 41 | def get_loss(): 42 | """The loss function to use.""" 43 | return tf.losses.sigmoid_cross_entropy 44 | 45 | 46 | def main(): 47 | """Trains a model locally to test get_model() and get_loss().""" 48 | train_x, train_y, _, _ = load_data() 49 | input_layer = tf.keras.layers.Input(shape=(train_x.shape[1],)) 50 | params = argparse.Namespace(first_layer_size=50, num_layers=5) 51 | predictions = get_model(input_layer, params) 52 | model = tf.keras.models.Model(inputs=input_layer, outputs=predictions) 53 | model.compile(optimizer="adam", loss=get_loss(), 54 | metrics=["accuracy"]) 55 | model.fit(train_x, train_y, epochs=1) 56 | 57 | 58 | if __name__ == "__main__": 59 | main() 60 | -------------------------------------------------------------------------------- /examples/sklearn/config.yaml.example: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Config file for ML Pipeline Generator. 16 | 17 | project_id: [PROJECT ID] 18 | bucket_id: [BUCKET ID] 19 | region: "us-central1" 20 | scale_tier: "STANDARD_1" 21 | runtime_version: "1.15" 22 | python_version: "3.7" 23 | package_name: "ml_pipeline_gen" 24 | machine_type_pred: "mls1-c4-m2" 25 | 26 | data: 27 | schema: 28 | - "age" 29 | - "workclass" 30 | - "education_num" 31 | - "marital_status" 32 | - "occupation" 33 | - "relationship" 34 | - "race" 35 | - "capital_gain" 36 | - "capital_loss" 37 | - "hours_per_week" 38 | - "native_country" 39 | - "income_bracket" 40 | train: "gs://[BUCKET ID]/[MODEL NAME]/data/adult.data.csv" 41 | evaluation: "gs://[BUCKET ID]/[MODEL NAME]/data/adult.test.csv" 42 | prediction: 43 | input_data_paths: 44 | - "gs://[BUCKET ID]/[MODEL NAME]/inputs/*" 45 | input_format: "JSON" 46 | output_format: "JSON" 47 | 48 | model: 49 | # Name must start with a letter and only contain letters, numbers, and 50 | # underscores. 
51 | name: [MODEL NAME] 52 | path: "model.sklearn_model" 53 | target: "income_bracket" 54 | 55 | model_params: 56 | input_args: 57 | C: 58 | type: "float" 59 | help: "Regularization parameter, must be positive." 60 | default: 1.0 61 | # Relative path. 62 | hyperparam_config: "hptuning_config.yaml" 63 | -------------------------------------------------------------------------------- /examples/sklearn/demo.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2020 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Demo for scikit-learn ML Pipeline Generator.""" 16 | from ml_pipeline_gen.models import SklearnModel 17 | from model.census_preprocess import load_data 18 | 19 | 20 | def _upload_data_to_gcs(model): 21 | load_data(model.data["train"], model.data["evaluation"]) 22 | 23 | 24 | def main(): 25 | config = "config.yaml" 26 | pred_input = [ 27 | [0.02599666, 6, 1.1365801, 4, 0, 1, 4, 0.14693314, -0.21713187, 28 | -0.034039237, 38], 29 | ] 30 | model = SklearnModel(config) 31 | model.generate_files() 32 | _upload_data_to_gcs(model) 33 | 34 | job_id = model.train(tune=True) 35 | version = model.deploy(job_id=job_id) 36 | preds = model.online_predict(pred_input, version=version) 37 | 38 | print("Features: {}".format(pred_input)) 39 | print("Predictions: {}".format(preds)) 40 | 41 | 42 | if __name__ == "__main__": 43 | main() 44 | -------------------------------------------------------------------------------- /examples/sklearn/hptuning_config.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
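The hyperparameter config that follows sets hyperparameterMetricTag: score, which only has an effect if each trial reports a metric under that exact tag. A minimal sketch of how a trainer could report it, assuming the cloudml-hypertune helper package (an assumption; this snapshot does not show which reporting mechanism the generated trainer uses):

import hypertune  # pip install cloudml-hypertune

score = 0.87  # e.g., model.score(eval_x, eval_y) for this trial's model
hpt = hypertune.HyperTune()
hpt.report_hyperparameter_tuning_metric(
    hyperparameter_metric_tag="score",  # must match hyperparameterMetricTag
    metric_value=score,
    global_step=0)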
14 | trainingInput:
15 | scaleTier: STANDARD_1
16 | hyperparameters:
17 | goal: MAXIMIZE
18 | maxTrials: 2
19 | maxParallelTrials: 2
20 | hyperparameterMetricTag: score
21 | enableTrialEarlyStopping: TRUE
22 | params:
23 | - parameterName: C
24 | type: DOUBLE
25 | minValue: .001
26 | maxValue: 10
27 | scaleType: UNIT_LOG_SCALE
28 |
--------------------------------------------------------------------------------
/examples/sklearn/model/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ml-pipeline-generator-python/b0508160c1752a149bdc5c862560387fd9c65d77/examples/sklearn/model/__init__.py
--------------------------------------------------------------------------------
/examples/sklearn/model/census_preprocess.py:
--------------------------------------------------------------------------------
1 | # python3
2 | # Copyright 2019 Google Inc. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Preprocesses the census dataset.
16 |
17 | This example comes from the cloudml-samples keras demo.
18 | github.com/GoogleCloudPlatform/cloudml-samples/blob/master/census/tf-keras
19 | """
20 | from __future__ import absolute_import
21 | from __future__ import division
22 | from __future__ import print_function
23 |
24 | import os
25 | from six.moves import urllib
26 | import tempfile
27 |
28 | import numpy as np
29 | import pandas as pd
30 | import tensorflow.compat.v1 as tf
31 |
32 |
33 | DATA_DIR = os.path.join(tempfile.gettempdir(), "census_data")
34 | DATA_URL = ("https://storage.googleapis.com/cloud-samples-data/ai-platform"
35 | + "/census/data/")
36 | TRAINING_FILE = "adult.data.csv"
37 | EVAL_FILE = "adult.test.csv"
38 | TRAINING_URL = os.path.join(DATA_URL, TRAINING_FILE)
39 | EVAL_URL = os.path.join(DATA_URL, EVAL_FILE)
40 |
41 | _CSV_COLUMNS = [
42 | "age", "workclass", "fnlwgt", "education", "education_num",
43 | "marital_status", "occupation", "relationship", "race", "gender",
44 | "capital_gain", "capital_loss", "hours_per_week", "native_country",
45 | "income_bracket",
46 | ]
47 | _LABEL_COLUMN = "income_bracket"
48 | UNUSED_COLUMNS = ["fnlwgt", "education", "gender"]
49 |
50 | _CATEGORICAL_TYPES = {
51 | "workclass": pd.api.types.CategoricalDtype(categories=[
52 | "Federal-gov", "Local-gov", "Never-worked", "Private", "Self-emp-inc",
53 | "Self-emp-not-inc", "State-gov", "Without-pay"
54 | ]),
55 | "marital_status": pd.api.types.CategoricalDtype(categories=[
56 | "Divorced", "Married-AF-spouse", "Married-civ-spouse",
57 | "Married-spouse-absent", "Never-married", "Separated", "Widowed"
58 | ]),
59 | "occupation": pd.api.types.CategoricalDtype([
60 | "Adm-clerical", "Armed-Forces", "Craft-repair", "Exec-managerial",
61 | "Farming-fishing", "Handlers-cleaners", "Machine-op-inspct",
62 | "Other-service", "Priv-house-serv", "Prof-specialty", "Protective-serv",
63 | "Sales", "Tech-support", "Transport-moving"
64 | ]),
65 | "relationship": pd.api.types.CategoricalDtype(categories=[
"relationship": pd.api.types.CategoricalDtype(categories=[ 66 | "Husband", "Not-in-family", "Other-relative", "Own-child", "Unmarried", 67 | "Wife" 68 | ]), 69 | "race": pd.api.types.CategoricalDtype(categories=[ 70 | "Amer-Indian-Eskimo", "Asian-Pac-Islander", "Black", "Other", "White" 71 | ]), 72 | "native_country": pd.api.types.CategoricalDtype(categories=[ 73 | "Cambodia", "Canada", "China", "Columbia", "Cuba", "Dominican-Republic", 74 | "Ecuador", "El-Salvador", "England", "France", "Germany", "Greece", 75 | "Guatemala", "Haiti", "Holand-Netherlands", "Honduras", "Hong", 76 | "Hungary", "India", "Iran", "Ireland", "Italy", "Jamaica", "Japan", 77 | "Laos", "Mexico", "Nicaragua", "Outlying-US(Guam-USVI-etc)", "Peru", 78 | "Philippines", "Poland", "Portugal", "Puerto-Rico", "Scotland", "South", 79 | "Taiwan", "Thailand", "Trinadad&Tobago", "United-States", "Vietnam", 80 | "Yugoslavia" 81 | ]), 82 | "income_bracket": pd.api.types.CategoricalDtype(categories=[ 83 | "<=50K", ">50K" 84 | ]) 85 | } 86 | 87 | 88 | def _download_and_clean_file(filename, url): 89 | """Downloads data from url, and makes changes to match the CSV format. 90 | 91 | The CSVs may use spaces after the comma delimters (non-standard) or include 92 | rows which do not represent well-formed examples. This function strips out 93 | some of these problems. 94 | 95 | Args: 96 | filename: filename to save url to 97 | url: URL of resource to download 98 | """ 99 | temp_file, _ = urllib.request.urlretrieve(url) 100 | with tf.io.gfile.GFile(temp_file, "r") as temp_file_object: 101 | with tf.io.gfile.GFile(filename, "w") as file_object: 102 | for line in temp_file_object: 103 | line = line.strip() 104 | line = line.replace(", ", ",") 105 | if not line or "," not in line: 106 | continue 107 | if line[-1] == ".": 108 | line = line[:-1] 109 | line += "\n" 110 | file_object.write(line) 111 | tf.io.gfile.remove(temp_file) 112 | 113 | 114 | def download(data_dir): 115 | """Downloads census data if it is not already present. 116 | 117 | Args: 118 | data_dir: directory where we will access/save the census data 119 | 120 | Returns: 121 | foo 122 | """ 123 | tf.io.gfile.makedirs(data_dir) 124 | 125 | training_file_path = os.path.join(data_dir, TRAINING_FILE) 126 | if not tf.io.gfile.exists(training_file_path): 127 | _download_and_clean_file(training_file_path, TRAINING_URL) 128 | 129 | eval_file_path = os.path.join(data_dir, EVAL_FILE) 130 | if not tf.io.gfile.exists(eval_file_path): 131 | _download_and_clean_file(eval_file_path, EVAL_URL) 132 | 133 | return training_file_path, eval_file_path 134 | 135 | 136 | def upload(train_df, eval_df, train_path, eval_path): 137 | train_df.to_csv(os.path.join(os.path.dirname(train_path), TRAINING_FILE), 138 | index=False, header=False) 139 | eval_df.to_csv(os.path.join(os.path.dirname(eval_path), EVAL_FILE), 140 | index=False, header=False) 141 | 142 | 143 | def preprocess(dataframe): 144 | """Converts categorical features to numeric. Removes unused columns. 
145 |
146 | Args:
147 | dataframe: Pandas dataframe with raw data
148 |
149 | Returns:
150 | Dataframe with preprocessed data
151 | """
152 | dataframe = dataframe.drop(columns=UNUSED_COLUMNS)
153 |
154 | # Convert integer valued (numeric) columns to floating point
155 | numeric_columns = dataframe.select_dtypes(["int64"]).columns
156 | dataframe[numeric_columns] = dataframe[numeric_columns].astype("float32")
157 |
158 | # Convert categorical columns to numeric
159 | cat_columns = dataframe.select_dtypes(["object"]).columns
160 | dataframe[cat_columns] = dataframe[cat_columns].apply(
161 | lambda x: x.astype(_CATEGORICAL_TYPES[x.name]))
162 | dataframe[cat_columns] = dataframe[cat_columns].apply(
163 | lambda x: x.cat.codes)
164 | return dataframe
165 |
166 |
167 | def standardize(dataframe):
168 | """Scales numerical columns using their means and standard deviations.
169 |
170 | Args:
171 | dataframe: Pandas dataframe
172 |
173 | Returns:
174 | Input dataframe with the numerical columns scaled to z-scores
175 | """
176 | dtypes = list(zip(dataframe.dtypes.index, map(str, dataframe.dtypes)))
177 | for column, dtype in dtypes:
178 | if dtype == "float32":
179 | dataframe[column] -= dataframe[column].mean()
180 | dataframe[column] /= dataframe[column].std()
181 | return dataframe
182 |
183 |
184 | def load_data(train_path="", eval_path=""):
185 | """Loads data into preprocessed (train_x, train_y, eval_x, eval_y) dataframes.
186 |
187 | Args:
188 | train_path: Local or GCS path to upload train data to.
189 | eval_path: Local or GCS path to upload eval data to.
190 |
191 | Returns:
192 | A tuple (train_x, train_y, eval_x, eval_y), where train_x and eval_x are
193 | Pandas dataframes with features for training and train_y and eval_y are
194 | numpy arrays with the corresponding labels.
195 | """
196 | # Download Census dataset: Training and eval csv files.
197 | training_file_path, eval_file_path = download(DATA_DIR)
198 |
199 | train_df = pd.read_csv(
200 | training_file_path, names=_CSV_COLUMNS, na_values="?")
201 | eval_df = pd.read_csv(eval_file_path, names=_CSV_COLUMNS, na_values="?")
202 |
203 | train_df = preprocess(train_df)
204 | eval_df = preprocess(eval_df)
205 |
206 | # Split train and eval data with labels. The pop method copies and removes
207 | # the label column from the dataframe.
208 | train_x, train_y = train_df, train_df.pop(_LABEL_COLUMN)
209 | eval_x, eval_y = eval_df, eval_df.pop(_LABEL_COLUMN)
210 |
211 | # Join train_x and eval_x to normalize on overall means and standard
212 | # deviations. Then separate them again.
213 | all_x = pd.concat([train_x, eval_x], keys=["train", "eval"])
214 | all_x = standardize(all_x)
215 | train_x, eval_x = all_x.xs("train"), all_x.xs("eval")
216 |
217 | # Rejoin features and labels and upload to GCS.
218 | if train_path and eval_path: 219 | train_df = train_x.copy() 220 | train_df[_LABEL_COLUMN] = train_y 221 | eval_df = eval_x.copy() 222 | eval_df[_LABEL_COLUMN] = eval_y 223 | upload(train_df, eval_df, train_path, eval_path) 224 | 225 | # Reshape label columns for use with tf.data.Dataset 226 | train_y = np.asarray(train_y).astype("float32").reshape((-1, 1)) 227 | eval_y = np.asarray(eval_y).astype("float32").reshape((-1, 1)) 228 | 229 | return train_x, train_y, eval_x, eval_y 230 | 231 | -------------------------------------------------------------------------------- /examples/sklearn/model/sklearn_model.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2019 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Train a simple SVM classifier.""" 16 | 17 | import argparse 18 | import numpy as np 19 | from sklearn import svm 20 | 21 | from model.census_preprocess import load_data 22 | 23 | 24 | def get_model(params): 25 | """Trains a classifier.""" 26 | classifier = svm.SVC(C=params.C) 27 | return classifier 28 | 29 | 30 | def main(): 31 | """Trains a model locally to test get_model().""" 32 | train_x, train_y, eval_x, eval_y = load_data() 33 | train_y, eval_y = [np.ravel(x) for x in [train_y, eval_y]] 34 | params = argparse.Namespace(C=1.0) 35 | model = get_model(params) 36 | model.fit(train_x, train_y) 37 | score = model.score(eval_x, eval_y) 38 | print(score) 39 | 40 | 41 | if __name__ == "__main__": 42 | main() 43 | -------------------------------------------------------------------------------- /examples/taxi/sklearn/config.yaml.example: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Config file for ML Pipeline Generator. 
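In the taxi config that follows, model.target ("tip") names one of the data.schema columns; the preprocessing modules in these examples pop the target column off the dataframe, leaving the rest as features. A small illustration of that split (hypothetical column subset, not the full schema):

schema = ["trip_miles", "trip_seconds", "fare", "payment_type", "tip"]
target = "tip"
features = [column for column in schema if column != target]
print(features)  # ['trip_miles', 'trip_seconds', 'fare', 'payment_type']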
16 | 17 | project_id: [PROJECT ID] 18 | bucket_id: [BUCKET ID] 19 | region: "us-central1" 20 | scale_tier: "STANDARD_1" 21 | runtime_version: "1.15" 22 | python_version: "3.7" 23 | package_name: "ml_pipeline_gen" 24 | machine_type_pred: "mls1-c4-m2" 25 | 26 | data: 27 | schema: 28 | - "trip_miles" 29 | - "trip_seconds" 30 | - "fare" 31 | - "trip_start_month" 32 | - "trip_start_hour" 33 | - "trip_start_day" 34 | - "pickup_community_area" 35 | - "dropoff_community_area" 36 | - "pickup_census_tract" 37 | - "dropoff_census_tract" 38 | - "pickup_latitude" 39 | - "pickup_longitude" 40 | - "dropoff_latitude" 41 | - "dropoff_longitude" 42 | - "payment_type" 43 | - "company" 44 | - "tip" 45 | train: "gs://[BUCKET ID]/[MODEL NAME]/data/taxi_trips_train.csv" 46 | evaluation: "gs://[BUCKET ID]/[MODEL NAME]/data/taxi_trips_eval.csv" 47 | prediction: 48 | input_data_paths: 49 | - "gs://[BUCKET ID]/[MODEL NAME]/inputs/*" 50 | input_format: "JSON" 51 | output_format: "JSON" 52 | 53 | model: 54 | # Name must start with a letter and only contain letters, numbers, and 55 | # underscores. 56 | name: [MODEL NAME] 57 | path: "model.sklearn_model" 58 | target: "tip" 59 | 60 | model_params: 61 | input_args: 62 | C: 63 | type: "float" 64 | help: "Regularization parameter, must be positive." 65 | default: 1.0 66 | hyperparam_config: "hptuning_config.yaml" 67 | -------------------------------------------------------------------------------- /examples/taxi/sklearn/demo.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2020 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Demo for scikit-learn ML Pipeline Generator.""" 16 | from ml_pipeline_gen.models import SklearnModel 17 | from model.taxi_preprocess import load_data 18 | 19 | 20 | def _upload_data_to_gcs(model): 21 | load_data(model.data["train"], model.data["evaluation"]) 22 | 23 | 24 | def main(): 25 | config = "config.yaml" 26 | pred_input = [ 27 | [1.0, -0.56447923, -0.5502175, -1.00234, -0.60791147, 28 | 0.38163432, 0.5846407, 0.6274534, 1.4543412, -0.09238409, 29 | 41.881, -87.633, 41.885, -87.62100000000001, 1, 3], 30 | ] 31 | model = SklearnModel(config) 32 | model.generate_files() 33 | _upload_data_to_gcs(model) 34 | 35 | job_id = model.train(tune=True) 36 | version = model.deploy(job_id=job_id) 37 | preds = model.online_predict(pred_input, version=version) 38 | 39 | print("Features: {}".format(pred_input)) 40 | print("Predictions: {}".format(preds)) 41 | 42 | 43 | if __name__ == "__main__": 44 | main() 45 | -------------------------------------------------------------------------------- /examples/taxi/sklearn/hptuning_config.yaml: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2020 Google Inc. All Rights Reserved. 
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | trainingInput:
16 | scaleTier: STANDARD_1
17 | hyperparameters:
18 | goal: MAXIMIZE
19 | maxTrials: 2
20 | maxParallelTrials: 2
21 | hyperparameterMetricTag: score
22 | enableTrialEarlyStopping: TRUE
23 | params:
24 | - parameterName: C
25 | type: DOUBLE
26 | minValue: .001
27 | maxValue: 10
28 | scaleType: UNIT_LOG_SCALE
29 |
--------------------------------------------------------------------------------
/examples/taxi/sklearn/model/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ml-pipeline-generator-python/b0508160c1752a149bdc5c862560387fd9c65d77/examples/taxi/sklearn/model/__init__.py
--------------------------------------------------------------------------------
/examples/taxi/sklearn/model/sklearn_model.py:
--------------------------------------------------------------------------------
1 | # python3
2 | # Copyright 2019 Google Inc. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
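The model below reads params.C, which is declared under model_params.input_args in config.yaml; the generated task.py presumably exposes it as a command-line flag along these lines (a hedged sketch of that mapping, not the generator's verbatim output):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--C", type=float, default=1.0,
                    help="Regularization parameter, must be positive.")
params = parser.parse_args(["--C", "0.5"])  # trial values arrive as flags
print(params.C)  # 0.5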
27 | """Train a simple SVM classifier.""" 28 | 29 | import argparse 30 | import numpy as np 31 | from sklearn import svm 32 | 33 | from model.taxi_preprocess import load_data 34 | 35 | 36 | def get_model(params): 37 | """Trains a classifier.""" 38 | classifier = svm.SVC(C=params.C) 39 | return classifier 40 | 41 | 42 | def main(): 43 | """Trains a model locally to test get_model().""" 44 | train_x, train_y, eval_x, eval_y = load_data() 45 | train_y, eval_y = [np.ravel(x) for x in [train_y, eval_y]] 46 | params = argparse.Namespace(C=1.0) 47 | model = get_model(params) 48 | model.fit(train_x, train_y) 49 | score = model.score(eval_x, eval_y) 50 | print(score) 51 | 52 | 53 | if __name__ == "__main__": 54 | main() 55 | -------------------------------------------------------------------------------- /examples/taxi/tf/config.yaml.example: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Config file for ML Pipeline Generator. 16 | 17 | project_id: [PROJECT ID] 18 | bucket_id: [BUCKET ID] 19 | region: "us-central1" 20 | scale_tier: "STANDARD_1" 21 | runtime_version: "1.15" 22 | python_version: "3.7" 23 | package_name: "ml_pipeline_gen" 24 | machine_type_pred: "n1-standard-4" 25 | 26 | data: 27 | schema: 28 | - "trip_miles" 29 | - "trip_seconds" 30 | - "fare" 31 | - "trip_start_month" 32 | - "trip_start_hour" 33 | - "trip_start_day" 34 | - "pickup_community_area" 35 | - "dropoff_community_area" 36 | - "pickup_census_tract" 37 | - "dropoff_census_tract" 38 | - "pickup_latitude" 39 | - "pickup_longitude" 40 | - "dropoff_latitude" 41 | - "dropoff_longitude" 42 | - "payment_type" 43 | - "company" 44 | - "tip" 45 | train: "gs://[BUCKET ID]/[MODEL NAME]/data/taxi_trips_train.csv" 46 | evaluation: "gs://[BUCKET ID]/[MODEL NAME]/data/taxi_trips_eval.csv" 47 | prediction: 48 | input_data_paths: 49 | - "gs://[BUCKET ID]/[MODEL NAME]/inputs/*" 50 | input_format: "JSON" 51 | output_format: "JSON" 52 | 53 | model: 54 | # Name must start with a letter and only contain letters, numbers, and 55 | # underscores. 56 | name: [MODEL NAME] 57 | path: "model.tf_model" 58 | target: "tip" 59 | metrics: 60 | - "accuracy" 61 | 62 | model_params: 63 | input_args: 64 | first_layer_size: 65 | type: "int" 66 | help: "Size of the NN first layer." 67 | default: 50 68 | num_layers: 69 | type: "int" 70 | help: "Number of layers in the NN." 71 | default: 5 72 | max_steps: 73 | default: 1000 74 | hyperparam_config: "hptuning_config.yaml" 75 | explain_output: 76 | explain_type: "sampledShapleyAttribution" 77 | explain_param: 78 | name: "numPaths" 79 | value: 40 80 | -------------------------------------------------------------------------------- /examples/taxi/tf/demo.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2020 Google Inc. All Rights Reserved. 
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Demo for TF ML Pipeline Generator."""
16 | import json
17 | import os
18 |
19 | from ml_pipeline_gen.models import TFModel
20 | from model.taxi_preprocess import load_data
21 |
22 |
23 | def _upload_data_to_gcs(model):
24 | load_data(model.data["train"], model.data["evaluation"])
25 |
26 |
27 | # TODO(humichael): See if there's a way to support csv batch predicts.
28 | def _upload_input_data_to_gcs(model, data):
29 | input_path = "./tf_input_data.json"
30 | with open(input_path, "w+") as f:
31 | for features in data:
32 | f.write(json.dumps(features) + "\n")
33 | model.upload_pred_input_data(input_path)
34 | os.remove(input_path)
35 |
36 |
37 | def main():
38 | explanations = True
39 | config = "config.yaml"
40 | pred_input = [{
41 | "trip_miles": 1.0,
42 | "trip_seconds": -0.56447923,
43 | "fare": -0.5502175,
44 | "trip_start_month": -1.00234,
45 | "trip_start_hour": -0.60791147,
46 | "trip_start_day": 0.38163432,
47 | "pickup_community_area": 0.5846407,
48 | "dropoff_community_area": 0.6274534,
49 | "pickup_census_tract": 1.4543412,
50 | "dropoff_census_tract": -0.09238409,
51 | "pickup_latitude": 41.881,
52 | "pickup_longitude": -87.633,
53 | "dropoff_latitude": 41.885,
54 | "dropoff_longitude": -87.62100000000001,
55 | "payment_type": 1,
56 | "company": 3
57 | }]
58 | model = TFModel(config)
59 | model.generate_files()
60 | _upload_data_to_gcs(model)
61 |
62 | job_id = model.train(tune=True)
63 | version = model.deploy(job_id=job_id, explanations=explanations)
64 | if explanations:
65 | explanations = model.online_explanations(pred_input,
66 | version=version)
67 | print("Online Explanations")
68 | print("Explanations: {}".format(explanations))
69 | preds = model.online_predict(pred_input, version=version)
70 |
71 | print("Online Predictions")
72 | print("Features: {}".format(pred_input))
73 | print("Predictions: {}".format(preds))
74 |
75 | if not explanations:
76 | _upload_input_data_to_gcs(model, pred_input)
77 | model.batch_predict(version=version)
78 | print("Batch predictions written to",
79 | model.get_pred_output_path())
80 |
81 |
82 | if __name__ == "__main__":
83 | main()
84 |
--------------------------------------------------------------------------------
/examples/taxi/tf/hptuning_config.yaml:
--------------------------------------------------------------------------------
1 | # python3
2 | # Copyright 2020 Google Inc. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | trainingInput:
16 | hyperparameters:
17 | hyperparameterMetricTag: accuracy
18 | goal: MAXIMIZE
19 | maxTrials: 4
20 | maxParallelTrials: 2
21 | enableTrialEarlyStopping: True
22 | params:
23 | - parameterName: first_layer_size
24 | type: INTEGER
25 | minValue: 50
26 | maxValue: 500
27 | scaleType: UNIT_LINEAR_SCALE
28 | - parameterName: num_layers
29 | type: INTEGER
30 | minValue: 1
31 | maxValue: 15
32 | scaleType: UNIT_LINEAR_SCALE
33 |
--------------------------------------------------------------------------------
/examples/taxi/tf/model/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ml-pipeline-generator-python/b0508160c1752a149bdc5c862560387fd9c65d77/examples/taxi/tf/model/__init__.py
--------------------------------------------------------------------------------
/examples/taxi/tf/model/tf_model.py:
--------------------------------------------------------------------------------
1 | # python3
2 | # Copyright 2020 Google Inc. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Train a simple TF classifier for the taxi dataset."""
16 | from __future__ import absolute_import
17 | from __future__ import division
18 | from __future__ import print_function
19 |
20 | import argparse
21 |
22 | import tensorflow.compat.v1 as tf
23 |
24 | from model.taxi_preprocess import load_data
25 |
26 |
27 | def get_model(inputs, params):
28 | """Trains a classifier on taxi data."""
29 | dense = tf.keras.layers.Dense
30 | nn = dense(params.first_layer_size, activation="relu",
31 | kernel_initializer="uniform")(inputs)
32 | for i in reversed(range(1, params.num_layers)):
33 | layer_size = int(params.first_layer_size * (i / params.num_layers))
34 | nn = dense(max(1, layer_size), activation="relu")(nn)
35 | logits = dense(1, activation="sigmoid")(nn)
36 |
37 | return logits
38 |
39 |
40 | # TODO(humichael): create get_prediction and get_evaluation instead.
41 | def get_loss():
42 | """The loss function to use."""
43 | return tf.losses.sigmoid_cross_entropy
44 |
45 |
46 | def main():
47 | """Trains a model locally to test get_model() and get_loss()."""
48 | train_x, train_y, _, _ = load_data()
49 | input_layer = tf.keras.layers.Input(shape=(train_x.shape[1],))
50 | params = argparse.Namespace(first_layer_size=50, num_layers=5)
51 | predictions = get_model(input_layer, params)
52 | model = tf.keras.models.Model(inputs=input_layer, outputs=predictions)
53 | model.compile(optimizer="adam", loss=get_loss(),
54 | metrics=["accuracy"])
55 | model.fit(train_x, train_y, epochs=1)
56 |
57 |
58 | if __name__ == "__main__":
59 | main()
60 |
--------------------------------------------------------------------------------
/examples/taxi/xgb/config.yaml.example:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | # Config file for ML Pipeline Generator.
16 |
17 | project_id: [PROJECT ID]
18 | bucket_id: [BUCKET ID]
19 | region: "us-central1"
20 | scale_tier: "STANDARD_1"
21 | runtime_version: "1.15"
22 | python_version: "3.7"
23 | package_name: "ml_pipeline_gen"
24 | machine_type_pred: "mls1-c4-m2"
25 |
26 | data:
27 | schema:
28 | - "trip_miles"
29 | - "trip_seconds"
30 | - "fare"
31 | - "trip_start_month"
32 | - "trip_start_hour"
33 | - "trip_start_day"
34 | - "pickup_community_area"
35 | - "dropoff_community_area"
36 | - "pickup_census_tract"
37 | - "dropoff_census_tract"
38 | - "pickup_latitude"
39 | - "pickup_longitude"
40 | - "dropoff_latitude"
41 | - "dropoff_longitude"
42 | - "payment_type"
43 | - "company"
44 | - "tip"
45 | train: "gs://[BUCKET ID]/[MODEL NAME]/data/taxi_trips_train.csv"
46 | evaluation: "gs://[BUCKET ID]/[MODEL NAME]/data/taxi_trips_eval.csv"
47 | prediction:
48 | input_data_paths:
49 | - "gs://[BUCKET ID]/[MODEL NAME]/inputs/*"
50 | input_format: "JSON"
51 | output_format: "JSON"
52 |
53 | model:
54 | # Name must start with a letter and only contain letters, numbers, and
55 | # underscores.
56 | name: [MODEL NAME]
57 | path: "model.xgb_model"
58 | target: "tip"
59 |
60 | model_params:
61 | input_args:
62 | n_estimators:
63 | type: "int"
64 | help: "Number of boosted trees to fit."
65 | default: 10
66 | hyperparam_config: "hptuning_config.yaml"
67 |
--------------------------------------------------------------------------------
/examples/taxi/xgb/demo.py:
--------------------------------------------------------------------------------
1 | # python3
2 | # Copyright 2020 Google Inc. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Demo for XGBoost ML Pipeline Generator.""" 16 | from ml_pipeline_gen.models import XGBoostModel 17 | from model.taxi_preprocess import load_data 18 | 19 | 20 | def _upload_data_to_gcs(model): 21 | load_data(model.data["train"], model.data["evaluation"]) 22 | 23 | 24 | def main(): 25 | config = "config.yaml" 26 | pred_input = [[ 27 | 1.0, -0.56447923, -0.5502175, -1.00234, -0.60791147, 28 | 0.38163432, 0.5846407, 0.6274534, 1.4543412, -0.09238409, 29 | 41.881, -87.633, 41.885, -87.62100000000001, 1, 3 30 | ]] 31 | 32 | model = XGBoostModel(config) 33 | model.generate_files() 34 | _upload_data_to_gcs(model) 35 | 36 | job_id = model.train(tune=True) 37 | version = model.deploy(job_id=job_id) 38 | preds = model.online_predict(pred_input, version=version) 39 | 40 | print("Features: {}".format(pred_input)) 41 | print("Predictions: {}".format(preds)) 42 | 43 | if __name__ == "__main__": 44 | main() 45 | -------------------------------------------------------------------------------- /examples/taxi/xgb/hptuning_config.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | trainingInput: 15 | hyperparameters: 16 | goal: MAXIMIZE 17 | maxTrials: 4 18 | maxParallelTrials: 2 19 | hyperparameterMetricTag: roc_auc 20 | enableTrialEarlyStopping: TRUE 21 | params: 22 | - parameterName: max_depth 23 | type: INTEGER 24 | minValue: 3 25 | maxValue: 8 26 | scaleType: UNIT_LINEAR_SCALE 27 | - parameterName: n_estimators 28 | type: INTEGER 29 | minValue: 1 30 | maxValue: 20 31 | scaleType: UNIT_LINEAR_SCALE 32 | - parameterName: booster 33 | type: CATEGORICAL 34 | categoricalValues: [ 35 | "gbtree", 36 | "gblinear", 37 | "dart" 38 | ] 39 | -------------------------------------------------------------------------------- /examples/taxi/xgb/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ml-pipeline-generator-python/b0508160c1752a149bdc5c862560387fd9c65d77/examples/taxi/xgb/model/__init__.py -------------------------------------------------------------------------------- /examples/taxi/xgb/model/xgb_model.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2019 Google Inc. All Rights Reserved. 
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Train a simple XGBoost classifier."""
16 |
17 | import argparse
18 | import numpy as np
19 |
20 | from sklearn import metrics
21 | from xgboost import XGBClassifier
22 |
23 | from model.taxi_preprocess import load_data
24 |
25 | TARGET_COLUMN = "TARGET"
26 |
27 |
28 | def get_model(args):
29 | """Returns an XGBoost model."""
30 | params = {
31 | "n_estimators": args.n_estimators,
32 | "max_depth": args.max_depth,
33 | "booster": args.booster,
34 | "min_child_weight": args.min_child_weight,
35 | "learning_rate": args.learning_rate,
36 | "gamma": args.gamma,
37 | "subsample": args.subsample,
38 | "colsample_bytree": args.colsample_bytree,
39 | "reg_alpha": args.reg_alpha,
40 | "num_class": args.num_classes
41 | }
42 | xgb_model = XGBClassifier(**params)
43 | return xgb_model
44 |
45 |
46 | def main():
47 | """Trains a model locally to test get_model()."""
48 | train_x, train_y, eval_x, eval_y = load_data()
49 | train_y, eval_y = [np.ravel(x) for x in [train_y, eval_y]]
50 | params = argparse.Namespace(
51 | n_estimators=2,
52 | max_depth=3,
53 | booster="gbtree",
54 | min_child_weight=1,
55 | learning_rate=0.3,
56 | gamma=0,
57 | subsample=1,
58 | colsample_bytree=1,
59 | reg_alpha=0,
60 | num_classes=1)
61 | model = get_model(params)
62 | model.fit(train_x, train_y)
63 | y_pred = model.predict(eval_x)
64 | score = metrics.roc_auc_score(eval_y, y_pred, average="macro")
65 | print("ROC: {}".format(score))
66 |
67 |
68 | if __name__ == "__main__":
69 | main()
70 |
--------------------------------------------------------------------------------
/examples/tf/config.yaml.example:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
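The explain_output block in the config that follows appears to map onto the explanationConfig field of an AI Platform Prediction version; roughly the structure below (a hedged sketch of the assumed mapping, not necessarily the generator's literal request body):

# Assumed shape of the version resource's explanation settings.
explanation_config = {
    "sampledShapleyAttribution": {
        "numPaths": 40,  # explain_param value from the config below
    },
}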
14 | # 15 | # Config file for ML Pipeline Generator. 16 | 17 | project_id: [PROJECT ID] 18 | bucket_id: [BUCKET ID] 19 | region: "us-central1" 20 | scale_tier: "STANDARD_1" 21 | runtime_version: "1.15" 22 | python_version: "3.7" 23 | package_name: "ml_pipeline_gen" 24 | machine_type_pred: "n1-standard-4" 25 | 26 | data: 27 | schema: 28 | - "age" 29 | - "workclass" 30 | - "education_num" 31 | - "marital_status" 32 | - "occupation" 33 | - "relationship" 34 | - "race" 35 | - "capital_gain" 36 | - "capital_loss" 37 | - "hours_per_week" 38 | - "native_country" 39 | - "income_bracket" 40 | train: "gs://[BUCKET ID]/[MODEL NAME]/data/adult.data.csv" 41 | evaluation: "gs://[BUCKET ID]/[MODEL NAME]/data/adult.test.csv" 42 | prediction: 43 | input_data_paths: 44 | - "gs://[BUCKET ID]/[MODEL NAME]/inputs/*" 45 | input_format: "JSON" 46 | output_format: "JSON" 47 | 48 | model: 49 | # Name must start with a letter and only contain letters, numbers, and 50 | # underscores. 51 | name: [MODEL NAME] 52 | path: "model.tf_model" 53 | target: "income_bracket" 54 | metrics: 55 | - "accuracy" 56 | 57 | model_params: 58 | input_args: 59 | first_layer_size: 60 | type: "int" 61 | help: "Size of the NN first layer." 62 | default: 50 63 | num_layers: 64 | type: "int" 65 | help: "Number of layers in the NN." 66 | default: 5 67 | max_steps: 68 | default: 1000 69 | # Relative path. 70 | hyperparam_config: "hptuning_config.yaml" 71 | explain_output: 72 | explain_type: "sampledShapleyAttribution" 73 | explain_param: 74 | name: "numPaths" 75 | value: 40 76 | -------------------------------------------------------------------------------- /examples/tf/demo.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2020 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Demo for TF ML Pipeline Generator.""" 16 | import json 17 | import os 18 | 19 | from ml_pipeline_gen.models import TFModel 20 | from model.census_preprocess import load_data 21 | 22 | 23 | def _upload_data_to_gcs(model): 24 | """Calls the preprocessing fn which uploads train/eval data to GCS.""" 25 | load_data(model.data["train"], model.data["evaluation"]) 26 | 27 | 28 | # TODO(humichael): See if there's a way to support csv batch predicts. 
29 | def _upload_input_data_to_gcs(model, data): 30 | input_path = "tf_input_data.json" 31 | with open(input_path, "w+") as f: 32 | for features in data: 33 | f.write(json.dumps(features) + "\n") 34 | model.upload_pred_input_data(input_path) 35 | os.remove(input_path) 36 | 37 | 38 | def main(): 39 | explanations = True 40 | config = "config.yaml" 41 | pred_input = [{ 42 | "age": 0.02599666, 43 | "workclass": 6, 44 | "education_num": 1.1365801, 45 | "marital_status": 4, 46 | "occupation": 0, 47 | "relationship": 1, 48 | "race": 4, 49 | "capital_gain": 0.14693314, 50 | "capital_loss": -0.21713187, 51 | "hours_per_week": -0.034039237, 52 | "native_country": 38, 53 | "income_bracket": 0, 54 | }] 55 | model = TFModel(config) 56 | model.generate_files() 57 | _upload_data_to_gcs(model) 58 | 59 | job_id = model.train(tune=True) 60 | version = model.deploy(job_id=job_id, explanations=explanations) 61 | if explanations: 62 | explanations = model.online_explanations(pred_input, 63 | version=version) 64 | print("Online Explanations") 65 | print("Explanations: {}".format(explanations)) 66 | preds = model.online_predict(pred_input, version=version) 67 | 68 | print("Online Predictions") 69 | print("Features: {}".format(pred_input)) 70 | print("Predictions: {}".format(preds)) 71 | 72 | if not explanations: 73 | _upload_input_data_to_gcs(model, pred_input) 74 | model.batch_predict(version=version) 75 | print("Batch predictions written to", 76 | model.get_pred_output_path()) 77 | 78 | 79 | if __name__ == "__main__": 80 | main() 81 | -------------------------------------------------------------------------------- /examples/tf/hptuning_config.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | trainingInput: 15 | hyperparameters: 16 | hyperparameterMetricTag: accuracy 17 | goal: MAXIMIZE 18 | maxTrials: 4 19 | maxParallelTrials: 2 20 | enableTrialEarlyStopping: True 21 | params: 22 | - parameterName: first_layer_size 23 | type: INTEGER 24 | minValue: 50 25 | maxValue: 500 26 | scaleType: UNIT_LINEAR_SCALE 27 | - parameterName: num_layers 28 | type: INTEGER 29 | minValue: 1 30 | maxValue: 15 31 | scaleType: UNIT_LINEAR_SCALE 32 | -------------------------------------------------------------------------------- /examples/tf/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ml-pipeline-generator-python/b0508160c1752a149bdc5c862560387fd9c65d77/examples/tf/model/__init__.py -------------------------------------------------------------------------------- /examples/tf/model/census_preprocess.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2019 Google Inc. All Rights Reserved. 
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Preprocesses the census dataset.
16 |
17 | This example comes from the cloudml-samples keras demo.
18 | github.com/GoogleCloudPlatform/cloudml-samples/blob/master/census/tf-keras
19 | """
20 | from __future__ import absolute_import
21 | from __future__ import division
22 | from __future__ import print_function
23 |
24 | import os
25 | from six.moves import urllib
26 | import tempfile
27 |
28 | import numpy as np
29 | import pandas as pd
30 | import tensorflow.compat.v1 as tf
31 |
32 |
33 | DATA_DIR = os.path.join(tempfile.gettempdir(), "census_data")
34 | DATA_URL = ("https://storage.googleapis.com/cloud-samples-data/ai-platform"
35 | + "/census/data/")
36 | TRAINING_FILE = "adult.data.csv"
37 | EVAL_FILE = "adult.test.csv"
38 | TRAINING_URL = os.path.join(DATA_URL, TRAINING_FILE)
39 | EVAL_URL = os.path.join(DATA_URL, EVAL_FILE)
40 |
41 | _CSV_COLUMNS = [
42 | "age", "workclass", "fnlwgt", "education", "education_num",
43 | "marital_status", "occupation", "relationship", "race", "gender",
44 | "capital_gain", "capital_loss", "hours_per_week", "native_country",
45 | "income_bracket",
46 | ]
47 | _LABEL_COLUMN = "income_bracket"
48 | UNUSED_COLUMNS = ["fnlwgt", "education", "gender"]
49 |
50 | _CATEGORICAL_TYPES = {
51 | "workclass": pd.api.types.CategoricalDtype(categories=[
52 | "Federal-gov", "Local-gov", "Never-worked", "Private", "Self-emp-inc",
53 | "Self-emp-not-inc", "State-gov", "Without-pay"
54 | ]),
55 | "marital_status": pd.api.types.CategoricalDtype(categories=[
56 | "Divorced", "Married-AF-spouse", "Married-civ-spouse",
57 | "Married-spouse-absent", "Never-married", "Separated", "Widowed"
58 | ]),
59 | "occupation": pd.api.types.CategoricalDtype([
60 | "Adm-clerical", "Armed-Forces", "Craft-repair", "Exec-managerial",
61 | "Farming-fishing", "Handlers-cleaners", "Machine-op-inspct",
62 | "Other-service", "Priv-house-serv", "Prof-specialty", "Protective-serv",
63 | "Sales", "Tech-support", "Transport-moving"
64 | ]),
65 | "relationship": pd.api.types.CategoricalDtype(categories=[
66 | "Husband", "Not-in-family", "Other-relative", "Own-child", "Unmarried",
67 | "Wife"
68 | ]),
69 | "race": pd.api.types.CategoricalDtype(categories=[
70 | "Amer-Indian-Eskimo", "Asian-Pac-Islander", "Black", "Other", "White"
71 | ]),
72 | "native_country": pd.api.types.CategoricalDtype(categories=[
73 | "Cambodia", "Canada", "China", "Columbia", "Cuba", "Dominican-Republic",
74 | "Ecuador", "El-Salvador", "England", "France", "Germany", "Greece",
75 | "Guatemala", "Haiti", "Holand-Netherlands", "Honduras", "Hong",
76 | "Hungary", "India", "Iran", "Ireland", "Italy", "Jamaica", "Japan",
77 | "Laos", "Mexico", "Nicaragua", "Outlying-US(Guam-USVI-etc)", "Peru",
78 | "Philippines", "Poland", "Portugal", "Puerto-Rico", "Scotland", "South",
79 | "Taiwan", "Thailand", "Trinadad&Tobago", "United-States", "Vietnam",
80 | "Yugoslavia"
81 | ]),
82 | "income_bracket": pd.api.types.CategoricalDtype(categories=[
83 | "<=50K", ">50K"
84 | ])
85 | }
86 |
87 |
88 | def _download_and_clean_file(filename, url):
89 | """Downloads data from url, and makes changes to match the CSV format.
90 |
91 | The CSVs may use spaces after the comma delimiters (non-standard) or include
92 | rows which do not represent well-formed examples. This function strips out
93 | some of these problems.
94 |
95 | Args:
96 | filename: local path to save the cleaned data to
97 | url: URL of resource to download
98 | """
99 | temp_file, _ = urllib.request.urlretrieve(url)
100 | with tf.io.gfile.GFile(temp_file, "r") as temp_file_object:
101 | with tf.io.gfile.GFile(filename, "w") as file_object:
102 | for line in temp_file_object:
103 | line = line.strip()
104 | line = line.replace(", ", ",")
105 | if not line or "," not in line:
106 | continue
107 | if line[-1] == ".":
108 | line = line[:-1]
109 | line += "\n"
110 | file_object.write(line)
111 | tf.io.gfile.remove(temp_file)
112 |
113 |
114 | def download(data_dir):
115 | """Downloads census data if it is not already present.
116 |
117 | Args:
118 | data_dir: directory where we will access/save the census data
119 |
120 | Returns:
121 | A tuple (training_file_path, eval_file_path) of paths to the CSVs.
122 | """
123 | tf.io.gfile.makedirs(data_dir)
124 |
125 | training_file_path = os.path.join(data_dir, TRAINING_FILE)
126 | if not tf.io.gfile.exists(training_file_path):
127 | _download_and_clean_file(training_file_path, TRAINING_URL)
128 |
129 | eval_file_path = os.path.join(data_dir, EVAL_FILE)
130 | if not tf.io.gfile.exists(eval_file_path):
131 | _download_and_clean_file(eval_file_path, EVAL_URL)
132 |
133 | return training_file_path, eval_file_path
134 |
135 |
136 | def upload(train_df, eval_df, train_path, eval_path):
137 | train_df.to_csv(os.path.join(os.path.dirname(train_path), TRAINING_FILE),
138 | index=False, header=False)
139 | eval_df.to_csv(os.path.join(os.path.dirname(eval_path), EVAL_FILE),
140 | index=False, header=False)
141 |
142 |
143 | def preprocess(dataframe):
144 | """Converts categorical features to numeric. Removes unused columns.
145 |
146 | Args:
147 | dataframe: Pandas dataframe with raw data
148 |
149 | Returns:
150 | Dataframe with preprocessed data
151 | """
152 | dataframe = dataframe.drop(columns=UNUSED_COLUMNS)
153 |
154 | # Convert integer valued (numeric) columns to floating point
155 | numeric_columns = dataframe.select_dtypes(["int64"]).columns
156 | dataframe[numeric_columns] = dataframe[numeric_columns].astype("float32")
157 |
158 | # Convert categorical columns to numeric
159 | cat_columns = dataframe.select_dtypes(["object"]).columns
160 | dataframe[cat_columns] = dataframe[cat_columns].apply(
161 | lambda x: x.astype(_CATEGORICAL_TYPES[x.name]))
162 | dataframe[cat_columns] = dataframe[cat_columns].apply(
163 | lambda x: x.cat.codes)
164 | return dataframe
165 |
166 |
167 | def standardize(dataframe):
168 | """Scales numerical columns using their means and standard deviations.
169 |
170 | Args:
171 | dataframe: Pandas dataframe
172 |
173 | Returns:
174 | Input dataframe with the numerical columns scaled to z-scores
175 | """
176 | dtypes = list(zip(dataframe.dtypes.index, map(str, dataframe.dtypes)))
177 | for column, dtype in dtypes:
178 | if dtype == "float32":
179 | dataframe[column] -= dataframe[column].mean()
180 | dataframe[column] /= dataframe[column].std()
181 | return dataframe
182 |
183 |
184 | def load_data(train_path="", eval_path=""):
185 | """Loads data into preprocessed (train_x, train_y, eval_x, eval_y) dataframes.
187 | 188 | Args: 189 | train_path: Local or GCS path to upload train data to. 190 | eval_path: Local or GCS path to upload eval data to. 191 | 192 | Returns: 193 | A tuple (train_x, train_y, eval_x, eval_y), where train_x and eval_x are 194 | Pandas dataframes with features for training and train_y and eval_y are 195 | numpy arrays with the corresponding labels. 196 | """ 197 | # Download Census dataset: Training and eval csv files. 198 | training_file_path, eval_file_path = download(DATA_DIR) 199 | 200 | train_df = pd.read_csv( 201 | training_file_path, names=_CSV_COLUMNS, na_values="?") 202 | eval_df = pd.read_csv(eval_file_path, names=_CSV_COLUMNS, na_values="?") 203 | 204 | train_df = preprocess(train_df) 205 | eval_df = preprocess(eval_df) 206 | 207 | # Split train and eval data with labels. The pop method copies and removes 208 | # the label column from the dataframe. 209 | train_x, train_y = train_df, train_df.pop(_LABEL_COLUMN) 210 | eval_x, eval_y = eval_df, eval_df.pop(_LABEL_COLUMN) 211 | 212 | # Join train_x and eval_x to normalize on overall means and standard 213 | # deviations. Then separate them again. 214 | all_x = pd.concat([train_x, eval_x], keys=["train", "eval"]) 215 | all_x = standardize(all_x) 216 | train_x, eval_x = all_x.xs("train"), all_x.xs("eval") 217 | 218 | # Rejoin features and labels and upload to GCS. 219 | if train_path and eval_path: 220 | train_df = train_x.copy() 221 | train_df[_LABEL_COLUMN] = train_y 222 | eval_df = eval_x.copy() 223 | eval_df[_LABEL_COLUMN] = eval_y 224 | upload(train_df, eval_df, train_path, eval_path) 225 | 226 | # Reshape label columns for use with tf.data.Dataset 227 | train_y = np.asarray(train_y).astype("float32").reshape((-1, 1)) 228 | eval_y = np.asarray(eval_y).astype("float32").reshape((-1, 1)) 229 | 230 | return train_x, train_y, eval_x, eval_y 231 | 232 | -------------------------------------------------------------------------------- /examples/tf/model/tf_model.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2019 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
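# A minimal usage sketch (an assumed interface summary, not part of the
# original sample) of model.census_preprocess.load_data, which this module
# imports below:
#
#   train_x, train_y, eval_x, eval_y = load_data()
#   # train_x / eval_x: pandas DataFrames of float32 census features.
#   # train_y / eval_y: numpy float32 arrays of shape (n, 1) holding 0/1
#   # income_bracket labels, ready to pass to model.fit().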
15 | """Train a simple TF classifier for census dataset.""" 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | 20 | import argparse 21 | 22 | import tensorflow.compat.v1 as tf 23 | 24 | from model.census_preprocess import load_data 25 | 26 | 27 | def get_model(inputs, params): 28 | """Trains a classifier on iris data.""" 29 | dense = tf.keras.layers.Dense 30 | nn = dense(params.first_layer_size, activation="relu", 31 | kernel_initializer="uniform")(inputs) 32 | for i in reversed(range(1, params.num_layers)): 33 | layer_size = int(params.first_layer_size * (i / params.num_layers)) 34 | nn = dense(max(1, layer_size), activation="relu")(nn) 35 | logits = dense(1, activation="sigmoid")(nn) 36 | 37 | return logits 38 | 39 | 40 | # TODO(humichael): create get_predicition and get_evaluation instead. 41 | def get_loss(): 42 | """The loss function to use.""" 43 | return tf.losses.sigmoid_cross_entropy 44 | 45 | 46 | def main(): 47 | """Trains a model locally to test get_model() and get_loss().""" 48 | train_x, train_y, _, _ = load_data() 49 | input_layer = tf.keras.layers.Input(shape=(train_x.shape[1],)) 50 | params = argparse.Namespace(first_layer_size=50, num_layers=5) 51 | predictions = get_model(input_layer, params) 52 | model = tf.keras.models.Model(inputs=input_layer, outputs=predictions) 53 | model.compile(optimizer="adam", loss=get_loss(), 54 | metrics=["accuracy"]) 55 | model.fit(train_x, train_y, epochs=1) 56 | 57 | 58 | if __name__ == "__main__": 59 | main() 60 | -------------------------------------------------------------------------------- /examples/xgboost/config.yaml.example: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Config file for ML Pipeline Generator. 16 | 17 | project_id: [PROJECT ID] 18 | bucket_id: [BUCKET ID] 19 | region: "us-central1" 20 | scale_tier: "STANDARD_1" 21 | runtime_version: "1.15" 22 | python_version: "3.7" 23 | package_name: "ml_pipeline_gen" 24 | machine_type_pred: "mls1-c4-m2" 25 | 26 | data: 27 | schema: 28 | - "age" 29 | - "workclass" 30 | - "education_num" 31 | - "marital_status" 32 | - "occupation" 33 | - "relationship" 34 | - "race" 35 | - "capital_gain" 36 | - "capital_loss" 37 | - "hours_per_week" 38 | - "native_country" 39 | - "income_bracket" 40 | train: "gs://[BUCKET ID]/[MODEL NAME]/data/adult.data.csv" 41 | evaluation: "gs://[BUCKET ID]/[MODEL NAME]/data/adult.test.csv" 42 | prediction: 43 | input_data_paths: 44 | - "gs://[BUCKET ID]/[MODEL NAME]/inputs/*" 45 | input_format: "JSON" 46 | output_format: "JSON" 47 | 48 | model: 49 | # Name must start with a letter and only contain letters, numbers, and 50 | # underscores. 
51 | name: [MODEL NAME] 52 | path: "model.xgboost_model" 53 | target: "income_bracket" 54 | 55 | model_params: 56 | input_args: 57 | n_estimators: 58 | type: "int" 59 | help: "Number of boosted trees in the ensemble." 60 | default: 10 61 | # Relative path. 62 | hyperparam_config: "hptuning_config.yaml" 63 | -------------------------------------------------------------------------------- /examples/xgboost/demo.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2020 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Demo for XGBoost ML Pipeline Generator.""" 16 | from ml_pipeline_gen.models import XGBoostModel 17 | from model.census_preprocess import load_data 18 | 19 | 20 | def _upload_data_to_gcs(model): 21 | load_data(model.data["train"], model.data["evaluation"]) 22 | 23 | 24 | def main(): 25 | config = "config.yaml" 26 | pred_input = [[ 27 | 7.65000000e+02, 2.81400000e+04, 0.00000000e+00, 1.00000000e+00, 28 | 8.30000000e+01, 3.26000000e+05, 8.30000000e+01, 4.87500000e+00, 29 | 3.60000000e+02, 1.00000000e+00, 3.09730330e+05, 3.25000000e+05, 30 | 1.52696700e+04, 4.67629611e+03, 0.00000000e+00, 3.17866362e+05, 31 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 32 | 0.00000000e+00, 0.00000000e+00, 4.87500000e+00, 4.87500000e+00, 33 | 0.00000000e+00, 4.87500000e+00, 0.00000000e+00, 4.87500000e+00, 34 | 0.00000000e+00, 5.95836265e-06, 0.00000000e+00, 0.00000000e+00, 35 | 0.00000000e+00, 2.63157895e-02, 9.99000000e+02, 9.99000000e+02, 36 | 9.99000000e+02, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 37 | 1.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 38 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 39 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 40 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 41 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 42 | 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 43 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 44 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 45 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 46 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 47 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 48 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 49 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 50 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 51 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 52 | 0.00000000e+00, 1.00000000e+00, 1.00000000e+00, 0.00000000e+00, 53 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 54 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 55 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 56 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 57 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 58 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 59 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 60 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 61 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 62 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 63 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00 64 | ]] 65 | 66 | model = XGBoostModel(config) 67 | model.generate_files() 68 | _upload_data_to_gcs(model) 69 | 70 | job_id = model.train() 71 | version = model.deploy(job_id=job_id) 72 | preds = model.online_predict(pred_input, version=version) 73 | 74 | print("Features: {}".format(pred_input)) 75 | print("Predictions: {}".format(preds)) 76 | 77 | if __name__ == "__main__": 78 | main() 79 | -------------------------------------------------------------------------------- /examples/xgboost/hptuning_config.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | trainingInput: 15 | hyperparameters: 16 | goal: MAXIMIZE 17 | maxTrials: 4 18 | maxParallelTrials: 2 19 | hyperparameterMetricTag: roc_auc 20 | enableTrialEarlyStopping: TRUE 21 | params: 22 | - parameterName: max_depth 23 | type: INTEGER 24 | minValue: 3 25 | maxValue: 8 26 | scaleType: UNIT_LINEAR_SCALE 27 | - parameterName: n_estimators 28 | type: INTEGER 29 | minValue: 1 30 | maxValue: 20 31 | scaleType: UNIT_LINEAR_SCALE 32 | - parameterName: booster 33 | type: CATEGORICAL 34 | categoricalValues: [ 35 | "gbtree", 36 | "gblinear", 37 | "dart" 38 | ] 39 | -------------------------------------------------------------------------------- /examples/xgboost/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ml-pipeline-generator-python/b0508160c1752a149bdc5c862560387fd9c65d77/examples/xgboost/model/__init__.py -------------------------------------------------------------------------------- /examples/xgboost/model/xgboost_model.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2019 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | """Train a simple XGBoost classifier.""" 16 | 17 | import argparse 18 | import numpy as np 19 | from xgboost import XGBClassifier 20 | 21 | from model.census_preprocess import load_data 22 | 23 | TARGET_COLUMN = 'TARGET' 24 | 25 | 26 | def get_model(args): 27 | """Returns an XGBoost model.""" 28 | params = { 29 | 'n_estimators': args.n_estimators, 30 | 'max_depth': args.max_depth, 31 | 'booster': args.booster, 32 | 'min_child_weight': args.min_child_weight, 33 | 'learning_rate': args.learning_rate, 34 | 'gamma': args.gamma, 35 | 'subsample': args.subsample, 36 | 'colsample_bytree': args.colsample_bytree, 37 | 'reg_alpha': args.reg_alpha, 38 | 'num_class': args.num_classes 39 | } 40 | xgb_model = XGBClassifier(**params) 41 | return xgb_model 42 | 43 | 44 | def main(): 45 | """Trains a model locally to test get_model().""" 46 | train_x, train_y, eval_x, eval_y = load_data() 47 | train_y, eval_y = [np.ravel(x) for x in [train_y, eval_y]] 48 | # Illustrative defaults covering every argument get_model() reads. 49 | params = argparse.Namespace( 50 | n_estimators=10, max_depth=6, booster='gbtree', 51 | min_child_weight=1, learning_rate=0.3, gamma=0, 52 | subsample=1.0, colsample_bytree=1.0, reg_alpha=0, 53 | num_classes=2) 54 | model = get_model(params) 55 | model.fit(train_x, train_y) 56 | score = model.score(eval_x, eval_y) 57 | print(score) 58 | 59 | 60 | if __name__ == '__main__': 61 | main() 62 | -------------------------------------------------------------------------------- /ml_pipeline_gen/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = "Michael Hu, Stefan Hosein" 2 | __license__ = "Apache 2.0" 3 | __copyright__ = """ 4 | Copyright 2020 Google Inc. All Rights Reserved. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | """ 18 | __version__ = "0.0.5" 19 | -------------------------------------------------------------------------------- /ml_pipeline_gen/experimental/component_lib.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2020 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Method for generating component files from respective templates.""" 16 | from os import path 17 | import pathlib 18 | 19 | import jinja2 as jinja 20 | from ml_pipeline_gen.parsers import parse_yaml 21 | 22 | 23 | def generate_component(config, name, template_spec='./component_spec.yaml'): 24 | """Generate the component files from the templates.""" 25 | template_spec_path = path.join(path.dirname(__file__), template_spec) 26 | output_spec = parse_yaml(template_spec_path) 27 | current_spec = output_spec[name] 28 | 29 | loader = jinja.PackageLoader('ml_pipeline_gen', current_spec['template_dir']) 30 | env = jinja.Environment(loader=loader, trim_blocks=True, 31 | lstrip_blocks=True) 32 | template_file_list = current_spec['files'] 33 | for template in template_file_list: 34 | template_in = env.get_template(template['input']) 35 | template_out = template_in.render(config=config) 36 | output_file = path.join(config.output_package, template['output']) 37 | pathlib.Path(output_file).parent.mkdir(parents=True, exist_ok=True) 38 | with open(output_file, 'w') as f: 39 | f.write(template_out) 40 | -------------------------------------------------------------------------------- /ml_pipeline_gen/experimental/component_spec.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Reference for template generated pipeline steps 16 | hptune: 17 | template_dir: "./templates/hptune" 18 | files: 19 | - input: 'component.yaml' 20 | output: 'hptune/component.yaml' 21 | - input: 'Dockerfile' 22 | output: 'hptune/Dockerfile' 23 | - input: 'build.sh' 24 | output: 'hptune/build.sh' 25 | - input: 'hptune.sh' 26 | output: 'hptune/hptune.sh' 27 | 28 | 29 | get_tuned_params: 30 | template_dir: "./templates/get_tuned_params" 31 | files: 32 | - input: 'component.yaml' 33 | output: 'get_tuned_params/component.yaml' 34 | - input: 'Dockerfile' 35 | output: 'get_tuned_params/Dockerfile' 36 | - input: 'build.sh' 37 | output: 'get_tuned_params/build.sh' 38 | - input: 'get_tuned_params.py' 39 | output: 'get_tuned_params/get_tuned_params.py' 40 | 41 | -------------------------------------------------------------------------------- /ml_pipeline_gen/parsers.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2020 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Functions for parsing data sources.""" 16 | import types 17 | import yaml 18 | 19 | 20 | # TODO(humichael): Replace with gfile to support GCS. 21 | def parse_yaml(path): 22 | """Parses the given config file.""" 23 | with open(path, "r") as f: 24 | doc = f.read() 25 | return yaml.load(doc, Loader=yaml.FullLoader) 26 | 27 | 28 | class NestedNamespace(types.SimpleNamespace): 29 | """Parses a nested dictionary into a nested namespace object.""" 30 | 31 | def __init__(self, dictionary, **kwargs): 32 | super(NestedNamespace, self).__init__(**kwargs) 33 | for key, value in dictionary.items(): 34 | if isinstance(value, dict): 35 | self.__setattr__(key, NestedNamespace(value)) 36 | elif isinstance(value, list): 37 | self.__setattr__(key, 38 | [NestedNamespace(i) 39 | if isinstance(i, dict) 40 | else i for i in value]) 41 | else: 42 | self.__setattr__(key, value) 43 | 44 | -------------------------------------------------------------------------------- /ml_pipeline_gen/static/bin/cleanup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Copyright 2020 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | rm trainer/task.py 2> /dev/null 17 | rm trainer/model.py 2> /dev/null 18 | rm trainer/inputs.py 2> /dev/null 19 | rm orchestration/pipeline.py 2> /dev/null 20 | rm *.tar.gz 2> /dev/null 21 | rm -rf dist/ 2> /dev/null 22 | rm -rf *.egg-info/ 2> /dev/null 23 | rm -rf models/ 2> /dev/null 24 | -------------------------------------------------------------------------------- /ml_pipeline_gen/static/bin/run.local_train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright 2019 Google Inc. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | # Convenience script for training model locally.
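#
# Example invocation (a sketch; anything after the bare "--" separator below
# is forwarded to trainer.task rather than parsed by gcloud):
#   bin/run.local_train.sh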
18 | PACKAGE_PATH=trainer 19 | MODULE_NAME=trainer.task 20 | 21 | gcloud ai-platform local train \ 22 | --package-path "${PACKAGE_PATH}" \ 23 | --module-name "${MODULE_NAME}" \ 24 | -- 25 | -------------------------------------------------------------------------------- /ml_pipeline_gen/static/orchestration/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ml-pipeline-generator-python/b0508160c1752a149bdc5c862560387fd9c65d77/ml_pipeline_gen/static/orchestration/__init__.py -------------------------------------------------------------------------------- /ml_pipeline_gen/static/orchestration/components/list_blobs.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Modified version of https://github.com/kubeflow/pipelines/blob/master/components/google-cloud/storage/list/component.yaml 16 | 17 | name: List blobs 18 | inputs: 19 | - {name: GCS path, type: String, description: 'GCS path for listing. For recursive listing use the "gs://bucket/path/**" syntax.'} 20 | outputs: 21 | - {name: Paths} 22 | implementation: 23 | container: 24 | image: google/cloud-sdk 25 | command: 26 | - sh 27 | - -ex 28 | - -c 29 | - | 30 | if [ -n "${GOOGLE_APPLICATION_CREDENTIALS}" ]; then 31 | gcloud auth activate-service-account --key-file="${GOOGLE_APPLICATION_CREDENTIALS}" 32 | fi 33 | mkdir -p "$(dirname "$1")" 34 | gsutil ls "$0" | tail -n1 > "$1" 35 | - inputValue: GCS path 36 | - outputPath: Paths 37 | -------------------------------------------------------------------------------- /ml_pipeline_gen/static/trainer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ml-pipeline-generator-python/b0508160c1752a149bdc5c862560387fd9c65d77/ml_pipeline_gen/static/trainer/__init__.py -------------------------------------------------------------------------------- /ml_pipeline_gen/static/trainer/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
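# A minimal usage sketch of dump_object below (assumed paths, for
# illustration only; "my-bucket" is a placeholder, not used elsewhere):
#
#   dump_object(sklearn_model, "gs://my-bucket/model/model.joblib")
#   dump_object(keras_model, "gs://my-bucket/model/export", model_type="tf")
#
# Non-TF objects are pickled with joblib through tf.io.gfile, so the same
# call works for local paths and GCS paths alike.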
14 | """Utility functions.""" 15 | import os 16 | 17 | import tensorflow as tf 18 | 19 | from sklearn.externals import joblib 20 | from google.cloud import storage 21 | 22 | 23 | 24 | def dump_object(obj, output_path, model_type=""): 25 | """Pickle the given object and write to output_path. 26 | 27 | Args: 28 | obj: object to pickle. 29 | output_path: a local or GCS path. 30 | model_type: whether we are saving a TF model or sklearn/xgboost 31 | """ 32 | if not tf.io.gfile.exists(output_path): 33 | tf.io.gfile.makedirs(os.path.dirname(output_path)) 34 | if model_type == "tf": 35 | tf.saved_model.save(obj, output_path) 36 | else: 37 | with tf.io.gfile.GFile(output_path, "w+") as f: 38 | joblib.dump(obj, f) 39 | 40 | 41 | def upload_blob(bucket_name, source_file_name, destination_blob_name): 42 | """Uploads a file to the bucket.""" 43 | # bucket_name = "your-bucket-name" 44 | # source_file_name = "local/path/to/file" 45 | # destination_blob_name = "storage-object-name" 46 | 47 | storage_client = storage.Client() 48 | bucket = storage_client.bucket(bucket_name) 49 | blob = bucket.blob(destination_blob_name) 50 | 51 | blob.upload_from_filename(source_file_name) 52 | 53 | print( 54 | "File {} uploaded to {}.".format( 55 | source_file_name, destination_blob_name 56 | ) 57 | ) -------------------------------------------------------------------------------- /ml_pipeline_gen/templates/experimental/example_pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import kfp\n", 10 | "import kfp.components as comp\n", 11 | "import kfp.dsl as dsl\n", 12 | "from kfp.gcp import use_gcp_secret\n", 13 | "from kfp.components import ComponentStore\n", 14 | "from os import path\n", 15 | "import json" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "cs = ComponentStore(local_search_paths=['.', '{{config.output_package}}'],\n", 25 | " url_search_prefixes=['{{config.github_component_url}}'])" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "pre_process_op = cs.load_component('{{config.preprocess.component}}')\n", 35 | "hpt_op = cs.load_component('hptune')\n", 36 | "param_comp = cs.load_component('get_tuned_params')\n", 37 | "train_op = cs.load_component('{{config.train.component}}')\n", 38 | "deploy_op = cs.load_component('{{config.deploy.component}}')\n" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "@dsl.pipeline(\n", 48 | " name='KFP-Pipelines Example',\n", 49 | " description='Kubeflow pipeline generated from ai-pipeline asset'\n", 50 | ")\n", 51 | "def pipeline_sample(\n", 52 | " project_id='{{config.project_id}}',\n", 53 | " region = '{{config.region}}',\n", 54 | " python_module = '{{config.train.python_module}}',\n", 55 | " package_uri = '{{config.train.python_package}}',\n", 56 | " dataset_bucket = '{{config.bucket_id}}',\n", 57 | " staging_bucket = 'gs://{{config.bucket_id}}',\n", 58 | " job_dir_hptune = 'gs://{{config.bucket_id}}/hptune',\n", 59 | " job_dir_train = 'gs://{{config.bucket_id}}/train',\n", 60 | " runtime_version_train = '{{config.runtime_version}}',\n", 61 | " runtime_version_deploy = '{{config.runtime_version}}',\n", 62 | " 
hptune_config='{{config.hptune.config}}',\n", 63 | " model_id='{{config.deploy.model_id}}',\n", 64 | " version_id='{{config.deploy.version_id}}',\n", 65 | " common_args_hpt=json.dumps([\n", 66 | " {% for arg in config.hptune.args %}", 67 | " {% set name = arg.name %}", 68 | " {% set value = arg.default %}", 69 | " '--{{name}}', '{{value}}' ,\n", 70 | " {% endfor %}", 71 | " ]),\n", 72 | " common_args_train=json.dumps([\n", 73 | " {% for arg in config.train.args %}", 74 | " {% set name = arg.name %}", 75 | " {% set value = arg.default%}", 76 | " '--{{name}}', '{{value}}' ,\n", 77 | " {% endfor %}", 78 | " ]),\n", 79 | " replace_existing_version=True\n", 80 | "):\n", 81 | "\n", 82 | " #Preprocess Task\n", 83 | " pre_process_task = pre_process_op(\n", 84 | " {% for arg in config.preprocess.component_args %}\n", 85 | " {% set name = arg.name %}\n", 86 | " {{name}}={{name}},\n", 87 | " {% endfor %}\n", 88 | " )\n", 89 | "\n", 90 | " # HP tune Task\n", 91 | " hpt_task = hpt_op (\n", 92 | " region = region,\n", 93 | " python_module = python_module,\n", 94 | " package_uri = package_uri,\n", 95 | " staging_bucket = staging_bucket,\n", 96 | " job_dir = job_dir_hptune,\n", 97 | " config=hptune_config,\n", 98 | " runtime_version = runtime_version_train,\n", 99 | " args = common_args_hpt ,\n", 100 | " )\n", 101 | " hpt_task.after(pre_process_task)\n", 102 | "\n", 103 | " # Get the best hyperparameters\n", 104 | " param_task = param_comp (\n", 105 | " project_id=project_id,\n", 106 | " hptune_job_id=hpt_task.outputs['job_id'].to_struct(),\n", 107 | " common_args=common_args_train,\n", 108 | " )\n", 109 | "\n", 110 | " # Train Task\n", 111 | " train_task = train_op (\n", 112 | " project_id = project_id,\n", 113 | " python_module = python_module,\n", 114 | " package_uris = json.dumps([package_uri.to_struct()]),\n", 115 | " region = region,\n", 116 | " args = str(param_task.outputs['tuned_parameters_out']) ,\n", 117 | " job_dir = job_dir_train,\n", 118 | " python_version = '',\n", 119 | " runtime_version = runtime_version_train,\n", 120 | " master_image_uri = '',\n", 121 | " worker_image_uri = '',\n", 122 | " training_input = '',\n", 123 | " job_id_prefix = '',\n", 124 | " wait_interval = '30'\n", 125 | " )\n", 126 | "\n", 127 | " #model_uri=train_task.outputs['job_dir'],\n", 128 | " #model_uri='gs://poc-bucket-0120/train/out/export/exporter',\n", 129 | " deploy_model = deploy_op(\n", 130 | " model_uri=train_task.outputs['job_dir'].to_struct()+'{{config.train.model_out_prefix}}',\n", 131 | " project_id=project_id,\n", 132 | " model_id=model_id,\n", 133 | " version_id=version_id,\n", 134 | " runtime_version=runtime_version_deploy,\n", 135 | " replace_existing_version=replace_existing_version\n", 136 | " )\n", 137 | " kfp.dsl.get_pipeline_conf().add_op_transformer(use_gcp_secret('user-gcp-sa'))\n", 138 | "\n", 139 | "\n" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "client = kfp.Client(host='{{config.kfp_deployment_url}}')\n", 149 | "\n", 150 | "client.create_run_from_pipeline_func(pipeline_sample, arguments={})" 151 | ] 152 | } 153 | ], 154 | "metadata": { 155 | "kernelspec": { 156 | "display_name": "Python 3", 157 | "language": "python", 158 | "name": "python3" 159 | }, 160 | "language_info": { 161 | "codemirror_mode": { 162 | "name": "ipython", 163 | "version": 3 164 | }, 165 | "file_extension": ".py", 166 | "mimetype": "text/x-python", 167 | "name": "python", 168 | "nbconvert_exporter": "python", 169 
| "pygments_lexer": "ipython3", 170 | "version": "3.6.10" 171 | } 172 | }, 173 | "nbformat": 4, 174 | "nbformat_minor": 4 175 | } 176 | -------------------------------------------------------------------------------- /ml_pipeline_gen/templates/experimental/get_tuned_params/Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright 2018 The Kubeflow Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | FROM python:3.6 16 | RUN pip install -U google-api-python-client==1.7.11 17 | RUN pip install -U oauth2client==4.1.3 18 | COPY . / 19 | ENTRYPOINT ["python", "get_tuned_params.py" ] 20 | 21 | -------------------------------------------------------------------------------- /ml_pipeline_gen/templates/experimental/get_tuned_params/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | # Copyright 2018 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | if [ -z "$1" ]; then 18 | PROJECT_ID=$(gcloud config config-helper --format "value(configuration.properties.core.project)") 19 | else 20 | PROJECT_ID=$1 21 | fi 22 | 23 | if [ -z "$2" ]; then 24 | TAG_NAME="latest" 25 | else 26 | TAG_NAME="$2" 27 | fi 28 | 29 | CONTAINER_NAME=ml-pipeline-get-tuned-params 30 | 31 | docker build -t ${CONTAINER_NAME} . 32 | docker tag ${CONTAINER_NAME} gcr.io/${PROJECT_ID}/${CONTAINER_NAME}:${TAG_NAME} 33 | docker push gcr.io/${PROJECT_ID}/${CONTAINER_NAME}:${TAG_NAME} 34 | -------------------------------------------------------------------------------- /ml_pipeline_gen/templates/experimental/get_tuned_params/component.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | name: Get Best Hparam 16 | description: | 17 | A Kubeflow Pipeline component to extract best hyperparameters from a given 18 | hyperparameter job ID for a given project. 19 | inputs: 20 | - name: project_id 21 | description: 'Required. The ID of the parent project of the job.' 22 | type: String 23 | - name: hptune_job_id 24 | description: 'hyperparameter tuning job ID' 25 | type: String 26 | - name: common_args 27 | description: 'Common (non-tunable) args' 28 | type: String 29 | outputs: 30 | - name: tuned_parameters_out 31 | description: 'Tuned parameters from the given job.' 32 | type: String 33 | implementation: 34 | container: 35 | image: gcr.io/gcp-demo-2-262319/ml-pipeline-get-tuned-params:latest 36 | args: [ 37 | --project_id, {inputValue: project_id}, 38 | --hptune_job_id, {inputValue: hptune_job_id}, 39 | --common_args, {inputValue: common_args}, 40 | --tuned_parameters_out, {outputPath: tuned_parameters_out} 41 | ] 42 | -------------------------------------------------------------------------------- /ml_pipeline_gen/templates/experimental/get_tuned_params/get_tuned_params.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2020 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Script to extract hyperparameters from the job ID.""" 16 | import argparse 17 | 18 | from pathlib import Path 19 | 20 | from googleapiclient import discovery 21 | from googleapiclient import errors 22 | from types import SimpleNamespace 23 | import ast 24 | 25 | 26 | # Modified from: https://stackoverflow.com/a/54332748 27 | class NestedNamespace(SimpleNamespace): 28 | """Parses a nested dictionary into a nested namespace object.""" 29 | 30 | def __init__(self, dictionary, **kwargs): 31 | super(NestedNamespace, self).__init__(**kwargs) 32 | for key, value in dictionary.items(): 33 | if isinstance(value, dict): 34 | self.__setattr__(key, NestedNamespace(value)) 35 | elif isinstance(value, list): 36 | self.__setattr__(key, 37 | [NestedNamespace(i) 38 | if isinstance(i, dict) 39 | else i for i in value]) 40 | else: 41 | self.__setattr__(key, value) 42 | 43 | 44 | def print_best_parameters(project_id, 45 | hp_tune_job, 46 | filename='tuned_params', 47 | common_args='[]'): 48 | """Writes the best trial's hyperparameters plus common args to a file.""" 49 | job_id = 'projects/{}/jobs/{}'.format(project_id, hp_tune_job) 50 | 51 | # Build a representation of the Cloud ML API. 52 | ml = discovery.build('ml', 'v1') 53 | 54 | # Create a request to call projects.jobs.get. 55 | request = ml.projects().jobs().get(name=job_id) 56 | # Make the call. 57 | try: 58 | response = request.execute() 59 | except errors.HttpError as err: 60 | # Something went wrong, print out some information.
61 | print('There was an error getting the job info. Check the details:') 62 | print(err._get_reason()) 63 | raise 64 | 65 | job_info = NestedNamespace(response) 66 | param_list = ast.literal_eval(common_args) 67 | for key, value in job_info.trainingOutput.trials[0].hyperparameters.__dict__.items(): 68 | param_list.append('--'+key) 69 | param_list.append(value) 70 | # Create the directory for the output file if it does not already exist. 71 | Path(filename).parent.mkdir(parents=True, exist_ok=True) 72 | with open(filename, 'w') as f: 73 | f.write(str(param_list)) 74 | 75 | if __name__ == '__main__': 76 | parser = argparse.ArgumentParser() 77 | parser.add_argument('--hptune_job_id', 78 | type=str, 79 | required=True, 80 | help='ID of hparam search job') 81 | parser.add_argument('--project_id', 82 | type=str, 83 | required=True, 84 | help='GCP project ID') 85 | parser.add_argument('--common_args', 86 | type=str, 87 | required=True, 88 | help='common (not tunable) arguments for training application') 89 | parser.add_argument('--tuned_parameters_out', 90 | type=str, 91 | required=True, 92 | help='Path to the file containing Tuned Parameters array') 93 | args = parser.parse_args() 94 | print_best_parameters(args.project_id, args.hptune_job_id, args.tuned_parameters_out, args.common_args) 95 | -------------------------------------------------------------------------------- /ml_pipeline_gen/templates/experimental/hptune/Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | FROM google/cloud-sdk:latest 16 | 17 | COPY . / 18 | 19 | ENTRYPOINT ["bash", "/hptune.sh" ] 20 | -------------------------------------------------------------------------------- /ml_pipeline_gen/templates/experimental/hptune/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | # Copyright 2020 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | if [ -z "$1" ]; then 18 | PROJECT_ID=$(gcloud config config-helper --format "value(configuration.properties.core.project)") 19 | else 20 | PROJECT_ID=$1 21 | fi 22 | 23 | if [ -z "$2" ]; then 24 | TAG_NAME="latest" 25 | else 26 | TAG_NAME="$2" 27 | fi 28 | 29 | 30 | CONTAINER_NAME=ml-pipeline-hptune 31 | 32 | docker build -t ${CONTAINER_NAME} .
33 | docker tag ${CONTAINER_NAME} gcr.io/${PROJECT_ID}/${CONTAINER_NAME}:${TAG_NAME} 34 | docker push gcr.io/${PROJECT_ID}/${CONTAINER_NAME}:${TAG_NAME} 35 | -------------------------------------------------------------------------------- /ml_pipeline_gen/templates/experimental/hptune/component.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | name: Submitting a Cloud ML Hyper Parameter Search job as a pipeline step 16 | description: | 17 | A Kubeflow Pipeline component to submit a Cloud Machine Learning (Cloud ML) 18 | Engine Hyperparameter search job as a step in a pipeline. 19 | inputs: 20 | - name: python_module 21 | description: 'The Python module name to run after installing the packages.' 22 | default: '' 23 | type: String 24 | - name: staging_bucket 25 | description: 'The GCS bucket for staging' 26 | default: '' 27 | type: GCSPath 28 | - name: job_dir 29 | description: 'The GCS bucket dir for where the hparam search run files are created.' 30 | default: '' 31 | type: GCSPath 32 | - name: package_uri 33 | description: 'The Cloud Storage location of the training package.' 34 | default: '' 35 | type: GCSPath 36 | - name: region 37 | description: 'The Compute Engine region in which the training job is run.' 38 | default: '' 39 | type: GCPRegion 40 | - name: args 41 | description: 'The command line arguments to pass to the program.' 42 | default: '' 43 | type: List 44 | - name: runtime_version 45 | description: 'The Cloud ML Engine runtime version to use for training' 46 | default: '' 47 | type: String 48 | - name: config 49 | description: 'Path to the hyperparameter tuning config YAML.' 50 | default: '' 51 | type: String 52 | 53 | outputs: 54 | - name: job_id 55 | description: 'The ID of the created job.' 56 | type: String 57 | implementation: 58 | container: 59 | image: gcr.io/gcp-demo-2-262319/ml-pipeline-hptune:latest 60 | args: [ 61 | --python_module, {inputValue: python_module}, 62 | --package_uri, {inputValue: package_uri}, 63 | --region, {inputValue: region}, 64 | --args, {inputValue: args}, 65 | --staging_bucket, {inputValue: staging_bucket}, 66 | --runtime_version, {inputValue: runtime_version}, 67 | --config, {inputValue: config}, 68 | --job_dir, {inputValue: job_dir}, 69 | --job_id, {outputPath: job_id} 70 | 71 | ] 72 | -------------------------------------------------------------------------------- /ml_pipeline_gen/templates/experimental/hptune/hptune.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2019 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at 8 | # 9 | # https://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | while [ $# -ne 0 ]; do 18 | case "$1" in 19 | -h|--help) echo "Usage: ./hptune.sh \\" 20 | echo "--region= \\" 21 | echo "--python_module= \\" 22 | echo "--package_uri= \\" 23 | echo "--job_dir= \\" 24 | echo "--staging_bucket= \\" 25 | echo "--config= \\" 26 | echo "--runtime_version= \\" 27 | echo "--args= \\" 28 | echo "--job_id=" 29 | exit 30 | shift 31 | ;; 32 | --region) REGION=$2 33 | shift 34 | ;; 35 | --python_module) MODULE_NAME=$2 36 | shift 37 | ;; 38 | --package_uri) PACKAGE_URI=$2 39 | shift 40 | ;; 41 | --job_dir) JOB_DIR=$2 42 | shift 43 | ;; 44 | --staging_bucket) STAGING_BUCKET=$2 45 | shift 46 | ;; 47 | --config) CONFIG=$2 48 | shift 49 | ;; 50 | --runtime_version) RUNTIME_VERSION=$2 51 | shift 52 | ;; 53 | --args) ARGS=$2 54 | shift 55 | ;; 56 | ### 57 | --job_id) JOB_ID=$2 58 | shift 59 | ;; 60 | *) shift 61 | ;; 62 | esac 63 | done 64 | echo "Executing $0 $@ . ...." 65 | COMMON_ARGS=`python -c "import ast; print(' '.join(ast.literal_eval('$ARGS')))"` 66 | COMMON_ARGS=`echo $COMMON_ARGS | sed 's/--\([^ ]*\) *\([^-]*\)/--\1=\2/g'` 67 | 68 | JOBNAME=wd_hcr_hptuning_$(date -u +%y%m%d_%H%M) 69 | 70 | gsutil -m rm -rf "$JOB_DIR" || echo "No object was deleted"  # clear any previous job output 71 | gsutil -m cp $CONFIG . 72 | config_file=`basename $CONFIG` 73 | 74 | eval `echo "gcloud ai-platform jobs submit training $JOBNAME \ 75 | --region=$REGION \ 76 | --module-name=$MODULE_NAME \ 77 | --packages=$PACKAGE_URI \ 78 | --job-dir=$JOB_DIR \ 79 | --staging-bucket=$STAGING_BUCKET \ 80 | --config=$config_file \ 81 | --runtime-version=$RUNTIME_VERSION \ 82 | --stream-logs \ 83 | -- \ 84 | $COMMON_ARGS 85 | 86 | "` 87 | 88 | mkdir -p `dirname $JOB_ID` 89 | 90 | echo "$JOBNAME" > $JOB_ID 91 | -------------------------------------------------------------------------------- /ml_pipeline_gen/templates/experimental/hptuning_config.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
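# Each parameterName below must match an argparse flag exposed by the trainer.
# As a rough sketch (illustrative, not literal CAIP output), a trial picking
# batch_size=64 and dnn_lr=0.01 is launched as:
#   python -m trainer.task --batch_size=64 --dnn_lr=0.01 ...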
14 | trainingInput: 15 | hyperparameters: 16 | hyperparameterMetricTag: accuracy 17 | goal: MAXIMIZE 18 | maxTrials: 4 19 | maxParallelTrials: 2 20 | enableTrialEarlyStopping: True 21 | params: 22 | - parameterName: batch_size 23 | type: INTEGER 24 | minValue: 8 25 | maxValue: 512 26 | scaleType: UNIT_LOG_SCALE 27 | - parameterName: dnn_lr 28 | type: DOUBLE 29 | minValue: 0.00001 30 | maxValue: 0.5 31 | scaleType: UNIT_LOG_SCALE 32 | - parameterName: lin_lr 33 | type: DOUBLE 34 | minValue: 0.00001 35 | maxValue: 0.5 36 | scaleType: UNIT_LOG_SCALE 37 | - parameterName: lin_lr_power 38 | type: DOUBLE 39 | minValue: -5 40 | maxValue: 0.0 41 | scaleType: UNIT_LINEAR_SCALE 42 | - parameterName: lin_l1 43 | type: DOUBLE 44 | minValue: 0.01 45 | maxValue: 100 46 | scaleType: UNIT_LOG_SCALE 47 | - parameterName: lin_l2 48 | type: DOUBLE 49 | minValue: 0.01 50 | maxValue: 100 51 | scaleType: UNIT_LOG_SCALE 52 | - parameterName: lin_shrinkage 53 | type: DOUBLE 54 | minValue: 0.00001 55 | maxValue: 100 56 | scaleType: UNIT_LOG_SCALE 57 | -------------------------------------------------------------------------------- /ml_pipeline_gen/templates/experimental/kfp_pipeline_from_config.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2020 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
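# This file is a Jinja2 template: the {{config.*}} placeholders are filled in
# when it is rendered with a parsed config object. A rough sketch of that
# rendering step (assuming a config.yaml providing the referenced keys):
#
#   import jinja2
#   from ml_pipeline_gen.parsers import NestedNamespace, parse_yaml
#   config = NestedNamespace(parse_yaml("config.yaml"))
#   with open("kfp_pipeline_from_config.py") as f:
#       pipeline_py = jinja2.Template(f.read()).render(config=config)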
15 | """Kubeflow Pipeline Example.""" 16 | import json 17 | import kfp 18 | import kfp.dsl as dsl 19 | from kfp.components import ComponentStore 20 | from kfp.gcp import use_gcp_secret 21 | 22 | cs = ComponentStore(local_search_paths=['.', '{{config.output_package}}'], 23 | url_search_prefixes=['{{config.github_component_url}}']) 24 | preprocess_op = cs.load_component('{{config.preprocess.component}}') 25 | hpt_op = cs.load_component('hptune') 26 | param_comp = cs.load_component('get_tuned_params') 27 | train_op = cs.load_component('{{config.train.component}}') 28 | deploy_op = cs.load_component('{{config.deploy.component}}') 29 | 30 | 31 | @dsl.pipeline( 32 | name='KFP-Pipelines Example', 33 | description='Kubeflow pipeline generated from ai-pipeline asset' 34 | ) 35 | def pipeline_sample( 36 | project_id='{{config.project_id}}', 37 | region='{{config.region}}', 38 | python_module='{{config.train.python_module}}', 39 | package_uri='{{config.train.python_package}}', 40 | dataset_bucket='{{config.bucket_id}}', 41 | staging_bucket='gs://{{config.bucket_id}}', 42 | job_dir_hptune='gs://{{config.bucket_id}}/hptune', 43 | job_dir_train='gs://{{config.bucket_id}}/train', 44 | runtime_version_train='{{config.runtime_version}}', 45 | runtime_version_deploy='{{config.runtime_version}}', 46 | hptune_config='{{config.hptune.config}}', 47 | model_id='{{config.deploy.model_id}}', 48 | version_id='{{config.deploy.version_id}}', 49 | common_args_hpt=json.dumps([ 50 | {% for arg in config.hptune.args %} 51 | {% set name = arg.name %} 52 | {% set value = arg.default %} 53 | '--{{name}}', '{{value}}', 54 | {% endfor %} 55 | ]), 56 | common_args_train=json.dumps([ 57 | {% for arg in config.train.args %} 58 | {% set name = arg.name %} 59 | {% set value = arg.default%} 60 | '--{{name}}', '{{value}}', 61 | {% endfor %} 62 | ]), 63 | replace_existing_version=True): 64 | """.""" 65 | preprocess_task = preprocess_op( 66 | {% for arg in config.preprocess.component_args %} 67 | {% set name = arg.name %} 68 | {{name}}={{name}}, 69 | {% endfor %} 70 | ) 71 | 72 | hpt_task = hpt_op( 73 | region=region, 74 | python_module=python_module, 75 | package_uri=package_uri, 76 | staging_bucket=staging_bucket, 77 | job_dir=job_dir_hptune, 78 | config=hptune_config, 79 | runtime_version=runtime_version_train, 80 | args=common_args_hpt 81 | ) 82 | hpt_task.after(preprocess_task) 83 | 84 | param_task = param_comp( 85 | project_id=project_id, 86 | hptune_job_id=hpt_task.outputs['job_id'].to_struct(), 87 | common_args=common_args_train 88 | ) 89 | 90 | train_task = train_op( 91 | project_id=project_id, 92 | python_module=python_module, 93 | package_uris=json.dumps([package_uri.to_struct()]), 94 | region=region, 95 | args=str(param_task.outputs['tuned_parameters_out']), 96 | job_dir=job_dir_train, 97 | python_version='', 98 | runtime_version=runtime_version_train, 99 | master_image_uri='', 100 | worker_image_uri='', 101 | training_input='', 102 | job_id_prefix='', 103 | wait_interval='30' 104 | ) 105 | 106 | deploy_model = deploy_op( # pylint: disable=unused-variable 107 | model_uri=train_task.outputs['job_dir'].to_struct()+'{{config.train.model_out_prefix}}', 108 | project_id=project_id, 109 | model_id=model_id, 110 | version_id=version_id, 111 | runtime_version=runtime_version_deploy, 112 | replace_existing_version=replace_existing_version 113 | ) 114 | 115 | kfp.dsl.get_pipeline_conf().add_op_transformer( 116 | use_gcp_secret('user-gcp-sa')) 117 | 118 | client = kfp.Client(host='{{config.kfp_deployment_url}}') 119 | 120 | 
client.create_run_from_pipeline_func(pipeline_sample, arguments={}) 121 | 122 | -------------------------------------------------------------------------------- /ml_pipeline_gen/templates/kfp_pipeline.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2020 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Defines a KubeFlow pipeline.""" 16 | 17 | import kfp 18 | import kfp.gcp as gcp 19 | from kfp_server_api.rest import ApiException 20 | from typing import NamedTuple 21 | 22 | 23 | # pylint: disable=redefined-outer-name 24 | # pylint: disable=g-import-not-at-top 25 | # pylint: disable=reimported 26 | def make_op_func(func): 27 | """Converts a self-contained python function into an op. 28 | 29 | Args: 30 | func: a python function with no outside dependencies. 31 | 32 | Returns: 33 | A function that ingests PipelineParams, parses them, and passes the 34 | results to the given function, all within a container. 35 | """ 36 | return kfp.components.func_to_container_op(func) 37 | 38 | 39 | def get_train_op(github_url, prev_op_id=""): 40 | """Returns an op for running AI Platform training jobs. 41 | 42 | Args: 43 | github_url: url to the github commit the component definition will be 44 | read from. 45 | prev_op_id: an output from a previous component to use to chain 46 | components together. 47 | 48 | Returns: 49 | A Kubeflow Pipelines component for running training. 50 | """ 51 | {% filter indent(width=4, indentfirst=False) %} 52 | params = {{train_params}} 53 | {% endfilter %} 54 | 55 | params["job_id_prefix"] += prev_op_id 56 | mlengine_train_op = kfp.components.load_component_from_url( 57 | "{}/ml_engine/train/component.yaml".format(github_url)) 58 | train_op = mlengine_train_op(**params) 59 | return train_op 60 | 61 | 62 | def get_model_path(prev_op_id="") -> NamedTuple("params", [ 63 | ("model_path", str), 64 | ("stub", str), 65 | ]): 66 | """Builds a model path prefix to use to search for the export dir.""" 67 | model_path = "{{ model_dir }}" 68 | return (model_path, prev_op_id) 69 | 70 | 71 | def get_model_path_op(prev_op_id): 72 | """Returns a component for getting the model path.""" 73 | model_path_op = make_op_func(get_model_path)(prev_op_id) 74 | list_blobs = kfp.components.load_component( 75 | "orchestration/components/list_blobs.yaml") 76 | gsutil_op = list_blobs(model_path_op.outputs["model_path"]) 77 | return gsutil_op 78 | 79 | 80 | def get_deploy_op(github_url, prev_op_id=""): 81 | """Returns an op for deploying models on CAIP. 82 | 83 | Args: 84 | github_url: url to the github commit the component definition will be 85 | read from. 86 | prev_op_id: an output from a previous component to use to chain 87 | components together. 88 | 89 | Returns: 90 | A Kubeflow Pipelines component for deploying models. 
91 | """ 92 | 93 | {% filter indent(width=4, indentfirst=False) %} 94 | params = {{deploy_params}} 95 | {% endfilter %} 96 | 97 | params["version_id"] = prev_op_id 98 | if "model_uri" not in params: 99 | gsutil_op = get_model_path_op(prev_op_id) 100 | params["model_uri"] = gsutil_op.output 101 | 102 | mlengine_deploy_op = kfp.components.load_component_from_url( 103 | "{}/ml_engine/deploy/component.yaml".format(github_url)) 104 | deploy_op = mlengine_deploy_op(**params) 105 | return deploy_op 106 | 107 | 108 | def get_predict_op(github_url, prev_op_id="", version_name=""): 109 | """Returns an op for running AI Platform batch prediction jobs. 110 | 111 | Args: 112 | github_url: url to the github commit the component definition will be 113 | read from. 114 | prev_op_id: an output from a previous component to use to chain 115 | components together. 116 | version_name: a version name of a deployed model to predict with. 117 | 118 | Returns: 119 | A Kubeflow Pipelines component for running batch predictions. 120 | """ 121 | 122 | {% filter indent(width=4, indentfirst=False) %} 123 | params = {{prediction_params}} 124 | {% endfilter %} 125 | 126 | if prev_op_id: 127 | gsutil_op = get_model_path_op(prev_op_id) 128 | params["model_path"] = gsutil_op.output 129 | elif version_name: 130 | params["model_path"] = version_name 131 | mlengine_batch_predict_op = kfp.components.load_component_from_url( 132 | "{}/ml_engine/batch_predict/component.yaml".format(github_url)) 133 | predict_op = mlengine_batch_predict_op(**params) 134 | return predict_op 135 | 136 | 137 | @kfp.dsl.pipeline( 138 | name="train_pipeline", 139 | description="Pipeline for training a model on CAIP.") 140 | def train_pipeline(): 141 | """Defines a Kubeflow Pipeline.""" 142 | github_url = ("https://raw.githubusercontent.com/kubeflow/pipelines/" 143 | + "02c991dd265054b040265b3dfa1903d5b49df859/components/gcp") 144 | 145 | # TODO(humichael): Add params. 146 | {% for p, c in relations %} 147 | {% set parent = components[p] %} 148 | {% set parent_name = "{}_{}_op".format(parent.role, parent.id) %} 149 | {% set parent_func = "get_{}_op".format(parent.role) %} 150 | {% set parent_out = "version_name" if parent.role == "deploy" else "job_id" %} 151 | {% set connection = "version_name" if parent.role == "deploy" and child.role == "predict" else "prev_op_id" %} 152 | {% set child = components[c] %} 153 | {% set child_name = "{}_{}_op".format(child.role, child.id) %} 154 | {% set child_func = "get_{}_op".format(child.role) %} 155 | 156 | {% if p == -1 %} 157 | {{ child_name }} = {{ child_func }}(github_url) 158 | {% else %} 159 | {{ child_name }} = {{ child_func }}( 160 | github_url, 161 | {{ connection }}={{ parent_name }}.outputs["{{ parent_out }}"], 162 | ) 163 | {% endif %} 164 | {% endfor %} 165 | 166 | 167 | def main(compile=False): 168 | """Compile the pipeline and also create a run.""" 169 | if compile: 170 | kfp.compiler.Compiler().compile(train_pipeline, "train_pipeline.tar.gz") 171 | 172 | try: 173 | client = kfp.Client(host="{{ host }}") 174 | client.create_run_from_pipeline_func(train_pipeline, arguments={}) 175 | except ApiException as e: 176 | print("{0}: KFP Dashboard unreachable. 
Please update config.yaml with the latest hostname.".format(e.reason)) 177 | 178 | 179 | if __name__ == "__main__": 180 | main() 181 | -------------------------------------------------------------------------------- /ml_pipeline_gen/templates/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Config for installing a Python module/package.""" 15 | 16 | from setuptools import find_packages 17 | from setuptools import setup 18 | 19 | NAME = "{{ package_name }}" 20 | VERSION = "1.0" 21 | REQUIRED_PACKAGES = ["gcsfs"] 22 | 23 | setup( 24 | name=NAME, 25 | version=VERSION, 26 | author="Author", 27 | author_email="author@example.com", 28 | packages=find_packages(), 29 | install_requires=REQUIRED_PACKAGES, 30 | url="www.example.com", 31 | ) 32 | -------------------------------------------------------------------------------- /ml_pipeline_gen/templates/sklearn_inputs.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Google Inc. All Rights Reserved. 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Input functions.""" 15 | 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | 20 | import numpy as np 21 | import pandas as pd 22 | 23 | SCHEMA = {{ schema }} 24 | TARGET = "{{ target }}" 25 | 26 | 27 | def download_data(train_path, eval_path): 28 | """Downloads train and eval datasets from GCP. 29 | 30 | Args: 31 | train_path: GCS path to training data. 32 | eval_path: GCS path to evaluation data. 33 | 34 | Returns: 35 | train_x: dataframe of training features. 36 | train_y: dataframe of training labels. 37 | eval_x: dataframe of eval features. 38 | eval_y: dataframe of eval labels. 39 | """ 40 | train_df = pd.read_csv(train_path, names=SCHEMA) 41 | eval_df = pd.read_csv(eval_path, names=SCHEMA) 42 | 43 | train_x, train_y = train_df, train_df.pop(TARGET) 44 | eval_x, eval_y = eval_df, eval_df.pop(TARGET) 45 | train_y, eval_y = [np.ravel(x) for x in [train_y, eval_y]] 46 | return train_x, train_y, eval_x, eval_y 47 | -------------------------------------------------------------------------------- /ml_pipeline_gen/templates/sklearn_model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. 
All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ML model definitions.""" 15 | from {{model_path}} import get_model 16 | 17 | def get_estimator(params): 18 | """Returns a SKLearn model.""" 19 | estimator = get_model(params) 20 | return estimator 21 | -------------------------------------------------------------------------------- /ml_pipeline_gen/templates/sklearn_task.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2020 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Executes model training and evaluation.""" 16 | 17 | import argparse 18 | import json 19 | import logging 20 | import os 21 | import sys 22 | 23 | import hypertune 24 | import numpy as np 25 | from sklearn import model_selection 26 | 27 | from trainer import inputs 28 | from trainer import model 29 | from trainer import utils 30 | 31 | 32 | def _parse_arguments(argv): 33 | """Parses execution arguments and replaces default values. 34 | 35 | Args: 36 | argv: Input arguments from sys. 37 | 38 | Returns: 39 | Dictionary of parsed arguments. 40 | """ 41 | parser = argparse.ArgumentParser() 42 | 43 | # TODO(humichael): Make this into modular template. 44 | {% for name, arg in input_args.items() %} 45 | parser.add_argument( 46 | "--{{name}}", 47 | help="{{arg.help}}", 48 | type={{arg.type}}, 49 | {% if arg.type == "str" and "default" in arg %} 50 | default="{{arg.default}}", 51 | {% elif "default" in arg %} 52 | default={{arg.default}}, 53 | {% endif %} 54 | ) 55 | {% endfor %} 56 | 57 | args, _ = parser.parse_known_args(args=argv[1:]) 58 | return args 59 | 60 | 61 | # TODO(humichael): Evaluate the results. 
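# The helper below fits the estimator, saves it to model_dir with joblib via
# utils.dump_object, cross-validates on the eval split, and reports the mean
# score to CAIP hyperparameter tuning through cloudml-hypertune. During a
# tuning job the service passes the trial number in TF_CONFIG, which
# _get_trial_id parses further down; an illustrative (not real) value:
#   TF_CONFIG='{"task": {"trial": "3"}}'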
62 | def _train_and_evaluate(estimator, dataset, model_dir, params): 63 | """Runs model training and evaluation.""" 64 | x_train, y_train, x_eval, y_eval = dataset 65 | estimator.fit(x_train, y_train) 66 | 67 | model_path = os.path.join(model_dir, "model.joblib") 68 | utils.dump_object(estimator, model_path) 69 | 70 | scores = model_selection.cross_val_score( 71 | estimator, x_eval, y_eval, cv=params.cross_validations) 72 | metric_path = os.path.join(model_dir, "eval_metrics.joblib") 73 | utils.dump_object(scores, metric_path) 74 | 75 | hpt = hypertune.HyperTune() 76 | hpt.report_hyperparameter_tuning_metric( 77 | hyperparameter_metric_tag="score", 78 | metric_value=np.mean(scores)) 79 | 80 | 81 | def _get_trial_id(): 82 | """Returns the trial id if it exists, else "1".""" 83 | trial_id = json.loads( 84 | os.environ.get("TF_CONFIG", "{}")).get("task", {}).get("trial", "") 85 | return trial_id if trial_id else "1" 86 | 87 | 88 | def run_experiment(params): 89 | """Testbed for running model training and evaluation.""" 90 | dataset = inputs.download_data(params.train_path, params.eval_path) 91 | estimator = model.get_estimator(params) 92 | trial_id = _get_trial_id() 93 | model_dir = os.path.join(params.model_dir, trial_id) 94 | _train_and_evaluate(estimator, dataset, model_dir, params) 95 | 96 | 97 | def main(): 98 | """Entry point.""" 99 | args = _parse_arguments(sys.argv) 100 | logging.basicConfig(level="INFO") 101 | run_experiment(args) 102 | 103 | 104 | if __name__ == "__main__": 105 | main() 106 | -------------------------------------------------------------------------------- /ml_pipeline_gen/templates/tf_inputs.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Google Inc. All Rights Reserved. 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Input functions.""" 15 | 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | 20 | import six 21 | import tensorflow as tf 22 | 23 | SCHEMA = {{ schema }} 24 | TARGET = "{{ target }}" 25 | 26 | 27 | def _decode_csv(line): 28 | """Takes the string input tensor and returns a dict of rank-2 tensors.""" 29 | columns = tf.decode_csv(line, record_defaults=[0.0] * len(SCHEMA)) 30 | features = dict(zip(SCHEMA, columns)) 31 | for key, _ in six.iteritems(features): 32 | features[key] = tf.expand_dims(features[key], -1) 33 | return features 34 | 35 | 36 | def get_input_fn(file_pattern, shuffle, batch_size, num_epochs=None, 37 | data_format="csv"): 38 | """Returns an input function. 39 | 40 | One input method is currently supported: 41 | CSV (local or on GCS): provide a file_pattern matching the 42 | input files. 43 | 44 | Args: 45 | file_pattern: pattern of the input files. 46 | shuffle: boolean for whether to shuffle the data or not (set True for 47 | training, False for evaluation). 48 | batch_size: batch size used to read data. 
49 | num_epochs: number of times to iterate over the dataset. 50 | data_format: format of input data. 51 | 52 | Returns: 53 | An input_fn. 54 | 55 | Raises: 56 | RuntimeError: the given data_format is not one of the supported 57 | formats. 58 | """ 59 | def _csv_input_fn(): 60 | """Parses csv input using tf.data.""" 61 | filenames = tf.io.gfile.glob(file_pattern) 62 | dataset = tf.data.TextLineDataset(filenames).map( 63 | _decode_csv, 64 | num_parallel_calls=tf.data.experimental.AUTOTUNE) 65 | if shuffle: 66 | dataset = dataset.shuffle(buffer_size=batch_size * 10) 67 | dataset = dataset.repeat(num_epochs) 68 | dataset = dataset.batch(batch_size) 69 | dataset = dataset.prefetch(buffer_size=10) 70 | features = dataset.make_one_shot_iterator().get_next() 71 | return features, features.pop(TARGET) 72 | 73 | data_formats = { 74 | "csv": _csv_input_fn, 75 | } 76 | if data_format in data_formats: 77 | return data_formats[data_format] 78 | raise RuntimeError("Invalid arguments") 79 | 80 | 81 | def get_serving_input_fn(data_format): 82 | """Returns a serving input function based on the given format. 83 | 84 | Args: 85 | data_format: format of input data. 86 | 87 | Returns: 88 | An input fn for serving. 89 | 90 | Raises: 91 | KeyError: the given data_format is invalid. 92 | """ 93 | 94 | def _csv_serving_input_fn(): 95 | """Build the serving inputs.""" 96 | csv_row = tf.placeholder(shape=[None], dtype=tf.string) 97 | features = _decode_csv(csv_row) 98 | return tf.estimator.export.ServingInputReceiver( 99 | features, {"csv_row": csv_row}) 100 | 101 | def _json_serving_input_fn(): 102 | """Build the serving inputs.""" 103 | inputs = {} 104 | for col in SCHEMA: 105 | if col != TARGET: 106 | inputs[col] = tf.placeholder(shape=[None], dtype=float) 107 | return tf.estimator.export.ServingInputReceiver(inputs, inputs) 108 | 109 | data_formats = { 110 | "csv": _csv_serving_input_fn, 111 | "json": _json_serving_input_fn, 112 | } 113 | if data_format in data_formats: 114 | return data_formats[data_format] 115 | raise KeyError("Invalid arguments") 116 | -------------------------------------------------------------------------------- /ml_pipeline_gen/templates/tf_model.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2020 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ML model definitions.""" 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | 20 | import json 21 | import os 22 | 23 | import tensorflow.compat.v1 as tf 24 | 25 | from trainer import inputs 26 | from {{ model_path }} import get_model 27 | from {{ model_path }} import get_loss 28 | 29 | 30 | # pylint: disable=unused-argument 31 | def _model_fn(features, labels, mode, params): 32 | """Builds an EstimatorSpec. 33 | 34 | Args: 35 | features: a dict mapping feature names to tensors. 36 | labels: a tensor of labels. 
37 | mode: a tf.estimator.ModeKey signifying the Estimator mode. 38 | params: hyperparameters for the model. 39 | 40 | Returns: 41 | an EstimatorSpec that defines the model to be run by an Estimator. 42 | """ 43 | schema = [x for x in inputs.SCHEMA if x != inputs.TARGET] 44 | feature_columns = [tf.feature_column.numeric_column( 45 | col, shape=(1,), dtype=tf.dtypes.float32) for col in schema] 46 | input_layer = tf.feature_column.input_layer(features, feature_columns) 47 | # TODO(humichael): support multiple outputs. 48 | predictions = get_model(input_layer, params) 49 | 50 | if mode == tf.estimator.ModeKeys.PREDICT: 51 | prediction_out = { 52 | "predictions": predictions, 53 | } 54 | return tf.estimator.EstimatorSpec(mode, predictions=prediction_out) 55 | 56 | loss = get_loss()(labels, predictions) 57 | metrics = {} 58 | 59 | {% for metric in metrics %} 60 | key = "{{ metric }}" 61 | # TODO(humichael): how to generate this from user? 62 | # may tie in with multiple outputs. Use logits for loss, preds for eval. 63 | predictions = tf.round(predictions) 64 | metric = tf.metrics.{{ metric }}(labels, predictions) 65 | metrics[key] = metric 66 | tf.summary.scalar(key, metric[1]) 67 | {% endfor %} 68 | 69 | tf.summary.merge_all() 70 | 71 | if mode == tf.estimator.ModeKeys.EVAL: 72 | return tf.estimator.EstimatorSpec( 73 | mode, loss=loss, eval_metric_ops=metrics) 74 | 75 | optimizer = tf.train.AdagradOptimizer(learning_rate=params.learning_rate) 76 | train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step()) 77 | 78 | hook = tf.estimator.LoggingTensorHook( 79 | [input_layer[:5], labels[:5], predictions[:5]], at_end=True) 80 | return tf.estimator.EstimatorSpec( 81 | mode, loss=loss, train_op=train_op, training_hooks=[hook]) 82 | 83 | 84 | def _get_trial_id(): 85 | """Returns the trial id if it exists, else "1".""" 86 | trial_id = json.loads( 87 | os.environ.get("TF_CONFIG", "{}")).get("task", {}).get("trial", "") 88 | return trial_id if trial_id else "1" 89 | 90 | 91 | def get_estimator(params): 92 | """Returns a tf.Estimator for training and evaluation. 93 | 94 | Args: 95 | params: a dict of hyperparameters for the model. 96 | 97 | Returns: 98 | A tf.Estimator. 99 | """ 100 | config = tf.estimator.RunConfig( 101 | save_checkpoints_steps=params.save_checkpoints_steps, 102 | keep_checkpoint_max=params.keep_checkpoint_max, 103 | log_step_count_steps=params.log_step_count_steps) 104 | trial_id = _get_trial_id() 105 | model_dir = os.path.join(params.model_dir, trial_id) 106 | 107 | estimator = tf.estimator.Estimator( 108 | model_fn=_model_fn, 109 | model_dir=model_dir, 110 | config=config, 111 | params=params) 112 | return estimator 113 | 114 | 115 | -------------------------------------------------------------------------------- /ml_pipeline_gen/templates/tf_task.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2020 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Main script to train the model.""" 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | 20 | import argparse 21 | import sys 22 | 23 | import tensorflow.compat.v1 as tf 24 | 25 | from trainer import inputs 26 | from trainer import model 27 | 28 | 29 | def _parse_arguments(argv): 30 | """Parses execution arguments and replaces default values. 31 | 32 | Args: 33 | argv: Input arguments from sys. 34 | 35 | Returns: 36 | Dictionary of parsed arguments. 37 | """ 38 | parser = argparse.ArgumentParser() 39 | 40 | {% for name, arg in input_args.items() %} 41 | parser.add_argument( 42 | "--{{name}}", 43 | help="{{arg.help}}", 44 | type={{arg.type}}, 45 | {% if arg.type == "str" and "default" in arg %} 46 | default="{{arg.default}}", 47 | {% elif "default" in arg %} 48 | default={{arg.default}}, 49 | {% endif %} 50 | ) 51 | {% endfor %} 52 | 53 | args, _ = parser.parse_known_args(args=argv[1:]) 54 | return args 55 | 56 | 57 | def run_training(params): 58 | """Initializes the estimator and runs train_and_evaluate.""" 59 | estimator = model.get_estimator(params) 60 | train_input_fn = inputs.get_input_fn( 61 | params.train_path, 62 | shuffle=True, 63 | batch_size=params.batch_size, 64 | num_epochs=params.num_epochs, 65 | ) 66 | train_spec = tf.estimator.TrainSpec( 67 | input_fn=train_input_fn, 68 | max_steps=params.max_steps, 69 | ) 70 | eval_input_fn = inputs.get_input_fn( 71 | params.eval_path, 72 | shuffle=False, 73 | batch_size=params.batch_size, 74 | ) 75 | exporter = tf.estimator.BestExporter( 76 | "export", inputs.get_serving_input_fn(params.export_format), 77 | exports_to_keep=1) 78 | eval_spec = tf.estimator.EvalSpec( 79 | input_fn=eval_input_fn, 80 | throttle_secs=1, 81 | steps=params.eval_steps, 82 | start_delay_secs=1, 83 | exporters=[exporter], 84 | ) 85 | tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec) 86 | 87 | 88 | def main(): 89 | """Trains a model.""" 90 | params = _parse_arguments(sys.argv) 91 | tf.logging.set_verbosity(tf.logging.INFO) 92 | run_training(params) 93 | 94 | 95 | if __name__ == "__main__": 96 | main() 97 | -------------------------------------------------------------------------------- /ml_pipeline_gen/templates/xgboost_inputs.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2020 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Input functions.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import numpy as np 22 | import pandas as pd 23 | 24 | SCHEMA = {{ schema }} 25 | TARGET = "{{ target }}" 26 | 27 | 28 | def download_data(train_path, eval_path): 29 | """Downloads train and eval datasets from GCP. 
30 | 31 | Args: 32 | train_path: GCS path to training data. 33 | eval_path: GCS path to evaluation data. 34 | 35 | Returns: 36 | train_x: dataframe of training features. 37 | train_y: dataframe of training labels. 38 | eval_x: dataframe of eval features. 39 | eval_y: dataframe of eval labels. 40 | """ 41 | 42 | train_df = pd.read_csv(train_path, names=SCHEMA) 43 | eval_df = pd.read_csv(eval_path, names=SCHEMA) 44 | train_x, train_y = train_df.drop(TARGET, axis=1), train_df[TARGET] 45 | eval_x, eval_y = eval_df.drop(TARGET, axis=1), eval_df[TARGET] 46 | train_y, eval_y = [np.ravel(x) for x in [train_y, eval_y]] 47 | 48 | return train_x, train_y, eval_x, eval_y 49 | -------------------------------------------------------------------------------- /ml_pipeline_gen/templates/xgboost_model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ML model definition.""" 15 | from {{model_path}} import get_model 16 | 17 | def get_estimator(params): 18 | """Returns an XGBoost model.""" 19 | estimator = get_model(params) 20 | return estimator 21 | -------------------------------------------------------------------------------- /ml_pipeline_gen/templates/xgboost_task.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Executes model training and evaluation.""" 15 | 16 | import argparse 17 | import json 18 | import logging 19 | import os 20 | import sys 21 | 22 | import hypertune 23 | from sklearn import metrics 24 | from sklearn import preprocessing 25 | 26 | from trainer import inputs 27 | from trainer import model 28 | from trainer import utils 29 | 30 | 31 | def _parse_arguments(argv): 32 | """Parses execution arguments and replaces default values. 33 | 34 | Args: 35 | argv: Input arguments from sys. 36 | 37 | Returns: 38 | Dictionary of parsed arguments. 39 | """ 40 | parser = argparse.ArgumentParser() 41 | 42 | # TODO(humichael): Make this into modular template. 
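# The Jinja loop below expands one parser.add_argument call per configured
# input arg. For a hypothetical config entry such as
#   input_args: {max_depth: {type: int, default: 6, help: "Max tree depth."}}
# the rendered code would read:
#   parser.add_argument("--max_depth", help="Max tree depth.", type=int,
#                       default=6)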
43 | {% for name, arg in input_args.items() %} 44 | parser.add_argument( 45 | "--{{name}}", 46 | help="{{arg.help}}", 47 | type={{arg.type}}, 48 | {% if arg.type == "str" and "default" in arg %} 49 | default="{{arg.default}}", 50 | {% elif "default" in arg %} 51 | default={{arg.default}}, 52 | {% endif %} 53 | ) 54 | {% endfor %} 55 | 56 | args, _ = parser.parse_known_args(args=argv[1:]) 57 | return args 58 | 59 | 60 | def _get_trial_id(): 61 | """Returns the trial id if it exists, else "1".""" 62 | trial_id = json.loads( 63 | os.environ.get("TF_CONFIG", "{}")).get("task", {}).get("trial", 64 | "") 65 | return trial_id if trial_id else "1" 66 | 67 | 68 | def _train_and_evaluate(estimator, dataset, model_dir): 69 | """Runs model training and evaluation.""" 70 | x_train, y_train, x_eval, y_eval = dataset 71 | estimator.fit(x_train, y_train) 72 | logging.info("Completed training XGBoost model") 73 | 74 | bst = estimator.get_booster() 75 | bst_filename = 'model.bst' 76 | bst.save_model(bst_filename) 77 | model_output_path = os.path.join(model_dir, bst_filename) 78 | utils.upload_blob(model_output_path.split("/")[2], bst_filename, 79 | "/".join(model_output_path.split("/")[3:])) 80 | logging.info("Successfully uploaded file to GCS at location %s", 81 | model_dir) 82 | y_pred = estimator.predict(x_eval) 83 | 84 | # Binarize multiclass labels 85 | lb = preprocessing.LabelBinarizer() 86 | lb.fit(y_eval) 87 | y_test = lb.transform(y_eval) 88 | y_pred = lb.transform(y_pred) 89 | 90 | score = metrics.roc_auc_score(y_test, y_pred, average='macro') 91 | logging.info("AUC Score: %s", str(score)) 92 | 93 | hpt = hypertune.HyperTune() 94 | hpt.report_hyperparameter_tuning_metric( 95 | hyperparameter_metric_tag='roc_auc', 96 | metric_value=score, 97 | global_step=1000 98 | ) 99 | 100 | 101 | def run_experiment(params): 102 | """Testbed for running model training and evaluation.""" 103 | dataset = inputs.download_data(params.train_path, params.eval_path) 104 | estimator = model.get_estimator(params) 105 | trial_id = _get_trial_id() 106 | model_dir = os.path.join(params.model_dir, trial_id) 107 | _train_and_evaluate(estimator, dataset, model_dir) 108 | 109 | 110 | def main(): 111 | """Entry point.""" 112 | args = _parse_arguments(sys.argv) 113 | logging.basicConfig(level="INFO") 114 | run_experiment(args) 115 | 116 | 117 | if __name__ == "__main__": 118 | main() 119 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the 'License'); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an 'AS IS' BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Config for installing a Python module/package.""" 15 | 16 | import setuptools 17 | import ml_pipeline_gen 18 | 19 | with open('README.md', 'r') as f: 20 | long_description = f.read() 21 | 22 | setuptools.setup( 23 | name='ml-pipeline-gen', 24 | version=ml_pipeline_gen.__version__, 25 | author='Michael Hu', 26 | author_email='author@example.com', 27 | description='A tool for generating end-to-end pipelines on GCP.', 28 | long_description=long_description, 29 | long_description_content_type='text/markdown', 30 | url='https://github.com/GoogleCloudPlatform/ml-pipeline-generator-python', 31 | packages=['ml_pipeline_gen'], 32 | install_requires=[ 33 | 'cloudml-hypertune>=0.1.0.dev6', 34 | 'gcsfs>=0.6.2', 35 | 'google-api-python-client>=1.9.3', 36 | 'google-cloud-container>=0.5.0', 37 | 'jinja2>=2.11.2', 38 | 'joblib>=0.15.1', 39 | 'kfp>=0.5.1', 40 | 'pandas>=1.0.4', 41 | 'pyyaml>=5.3.1', 42 | 'scikit-learn>=0.23.1', 43 | 'tensorflow>=1.14.0,<2.0.0', 44 | 'xgboost>=1.1.1', 45 | ], 46 | extras_require={ 47 | 'dev': [ 48 | 'mock', 49 | ] 50 | }, 51 | classifiers=[ 52 | 'Programming Language :: Python :: 3.6', 53 | 'Programming Language :: Python :: 3.7', 54 | 'License :: OSI Approved :: Apache Software License', 55 | 'Operating System :: OS Independent', 56 | ], 57 | python_requires='>=3.6', 58 | include_package_data=True, 59 | ) 60 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ml-pipeline-generator-python/b0508160c1752a149bdc5c862560387fd9c65d77/tests/__init__.py -------------------------------------------------------------------------------- /tests/integration/fixtures/test_config.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Config file for ML Pipeline Generator. 
16 | 17 | project_id: "ml-pipeline-gen" 18 | bucket_id: "ml-pipeline-gen-test" 19 | cluster_name: "cluster-test" 20 | region: "us-central1" 21 | zone: "us-central1-a" 22 | scale_tier: "STANDARD_1" 23 | runtime_version: "1.15" 24 | python_version: "3.7" 25 | package_name: "ml_pipeline_gen" 26 | machine_type_pred: "n1-standard-4" 27 | 28 | data: 29 | schema: 30 | - "age" 31 | - "workclass" 32 | - "education_num" 33 | - "marital_status" 34 | - "occupation" 35 | - "relationship" 36 | - "race" 37 | - "capital_gain" 38 | - "capital_loss" 39 | - "hours_per_week" 40 | - "native_country" 41 | - "income_bracket" 42 | train: "gs://ml-pipeline-gen-test/test_model/data/adult.data.csv" 43 | evaluation: "gs://ml-pipeline-gen-test/test_model/data/adult.test.csv" 44 | prediction: 45 | input_data_paths: 46 | - "gs://ml-pipeline-gen-test/test_model/inputs/*" 47 | input_format: "JSON" 48 | output_format: "JSON" 49 | 50 | model: 51 | # Name must start with a letter and only contain letters, numbers, and 52 | # underscores. 53 | name: "test_model" 54 | path: "model.test_model" 55 | target: "income_bracket" 56 | metrics: 57 | - "accuracy" 58 | 59 | model_params: 60 | # Relative path. 61 | hyperparam_config: "hptuning_config.yaml" 62 | explain_output: 63 | explain_type: "sampledShapleyAttribution" 64 | explain_param: 65 | name: "numPaths" 66 | value: 40 67 | 68 | orchestration: 69 | host: "https://5e892ccf4c09b627-dot-us-central2.pipelines.googleusercontent.com" 70 | -------------------------------------------------------------------------------- /tests/integration/src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ml-pipeline-generator-python/b0508160c1752a149bdc5c862560387fd9c65d77/tests/integration/src/__init__.py -------------------------------------------------------------------------------- /tests/integration/src/test_models.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2020 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Integration tests for models classes.""" 16 | import mock 17 | import os 18 | import shutil 19 | import tempfile 20 | import time 21 | import unittest 22 | 23 | from googleapiclient import discovery 24 | from tensorflow.io import gfile 25 | 26 | from ml_pipeline_gen.models import BaseModel 27 | from ml_pipeline_gen.models import SklearnModel 28 | 29 | 30 | class TestSklearnModel(unittest.TestCase): 31 | """Tests SklearnModel class.""" 32 | 33 | @classmethod 34 | def setUpClass(cls): 35 | """Copies a demo and instantiates a model.""" 36 | super(TestSklearnModel, cls).setUpClass() 37 | cls.cwd = os.getcwd() 38 | cls.test_dir = tempfile.mkdtemp() 39 | cls.demo_dir = os.path.join(cls.test_dir, 'demo') 40 | shutil.copytree('examples/sklearn', cls.demo_dir) 41 | shutil.copyfile('tests/integration/fixtures/test_config.yaml', 42 | os.path.join(cls.demo_dir, 'test_config.yaml')) 43 | os.chdir(cls.demo_dir) 44 | 45 | @classmethod 46 | def tearDownClass(cls): 47 | """Switch back to the original working dir and removes the demo.""" 48 | super(TestSklearnModel, cls).tearDownClass() 49 | os.chdir(cls.cwd) 50 | shutil.rmtree(cls.test_dir) 51 | 52 | def modify_config(self): 53 | self.model.model['name'] = 'test_model_{}'.format(self.now) 54 | self.model.model['path'] = 'model.sklearn_model' 55 | self.model.model_params['input_args']['C'] = { 56 | 'type': 'float', 57 | 'default': 1.0, 58 | } 59 | 60 | def setUp(self): 61 | super(TestSklearnModel, self).setUp() 62 | # Delete models if exists 63 | self.now = int(time.time()) 64 | self.model = SklearnModel('test_config.yaml') 65 | self.modify_config() 66 | 67 | self.gcs_path = 'gs://ml-pipeline-gen-test/test_model_{}'.format( 68 | self.now) 69 | self.model_dir = os.path.join(self.gcs_path, 'models') 70 | 71 | def tearDown(self): 72 | super(TestSklearnModel, self).tearDown() 73 | self.model.clean_up() 74 | if gfile.exists(self.gcs_path): 75 | gfile.rmtree(self.gcs_path) 76 | 77 | def test_cloud_train(self): 78 | """Tests training on CAIP.""" 79 | self.model.generate_files() 80 | self.model.train(tune=False) 81 | 82 | self.assertTrue(gfile.exists(self.model_dir)) 83 | export_path = os.path.join(self.model_dir, '1', 'model.joblib') 84 | self.assertTrue(gfile.exists(export_path)) 85 | 86 | 87 | if __name__ == '__main__': 88 | unittest.main() 89 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | # Lint as: python3 2 | """Utils for testing.""" 3 | import importlib.util 4 | import sys 5 | 6 | 7 | def load_module(name, path): 8 | spec = importlib.util.spec_from_file_location(name, path) 9 | module = importlib.util.module_from_spec(spec) 10 | sys.modules[name] = module 11 | spec.loader.exec_module(module) 12 | return module 13 | -------------------------------------------------------------------------------- /tests/unit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ml-pipeline-generator-python/b0508160c1752a149bdc5c862560387fd9c65d77/tests/unit/__init__.py -------------------------------------------------------------------------------- /tests/unit/examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ml-pipeline-generator-python/b0508160c1752a149bdc5c862560387fd9c65d77/tests/unit/examples/__init__.py 
-------------------------------------------------------------------------------- /tests/unit/examples/sklearn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ml-pipeline-generator-python/b0508160c1752a149bdc5c862560387fd9c65d77/tests/unit/examples/sklearn/__init__.py -------------------------------------------------------------------------------- /tests/unit/examples/sklearn/test_sklearn_model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Unit tests demo scikit-learn model.""" 15 | import argparse 16 | import os 17 | import shutil 18 | import sys 19 | import tempfile 20 | import unittest 21 | 22 | from tests import test_utils 23 | 24 | 25 | class TestModel(unittest.TestCase): 26 | """Tests demo model.""" 27 | 28 | @classmethod 29 | def setUpClass(cls): 30 | super(TestModel, cls).setUpClass() 31 | cls.test_dir = tempfile.mkdtemp() 32 | cls.demo_dir = os.path.join(cls.test_dir, 'demo') 33 | shutil.copytree('examples/sklearn', cls.demo_dir) 34 | 35 | # TODO(humichael) We can't import the model using __import__ because 36 | # several other examples are also adding their demo dirs to sys.path. 37 | # It's very likely the model module that is imported is not the one from 38 | # this test. All examples currently use the same census_preprocess. 39 | # These tests will break if any example uses a different preprocessing 40 | # script. 41 | # We should just mock this. 
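# A possible shape for that mock, assuming unittest.mock (sketch only, not
# wired into this test): stub out the preprocess module before loading the
# model, e.g.
#   fake_preprocess = mock.Mock()
#   fake_preprocess.load_data.return_value = (features, labels, None, None)
#   with mock.patch.dict(sys.modules, {'census_preprocess': fake_preprocess}):
#       ...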
42 | sys.path.append(cls.demo_dir) 43 | sklearn_model = test_utils.load_module( 44 | 'sklearn_model', os.path.join( 45 | cls.demo_dir, 'model', 'sklearn_model.py')) 46 | sklearn_preprocess = test_utils.load_module( 47 | 'sklearn_preprocess', os.path.join( 48 | cls.demo_dir, 'model', 'census_preprocess.py')) 49 | sys.path.remove(cls.demo_dir) 50 | params = argparse.Namespace(C=1.0) 51 | cls.model = sklearn_model.get_model(params) 52 | cls.features, cls.labels, _, _ = sklearn_preprocess.load_data() 53 | 54 | @classmethod 55 | def tearDownClass(cls): 56 | super(TestModel, cls).tearDownClass() 57 | shutil.rmtree(cls.test_dir) 58 | 59 | def setUp(self): 60 | super(TestModel, self).setUp() 61 | self.model = self.__class__.model 62 | self.features = self.__class__.features 63 | self.labels = self.__class__.labels 64 | 65 | def test_get_data(self): 66 | """Checks that there is a label for each feature.""" 67 | self.assertEqual(self.features.shape[0], self.labels.shape[0]) 68 | 69 | def test_get_model(self): 70 | """Checks that the model can be trained and used for predictions.""" 71 | self.model.fit(self.features, self.labels) 72 | preds = self.model.predict(self.features) 73 | self.assertEqual(preds.shape[0], self.labels.shape[0]) 74 | 75 | 76 | if __name__ == '__main__': 77 | unittest.main() 78 | -------------------------------------------------------------------------------- /tests/unit/examples/tensorflow/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ml-pipeline-generator-python/b0508160c1752a149bdc5c862560387fd9c65d77/tests/unit/examples/tensorflow/__init__.py -------------------------------------------------------------------------------- /tests/unit/examples/tensorflow/test_tf_model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Unit tests demo TF model.""" 15 | import argparse 16 | import os 17 | import shutil 18 | import sys 19 | import tempfile 20 | import unittest 21 | 22 | import tensorflow.compat.v1 as tf 23 | 24 | from tests import test_utils 25 | 26 | 27 | class TestModel(tf.test.TestCase): 28 | """Tests TF demo model.""" 29 | 30 | @classmethod 31 | def setUpClass(cls): 32 | super(TestModel, cls).setUpClass() 33 | cls.test_dir = tempfile.mkdtemp() 34 | cls.demo_dir = os.path.join(cls.test_dir, 'demo') 35 | shutil.copytree('examples/tf', cls.demo_dir) 36 | 37 | # TODO(humichael) We can't import the model using __import__ because 38 | # several other examples are also adding their demo dirs to sys.path. 39 | # It's very likely the model module that is imported is not the one from 40 | # this test. All examples currently use the same census_preprocess. 41 | # These tests will break if any example uses a different preprocessing 42 | # script. 43 | # We should just mock this. 
44 | sys.path.append(cls.demo_dir) 45 | tf_model = test_utils.load_module( 46 | 'tf_model', os.path.join(cls.demo_dir, 'model', 'tf_model.py')) 47 | tf_preprocess = test_utils.load_module( 48 | 'tf_preprocess', os.path.join( 49 | cls.demo_dir, 'model', 'census_preprocess.py')) 50 | sys.path.remove(cls.demo_dir) 51 | 52 | cls.features, cls.labels, _, _ = tf_preprocess.load_data() 53 | cls.model = tf_model 54 | 55 | @classmethod 56 | def tearDownClass(cls): 57 | super(TestModel, cls).tearDownClass() 58 | shutil.rmtree(cls.test_dir) 59 | 60 | # pylint: disable=g-import-not-at-top 61 | def setUp(self): 62 | super(TestModel, self).setUp() 63 | self.model = self.__class__.model 64 | self.features = self.__class__.features 65 | self.labels = self.__class__.labels 66 | 67 | def test_get_data(self): 68 | """Checks that there is a label for each feature.""" 69 | self.assertEqual(self.features.shape[0], self.labels.shape[0]) 70 | 71 | def test_get_model(self): 72 | """Checks that the model can be trained and used for predictions.""" 73 | input_layer = tf.keras.layers.Input(shape=(self.features.shape[1],)) 74 | params = argparse.Namespace(first_layer_size=50, num_layers=5) 75 | predictions = self.model.get_model(input_layer, params) 76 | 77 | model = tf.keras.models.Model(inputs=input_layer, outputs=predictions) 78 | model.compile(optimizer='adam', loss=tf.losses.sigmoid_cross_entropy, 79 | metrics=['accuracy']) 80 | model.fit(self.features, self.labels) 81 | preds = model.predict(self.features) 82 | self.assertEqual(preds.shape[0], self.labels.shape[0]) 83 | 84 | 85 | if __name__ == '__main__': 86 | unittest.main() 87 | -------------------------------------------------------------------------------- /tests/unit/src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ml-pipeline-generator-python/b0508160c1752a149bdc5c862560387fd9c65d77/tests/unit/src/__init__.py -------------------------------------------------------------------------------- /tests/unit/src/test_models.py: -------------------------------------------------------------------------------- 1 | # python3 2 | # Copyright 2020 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Unit tests for models classes.""" 16 | import mock 17 | import os 18 | import shutil 19 | import tempfile 20 | import unittest 21 | 22 | from googleapiclient import discovery 23 | 24 | from ml_pipeline_gen.models import BaseModel 25 | from ml_pipeline_gen.models import SklearnModel 26 | 27 | 28 | class TestBaseModel(unittest.TestCase): 29 | """Tests BaseModel class.""" 30 | 31 | def test_init(self): 32 | """Ensure BaseModel is abstract.""" 33 | with self.assertRaises(TypeError): 34 | BaseModel() 35 | 36 | 37 | class TestSklearnModel(unittest.TestCase): 38 | """Tests SklearnModel class.""" 39 | 40 | @classmethod 41 | @mock.patch.object(discovery, 'build') 42 | def setUpClass(cls, build_mock): 43 | """Copies a demo and instantiates a model.""" 44 | super(TestSklearnModel, cls).setUpClass() 45 | build_mock.return_value = None 46 | cls.cwd = os.getcwd() 47 | cls.test_dir = tempfile.mkdtemp() 48 | cls.demo_dir = os.path.join(cls.test_dir, 'demo') 49 | shutil.copytree('examples/sklearn', cls.demo_dir) 50 | 51 | os.chdir(cls.demo_dir) 52 | cls.config = 'config.yaml.example' 53 | cls.model = SklearnModel(cls.config) 54 | 55 | @classmethod 56 | def tearDownClass(cls): 57 | """Switch back to the original working dir and removes the demo.""" 58 | super(TestSklearnModel, cls).tearDownClass() 59 | os.chdir(cls.cwd) 60 | shutil.rmtree(cls.test_dir) 61 | 62 | def setUp(self): 63 | super(TestSklearnModel, self).setUp() 64 | self.model = self.__class__.model 65 | 66 | def tearDown(self): 67 | super(TestSklearnModel, self).tearDown() 68 | try: 69 | self.__class__.model.clean_up() 70 | except FileNotFoundError: 71 | pass 72 | 73 | def test_generate_files(self): 74 | """Ensures task.py and model.py are created.""" 75 | self.assertFalse(os.path.exists('trainer')) 76 | self.model.generate_files() 77 | self.assertTrue(os.path.exists('trainer')) 78 | trainer_files = os.listdir('trainer') 79 | self.assertIn('task.py', trainer_files) 80 | self.assertIn('model.py', trainer_files) 81 | 82 | @unittest.skip('How to test without running training?') 83 | def test_local_train(self): 84 | """Tests local training.""" 85 | self.model.generate_files() 86 | self.model.train() 87 | model_files = os.listdir('models') 88 | self.assertIn('{}.joblib'.format(self.model.model['name']), model_files) 89 | 90 | # TODO(humichael): Need to spoof CAIP calls to test this. 91 | def test_cloud_train(self): 92 | """Tests training on CAIP.""" 93 | pass 94 | 95 | # TODO(humichael): Need to spoof CAIP calls to test this. 96 | def test_serve(self): 97 | """Tests serving.""" 98 | pass 99 | 100 | 101 | if __name__ == '__main__': 102 | unittest.main() 103 | --------------------------------------------------------------------------------