├── lesson1_setup
│   ├── step6_kubeflow_check.sh
│   ├── delete_kubeflow.sh
│   ├── step5_kubeflow_pipeline_sdk.sh
│   ├── step1_install_docker.sh
│   ├── step4_port_forward_gateway.sh
│   ├── step2_install_kfctl.sh
│   └── step3_apply_kubeflow.sh
├── .gitignore
├── first_project
│   ├── Dockerfile
│   ├── build_and_push.sh
│   ├── config.json
│   ├── train_pipeline.py
│   └── first.yml
├── lessonx_mnist_pipeline
│   ├── sc_local_storage.yml
│   ├── pvc_local_storage.yml
│   ├── pv_local_storage.yml
│   └── mnist_pipeline.py
├── lesson8_download_s3
│   ├── secret.yml
│   ├── s3_ls.py
│   └── s3_sync.py
├── lesson10_catboost
│   ├── convert_CatBoostModel_to_ONNX
│   │   ├── component.py
│   │   └── component.yaml
│   ├── convert_CatBoostModel_to_AppleCoreMLModel
│   │   ├── component.py
│   │   └── component.yaml
│   ├── Predict_values
│   │   └── from_CSV
│   │       ├── component.py
│   │       └── component.yaml
│   ├── Predict_classes
│   │   └── from_CSV
│   │       ├── component.py
│   │       └── component.yaml
│   ├── Predict_class_probabilities
│   │   └── from_CSV
│   │       ├── component.py
│   │       └── component.yaml
│   ├── Train_regression
│   │   └── from_CSV
│   │       ├── component.py
│   │       └── component.yaml
│   ├── Train_classifier
│   │   └── from_CSV
│   │       ├── component.py
│   │       └── component.yaml
│   └── catboost_pipeline.py
├── lesson2_hello_world
│   ├── helloworld_python.py
│   └── helloworld_bash.py
├── lesson9_tf_mnist
│   └── tf_mnist.py
├── lesson7_storing_data
│   └── storing_data.py
├── lesson4_parallel
│   └── parallel_execution.py
├── lesson3_add
│   └── add_python.py
├── lesson7_output_a_directory
│   └── output_a_directory.py
├── lesson5_control_structure
│   ├── control_structure.py
│   └── control.yaml
├── train_until_good
│   ├── train_until_good.py
│   └── train_until_good.py.yaml
└── lesson6_data_passing
    └── data_passing.py

--------------------------------------------------------------------------------
/lesson1_setup/step6_kubeflow_check.sh:
--------------------------------------------------------------------------------
kubectl -n kubeflow get all

--------------------------------------------------------------------------------
/lesson1_setup/delete_kubeflow.sh:
--------------------------------------------------------------------------------
kfctl delete -f kfctl_k8s_istio.v1.0.0.yaml

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.idea
.DS_Store
__pycache__
hello-kf
*.zip
*.tar.gz

--------------------------------------------------------------------------------
/first_project/Dockerfile:
--------------------------------------------------------------------------------
FROM gcr.io/kubeflow-images-public/tensorflow-2.1.0-notebook-cpu:1.0.0

--------------------------------------------------------------------------------
/first_project/build_and_push.sh:
--------------------------------------------------------------------------------
docker build -t chrisai/kubeflow-first-project:v1 -f Dockerfile .
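# The push below assumes an authenticated Docker Hub session (run `docker login` first).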
docker push chrisai/kubeflow-first-project:v1

--------------------------------------------------------------------------------
/lesson1_setup/step5_kubeflow_pipeline_sdk.sh:
--------------------------------------------------------------------------------
URL=https://storage.googleapis.com/ml-pipeline/release/latest/kfp.tar.gz
pip install "${URL}" --upgrade

--------------------------------------------------------------------------------
/lesson1_setup/step1_install_docker.sh:
--------------------------------------------------------------------------------
# Mac
wget https://desktop.docker.com/mac/stable/Docker.dmg

# Windows
# https://desktop.docker.com/win/stable/Docker%20Desktop%20Installer.exe

--------------------------------------------------------------------------------
/lessonx_mnist_pipeline/sc_local_storage.yml:
--------------------------------------------------------------------------------
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: my-storage-class
provisioner: kubernetes.io/no-provisioner
volumeBindingMode: WaitForFirstConsumer

--------------------------------------------------------------------------------
/first_project/config.json:
--------------------------------------------------------------------------------
{
  "auths": {
    "https://index.docker.io/v1/": {
      "auth": "Y2hyaXNhaTpzb25naG9yYW5ibGFjayBEb2NrZXJmaWxlIF9fcHljYWNoZV9fIGJ1aWxkX2FuZF9wdXNoLnNoIGZpcnN0LnltbCB0cmFpbl9waXBlbGluZS5weQ=="
    }
  }
}

--------------------------------------------------------------------------------
/lesson1_setup/step4_port_forward_gateway.sh:
--------------------------------------------------------------------------------
kubectl port-forward svc/istio-ingressgateway -n istio-system 8080:80
# kubectl port-forward -n kubeflow svc/centraldashboard 8080:80

#virtualenv kfvenv --python python3
#source kfvenv/bin/activate

--------------------------------------------------------------------------------
/lesson8_download_s3/secret.yml:
--------------------------------------------------------------------------------
apiVersion: v1
kind: Secret
metadata:
  name: aws-secret
  namespace: kubeflow
type: Opaque
data:
  AWS_ACCESS_KEY_ID: BASE64_AWS_ACCESS_KEY_ID
  AWS_SECRET_ACCESS_KEY: BASE64_AWS_SECRET_ACCESS_KEY

--------------------------------------------------------------------------------
/lessonx_mnist_pipeline/pvc_local_storage.yml:
--------------------------------------------------------------------------------
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: task-pv-claim
spec:
  storageClassName: manual
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 1Gi

--------------------------------------------------------------------------------
/lessonx_mnist_pipeline/pv_local_storage.yml:
--------------------------------------------------------------------------------
apiVersion: v1
kind: PersistentVolume
metadata:
  name: task-pv-volume
  labels:
    type: local
spec:
  storageClassName: manual
  capacity:
    storage: 1Gi
  accessModes:
    - ReadWriteOnce
  hostPath:
    path: "/mnt/data"

--------------------------------------------------------------------------------
/lesson1_setup/step2_install_kfctl.sh:
--------------------------------------------------------------------------------
export PLATFORM=$(uname) # Either Linux or Darwin
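# NOTE: the release asset downloaded below is hardcoded to the darwin (macOS)
# build; on Linux, substitute the matching linux tarball from the releases page.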
export KUBEFLOW_TAG=1.0.0
KUBEFLOW_BASE="https://api.github.com/repos/kubeflow/kfctl/releases"
# Or just go to https://github.com/kubeflow/kfctl/releases
KFCTL_URL=https://github.com/kubeflow/kfctl/releases/download/v1.0/kfctl_v1.0-0-g94c35cf_darwin.tar.gz
wget "${KFCTL_URL}"
KFCTL_FILE=${KFCTL_URL##*/}
tar -xvf "${KFCTL_FILE}"
sudo mv ./kfctl /usr/local/bin/
rm "${KFCTL_FILE}"

--------------------------------------------------------------------------------
/lesson1_setup/step3_apply_kubeflow.sh:
--------------------------------------------------------------------------------
# Pick the correct config file for your platform from
# https://github.com/kubeflow/manifests/tree/[version]/kfdef
# You can download and edit the configuration at this point if you need to.
# For generic Kubernetes with Istio:
MANIFEST_BRANCH=${MANIFEST_BRANCH:-v1.0-branch}
export MANIFEST_BRANCH
MANIFEST_VERSION=${MANIFEST_VERSION:-v1.0.0}
export MANIFEST_VERSION

KF_PROJECT_NAME=${KF_PROJECT_NAME:-hello-kf}
export KF_PROJECT_NAME
mkdir "${KF_PROJECT_NAME}"
pushd "${KF_PROJECT_NAME}"

manifest_root=https://raw.githubusercontent.com/kubeflow/manifests/
# On most environments this will create a "vanilla" Kubeflow install using Istio.
FILE_NAME=kfctl_k8s_istio.${MANIFEST_VERSION}.yaml
KFDEF=${manifest_root}${MANIFEST_BRANCH}/kfdef/${FILE_NAME}
kfctl apply -f "${KFDEF}" -V
echo $?

popd

--------------------------------------------------------------------------------
/lesson8_download_s3/s3_ls.py:
--------------------------------------------------------------------------------
import kfp
from kfp import dsl
from kfp.aws import use_aws_secret


EXPERIMENT_NAME = 'AWS S3 ls'  # Name of the experiment in the UI
KUBEFLOW_HOST = "http://127.0.0.1:8080/pipeline"


def s3_ls():
    return kfp.dsl.ContainerOp(
        name="s3_ls",
        image="amazon/aws-cli:latest",
        command=["aws", "s3", "ls"],
    )


@dsl.pipeline(name="s3_ls_pipeline", description="s3 ls pipeline.")
def s3_ls_pipeline():
    echo_task = s3_ls().apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))


if __name__ == "__main__":
    kfp.compiler.Compiler().compile(s3_ls_pipeline, __file__ + ".zip")
    kfp.Client(host=KUBEFLOW_HOST).create_run_from_pipeline_func(
        s3_ls_pipeline,
        arguments={},
        experiment_name=EXPERIMENT_NAME,
    )

--------------------------------------------------------------------------------
/lesson8_download_s3/s3_sync.py:
--------------------------------------------------------------------------------
import kfp
from kfp import dsl
from kfp.aws import use_aws_secret


EXPERIMENT_NAME = 'AWS S3 sync'  # Name of the experiment in the UI
KUBEFLOW_HOST = "http://127.0.0.1:8080/pipeline"


def s3_sync():
    return kfp.dsl.ContainerOp(
        name="s3_sync",
        image="amazon/aws-cli:latest",
        command=["aws", "s3", "sync", "s3://inside-private/dataset/casa_grande/2021-04-01/", "/tmp"],
        file_outputs={
            "data": "/tmp"
        }
    )


@dsl.pipeline(name="s3_sync_pipeline", description="s3 sync pipeline.")
def s3_sync_pipeline():
    echo_task = s3_sync().apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))


if __name__ == "__main__":
    kfp.compiler.Compiler().compile(s3_sync_pipeline, __file__ + ".zip")
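    # Submit the compiled pipeline as a run through the port-forwarded
    # KFP endpoint (see lesson1_setup/step4_port_forward_gateway.sh).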
    kfp.Client(host=KUBEFLOW_HOST).create_run_from_pipeline_func(
        s3_sync_pipeline,
        arguments={},
        experiment_name=EXPERIMENT_NAME,
    )

--------------------------------------------------------------------------------
/lesson10_catboost/convert_CatBoostModel_to_ONNX/component.py:
--------------------------------------------------------------------------------
from kfp.components import InputPath, OutputPath, create_component_from_func

def convert_CatBoostModel_to_ONNX(
    model_path: InputPath('CatBoostModel'),
    converted_model_path: OutputPath('ONNX'),
):
    '''Convert CatBoost model to ONNX format.

    Args:
        model_path: Path of a trained model in binary CatBoost model format.
        converted_model_path: Output path for the converted model.

    Outputs:
        converted_model: Model in ONNX format.

    Annotations:
        author: Alexey Volkov
    '''
    from catboost import CatBoost

    model = CatBoost()
    model.load_model(model_path)
    model.save_model(converted_model_path, format="onnx")


if __name__ == '__main__':
    create_component_from_func(
        convert_CatBoostModel_to_ONNX,
        output_component_file='component.yaml',
        base_image='python:3.7',
        packages_to_install=['catboost==0.22']
    )

--------------------------------------------------------------------------------
/lesson10_catboost/convert_CatBoostModel_to_AppleCoreMLModel/component.py:
--------------------------------------------------------------------------------
from kfp.components import InputPath, OutputPath, create_component_from_func

def convert_CatBoostModel_to_AppleCoreMLModel(
    model_path: InputPath('CatBoostModel'),
    converted_model_path: OutputPath('AppleCoreMLModel'),
):
    '''Convert CatBoost model to Apple CoreML format.

    Args:
        model_path: Path of a trained model in binary CatBoost model format.
        converted_model_path: Output path for the converted model.

    Outputs:
        converted_model: Model in Apple CoreML format.

    Annotations:
        author: Alexey Volkov
    '''
    from catboost import CatBoost

    model = CatBoost()
    model.load_model(model_path)
    model.save_model(
        converted_model_path,
        format="coreml",
        # export_parameters={'prediction_type': 'probability'},
        # export_parameters={'prediction_type': 'raw'},
    )


if __name__ == '__main__':
    create_component_from_func(
        convert_CatBoostModel_to_AppleCoreMLModel,
        output_component_file='component.yaml',
        base_image='python:3.7',
        packages_to_install=['catboost==0.22']
    )

--------------------------------------------------------------------------------
/lesson2_hello_world/helloworld_python.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# Copyright 2021 Chris Hoyean Song (sjhshy@gmail.com)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import kfp

KUBEFLOW_HOST = "http://127.0.0.1:8080/pipeline"


def hello_world_component():
    ret = "Hello World!"
    print(ret)
    return ret


@kfp.dsl.pipeline(name="hello_pipeline", description="Hello World Pipeline!")
def hello_world_pipeline():
    hello_world_op = kfp.components.func_to_container_op(hello_world_component)
    _ = hello_world_op()


if __name__ == "__main__":
    kfp.compiler.Compiler().compile(hello_world_pipeline, "hello-world-pipeline.zip")
    kfp.Client(host=KUBEFLOW_HOST).create_run_from_pipeline_func(
        hello_world_pipeline, arguments={}, experiment_name="hello-world-experiment"
    )

--------------------------------------------------------------------------------
/lesson2_hello_world/helloworld_bash.py:
--------------------------------------------------------------------------------
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import kfp
from kfp import dsl

BASE_IMAGE = "library/bash:4.4.23"
KUBEFLOW_HOST = "http://127.0.0.1:8080/pipeline"


def echo_op():
    return dsl.ContainerOp(
        name="echo",
        image=BASE_IMAGE,
        command=["sh", "-c"],
        arguments=['echo "hello world"'],
    )


@dsl.pipeline(name="hello_world_bash_pipeline", description="A hello world pipeline.")
def hello_world_bash_pipeline():
    echo_task = echo_op()


if __name__ == "__main__":
    kfp.compiler.Compiler().compile(hello_world_bash_pipeline, __file__ + ".zip")
    kfp.Client(host=KUBEFLOW_HOST).create_run_from_pipeline_func(
        hello_world_bash_pipeline,
        arguments={},
        experiment_name="hello-world-bash-experiment",
    )

--------------------------------------------------------------------------------
/lesson9_tf_mnist/tf_mnist.py:
--------------------------------------------------------------------------------
import kfp
from kfp.components import func_to_container_op, OutputPath, InputPath

EXPERIMENT_NAME = 'Train TF MNIST'  # Name of the experiment in the UI
KUBEFLOW_HOST = "http://127.0.0.1:8080/pipeline"


def download_mnist(output_dir_path: OutputPath()):
    import tensorflow as tf

    tf.keras.datasets.mnist.load_data(output_dir_path)


def train_mnist(data_path: InputPath(), model_output: OutputPath()):
    import tensorflow as tf
    import numpy as np
    with np.load(data_path, allow_pickle=True) as f:
        x_train, y_train = f['x_train'], f['y_train']
        x_test, y_test = f['x_test'], f['y_test']
    print(x_train.shape)
    print(y_train.shape)

    model = tf.keras.models.Sequential([
        tf.keras.layers.Flatten(input_shape=(28, 28)),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(10)
    ])
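    # The final Dense(10) layer emits raw logits (no softmax), so the loss
    # below must be constructed with from_logits=True to be correct.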
    model.compile(
        optimizer=tf.keras.optimizers.Adam(0.001),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
    )

    model.fit(
        x_train, y_train,
    )
    model.evaluate(x_test, y_test)

    model.save(model_output)


def tf_mnist_pipeline():
    download_op = func_to_container_op(download_mnist, base_image="tensorflow/tensorflow")
    train_mnist_op = func_to_container_op(train_mnist, base_image="tensorflow/tensorflow")
    train_mnist_op(download_op().output)


if __name__ == '__main__':
    import kfp.compiler as compiler
    compiler.Compiler().compile(tf_mnist_pipeline, __file__ + '.zip')
    kfp.Client(host=KUBEFLOW_HOST).create_run_from_pipeline_func(
        tf_mnist_pipeline,
        arguments={},
        experiment_name=EXPERIMENT_NAME)

--------------------------------------------------------------------------------
/lesson10_catboost/Predict_values/from_CSV/component.py:
--------------------------------------------------------------------------------
from kfp.components import InputPath, OutputPath, create_component_from_func

def catboost_predict_values(
    data_path: InputPath('CSV'),
    model_path: InputPath('CatBoostModel'),
    predictions_path: OutputPath(),

    label_column: int = None,
):
    '''Predict values with a CatBoost model.

    Args:
        data_path: Path for the data in CSV format.
        model_path: Path for the trained model in binary CatBoostModel format.
        label_column: Column containing the label data.
        predictions_path: Output path for the predictions.

    Outputs:
        predictions: Predictions in text format.

    Annotations:
        author: Alexey Volkov
    '''
    import tempfile

    from catboost import CatBoost, Pool
    import numpy

    if label_column:
        column_descriptions = {label_column: 'Label'}
        column_description_path = tempfile.NamedTemporaryFile(delete=False).name
        with open(column_description_path, 'w') as column_description_file:
            for idx, kind in column_descriptions.items():
                column_description_file.write('{}\t{}\n'.format(idx, kind))
    else:
        column_description_path = None

    eval_data = Pool(
        data_path,
        column_description=column_description_path,
        has_header=True,
        delimiter=',',
    )

    model = CatBoost()
    model.load_model(model_path)

    predictions = model.predict(eval_data, prediction_type='RawFormulaVal')
    numpy.savetxt(predictions_path, predictions)


if __name__ == '__main__':
    catboost_predict_values_op = create_component_from_func(
        catboost_predict_values,
        output_component_file='component.yaml',
        base_image='python:3.7',
        packages_to_install=['catboost==0.23']
    )

--------------------------------------------------------------------------------
/lesson10_catboost/Predict_classes/from_CSV/component.py:
--------------------------------------------------------------------------------
from kfp.components import InputPath, OutputPath, create_component_from_func

def catboost_predict_classes(
    data_path: InputPath('CSV'),
    model_path: InputPath('CatBoostModel'),
    predictions_path: OutputPath(),

    label_column: int = None,
):
    '''Predict classes using the CatBoost classifier model.

    Args:
        data_path: Path for the data in CSV format.
        model_path: Path for the trained model in binary CatBoostModel format.
        label_column: Column containing the label data.
        predictions_path: Output path for the predictions.

    Outputs:
        predictions: Class predictions in text format.

    Annotations:
        author: Alexey Volkov
    '''
    import tempfile

    from catboost import CatBoostClassifier, Pool
    import numpy

    if label_column:
        column_descriptions = {label_column: 'Label'}
        column_description_path = tempfile.NamedTemporaryFile(delete=False).name
        with open(column_description_path, 'w') as column_description_file:
            for idx, kind in column_descriptions.items():
                column_description_file.write('{}\t{}\n'.format(idx, kind))
    else:
        column_description_path = None

    eval_data = Pool(
        data_path,
        column_description=column_description_path,
        has_header=True,
        delimiter=',',
    )

    model = CatBoostClassifier()
    model.load_model(model_path)

    predictions = model.predict(eval_data)
    numpy.savetxt(predictions_path, predictions, fmt='%s')


if __name__ == '__main__':
    catboost_predict_classes_op = create_component_from_func(
        catboost_predict_classes,
        output_component_file='component.yaml',
        base_image='python:3.7',
        packages_to_install=['catboost==0.22']
    )

--------------------------------------------------------------------------------
/lesson10_catboost/Predict_class_probabilities/from_CSV/component.py:
--------------------------------------------------------------------------------
from kfp.components import InputPath, OutputPath, create_component_from_func

def catboost_predict_class_probabilities(
    data_path: InputPath('CSV'),
    model_path: InputPath('CatBoostModel'),
    predictions_path: OutputPath(),

    label_column: int = None,
):
    '''Predict class probabilities with a CatBoost model.

    Args:
        data_path: Path for the data in CSV format.
        model_path: Path for the trained model in binary CatBoostModel format.
        label_column: Column containing the label data.
        predictions_path: Output path for the predictions.

    Outputs:
        predictions: Predictions in text format.

    Annotations:
        author: Alexey Volkov
    '''
    import tempfile

    from catboost import CatBoost, Pool
    import numpy

    if label_column:
        column_descriptions = {label_column: 'Label'}
        column_description_path = tempfile.NamedTemporaryFile(delete=False).name
        with open(column_description_path, 'w') as column_description_file:
            for idx, kind in column_descriptions.items():
                column_description_file.write('{}\t{}\n'.format(idx, kind))
    else:
        column_description_path = None

    eval_data = Pool(
        data_path,
        column_description=column_description_path,
        has_header=True,
        delimiter=',',
    )

    model = CatBoost()
    model.load_model(model_path)

    predictions = model.predict(eval_data, prediction_type='Probability')
    numpy.savetxt(predictions_path, predictions)


if __name__ == '__main__':
    catboost_predict_class_probabilities_op = create_component_from_func(
        catboost_predict_class_probabilities,
        output_component_file='component.yaml',
        base_image='python:3.7',
        packages_to_install=['catboost==0.23']
    )

--------------------------------------------------------------------------------
/lesson7_storing_data/storing_data.py:
--------------------------------------------------------------------------------
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import kfp
import kfp.dsl as dsl

EXPERIMENT_NAME = 'Storing data'  # Name of the experiment in the UI
KUBEFLOW_HOST = "http://127.0.0.1:8080/pipeline"


@dsl.pipeline(
    name="Volume Op DAG",
    description="The second example of the design doc."
)
def volume_op_dag():
    vop = dsl.VolumeOp(
        name="create_pvc",
        resource_name="my-pvc",
        size="10Gi",
        modes=dsl.VOLUME_MODE_RWM
    )

    step1 = dsl.ContainerOp(
        name="step1",
        image="library/bash:4.4.23",
        command=["sh", "-c"],
        arguments=["echo 1 | tee /mnt/file1"],
        pvolumes={"/mnt": vop.volume}
    )

    step2 = dsl.ContainerOp(
        name="step2",
        image="library/bash:4.4.23",
        command=["sh", "-c"],
        arguments=["echo 2 | tee /mnt2/file2"],
        pvolumes={"/mnt2": vop.volume}
    )

    step3 = dsl.ContainerOp(
        name="step3",
        image="library/bash:4.4.23",
        command=["sh", "-c"],
        arguments=["cat /mnt/file1 /mnt/file2"],
        pvolumes={"/mnt": vop.volume.after(step1, step2)}
    )


if __name__ == "__main__":
    import kfp.compiler as compiler
    compiler.Compiler().compile(volume_op_dag, __file__ + ".tar.gz")
    kfp.Client(host=KUBEFLOW_HOST).create_run_from_pipeline_func(
        volume_op_dag,
        arguments={},
        experiment_name=EXPERIMENT_NAME)

--------------------------------------------------------------------------------
/lesson4_parallel/parallel_execution.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import kfp
from kfp import dsl


EXPERIMENT_NAME = 'Parallel execution'  # Name of the experiment in the UI
BASE_IMAGE = "python:3.7"
KUBEFLOW_HOST = "http://127.0.0.1:8080/pipeline"


def gcs_download_op(url):
    return dsl.ContainerOp(
        name='GCS - Download',
        image='google/cloud-sdk:272.0.0',
        command=['sh', '-c'],
        arguments=['gsutil cat $0 | tee $1', url, '/tmp/results.txt'],
        file_outputs={
            'data': '/tmp/results.txt',
        }
    )


def echo2_op(text1, text2):
    return dsl.ContainerOp(
        name='echo',
        image='library/bash:4.4.23',
        command=['sh', '-c'],
        arguments=['echo "Text 1: $0"; echo "Text 2: $1"', text1, text2]
    )


@dsl.pipeline(
    name='Parallel pipeline',
    description='Downloads two messages in parallel and prints the concatenated result.'
)
def download_and_join(
    url1='gs://ml-pipeline-playground/shakespeare1.txt',
    url2='gs://ml-pipeline-playground/shakespeare2.txt'
):
    """A three-step pipeline with first two running in parallel."""

    download1_task = gcs_download_op(url1)
    download2_task = gcs_download_op(url2)

    echo_task = echo2_op(download1_task.output, download2_task.output)


if __name__ == '__main__':
    # kfp.compiler.Compiler().compile(download_and_join, __file__ + '.zip')
    kfp.Client(host=KUBEFLOW_HOST).create_run_from_pipeline_func(
        download_and_join,
        arguments={},
        experiment_name=EXPERIMENT_NAME)

--------------------------------------------------------------------------------
/lesson3_add/add_python.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# Copyright 2021 Chris Hoyean Song (sjhshy@gmail.com)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import kfp
from kfp import components
from kfp import dsl


EXPERIMENT_NAME = 'Add number pipeline'  # Name of the experiment in the UI
BASE_IMAGE = "python:3.7"
KUBEFLOW_HOST = "http://127.0.0.1:8080/pipeline"


@dsl.python_component(
    name='add_op',
    description='adds two numbers',
    base_image=BASE_IMAGE  # you can define the base image here, or when you build in the next step.
)
def add(a: float, b: float) -> float:
    '''Calculates sum of two arguments'''
    print(a, '+', b, '=', a + b)
    return a + b


# Convert the function to a pipeline operation.
add_op = components.func_to_container_op(
    add,
    base_image=BASE_IMAGE,
)

@dsl.pipeline(
    name='Calculation pipeline',
    description='A toy pipeline that performs arithmetic calculations.'
)
def calc_pipeline(
    a: float = 0,
    b: float = 7
):
    # Passing pipeline parameter and a constant value as operation arguments
    add_task = add_op(a, 4)  # Returns a dsl.ContainerOp class instance.
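    # add_task.output refers to the component's single return value; feeding it
    # into another op below is what creates the data dependency between tasks.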

    # You can create explicit dependency between the tasks using xyz_task.after(abc_task)
    add_2_task = add_op(a, b)

    add_3_task = add_op(add_task.output, add_2_task.output)


if __name__ == "__main__":
    # Specify pipeline argument values
    arguments = {'a': '7', 'b': '8'}
    # Launch a pipeline run given the pipeline function definition
    kfp.Client(host=KUBEFLOW_HOST).create_run_from_pipeline_func(
        calc_pipeline,
        arguments=arguments,
        experiment_name=EXPERIMENT_NAME)
    # The generated links below lead to the Experiment page and the pipeline run details page, respectively

--------------------------------------------------------------------------------
/lesson10_catboost/convert_CatBoostModel_to_ONNX/component.yaml:
--------------------------------------------------------------------------------
name: Convert CatBoostModel to ONNX
description: |-
  Convert CatBoost model to ONNX format.

  Args:
      model_path: Path of a trained model in binary CatBoost model format.
      converted_model_path: Output path for the converted model.

  Outputs:
      converted_model: Model in ONNX format.

  Annotations:
      author: Alexey Volkov
inputs:
- {name: model, type: CatBoostModel}
outputs:
- {name: converted_model, type: ONNX}
implementation:
  container:
    image: python:3.7
    command:
    - sh
    - -c
    - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
      'catboost==0.22' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet
      --no-warn-script-location 'catboost==0.22' --user) && "$0" "$@"
    - python3
    - -u
    - -c
    - |
      def _make_parent_dirs_and_return_path(file_path: str):
          import os
          os.makedirs(os.path.dirname(file_path), exist_ok=True)
          return file_path

      def convert_CatBoostModel_to_ONNX(
          model_path,
          converted_model_path,
      ):
          '''Convert CatBoost model to ONNX format.

          Args:
              model_path: Path of a trained model in binary CatBoost model format.
              converted_model_path: Output path for the converted model.

          Outputs:
              converted_model: Model in ONNX format.

          Annotations:
              author: Alexey Volkov
          '''
          from catboost import CatBoost

          model = CatBoost()
          model.load_model(model_path)
          model.save_model(converted_model_path, format="onnx")

      import argparse
      _parser = argparse.ArgumentParser(prog='Convert CatBoostModel to ONNX', description='Convert CatBoost model to ONNX format.\n\n    Args:\n        model_path: Path of a trained model in binary CatBoost model format.\n        converted_model_path: Output path for the converted model.\n\n    Outputs:\n        converted_model: Model in ONNX format.\n\n    Annotations:\n        author: Alexey Volkov ')
      _parser.add_argument("--model", dest="model_path", type=str, required=True, default=argparse.SUPPRESS)
      _parser.add_argument("--converted-model", dest="converted_model_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)
      _parsed_args = vars(_parser.parse_args())

      _outputs = convert_CatBoostModel_to_ONNX(**_parsed_args)
    args:
    - --model
    - {inputPath: model}
    - --converted-model
    - {outputPath: converted_model}

--------------------------------------------------------------------------------
/lesson10_catboost/convert_CatBoostModel_to_AppleCoreMLModel/component.yaml:
--------------------------------------------------------------------------------
name: Convert CatBoostModel to AppleCoreMLModel
description: |-
  Convert CatBoost model to Apple CoreML format.

  Args:
      model_path: Path of a trained model in binary CatBoost model format.
      converted_model_path: Output path for the converted model.

  Outputs:
      converted_model: Model in Apple CoreML format.

  Annotations:
      author: Alexey Volkov
inputs:
- {name: model, type: CatBoostModel}
outputs:
- {name: converted_model, type: AppleCoreMLModel}
implementation:
  container:
    image: python:3.7
    command:
    - sh
    - -c
    - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
      'catboost==0.22' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet
      --no-warn-script-location 'catboost==0.22' --user) && "$0" "$@"
    - python3
    - -u
    - -c
    - |
      def _make_parent_dirs_and_return_path(file_path: str):
          import os
          os.makedirs(os.path.dirname(file_path), exist_ok=True)
          return file_path

      def convert_CatBoostModel_to_AppleCoreMLModel(
          model_path,
          converted_model_path,
      ):
          '''Convert CatBoost model to Apple CoreML format.

          Args:
              model_path: Path of a trained model in binary CatBoost model format.
              converted_model_path: Output path for the converted model.

          Outputs:
              converted_model: Model in Apple CoreML format.

          Annotations:
              author: Alexey Volkov
          '''
          from catboost import CatBoost

          model = CatBoost()
          model.load_model(model_path)
          model.save_model(
              converted_model_path,
              format="coreml",
              # export_parameters={'prediction_type': 'probability'},
              # export_parameters={'prediction_type': 'raw'},
          )

      import argparse
      _parser = argparse.ArgumentParser(prog='Convert CatBoostModel to AppleCoreMLModel', description='Convert CatBoost model to Apple CoreML format.\n\n    Args:\n        model_path: Path of a trained model in binary CatBoost model format.\n        converted_model_path: Output path for the converted model.\n\n    Outputs:\n        converted_model: Model in Apple CoreML format.\n\n    Annotations:\n        author: Alexey Volkov ')
      _parser.add_argument("--model", dest="model_path", type=str, required=True, default=argparse.SUPPRESS)
      _parser.add_argument("--converted-model", dest="converted_model_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)
      _parsed_args = vars(_parser.parse_args())

      _outputs = convert_CatBoostModel_to_AppleCoreMLModel(**_parsed_args)
    args:
    - --model
    - {inputPath: model}
    - --converted-model
    - {outputPath: converted_model}

--------------------------------------------------------------------------------
/lesson10_catboost/Train_regression/from_CSV/component.py:
--------------------------------------------------------------------------------
from kfp.components import InputPath, OutputPath, create_component_from_func

def catboost_train_regression(
    training_data_path: InputPath('CSV'),
    model_path: OutputPath('CatBoostModel'),
    starting_model_path: InputPath('CatBoostModel') = None,
    label_column: int = 0,

    loss_function: str = 'RMSE',
    num_iterations: int = 500,
    learning_rate: float = None,
    depth: int = 6,
    random_seed: int = 0,

    cat_features: list = None,

    additional_training_options: dict = {},
):
    '''Train a CatBoost regression model.

    Args:
        training_data_path: Path for the training data in CSV format.
        model_path: Output path for the trained model in binary CatBoostModel format.
        starting_model_path: Path for the existing trained model to start from.
        label_column: Column containing the label data.

        loss_function: The metric to use in training and also selector of the machine learning
            problem to solve. Default = 'RMSE'. Possible values:
            'RMSE', 'MAE', 'Quantile:alpha=value', 'LogLinQuantile:alpha=value', 'Poisson', 'MAPE', 'Lq:q=value'
        num_iterations: Number of trees to add to the ensemble.
        learning_rate: Step size shrinkage used in update to prevent overfitting.
            Default value is selected automatically for binary classification with other parameters set to default.
            In all other cases default is 0.03.
        depth: Depth of a tree. All trees are the same depth. Default = 6
        random_seed: Random number seed. Default = 0

        cat_features: A list of Categorical features (indices or names).
        additional_training_options: A dictionary with additional options to pass to CatBoostRegressor

    Outputs:
        model: Trained model in binary CatBoostModel format.

    Annotations:
        author: Alexey Volkov
    '''
    import tempfile
    from pathlib import Path

    from catboost import CatBoostRegressor, Pool

    column_descriptions = {label_column: 'Label'}
    column_description_path = tempfile.NamedTemporaryFile(delete=False).name
    with open(column_description_path, 'w') as column_description_file:
        for idx, kind in column_descriptions.items():
            column_description_file.write('{}\t{}\n'.format(idx, kind))

    train_data = Pool(
        training_data_path,
        column_description=column_description_path,
        has_header=True,
        delimiter=',',
    )

    model = CatBoostRegressor(
        iterations=num_iterations,
        depth=depth,
        learning_rate=learning_rate,
        loss_function=loss_function,
        random_seed=random_seed,
        verbose=True,
        **additional_training_options,
    )

    model.fit(
        train_data,
        cat_features=cat_features,
        init_model=starting_model_path,
        #verbose=False,
        #plot=True,
    )
    Path(model_path).parent.mkdir(parents=True, exist_ok=True)
    model.save_model(model_path)


if __name__ == '__main__':
    catboost_train_regression_op = create_component_from_func(
        catboost_train_regression,
        output_component_file='component.yaml',
        base_image='python:3.7',
        packages_to_install=['catboost==0.23']
    )

--------------------------------------------------------------------------------
/lesson10_catboost/Train_classifier/from_CSV/component.py:
--------------------------------------------------------------------------------
from kfp.components import InputPath, OutputPath, create_component_from_func

def catboost_train_classifier(
    training_data_path: InputPath('CSV'),
    model_path: OutputPath('CatBoostModel'),
    starting_model_path: InputPath('CatBoostModel') = None,
    label_column: int = 0,

    loss_function: str = 'Logloss',
    num_iterations: int = 500,
    learning_rate: float = None,
    depth: int = 6,
    random_seed: int = 0,

    cat_features: list = None,
    text_features: list = None,

    additional_training_options: dict = {},
):
    '''Train a CatBoost classifier model.

    Args:
        training_data_path: Path for the training data in CSV format.
        model_path: Output path for the trained model in binary CatBoostModel format.
        starting_model_path: Path for the existing trained model to start from.
        label_column: Column containing the label data.

        loss_function: The metric to use in training and also selector of the machine learning
            problem to solve. Default = 'Logloss'
        num_iterations: Number of trees to add to the ensemble.
        learning_rate: Step size shrinkage used in update to prevent overfitting.
            Default value is selected automatically for binary classification with other parameters set to default.
            In all other cases default is 0.03.
        depth: Depth of a tree. All trees are the same depth. Default = 6
        random_seed: Random number seed. Default = 0

        cat_features: A list of Categorical features (indices or names).
        text_features: A list of Text features (indices or names).
        additional_training_options: A dictionary with additional options to pass to CatBoostClassifier

    Outputs:
        model: Trained model in binary CatBoostModel format.

    Annotations:
        author: Alexey Volkov
    '''
    import tempfile
    from pathlib import Path

    from catboost import CatBoostClassifier, Pool

    column_descriptions = {label_column: 'Label'}
    column_description_path = tempfile.NamedTemporaryFile(delete=False).name
    with open(column_description_path, 'w') as column_description_file:
        for idx, kind in column_descriptions.items():
            column_description_file.write('{}\t{}\n'.format(idx, kind))

    train_data = Pool(
        training_data_path,
        column_description=column_description_path,
        has_header=True,
        delimiter=',',
    )

    model = CatBoostClassifier(
        iterations=num_iterations,
        depth=depth,
        learning_rate=learning_rate,
        loss_function=loss_function,
        random_seed=random_seed,
        verbose=True,
        **additional_training_options,
    )

    model.fit(
        train_data,
        cat_features=cat_features,
        text_features=text_features,
        init_model=starting_model_path,
        #verbose=False,
        #plot=True,
    )
    Path(model_path).parent.mkdir(parents=True, exist_ok=True)
    model.save_model(model_path)


if __name__ == '__main__':
    catboost_train_classifier_op = create_component_from_func(
        catboost_train_classifier,
        output_component_file='component.yaml',
        base_image='python:3.7',
        packages_to_install=['catboost==0.23']
    )

--------------------------------------------------------------------------------
/lesson7_output_a_directory/output_a_directory.py:
--------------------------------------------------------------------------------
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This sample shows how components can output directories.
# Outputting a directory is performed the same way as outputting a file:
# the component receives an output path, writes data at that path, and the
# system takes that data and makes it available for the downstream components.
# To output a file, create a new file at the output path location.
# To output a directory, create a new directory at the output path location.
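# For example, a component declared as `def my_op(out_path: OutputPath())` can
# either open(out_path, 'w') to emit a single file or os.makedirs(out_path) to
# emit a directory tree; a downstream component receives it via InputPath().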


import kfp
from kfp.components import create_component_from_func, load_component_from_text, InputPath, OutputPath


EXPERIMENT_NAME = 'Output a directory'  # Name of the experiment in the UI
KUBEFLOW_HOST = "http://127.0.0.1:8080/pipeline"


# Outputting directories from Python-based components:

@create_component_from_func
def produce_dir_with_files_python_op(output_dir_path: OutputPath(), num_files: int = 10):
    import os
    os.makedirs(output_dir_path, exist_ok=True)
    for i in range(num_files):
        file_path = os.path.join(output_dir_path, str(i) + '.txt')
        with open(file_path, 'w') as f:
            f.write(str(i))


@create_component_from_func
def list_dir_files_python_op(input_dir_path: InputPath()):
    import os
    dir_items = os.listdir(input_dir_path)
    for dir_item in dir_items:
        print(dir_item)


# Outputting directories from general command-line based components:

produce_dir_with_files_general_op = load_component_from_text('''
name: Produce directory
inputs:
- {name: num_files, type: Integer}
outputs:
- {name: output_dir}
implementation:
  container:
    image: alpine
    command:
    - sh
    - -ecx
    - |
      num_files="$0"
      output_path="$1"
      mkdir -p "$output_path"
      for i in $(seq "$num_files"); do
        echo "$i" > "$output_path/${i}.txt"
      done
    - {inputValue: num_files}
    - {outputPath: output_dir}
''')


list_dir_files_general_op = load_component_from_text('''
name: List dir files
inputs:
- {name: input_dir}
implementation:
  container:
    image: alpine
    command:
    - ls
    - {inputPath: input_dir}
''')


# Test pipeline

def dir_pipeline():
    produce_dir_python_task = produce_dir_with_files_python_op(num_files=15)
    list_dir_files_python_op(input_dir=produce_dir_python_task.output)

    produce_dir_general_task = produce_dir_with_files_general_op(num_files=15)
    list_dir_files_general_op(input_dir=produce_dir_general_task.output)


if __name__ == '__main__':
    kfp.Client(host=KUBEFLOW_HOST).create_run_from_pipeline_func(
        dir_pipeline,
        arguments={},
        experiment_name=EXPERIMENT_NAME)

--------------------------------------------------------------------------------
/first_project/train_pipeline.py:
--------------------------------------------------------------------------------
import kfp.dsl as dsl
import kfp.gcp as gcp
import kfp.onprem as onprem

from string import Template
import json


@dsl.pipeline(name='Simple sci-kit KF Pipeline',
              description='A simple end-to-end scikit-learn Seldon KF pipeline')
def mnist_train_pipeline(docker_org="index.docker.io/seldonio",
                         train_container_version="0.2",
                         serve_container_version="0.1"):

    vop = dsl.VolumeOp(name="create_pvc",
                       resource_name="nfs-1",
                       modes=dsl.VOLUME_MODE_RWO,
                       size="10G")
    volume = vop.volume
    train = dsl.ContainerOp(
        name='sk-train',
        image=f"{docker_org}/skmnistclassifier_trainer:{train_container_version}",
        pvolumes={"/data": volume})

    seldon_serving_json_template = Template("""
{
    "apiVersion": "machinelearning.seldon.io/v1alpha2",
    "kind": "SeldonDeployment",
    "metadata": {
        "labels": {
            "app": "seldon"
        },
        "name": "mnist-classifier"
    },
    "spec": {
"annotations": { 38 | "deployment_version": "v1", 39 | "project_name": "MNIST Example" 40 | }, 41 | "name": "mnist-classifier", 42 | "predictors": [ 43 | { 44 | "annotations": { 45 | "predictor_version": "v1" 46 | }, 47 | "componentSpecs": [ 48 | { 49 | "spec": { 50 | "containers": [ 51 | { 52 | "image": "$dockerreposerving:$dockertagserving", 53 | "imagePullPolicy": "Always", 54 | "name": "mnist-classifier", 55 | "volumeMounts": [ 56 | { 57 | "mountPath": "/data", 58 | "name": "persistent-storage" 59 | } 60 | ] 61 | } 62 | ], 63 | "terminationGracePeriodSeconds": 1, 64 | "volumes": [ 65 | { 66 | "name": "persistent-storage", 67 | "persistentVolumeClaim": { 68 | "claimName": "$modelpvc" 69 | } 70 | } 71 | ] 72 | } 73 | } 74 | ], 75 | "graph": { 76 | "children": [], 77 | "endpoint": { 78 | "type": "REST" 79 | }, 80 | "name": "mnist-classifier", 81 | "type": "MODEL" 82 | }, 83 | "name": "mnist-classifier", 84 | "replicas": 1 85 | } 86 | ] 87 | } 88 | } 89 | """) 90 | seldon_serving_json = seldon_serving_json_template.substitute({ 91 | 'dockerreposerving': 92 | f"{docker_org}/skmnistclassifier_runtime", 93 | 'dockertagserving': 94 | str(serve_container_version), 95 | 'modelpvc': 96 | vop.outputs["name"] 97 | }) 98 | 99 | seldon_deployment = json.loads(seldon_serving_json) 100 | 101 | serve = dsl.ResourceOp( 102 | name='serve', 103 | k8s_resource=seldon_deployment, 104 | success_condition='status.state == Available').after(train) 105 | 106 | 107 | # If we're called directly create an expirement and run 108 | if __name__ == '__main__': 109 | pipeline_func = mnist_train_pipeline 110 | pipeline_filename = pipeline_func.__name__ + '.pipeline.zip' 111 | import kfp.compiler as compiler 112 | compiler.Compiler().compile(pipeline_func, pipeline_filename) 113 | expirement_name = "cheese" 114 | # experiment = client.create_experiment(expirement_name) 115 | # run_name = pipeline_func.__name__ + ' run' 116 | # run_result = client.run_pipeline(experiment.id, run_name, 117 | # pipeline_filename, arguments) 118 | # print(run_result) -------------------------------------------------------------------------------- /lessonx_mnist_pipeline/mnist_pipeline.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | Kubeflow Pipelines MNIST example 16 | Run this script to compile pipeline 17 | """ 18 | 19 | 20 | import kfp 21 | import kfp.dsl as dsl 22 | import kfp.gcp as gcp 23 | import kfp.onprem as onprem 24 | 25 | platform = 'onprem' 26 | EXPERIMENT_NAME = 'MNIST pipeline' # Name of the experiment in the UI 27 | KUBEFLOW_HOST = "http://127.0.0.1:8080/pipeline" 28 | 29 | 30 | @dsl.pipeline( 31 | name='MNIST', 32 | description='A pipeline to train and serve the MNIST example.' 
)
def mnist_pipeline(model_export_dir='gs://your-bucket/export',
                   train_steps='200',
                   learning_rate='0.01',
                   batch_size='100',
                   pvc_name=''):
    """
    Pipeline with three stages:
      1. train an MNIST classifier
      2. deploy a tf-serving instance to the cluster
      3. deploy a web-ui to interact with it
    """
    train = dsl.ContainerOp(
        name='train',
        image='gcr.io/kubeflow-examples/mnist/model:v20190304-v0.2-176-g15d997b',
        arguments=[
            "/opt/model.py",
            "--tf-export-dir", model_export_dir,
            "--tf-train-steps", train_steps,
            "--tf-batch-size", batch_size,
            "--tf-learning-rate", learning_rate
        ]
    )

    serve_args = [
        '--model-export-path', model_export_dir,
        '--server-name', "mnist-service"
    ]
    if platform != 'GCP':
        serve_args.extend([
            '--cluster-name', "mnist-pipeline",
            '--pvc-name', pvc_name
        ])

    serve = dsl.ContainerOp(
        name='serve',
        image='gcr.io/ml-pipeline/ml-pipeline-kubeflow-deployer:'
              '7775692adf28d6f79098e76e839986c9ee55dd61',
        arguments=serve_args
    )
    serve.after(train)

    webui_args = [
        '--image', 'gcr.io/kubeflow-examples/mnist/web-ui:'
                   'v20190304-v0.2-176-g15d997b-pipelines',
        '--name', 'web-ui',
        '--container-port', '5000',
        '--service-port', '80',
        '--service-type', "LoadBalancer"
    ]
    if platform != 'GCP':
        webui_args.extend([
            '--cluster-name', "mnist-pipeline"
        ])

    web_ui = dsl.ContainerOp(
        name='web-ui',
        image='gcr.io/kubeflow-examples/mnist/deploy-service:latest',
        arguments=webui_args
    )
    web_ui.after(serve)

    steps = [train, serve, web_ui]
    for step in steps:
        if platform == 'GCP':
            step.apply(gcp.use_gcp_secret('user-gcp-sa'))
        else:
            step.apply(onprem.mount_pvc(pvc_name, 'task-pv-volume', '/mnt'))


if __name__ == '__main__':
    import kfp.compiler as compiler
    compiler.Compiler().compile(mnist_pipeline, __file__ + '.tar.gz')

    # Launch a pipeline run given the pipeline function definition
    kfp.Client(host=KUBEFLOW_HOST).create_run_from_pipeline_func(
        mnist_pipeline,
        arguments={
            "model_export_dir": "/mnt/export",
            "pvc_name": "task-pv-claim"},
        experiment_name=EXPERIMENT_NAME)
    # The generated links below lead to the Experiment page and the pipeline run details page, respectively

--------------------------------------------------------------------------------
/lesson10_catboost/catboost_pipeline.py:
--------------------------------------------------------------------------------
import kfp
from kfp import components

EXPERIMENT_NAME = 'CatBoost pipeline'  # Name of the experiment in the UI
KUBEFLOW_HOST = "http://127.0.0.1:8080/pipeline"

chicago_taxi_dataset_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/e3337b8bdcd63636934954e592d4b32c95b49129/components/datasets/Chicago%20Taxi/component.yaml')
pandas_transform_csv_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/e69a6694/components/pandas/Transform_DataFrame/in_CSV_format/component.yaml')

catboost_train_classifier_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/f97ad2/components/CatBoost/Train_classifier/from_CSV/component.yaml')
catboost_train_regression_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/f97ad2/components/CatBoost/Train_regression/from_CSV/component.yaml')
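# The commit refs in these URLs (e3337b8..., e69a6694, f97ad2) pin each
# component to a fixed revision of the kubeflow/pipelines repo, so the
# pipeline keeps working even if the components change upstream.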
catboost_predict_classes_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/f97ad2/components/CatBoost/Predict_classes/from_CSV/component.yaml')
catboost_predict_values_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/f97ad2/components/CatBoost/Predict_values/from_CSV/component.yaml')
catboost_predict_class_probabilities_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/f97ad2/components/CatBoost/Predict_class_probabilities/from_CSV/component.yaml')
catboost_to_apple_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/f97ad2/components/CatBoost/convert_CatBoostModel_to_AppleCoreMLModel/component.yaml')
catboost_to_onnx_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/f97ad2/components/CatBoost/convert_CatBoostModel_to_ONNX/component.yaml')


def catboost_pipeline():
    training_data_in_csv = chicago_taxi_dataset_op(
        where='trip_start_timestamp >= "2019-01-01" AND trip_start_timestamp < "2019-02-01"',
        select='tips,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tolls,extras,trip_total',
        limit=10000,
    ).output

    training_data_for_classification_in_csv = pandas_transform_csv_op(
        table=training_data_in_csv,
        transform_code='''df.insert(0, "was_tipped", df["tips"] > 0); del df["tips"]''',
    ).output

    catboost_train_regression_task = catboost_train_regression_op(
        training_data=training_data_in_csv,
        loss_function='RMSE',
        label_column=0,
        num_iterations=200,
    )

    regression_model = catboost_train_regression_task.outputs['model']

    catboost_train_classifier_task = catboost_train_classifier_op(
        training_data=training_data_for_classification_in_csv,
        label_column=0,
        num_iterations=200,
    )

    classification_model = catboost_train_classifier_task.outputs['model']

    evaluation_data_for_regression_in_csv = training_data_in_csv
    evaluation_data_for_classification_in_csv = training_data_for_classification_in_csv

    catboost_predict_values_op(
        data=evaluation_data_for_regression_in_csv,
        model=regression_model,
        label_column=0,
    )

    catboost_predict_classes_op(
        data=evaluation_data_for_classification_in_csv,
        model=classification_model,
        label_column=0,
    )

    catboost_predict_class_probabilities_op(
        data=evaluation_data_for_classification_in_csv,
        model=classification_model,
        label_column=0,
    )

    catboost_to_apple_op(regression_model)
    catboost_to_apple_op(classification_model)

    catboost_to_onnx_op(regression_model)
    catboost_to_onnx_op(classification_model)


if __name__ == '__main__':
    kfp.compiler.Compiler().compile(catboost_pipeline, __file__ + '.zip')
    kfp.Client(host=KUBEFLOW_HOST).create_run_from_pipeline_func(
        catboost_pipeline,
        arguments={},
        experiment_name=EXPERIMENT_NAME)

--------------------------------------------------------------------------------
/lesson10_catboost/Predict_values/from_CSV/component.yaml:
--------------------------------------------------------------------------------
name: Catboost predict values
Catboost predict values 2 | description: |- 3 | Predict values with a CatBoost model. 4 | 5 | Args: 6 | data_path: Path for the data in CSV format. 7 | model_path: Path for the trained model in binary CatBoostModel format. 8 | label_column: Column containing the label data. 9 | predictions_path: Output path for the predictions. 10 | 11 | Outputs: 12 | predictions: Predictions in text format. 13 | 14 | Annotations: 15 | author: Alexey Volkov 16 | inputs: 17 | - {name: data, type: CSV} 18 | - {name: model, type: CatBoostModel} 19 | - {name: label_column, type: Integer, optional: true} 20 | outputs: 21 | - {name: predictions} 22 | implementation: 23 | container: 24 | image: python:3.7 25 | command: 26 | - sh 27 | - -c 28 | - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 29 | 'catboost==0.23' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet 30 | --no-warn-script-location 'catboost==0.23' --user) && "$0" "$@" 31 | - python3 32 | - -u 33 | - -c 34 | - | 35 | def _make_parent_dirs_and_return_path(file_path: str): 36 | import os 37 | os.makedirs(os.path.dirname(file_path), exist_ok=True) 38 | return file_path 39 | 40 | def catboost_predict_values( 41 | data_path, 42 | model_path, 43 | predictions_path, 44 | 45 | label_column = None, 46 | ): 47 | '''Predict values with a CatBoost model. 48 | 49 | Args: 50 | data_path: Path for the data in CSV format. 51 | model_path: Path for the trained model in binary CatBoostModel format. 52 | label_column: Column containing the label data. 53 | predictions_path: Output path for the predictions. 54 | 55 | Outputs: 56 | predictions: Predictions in text format. 57 | 58 | Annotations: 59 | author: Alexey Volkov 60 | ''' 61 | import tempfile 62 | 63 | from catboost import CatBoost, Pool 64 | import numpy 65 | 66 | if label_column: 67 | column_descriptions = {label_column: 'Label'} 68 | column_description_path = tempfile.NamedTemporaryFile(delete=False).name 69 | with open(column_description_path, 'w') as column_description_file: 70 | for idx, kind in column_descriptions.items(): 71 | column_description_file.write('{}\t{}\n'.format(idx, kind)) 72 | else: 73 | column_description_path = None 74 | 75 | eval_data = Pool( 76 | data_path, 77 | column_description=column_description_path, 78 | has_header=True, 79 | delimiter=',', 80 | ) 81 | 82 | model = CatBoost() 83 | model.load_model(model_path) 84 | 85 | predictions = model.predict(eval_data, prediction_type='RawFormulaVal') 86 | numpy.savetxt(predictions_path, predictions) 87 | 88 | import argparse 89 | _parser = argparse.ArgumentParser(prog='Catboost predict values', description='Predict values with a CatBoost model.\n\n Args:\n data_path: Path for the data in CSV format.\n model_path: Path for the trained model in binary CatBoostModel format.\n label_column: Column containing the label data.\n predictions_path: Output path for the predictions.\n\n Outputs:\n predictions: Predictions in text format.\n\n Annotations:\n author: Alexey Volkov ') 90 | _parser.add_argument("--data", dest="data_path", type=str, required=True, default=argparse.SUPPRESS) 91 | _parser.add_argument("--model", dest="model_path", type=str, required=True, default=argparse.SUPPRESS) 92 | _parser.add_argument("--label-column", dest="label_column", type=int, required=False, default=argparse.SUPPRESS) 93 | _parser.add_argument("--predictions", dest="predictions_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) 94 | _parsed_args = 
vars(_parser.parse_args()) 95 | 96 | _outputs = catboost_predict_values(**_parsed_args) 97 | args: 98 | - --data 99 | - {inputPath: data} 100 | - --model 101 | - {inputPath: model} 102 | - if: 103 | cond: {isPresent: label_column} 104 | then: 105 | - --label-column 106 | - {inputValue: label_column} 107 | - --predictions 108 | - {outputPath: predictions} 109 | -------------------------------------------------------------------------------- /lesson10_catboost/Predict_classes/from_CSV/component.yaml: -------------------------------------------------------------------------------- 1 | name: Catboost predict classes 2 | description: |- 3 | Predict classes using the CatBoost classifier model. 4 | 5 | Args: 6 | data_path: Path for the data in CSV format. 7 | model_path: Path for the trained model in binary CatBoostModel format. 8 | label_column: Column containing the label data. 9 | predictions_path: Output path for the predictions. 10 | 11 | Outputs: 12 | predictions: Class predictions in text format. 13 | 14 | Annotations: 15 | author: Alexey Volkov 16 | inputs: 17 | - {name: data, type: CSV} 18 | - {name: model, type: CatBoostModel} 19 | - {name: label_column, type: Integer, optional: true} 20 | outputs: 21 | - {name: predictions} 22 | implementation: 23 | container: 24 | image: python:3.7 25 | command: 26 | - sh 27 | - -c 28 | - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 29 | 'catboost==0.22' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet 30 | --no-warn-script-location 'catboost==0.22' --user) && "$0" "$@" 31 | - python3 32 | - -u 33 | - -c 34 | - | 35 | def _make_parent_dirs_and_return_path(file_path: str): 36 | import os 37 | os.makedirs(os.path.dirname(file_path), exist_ok=True) 38 | return file_path 39 | 40 | def catboost_predict_classes( 41 | data_path, 42 | model_path, 43 | predictions_path, 44 | 45 | label_column = None, 46 | ): 47 | '''Predict classes using the CatBoost classifier model. 48 | 49 | Args: 50 | data_path: Path for the data in CSV format. 51 | model_path: Path for the trained model in binary CatBoostModel format. 52 | label_column: Column containing the label data. 53 | predictions_path: Output path for the predictions. 54 | 55 | Outputs: 56 | predictions: Class predictions in text format. 
57 | 58 | Annotations: 59 | author: Alexey Volkov 60 | ''' 61 | import tempfile 62 | 63 | from catboost import CatBoostClassifier, Pool 64 | import numpy 65 | 66 | if label_column: 67 | column_descriptions = {label_column: 'Label'} 68 | column_description_path = tempfile.NamedTemporaryFile(delete=False).name 69 | with open(column_description_path, 'w') as column_description_file: 70 | for idx, kind in column_descriptions.items(): 71 | column_description_file.write('{}\t{}\n'.format(idx, kind)) 72 | else: 73 | column_description_path = None 74 | 75 | eval_data = Pool( 76 | data_path, 77 | column_description=column_description_path, 78 | has_header=True, 79 | delimiter=',', 80 | ) 81 | 82 | model = CatBoostClassifier() 83 | model.load_model(model_path) 84 | 85 | predictions = model.predict(eval_data) 86 | numpy.savetxt(predictions_path, predictions, fmt='%s') 87 | 88 | import argparse 89 | _parser = argparse.ArgumentParser(prog='Catboost predict classes', description='Predict classes using the CatBoost classifier model.\n\n Args:\n data_path: Path for the data in CSV format.\n model_path: Path for the trained model in binary CatBoostModel format.\n label_column: Column containing the label data.\n predictions_path: Output path for the predictions.\n\n Outputs:\n predictions: Class predictions in text format.\n\n Annotations:\n author: Alexey Volkov ') 90 | _parser.add_argument("--data", dest="data_path", type=str, required=True, default=argparse.SUPPRESS) 91 | _parser.add_argument("--model", dest="model_path", type=str, required=True, default=argparse.SUPPRESS) 92 | _parser.add_argument("--label-column", dest="label_column", type=int, required=False, default=argparse.SUPPRESS) 93 | _parser.add_argument("--predictions", dest="predictions_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) 94 | _parsed_args = vars(_parser.parse_args()) 95 | 96 | _outputs = catboost_predict_classes(**_parsed_args) 97 | args: 98 | - --data 99 | - {inputPath: data} 100 | - --model 101 | - {inputPath: model} 102 | - if: 103 | cond: {isPresent: label_column} 104 | then: 105 | - --label-column 106 | - {inputValue: label_column} 107 | - --predictions 108 | - {outputPath: predictions} 109 | -------------------------------------------------------------------------------- /lesson5_control_structure/control_structure.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2020 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | # %% [markdown] 18 | # # DSL control structures tutorial 19 | # Shows how to use conditional execution and exit handlers. 
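# (A compiled example is checked in alongside this file: control.yaml in this lesson is the Argo
# workflow produced for the conditional pipeline below, where each `dsl.Condition` shows up as a
# `when:` clause such as
# when: '"{{tasks.flip-coin-op.outputs.parameters.flip-coin-op-Output}}" == "heads"'.)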
20 | 21 | # %% 22 | from typing import NamedTuple 23 | 24 | import kfp 25 | from kfp import dsl 26 | from kfp.components import func_to_container_op, InputPath, OutputPath 27 | 28 | # %% [markdown] 29 | # ## Conditional execution 30 | # You can use the `with dsl.Condition(task1.outputs["output_name"] == "value"):` context to execute parts of the pipeline conditionally. 31 | 32 | # %% 33 | 34 | @func_to_container_op 35 | def get_random_int_op(minimum: int, maximum: int) -> int: 36 | """Generate a random number between minimum and maximum (inclusive).""" 37 | import random 38 | result = random.randint(minimum, maximum) 39 | print(result) 40 | return result 41 | 42 | 43 | @func_to_container_op 44 | def flip_coin_op() -> str: 45 | """Flip a coin and output heads or tails randomly.""" 46 | import random 47 | result = random.choice(['heads', 'tails']) 48 | print(result) 49 | return result 50 | 51 | 52 | @func_to_container_op 53 | def print_op(message: str): 54 | """Print a message.""" 55 | print(message) 56 | 57 | 58 | @dsl.pipeline( 59 | name='Conditional execution pipeline', 60 | description='Shows how to use dsl.Condition().' 61 | ) 62 | def flipcoin_pipeline(): 63 | flip = flip_coin_op() 64 | with dsl.Condition(flip.output == 'heads'): 65 | random_num_head = get_random_int_op(0, 9) 66 | with dsl.Condition(random_num_head.output > 5): 67 | print_op('heads and %s > 5!' % random_num_head.output) 68 | with dsl.Condition(random_num_head.output <= 5): 69 | print_op('heads and %s <= 5!' % random_num_head.output) 70 | 71 | with dsl.Condition(flip.output == 'tails'): 72 | random_num_tail = get_random_int_op(10, 19) 73 | with dsl.Condition(random_num_tail.output > 15): 74 | print_op('tails and %s > 15!' % random_num_tail.output) 75 | with dsl.Condition(random_num_tail.output <= 15): 76 | print_op('tails and %s <= 15!' % random_num_tail.output) 77 | 78 | 79 | # Submit the pipeline for execution: 80 | #kfp.Client(host=kfp_endpoint).create_run_from_pipeline_func(flipcoin_pipeline, arguments={}) 81 | 82 | # %% [markdown] 83 | # ## Exit handlers 84 | # You can use the `with dsl.ExitHandler(exit_task):` context to execute a task when the rest of the pipeline finishes (succeeds or fails). 85 | 86 | # %% 87 | @func_to_container_op 88 | def fail_op(message): 89 | """Fails.""" 90 | import sys 91 | print(message) 92 | sys.exit(1) 93 | 94 | 95 | @dsl.pipeline( 96 | name='Conditional execution pipeline with exit handler', 97 | description='Shows how to use dsl.Condition() and dsl.ExitHandler().' 98 | ) 99 | def flipcoin_exit_pipeline(): 100 | exit_task = print_op('Exit handler has worked!') 101 | with dsl.ExitHandler(exit_task): 102 | flip = flip_coin_op() 103 | with dsl.Condition(flip.output == 'heads'): 104 | random_num_head = get_random_int_op(0, 9) 105 | with dsl.Condition(random_num_head.output > 5): 106 | print_op('heads and %s > 5!' % random_num_head.output) 107 | with dsl.Condition(random_num_head.output <= 5): 108 | print_op('heads and %s <= 5!' % random_num_head.output) 109 | 110 | with dsl.Condition(flip.output == 'tails'): 111 | random_num_tail = get_random_int_op(10, 19) 112 | with dsl.Condition(random_num_tail.output > 15): 113 | print_op('tails and %s > 15!' % random_num_tail.output) 114 | with dsl.Condition(random_num_tail.output <= 15): 115 | print_op('tails and %s <= 15!'
% random_num_tail.output) 116 | 117 | with dsl.Condition(flip.output == 'tails'): 118 | fail_op(message="Failing the run to demonstrate that exit handler still gets executed.") 119 | 120 | 121 | if __name__ == '__main__': 122 | # Compiling the pipeline 123 | kfp.compiler.Compiler().compile(flipcoin_exit_pipeline, __file__ + '.yaml') 124 | -------------------------------------------------------------------------------- /lesson10_catboost/Predict_class_probabilities/from_CSV/component.yaml: -------------------------------------------------------------------------------- 1 | name: Catboost predict class probabilities 2 | description: |- 3 | Predict class probabilities with a CatBoost model. 4 | 5 | Args: 6 | data_path: Path for the data in CSV format. 7 | model_path: Path for the trained model in binary CatBoostModel format. 8 | label_column: Column containing the label data. 9 | predictions_path: Output path for the predictions. 10 | 11 | Outputs: 12 | predictions: Predictions in text format. 13 | 14 | Annotations: 15 | author: Alexey Volkov 16 | inputs: 17 | - {name: data, type: CSV} 18 | - {name: model, type: CatBoostModel} 19 | - {name: label_column, type: Integer, optional: true} 20 | outputs: 21 | - {name: predictions} 22 | implementation: 23 | container: 24 | image: python:3.7 25 | command: 26 | - sh 27 | - -c 28 | - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 29 | 'catboost==0.23' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet 30 | --no-warn-script-location 'catboost==0.23' --user) && "$0" "$@" 31 | - python3 32 | - -u 33 | - -c 34 | - | 35 | def _make_parent_dirs_and_return_path(file_path: str): 36 | import os 37 | os.makedirs(os.path.dirname(file_path), exist_ok=True) 38 | return file_path 39 | 40 | def catboost_predict_class_probabilities( 41 | data_path, 42 | model_path, 43 | predictions_path, 44 | 45 | label_column = None, 46 | ): 47 | '''Predict class probabilities with a CatBoost model. 48 | 49 | Args: 50 | data_path: Path for the data in CSV format. 51 | model_path: Path for the trained model in binary CatBoostModel format. 52 | label_column: Column containing the label data. 53 | predictions_path: Output path for the predictions. 54 | 55 | Outputs: 56 | predictions: Predictions in text format. 
57 | 58 | Annotations: 59 | author: Alexey Volkov 60 | ''' 61 | import tempfile 62 | 63 | from catboost import CatBoost, Pool 64 | import numpy 65 | 66 | if label_column: 67 | column_descriptions = {label_column: 'Label'} 68 | column_description_path = tempfile.NamedTemporaryFile(delete=False).name 69 | with open(column_description_path, 'w') as column_description_file: 70 | for idx, kind in column_descriptions.items(): 71 | column_description_file.write('{}\t{}\n'.format(idx, kind)) 72 | else: 73 | column_description_path = None 74 | 75 | eval_data = Pool( 76 | data_path, 77 | column_description=column_description_path, 78 | has_header=True, 79 | delimiter=',', 80 | ) 81 | 82 | model = CatBoost() 83 | model.load_model(model_path) 84 | 85 | predictions = model.predict(eval_data, prediction_type='Probability') 86 | numpy.savetxt(predictions_path, predictions) 87 | 88 | import argparse 89 | _parser = argparse.ArgumentParser(prog='Catboost predict class probabilities', description='Predict class probabilities with a CatBoost model.\n\n Args:\n data_path: Path for the data in CSV format.\n model_path: Path for the trained model in binary CatBoostModel format.\n label_column: Column containing the label data.\n predictions_path: Output path for the predictions.\n\n Outputs:\n predictions: Predictions in text format.\n\n Annotations:\n author: Alexey Volkov ') 90 | _parser.add_argument("--data", dest="data_path", type=str, required=True, default=argparse.SUPPRESS) 91 | _parser.add_argument("--model", dest="model_path", type=str, required=True, default=argparse.SUPPRESS) 92 | _parser.add_argument("--label-column", dest="label_column", type=int, required=False, default=argparse.SUPPRESS) 93 | _parser.add_argument("--predictions", dest="predictions_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) 94 | _parsed_args = vars(_parser.parse_args()) 95 | 96 | _outputs = catboost_predict_class_probabilities(**_parsed_args) 97 | args: 98 | - --data 99 | - {inputPath: data} 100 | - --model 101 | - {inputPath: model} 102 | - if: 103 | cond: {isPresent: label_column} 104 | then: 105 | - --label-column 106 | - {inputValue: label_column} 107 | - --predictions 108 | - {outputPath: predictions} 109 | -------------------------------------------------------------------------------- /train_until_good/train_until_good.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2020 The Kubeflow Pipelines authors 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # This sample demonstrates continuous training using a train-eval-check recursive loop. 17 | # The main pipeline trains the initial model and then gradually trains the model 18 | # some more until the model evaluation metrics are good enough.
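# The stopping rule lives in the recursive sub-pipeline below: another training round is
# scheduled only while the measured mean squared error stays above 0.01, so the recursion
# ends as soon as the metric check fails.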
19 | 20 | import kfp 21 | from kfp import components 22 | 23 | 24 | chicago_taxi_dataset_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/e3337b8bdcd63636934954e592d4b32c95b49129/components/datasets/Chicago%20Taxi/component.yaml') 25 | xgboost_train_on_csv_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/567c04c51ff00a1ee525b3458425b17adbe3df61/components/XGBoost/Train/component.yaml') 26 | xgboost_predict_on_csv_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/567c04c51ff00a1ee525b3458425b17adbe3df61/components/XGBoost/Predict/component.yaml') 27 | 28 | pandas_transform_csv_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/6162d55998b176b50267d351241100bb0ee715bc/components/pandas/Transform_DataFrame/in_CSV_format/component.yaml') 29 | drop_header_op = kfp.components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/02c9638287468c849632cf9f7885b51de4c66f86/components/tables/Remove_header/component.yaml') 30 | calculate_regression_metrics_from_csv_op = kfp.components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/616542ac0f789914f4eb53438da713dd3004fba4/components/ml_metrics/Calculate_regression_metrics/from_CSV/component.yaml') 31 | 32 | 33 | # This recursive sub-pipeline trains a model, evaluates it, calculates the metrics and checks them. 34 | # If the model error is too high, then more training is performed until the model is good. 35 | @kfp.dsl.graph_component 36 | def train_until_low_error(starting_model, training_data, true_values): 37 | # Training 38 | model = xgboost_train_on_csv_op( 39 | training_data=training_data, 40 | starting_model=starting_model, 41 | label_column=0, 42 | objective='reg:squarederror', 43 | num_iterations=50, 44 | ).outputs['model'] 45 | 46 | # Predicting 47 | predictions = xgboost_predict_on_csv_op( 48 | data=training_data, 49 | model=model, 50 | label_column=0, 51 | ).output 52 | 53 | # Calculating the regression metrics 54 | metrics_task = calculate_regression_metrics_from_csv_op( 55 | true_values=true_values, 56 | predicted_values=predictions, 57 | ) 58 | 59 | # Checking the metrics 60 | with kfp.dsl.Condition(metrics_task.outputs['mean_squared_error'] > 0.01): 61 | # Training some more 62 | train_until_low_error( 63 | starting_model=model, 64 | training_data=training_data, 65 | true_values=true_values, 66 | ) 67 | 68 | 69 | # The main pipeline trains the initial model and then gradually trains the model some more until the model evaluation metrics are good enough.
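# Note: the @kfp.dsl.graph_component decorator above is what makes the self-call work in the
# KFP v1 SDK. A plain pipeline function that called itself would recurse without bound while the
# pipeline is being compiled; a graph component instead becomes a reusable sub-graph that the
# compiler can reference from the dsl.Condition branch.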
70 | @kfp.dsl.pipeline() 71 | def train_until_good_pipeline(): 72 | # Preparing the training data 73 | training_data = chicago_taxi_dataset_op( 74 | where='trip_start_timestamp >= "2019-01-01" AND trip_start_timestamp < "2019-02-01"', 75 | select='tips,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tolls,extras,trip_total', 76 | limit=10000, 77 | ).output 78 | 79 | # Preparing the true values 80 | true_values_table = pandas_transform_csv_op( 81 | table=training_data, 82 | transform_code='df = df[["tips"]]', 83 | ).output 84 | 85 | true_values = drop_header_op(true_values_table).output 86 | 87 | # Initial model training 88 | first_model = xgboost_train_on_csv_op( 89 | training_data=training_data, 90 | label_column=0, 91 | objective='reg:squarederror', 92 | num_iterations=100, 93 | ).outputs['model'] 94 | 95 | # Recursively training until the error becomes low 96 | train_until_low_error( 97 | starting_model=first_model, 98 | training_data=training_data, 99 | true_values=true_values, 100 | ) 101 | 102 | 103 | if __name__ == '__main__': 104 | kfp.compiler.Compiler().compile(train_until_good_pipeline, __file__ + '.yaml') -------------------------------------------------------------------------------- /first_project/first.yml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: Workflow 3 | metadata: 4 | generateName: simple-sci-kit-kf-pipeline- 5 | annotations: {pipelines.kubeflow.org/kfp_sdk_version: 1.5.0-rc.2, pipelines.kubeflow.org/pipeline_compilation_time: '2021-04-04T16:14:25.709465', 6 | pipelines.kubeflow.org/pipeline_spec: '{"description": "A simple end to end sci-kit 7 | seldon kf pipeline", "inputs": [{"default": "index.docker.io/seldonio", "name": 8 | "docker_org", "optional": true}, {"default": "0.2", "name": "train_container_version", 9 | "optional": true}, {"default": "0.1", "name": "serve_container_version", "optional": 10 | true}], "name": "Simple sci-kit KF Pipeline"}'} 11 | labels: {pipelines.kubeflow.org/kfp_sdk_version: 1.5.0-rc.2} 12 | spec: 13 | entrypoint: simple-sci-kit-kf-pipeline 14 | templates: 15 | - name: create-pvc 16 | resource: 17 | action: create 18 | manifest: | 19 | apiVersion: v1 20 | kind: PersistentVolumeClaim 21 | metadata: 22 | name: '{{workflow.name}}-nfs-1' 23 | spec: 24 | accessModes: 25 | - ReadWriteOnce 26 | resources: 27 | requests: 28 | storage: 10G 29 | outputs: 30 | parameters: 31 | - name: create-pvc-manifest 32 | valueFrom: {jsonPath: '{}'} 33 | - name: create-pvc-name 34 | valueFrom: {jsonPath: '{.metadata.name}'} 35 | - name: create-pvc-size 36 | valueFrom: {jsonPath: '{.status.capacity.storage}'} 37 | - name: serve 38 | resource: 39 | action: create 40 | successCondition: status.state == Available 41 | manifest: | 42 | apiVersion: machinelearning.seldon.io/v1alpha2 43 | kind: SeldonDeployment 44 | metadata: 45 | labels: 46 | app: seldon 47 | name: mnist-classifier 48 | spec: 49 | annotations: 50 | deployment_version: v1 51 | project_name: MNIST Example 52 | name: mnist-classifier 53 | predictors: 54 | - annotations: 55 | predictor_version: v1 56 | componentSpecs: 57 | - spec: 58 | containers: 59 | - image: '{{inputs.parameters.docker_org}}/skmnistclassifier_runtime:{{inputs.parameters.serve_container_version}}' 60 | imagePullPolicy: Always 61 | name: mnist-classifier 62 | volumeMounts: 63 | - mountPath: /data 64 | name: persistent-storage 65 | terminationGracePeriodSeconds: 1 66 | volumes: 67 | - name: persistent-storage 68 | 
persistentVolumeClaim: 69 | claimName: '{{inputs.parameters.create-pvc-name}}' 70 | graph: 71 | children: [] 72 | endpoint: 73 | type: REST 74 | name: mnist-classifier 75 | type: MODEL 76 | name: mnist-classifier 77 | replicas: 1 78 | inputs: 79 | parameters: 80 | - {name: create-pvc-name} 81 | - {name: docker_org} 82 | - {name: serve_container_version} 83 | outputs: 84 | parameters: 85 | - name: serve-manifest 86 | valueFrom: {jsonPath: '{}'} 87 | - name: serve-name 88 | valueFrom: {jsonPath: '{.metadata.name}'} 89 | - name: simple-sci-kit-kf-pipeline 90 | inputs: 91 | parameters: 92 | - {name: docker_org} 93 | - {name: serve_container_version} 94 | - {name: train_container_version} 95 | dag: 96 | tasks: 97 | - {name: create-pvc, template: create-pvc} 98 | - name: serve 99 | template: serve 100 | dependencies: [create-pvc, sk-train] 101 | arguments: 102 | parameters: 103 | - {name: create-pvc-name, value: '{{tasks.create-pvc.outputs.parameters.create-pvc-name}}'} 104 | - {name: docker_org, value: '{{inputs.parameters.docker_org}}'} 105 | - {name: serve_container_version, value: '{{inputs.parameters.serve_container_version}}'} 106 | - name: sk-train 107 | template: sk-train 108 | dependencies: [create-pvc] 109 | arguments: 110 | parameters: 111 | - {name: create-pvc-name, value: '{{tasks.create-pvc.outputs.parameters.create-pvc-name}}'} 112 | - {name: docker_org, value: '{{inputs.parameters.docker_org}}'} 113 | - {name: train_container_version, value: '{{inputs.parameters.train_container_version}}'} 114 | - name: sk-train 115 | container: 116 | image: '{{inputs.parameters.docker_org}}/skmnistclassifier_trainer:{{inputs.parameters.train_container_version}}' 117 | volumeMounts: 118 | - {mountPath: /data, name: create-pvc} 119 | inputs: 120 | parameters: 121 | - {name: create-pvc-name} 122 | - {name: docker_org} 123 | - {name: train_container_version} 124 | volumes: 125 | - name: create-pvc 126 | persistentVolumeClaim: {claimName: '{{inputs.parameters.create-pvc-name}}'} 127 | arguments: 128 | parameters: 129 | - {name: docker_org, value: index.docker.io/seldonio} 130 | - {name: train_container_version, value: '0.2'} 131 | - {name: serve_container_version, value: '0.1'} 132 | serviceAccountName: pipeline-runner 133 | -------------------------------------------------------------------------------- /lesson6_data_passing/data_passing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2020 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # %% [markdown] 17 | # # Data passing tutorial 18 | # Data passing is the most important aspect of Pipelines. 19 | # 20 | # In Kubeflow Pipelines, pipeline authors compose pipelines by creating component instances (tasks) and connecting them together. 21 | # 22 | # Components have inputs and outputs. They can consume and produce arbitrary data.
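# For example, a data-preparation task can output a CSV file that a training task takes as
# input, and the model the trainer outputs can in turn be passed to an evaluation task.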
23 | # 24 | # Pipeline authors establish connections between component tasks by connecting their data inputs and outputs - by passing the output of one task as an argument to another task's input. 25 | # 26 | # The system takes care of storing the data produced by components and later passing that data to other components for consumption as instructed by the pipeline. 27 | # 28 | # This tutorial shows how to create Python components that produce, consume and transform data. 29 | # It shows how to create data passing pipelines by instantiating components and connecting them together. 30 | 31 | # %% 32 | from typing import NamedTuple 33 | 34 | import kfp 35 | from kfp.components import func_to_container_op, InputPath, OutputPath 36 | 37 | # %% [markdown] 38 | # ## Small data 39 | # 40 | # Small data is the data that you'll be comfortable passing as a program's command-line argument. Small data size should not exceed a few kilobytes. 41 | # 42 | # Some examples of typical types of small data are: number, URL, small string (e.g. column name). 43 | # 44 | # Small lists, dictionaries and JSON structures are fine, but keep an eye on the size and consider switching to file-based data passing methods that are more suitable for bigger data (more than several kilobytes) or binary data. 45 | # 46 | # All small data outputs will be at some point serialized to strings and all small data input values will be at some point deserialized from strings (passed as command-line arguments). There are built-in serializers and deserializers for several common types (e.g. `str`, `int`, `float`, `bool`, `list`, `dict`). All other types of data need to be serialized manually before returning the data. Make sure to properly specify type annotations, otherwise there would be no automatic deserialization and the component function will receive strings instead of deserialized objects. 47 | 48 | # %% [markdown] 49 | # ## Bigger data (files) 50 | # 51 | # Bigger data should be read from files and written to files. 52 | # 53 | # The paths for the input and output files are chosen by the system and are passed into the function (as strings). 54 | # 55 | # Use the `InputPath` parameter annotation to tell the system that the function wants to consume the corresponding input data as a file. The system will download the data, write it to a local file and then pass the **path** of that file to the function. 56 | # 57 | # Use the `OutputPath` parameter annotation to tell the system that the function wants to produce the corresponding output data as a file. The system will prepare and pass the **path** of a file where the function should write the output data. After the function exits, the system will upload the data to the storage system so that it can be passed to downstream components. 58 | # 59 | # You can specify the type of the consumed/produced data by specifying the type argument to `InputPath` and `OutputPath`. The type can be a Python type or an arbitrary type name string. `OutputPath('TFModel')` means that the function states that the data it has written to a file has type 'TFModel'. `InputPath('TFModel')` means that the function states that it expects the data it reads from a file to have type 'TFModel'. When the pipeline author connects inputs to outputs, the system checks whether the types match.
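# As a minimal sketch (hypothetical component, not part of this lesson), the same annotations
# drive both this type checking and the small-data (de)serialization described above:
#
#   @func_to_container_op
#   def add(a: float, b: float) -> float:
#       return a + b
#
# Because `a` and `b` are annotated, the string command-line arguments are converted to floats
# before add() runs, and wiring an output with a different declared type into them is reported
# as a type mismatch when the pipeline is constructed. Untyped `InputPath()`/`OutputPath()`
# (as in print_text below) opts out of the check and accepts any data.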
60 | # 61 | # Note on input/output names: When the function is converted to component, the input and output names generally follow the parameter names, but the "\_path" and "\_file" suffixes are stripped from file/path inputs and outputs. E.g. the `number_file_path: InputPath(int)` parameter becomes the `number: int` input. This makes the argument passing look more natural: `number=42` instead of `number_file_path=42`. 62 | # %% [markdown] 63 | # 64 | # ### Writing and reading bigger data 65 | 66 | # %% 67 | # Writing bigger data 68 | @func_to_container_op 69 | def repeat_line(line: str, output_text_path: OutputPath(str), count: int = 10): 70 | '''Repeat the line specified number of times''' 71 | with open(output_text_path, 'w') as writer: 72 | for i in range(count): 73 | writer.write(line + '\n') 74 | 75 | 76 | # Reading bigger data 77 | @func_to_container_op 78 | def print_text(text_path: InputPath()): # The "text" input is untyped so that any data can be printed 79 | '''Print text''' 80 | with open(text_path, 'r') as reader: 81 | for line in reader: 82 | print(line, end = '') 83 | 84 | def print_repeating_lines_pipeline(): 85 | repeat_lines_task = repeat_line(line='Hello', count=5000) 86 | print_text(repeat_lines_task.output) # Don't forget .output ! 87 | 88 | # Submit the pipeline for execution: 89 | #kfp.Client(host=kfp_endpoint).create_run_from_pipeline_func(print_repeating_lines_pipeline, arguments={}) 90 | 91 | # %% [markdown] 92 | # ### Processing bigger data 93 | 94 | # %% 95 | @func_to_container_op 96 | def split_text_lines(source_path: InputPath(str), odd_lines_path: OutputPath(str), even_lines_path: OutputPath(str)): 97 | with open(source_path, 'r') as reader: 98 | with open(odd_lines_path, 'w') as odd_writer: 99 | with open(even_lines_path, 'w') as even_writer: 100 | while True: 101 | line = reader.readline() 102 | if line == "": 103 | break 104 | odd_writer.write(line) 105 | line = reader.readline() 106 | if line == "": 107 | break 108 | even_writer.write(line) 109 | 110 | def text_splitting_pipeline(): 111 | text = '\n'.join(['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten']) 112 | split_text_task = split_text_lines(text) 113 | print_text(split_text_task.outputs['odd_lines']) 114 | print_text(split_text_task.outputs['even_lines']) 115 | 116 | # Submit the pipeline for execution: 117 | #kfp.Client(host=kfp_endpoint).create_run_from_pipeline_func(text_splitting_pipeline, arguments={}) 118 | 119 | 120 | # %% [markdown] 121 | # ### Example: Pipeline that generates then sums many numbers 122 | 123 | # %% 124 | # Writing many numbers 125 | @func_to_container_op 126 | def write_numbers(numbers_path: OutputPath(str), start: int = 0, count: int = 10): 127 | with open(numbers_path, 'w') as writer: 128 | for i in range(start, count): 129 | writer.write(str(i) + '\n') 130 | 131 | 132 | # Reading and summing many numbers 133 | @func_to_container_op 134 | def sum_numbers(numbers_path: InputPath(str)) -> int: 135 | sum = 0 136 | with open(numbers_path, 'r') as reader: 137 | for line in reader: 138 | sum = sum + int(line) 139 | return sum 140 | 141 | 142 | 143 | # Pipeline to sum 100000 numbers 144 | def sum_pipeline(count: int = 100000): 145 | numbers_task = write_numbers(count=count) 146 | print_text(numbers_task.output) 147 | 148 | sum_task = sum_numbers(numbers_task.outputs['numbers']) 149 | print_text(sum_task.output) 150 | 151 | 152 | # Submit the pipeline for execution: 153 | #kfp.Client(host=kfp_endpoint).create_run_from_pipeline_func(sum_pipeline, 
arguments={}) 154 | 155 | # Combining all pipelines together in a single pipeline 156 | def file_passing_pipelines(): 157 | print_repeating_lines_pipeline() 158 | text_splitting_pipeline() 159 | sum_pipeline() 160 | 161 | 162 | if __name__ == '__main__': 163 | # Compiling the pipeline 164 | kfp.compiler.Compiler().compile(file_passing_pipelines, __file__ + '.yaml') 165 | -------------------------------------------------------------------------------- /lesson10_catboost/Train_regression/from_CSV/component.yaml: -------------------------------------------------------------------------------- 1 | name: Catboost train regression 2 | description: |- 3 | Train a CatBoost regression model. 4 | 5 | Args: 6 | training_data_path: Path for the training data in CSV format. 7 | model_path: Output path for the trained model in binary CatBoostModel format. 8 | starting_model_path: Path for the existing trained model to start from. 9 | label_column: Column containing the label data. 10 | 11 | loss_function: The metric to use in training and also selector of the machine learning 12 | problem to solve. Default = 'RMSE'. Possible values: 13 | 'RMSE', 'MAE', 'Quantile:alpha=value', 'LogLinQuantile:alpha=value', 'Poisson', 'MAPE', 'Lq:q=value' 14 | num_iterations: Number of trees to add to the ensemble. 15 | learning_rate: Step size shrinkage used in update to prevent overfitting. 16 | Default value is selected automatically for binary classification with other parameters set to default. 17 | In all other cases default is 0.03. 18 | depth: Depth of a tree. All trees are the same depth. Default = 6 19 | random_seed: Random number seed. Default = 0 20 | 21 | cat_features: A list of Categorical features (indices or names). 22 | additional_training_options: A dictionary with additional options to pass to CatBoostRegressor 23 | 24 | Outputs: 25 | model: Trained model in binary CatBoostModel format.
26 | 27 | Annotations: 28 | author: Alexey Volkov 29 | inputs: 30 | - {name: training_data, type: CSV} 31 | - {name: starting_model, type: CatBoostModel, optional: true} 32 | - {name: label_column, type: Integer, default: '0', optional: true} 33 | - {name: loss_function, type: String, default: RMSE, optional: true} 34 | - {name: num_iterations, type: Integer, default: '500', optional: true} 35 | - {name: learning_rate, type: Float, optional: true} 36 | - {name: depth, type: Integer, default: '6', optional: true} 37 | - {name: random_seed, type: Integer, default: '0', optional: true} 38 | - {name: cat_features, type: JsonArray, optional: true} 39 | - {name: additional_training_options, type: JsonObject, default: '{}', optional: true} 40 | outputs: 41 | - {name: model, type: CatBoostModel} 42 | implementation: 43 | container: 44 | image: python:3.7 45 | command: 46 | - sh 47 | - -c 48 | - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 49 | 'catboost==0.23' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet 50 | --no-warn-script-location 'catboost==0.23' --user) && "$0" "$@" 51 | - python3 52 | - -u 53 | - -c 54 | - | 55 | def _make_parent_dirs_and_return_path(file_path: str): 56 | import os 57 | os.makedirs(os.path.dirname(file_path), exist_ok=True) 58 | return file_path 59 | 60 | def catboost_train_regression( 61 | training_data_path, 62 | model_path, 63 | starting_model_path = None, 64 | label_column = 0, 65 | 66 | loss_function = 'RMSE', 67 | num_iterations = 500, 68 | learning_rate = None, 69 | depth = 6, 70 | random_seed = 0, 71 | 72 | cat_features = None, 73 | 74 | additional_training_options = {}, 75 | ): 76 | '''Train a CatBoost regression model. 77 | 78 | Args: 79 | training_data_path: Path for the training data in CSV format. 80 | model_path: Output path for the trained model in binary CatBoostModel format. 81 | starting_model_path: Path for the existing trained model to start from. 82 | label_column: Column containing the label data. 83 | 84 | loss_function: The metric to use in training and also selector of the machine learning 85 | problem to solve. Default = 'RMSE'. Possible values: 86 | 'RMSE', 'MAE', 'Quantile:alpha=value', 'LogLinQuantile:alpha=value', 'Poisson', 'MAPE', 'Lq:q=value' 87 | num_iterations: Number of trees to add to the ensemble. 88 | learning_rate: Step size shrinkage used in update to prevent overfitting. 89 | Default value is selected automatically for binary classification with other parameters set to default. 90 | In all other cases default is 0.03. 91 | depth: Depth of a tree. All trees are the same depth. Default = 6 92 | random_seed: Random number seed. Default = 0 93 | 94 | cat_features: A list of Categorical features (indices or names). 95 | additional_training_options: A dictionary with additional options to pass to CatBoostRegressor 96 | 97 | Outputs: 98 | model: Trained model in binary CatBoostModel format.
99 | 100 | Annotations: 101 | author: Alexey Volkov 102 | ''' 103 | import tempfile 104 | from pathlib import Path 105 | 106 | from catboost import CatBoostRegressor, Pool 107 | 108 | column_descriptions = {label_column: 'Label'} 109 | column_description_path = tempfile.NamedTemporaryFile(delete=False).name 110 | with open(column_description_path, 'w') as column_description_file: 111 | for idx, kind in column_descriptions.items(): 112 | column_description_file.write('{}\t{}\n'.format(idx, kind)) 113 | 114 | train_data = Pool( 115 | training_data_path, 116 | column_description=column_description_path, 117 | has_header=True, 118 | delimiter=',', 119 | ) 120 | 121 | model = CatBoostRegressor( 122 | iterations=num_iterations, 123 | depth=depth, 124 | learning_rate=learning_rate, 125 | loss_function=loss_function, 126 | random_seed=random_seed, 127 | verbose=True, 128 | **additional_training_options, 129 | ) 130 | 131 | model.fit( 132 | train_data, 133 | cat_features=cat_features, 134 | init_model=starting_model_path, 135 | #verbose=False, 136 | #plot=True, 137 | ) 138 | Path(model_path).parent.mkdir(parents=True, exist_ok=True) 139 | model.save_model(model_path) 140 | 141 | import json 142 | import argparse 143 | _parser = argparse.ArgumentParser(prog='Catboost train regression', description="Train a CatBoost classifier model.\n\n Args:\n training_data_path: Path for the training data in CSV format.\n model_path: Output path for the trained model in binary CatBoostModel format.\n starting_model_path: Path for the existing trained model to start from.\n label_column: Column containing the label data.\n\n loss_function: The metric to use in training and also selector of the machine learning\n problem to solve. Default = 'RMSE'. Possible values:\n 'RMSE', 'MAE', 'Quantile:alpha=value', 'LogLinQuantile:alpha=value', 'Poisson', 'MAPE', 'Lq:q=value'\n num_iterations: Number of trees to add to the ensemble.\n learning_rate: Step size shrinkage used in update to prevents overfitting.\n Default value is selected automatically for binary classification with other parameters set to default.\n In all other cases default is 0.03.\n depth: Depth of a tree. All trees are the same depth. Default = 6\n random_seed: Random number seed. 
Default = 0\n\n cat_features: A list of Categorical features (indices or names).\n additional_training_options: A dictionary with additional options to pass to CatBoostRegressor\n\n Outputs:\n model: Trained model in binary CatBoostModel format.\n\n Annotations:\n author: Alexey Volkov ") 144 | _parser.add_argument("--training-data", dest="training_data_path", type=str, required=True, default=argparse.SUPPRESS) 145 | _parser.add_argument("--starting-model", dest="starting_model_path", type=str, required=False, default=argparse.SUPPRESS) 146 | _parser.add_argument("--label-column", dest="label_column", type=int, required=False, default=argparse.SUPPRESS) 147 | _parser.add_argument("--loss-function", dest="loss_function", type=str, required=False, default=argparse.SUPPRESS) 148 | _parser.add_argument("--num-iterations", dest="num_iterations", type=int, required=False, default=argparse.SUPPRESS) 149 | _parser.add_argument("--learning-rate", dest="learning_rate", type=float, required=False, default=argparse.SUPPRESS) 150 | _parser.add_argument("--depth", dest="depth", type=int, required=False, default=argparse.SUPPRESS) 151 | _parser.add_argument("--random-seed", dest="random_seed", type=int, required=False, default=argparse.SUPPRESS) 152 | _parser.add_argument("--cat-features", dest="cat_features", type=json.loads, required=False, default=argparse.SUPPRESS) 153 | _parser.add_argument("--additional-training-options", dest="additional_training_options", type=json.loads, required=False, default=argparse.SUPPRESS) 154 | _parser.add_argument("--model", dest="model_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) 155 | _parsed_args = vars(_parser.parse_args()) 156 | 157 | _outputs = catboost_train_regression(**_parsed_args) 158 | args: 159 | - --training-data 160 | - {inputPath: training_data} 161 | - if: 162 | cond: {isPresent: starting_model} 163 | then: 164 | - --starting-model 165 | - {inputPath: starting_model} 166 | - if: 167 | cond: {isPresent: label_column} 168 | then: 169 | - --label-column 170 | - {inputValue: label_column} 171 | - if: 172 | cond: {isPresent: loss_function} 173 | then: 174 | - --loss-function 175 | - {inputValue: loss_function} 176 | - if: 177 | cond: {isPresent: num_iterations} 178 | then: 179 | - --num-iterations 180 | - {inputValue: num_iterations} 181 | - if: 182 | cond: {isPresent: learning_rate} 183 | then: 184 | - --learning-rate 185 | - {inputValue: learning_rate} 186 | - if: 187 | cond: {isPresent: depth} 188 | then: 189 | - --depth 190 | - {inputValue: depth} 191 | - if: 192 | cond: {isPresent: random_seed} 193 | then: 194 | - --random-seed 195 | - {inputValue: random_seed} 196 | - if: 197 | cond: {isPresent: cat_features} 198 | then: 199 | - --cat-features 200 | - {inputValue: cat_features} 201 | - if: 202 | cond: {isPresent: additional_training_options} 203 | then: 204 | - --additional-training-options 205 | - {inputValue: additional_training_options} 206 | - --model 207 | - {outputPath: model} 208 | -------------------------------------------------------------------------------- /lesson10_catboost/Train_classifier/from_CSV/component.yaml: -------------------------------------------------------------------------------- 1 | name: Catboost train classifier 2 | description: |- 3 | Train a CatBoost classifier model. 4 | 5 | Args: 6 | training_data_path: Path for the training data in CSV format. 7 | model_path: Output path for the trained model in binary CatBoostModel format. 
8 | starting_model_path: Path for the existing trained model to start from. 9 | label_column: Column containing the label data. 10 | 11 | loss_function: The metric to use in training and also selector of the machine learning 12 | problem to solve. Default = 'Logloss' 13 | num_iterations: Number of trees to add to the ensemble. 14 | learning_rate: Step size shrinkage used in update to prevent overfitting. 15 | Default value is selected automatically for binary classification with other parameters set to default. 16 | In all other cases default is 0.03. 17 | depth: Depth of a tree. All trees are the same depth. Default = 6 18 | random_seed: Random number seed. Default = 0 19 | 20 | cat_features: A list of Categorical features (indices or names). 21 | text_features: A list of Text features (indices or names). 22 | additional_training_options: A dictionary with additional options to pass to CatBoostClassifier 23 | 24 | Outputs: 25 | model: Trained model in binary CatBoostModel format. 26 | 27 | Annotations: 28 | author: Alexey Volkov 29 | inputs: 30 | - {name: training_data, type: CSV} 31 | - {name: starting_model, type: CatBoostModel, optional: true} 32 | - {name: label_column, type: Integer, default: '0', optional: true} 33 | - {name: loss_function, type: String, default: Logloss, optional: true} 34 | - {name: num_iterations, type: Integer, default: '500', optional: true} 35 | - {name: learning_rate, type: Float, optional: true} 36 | - {name: depth, type: Integer, default: '6', optional: true} 37 | - {name: random_seed, type: Integer, default: '0', optional: true} 38 | - {name: cat_features, type: JsonArray, optional: true} 39 | - {name: text_features, type: JsonArray, optional: true} 40 | - {name: additional_training_options, type: JsonObject, default: '{}', optional: true} 41 | outputs: 42 | - {name: model, type: CatBoostModel} 43 | implementation: 44 | container: 45 | image: python:3.7 46 | command: 47 | - sh 48 | - -c 49 | - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 50 | 'catboost==0.23' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet 51 | --no-warn-script-location 'catboost==0.23' --user) && "$0" "$@" 52 | - python3 53 | - -u 54 | - -c 55 | - | 56 | def _make_parent_dirs_and_return_path(file_path: str): 57 | import os 58 | os.makedirs(os.path.dirname(file_path), exist_ok=True) 59 | return file_path 60 | 61 | def catboost_train_classifier( 62 | training_data_path, 63 | model_path, 64 | starting_model_path = None, 65 | label_column = 0, 66 | 67 | loss_function = 'Logloss', 68 | num_iterations = 500, 69 | learning_rate = None, 70 | depth = 6, 71 | random_seed = 0, 72 | 73 | cat_features = None, 74 | text_features = None, 75 | 76 | additional_training_options = {}, 77 | ): 78 | '''Train a CatBoost classifier model. 79 | 80 | Args: 81 | training_data_path: Path for the training data in CSV format. 82 | model_path: Output path for the trained model in binary CatBoostModel format. 83 | starting_model_path: Path for the existing trained model to start from. 84 | label_column: Column containing the label data. 85 | 86 | loss_function: The metric to use in training and also selector of the machine learning 87 | problem to solve. Default = 'Logloss' 88 | num_iterations: Number of trees to add to the ensemble. 89 | learning_rate: Step size shrinkage used in update to prevent overfitting. 90 | Default value is selected automatically for binary classification with other parameters set to default.
91 | In all other cases default is 0.03. 92 | depth: Depth of a tree. All trees are the same depth. Default = 6 93 | random_seed: Random number seed. Default = 0 94 | 95 | cat_features: A list of Categorical features (indices or names). 96 | text_features: A list of Text features (indices or names). 97 | additional_training_options: A dictionary with additional options to pass to CatBoostClassifier 98 | 99 | Outputs: 100 | model: Trained model in binary CatBoostModel format. 101 | 102 | Annotations: 103 | author: Alexey Volkov 104 | ''' 105 | import tempfile 106 | from pathlib import Path 107 | 108 | from catboost import CatBoostClassifier, Pool 109 | 110 | column_descriptions = {label_column: 'Label'} 111 | column_description_path = tempfile.NamedTemporaryFile(delete=False).name 112 | with open(column_description_path, 'w') as column_description_file: 113 | for idx, kind in column_descriptions.items(): 114 | column_description_file.write('{}\t{}\n'.format(idx, kind)) 115 | 116 | train_data = Pool( 117 | training_data_path, 118 | column_description=column_description_path, 119 | has_header=True, 120 | delimiter=',', 121 | ) 122 | 123 | model = CatBoostClassifier( 124 | iterations=num_iterations, 125 | depth=depth, 126 | learning_rate=learning_rate, 127 | loss_function=loss_function, 128 | random_seed=random_seed, 129 | verbose=True, 130 | **additional_training_options, 131 | ) 132 | 133 | model.fit( 134 | train_data, 135 | cat_features=cat_features, 136 | text_features=text_features, 137 | init_model=starting_model_path, 138 | #verbose=False, 139 | #plot=True, 140 | ) 141 | Path(model_path).parent.mkdir(parents=True, exist_ok=True) 142 | model.save_model(model_path) 143 | 144 | import json 145 | import argparse 146 | _parser = argparse.ArgumentParser(prog='Catboost train classifier', description="Train a CatBoost classifier model.\n\n Args:\n training_data_path: Path for the training data in CSV format.\n model_path: Output path for the trained model in binary CatBoostModel format.\n starting_model_path: Path for the existing trained model to start from.\n label_column: Column containing the label data.\n\n loss_function: The metric to use in training and also selector of the machine learning\n problem to solve. Default = 'Logloss'\n num_iterations: Number of trees to add to the ensemble.\n learning_rate: Step size shrinkage used in update to prevents overfitting.\n Default value is selected automatically for binary classification with other parameters set to default.\n In all other cases default is 0.03.\n depth: Depth of a tree. All trees are the same depth. Default = 6\n random_seed: Random number seed. 
Default = 0\n\n cat_features: A list of Categorical features (indices or names).\n text_features: A list of Text features (indices or names).\n additional_training_options: A dictionary with additional options to pass to CatBoostClassifier\n\n Outputs:\n model: Trained model in binary CatBoostModel format.\n\n Annotations:\n author: Alexey Volkov ") 147 | _parser.add_argument("--training-data", dest="training_data_path", type=str, required=True, default=argparse.SUPPRESS) 148 | _parser.add_argument("--starting-model", dest="starting_model_path", type=str, required=False, default=argparse.SUPPRESS) 149 | _parser.add_argument("--label-column", dest="label_column", type=int, required=False, default=argparse.SUPPRESS) 150 | _parser.add_argument("--loss-function", dest="loss_function", type=str, required=False, default=argparse.SUPPRESS) 151 | _parser.add_argument("--num-iterations", dest="num_iterations", type=int, required=False, default=argparse.SUPPRESS) 152 | _parser.add_argument("--learning-rate", dest="learning_rate", type=float, required=False, default=argparse.SUPPRESS) 153 | _parser.add_argument("--depth", dest="depth", type=int, required=False, default=argparse.SUPPRESS) 154 | _parser.add_argument("--random-seed", dest="random_seed", type=int, required=False, default=argparse.SUPPRESS) 155 | _parser.add_argument("--cat-features", dest="cat_features", type=json.loads, required=False, default=argparse.SUPPRESS) 156 | _parser.add_argument("--text-features", dest="text_features", type=json.loads, required=False, default=argparse.SUPPRESS) 157 | _parser.add_argument("--additional-training-options", dest="additional_training_options", type=json.loads, required=False, default=argparse.SUPPRESS) 158 | _parser.add_argument("--model", dest="model_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) 159 | _parsed_args = vars(_parser.parse_args()) 160 | 161 | _outputs = catboost_train_classifier(**_parsed_args) 162 | args: 163 | - --training-data 164 | - {inputPath: training_data} 165 | - if: 166 | cond: {isPresent: starting_model} 167 | then: 168 | - --starting-model 169 | - {inputPath: starting_model} 170 | - if: 171 | cond: {isPresent: label_column} 172 | then: 173 | - --label-column 174 | - {inputValue: label_column} 175 | - if: 176 | cond: {isPresent: loss_function} 177 | then: 178 | - --loss-function 179 | - {inputValue: loss_function} 180 | - if: 181 | cond: {isPresent: num_iterations} 182 | then: 183 | - --num-iterations 184 | - {inputValue: num_iterations} 185 | - if: 186 | cond: {isPresent: learning_rate} 187 | then: 188 | - --learning-rate 189 | - {inputValue: learning_rate} 190 | - if: 191 | cond: {isPresent: depth} 192 | then: 193 | - --depth 194 | - {inputValue: depth} 195 | - if: 196 | cond: {isPresent: random_seed} 197 | then: 198 | - --random-seed 199 | - {inputValue: random_seed} 200 | - if: 201 | cond: {isPresent: cat_features} 202 | then: 203 | - --cat-features 204 | - {inputValue: cat_features} 205 | - if: 206 | cond: {isPresent: text_features} 207 | then: 208 | - --text-features 209 | - {inputValue: text_features} 210 | - if: 211 | cond: {isPresent: additional_training_options} 212 | then: 213 | - --additional-training-options 214 | - {inputValue: additional_training_options} 215 | - --model 216 | - {outputPath: model} 217 | -------------------------------------------------------------------------------- /lesson5_control_structure/control.yaml: -------------------------------------------------------------------------------- 1 | 
apiVersion: argoproj.io/v1alpha1 2 | kind: Workflow 3 | metadata: 4 | generateName: conditional-execution-pipeline- 5 | annotations: {pipelines.kubeflow.org/kfp_sdk_version: 1.5.0-rc.2, pipelines.kubeflow.org/pipeline_compilation_time: '2021-04-04T17:39:09.256839', 6 | pipelines.kubeflow.org/pipeline_spec: '{"description": "Shows how to use dsl.Condition().", 7 | "name": "Conditional execution pipeline"}'} 8 | labels: {pipelines.kubeflow.org/kfp_sdk_version: 1.5.0-rc.2} 9 | spec: 10 | entrypoint: conditional-execution-pipeline 11 | templates: 12 | - name: condition-1 13 | dag: 14 | tasks: 15 | - name: condition-2 16 | template: condition-2 17 | when: '{{tasks.get-random-int-op.outputs.parameters.get-random-int-op-Output}} 18 | > 5' 19 | dependencies: [get-random-int-op] 20 | arguments: 21 | parameters: 22 | - {name: get-random-int-op-Output, value: '{{tasks.get-random-int-op.outputs.parameters.get-random-int-op-Output}}'} 23 | - name: condition-3 24 | template: condition-3 25 | when: '{{tasks.get-random-int-op.outputs.parameters.get-random-int-op-Output}} 26 | <= 5' 27 | dependencies: [get-random-int-op] 28 | arguments: 29 | parameters: 30 | - {name: get-random-int-op-Output, value: '{{tasks.get-random-int-op.outputs.parameters.get-random-int-op-Output}}'} 31 | - {name: get-random-int-op, template: get-random-int-op} 32 | - name: condition-2 33 | inputs: 34 | parameters: 35 | - {name: get-random-int-op-Output} 36 | dag: 37 | tasks: 38 | - name: print-op 39 | template: print-op 40 | arguments: 41 | parameters: 42 | - {name: get-random-int-op-Output, value: '{{inputs.parameters.get-random-int-op-Output}}'} 43 | - name: condition-3 44 | inputs: 45 | parameters: 46 | - {name: get-random-int-op-Output} 47 | dag: 48 | tasks: 49 | - name: print-op-2 50 | template: print-op-2 51 | arguments: 52 | parameters: 53 | - {name: get-random-int-op-Output, value: '{{inputs.parameters.get-random-int-op-Output}}'} 54 | - name: condition-4 55 | dag: 56 | tasks: 57 | - name: condition-5 58 | template: condition-5 59 | when: '{{tasks.get-random-int-op-2.outputs.parameters.get-random-int-op-2-Output}} 60 | > 15' 61 | dependencies: [get-random-int-op-2] 62 | arguments: 63 | parameters: 64 | - {name: get-random-int-op-2-Output, value: '{{tasks.get-random-int-op-2.outputs.parameters.get-random-int-op-2-Output}}'} 65 | - name: condition-6 66 | template: condition-6 67 | when: '{{tasks.get-random-int-op-2.outputs.parameters.get-random-int-op-2-Output}} 68 | <= 15' 69 | dependencies: [get-random-int-op-2] 70 | arguments: 71 | parameters: 72 | - {name: get-random-int-op-2-Output, value: '{{tasks.get-random-int-op-2.outputs.parameters.get-random-int-op-2-Output}}'} 73 | - {name: get-random-int-op-2, template: get-random-int-op-2} 74 | - name: condition-5 75 | inputs: 76 | parameters: 77 | - {name: get-random-int-op-2-Output} 78 | dag: 79 | tasks: 80 | - name: print-op-3 81 | template: print-op-3 82 | arguments: 83 | parameters: 84 | - {name: get-random-int-op-2-Output, value: '{{inputs.parameters.get-random-int-op-2-Output}}'} 85 | - name: condition-6 86 | inputs: 87 | parameters: 88 | - {name: get-random-int-op-2-Output} 89 | dag: 90 | tasks: 91 | - name: print-op-4 92 | template: print-op-4 93 | arguments: 94 | parameters: 95 | - {name: get-random-int-op-2-Output, value: '{{inputs.parameters.get-random-int-op-2-Output}}'} 96 | - name: conditional-execution-pipeline 97 | dag: 98 | tasks: 99 | - name: condition-1 100 | template: condition-1 101 | when: '"{{tasks.flip-coin-op.outputs.parameters.flip-coin-op-Output}}" == 
102 | "heads"' 103 | dependencies: [flip-coin-op] 104 | - name: condition-4 105 | template: condition-4 106 | when: '"{{tasks.flip-coin-op.outputs.parameters.flip-coin-op-Output}}" == 107 | "tails"' 108 | dependencies: [flip-coin-op] 109 | - {name: flip-coin-op, template: flip-coin-op} 110 | - name: flip-coin-op 111 | container: 112 | args: ['----output-paths', /tmp/outputs/Output/data] 113 | command: 114 | - sh 115 | - -ec 116 | - | 117 | program_path=$(mktemp) 118 | printf "%s" "$0" > "$program_path" 119 | python3 -u "$program_path" "$@" 120 | - | 121 | def flip_coin_op(): 122 | """Flip a coin and output heads or tails randomly.""" 123 | import random 124 | result = random.choice(['heads', 'tails']) 125 | print(result) 126 | return result 127 | 128 | def _serialize_str(str_value: str) -> str: 129 | if not isinstance(str_value, str): 130 | raise TypeError('Value "{}" has type "{}" instead of str.'.format(str(str_value), str(type(str_value)))) 131 | return str_value 132 | 133 | import argparse 134 | _parser = argparse.ArgumentParser(prog='Flip coin op', description='Flip a coin and output heads or tails randomly.') 135 | _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1) 136 | _parsed_args = vars(_parser.parse_args()) 137 | _output_files = _parsed_args.pop("_output_paths", []) 138 | 139 | _outputs = flip_coin_op(**_parsed_args) 140 | 141 | _outputs = [_outputs] 142 | 143 | _output_serializers = [ 144 | _serialize_str, 145 | 146 | ] 147 | 148 | import os 149 | for idx, output_file in enumerate(_output_files): 150 | try: 151 | os.makedirs(os.path.dirname(output_file)) 152 | except OSError: 153 | pass 154 | with open(output_file, 'w') as f: 155 | f.write(_output_serializers[idx](_outputs[idx])) 156 | image: python:3.7 157 | outputs: 158 | parameters: 159 | - name: flip-coin-op-Output 160 | valueFrom: {path: /tmp/outputs/Output/data} 161 | artifacts: 162 | - {name: flip-coin-op-Output, path: /tmp/outputs/Output/data} 163 | metadata: 164 | annotations: {pipelines.kubeflow.org/component_spec: '{"description": "Flip 165 | a coin and output heads or tails randomly.", "implementation": {"container": 166 | {"args": ["----output-paths", {"outputPath": "Output"}], "command": ["sh", 167 | "-ec", "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > \"$program_path\"\npython3 168 | -u \"$program_path\" \"$@\"\n", "def flip_coin_op():\n \"\"\"Flip a coin 169 | and output heads or tails randomly.\"\"\"\n import random\n result 170 | = random.choice([''heads'', ''tails''])\n print(result)\n return result\n\ndef 171 | _serialize_str(str_value: str) -> str:\n if not isinstance(str_value, 172 | str):\n raise TypeError(''Value \"{}\" has type \"{}\" instead of 173 | str.''.format(str(str_value), str(type(str_value))))\n return str_value\n\nimport 174 | argparse\n_parser = argparse.ArgumentParser(prog=''Flip coin op'', description=''Flip 175 | a coin and output heads or tails randomly.'')\n_parser.add_argument(\"----output-paths\", 176 | dest=\"_output_paths\", type=str, nargs=1)\n_parsed_args = vars(_parser.parse_args())\n_output_files 177 | = _parsed_args.pop(\"_output_paths\", [])\n\n_outputs = flip_coin_op(**_parsed_args)\n\n_outputs 178 | = [_outputs]\n\n_output_serializers = [\n _serialize_str,\n\n]\n\nimport 179 | os\nfor idx, output_file in enumerate(_output_files):\n try:\n os.makedirs(os.path.dirname(output_file))\n except 180 | OSError:\n pass\n with open(output_file, ''w'') as f:\n f.write(_output_serializers[idx](_outputs[idx]))\n"], 181 | "image": "python:3.7"}}, "name": 
"Flip coin op", "outputs": [{"name": "Output", 182 | "type": "String"}]}', pipelines.kubeflow.org/component_ref: '{}'} 183 | - name: get-random-int-op 184 | container: 185 | args: [--minimum, '0', --maximum, '9', '----output-paths', /tmp/outputs/Output/data] 186 | command: 187 | - sh 188 | - -ec 189 | - | 190 | program_path=$(mktemp) 191 | printf "%s" "$0" > "$program_path" 192 | python3 -u "$program_path" "$@" 193 | - | 194 | def get_random_int_op(minimum, maximum): 195 | """Generate a random number between minimum and maximum (inclusive).""" 196 | import random 197 | result = random.randint(minimum, maximum) 198 | print(result) 199 | return result 200 | 201 | def _serialize_int(int_value: int) -> str: 202 | if isinstance(int_value, str): 203 | return int_value 204 | if not isinstance(int_value, int): 205 | raise TypeError('Value "{}" has type "{}" instead of int.'.format(str(int_value), str(type(int_value)))) 206 | return str(int_value) 207 | 208 | import argparse 209 | _parser = argparse.ArgumentParser(prog='Get random int op', description='Generate a random number between minimum and maximum (inclusive).') 210 | _parser.add_argument("--minimum", dest="minimum", type=int, required=True, default=argparse.SUPPRESS) 211 | _parser.add_argument("--maximum", dest="maximum", type=int, required=True, default=argparse.SUPPRESS) 212 | _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1) 213 | _parsed_args = vars(_parser.parse_args()) 214 | _output_files = _parsed_args.pop("_output_paths", []) 215 | 216 | _outputs = get_random_int_op(**_parsed_args) 217 | 218 | _outputs = [_outputs] 219 | 220 | _output_serializers = [ 221 | _serialize_int, 222 | 223 | ] 224 | 225 | import os 226 | for idx, output_file in enumerate(_output_files): 227 | try: 228 | os.makedirs(os.path.dirname(output_file)) 229 | except OSError: 230 | pass 231 | with open(output_file, 'w') as f: 232 | f.write(_output_serializers[idx](_outputs[idx])) 233 | image: python:3.7 234 | outputs: 235 | parameters: 236 | - name: get-random-int-op-Output 237 | valueFrom: {path: /tmp/outputs/Output/data} 238 | artifacts: 239 | - {name: get-random-int-op-Output, path: /tmp/outputs/Output/data} 240 | metadata: 241 | annotations: {pipelines.kubeflow.org/component_spec: '{"description": "Generate 242 | a random number between minimum and maximum (inclusive).", "implementation": 243 | {"container": {"args": ["--minimum", {"inputValue": "minimum"}, "--maximum", 244 | {"inputValue": "maximum"}, "----output-paths", {"outputPath": "Output"}], 245 | "command": ["sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > 246 | \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", "def get_random_int_op(minimum, 247 | maximum):\n \"\"\"Generate a random number between minimum and maximum 248 | (inclusive).\"\"\"\n import random\n result = random.randint(minimum, 249 | maximum)\n print(result)\n return result\n\ndef _serialize_int(int_value: 250 | int) -> str:\n if isinstance(int_value, str):\n return int_value\n if 251 | not isinstance(int_value, int):\n raise TypeError(''Value \"{}\" 252 | has type \"{}\" instead of int.''.format(str(int_value), str(type(int_value))))\n return 253 | str(int_value)\n\nimport argparse\n_parser = argparse.ArgumentParser(prog=''Get 254 | random int op'', description=''Generate a random number between minimum 255 | and maximum (inclusive).'')\n_parser.add_argument(\"--minimum\", dest=\"minimum\", 256 | type=int, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--maximum\", 257 
| dest=\"maximum\", type=int, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"----output-paths\", 258 | dest=\"_output_paths\", type=str, nargs=1)\n_parsed_args = vars(_parser.parse_args())\n_output_files 259 | = _parsed_args.pop(\"_output_paths\", [])\n\n_outputs = get_random_int_op(**_parsed_args)\n\n_outputs 260 | = [_outputs]\n\n_output_serializers = [\n _serialize_int,\n\n]\n\nimport 261 | os\nfor idx, output_file in enumerate(_output_files):\n try:\n os.makedirs(os.path.dirname(output_file))\n except 262 | OSError:\n pass\n with open(output_file, ''w'') as f:\n f.write(_output_serializers[idx](_outputs[idx]))\n"], 263 | "image": "python:3.7"}}, "inputs": [{"name": "minimum", "type": "Integer"}, 264 | {"name": "maximum", "type": "Integer"}], "name": "Get random int op", "outputs": 265 | [{"name": "Output", "type": "Integer"}]}', pipelines.kubeflow.org/component_ref: '{}', 266 | pipelines.kubeflow.org/arguments.parameters: '{"maximum": "9", "minimum": 267 | "0"}'} 268 | - name: get-random-int-op-2 269 | container: 270 | args: [--minimum, '10', --maximum, '19', '----output-paths', /tmp/outputs/Output/data] 271 | command: 272 | - sh 273 | - -ec 274 | - | 275 | program_path=$(mktemp) 276 | printf "%s" "$0" > "$program_path" 277 | python3 -u "$program_path" "$@" 278 | - | 279 | def get_random_int_op(minimum, maximum): 280 | """Generate a random number between minimum and maximum (inclusive).""" 281 | import random 282 | result = random.randint(minimum, maximum) 283 | print(result) 284 | return result 285 | 286 | def _serialize_int(int_value: int) -> str: 287 | if isinstance(int_value, str): 288 | return int_value 289 | if not isinstance(int_value, int): 290 | raise TypeError('Value "{}" has type "{}" instead of int.'.format(str(int_value), str(type(int_value)))) 291 | return str(int_value) 292 | 293 | import argparse 294 | _parser = argparse.ArgumentParser(prog='Get random int op', description='Generate a random number between minimum and maximum (inclusive).') 295 | _parser.add_argument("--minimum", dest="minimum", type=int, required=True, default=argparse.SUPPRESS) 296 | _parser.add_argument("--maximum", dest="maximum", type=int, required=True, default=argparse.SUPPRESS) 297 | _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1) 298 | _parsed_args = vars(_parser.parse_args()) 299 | _output_files = _parsed_args.pop("_output_paths", []) 300 | 301 | _outputs = get_random_int_op(**_parsed_args) 302 | 303 | _outputs = [_outputs] 304 | 305 | _output_serializers = [ 306 | _serialize_int, 307 | 308 | ] 309 | 310 | import os 311 | for idx, output_file in enumerate(_output_files): 312 | try: 313 | os.makedirs(os.path.dirname(output_file)) 314 | except OSError: 315 | pass 316 | with open(output_file, 'w') as f: 317 | f.write(_output_serializers[idx](_outputs[idx])) 318 | image: python:3.7 319 | outputs: 320 | parameters: 321 | - name: get-random-int-op-2-Output 322 | valueFrom: {path: /tmp/outputs/Output/data} 323 | artifacts: 324 | - {name: get-random-int-op-2-Output, path: /tmp/outputs/Output/data} 325 | metadata: 326 | annotations: {pipelines.kubeflow.org/component_spec: '{"description": "Generate 327 | a random number between minimum and maximum (inclusive).", "implementation": 328 | {"container": {"args": ["--minimum", {"inputValue": "minimum"}, "--maximum", 329 | {"inputValue": "maximum"}, "----output-paths", {"outputPath": "Output"}], 330 | "command": ["sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > 331 | \"$program_path\"\npython3 
-u \"$program_path\" \"$@\"\n", "def get_random_int_op(minimum, 332 | maximum):\n \"\"\"Generate a random number between minimum and maximum 333 | (inclusive).\"\"\"\n import random\n result = random.randint(minimum, 334 | maximum)\n print(result)\n return result\n\ndef _serialize_int(int_value: 335 | int) -> str:\n if isinstance(int_value, str):\n return int_value\n if 336 | not isinstance(int_value, int):\n raise TypeError(''Value \"{}\" 337 | has type \"{}\" instead of int.''.format(str(int_value), str(type(int_value))))\n return 338 | str(int_value)\n\nimport argparse\n_parser = argparse.ArgumentParser(prog=''Get 339 | random int op'', description=''Generate a random number between minimum 340 | and maximum (inclusive).'')\n_parser.add_argument(\"--minimum\", dest=\"minimum\", 341 | type=int, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--maximum\", 342 | dest=\"maximum\", type=int, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"----output-paths\", 343 | dest=\"_output_paths\", type=str, nargs=1)\n_parsed_args = vars(_parser.parse_args())\n_output_files 344 | = _parsed_args.pop(\"_output_paths\", [])\n\n_outputs = get_random_int_op(**_parsed_args)\n\n_outputs 345 | = [_outputs]\n\n_output_serializers = [\n _serialize_int,\n\n]\n\nimport 346 | os\nfor idx, output_file in enumerate(_output_files):\n try:\n os.makedirs(os.path.dirname(output_file))\n except 347 | OSError:\n pass\n with open(output_file, ''w'') as f:\n f.write(_output_serializers[idx](_outputs[idx]))\n"], 348 | "image": "python:3.7"}}, "inputs": [{"name": "minimum", "type": "Integer"}, 349 | {"name": "maximum", "type": "Integer"}], "name": "Get random int op", "outputs": 350 | [{"name": "Output", "type": "Integer"}]}', pipelines.kubeflow.org/component_ref: '{}', 351 | pipelines.kubeflow.org/arguments.parameters: '{"maximum": "19", "minimum": 352 | "10"}'} 353 | - name: print-op 354 | container: 355 | args: [--message, 'heads and {{inputs.parameters.get-random-int-op-Output}} 356 | > 5!'] 357 | command: 358 | - sh 359 | - -ec 360 | - | 361 | program_path=$(mktemp) 362 | printf "%s" "$0" > "$program_path" 363 | python3 -u "$program_path" "$@" 364 | - | 365 | def print_op(message): 366 | """Print a message.""" 367 | print(message) 368 | 369 | import argparse 370 | _parser = argparse.ArgumentParser(prog='Print op', description='Print a message.') 371 | _parser.add_argument("--message", dest="message", type=str, required=True, default=argparse.SUPPRESS) 372 | _parsed_args = vars(_parser.parse_args()) 373 | 374 | _outputs = print_op(**_parsed_args) 375 | image: python:3.7 376 | inputs: 377 | parameters: 378 | - {name: get-random-int-op-Output} 379 | metadata: 380 | annotations: {pipelines.kubeflow.org/component_spec: '{"description": "Print 381 | a message.", "implementation": {"container": {"args": ["--message", {"inputValue": 382 | "message"}], "command": ["sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" 383 | \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", "def 384 | print_op(message):\n \"\"\"Print a message.\"\"\"\n print(message)\n\nimport 385 | argparse\n_parser = argparse.ArgumentParser(prog=''Print op'', description=''Print 386 | a message.'')\n_parser.add_argument(\"--message\", dest=\"message\", type=str, 387 | required=True, default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\n_outputs 388 | = print_op(**_parsed_args)\n"], "image": "python:3.7"}}, "inputs": [{"name": 389 | "message", "type": "String"}], "name": "Print op"}', 
pipelines.kubeflow.org/component_ref: '{}', 390 | pipelines.kubeflow.org/arguments.parameters: '{"message": "heads and {{inputs.parameters.get-random-int-op-Output}} 391 | > 5!"}'} 392 | - name: print-op-2 393 | container: 394 | args: [--message, 'heads and {{inputs.parameters.get-random-int-op-Output}} 395 | <= 5!'] 396 | command: 397 | - sh 398 | - -ec 399 | - | 400 | program_path=$(mktemp) 401 | printf "%s" "$0" > "$program_path" 402 | python3 -u "$program_path" "$@" 403 | - | 404 | def print_op(message): 405 | """Print a message.""" 406 | print(message) 407 | 408 | import argparse 409 | _parser = argparse.ArgumentParser(prog='Print op', description='Print a message.') 410 | _parser.add_argument("--message", dest="message", type=str, required=True, default=argparse.SUPPRESS) 411 | _parsed_args = vars(_parser.parse_args()) 412 | 413 | _outputs = print_op(**_parsed_args) 414 | image: python:3.7 415 | inputs: 416 | parameters: 417 | - {name: get-random-int-op-Output} 418 | metadata: 419 | annotations: {pipelines.kubeflow.org/component_spec: '{"description": "Print 420 | a message.", "implementation": {"container": {"args": ["--message", {"inputValue": 421 | "message"}], "command": ["sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" 422 | \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", "def 423 | print_op(message):\n \"\"\"Print a message.\"\"\"\n print(message)\n\nimport 424 | argparse\n_parser = argparse.ArgumentParser(prog=''Print op'', description=''Print 425 | a message.'')\n_parser.add_argument(\"--message\", dest=\"message\", type=str, 426 | required=True, default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\n_outputs 427 | = print_op(**_parsed_args)\n"], "image": "python:3.7"}}, "inputs": [{"name": 428 | "message", "type": "String"}], "name": "Print op"}', pipelines.kubeflow.org/component_ref: '{}', 429 | pipelines.kubeflow.org/arguments.parameters: '{"message": "heads and {{inputs.parameters.get-random-int-op-Output}} 430 | <= 5!"}'} 431 | - name: print-op-3 432 | container: 433 | args: [--message, 'tails and {{inputs.parameters.get-random-int-op-2-Output}} 434 | > 15!'] 435 | command: 436 | - sh 437 | - -ec 438 | - | 439 | program_path=$(mktemp) 440 | printf "%s" "$0" > "$program_path" 441 | python3 -u "$program_path" "$@" 442 | - | 443 | def print_op(message): 444 | """Print a message.""" 445 | print(message) 446 | 447 | import argparse 448 | _parser = argparse.ArgumentParser(prog='Print op', description='Print a message.') 449 | _parser.add_argument("--message", dest="message", type=str, required=True, default=argparse.SUPPRESS) 450 | _parsed_args = vars(_parser.parse_args()) 451 | 452 | _outputs = print_op(**_parsed_args) 453 | image: python:3.7 454 | inputs: 455 | parameters: 456 | - {name: get-random-int-op-2-Output} 457 | metadata: 458 | annotations: {pipelines.kubeflow.org/component_spec: '{"description": "Print 459 | a message.", "implementation": {"container": {"args": ["--message", {"inputValue": 460 | "message"}], "command": ["sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" 461 | \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", "def 462 | print_op(message):\n \"\"\"Print a message.\"\"\"\n print(message)\n\nimport 463 | argparse\n_parser = argparse.ArgumentParser(prog=''Print op'', description=''Print 464 | a message.'')\n_parser.add_argument(\"--message\", dest=\"message\", type=str, 465 | required=True, default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\n_outputs 466 | = 
print_op(**_parsed_args)\n"], "image": "python:3.7"}}, "inputs": [{"name": 467 | "message", "type": "String"}], "name": "Print op"}', pipelines.kubeflow.org/component_ref: '{}', 468 | pipelines.kubeflow.org/arguments.parameters: '{"message": "tails and {{inputs.parameters.get-random-int-op-2-Output}} 469 | > 15!"}'} 470 | - name: print-op-4 471 | container: 472 | args: [--message, 'tails and {{inputs.parameters.get-random-int-op-2-Output}} 473 | <= 15!'] 474 | command: 475 | - sh 476 | - -ec 477 | - | 478 | program_path=$(mktemp) 479 | printf "%s" "$0" > "$program_path" 480 | python3 -u "$program_path" "$@" 481 | - | 482 | def print_op(message): 483 | """Print a message.""" 484 | print(message) 485 | 486 | import argparse 487 | _parser = argparse.ArgumentParser(prog='Print op', description='Print a message.') 488 | _parser.add_argument("--message", dest="message", type=str, required=True, default=argparse.SUPPRESS) 489 | _parsed_args = vars(_parser.parse_args()) 490 | 491 | _outputs = print_op(**_parsed_args) 492 | image: python:3.7 493 | inputs: 494 | parameters: 495 | - {name: get-random-int-op-2-Output} 496 | metadata: 497 | annotations: {pipelines.kubeflow.org/component_spec: '{"description": "Print 498 | a message.", "implementation": {"container": {"args": ["--message", {"inputValue": 499 | "message"}], "command": ["sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" 500 | \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", "def 501 | print_op(message):\n \"\"\"Print a message.\"\"\"\n print(message)\n\nimport 502 | argparse\n_parser = argparse.ArgumentParser(prog=''Print op'', description=''Print 503 | a message.'')\n_parser.add_argument(\"--message\", dest=\"message\", type=str, 504 | required=True, default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\n_outputs 505 | = print_op(**_parsed_args)\n"], "image": "python:3.7"}}, "inputs": [{"name": 506 | "message", "type": "String"}], "name": "Print op"}', pipelines.kubeflow.org/component_ref: '{}', 507 | pipelines.kubeflow.org/arguments.parameters: '{"message": "tails and {{inputs.parameters.get-random-int-op-2-Output}} 508 | <= 15!"}'} 509 | arguments: 510 | parameters: [] 511 | serviceAccountName: pipeline-runner 512 | -------------------------------------------------------------------------------- /train_until_good/train_until_good.py.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: Workflow 3 | metadata: 4 | generateName: train-until-good-pipeline- 5 | annotations: {pipelines.kubeflow.org/kfp_sdk_version: 1.4.0, pipelines.kubeflow.org/pipeline_compilation_time: '2021-03-25T08:14:34.624430', 6 | pipelines.kubeflow.org/pipeline_spec: '{"name": "Train until good pipeline"}'} 7 | labels: {pipelines.kubeflow.org/kfp_sdk_version: 1.4.0} 8 | spec: 9 | entrypoint: train-until-good-pipeline 10 | templates: 11 | - name: calculate-regression-metrics-from-csv 12 | container: 13 | args: [--true-values, /tmp/inputs/true_values/data, --predicted-values, /tmp/inputs/predicted_values/data, 14 | '----output-paths', /tmp/outputs/max_absolute_error/data, /tmp/outputs/mean_absolute_error/data, 15 | /tmp/outputs/mean_squared_error/data, /tmp/outputs/root_mean_squared_error/data] 16 | command: 17 | - sh 18 | - -c 19 | - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 20 | 'numpy==1.19.0' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install 21 | --quiet --no-warn-script-location 
'numpy==1.19.0' --user) && "$0" "$@" 22 | - python3 23 | - -u 24 | - -c 25 | - | 26 | def calculate_regression_metrics_from_csv( 27 | true_values_path, 28 | predicted_values_path, 29 | ): 30 | '''Calculates regression metrics. 31 | 32 | Annotations: 33 | author: Alexey Volkov 34 | ''' 35 | import math 36 | import numpy 37 | 38 | true_values = numpy.loadtxt(true_values_path, dtype=numpy.float64) 39 | predicted_values = numpy.loadtxt(predicted_values_path, dtype=numpy.float64) 40 | 41 | if len(predicted_values.shape) != 1: 42 | raise NotImplementedError('Only single prediction values are supported.') 43 | if len(true_values.shape) != 1: 44 | raise NotImplementedError('Only single true values are supported.') 45 | 46 | if predicted_values.shape != true_values.shape: 47 | raise ValueError('Input shapes are different: {} != {}'.format(predicted_values.shape, true_values.shape)) 48 | 49 | num_true_values = true_values 50 | errors = (true_values - predicted_values) 51 | abs_errors = numpy.abs(errors) 52 | squared_errors = errors ** 2 53 | max_absolute_error = numpy.max(abs_errors) 54 | mean_absolute_error = numpy.average(abs_errors) 55 | mean_squared_error = numpy.average(squared_errors) 56 | root_mean_squared_error = math.sqrt(mean_squared_error) 57 | 58 | return ( 59 | max_absolute_error, 60 | mean_absolute_error, 61 | mean_squared_error, 62 | root_mean_squared_error, 63 | ) 64 | 65 | def _serialize_float(float_value: float) -> str: 66 | if isinstance(float_value, str): 67 | return float_value 68 | if not isinstance(float_value, (float, int)): 69 | raise TypeError('Value "{}" has type "{}" instead of float.'.format(str(float_value), str(type(float_value)))) 70 | return str(float_value) 71 | 72 | import argparse 73 | _parser = argparse.ArgumentParser(prog='Calculate regression metrics from csv', description='Calculates regression metrics.\n\n Annotations:\n author: Alexey Volkov ') 74 | _parser.add_argument("--true-values", dest="true_values_path", type=str, required=True, default=argparse.SUPPRESS) 75 | _parser.add_argument("--predicted-values", dest="predicted_values_path", type=str, required=True, default=argparse.SUPPRESS) 76 | _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=4) 77 | _parsed_args = vars(_parser.parse_args()) 78 | _output_files = _parsed_args.pop("_output_paths", []) 79 | 80 | _outputs = calculate_regression_metrics_from_csv(**_parsed_args) 81 | 82 | _output_serializers = [ 83 | _serialize_float, 84 | _serialize_float, 85 | _serialize_float, 86 | _serialize_float, 87 | 88 | ] 89 | 90 | import os 91 | for idx, output_file in enumerate(_output_files): 92 | try: 93 | os.makedirs(os.path.dirname(output_file)) 94 | except OSError: 95 | pass 96 | with open(output_file, 'w') as f: 97 | f.write(_output_serializers[idx](_outputs[idx])) 98 | image: python:3.7 99 | inputs: 100 | artifacts: 101 | - {name: xgboost-predict-predictions, path: /tmp/inputs/predicted_values/data} 102 | - {name: remove-header-table, path: /tmp/inputs/true_values/data} 103 | outputs: 104 | parameters: 105 | - name: calculate-regression-metrics-from-csv-mean_squared_error 106 | valueFrom: {path: /tmp/outputs/mean_squared_error/data} 107 | artifacts: 108 | - {name: calculate-regression-metrics-from-csv-max_absolute_error, path: /tmp/outputs/max_absolute_error/data} 109 | - {name: calculate-regression-metrics-from-csv-mean_absolute_error, path: /tmp/outputs/mean_absolute_error/data} 110 | - {name: calculate-regression-metrics-from-csv-mean_squared_error, path: 
/tmp/outputs/mean_squared_error/data} 111 | - {name: calculate-regression-metrics-from-csv-root_mean_squared_error, path: /tmp/outputs/root_mean_squared_error/data} 112 | metadata: 113 | annotations: {pipelines.kubeflow.org/component_spec: '{"description": "Calculates 114 | regression metrics.\n\n Annotations:\n author: Alexey Volkov ", 115 | "implementation": {"container": {"args": ["--true-values", {"inputPath": 116 | "true_values"}, "--predicted-values", {"inputPath": "predicted_values"}, 117 | "----output-paths", {"outputPath": "max_absolute_error"}, {"outputPath": 118 | "mean_absolute_error"}, {"outputPath": "mean_squared_error"}, {"outputPath": 119 | "root_mean_squared_error"}], "command": ["sh", "-c", "(PIP_DISABLE_PIP_VERSION_CHECK=1 120 | python3 -m pip install --quiet --no-warn-script-location ''numpy==1.19.0'' 121 | || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 122 | ''numpy==1.19.0'' --user) && \"$0\" \"$@\"", "python3", "-u", "-c", "def 123 | calculate_regression_metrics_from_csv(\n true_values_path,\n predicted_values_path,\n):\n ''''''Calculates 124 | regression metrics.\n\n Annotations:\n author: Alexey Volkov \n ''''''\n import 125 | math\n import numpy\n\n true_values = numpy.loadtxt(true_values_path, 126 | dtype=numpy.float64)\n predicted_values = numpy.loadtxt(predicted_values_path, 127 | dtype=numpy.float64)\n\n if len(predicted_values.shape) != 1:\n raise 128 | NotImplemented(''Only single prediction values are supported.'')\n if 129 | len(true_values.shape) != 1:\n raise NotImplemented(''Only single 130 | true values are supported.'')\n\n if predicted_values.shape != true_values.shape:\n raise 131 | ValueError(''Input shapes are different: {} != {}''.format(predicted_values.shape, 132 | true_values.shape))\n\n num_true_values = true_values\n errors = (true_values 133 | - predicted_values)\n abs_errors = numpy.abs(errors)\n squared_errors 134 | = errors ** 2\n max_absolute_error = numpy.max(abs_errors)\n mean_absolute_error 135 | = numpy.average(abs_errors)\n mean_squared_error = numpy.average(squared_errors)\n root_mean_squared_error 136 | = math.sqrt(mean_squared_error)\n\n return (\n max_absolute_error,\n mean_absolute_error,\n mean_squared_error,\n root_mean_squared_error,\n )\n\ndef 137 | _serialize_float(float_value: float) -> str:\n if isinstance(float_value, 138 | str):\n return float_value\n if not isinstance(float_value, (float, 139 | int)):\n raise TypeError(''Value \"{}\" has type \"{}\" instead of 140 | float.''.format(str(float_value), str(type(float_value))))\n return str(float_value)\n\nimport 141 | argparse\n_parser = argparse.ArgumentParser(prog=''Calculate regression 142 | metrics from csv'', description=''Calculates regression metrics.\\n\\n Annotations:\\n author: 143 | Alexey Volkov '')\n_parser.add_argument(\"--true-values\", 144 | dest=\"true_values_path\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--predicted-values\", 145 | dest=\"predicted_values_path\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"----output-paths\", 146 | dest=\"_output_paths\", type=str, nargs=4)\n_parsed_args = vars(_parser.parse_args())\n_output_files 147 | = _parsed_args.pop(\"_output_paths\", [])\n\n_outputs = calculate_regression_metrics_from_csv(**_parsed_args)\n\n_output_serializers 148 | = [\n _serialize_float,\n _serialize_float,\n _serialize_float,\n _serialize_float,\n\n]\n\nimport 149 | os\nfor idx, output_file in enumerate(_output_files):\n try:\n 
os.makedirs(os.path.dirname(output_file))\n except 150 | OSError:\n pass\n with open(output_file, ''w'') as f:\n f.write(_output_serializers[idx](_outputs[idx]))\n"], 151 | "image": "python:3.7"}}, "inputs": [{"name": "true_values"}, {"name": "predicted_values"}], 152 | "name": "Calculate regression metrics from csv", "outputs": [{"name": "max_absolute_error", 153 | "type": "Float"}, {"name": "mean_absolute_error", "type": "Float"}, {"name": 154 | "mean_squared_error", "type": "Float"}, {"name": "root_mean_squared_error", 155 | "type": "Float"}]}', pipelines.kubeflow.org/component_ref: '{"digest": "f326bddad865f292b6e67b0edc485649b13f5fa74b1546584974274c2bced3e1", 156 | "url": "https://raw.githubusercontent.com/kubeflow/pipelines/616542ac0f789914f4eb53438da713dd3004fba4/components/ml_metrics/Calculate_regression_metrics/from_CSV/component.yaml"}'} 157 | - name: chicago-taxi-trips-dataset 158 | container: 159 | args: [] 160 | command: 161 | - sh 162 | - -c 163 | - | 164 | set -e -x -o pipefail 165 | output_path="$0" 166 | select="$1" 167 | where="$2" 168 | limit="$3" 169 | format="$4" 170 | mkdir -p "$(dirname "$output_path")" 171 | curl --get 'https://data.cityofchicago.org/resource/wrvz-psew.'"${format}" \ 172 | --data-urlencode '$limit='"${limit}" \ 173 | --data-urlencode '$where='"${where}" \ 174 | --data-urlencode '$select='"${select}" \ 175 | | tr -d '"' > "$output_path" # Removing unneeded quotes around all numbers 176 | - /tmp/outputs/Table/data 177 | - tips,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tolls,extras,trip_total 178 | - trip_start_timestamp >= "2019-01-01" AND trip_start_timestamp < "2019-02-01" 179 | - '10000' 180 | - csv 181 | image: curlimages/curl 182 | outputs: 183 | artifacts: 184 | - {name: chicago-taxi-trips-dataset-Table, path: /tmp/outputs/Table/data} 185 | metadata: 186 | annotations: {author: Alexey Volkov , pipelines.kubeflow.org/component_spec: '{"description": 187 | "City of Chicago Taxi Trips dataset: https://data.cityofchicago.org/Transportation/Taxi-Trips/wrvz-psew\n\nThe 188 | input parameters configure the SQL query to the database.\nThe dataset is 189 | pretty big, so limit the number of results using the `Limit` or `Where` 190 | parameters.\nRead [Socrata dev](https://dev.socrata.com/docs/queries/) for 191 | the advanced query syntax\n", "implementation": {"container": {"command": 192 | ["sh", "-c", "set -e -x -o pipefail\noutput_path=\"$0\"\nselect=\"$1\"\nwhere=\"$2\"\nlimit=\"$3\"\nformat=\"$4\"\nmkdir 193 | -p \"$(dirname \"$output_path\")\"\ncurl --get ''https://data.cityofchicago.org/resource/wrvz-psew.''\"${format}\" 194 | \\\n --data-urlencode ''$limit=''\"${limit}\" \\\n --data-urlencode 195 | ''$where=''\"${where}\" \\\n --data-urlencode ''$select=''\"${select}\" 196 | \\\n | tr -d ''\"'' > \"$output_path\" # Removing unneeded quotes around 197 | all numbers\n", {"outputPath": "Table"}, {"inputValue": "Select"}, {"inputValue": 198 | "Where"}, {"inputValue": "Limit"}, {"inputValue": "Format"}], "image": "curlimages/curl"}}, 199 | "inputs": [{"default": "trip_start_timestamp>=\"1900-01-01\" AND trip_start_timestamp<\"2100-01-01\"", 200 | "name": "Where", "type": "String"}, {"default": "1000", "description": "Number 201 | of rows to return. 
The rows are randomly sampled.", "name": "Limit", "type": 202 | "Integer"}, {"default": "trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_census_tract,dropoff_census_tract,pickup_community_area,dropoff_community_area,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,pickup_centroid_location,dropoff_centroid_latitude,dropoff_centroid_longitude,dropoff_centroid_location", 203 | "name": "Select", "type": "String"}, {"default": "csv", "description": "Output 204 | data format. Supports csv,tsv,xml,rdf,json", "name": "Format", "type": "String"}], 205 | "metadata": {"annotations": {"author": "Alexey Volkov "}}, 206 | "name": "Chicago Taxi Trips dataset", "outputs": [{"description": "Result 207 | type depends on format. CSV and TSV have header.", "name": "Table"}]}', 208 | pipelines.kubeflow.org/component_ref: '{"digest": "ecf2f2840c57bd9cb2778c8f529da9b938b81f59294b3f7271cb23b363640343", 209 | "url": "https://raw.githubusercontent.com/kubeflow/pipelines/e3337b8bdcd63636934954e592d4b32c95b49129/components/datasets/Chicago%20Taxi/component.yaml"}', 210 | pipelines.kubeflow.org/arguments.parameters: '{"Format": "csv", "Limit": "10000", 211 | "Select": "tips,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tolls,extras,trip_total", 212 | "Where": "trip_start_timestamp >= \"2019-01-01\" AND trip_start_timestamp 213 | < \"2019-02-01\""}'} 214 | - name: condition-2 215 | inputs: 216 | artifacts: 217 | - {name: chicago-taxi-trips-dataset-Table} 218 | - {name: remove-header-table} 219 | - {name: xgboost-train-2-model} 220 | dag: 221 | tasks: 222 | - name: graph-train-until-low-error-1 223 | template: graph-train-until-low-error-1 224 | arguments: 225 | artifacts: 226 | - {name: chicago-taxi-trips-dataset-Table, from: '{{inputs.artifacts.chicago-taxi-trips-dataset-Table}}'} 227 | - {name: remove-header-table, from: '{{inputs.artifacts.remove-header-table}}'} 228 | - {name: xgboost-train-model, from: '{{inputs.artifacts.xgboost-train-2-model}}'} 229 | - name: graph-train-until-low-error-1 230 | inputs: 231 | artifacts: 232 | - {name: chicago-taxi-trips-dataset-Table} 233 | - {name: remove-header-table} 234 | - {name: xgboost-train-model} 235 | dag: 236 | tasks: 237 | - name: calculate-regression-metrics-from-csv 238 | template: calculate-regression-metrics-from-csv 239 | dependencies: [xgboost-predict] 240 | arguments: 241 | artifacts: 242 | - {name: remove-header-table, from: '{{inputs.artifacts.remove-header-table}}'} 243 | - {name: xgboost-predict-predictions, from: '{{tasks.xgboost-predict.outputs.artifacts.xgboost-predict-predictions}}'} 244 | - name: condition-2 245 | template: condition-2 246 | when: '{{tasks.calculate-regression-metrics-from-csv.outputs.parameters.calculate-regression-metrics-from-csv-mean_squared_error}} 247 | > 0.01' 248 | dependencies: [calculate-regression-metrics-from-csv, xgboost-train-2] 249 | arguments: 250 | artifacts: 251 | - {name: chicago-taxi-trips-dataset-Table, from: '{{inputs.artifacts.chicago-taxi-trips-dataset-Table}}'} 252 | - {name: remove-header-table, from: '{{inputs.artifacts.remove-header-table}}'} 253 | - {name: xgboost-train-2-model, from: '{{tasks.xgboost-train-2.outputs.artifacts.xgboost-train-2-model}}'} 254 | - name: xgboost-predict 255 | template: xgboost-predict 256 | dependencies: [xgboost-train-2] 257 | arguments: 258 | artifacts: 259 | - {name: chicago-taxi-trips-dataset-Table, from: 
'{{inputs.artifacts.chicago-taxi-trips-dataset-Table}}'} 260 | - {name: xgboost-train-2-model, from: '{{tasks.xgboost-train-2.outputs.artifacts.xgboost-train-2-model}}'} 261 | - name: xgboost-train-2 262 | template: xgboost-train-2 263 | arguments: 264 | artifacts: 265 | - {name: chicago-taxi-trips-dataset-Table, from: '{{inputs.artifacts.chicago-taxi-trips-dataset-Table}}'} 266 | - {name: xgboost-train-model, from: '{{inputs.artifacts.xgboost-train-model}}'} 267 | - name: pandas-transform-dataframe-in-csv-format 268 | container: 269 | args: [--table, /tmp/inputs/table/data, --transform-code, 'df = df[["tips"]]', 270 | --transformed-table, /tmp/outputs/transformed_table/data] 271 | command: 272 | - sh 273 | - -c 274 | - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 275 | 'pandas==1.0.4' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install 276 | --quiet --no-warn-script-location 'pandas==1.0.4' --user) && "$0" "$@" 277 | - python3 278 | - -u 279 | - -c 280 | - | 281 | def _make_parent_dirs_and_return_path(file_path: str): 282 | import os 283 | os.makedirs(os.path.dirname(file_path), exist_ok=True) 284 | return file_path 285 | 286 | def Pandas_Transform_DataFrame_in_CSV_format( 287 | table_path, 288 | transformed_table_path, 289 | transform_code, 290 | ): 291 | '''Transform DataFrame loaded from a CSV file. 292 | 293 | Inputs: 294 | table: Table to transform. 295 | transform_code: Transformation code. Code is written in Python and can consist of multiple lines. 296 | The DataFrame variable is called "df". 297 | Examples: 298 | - `df['prod'] = df['X'] * df['Y']` 299 | - `df = df[['X', 'prod']]` 300 | - `df.insert(0, "is_positive", df["X"] > 0)` 301 | 302 | Outputs: 303 | transformed_table: Transformed table. 304 | 305 | Annotations: 306 | author: Alexey Volkov 307 | ''' 308 | import pandas 309 | 310 | df = pandas.read_csv( 311 | table_path, 312 | ) 313 | # The namespace is needed so that the code can replace `df`. For example df = df[['X']] 314 | namespace = locals() 315 | exec(transform_code, namespace) 316 | namespace['df'].to_csv( 317 | transformed_table_path, 318 | index=False, 319 | ) 320 | 321 | import argparse 322 | _parser = argparse.ArgumentParser(prog='Pandas Transform DataFrame in CSV format', description='Transform DataFrame loaded from a CSV file.\n\n Inputs:\n table: Table to transform.\n transform_code: Transformation code. 
Code is written in Python and can consist of multiple lines.\n The DataFrame variable is called "df".\n Examples:\n - `df[\'prod\'] = df[\'X\'] * df[\'Y\']`\n - `df = df[[\'X\', \'prod\']]`\n - `df.insert(0, "is_positive", df["X"] > 0)`\n\n Outputs:\n transformed_table: Transformed table.\n\n Annotations:\n author: Alexey Volkov ') 323 | _parser.add_argument("--table", dest="table_path", type=str, required=True, default=argparse.SUPPRESS) 324 | _parser.add_argument("--transform-code", dest="transform_code", type=str, required=True, default=argparse.SUPPRESS) 325 | _parser.add_argument("--transformed-table", dest="transformed_table_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) 326 | _parsed_args = vars(_parser.parse_args()) 327 | 328 | _outputs = Pandas_Transform_DataFrame_in_CSV_format(**_parsed_args) 329 | image: python:3.7 330 | inputs: 331 | artifacts: 332 | - {name: chicago-taxi-trips-dataset-Table, path: /tmp/inputs/table/data} 333 | outputs: 334 | artifacts: 335 | - {name: pandas-transform-dataframe-in-csv-format-transformed_table, path: /tmp/outputs/transformed_table/data} 336 | metadata: 337 | annotations: {pipelines.kubeflow.org/component_spec: '{"description": "Transform 338 | DataFrame loaded from a CSV file.\n\n Inputs:\n table: Table to 339 | transform.\n transform_code: Transformation code. Code is written 340 | in Python and can consist of multiple lines.\n The DataFrame 341 | variable is called \"df\".\n Examples:\n - `df[''prod''] 342 | = df[''X''] * df[''Y'']`\n - `df = df[[''X'', ''prod'']]`\n - 343 | `df.insert(0, \"is_positive\", df[\"X\"] > 0)`\n\n Outputs:\n transformed_table: 344 | Transformed table.\n\n Annotations:\n author: Alexey Volkov ", 345 | "implementation": {"container": {"args": ["--table", {"inputPath": "table"}, 346 | "--transform-code", {"inputValue": "transform_code"}, "--transformed-table", 347 | {"outputPath": "transformed_table"}], "command": ["sh", "-c", "(PIP_DISABLE_PIP_VERSION_CHECK=1 348 | python3 -m pip install --quiet --no-warn-script-location ''pandas==1.0.4'' 349 | || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 350 | ''pandas==1.0.4'' --user) && \"$0\" \"$@\"", "python3", "-u", "-c", "def 351 | _make_parent_dirs_and_return_path(file_path: str):\n import os\n os.makedirs(os.path.dirname(file_path), 352 | exist_ok=True)\n return file_path\n\ndef Pandas_Transform_DataFrame_in_CSV_format(\n table_path,\n transformed_table_path,\n transform_code,\n):\n ''''''Transform 353 | DataFrame loaded from a CSV file.\n\n Inputs:\n table: Table to 354 | transform.\n transform_code: Transformation code. Code is written 355 | in Python and can consist of multiple lines.\n The DataFrame 356 | variable is called \"df\".\n Examples:\n - `df[''prod''] 357 | = df[''X''] * df[''Y'']`\n - `df = df[[''X'', ''prod'']]`\n - 358 | `df.insert(0, \"is_positive\", df[\"X\"] > 0)`\n\n Outputs:\n transformed_table: 359 | Transformed table.\n\n Annotations:\n author: Alexey Volkov \n ''''''\n import 360 | pandas\n\n df = pandas.read_csv(\n table_path,\n )\n # The 361 | namespace is needed so that the code can replace `df`. 
For example df = 362 | df[[''X'']]\n namespace = locals()\n exec(transform_code, namespace)\n namespace[''df''].to_csv(\n transformed_table_path,\n index=False,\n )\n\nimport 363 | argparse\n_parser = argparse.ArgumentParser(prog=''Pandas Transform DataFrame 364 | in CSV format'', description=''Transform DataFrame loaded from a CSV file.\\n\\n Inputs:\\n table: 365 | Table to transform.\\n transform_code: Transformation code. Code 366 | is written in Python and can consist of multiple lines.\\n The 367 | DataFrame variable is called \"df\".\\n Examples:\\n - 368 | `df[\\''prod\\''] = df[\\''X\\''] * df[\\''Y\\'']`\\n - `df = 369 | df[[\\''X\\'', \\''prod\\'']]`\\n - `df.insert(0, \"is_positive\", 370 | df[\"X\"] > 0)`\\n\\n Outputs:\\n transformed_table: Transformed 371 | table.\\n\\n Annotations:\\n author: Alexey Volkov '')\n_parser.add_argument(\"--table\", 372 | dest=\"table_path\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--transform-code\", 373 | dest=\"transform_code\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--transformed-table\", 374 | dest=\"transformed_table_path\", type=_make_parent_dirs_and_return_path, 375 | required=True, default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\n_outputs 376 | = Pandas_Transform_DataFrame_in_CSV_format(**_parsed_args)\n"], "image": 377 | "python:3.7"}}, "inputs": [{"name": "table", "type": "CSV"}, {"name": "transform_code", 378 | "type": "PythonCode"}], "name": "Pandas Transform DataFrame in CSV format", 379 | "outputs": [{"name": "transformed_table", "type": "CSV"}]}', pipelines.kubeflow.org/component_ref: '{"digest": 380 | "58dc88349157bf128021708c316ce4eb60bc1de0a5a7dd3af45fabac3276d510", "url": 381 | "https://raw.githubusercontent.com/kubeflow/pipelines/6162d55998b176b50267d351241100bb0ee715bc/components/pandas/Transform_DataFrame/in_CSV_format/component.yaml"}', 382 | pipelines.kubeflow.org/arguments.parameters: '{"transform_code": "df = df[[\"tips\"]]"}'} 383 | - name: remove-header 384 | container: 385 | args: [] 386 | command: 387 | - sh 388 | - -exc 389 | - | 390 | mkdir -p "$(dirname "$1")" 391 | tail -n +2 <"$0" >"$1" 392 | - /tmp/inputs/table/data 393 | - /tmp/outputs/table/data 394 | image: alpine 395 | inputs: 396 | artifacts: 397 | - {name: pandas-transform-dataframe-in-csv-format-transformed_table, path: /tmp/inputs/table/data} 398 | outputs: 399 | artifacts: 400 | - {name: remove-header-table, path: /tmp/outputs/table/data} 401 | metadata: 402 | annotations: {author: Alexey Volkov , pipelines.kubeflow.org/component_spec: '{"description": 403 | "Remove the header line from CSV and TSV data (unconditionally)", "implementation": 404 | {"container": {"command": ["sh", "-exc", "mkdir -p \"$(dirname \"$1\")\"\ntail 405 | -n +2 <\"$0\" >\"$1\"\n", {"inputPath": "table"}, {"outputPath": "table"}], 406 | "image": "alpine"}}, "inputs": [{"name": "table"}], "metadata": {"annotations": 407 | {"author": "Alexey Volkov "}}, "name": "Remove 408 | header", "outputs": [{"name": "table"}]}', pipelines.kubeflow.org/component_ref: '{"digest": 409 | "ba35ffea863855b956c3c50aefa0420ba3823949a6c059e6e3971cde960dc5a3", "url": 410 | "https://raw.githubusercontent.com/kubeflow/pipelines/02c9638287468c849632cf9f7885b51de4c66f86/components/tables/Remove_header/component.yaml"}'} 411 | - name: train-until-good-pipeline 412 | dag: 413 | tasks: 414 | - {name: chicago-taxi-trips-dataset, template: chicago-taxi-trips-dataset} 415 | - name: graph-train-until-low-error-1 416 
| template: graph-train-until-low-error-1 417 | dependencies: [chicago-taxi-trips-dataset, remove-header, xgboost-train] 418 | arguments: 419 | artifacts: 420 | - {name: chicago-taxi-trips-dataset-Table, from: '{{tasks.chicago-taxi-trips-dataset.outputs.artifacts.chicago-taxi-trips-dataset-Table}}'} 421 | - {name: remove-header-table, from: '{{tasks.remove-header.outputs.artifacts.remove-header-table}}'} 422 | - {name: xgboost-train-model, from: '{{tasks.xgboost-train.outputs.artifacts.xgboost-train-model}}'} 423 | - name: pandas-transform-dataframe-in-csv-format 424 | template: pandas-transform-dataframe-in-csv-format 425 | dependencies: [chicago-taxi-trips-dataset] 426 | arguments: 427 | artifacts: 428 | - {name: chicago-taxi-trips-dataset-Table, from: '{{tasks.chicago-taxi-trips-dataset.outputs.artifacts.chicago-taxi-trips-dataset-Table}}'} 429 | - name: remove-header 430 | template: remove-header 431 | dependencies: [pandas-transform-dataframe-in-csv-format] 432 | arguments: 433 | artifacts: 434 | - {name: pandas-transform-dataframe-in-csv-format-transformed_table, from: '{{tasks.pandas-transform-dataframe-in-csv-format.outputs.artifacts.pandas-transform-dataframe-in-csv-format-transformed_table}}'} 435 | - name: xgboost-train 436 | template: xgboost-train 437 | dependencies: [chicago-taxi-trips-dataset] 438 | arguments: 439 | artifacts: 440 | - {name: chicago-taxi-trips-dataset-Table, from: '{{tasks.chicago-taxi-trips-dataset.outputs.artifacts.chicago-taxi-trips-dataset-Table}}'} 441 | - name: xgboost-predict 442 | container: 443 | args: [--data, /tmp/inputs/data/data, --model, /tmp/inputs/model/data, --label-column, 444 | '0', --predictions, /tmp/outputs/predictions/data] 445 | command: 446 | - sh 447 | - -c 448 | - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 449 | 'xgboost==1.1.1' 'pandas==1.0.5' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 450 | -m pip install --quiet --no-warn-script-location 'xgboost==1.1.1' 'pandas==1.0.5' 451 | --user) && "$0" "$@" 452 | - python3 453 | - -u 454 | - -c 455 | - | 456 | def _make_parent_dirs_and_return_path(file_path: str): 457 | import os 458 | os.makedirs(os.path.dirname(file_path), exist_ok=True) 459 | return file_path 460 | 461 | def xgboost_predict( 462 | data_path, # Also supports LibSVM 463 | model_path, 464 | predictions_path, 465 | label_column = None, 466 | ): 467 | '''Make predictions using a trained XGBoost model. 468 | 469 | Args: 470 | data_path: Path for the feature data in CSV format. 471 | model_path: Path for the trained model in binary XGBoost format. 472 | predictions_path: Output path for the predictions. 473 | label_column: Column containing the label data. 
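            If set, the column is dropped from the feature data before prediction (e.g. label_column=0 drops the first CSV column).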
474 | 475 | Annotations: 476 | author: Alexey Volkov 477 | ''' 478 | from pathlib import Path 479 | 480 | import numpy 481 | import pandas 482 | import xgboost 483 | 484 | df = pandas.read_csv( 485 | data_path, 486 | ) 487 | 488 | if label_column is not None: 489 | df = df.drop(columns=[df.columns[label_column]]) 490 | 491 | testing_data = xgboost.DMatrix( 492 | data=df, 493 | ) 494 | 495 | model = xgboost.Booster(model_file=model_path) 496 | 497 | predictions = model.predict(testing_data) 498 | 499 | Path(predictions_path).parent.mkdir(parents=True, exist_ok=True) 500 | numpy.savetxt(predictions_path, predictions) 501 | 502 | import argparse 503 | _parser = argparse.ArgumentParser(prog='Xgboost predict', description='Make predictions using a trained XGBoost model.\n\n Args:\n data_path: Path for the feature data in CSV format.\n model_path: Path for the trained model in binary XGBoost format.\n predictions_path: Output path for the predictions.\n label_column: Column containing the label data.\n\n Annotations:\n author: Alexey Volkov ') 504 | _parser.add_argument("--data", dest="data_path", type=str, required=True, default=argparse.SUPPRESS) 505 | _parser.add_argument("--model", dest="model_path", type=str, required=True, default=argparse.SUPPRESS) 506 | _parser.add_argument("--label-column", dest="label_column", type=int, required=False, default=argparse.SUPPRESS) 507 | _parser.add_argument("--predictions", dest="predictions_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) 508 | _parsed_args = vars(_parser.parse_args()) 509 | 510 | _outputs = xgboost_predict(**_parsed_args) 511 | image: python:3.7 512 | inputs: 513 | artifacts: 514 | - {name: chicago-taxi-trips-dataset-Table, path: /tmp/inputs/data/data} 515 | - {name: xgboost-train-2-model, path: /tmp/inputs/model/data} 516 | outputs: 517 | artifacts: 518 | - {name: xgboost-predict-predictions, path: /tmp/outputs/predictions/data} 519 | metadata: 520 | annotations: {pipelines.kubeflow.org/component_spec: '{"description": "Make 521 | predictions using a trained XGBoost model.\n\n Args:\n data_path: 522 | Path for the feature data in CSV format.\n model_path: Path for the 523 | trained model in binary XGBoost format.\n predictions_path: Output 524 | path for the predictions.\n label_column: Column containing the label 525 | data.\n\n Annotations:\n author: Alexey Volkov ", 526 | "implementation": {"container": {"args": ["--data", {"inputPath": "data"}, 527 | "--model", {"inputPath": "model"}, {"if": {"cond": {"isPresent": "label_column"}, 528 | "then": ["--label-column", {"inputValue": "label_column"}]}}, "--predictions", 529 | {"outputPath": "predictions"}], "command": ["sh", "-c", "(PIP_DISABLE_PIP_VERSION_CHECK=1 530 | python3 -m pip install --quiet --no-warn-script-location ''xgboost==1.1.1'' 531 | ''pandas==1.0.5'' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install 532 | --quiet --no-warn-script-location ''xgboost==1.1.1'' ''pandas==1.0.5'' --user) 533 | && \"$0\" \"$@\"", "python3", "-u", "-c", "def _make_parent_dirs_and_return_path(file_path: 534 | str):\n import os\n os.makedirs(os.path.dirname(file_path), exist_ok=True)\n return 535 | file_path\n\ndef xgboost_predict(\n data_path, # Also supports LibSVM\n model_path,\n predictions_path,\n label_column 536 | = None,\n):\n ''''''Make predictions using a trained XGBoost model.\n\n Args:\n data_path: 537 | Path for the feature data in CSV format.\n model_path: Path for the 538 | trained model in binary XGBoost format.\n predictions_path: 
Output 539 | path for the predictions.\n label_column: Column containing the label 540 | data.\n\n Annotations:\n author: Alexey Volkov \n ''''''\n from 541 | pathlib import Path\n\n import numpy\n import pandas\n import xgboost\n\n df 542 | = pandas.read_csv(\n data_path,\n )\n\n if label_column is 543 | not None:\n df = df.drop(columns=[df.columns[label_column]])\n\n testing_data 544 | = xgboost.DMatrix(\n data=df,\n )\n\n model = xgboost.Booster(model_file=model_path)\n\n predictions 545 | = model.predict(testing_data)\n\n Path(predictions_path).parent.mkdir(parents=True, 546 | exist_ok=True)\n numpy.savetxt(predictions_path, predictions)\n\nimport 547 | argparse\n_parser = argparse.ArgumentParser(prog=''Xgboost predict'', description=''Make 548 | predictions using a trained XGBoost model.\\n\\n Args:\\n data_path: 549 | Path for the feature data in CSV format.\\n model_path: Path for 550 | the trained model in binary XGBoost format.\\n predictions_path: 551 | Output path for the predictions.\\n label_column: Column containing 552 | the label data.\\n\\n Annotations:\\n author: Alexey Volkov '')\n_parser.add_argument(\"--data\", 553 | dest=\"data_path\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--model\", 554 | dest=\"model_path\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--label-column\", 555 | dest=\"label_column\", type=int, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--predictions\", 556 | dest=\"predictions_path\", type=_make_parent_dirs_and_return_path, required=True, 557 | default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\n_outputs 558 | = xgboost_predict(**_parsed_args)\n"], "image": "python:3.7"}}, "inputs": 559 | [{"name": "data", "type": "CSV"}, {"name": "model", "type": "XGBoostModel"}, 560 | {"name": "label_column", "optional": true, "type": "Integer"}], "name": 561 | "Xgboost predict", "outputs": [{"name": "predictions", "type": "Text"}]}', 562 | pipelines.kubeflow.org/component_ref: '{"digest": "ecdfaf32cff15b6abc3d0dd80365ce00577f1a19a058fbe201f515431cea1357", 563 | "url": "https://raw.githubusercontent.com/kubeflow/pipelines/567c04c51ff00a1ee525b3458425b17adbe3df61/components/XGBoost/Predict/component.yaml"}', 564 | pipelines.kubeflow.org/arguments.parameters: '{"label_column": "0"}'} 565 | - name: xgboost-train 566 | container: 567 | args: [--training-data, /tmp/inputs/training_data/data, --label-column, '0', 568 | --num-iterations, '100', --objective, 'reg:squarederror', --model, /tmp/outputs/model/data, 569 | --model-config, /tmp/outputs/model_config/data] 570 | command: 571 | - sh 572 | - -c 573 | - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 574 | 'xgboost==1.1.1' 'pandas==1.0.5' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 575 | -m pip install --quiet --no-warn-script-location 'xgboost==1.1.1' 'pandas==1.0.5' 576 | --user) && "$0" "$@" 577 | - python3 578 | - -u 579 | - -c 580 | - | 581 | def _make_parent_dirs_and_return_path(file_path: str): 582 | import os 583 | os.makedirs(os.path.dirname(file_path), exist_ok=True) 584 | return file_path 585 | 586 | def xgboost_train( 587 | training_data_path, # Also supports LibSVM 588 | model_path, 589 | model_config_path, 590 | starting_model_path = None, 591 | 592 | label_column = 0, 593 | num_iterations = 10, 594 | booster_params = None, 595 | 596 | # Booster parameters 597 | objective = 'reg:squarederror', 598 | booster = 'gbtree', 599 | learning_rate = 0.3, 600 | 
min_split_loss = 0, 601 | max_depth = 6, 602 | ): 603 | '''Train an XGBoost model. 604 | 605 | Args: 606 | training_data_path: Path for the training data in CSV format. 607 | model_path: Output path for the trained model in binary XGBoost format. 608 | model_config_path: Output path for the internal parameter configuration of Booster as a JSON string. 609 | starting_model_path: Path for the existing trained model to start from. 610 | label_column: Column containing the label data. 611 | num_iterations: Number of boosting iterations. 612 | booster_params: Parameters for the booster. See https://xgboost.readthedocs.io/en/latest/parameter.html 613 | objective: The learning task and the corresponding learning objective. 614 | See https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters 615 | The most common values are: 616 | "reg:squarederror" - Regression with squared loss (default). 617 | "reg:logistic" - Logistic regression. 618 | "binary:logistic" - Logistic regression for binary classification, output probability. 619 | "binary:logitraw" - Logistic regression for binary classification, output score before logistic transformation 620 | "rank:pairwise" - Use LambdaMART to perform pairwise ranking where the pairwise loss is minimized 621 | "rank:ndcg" - Use LambdaMART to perform list-wise ranking where Normalized Discounted Cumulative Gain (NDCG) is maximized 622 | 623 | Annotations: 624 | author: Alexey Volkov 625 | ''' 626 | import pandas 627 | import xgboost 628 | 629 | df = pandas.read_csv( 630 | training_data_path, 631 | ) 632 | 633 | training_data = xgboost.DMatrix( 634 | data=df.drop(columns=[df.columns[label_column]]), 635 | label=df[df.columns[label_column]], 636 | ) 637 | 638 | booster_params = booster_params or {} 639 | booster_params.setdefault('objective', objective) 640 | booster_params.setdefault('booster', booster) 641 | booster_params.setdefault('learning_rate', learning_rate) 642 | booster_params.setdefault('min_split_loss', min_split_loss) 643 | booster_params.setdefault('max_depth', max_depth) 644 | 645 | starting_model = None 646 | if starting_model_path: 647 | starting_model = xgboost.Booster(model_file=starting_model_path) 648 | 649 | model = xgboost.train( 650 | params=booster_params, 651 | dtrain=training_data, 652 | num_boost_round=num_iterations, 653 | xgb_model=starting_model 654 | ) 655 | 656 | # Saving the model in binary format 657 | model.save_model(model_path) 658 | 659 | model_config_str = model.save_config() 660 | with open(model_config_path, 'w') as model_config_file: 661 | model_config_file.write(model_config_str) 662 | 663 | import json 664 | import argparse 665 | _parser = argparse.ArgumentParser(prog='Xgboost train', description='Train an XGBoost model.\n\n Args:\n training_data_path: Path for the training data in CSV format.\n model_path: Output path for the trained model in binary XGBoost format.\n model_config_path: Output path for the internal parameter configuration of Booster as a JSON string.\n starting_model_path: Path for the existing trained model to start from.\n label_column: Column containing the label data.\n num_iterations: Number of boosting iterations.\n booster_params: Parameters for the booster. 
See https://xgboost.readthedocs.io/en/latest/parameter.html\n objective: The learning task and the corresponding learning objective.\n See https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters\n The most common values are:\n "reg:squarederror" - Regression with squared loss (default).\n "reg:logistic" - Logistic regression.\n "binary:logistic" - Logistic regression for binary classification, output probability.\n "binary:logitraw" - Logistic regression for binary classification, output score before logistic transformation\n "rank:pairwise" - Use LambdaMART to perform pairwise ranking where the pairwise loss is minimized\n "rank:ndcg" - Use LambdaMART to perform list-wise ranking where Normalized Discounted Cumulative Gain (NDCG) is maximized\n\n Annotations:\n author: Alexey Volkov ') 666 | _parser.add_argument("--training-data", dest="training_data_path", type=str, required=True, default=argparse.SUPPRESS) 667 | _parser.add_argument("--starting-model", dest="starting_model_path", type=str, required=False, default=argparse.SUPPRESS) 668 | _parser.add_argument("--label-column", dest="label_column", type=int, required=False, default=argparse.SUPPRESS) 669 | _parser.add_argument("--num-iterations", dest="num_iterations", type=int, required=False, default=argparse.SUPPRESS) 670 | _parser.add_argument("--booster-params", dest="booster_params", type=json.loads, required=False, default=argparse.SUPPRESS) 671 | _parser.add_argument("--objective", dest="objective", type=str, required=False, default=argparse.SUPPRESS) 672 | _parser.add_argument("--booster", dest="booster", type=str, required=False, default=argparse.SUPPRESS) 673 | _parser.add_argument("--learning-rate", dest="learning_rate", type=float, required=False, default=argparse.SUPPRESS) 674 | _parser.add_argument("--min-split-loss", dest="min_split_loss", type=float, required=False, default=argparse.SUPPRESS) 675 | _parser.add_argument("--max-depth", dest="max_depth", type=int, required=False, default=argparse.SUPPRESS) 676 | _parser.add_argument("--model", dest="model_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) 677 | _parser.add_argument("--model-config", dest="model_config_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) 678 | _parsed_args = vars(_parser.parse_args()) 679 | 680 | _outputs = xgboost_train(**_parsed_args) 681 | image: python:3.7 682 | inputs: 683 | artifacts: 684 | - {name: chicago-taxi-trips-dataset-Table, path: /tmp/inputs/training_data/data} 685 | outputs: 686 | artifacts: 687 | - {name: xgboost-train-model, path: /tmp/outputs/model/data} 688 | - {name: xgboost-train-model_config, path: /tmp/outputs/model_config/data} 689 | metadata: 690 | annotations: {pipelines.kubeflow.org/component_spec: '{"description": "Train 691 | an XGBoost model.\n\n Args:\n training_data_path: Path for the 692 | training data in CSV format.\n model_path: Output path for the trained 693 | model in binary XGBoost format.\n model_config_path: Output path 694 | for the internal parameter configuration of Booster as a JSON string.\n starting_model_path: 695 | Path for the existing trained model to start from.\n label_column: 696 | Column containing the label data.\n num_boost_rounds: Number of boosting 697 | iterations.\n booster_params: Parameters for the booster. 
See https://xgboost.readthedocs.io/en/latest/parameter.html\n objective: 698 | The learning task and the corresponding learning objective.\n See 699 | https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters\n The 700 | most common values are:\n \"reg:squarederror\" - Regression with 701 | squared loss (default).\n \"reg:logistic\" - Logistic regression.\n \"binary:logistic\" 702 | - Logistic regression for binary classification, output probability.\n \"binary:logitraw\" 703 | - Logistic regression for binary classification, output score before logistic 704 | transformation\n \"rank:pairwise\" - Use LambdaMART to perform 705 | pairwise ranking where the pairwise loss is minimized\n \"rank:ndcg\" 706 | - Use LambdaMART to perform list-wise ranking where Normalized Discounted 707 | Cumulative Gain (NDCG) is maximized\n\n Annotations:\n author: 708 | Alexey Volkov ", "implementation": {"container": 709 | {"args": ["--training-data", {"inputPath": "training_data"}, {"if": {"cond": 710 | {"isPresent": "starting_model"}, "then": ["--starting-model", {"inputPath": 711 | "starting_model"}]}}, {"if": {"cond": {"isPresent": "label_column"}, "then": 712 | ["--label-column", {"inputValue": "label_column"}]}}, {"if": {"cond": {"isPresent": 713 | "num_iterations"}, "then": ["--num-iterations", {"inputValue": "num_iterations"}]}}, 714 | {"if": {"cond": {"isPresent": "booster_params"}, "then": ["--booster-params", 715 | {"inputValue": "booster_params"}]}}, {"if": {"cond": {"isPresent": "objective"}, 716 | "then": ["--objective", {"inputValue": "objective"}]}}, {"if": {"cond": 717 | {"isPresent": "booster"}, "then": ["--booster", {"inputValue": "booster"}]}}, 718 | {"if": {"cond": {"isPresent": "learning_rate"}, "then": ["--learning-rate", 719 | {"inputValue": "learning_rate"}]}}, {"if": {"cond": {"isPresent": "min_split_loss"}, 720 | "then": ["--min-split-loss", {"inputValue": "min_split_loss"}]}}, {"if": 721 | {"cond": {"isPresent": "max_depth"}, "then": ["--max-depth", {"inputValue": 722 | "max_depth"}]}}, "--model", {"outputPath": "model"}, "--model-config", {"outputPath": 723 | "model_config"}], "command": ["sh", "-c", "(PIP_DISABLE_PIP_VERSION_CHECK=1 724 | python3 -m pip install --quiet --no-warn-script-location ''xgboost==1.1.1'' 725 | ''pandas==1.0.5'' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install 726 | --quiet --no-warn-script-location ''xgboost==1.1.1'' ''pandas==1.0.5'' --user) 727 | && \"$0\" \"$@\"", "python3", "-u", "-c", "def _make_parent_dirs_and_return_path(file_path: 728 | str):\n import os\n os.makedirs(os.path.dirname(file_path), exist_ok=True)\n return 729 | file_path\n\ndef xgboost_train(\n training_data_path, # Also supports 730 | LibSVM\n model_path,\n model_config_path,\n starting_model_path 731 | = None,\n\n label_column = 0,\n num_iterations = 10,\n booster_params 732 | = None,\n\n # Booster parameters\n objective = ''reg:squarederror'',\n booster 733 | = ''gbtree'',\n learning_rate = 0.3,\n min_split_loss = 0,\n max_depth 734 | = 6,\n):\n ''''''Train an XGBoost model.\n\n Args:\n training_data_path: 735 | Path for the training data in CSV format.\n model_path: Output path 736 | for the trained model in binary XGBoost format.\n model_config_path: 737 | Output path for the internal parameter configuration of Booster as a JSON 738 | string.\n starting_model_path: Path for the existing trained model 739 | to start from.\n label_column: Column containing the label data.\n num_boost_rounds: 740 | Number of boosting iterations.\n booster_params: 
Parameters for the 741 | booster. See https://xgboost.readthedocs.io/en/latest/parameter.html\n objective: 742 | The learning task and the corresponding learning objective.\n See 743 | https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters\n The 744 | most common values are:\n \"reg:squarederror\" - Regression with 745 | squared loss (default).\n \"reg:logistic\" - Logistic regression.\n \"binary:logistic\" 746 | - Logistic regression for binary classification, output probability.\n \"binary:logitraw\" 747 | - Logistic regression for binary classification, output score before logistic 748 | transformation\n \"rank:pairwise\" - Use LambdaMART to perform 749 | pairwise ranking where the pairwise loss is minimized\n \"rank:ndcg\" 750 | - Use LambdaMART to perform list-wise ranking where Normalized Discounted 751 | Cumulative Gain (NDCG) is maximized\n\n Annotations:\n author: 752 | Alexey Volkov \n ''''''\n import pandas\n import 753 | xgboost\n\n df = pandas.read_csv(\n training_data_path,\n )\n\n training_data 754 | = xgboost.DMatrix(\n data=df.drop(columns=[df.columns[label_column]]),\n label=df[df.columns[label_column]],\n )\n\n booster_params 755 | = booster_params or {}\n booster_params.setdefault(''objective'', objective)\n booster_params.setdefault(''booster'', 756 | booster)\n booster_params.setdefault(''learning_rate'', learning_rate)\n booster_params.setdefault(''min_split_loss'', 757 | min_split_loss)\n booster_params.setdefault(''max_depth'', max_depth)\n\n starting_model 758 | = None\n if starting_model_path:\n starting_model = xgboost.Booster(model_file=starting_model_path)\n\n model 759 | = xgboost.train(\n params=booster_params,\n dtrain=training_data,\n num_boost_round=num_iterations,\n xgb_model=starting_model\n )\n\n # 760 | Saving the model in binary format\n model.save_model(model_path)\n\n model_config_str 761 | = model.save_config()\n with open(model_config_path, ''w'') as model_config_file:\n model_config_file.write(model_config_str)\n\nimport 762 | json\nimport argparse\n_parser = argparse.ArgumentParser(prog=''Xgboost 763 | train'', description=''Train an XGBoost model.\\n\\n Args:\\n training_data_path: 764 | Path for the training data in CSV format.\\n model_path: Output path 765 | for the trained model in binary XGBoost format.\\n model_config_path: 766 | Output path for the internal parameter configuration of Booster as a JSON 767 | string.\\n starting_model_path: Path for the existing trained model 768 | to start from.\\n label_column: Column containing the label data.\\n num_boost_rounds: 769 | Number of boosting iterations.\\n booster_params: Parameters for 770 | the booster. 
See https://xgboost.readthedocs.io/en/latest/parameter.html\\n objective: 771 | The learning task and the corresponding learning objective.\\n See 772 | https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters\\n The 773 | most common values are:\\n \"reg:squarederror\" - Regression 774 | with squared loss (default).\\n \"reg:logistic\" - Logistic regression.\\n \"binary:logistic\" 775 | - Logistic regression for binary classification, output probability.\\n \"binary:logitraw\" 776 | - Logistic regression for binary classification, output score before logistic 777 | transformation\\n \"rank:pairwise\" - Use LambdaMART to perform 778 | pairwise ranking where the pairwise loss is minimized\\n \"rank:ndcg\" 779 | - Use LambdaMART to perform list-wise ranking where Normalized Discounted 780 | Cumulative Gain (NDCG) is maximized\\n\\n Annotations:\\n author: 781 | Alexey Volkov '')\n_parser.add_argument(\"--training-data\", 782 | dest=\"training_data_path\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--starting-model\", 783 | dest=\"starting_model_path\", type=str, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--label-column\", 784 | dest=\"label_column\", type=int, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--num-iterations\", 785 | dest=\"num_iterations\", type=int, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--booster-params\", 786 | dest=\"booster_params\", type=json.loads, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--objective\", 787 | dest=\"objective\", type=str, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--booster\", 788 | dest=\"booster\", type=str, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--learning-rate\", 789 | dest=\"learning_rate\", type=float, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--min-split-loss\", 790 | dest=\"min_split_loss\", type=float, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--max-depth\", 791 | dest=\"max_depth\", type=int, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--model\", 792 | dest=\"model_path\", type=_make_parent_dirs_and_return_path, required=True, 793 | default=argparse.SUPPRESS)\n_parser.add_argument(\"--model-config\", dest=\"model_config_path\", 794 | type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\n_parsed_args 795 | = vars(_parser.parse_args())\n\n_outputs = xgboost_train(**_parsed_args)\n"], 796 | "image": "python:3.7"}}, "inputs": [{"name": "training_data", "type": "CSV"}, 797 | {"name": "starting_model", "optional": true, "type": "XGBoostModel"}, {"default": 798 | "0", "name": "label_column", "optional": true, "type": "Integer"}, {"default": 799 | "10", "name": "num_iterations", "optional": true, "type": "Integer"}, {"name": 800 | "booster_params", "optional": true, "type": "JsonObject"}, {"default": "reg:squarederror", 801 | "name": "objective", "optional": true, "type": "String"}, {"default": "gbtree", 802 | "name": "booster", "optional": true, "type": "String"}, {"default": "0.3", 803 | "name": "learning_rate", "optional": true, "type": "Float"}, {"default": 804 | "0", "name": "min_split_loss", "optional": true, "type": "Float"}, {"default": 805 | "6", "name": "max_depth", "optional": true, "type": "Integer"}], "name": 806 | "Xgboost train", "outputs": [{"name": "model", "type": "XGBoostModel"}, 807 | {"name": "model_config", "type": 
"XGBoostModelConfig"}]}', pipelines.kubeflow.org/component_ref: '{"digest": 808 | "09b80053da29f8f51575b42e5d2e8ad4b7bdcc92a02c3744e189b1f597006b38", "url": 809 | "https://raw.githubusercontent.com/kubeflow/pipelines/567c04c51ff00a1ee525b3458425b17adbe3df61/components/XGBoost/Train/component.yaml"}', 810 | pipelines.kubeflow.org/arguments.parameters: '{"label_column": "0", "num_iterations": 811 | "100", "objective": "reg:squarederror"}'} 812 | - name: xgboost-train-2 813 | container: 814 | args: [--training-data, /tmp/inputs/training_data/data, --starting-model, /tmp/inputs/starting_model/data, 815 | --label-column, '0', --num-iterations, '50', --objective, 'reg:squarederror', 816 | --model, /tmp/outputs/model/data, --model-config, /tmp/outputs/model_config/data] 817 | command: 818 | - sh 819 | - -c 820 | - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 821 | 'xgboost==1.1.1' 'pandas==1.0.5' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 822 | -m pip install --quiet --no-warn-script-location 'xgboost==1.1.1' 'pandas==1.0.5' 823 | --user) && "$0" "$@" 824 | - python3 825 | - -u 826 | - -c 827 | - | 828 | def _make_parent_dirs_and_return_path(file_path: str): 829 | import os 830 | os.makedirs(os.path.dirname(file_path), exist_ok=True) 831 | return file_path 832 | 833 | def xgboost_train( 834 | training_data_path, # Also supports LibSVM 835 | model_path, 836 | model_config_path, 837 | starting_model_path = None, 838 | 839 | label_column = 0, 840 | num_iterations = 10, 841 | booster_params = None, 842 | 843 | # Booster parameters 844 | objective = 'reg:squarederror', 845 | booster = 'gbtree', 846 | learning_rate = 0.3, 847 | min_split_loss = 0, 848 | max_depth = 6, 849 | ): 850 | '''Train an XGBoost model. 851 | 852 | Args: 853 | training_data_path: Path for the training data in CSV format. 854 | model_path: Output path for the trained model in binary XGBoost format. 855 | model_config_path: Output path for the internal parameter configuration of Booster as a JSON string. 856 | starting_model_path: Path for the existing trained model to start from. 857 | label_column: Column containing the label data. 858 | num_boost_rounds: Number of boosting iterations. 859 | booster_params: Parameters for the booster. See https://xgboost.readthedocs.io/en/latest/parameter.html 860 | objective: The learning task and the corresponding learning objective. 861 | See https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters 862 | The most common values are: 863 | "reg:squarederror" - Regression with squared loss (default). 864 | "reg:logistic" - Logistic regression. 865 | "binary:logistic" - Logistic regression for binary classification, output probability. 
866 | "binary:logitraw" - Logistic regression for binary classification, output score before logistic transformation 867 | "rank:pairwise" - Use LambdaMART to perform pairwise ranking where the pairwise loss is minimized 868 | "rank:ndcg" - Use LambdaMART to perform list-wise ranking where Normalized Discounted Cumulative Gain (NDCG) is maximized 869 | 870 | Annotations: 871 | author: Alexey Volkov 872 | ''' 873 | import pandas 874 | import xgboost 875 | 876 | df = pandas.read_csv( 877 | training_data_path, 878 | ) 879 | 880 | training_data = xgboost.DMatrix( 881 | data=df.drop(columns=[df.columns[label_column]]), 882 | label=df[df.columns[label_column]], 883 | ) 884 | 885 | booster_params = booster_params or {} 886 | booster_params.setdefault('objective', objective) 887 | booster_params.setdefault('booster', booster) 888 | booster_params.setdefault('learning_rate', learning_rate) 889 | booster_params.setdefault('min_split_loss', min_split_loss) 890 | booster_params.setdefault('max_depth', max_depth) 891 | 892 | starting_model = None 893 | if starting_model_path: 894 | starting_model = xgboost.Booster(model_file=starting_model_path) 895 | 896 | model = xgboost.train( 897 | params=booster_params, 898 | dtrain=training_data, 899 | num_boost_round=num_iterations, 900 | xgb_model=starting_model 901 | ) 902 | 903 | # Saving the model in binary format 904 | model.save_model(model_path) 905 | 906 | model_config_str = model.save_config() 907 | with open(model_config_path, 'w') as model_config_file: 908 | model_config_file.write(model_config_str) 909 | 910 | import json 911 | import argparse 912 | _parser = argparse.ArgumentParser(prog='Xgboost train', description='Train an XGBoost model.\n\n Args:\n training_data_path: Path for the training data in CSV format.\n model_path: Output path for the trained model in binary XGBoost format.\n model_config_path: Output path for the internal parameter configuration of Booster as a JSON string.\n starting_model_path: Path for the existing trained model to start from.\n label_column: Column containing the label data.\n num_iterations: Number of boosting iterations.\n booster_params: Parameters for the booster.
See https://xgboost.readthedocs.io/en/latest/parameter.html\n objective: The learning task and the corresponding learning objective.\n See https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters\n The most common values are:\n "reg:squarederror" - Regression with squared loss (default).\n "reg:logistic" - Logistic regression.\n "binary:logistic" - Logistic regression for binary classification, output probability.\n "binary:logitraw" - Logistic regression for binary classification, output score before logistic transformation\n "rank:pairwise" - Use LambdaMART to perform pairwise ranking where the pairwise loss is minimized\n "rank:ndcg" - Use LambdaMART to perform list-wise ranking where Normalized Discounted Cumulative Gain (NDCG) is maximized\n\n Annotations:\n author: Alexey Volkov ') 913 | _parser.add_argument("--training-data", dest="training_data_path", type=str, required=True, default=argparse.SUPPRESS) 914 | _parser.add_argument("--starting-model", dest="starting_model_path", type=str, required=False, default=argparse.SUPPRESS) 915 | _parser.add_argument("--label-column", dest="label_column", type=int, required=False, default=argparse.SUPPRESS) 916 | _parser.add_argument("--num-iterations", dest="num_iterations", type=int, required=False, default=argparse.SUPPRESS) 917 | _parser.add_argument("--booster-params", dest="booster_params", type=json.loads, required=False, default=argparse.SUPPRESS) 918 | _parser.add_argument("--objective", dest="objective", type=str, required=False, default=argparse.SUPPRESS) 919 | _parser.add_argument("--booster", dest="booster", type=str, required=False, default=argparse.SUPPRESS) 920 | _parser.add_argument("--learning-rate", dest="learning_rate", type=float, required=False, default=argparse.SUPPRESS) 921 | _parser.add_argument("--min-split-loss", dest="min_split_loss", type=float, required=False, default=argparse.SUPPRESS) 922 | _parser.add_argument("--max-depth", dest="max_depth", type=int, required=False, default=argparse.SUPPRESS) 923 | _parser.add_argument("--model", dest="model_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) 924 | _parser.add_argument("--model-config", dest="model_config_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) 925 | _parsed_args = vars(_parser.parse_args()) 926 | 927 | _outputs = xgboost_train(**_parsed_args) 928 | image: python:3.7 929 | inputs: 930 | artifacts: 931 | - {name: xgboost-train-model, path: /tmp/inputs/starting_model/data} 932 | - {name: chicago-taxi-trips-dataset-Table, path: /tmp/inputs/training_data/data} 933 | outputs: 934 | artifacts: 935 | - {name: xgboost-train-2-model, path: /tmp/outputs/model/data} 936 | - {name: xgboost-train-2-model_config, path: /tmp/outputs/model_config/data} 937 | metadata: 938 | annotations: {pipelines.kubeflow.org/component_spec: '{"description": "Train 939 | an XGBoost model.\n\n Args:\n training_data_path: Path for the 940 | training data in CSV format.\n model_path: Output path for the trained 941 | model in binary XGBoost format.\n model_config_path: Output path 942 | for the internal parameter configuration of Booster as a JSON string.\n starting_model_path: 943 | Path for the existing trained model to start from.\n label_column: 944 | Column containing the label data.\n num_boost_rounds: Number of boosting 945 | iterations.\n booster_params: Parameters for the booster. 
See https://xgboost.readthedocs.io/en/latest/parameter.html\n objective: 946 | The learning task and the corresponding learning objective.\n See 947 | https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters\n The 948 | most common values are:\n \"reg:squarederror\" - Regression with 949 | squared loss (default).\n \"reg:logistic\" - Logistic regression.\n \"binary:logistic\" 950 | - Logistic regression for binary classification, output probability.\n \"binary:logitraw\" 951 | - Logistic regression for binary classification, output score before logistic 952 | transformation\n \"rank:pairwise\" - Use LambdaMART to perform 953 | pairwise ranking where the pairwise loss is minimized\n \"rank:ndcg\" 954 | - Use LambdaMART to perform list-wise ranking where Normalized Discounted 955 | Cumulative Gain (NDCG) is maximized\n\n Annotations:\n author: 956 | Alexey Volkov ", "implementation": {"container": 957 | {"args": ["--training-data", {"inputPath": "training_data"}, {"if": {"cond": 958 | {"isPresent": "starting_model"}, "then": ["--starting-model", {"inputPath": 959 | "starting_model"}]}}, {"if": {"cond": {"isPresent": "label_column"}, "then": 960 | ["--label-column", {"inputValue": "label_column"}]}}, {"if": {"cond": {"isPresent": 961 | "num_iterations"}, "then": ["--num-iterations", {"inputValue": "num_iterations"}]}}, 962 | {"if": {"cond": {"isPresent": "booster_params"}, "then": ["--booster-params", 963 | {"inputValue": "booster_params"}]}}, {"if": {"cond": {"isPresent": "objective"}, 964 | "then": ["--objective", {"inputValue": "objective"}]}}, {"if": {"cond": 965 | {"isPresent": "booster"}, "then": ["--booster", {"inputValue": "booster"}]}}, 966 | {"if": {"cond": {"isPresent": "learning_rate"}, "then": ["--learning-rate", 967 | {"inputValue": "learning_rate"}]}}, {"if": {"cond": {"isPresent": "min_split_loss"}, 968 | "then": ["--min-split-loss", {"inputValue": "min_split_loss"}]}}, {"if": 969 | {"cond": {"isPresent": "max_depth"}, "then": ["--max-depth", {"inputValue": 970 | "max_depth"}]}}, "--model", {"outputPath": "model"}, "--model-config", {"outputPath": 971 | "model_config"}], "command": ["sh", "-c", "(PIP_DISABLE_PIP_VERSION_CHECK=1 972 | python3 -m pip install --quiet --no-warn-script-location ''xgboost==1.1.1'' 973 | ''pandas==1.0.5'' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install 974 | --quiet --no-warn-script-location ''xgboost==1.1.1'' ''pandas==1.0.5'' --user) 975 | && \"$0\" \"$@\"", "python3", "-u", "-c", "def _make_parent_dirs_and_return_path(file_path: 976 | str):\n import os\n os.makedirs(os.path.dirname(file_path), exist_ok=True)\n return 977 | file_path\n\ndef xgboost_train(\n training_data_path, # Also supports 978 | LibSVM\n model_path,\n model_config_path,\n starting_model_path 979 | = None,\n\n label_column = 0,\n num_iterations = 10,\n booster_params 980 | = None,\n\n # Booster parameters\n objective = ''reg:squarederror'',\n booster 981 | = ''gbtree'',\n learning_rate = 0.3,\n min_split_loss = 0,\n max_depth 982 | = 6,\n):\n ''''''Train an XGBoost model.\n\n Args:\n training_data_path: 983 | Path for the training data in CSV format.\n model_path: Output path 984 | for the trained model in binary XGBoost format.\n model_config_path: 985 | Output path for the internal parameter configuration of Booster as a JSON 986 | string.\n starting_model_path: Path for the existing trained model 987 | to start from.\n label_column: Column containing the label data.\n num_boost_rounds: 988 | Number of boosting iterations.\n booster_params: 
Parameters for the 989 | booster. See https://xgboost.readthedocs.io/en/latest/parameter.html\n objective: 990 | The learning task and the corresponding learning objective.\n See 991 | https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters\n The 992 | most common values are:\n \"reg:squarederror\" - Regression with 993 | squared loss (default).\n \"reg:logistic\" - Logistic regression.\n \"binary:logistic\" 994 | - Logistic regression for binary classification, output probability.\n \"binary:logitraw\" 995 | - Logistic regression for binary classification, output score before logistic 996 | transformation\n \"rank:pairwise\" - Use LambdaMART to perform 997 | pairwise ranking where the pairwise loss is minimized\n \"rank:ndcg\" 998 | - Use LambdaMART to perform list-wise ranking where Normalized Discounted 999 | Cumulative Gain (NDCG) is maximized\n\n Annotations:\n author: 1000 | Alexey Volkov \n ''''''\n import pandas\n import 1001 | xgboost\n\n df = pandas.read_csv(\n training_data_path,\n )\n\n training_data 1002 | = xgboost.DMatrix(\n data=df.drop(columns=[df.columns[label_column]]),\n label=df[df.columns[label_column]],\n )\n\n booster_params 1003 | = booster_params or {}\n booster_params.setdefault(''objective'', objective)\n booster_params.setdefault(''booster'', 1004 | booster)\n booster_params.setdefault(''learning_rate'', learning_rate)\n booster_params.setdefault(''min_split_loss'', 1005 | min_split_loss)\n booster_params.setdefault(''max_depth'', max_depth)\n\n starting_model 1006 | = None\n if starting_model_path:\n starting_model = xgboost.Booster(model_file=starting_model_path)\n\n model 1007 | = xgboost.train(\n params=booster_params,\n dtrain=training_data,\n num_boost_round=num_iterations,\n xgb_model=starting_model\n )\n\n # 1008 | Saving the model in binary format\n model.save_model(model_path)\n\n model_config_str 1009 | = model.save_config()\n with open(model_config_path, ''w'') as model_config_file:\n model_config_file.write(model_config_str)\n\nimport 1010 | json\nimport argparse\n_parser = argparse.ArgumentParser(prog=''Xgboost 1011 | train'', description=''Train an XGBoost model.\\n\\n Args:\\n training_data_path: 1012 | Path for the training data in CSV format.\\n model_path: Output path 1013 | for the trained model in binary XGBoost format.\\n model_config_path: 1014 | Output path for the internal parameter configuration of Booster as a JSON 1015 | string.\\n starting_model_path: Path for the existing trained model 1016 | to start from.\\n label_column: Column containing the label data.\\n num_boost_rounds: 1017 | Number of boosting iterations.\\n booster_params: Parameters for 1018 | the booster. 
See https://xgboost.readthedocs.io/en/latest/parameter.html\\n objective: 1019 | The learning task and the corresponding learning objective.\\n See 1020 | https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters\\n The 1021 | most common values are:\\n \"reg:squarederror\" - Regression 1022 | with squared loss (default).\\n \"reg:logistic\" - Logistic regression.\\n \"binary:logistic\" 1023 | - Logistic regression for binary classification, output probability.\\n \"binary:logitraw\" 1024 | - Logistic regression for binary classification, output score before logistic 1025 | transformation\\n \"rank:pairwise\" - Use LambdaMART to perform 1026 | pairwise ranking where the pairwise loss is minimized\\n \"rank:ndcg\" 1027 | - Use LambdaMART to perform list-wise ranking where Normalized Discounted 1028 | Cumulative Gain (NDCG) is maximized\\n\\n Annotations:\\n author: 1029 | Alexey Volkov '')\n_parser.add_argument(\"--training-data\", 1030 | dest=\"training_data_path\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--starting-model\", 1031 | dest=\"starting_model_path\", type=str, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--label-column\", 1032 | dest=\"label_column\", type=int, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--num-iterations\", 1033 | dest=\"num_iterations\", type=int, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--booster-params\", 1034 | dest=\"booster_params\", type=json.loads, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--objective\", 1035 | dest=\"objective\", type=str, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--booster\", 1036 | dest=\"booster\", type=str, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--learning-rate\", 1037 | dest=\"learning_rate\", type=float, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--min-split-loss\", 1038 | dest=\"min_split_loss\", type=float, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--max-depth\", 1039 | dest=\"max_depth\", type=int, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--model\", 1040 | dest=\"model_path\", type=_make_parent_dirs_and_return_path, required=True, 1041 | default=argparse.SUPPRESS)\n_parser.add_argument(\"--model-config\", dest=\"model_config_path\", 1042 | type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\n_parsed_args 1043 | = vars(_parser.parse_args())\n\n_outputs = xgboost_train(**_parsed_args)\n"], 1044 | "image": "python:3.7"}}, "inputs": [{"name": "training_data", "type": "CSV"}, 1045 | {"name": "starting_model", "optional": true, "type": "XGBoostModel"}, {"default": 1046 | "0", "name": "label_column", "optional": true, "type": "Integer"}, {"default": 1047 | "10", "name": "num_iterations", "optional": true, "type": "Integer"}, {"name": 1048 | "booster_params", "optional": true, "type": "JsonObject"}, {"default": "reg:squarederror", 1049 | "name": "objective", "optional": true, "type": "String"}, {"default": "gbtree", 1050 | "name": "booster", "optional": true, "type": "String"}, {"default": "0.3", 1051 | "name": "learning_rate", "optional": true, "type": "Float"}, {"default": 1052 | "0", "name": "min_split_loss", "optional": true, "type": "Float"}, {"default": 1053 | "6", "name": "max_depth", "optional": true, "type": "Integer"}], "name": 1054 | "Xgboost train", "outputs": [{"name": "model", "type": "XGBoostModel"}, 1055 | 
{"name": "model_config", "type": "XGBoostModelConfig"}]}', pipelines.kubeflow.org/component_ref: '{"digest": 1056 | "09b80053da29f8f51575b42e5d2e8ad4b7bdcc92a02c3744e189b1f597006b38", "url": 1057 | "https://raw.githubusercontent.com/kubeflow/pipelines/567c04c51ff00a1ee525b3458425b17adbe3df61/components/XGBoost/Train/component.yaml"}', 1058 | pipelines.kubeflow.org/arguments.parameters: '{"label_column": "0", "num_iterations": 1059 | "50", "objective": "reg:squarederror"}'} 1060 | arguments: 1061 | parameters: [] 1062 | serviceAccountName: pipeline-runner 1063 | --------------------------------------------------------------------------------