├── lesson1_setup
│   ├── step6_kubeflow_check.sh
│   ├── delete_kubeflow.sh
│   ├── step5_kubeflow_pipeline_sdk.sh
│   ├── step1_install_docker.sh
│   ├── step4_port_forward_gateway.sh
│   ├── step2_install_kfctl.sh
│   └── step3_apply_kubeflow.sh
├── .gitignore
├── first_project
│   ├── Dockerfile
│   ├── build_and_push.sh
│   ├── config.json
│   ├── train_pipeline.py
│   └── first.yml
├── lessonx_mnist_pipeline
│   ├── sc_local_storage.yml
│   ├── pvc_local_storage.yml
│   ├── pv_local_storage.yml
│   └── mnist_pipeline.py
├── lesson8_download_s3
│   ├── secret.yml
│   ├── s3_ls.py
│   └── s3_sync.py
├── lesson10_catboost
│   ├── convert_CatBoostModel_to_ONNX
│   │   ├── component.py
│   │   └── component.yaml
│   ├── convert_CatBoostModel_to_AppleCoreMLModel
│   │   ├── component.py
│   │   └── component.yaml
│   ├── Predict_values
│   │   └── from_CSV
│   │       ├── component.py
│   │       └── component.yaml
│   ├── Predict_classes
│   │   └── from_CSV
│   │       ├── component.py
│   │       └── component.yaml
│   ├── Predict_class_probabilities
│   │   └── from_CSV
│   │       ├── component.py
│   │       └── component.yaml
│   ├── Train_regression
│   │   └── from_CSV
│   │       ├── component.py
│   │       └── component.yaml
│   ├── Train_classifier
│   │   └── from_CSV
│   │       ├── component.py
│   │       └── component.yaml
│   └── catboost_pipeline.py
├── lesson2_hello_world
│   ├── helloworld_python.py
│   └── helloworld_bash.py
├── lesson9_tf_mnist
│   └── tf_mnist.py
├── lesson7_storing_data
│   └── storing_data.py
├── lesson4_parallel
│   └── parallel_execution.py
├── lesson3_add
│   └── add_python.py
├── lesson7_output_a_directory
│   └── output_a_directory.py
├── lesson5_control_structure
│   ├── control_structure.py
│   └── control.yaml
├── train_until_good
│   ├── train_until_good.py
│   └── train_until_good.py.yaml
└── lesson6_data_passing
    └── data_passing.py

--------------------------------------------------------------------------------
/lesson1_setup/step6_kubeflow_check.sh:
--------------------------------------------------------------------------------
kubectl -n kubeflow get all

--------------------------------------------------------------------------------
/lesson1_setup/delete_kubeflow.sh:
--------------------------------------------------------------------------------
kfctl delete -f kfctl_k8s_istio.v1.0.0.yaml

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.idea
.DS_Store
__pycache__
hello-kf
*.zip
*.tar.gz

--------------------------------------------------------------------------------
/first_project/Dockerfile:
--------------------------------------------------------------------------------
FROM gcr.io/kubeflow-images-public/tensorflow-2.1.0-notebook-cpu:1.0.0

--------------------------------------------------------------------------------
/first_project/build_and_push.sh:
--------------------------------------------------------------------------------
docker build -t chrisai/kubeflow-first-project:v1 -f Dockerfile .
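# The push below assumes an authenticated Docker Hub session (run `docker login` first).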
docker push chrisai/kubeflow-first-project:v1

--------------------------------------------------------------------------------
/lesson1_setup/step5_kubeflow_pipeline_sdk.sh:
--------------------------------------------------------------------------------
URL=https://storage.googleapis.com/ml-pipeline/release/latest/kfp.tar.gz
pip install "${URL}" --upgrade

--------------------------------------------------------------------------------
/lesson1_setup/step1_install_docker.sh:
--------------------------------------------------------------------------------
# Mac
wget https://desktop.docker.com/mac/stable/Docker.dmg

# Windows
# https://desktop.docker.com/win/stable/Docker%20Desktop%20Installer.exe

--------------------------------------------------------------------------------
/lessonx_mnist_pipeline/sc_local_storage.yml:
--------------------------------------------------------------------------------
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: my-storage-class
provisioner: kubernetes.io/no-provisioner
volumeBindingMode: WaitForFirstConsumer

--------------------------------------------------------------------------------
/first_project/config.json:
--------------------------------------------------------------------------------
{
  "auths": {
    "https://index.docker.io/v1/": {
      "auth": "Y2hyaXNhaTpzb25naG9yYW5ibGFjayBEb2NrZXJmaWxlIF9fcHljYWNoZV9fIGJ1aWxkX2FuZF9wdXNoLnNoIGZpcnN0LnltbCB0cmFpbl9waXBlbGluZS5weQ=="
    }
  }
}

--------------------------------------------------------------------------------
/lesson1_setup/step4_port_forward_gateway.sh:
--------------------------------------------------------------------------------
kubectl port-forward svc/istio-ingressgateway -n istio-system 8080:80
# kubectl port-forward -n kubeflow svc/centraldashboard 8080:80

#virtualenv kfvenv --python python3
#source kfvenv/bin/activate

--------------------------------------------------------------------------------
/lesson8_download_s3/secret.yml:
--------------------------------------------------------------------------------
apiVersion: v1
kind: Secret
metadata:
  name: aws-secret
  namespace: kubeflow
type: Opaque
data:
  AWS_ACCESS_KEY_ID: BASE64_AWS_ACCESS_KEY_ID
  AWS_SECRET_ACCESS_KEY: BASE64_AWS_SECRET_ACCESS_KEY

--------------------------------------------------------------------------------
/lessonx_mnist_pipeline/pvc_local_storage.yml:
--------------------------------------------------------------------------------
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: task-pv-claim
spec:
  storageClassName: manual
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 1Gi

--------------------------------------------------------------------------------
/lessonx_mnist_pipeline/pv_local_storage.yml:
--------------------------------------------------------------------------------
apiVersion: v1
kind: PersistentVolume
metadata:
  name: task-pv-volume
  labels:
    type: local
spec:
  storageClassName: manual
  capacity:
    storage: 1Gi
  accessModes:
    - ReadWriteOnce
  hostPath:
    path: "/mnt/data"

--------------------------------------------------------------------------------
/lesson1_setup/step2_install_kfctl.sh:
--------------------------------------------------------------------------------
export PLATFORM=$(uname) # Either Linux or Darwin
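# NOTE: the release asset downloaded below is hardcoded to the darwin (macOS)
# build; on Linux, substitute the matching linux tarball from the releases page.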
export KUBEFLOW_TAG=1.0.0
KUBEFLOW_BASE="https://api.github.com/repos/kubeflow/kfctl/releases"
# Or just go to https://github.com/kubeflow/kfctl/releases
KFCTL_URL=https://github.com/kubeflow/kfctl/releases/download/v1.0/kfctl_v1.0-0-g94c35cf_darwin.tar.gz
wget "${KFCTL_URL}"
KFCTL_FILE=${KFCTL_URL##*/}
tar -xvf "${KFCTL_FILE}"
sudo mv ./kfctl /usr/local/bin/
rm "${KFCTL_FILE}"

--------------------------------------------------------------------------------
/lesson1_setup/step3_apply_kubeflow.sh:
--------------------------------------------------------------------------------
# Pick the correct config file for your platform from
# https://github.com/kubeflow/manifests/tree/[version]/kfdef
# You can download and edit the configuration at this point if you need to.
# For generic Kubernetes with Istio:
MANIFEST_BRANCH=${MANIFEST_BRANCH:-v1.0-branch}
export MANIFEST_BRANCH
MANIFEST_VERSION=${MANIFEST_VERSION:-v1.0.0}
export MANIFEST_VERSION

KF_PROJECT_NAME=${KF_PROJECT_NAME:-hello-kf}
export KF_PROJECT_NAME
mkdir "${KF_PROJECT_NAME}"
pushd "${KF_PROJECT_NAME}"

manifest_root=https://raw.githubusercontent.com/kubeflow/manifests/
# On most environments this will create a "vanilla" Kubeflow install using Istio.
FILE_NAME=kfctl_k8s_istio.${MANIFEST_VERSION}.yaml
KFDEF=${manifest_root}${MANIFEST_BRANCH}/kfdef/${FILE_NAME}
kfctl apply -f "${KFDEF}" -V
echo $?

popd

--------------------------------------------------------------------------------
/lesson8_download_s3/s3_ls.py:
--------------------------------------------------------------------------------
import kfp
from kfp import dsl
from kfp.aws import use_aws_secret


EXPERIMENT_NAME = 'AWS S3 ls'  # Name of the experiment in the UI
KUBEFLOW_HOST = "http://127.0.0.1:8080/pipeline"


def s3_ls():
    return kfp.dsl.ContainerOp(
        name="s3_ls",
        image="amazon/aws-cli:latest",
        command=["aws", "s3", "ls"],
    )


@dsl.pipeline(name="s3_ls_pipeline", description="s3 ls pipeline.")
def s3_ls_pipeline():
    echo_task = s3_ls().apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))


if __name__ == "__main__":
    kfp.compiler.Compiler().compile(s3_ls_pipeline, __file__ + ".zip")
    kfp.Client(host=KUBEFLOW_HOST).create_run_from_pipeline_func(
        s3_ls_pipeline,
        arguments={},
        experiment_name=EXPERIMENT_NAME,
    )

--------------------------------------------------------------------------------
/lesson8_download_s3/s3_sync.py:
--------------------------------------------------------------------------------
import kfp
from kfp import dsl
from kfp.aws import use_aws_secret


EXPERIMENT_NAME = 'AWS S3 sync'  # Name of the experiment in the UI
KUBEFLOW_HOST = "http://127.0.0.1:8080/pipeline"


def s3_sync():
    return kfp.dsl.ContainerOp(
        name="s3_sync",
        image="amazon/aws-cli:latest",
        command=["aws", "s3", "sync", "s3://inside-private/dataset/casa_grande/2021-04-01/", "/tmp"],
        file_outputs={
            "data": "/tmp"
        }
    )


@dsl.pipeline(name="s3_sync_pipeline", description="s3 sync pipeline.")
def s3_sync_pipeline():
    echo_task = s3_sync().apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))


if __name__ == "__main__":
    kfp.compiler.Compiler().compile(s3_sync_pipeline, __file__ + ".zip")
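    # Submit the compiled pipeline as a run through the port-forwarded
    # KFP endpoint (see lesson1_setup/step4_port_forward_gateway.sh).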
    kfp.Client(host=KUBEFLOW_HOST).create_run_from_pipeline_func(
        s3_sync_pipeline,
        arguments={},
        experiment_name=EXPERIMENT_NAME,
    )

--------------------------------------------------------------------------------
/lesson10_catboost/convert_CatBoostModel_to_ONNX/component.py:
--------------------------------------------------------------------------------
from kfp.components import InputPath, OutputPath, create_component_from_func

def convert_CatBoostModel_to_ONNX(
    model_path: InputPath('CatBoostModel'),
    converted_model_path: OutputPath('ONNX'),
):
    '''Convert CatBoost model to ONNX format.

    Args:
        model_path: Path of a trained model in binary CatBoost model format.
        converted_model_path: Output path for the converted model.

    Outputs:
        converted_model: Model in ONNX format.

    Annotations:
        author: Alexey Volkov
    '''
    from catboost import CatBoost

    model = CatBoost()
    model.load_model(model_path)
    model.save_model(converted_model_path, format="onnx")


if __name__ == '__main__':
    create_component_from_func(
        convert_CatBoostModel_to_ONNX,
        output_component_file='component.yaml',
        base_image='python:3.7',
        packages_to_install=['catboost==0.22']
    )

--------------------------------------------------------------------------------
/lesson10_catboost/convert_CatBoostModel_to_AppleCoreMLModel/component.py:
--------------------------------------------------------------------------------
from kfp.components import InputPath, OutputPath, create_component_from_func

def convert_CatBoostModel_to_AppleCoreMLModel(
    model_path: InputPath('CatBoostModel'),
    converted_model_path: OutputPath('AppleCoreMLModel'),
):
    '''Convert CatBoost model to Apple CoreML format.

    Args:
        model_path: Path of a trained model in binary CatBoost model format.
        converted_model_path: Output path for the converted model.

    Outputs:
        converted_model: Model in Apple CoreML format.

    Annotations:
        author: Alexey Volkov
    '''
    from catboost import CatBoost

    model = CatBoost()
    model.load_model(model_path)
    model.save_model(
        converted_model_path,
        format="coreml",
        # export_parameters={'prediction_type': 'probability'},
        # export_parameters={'prediction_type': 'raw'},
    )


if __name__ == '__main__':
    create_component_from_func(
        convert_CatBoostModel_to_AppleCoreMLModel,
        output_component_file='component.yaml',
        base_image='python:3.7',
        packages_to_install=['catboost==0.22']
    )

--------------------------------------------------------------------------------
/lesson2_hello_world/helloworld_python.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# Copyright 2021 Chris Hoyean Song (sjhshy@gmail.com)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import kfp

KUBEFLOW_HOST = "http://127.0.0.1:8080/pipeline"


def hello_world_component():
    ret = "Hello World!"
    print(ret)
    return ret


@kfp.dsl.pipeline(name="hello_pipeline", description="Hello World Pipeline!")
def hello_world_pipeline():
    hello_world_op = kfp.components.func_to_container_op(hello_world_component)
    _ = hello_world_op()


if __name__ == "__main__":
    kfp.compiler.Compiler().compile(hello_world_pipeline, "hello-world-pipeline.zip")
    kfp.Client(host=KUBEFLOW_HOST).create_run_from_pipeline_func(
        hello_world_pipeline, arguments={}, experiment_name="hello-world-experiment"
    )

--------------------------------------------------------------------------------
/lesson2_hello_world/helloworld_bash.py:
--------------------------------------------------------------------------------
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import kfp
from kfp import dsl

BASE_IMAGE = "library/bash:4.4.23"
KUBEFLOW_HOST = "http://127.0.0.1:8080/pipeline"


def echo_op():
    return dsl.ContainerOp(
        name="echo",
        image=BASE_IMAGE,
        command=["sh", "-c"],
        arguments=['echo "hello world"'],
    )


@dsl.pipeline(name="hello_world_bash_pipeline", description="A hello world pipeline.")
def hello_world_bash_pipeline():
    echo_task = echo_op()


if __name__ == "__main__":
    kfp.compiler.Compiler().compile(hello_world_bash_pipeline, __file__ + ".zip")
    kfp.Client(host=KUBEFLOW_HOST).create_run_from_pipeline_func(
        hello_world_bash_pipeline,
        arguments={},
        experiment_name="hello-world-bash-experiment",
    )

--------------------------------------------------------------------------------
/lesson9_tf_mnist/tf_mnist.py:
--------------------------------------------------------------------------------
import kfp
from kfp.components import func_to_container_op, OutputPath, InputPath

EXPERIMENT_NAME = 'Train TF MNIST'  # Name of the experiment in the UI
KUBEFLOW_HOST = "http://127.0.0.1:8080/pipeline"


def download_mnist(output_dir_path: OutputPath()):
    import tensorflow as tf

    tf.keras.datasets.mnist.load_data(output_dir_path)


def train_mnist(data_path: InputPath(), model_output: OutputPath()):
    import tensorflow as tf
    import numpy as np
    with np.load(data_path, allow_pickle=True) as f:
        x_train, y_train = f['x_train'], f['y_train']
        x_test, y_test = f['x_test'], f['y_test']
    print(x_train.shape)
    print(y_train.shape)

    model = tf.keras.models.Sequential([
        tf.keras.layers.Flatten(input_shape=(28, 28)),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(10)
    ])
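    # The final Dense(10) layer emits raw logits (no softmax), so the loss
    # below must be constructed with from_logits=True to be correct.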
    model.compile(
        optimizer=tf.keras.optimizers.Adam(0.001),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
    )

    model.fit(
        x_train, y_train,
    )
    model.evaluate(x_test, y_test)

    model.save(model_output)


def tf_mnist_pipeline():
    download_op = func_to_container_op(download_mnist, base_image="tensorflow/tensorflow")
    train_mnist_op = func_to_container_op(train_mnist, base_image="tensorflow/tensorflow")
    train_mnist_op(download_op().output)


if __name__ == '__main__':
    import kfp.compiler as compiler
    compiler.Compiler().compile(tf_mnist_pipeline, __file__ + '.zip')
    kfp.Client(host=KUBEFLOW_HOST).create_run_from_pipeline_func(
        tf_mnist_pipeline,
        arguments={},
        experiment_name=EXPERIMENT_NAME)

--------------------------------------------------------------------------------
/lesson10_catboost/Predict_values/from_CSV/component.py:
--------------------------------------------------------------------------------
from kfp.components import InputPath, OutputPath, create_component_from_func

def catboost_predict_values(
    data_path: InputPath('CSV'),
    model_path: InputPath('CatBoostModel'),
    predictions_path: OutputPath(),

    label_column: int = None,
):
    '''Predict values with a CatBoost model.

    Args:
        data_path: Path for the data in CSV format.
        model_path: Path for the trained model in binary CatBoostModel format.
        label_column: Column containing the label data.
        predictions_path: Output path for the predictions.

    Outputs:
        predictions: Predictions in text format.

    Annotations:
        author: Alexey Volkov
    '''
    import tempfile

    from catboost import CatBoost, Pool
    import numpy

    if label_column:
        column_descriptions = {label_column: 'Label'}
        column_description_path = tempfile.NamedTemporaryFile(delete=False).name
        with open(column_description_path, 'w') as column_description_file:
            for idx, kind in column_descriptions.items():
                column_description_file.write('{}\t{}\n'.format(idx, kind))
    else:
        column_description_path = None

    eval_data = Pool(
        data_path,
        column_description=column_description_path,
        has_header=True,
        delimiter=',',
    )

    model = CatBoost()
    model.load_model(model_path)

    predictions = model.predict(eval_data, prediction_type='RawFormulaVal')
    numpy.savetxt(predictions_path, predictions)


if __name__ == '__main__':
    catboost_predict_values_op = create_component_from_func(
        catboost_predict_values,
        output_component_file='component.yaml',
        base_image='python:3.7',
        packages_to_install=['catboost==0.23']
    )

--------------------------------------------------------------------------------
/lesson10_catboost/Predict_classes/from_CSV/component.py:
--------------------------------------------------------------------------------
from kfp.components import InputPath, OutputPath, create_component_from_func

def catboost_predict_classes(
    data_path: InputPath('CSV'),
    model_path: InputPath('CatBoostModel'),
    predictions_path: OutputPath(),

    label_column: int = None,
):
    '''Predict classes using the CatBoost classifier model.

    Args:
        data_path: Path for the data in CSV format.
        model_path: Path for the trained model in binary CatBoostModel format.
        label_column: Column containing the label data.
        predictions_path: Output path for the predictions.

    Outputs:
        predictions: Class predictions in text format.

    Annotations:
        author: Alexey Volkov
    '''
    import tempfile

    from catboost import CatBoostClassifier, Pool
    import numpy

    if label_column:
        column_descriptions = {label_column: 'Label'}
        column_description_path = tempfile.NamedTemporaryFile(delete=False).name
        with open(column_description_path, 'w') as column_description_file:
            for idx, kind in column_descriptions.items():
                column_description_file.write('{}\t{}\n'.format(idx, kind))
    else:
        column_description_path = None

    eval_data = Pool(
        data_path,
        column_description=column_description_path,
        has_header=True,
        delimiter=',',
    )

    model = CatBoostClassifier()
    model.load_model(model_path)

    predictions = model.predict(eval_data)
    numpy.savetxt(predictions_path, predictions, fmt='%s')


if __name__ == '__main__':
    catboost_predict_classes_op = create_component_from_func(
        catboost_predict_classes,
        output_component_file='component.yaml',
        base_image='python:3.7',
        packages_to_install=['catboost==0.22']
    )

--------------------------------------------------------------------------------
/lesson10_catboost/Predict_class_probabilities/from_CSV/component.py:
--------------------------------------------------------------------------------
from kfp.components import InputPath, OutputPath, create_component_from_func

def catboost_predict_class_probabilities(
    data_path: InputPath('CSV'),
    model_path: InputPath('CatBoostModel'),
    predictions_path: OutputPath(),

    label_column: int = None,
):
    '''Predict class probabilities with a CatBoost model.

    Args:
        data_path: Path for the data in CSV format.
        model_path: Path for the trained model in binary CatBoostModel format.
        label_column: Column containing the label data.
        predictions_path: Output path for the predictions.

    Outputs:
        predictions: Predictions in text format.

    Annotations:
        author: Alexey Volkov
    '''
    import tempfile

    from catboost import CatBoost, Pool
    import numpy

    if label_column:
        column_descriptions = {label_column: 'Label'}
        column_description_path = tempfile.NamedTemporaryFile(delete=False).name
        with open(column_description_path, 'w') as column_description_file:
            for idx, kind in column_descriptions.items():
                column_description_file.write('{}\t{}\n'.format(idx, kind))
    else:
        column_description_path = None

    eval_data = Pool(
        data_path,
        column_description=column_description_path,
        has_header=True,
        delimiter=',',
    )

    model = CatBoost()
    model.load_model(model_path)

    predictions = model.predict(eval_data, prediction_type='Probability')
    numpy.savetxt(predictions_path, predictions)


if __name__ == '__main__':
    catboost_predict_class_probabilities_op = create_component_from_func(
        catboost_predict_class_probabilities,
        output_component_file='component.yaml',
        base_image='python:3.7',
        packages_to_install=['catboost==0.23']
    )

--------------------------------------------------------------------------------
/lesson7_storing_data/storing_data.py:
--------------------------------------------------------------------------------
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import kfp
import kfp.dsl as dsl

EXPERIMENT_NAME = 'Storing data'  # Name of the experiment in the UI
KUBEFLOW_HOST = "http://127.0.0.1:8080/pipeline"


@dsl.pipeline(
    name="Volume Op DAG",
    description="The second example of the design doc."
)
def volume_op_dag():
    vop = dsl.VolumeOp(
        name="create_pvc",
        resource_name="my-pvc",
        size="10Gi",
        modes=dsl.VOLUME_MODE_RWM
    )

    step1 = dsl.ContainerOp(
        name="step1",
        image="library/bash:4.4.23",
        command=["sh", "-c"],
        arguments=["echo 1 | tee /mnt/file1"],
        pvolumes={"/mnt": vop.volume}
    )

    step2 = dsl.ContainerOp(
        name="step2",
        image="library/bash:4.4.23",
        command=["sh", "-c"],
        arguments=["echo 2 | tee /mnt2/file2"],
        pvolumes={"/mnt2": vop.volume}
    )

    step3 = dsl.ContainerOp(
        name="step3",
        image="library/bash:4.4.23",
        command=["sh", "-c"],
        arguments=["cat /mnt/file1 /mnt/file2"],
        pvolumes={"/mnt": vop.volume.after(step1, step2)}
    )


if __name__ == "__main__":
    import kfp.compiler as compiler
    compiler.Compiler().compile(volume_op_dag, __file__ + ".tar.gz")
    kfp.Client(host=KUBEFLOW_HOST).create_run_from_pipeline_func(
        volume_op_dag,
        arguments={},
        experiment_name=EXPERIMENT_NAME)

--------------------------------------------------------------------------------
/lesson4_parallel/parallel_execution.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import kfp
from kfp import dsl


EXPERIMENT_NAME = 'Parallel execution'  # Name of the experiment in the UI
BASE_IMAGE = "python:3.7"
KUBEFLOW_HOST = "http://127.0.0.1:8080/pipeline"


def gcs_download_op(url):
    return dsl.ContainerOp(
        name='GCS - Download',
        image='google/cloud-sdk:272.0.0',
        command=['sh', '-c'],
        arguments=['gsutil cat $0 | tee $1', url, '/tmp/results.txt'],
        file_outputs={
            'data': '/tmp/results.txt',
        }
    )


def echo2_op(text1, text2):
    return dsl.ContainerOp(
        name='echo',
        image='library/bash:4.4.23',
        command=['sh', '-c'],
        arguments=['echo "Text 1: $0"; echo "Text 2: $1"', text1, text2]
    )


@dsl.pipeline(
    name='Parallel pipeline',
    description='Downloads two messages in parallel and prints the concatenated result.'
)
def download_and_join(
    url1='gs://ml-pipeline-playground/shakespeare1.txt',
    url2='gs://ml-pipeline-playground/shakespeare2.txt'
):
    """A three-step pipeline with first two running in parallel."""

    download1_task = gcs_download_op(url1)
    download2_task = gcs_download_op(url2)

    echo_task = echo2_op(download1_task.output, download2_task.output)


if __name__ == '__main__':
    # kfp.compiler.Compiler().compile(download_and_join, __file__ + '.zip')
    kfp.Client(host=KUBEFLOW_HOST).create_run_from_pipeline_func(
        download_and_join,
        arguments={},
        experiment_name=EXPERIMENT_NAME)

--------------------------------------------------------------------------------
/lesson3_add/add_python.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# Copyright 2021 Chris Hoyean Song (sjhshy@gmail.com)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import kfp
from kfp import components
from kfp import dsl


EXPERIMENT_NAME = 'Add number pipeline'  # Name of the experiment in the UI
BASE_IMAGE = "python:3.7"
KUBEFLOW_HOST = "http://127.0.0.1:8080/pipeline"


@dsl.python_component(
    name='add_op',
    description='adds two numbers',
    base_image=BASE_IMAGE  # you can define the base image here, or when you build in the next step.
)
def add(a: float, b: float) -> float:
    '''Calculates sum of two arguments'''
    print(a, '+', b, '=', a + b)
    return a + b


# Convert the function to a pipeline operation.
add_op = components.func_to_container_op(
    add,
    base_image=BASE_IMAGE,
)

@dsl.pipeline(
    name='Calculation pipeline',
    description='A toy pipeline that performs arithmetic calculations.'
)
def calc_pipeline(
    a: float = 0,
    b: float = 7
):
    # Passing pipeline parameter and a constant value as operation arguments
    add_task = add_op(a, 4)  # Returns a dsl.ContainerOp class instance.
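    # add_task.output refers to the component's single return value; feeding it
    # into another op below is what creates the data dependency between tasks.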

    # You can create explicit dependency between the tasks using xyz_task.after(abc_task)
    add_2_task = add_op(a, b)

    add_3_task = add_op(add_task.output, add_2_task.output)


if __name__ == "__main__":
    # Specify pipeline argument values
    arguments = {'a': '7', 'b': '8'}
    # Launch a pipeline run given the pipeline function definition
    kfp.Client(host=KUBEFLOW_HOST).create_run_from_pipeline_func(
        calc_pipeline,
        arguments=arguments,
        experiment_name=EXPERIMENT_NAME)
    # The generated links below lead to the Experiment page and the pipeline run details page, respectively

--------------------------------------------------------------------------------
/lesson10_catboost/convert_CatBoostModel_to_ONNX/component.yaml:
--------------------------------------------------------------------------------
name: Convert CatBoostModel to ONNX
description: |-
  Convert CatBoost model to ONNX format.

  Args:
      model_path: Path of a trained model in binary CatBoost model format.
      converted_model_path: Output path for the converted model.

  Outputs:
      converted_model: Model in ONNX format.

  Annotations:
      author: Alexey Volkov
inputs:
- {name: model, type: CatBoostModel}
outputs:
- {name: converted_model, type: ONNX}
implementation:
  container:
    image: python:3.7
    command:
    - sh
    - -c
    - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
      'catboost==0.22' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet
      --no-warn-script-location 'catboost==0.22' --user) && "$0" "$@"
    - python3
    - -u
    - -c
    - |
      def _make_parent_dirs_and_return_path(file_path: str):
          import os
          os.makedirs(os.path.dirname(file_path), exist_ok=True)
          return file_path

      def convert_CatBoostModel_to_ONNX(
          model_path,
          converted_model_path,
      ):
          '''Convert CatBoost model to ONNX format.

          Args:
              model_path: Path of a trained model in binary CatBoost model format.
              converted_model_path: Output path for the converted model.

          Outputs:
              converted_model: Model in ONNX format.

          Annotations:
              author: Alexey Volkov
          '''
          from catboost import CatBoost

          model = CatBoost()
          model.load_model(model_path)
          model.save_model(converted_model_path, format="onnx")

      import argparse
      _parser = argparse.ArgumentParser(prog='Convert CatBoostModel to ONNX', description='Convert CatBoost model to ONNX format.\n\n    Args:\n        model_path: Path of a trained model in binary CatBoost model format.\n        converted_model_path: Output path for the converted model.\n\n    Outputs:\n        converted_model: Model in ONNX format.\n\n    Annotations:\n        author: Alexey Volkov ')
      _parser.add_argument("--model", dest="model_path", type=str, required=True, default=argparse.SUPPRESS)
      _parser.add_argument("--converted-model", dest="converted_model_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)
      _parsed_args = vars(_parser.parse_args())

      _outputs = convert_CatBoostModel_to_ONNX(**_parsed_args)
    args:
    - --model
    - {inputPath: model}
    - --converted-model
    - {outputPath: converted_model}

--------------------------------------------------------------------------------
/lesson10_catboost/convert_CatBoostModel_to_AppleCoreMLModel/component.yaml:
--------------------------------------------------------------------------------
name: Convert CatBoostModel to AppleCoreMLModel
description: |-
  Convert CatBoost model to Apple CoreML format.

  Args:
      model_path: Path of a trained model in binary CatBoost model format.
      converted_model_path: Output path for the converted model.

  Outputs:
      converted_model: Model in Apple CoreML format.

  Annotations:
      author: Alexey Volkov
inputs:
- {name: model, type: CatBoostModel}
outputs:
- {name: converted_model, type: AppleCoreMLModel}
implementation:
  container:
    image: python:3.7
    command:
    - sh
    - -c
    - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
      'catboost==0.22' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet
      --no-warn-script-location 'catboost==0.22' --user) && "$0" "$@"
    - python3
    - -u
    - -c
    - |
      def _make_parent_dirs_and_return_path(file_path: str):
          import os
          os.makedirs(os.path.dirname(file_path), exist_ok=True)
          return file_path

      def convert_CatBoostModel_to_AppleCoreMLModel(
          model_path,
          converted_model_path,
      ):
          '''Convert CatBoost model to Apple CoreML format.

          Args:
              model_path: Path of a trained model in binary CatBoost model format.
              converted_model_path: Output path for the converted model.

          Outputs:
              converted_model: Model in Apple CoreML format.

          Annotations:
              author: Alexey Volkov
          '''
          from catboost import CatBoost

          model = CatBoost()
          model.load_model(model_path)
          model.save_model(
              converted_model_path,
              format="coreml",
              # export_parameters={'prediction_type': 'probability'},
              # export_parameters={'prediction_type': 'raw'},
          )

      import argparse
      _parser = argparse.ArgumentParser(prog='Convert CatBoostModel to AppleCoreMLModel', description='Convert CatBoost model to Apple CoreML format.\n\n    Args:\n        model_path: Path of a trained model in binary CatBoost model format.\n        converted_model_path: Output path for the converted model.\n\n    Outputs:\n        converted_model: Model in Apple CoreML format.\n\n    Annotations:\n        author: Alexey Volkov ')
      _parser.add_argument("--model", dest="model_path", type=str, required=True, default=argparse.SUPPRESS)
      _parser.add_argument("--converted-model", dest="converted_model_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)
      _parsed_args = vars(_parser.parse_args())

      _outputs = convert_CatBoostModel_to_AppleCoreMLModel(**_parsed_args)
    args:
    - --model
    - {inputPath: model}
    - --converted-model
    - {outputPath: converted_model}

--------------------------------------------------------------------------------
/lesson10_catboost/Train_regression/from_CSV/component.py:
--------------------------------------------------------------------------------
from kfp.components import InputPath, OutputPath, create_component_from_func

def catboost_train_regression(
    training_data_path: InputPath('CSV'),
    model_path: OutputPath('CatBoostModel'),
    starting_model_path: InputPath('CatBoostModel') = None,
    label_column: int = 0,

    loss_function: str = 'RMSE',
    num_iterations: int = 500,
    learning_rate: float = None,
    depth: int = 6,
    random_seed: int = 0,

    cat_features: list = None,

    additional_training_options: dict = {},
):
    '''Train a CatBoost regression model.

    Args:
        training_data_path: Path for the training data in CSV format.
        model_path: Output path for the trained model in binary CatBoostModel format.
        starting_model_path: Path for the existing trained model to start from.
        label_column: Column containing the label data.

        loss_function: The metric to use in training and also selector of the machine learning
            problem to solve. Default = 'RMSE'. Possible values:
            'RMSE', 'MAE', 'Quantile:alpha=value', 'LogLinQuantile:alpha=value', 'Poisson', 'MAPE', 'Lq:q=value'
        num_iterations: Number of trees to add to the ensemble.
        learning_rate: Step size shrinkage used in update to prevent overfitting.
            Default value is selected automatically for binary classification with other parameters set to default.
            In all other cases default is 0.03.
        depth: Depth of a tree. All trees are the same depth. Default = 6
        random_seed: Random number seed. Default = 0

        cat_features: A list of Categorical features (indices or names).
        additional_training_options: A dictionary with additional options to pass to CatBoostRegressor

    Outputs:
        model: Trained model in binary CatBoostModel format.

    Annotations:
        author: Alexey Volkov
    '''
    import tempfile
    from pathlib import Path

    from catboost import CatBoostRegressor, Pool

    column_descriptions = {label_column: 'Label'}
    column_description_path = tempfile.NamedTemporaryFile(delete=False).name
    with open(column_description_path, 'w') as column_description_file:
        for idx, kind in column_descriptions.items():
            column_description_file.write('{}\t{}\n'.format(idx, kind))

    train_data = Pool(
        training_data_path,
        column_description=column_description_path,
        has_header=True,
        delimiter=',',
    )

    model = CatBoostRegressor(
        iterations=num_iterations,
        depth=depth,
        learning_rate=learning_rate,
        loss_function=loss_function,
        random_seed=random_seed,
        verbose=True,
        **additional_training_options,
    )

    model.fit(
        train_data,
        cat_features=cat_features,
        init_model=starting_model_path,
        #verbose=False,
        #plot=True,
    )
    Path(model_path).parent.mkdir(parents=True, exist_ok=True)
    model.save_model(model_path)


if __name__ == '__main__':
    catboost_train_regression_op = create_component_from_func(
        catboost_train_regression,
        output_component_file='component.yaml',
        base_image='python:3.7',
        packages_to_install=['catboost==0.23']
    )

--------------------------------------------------------------------------------
/lesson10_catboost/Train_classifier/from_CSV/component.py:
--------------------------------------------------------------------------------
from kfp.components import InputPath, OutputPath, create_component_from_func

def catboost_train_classifier(
    training_data_path: InputPath('CSV'),
    model_path: OutputPath('CatBoostModel'),
    starting_model_path: InputPath('CatBoostModel') = None,
    label_column: int = 0,

    loss_function: str = 'Logloss',
    num_iterations: int = 500,
    learning_rate: float = None,
    depth: int = 6,
    random_seed: int = 0,

    cat_features: list = None,
    text_features: list = None,

    additional_training_options: dict = {},
):
    '''Train a CatBoost classifier model.

    Args:
        training_data_path: Path for the training data in CSV format.
        model_path: Output path for the trained model in binary CatBoostModel format.
        starting_model_path: Path for the existing trained model to start from.
        label_column: Column containing the label data.

        loss_function: The metric to use in training and also selector of the machine learning
            problem to solve. Default = 'Logloss'
        num_iterations: Number of trees to add to the ensemble.
        learning_rate: Step size shrinkage used in update to prevent overfitting.
            Default value is selected automatically for binary classification with other parameters set to default.
            In all other cases default is 0.03.
        depth: Depth of a tree. All trees are the same depth. Default = 6
        random_seed: Random number seed. Default = 0

        cat_features: A list of Categorical features (indices or names).
        text_features: A list of Text features (indices or names).
        additional_training_options: A dictionary with additional options to pass to CatBoostClassifier

    Outputs:
        model: Trained model in binary CatBoostModel format.

    Annotations:
        author: Alexey Volkov
    '''
    import tempfile
    from pathlib import Path

    from catboost import CatBoostClassifier, Pool

    column_descriptions = {label_column: 'Label'}
    column_description_path = tempfile.NamedTemporaryFile(delete=False).name
    with open(column_description_path, 'w') as column_description_file:
        for idx, kind in column_descriptions.items():
            column_description_file.write('{}\t{}\n'.format(idx, kind))

    train_data = Pool(
        training_data_path,
        column_description=column_description_path,
        has_header=True,
        delimiter=',',
    )

    model = CatBoostClassifier(
        iterations=num_iterations,
        depth=depth,
        learning_rate=learning_rate,
        loss_function=loss_function,
        random_seed=random_seed,
        verbose=True,
        **additional_training_options,
    )

    model.fit(
        train_data,
        cat_features=cat_features,
        text_features=text_features,
        init_model=starting_model_path,
        #verbose=False,
        #plot=True,
    )
    Path(model_path).parent.mkdir(parents=True, exist_ok=True)
    model.save_model(model_path)


if __name__ == '__main__':
    catboost_train_classifier_op = create_component_from_func(
        catboost_train_classifier,
        output_component_file='component.yaml',
        base_image='python:3.7',
        packages_to_install=['catboost==0.23']
    )

--------------------------------------------------------------------------------
/lesson7_output_a_directory/output_a_directory.py:
--------------------------------------------------------------------------------
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This sample shows how components can output directories.
# Outputting a directory is performed the same way as outputting a file:
# the component receives an output path, writes data at that path, and the
# system takes that data and makes it available for the downstream components.
# To output a file, create a new file at the output path location.
# To output a directory, create a new directory at the output path location.
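# For example, a component declared as `def my_op(out_path: OutputPath())` can
# either open(out_path, 'w') to emit a single file or os.makedirs(out_path) to
# emit a directory tree; a downstream component receives it via InputPath().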


import kfp
from kfp.components import create_component_from_func, load_component_from_text, InputPath, OutputPath


EXPERIMENT_NAME = 'Output a directory'  # Name of the experiment in the UI
KUBEFLOW_HOST = "http://127.0.0.1:8080/pipeline"


# Outputting directories from Python-based components:

@create_component_from_func
def produce_dir_with_files_python_op(output_dir_path: OutputPath(), num_files: int = 10):
    import os
    os.makedirs(output_dir_path, exist_ok=True)
    for i in range(num_files):
        file_path = os.path.join(output_dir_path, str(i) + '.txt')
        with open(file_path, 'w') as f:
            f.write(str(i))


@create_component_from_func
def list_dir_files_python_op(input_dir_path: InputPath()):
    import os
    dir_items = os.listdir(input_dir_path)
    for dir_item in dir_items:
        print(dir_item)


# Outputting directories from general command-line based components:

produce_dir_with_files_general_op = load_component_from_text('''
name: Produce directory
inputs:
- {name: num_files, type: Integer}
outputs:
- {name: output_dir}
implementation:
  container:
    image: alpine
    command:
    - sh
    - -ecx
    - |
      num_files="$0"
      output_path="$1"
      mkdir -p "$output_path"
      for i in $(seq "$num_files"); do
        echo "$i" > "$output_path/${i}.txt"
      done
    - {inputValue: num_files}
    - {outputPath: output_dir}
''')


list_dir_files_general_op = load_component_from_text('''
name: List dir files
inputs:
- {name: input_dir}
implementation:
  container:
    image: alpine
    command:
    - ls
    - {inputPath: input_dir}
''')


# Test pipeline

def dir_pipeline():
    produce_dir_python_task = produce_dir_with_files_python_op(num_files=15)
    list_dir_files_python_op(input_dir=produce_dir_python_task.output)

    produce_dir_general_task = produce_dir_with_files_general_op(num_files=15)
    list_dir_files_general_op(input_dir=produce_dir_general_task.output)


if __name__ == '__main__':
    kfp.Client(host=KUBEFLOW_HOST).create_run_from_pipeline_func(
        dir_pipeline,
        arguments={},
        experiment_name=EXPERIMENT_NAME)

--------------------------------------------------------------------------------
/first_project/train_pipeline.py:
--------------------------------------------------------------------------------
import kfp.dsl as dsl
import kfp.gcp as gcp
import kfp.onprem as onprem

from string import Template
import json


@dsl.pipeline(name='Simple sci-kit KF Pipeline',
              description='A simple end-to-end scikit-learn Seldon KF pipeline')
def mnist_train_pipeline(docker_org="index.docker.io/seldonio",
                         train_container_version="0.2",
                         serve_container_version="0.1"):

    vop = dsl.VolumeOp(name="create_pvc",
                       resource_name="nfs-1",
                       modes=dsl.VOLUME_MODE_RWO,
                       size="10G")
    volume = vop.volume
    train = dsl.ContainerOp(
        name='sk-train',
        image=f"{docker_org}/skmnistclassifier_trainer:{train_container_version}",
        pvolumes={"/data": volume})

    seldon_serving_json_template = Template("""
{
    "apiVersion": "machinelearning.seldon.io/v1alpha2",
    "kind": "SeldonDeployment",
    "metadata": {
        "labels": {
            "app": "seldon"
        },
        "name": "mnist-classifier"
    },
    "spec": {
"annotations": { 38 | "deployment_version": "v1", 39 | "project_name": "MNIST Example" 40 | }, 41 | "name": "mnist-classifier", 42 | "predictors": [ 43 | { 44 | "annotations": { 45 | "predictor_version": "v1" 46 | }, 47 | "componentSpecs": [ 48 | { 49 | "spec": { 50 | "containers": [ 51 | { 52 | "image": "$dockerreposerving:$dockertagserving", 53 | "imagePullPolicy": "Always", 54 | "name": "mnist-classifier", 55 | "volumeMounts": [ 56 | { 57 | "mountPath": "/data", 58 | "name": "persistent-storage" 59 | } 60 | ] 61 | } 62 | ], 63 | "terminationGracePeriodSeconds": 1, 64 | "volumes": [ 65 | { 66 | "name": "persistent-storage", 67 | "persistentVolumeClaim": { 68 | "claimName": "$modelpvc" 69 | } 70 | } 71 | ] 72 | } 73 | } 74 | ], 75 | "graph": { 76 | "children": [], 77 | "endpoint": { 78 | "type": "REST" 79 | }, 80 | "name": "mnist-classifier", 81 | "type": "MODEL" 82 | }, 83 | "name": "mnist-classifier", 84 | "replicas": 1 85 | } 86 | ] 87 | } 88 | } 89 | """) 90 | seldon_serving_json = seldon_serving_json_template.substitute({ 91 | 'dockerreposerving': 92 | f"{docker_org}/skmnistclassifier_runtime", 93 | 'dockertagserving': 94 | str(serve_container_version), 95 | 'modelpvc': 96 | vop.outputs["name"] 97 | }) 98 | 99 | seldon_deployment = json.loads(seldon_serving_json) 100 | 101 | serve = dsl.ResourceOp( 102 | name='serve', 103 | k8s_resource=seldon_deployment, 104 | success_condition='status.state == Available').after(train) 105 | 106 | 107 | # If we're called directly create an expirement and run 108 | if __name__ == '__main__': 109 | pipeline_func = mnist_train_pipeline 110 | pipeline_filename = pipeline_func.__name__ + '.pipeline.zip' 111 | import kfp.compiler as compiler 112 | compiler.Compiler().compile(pipeline_func, pipeline_filename) 113 | expirement_name = "cheese" 114 | # experiment = client.create_experiment(expirement_name) 115 | # run_name = pipeline_func.__name__ + ' run' 116 | # run_result = client.run_pipeline(experiment.id, run_name, 117 | # pipeline_filename, arguments) 118 | # print(run_result) -------------------------------------------------------------------------------- /lessonx_mnist_pipeline/mnist_pipeline.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | Kubeflow Pipelines MNIST example 16 | Run this script to compile pipeline 17 | """ 18 | 19 | 20 | import kfp 21 | import kfp.dsl as dsl 22 | import kfp.gcp as gcp 23 | import kfp.onprem as onprem 24 | 25 | platform = 'onprem' 26 | EXPERIMENT_NAME = 'MNIST pipeline' # Name of the experiment in the UI 27 | KUBEFLOW_HOST = "http://127.0.0.1:8080/pipeline" 28 | 29 | 30 | @dsl.pipeline( 31 | name='MNIST', 32 | description='A pipeline to train and serve the MNIST example.' 
)
def mnist_pipeline(model_export_dir='gs://your-bucket/export',
                   train_steps='200',
                   learning_rate='0.01',
                   batch_size='100',
                   pvc_name=''):
    """
    Pipeline with three stages:
      1. train an MNIST classifier
      2. deploy a tf-serving instance to the cluster
      3. deploy a web-ui to interact with it
    """
    train = dsl.ContainerOp(
        name='train',
        image='gcr.io/kubeflow-examples/mnist/model:v20190304-v0.2-176-g15d997b',
        arguments=[
            "/opt/model.py",
            "--tf-export-dir", model_export_dir,
            "--tf-train-steps", train_steps,
            "--tf-batch-size", batch_size,
            "--tf-learning-rate", learning_rate
        ]
    )

    serve_args = [
        '--model-export-path', model_export_dir,
        '--server-name', "mnist-service"
    ]
    if platform != 'GCP':
        serve_args.extend([
            '--cluster-name', "mnist-pipeline",
            '--pvc-name', pvc_name
        ])

    serve = dsl.ContainerOp(
        name='serve',
        image='gcr.io/ml-pipeline/ml-pipeline-kubeflow-deployer:'
              '7775692adf28d6f79098e76e839986c9ee55dd61',
        arguments=serve_args
    )
    serve.after(train)

    webui_args = [
        '--image', 'gcr.io/kubeflow-examples/mnist/web-ui:'
                   'v20190304-v0.2-176-g15d997b-pipelines',
        '--name', 'web-ui',
        '--container-port', '5000',
        '--service-port', '80',
        '--service-type', "LoadBalancer"
    ]
    if platform != 'GCP':
        webui_args.extend([
            '--cluster-name', "mnist-pipeline"
        ])

    web_ui = dsl.ContainerOp(
        name='web-ui',
        image='gcr.io/kubeflow-examples/mnist/deploy-service:latest',
        arguments=webui_args
    )
    web_ui.after(serve)

    steps = [train, serve, web_ui]
    for step in steps:
        if platform == 'GCP':
            step.apply(gcp.use_gcp_secret('user-gcp-sa'))
        else:
            step.apply(onprem.mount_pvc(pvc_name, 'task-pv-volume', '/mnt'))


if __name__ == '__main__':
    import kfp.compiler as compiler
    compiler.Compiler().compile(mnist_pipeline, __file__ + '.tar.gz')

    # Launch a pipeline run given the pipeline function definition
    kfp.Client(host=KUBEFLOW_HOST).create_run_from_pipeline_func(
        mnist_pipeline,
        arguments={
            "model_export_dir": "/mnt/export",
            "pvc_name": "task-pv-claim"},
        experiment_name=EXPERIMENT_NAME)
    # The generated links below lead to the Experiment page and the pipeline run details page, respectively

--------------------------------------------------------------------------------
/lesson10_catboost/catboost_pipeline.py:
--------------------------------------------------------------------------------
import kfp
from kfp import components

EXPERIMENT_NAME = 'CatBoost pipeline'  # Name of the experiment in the UI
KUBEFLOW_HOST = "http://127.0.0.1:8080/pipeline"

chicago_taxi_dataset_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/e3337b8bdcd63636934954e592d4b32c95b49129/components/datasets/Chicago%20Taxi/component.yaml')
pandas_transform_csv_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/e69a6694/components/pandas/Transform_DataFrame/in_CSV_format/component.yaml')

catboost_train_classifier_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/f97ad2/components/CatBoost/Train_classifier/from_CSV/component.yaml')
catboost_train_regression_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/f97ad2/components/CatBoost/Train_regression/from_CSV/component.yaml')
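# The commit refs in these URLs (e3337b8..., e69a6694, f97ad2) pin each
# component to a fixed revision of the kubeflow/pipelines repo, so the
# pipeline keeps working even if the components change upstream.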
catboost_predict_classes_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/f97ad2/components/CatBoost/Predict_classes/from_CSV/component.yaml')
catboost_predict_values_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/f97ad2/components/CatBoost/Predict_values/from_CSV/component.yaml')
catboost_predict_class_probabilities_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/f97ad2/components/CatBoost/Predict_class_probabilities/from_CSV/component.yaml')
catboost_to_apple_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/f97ad2/components/CatBoost/convert_CatBoostModel_to_AppleCoreMLModel/component.yaml')
catboost_to_onnx_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/f97ad2/components/CatBoost/convert_CatBoostModel_to_ONNX/component.yaml')


def catboost_pipeline():
    training_data_in_csv = chicago_taxi_dataset_op(
        where='trip_start_timestamp >= "2019-01-01" AND trip_start_timestamp < "2019-02-01"',
        select='tips,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tolls,extras,trip_total',
        limit=10000,
    ).output

    training_data_for_classification_in_csv = pandas_transform_csv_op(
        table=training_data_in_csv,
        transform_code='''df.insert(0, "was_tipped", df["tips"] > 0); del df["tips"]''',
    ).output

    catboost_train_regression_task = catboost_train_regression_op(
        training_data=training_data_in_csv,
        loss_function='RMSE',
        label_column=0,
        num_iterations=200,
    )

    regression_model = catboost_train_regression_task.outputs['model']

    catboost_train_classifier_task = catboost_train_classifier_op(
        training_data=training_data_for_classification_in_csv,
        label_column=0,
        num_iterations=200,
    )

    classification_model = catboost_train_classifier_task.outputs['model']

    evaluation_data_for_regression_in_csv = training_data_in_csv
    evaluation_data_for_classification_in_csv = training_data_for_classification_in_csv

    catboost_predict_values_op(
        data=evaluation_data_for_regression_in_csv,
        model=regression_model,
        label_column=0,
    )

    catboost_predict_classes_op(
        data=evaluation_data_for_classification_in_csv,
        model=classification_model,
        label_column=0,
    )

    catboost_predict_class_probabilities_op(
        data=evaluation_data_for_classification_in_csv,
        model=classification_model,
        label_column=0,
    )

    catboost_to_apple_op(regression_model)
    catboost_to_apple_op(classification_model)

    catboost_to_onnx_op(regression_model)
    catboost_to_onnx_op(classification_model)


if __name__ == '__main__':
    kfp.compiler.Compiler().compile(catboost_pipeline, __file__ + '.zip')
    kfp.Client(host=KUBEFLOW_HOST).create_run_from_pipeline_func(
        catboost_pipeline,
        arguments={},
        experiment_name=EXPERIMENT_NAME)

--------------------------------------------------------------------------------
/lesson10_catboost/Predict_values/from_CSV/component.yaml:
--------------------------------------------------------------------------------
name: Catboost predict values
Catboost predict values 2 | description: |- 3 | Predict values with a CatBoost model. 4 | 5 | Args: 6 | data_path: Path for the data in CSV format. 7 | model_path: Path for the trained model in binary CatBoostModel format. 8 | label_column: Column containing the label data. 9 | predictions_path: Output path for the predictions. 10 | 11 | Outputs: 12 | predictions: Predictions in text format. 13 | 14 | Annotations: 15 | author: Alexey Volkov 16 | inputs: 17 | - {name: data, type: CSV} 18 | - {name: model, type: CatBoostModel} 19 | - {name: label_column, type: Integer, optional: true} 20 | outputs: 21 | - {name: predictions} 22 | implementation: 23 | container: 24 | image: python:3.7 25 | command: 26 | - sh 27 | - -c 28 | - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 29 | 'catboost==0.23' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet 30 | --no-warn-script-location 'catboost==0.23' --user) && "$0" "$@" 31 | - python3 32 | - -u 33 | - -c 34 | - | 35 | def _make_parent_dirs_and_return_path(file_path: str): 36 | import os 37 | os.makedirs(os.path.dirname(file_path), exist_ok=True) 38 | return file_path 39 | 40 | def catboost_predict_values( 41 | data_path, 42 | model_path, 43 | predictions_path, 44 | 45 | label_column = None, 46 | ): 47 | '''Predict values with a CatBoost model. 48 | 49 | Args: 50 | data_path: Path for the data in CSV format. 51 | model_path: Path for the trained model in binary CatBoostModel format. 52 | label_column: Column containing the label data. 53 | predictions_path: Output path for the predictions. 54 | 55 | Outputs: 56 | predictions: Predictions in text format. 57 | 58 | Annotations: 59 | author: Alexey Volkov 60 | ''' 61 | import tempfile 62 | 63 | from catboost import CatBoost, Pool 64 | import numpy 65 | 66 | if label_column: 67 | column_descriptions = {label_column: 'Label'} 68 | column_description_path = tempfile.NamedTemporaryFile(delete=False).name 69 | with open(column_description_path, 'w') as column_description_file: 70 | for idx, kind in column_descriptions.items(): 71 | column_description_file.write('{}\t{}\n'.format(idx, kind)) 72 | else: 73 | column_description_path = None 74 | 75 | eval_data = Pool( 76 | data_path, 77 | column_description=column_description_path, 78 | has_header=True, 79 | delimiter=',', 80 | ) 81 | 82 | model = CatBoost() 83 | model.load_model(model_path) 84 | 85 | predictions = model.predict(eval_data, prediction_type='RawFormulaVal') 86 | numpy.savetxt(predictions_path, predictions) 87 | 88 | import argparse 89 | _parser = argparse.ArgumentParser(prog='Catboost predict values', description='Predict values with a CatBoost model.\n\n Args:\n data_path: Path for the data in CSV format.\n model_path: Path for the trained model in binary CatBoostModel format.\n label_column: Column containing the label data.\n predictions_path: Output path for the predictions.\n\n Outputs:\n predictions: Predictions in text format.\n\n Annotations:\n author: Alexey Volkov ') 90 | _parser.add_argument("--data", dest="data_path", type=str, required=True, default=argparse.SUPPRESS) 91 | _parser.add_argument("--model", dest="model_path", type=str, required=True, default=argparse.SUPPRESS) 92 | _parser.add_argument("--label-column", dest="label_column", type=int, required=False, default=argparse.SUPPRESS) 93 | _parser.add_argument("--predictions", dest="predictions_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) 94 | _parsed_args = 
vars(_parser.parse_args()) 95 | 96 | _outputs = catboost_predict_values(**_parsed_args) 97 | args: 98 | - --data 99 | - {inputPath: data} 100 | - --model 101 | - {inputPath: model} 102 | - if: 103 | cond: {isPresent: label_column} 104 | then: 105 | - --label-column 106 | - {inputValue: label_column} 107 | - --predictions 108 | - {outputPath: predictions} 109 | -------------------------------------------------------------------------------- /lesson10_catboost/Predict_classes/from_CSV/component.yaml: -------------------------------------------------------------------------------- 1 | name: Catboost predict classes 2 | description: |- 3 | Predict classes using the CatBoost classifier model. 4 | 5 | Args: 6 | data_path: Path for the data in CSV format. 7 | model_path: Path for the trained model in binary CatBoostModel format. 8 | label_column: Column containing the label data. 9 | predictions_path: Output path for the predictions. 10 | 11 | Outputs: 12 | predictions: Class predictions in text format. 13 | 14 | Annotations: 15 | author: Alexey Volkov 16 | inputs: 17 | - {name: data, type: CSV} 18 | - {name: model, type: CatBoostModel} 19 | - {name: label_column, type: Integer, optional: true} 20 | outputs: 21 | - {name: predictions} 22 | implementation: 23 | container: 24 | image: python:3.7 25 | command: 26 | - sh 27 | - -c 28 | - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 29 | 'catboost==0.22' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet 30 | --no-warn-script-location 'catboost==0.22' --user) && "$0" "$@" 31 | - python3 32 | - -u 33 | - -c 34 | - | 35 | def _make_parent_dirs_and_return_path(file_path: str): 36 | import os 37 | os.makedirs(os.path.dirname(file_path), exist_ok=True) 38 | return file_path 39 | 40 | def catboost_predict_classes( 41 | data_path, 42 | model_path, 43 | predictions_path, 44 | 45 | label_column = None, 46 | ): 47 | '''Predict classes using the CatBoost classifier model. 48 | 49 | Args: 50 | data_path: Path for the data in CSV format. 51 | model_path: Path for the trained model in binary CatBoostModel format. 52 | label_column: Column containing the label data. 53 | predictions_path: Output path for the predictions. 54 | 55 | Outputs: 56 | predictions: Class predictions in text format. 
57 | 58 | Annotations: 59 | author: Alexey Volkov 60 | ''' 61 | import tempfile 62 | 63 | from catboost import CatBoostClassifier, Pool 64 | import numpy 65 | 66 | if label_column: 67 | column_descriptions = {label_column: 'Label'} 68 | column_description_path = tempfile.NamedTemporaryFile(delete=False).name 69 | with open(column_description_path, 'w') as column_description_file: 70 | for idx, kind in column_descriptions.items(): 71 | column_description_file.write('{}\t{}\n'.format(idx, kind)) 72 | else: 73 | column_description_path = None 74 | 75 | eval_data = Pool( 76 | data_path, 77 | column_description=column_description_path, 78 | has_header=True, 79 | delimiter=',', 80 | ) 81 | 82 | model = CatBoostClassifier() 83 | model.load_model(model_path) 84 | 85 | predictions = model.predict(eval_data) 86 | numpy.savetxt(predictions_path, predictions, fmt='%s') 87 | 88 | import argparse 89 | _parser = argparse.ArgumentParser(prog='Catboost predict classes', description='Predict classes using the CatBoost classifier model.\n\n Args:\n data_path: Path for the data in CSV format.\n model_path: Path for the trained model in binary CatBoostModel format.\n label_column: Column containing the label data.\n predictions_path: Output path for the predictions.\n\n Outputs:\n predictions: Class predictions in text format.\n\n Annotations:\n author: Alexey Volkov ') 90 | _parser.add_argument("--data", dest="data_path", type=str, required=True, default=argparse.SUPPRESS) 91 | _parser.add_argument("--model", dest="model_path", type=str, required=True, default=argparse.SUPPRESS) 92 | _parser.add_argument("--label-column", dest="label_column", type=int, required=False, default=argparse.SUPPRESS) 93 | _parser.add_argument("--predictions", dest="predictions_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) 94 | _parsed_args = vars(_parser.parse_args()) 95 | 96 | _outputs = catboost_predict_classes(**_parsed_args) 97 | args: 98 | - --data 99 | - {inputPath: data} 100 | - --model 101 | - {inputPath: model} 102 | - if: 103 | cond: {isPresent: label_column} 104 | then: 105 | - --label-column 106 | - {inputValue: label_column} 107 | - --predictions 108 | - {outputPath: predictions} 109 | -------------------------------------------------------------------------------- /lesson5_control_structure/control_structure.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2020 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | # %% [markdown] 18 | # # DSL control structures tutorial 19 | # Shows how to use conditional execution and exit handlers. 
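# (A compiled example is checked in alongside this file: control.yaml in this lesson is the Argo
# workflow produced for the conditional pipeline below, where each `dsl.Condition` shows up as a
# `when:` clause such as
# when: '"{{tasks.flip-coin-op.outputs.parameters.flip-coin-op-Output}}" == "heads"'.)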
20 | 21 | # %% 22 | from typing import NamedTuple 23 | 24 | import kfp 25 | from kfp import dsl 26 | from kfp.components import func_to_container_op, InputPath, OutputPath 27 | 28 | # %% [markdown] 29 | # ## Conditional execution 30 | # You can use the `with dsl.Condition(task1.outputs["output_name"] == "value"):` context to execute parts of the pipeline conditionally. 31 | 32 | # %% 33 | 34 | @func_to_container_op 35 | def get_random_int_op(minimum: int, maximum: int) -> int: 36 | """Generate a random number between minimum and maximum (inclusive).""" 37 | import random 38 | result = random.randint(minimum, maximum) 39 | print(result) 40 | return result 41 | 42 | 43 | @func_to_container_op 44 | def flip_coin_op() -> str: 45 | """Flip a coin and output heads or tails randomly.""" 46 | import random 47 | result = random.choice(['heads', 'tails']) 48 | print(result) 49 | return result 50 | 51 | 52 | @func_to_container_op 53 | def print_op(message: str): 54 | """Print a message.""" 55 | print(message) 56 | 57 | 58 | @dsl.pipeline( 59 | name='Conditional execution pipeline', 60 | description='Shows how to use dsl.Condition().' 61 | ) 62 | def flipcoin_pipeline(): 63 | flip = flip_coin_op() 64 | with dsl.Condition(flip.output == 'heads'): 65 | random_num_head = get_random_int_op(0, 9) 66 | with dsl.Condition(random_num_head.output > 5): 67 | print_op('heads and %s > 5!' % random_num_head.output) 68 | with dsl.Condition(random_num_head.output <= 5): 69 | print_op('heads and %s <= 5!' % random_num_head.output) 70 | 71 | with dsl.Condition(flip.output == 'tails'): 72 | random_num_tail = get_random_int_op(10, 19) 73 | with dsl.Condition(random_num_tail.output > 15): 74 | print_op('tails and %s > 15!' % random_num_tail.output) 75 | with dsl.Condition(random_num_tail.output <= 15): 76 | print_op('tails and %s <= 15!' % random_num_tail.output) 77 | 78 | 79 | # Submit the pipeline for execution: 80 | #kfp.Client(host=kfp_endpoint).create_run_from_pipeline_func(flipcoin_pipeline, arguments={}) 81 | 82 | # %% [markdown] 83 | # ## Exit handlers 84 | # You can use the `with dsl.ExitHandler(exit_task):` context to execute a task when the rest of the pipeline finishes (succeeds or fails). 85 | 86 | # %% 87 | @func_to_container_op 88 | def fail_op(message): 89 | """Fails.""" 90 | import sys 91 | print(message) 92 | sys.exit(1) 93 | 94 | 95 | @dsl.pipeline( 96 | name='Conditional execution pipeline with exit handler', 97 | description='Shows how to use dsl.Condition() and dsl.ExitHandler().' 98 | ) 99 | def flipcoin_exit_pipeline(): 100 | exit_task = print_op('Exit handler has worked!') 101 | with dsl.ExitHandler(exit_task): 102 | flip = flip_coin_op() 103 | with dsl.Condition(flip.output == 'heads'): 104 | random_num_head = get_random_int_op(0, 9) 105 | with dsl.Condition(random_num_head.output > 5): 106 | print_op('heads and %s > 5!' % random_num_head.output) 107 | with dsl.Condition(random_num_head.output <= 5): 108 | print_op('heads and %s <= 5!' % random_num_head.output) 109 | 110 | with dsl.Condition(flip.output == 'tails'): 111 | random_num_tail = get_random_int_op(10, 19) 112 | with dsl.Condition(random_num_tail.output > 15): 113 | print_op('tails and %s > 15!' % random_num_tail.output) 114 | with dsl.Condition(random_num_tail.output <= 15): 115 | print_op('tails and %s <= 15!'
% random_num_tail.output) 116 | 117 | with dsl.Condition(flip.output == 'tails'): 118 | fail_op(message="Failing the run to demonstrate that exit handler still gets executed.") 119 | 120 | 121 | if __name__ == '__main__': 122 | # Compiling the pipeline 123 | kfp.compiler.Compiler().compile(flipcoin_exit_pipeline, __file__ + '.yaml') 124 | -------------------------------------------------------------------------------- /lesson10_catboost/Predict_class_probabilities/from_CSV/component.yaml: -------------------------------------------------------------------------------- 1 | name: Catboost predict class probabilities 2 | description: |- 3 | Predict class probabilities with a CatBoost model. 4 | 5 | Args: 6 | data_path: Path for the data in CSV format. 7 | model_path: Path for the trained model in binary CatBoostModel format. 8 | label_column: Column containing the label data. 9 | predictions_path: Output path for the predictions. 10 | 11 | Outputs: 12 | predictions: Predictions in text format. 13 | 14 | Annotations: 15 | author: Alexey Volkov 16 | inputs: 17 | - {name: data, type: CSV} 18 | - {name: model, type: CatBoostModel} 19 | - {name: label_column, type: Integer, optional: true} 20 | outputs: 21 | - {name: predictions} 22 | implementation: 23 | container: 24 | image: python:3.7 25 | command: 26 | - sh 27 | - -c 28 | - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 29 | 'catboost==0.23' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet 30 | --no-warn-script-location 'catboost==0.23' --user) && "$0" "$@" 31 | - python3 32 | - -u 33 | - -c 34 | - | 35 | def _make_parent_dirs_and_return_path(file_path: str): 36 | import os 37 | os.makedirs(os.path.dirname(file_path), exist_ok=True) 38 | return file_path 39 | 40 | def catboost_predict_class_probabilities( 41 | data_path, 42 | model_path, 43 | predictions_path, 44 | 45 | label_column = None, 46 | ): 47 | '''Predict class probabilities with a CatBoost model. 48 | 49 | Args: 50 | data_path: Path for the data in CSV format. 51 | model_path: Path for the trained model in binary CatBoostModel format. 52 | label_column: Column containing the label data. 53 | predictions_path: Output path for the predictions. 54 | 55 | Outputs: 56 | predictions: Predictions in text format. 
57 | 58 | Annotations: 59 | author: Alexey Volkov 60 | ''' 61 | import tempfile 62 | 63 | from catboost import CatBoost, Pool 64 | import numpy 65 | 66 | if label_column: 67 | column_descriptions = {label_column: 'Label'} 68 | column_description_path = tempfile.NamedTemporaryFile(delete=False).name 69 | with open(column_description_path, 'w') as column_description_file: 70 | for idx, kind in column_descriptions.items(): 71 | column_description_file.write('{}\t{}\n'.format(idx, kind)) 72 | else: 73 | column_description_path = None 74 | 75 | eval_data = Pool( 76 | data_path, 77 | column_description=column_description_path, 78 | has_header=True, 79 | delimiter=',', 80 | ) 81 | 82 | model = CatBoost() 83 | model.load_model(model_path) 84 | 85 | predictions = model.predict(eval_data, prediction_type='Probability') 86 | numpy.savetxt(predictions_path, predictions) 87 | 88 | import argparse 89 | _parser = argparse.ArgumentParser(prog='Catboost predict class probabilities', description='Predict class probabilities with a CatBoost model.\n\n Args:\n data_path: Path for the data in CSV format.\n model_path: Path for the trained model in binary CatBoostModel format.\n label_column: Column containing the label data.\n predictions_path: Output path for the predictions.\n\n Outputs:\n predictions: Predictions in text format.\n\n Annotations:\n author: Alexey Volkov ') 90 | _parser.add_argument("--data", dest="data_path", type=str, required=True, default=argparse.SUPPRESS) 91 | _parser.add_argument("--model", dest="model_path", type=str, required=True, default=argparse.SUPPRESS) 92 | _parser.add_argument("--label-column", dest="label_column", type=int, required=False, default=argparse.SUPPRESS) 93 | _parser.add_argument("--predictions", dest="predictions_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) 94 | _parsed_args = vars(_parser.parse_args()) 95 | 96 | _outputs = catboost_predict_class_probabilities(**_parsed_args) 97 | args: 98 | - --data 99 | - {inputPath: data} 100 | - --model 101 | - {inputPath: model} 102 | - if: 103 | cond: {isPresent: label_column} 104 | then: 105 | - --label-column 106 | - {inputValue: label_column} 107 | - --predictions 108 | - {outputPath: predictions} 109 | -------------------------------------------------------------------------------- /train_until_good/train_until_good.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2020 The Kubeflow Pipelines authors 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # This sample demonstrates continuous training using a train-eval-check recursive loop. 17 | # The main pipeline trains the initial model and then gradually trains the model 18 | # some more until the model evaluation metrics are good enough.
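# The stopping rule lives in the recursive sub-pipeline below: another training round is
# scheduled only while the measured mean squared error stays above 0.01, so the recursion
# ends as soon as the metric check fails.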
19 | 20 | import kfp 21 | from kfp import components 22 | 23 | 24 | chicago_taxi_dataset_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/e3337b8bdcd63636934954e592d4b32c95b49129/components/datasets/Chicago%20Taxi/component.yaml') 25 | xgboost_train_on_csv_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/567c04c51ff00a1ee525b3458425b17adbe3df61/components/XGBoost/Train/component.yaml') 26 | xgboost_predict_on_csv_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/567c04c51ff00a1ee525b3458425b17adbe3df61/components/XGBoost/Predict/component.yaml') 27 | 28 | pandas_transform_csv_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/6162d55998b176b50267d351241100bb0ee715bc/components/pandas/Transform_DataFrame/in_CSV_format/component.yaml') 29 | drop_header_op = kfp.components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/02c9638287468c849632cf9f7885b51de4c66f86/components/tables/Remove_header/component.yaml') 30 | calculate_regression_metrics_from_csv_op = kfp.components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/616542ac0f789914f4eb53438da713dd3004fba4/components/ml_metrics/Calculate_regression_metrics/from_CSV/component.yaml') 31 | 32 | 33 | # This recursive sub-pipeline trains a model, evaluates it, calculates the metrics and checks them. 34 | # If the model error is too high, then more training is performed until the model is good. 35 | @kfp.dsl.graph_component 36 | def train_until_low_error(starting_model, training_data, true_values): 37 | # Training 38 | model = xgboost_train_on_csv_op( 39 | training_data=training_data, 40 | starting_model=starting_model, 41 | label_column=0, 42 | objective='reg:squarederror', 43 | num_iterations=50, 44 | ).outputs['model'] 45 | 46 | # Predicting 47 | predictions = xgboost_predict_on_csv_op( 48 | data=training_data, 49 | model=model, 50 | label_column=0, 51 | ).output 52 | 53 | # Calculating the regression metrics 54 | metrics_task = calculate_regression_metrics_from_csv_op( 55 | true_values=true_values, 56 | predicted_values=predictions, 57 | ) 58 | 59 | # Checking the metrics 60 | with kfp.dsl.Condition(metrics_task.outputs['mean_squared_error'] > 0.01): 61 | # Training some more 62 | train_until_low_error( 63 | starting_model=model, 64 | training_data=training_data, 65 | true_values=true_values, 66 | ) 67 | 68 | 69 | # The main pipeline trains the initial model and then gradually trains the model some more until the model evaluation metrics are good enough.
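# Note: the @kfp.dsl.graph_component decorator above is what makes the self-call work in the
# KFP v1 SDK. A plain pipeline function that called itself would recurse without bound while the
# pipeline is being compiled; a graph component instead becomes a reusable sub-graph that the
# compiler can reference from the dsl.Condition branch.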
70 | @kfp.dsl.pipeline() 71 | def train_until_good_pipeline(): 72 | # Preparing the training data 73 | training_data = chicago_taxi_dataset_op( 74 | where='trip_start_timestamp >= "2019-01-01" AND trip_start_timestamp < "2019-02-01"', 75 | select='tips,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tolls,extras,trip_total', 76 | limit=10000, 77 | ).output 78 | 79 | # Preparing the true values 80 | true_values_table = pandas_transform_csv_op( 81 | table=training_data, 82 | transform_code='df = df[["tips"]]', 83 | ).output 84 | 85 | true_values = drop_header_op(true_values_table).output 86 | 87 | # Initial model training 88 | first_model = xgboost_train_on_csv_op( 89 | training_data=training_data, 90 | label_column=0, 91 | objective='reg:squarederror', 92 | num_iterations=100, 93 | ).outputs['model'] 94 | 95 | # Recursively training until the error becomes low 96 | train_until_low_error( 97 | starting_model=first_model, 98 | training_data=training_data, 99 | true_values=true_values, 100 | ) 101 | 102 | 103 | if __name__ == '__main__': 104 | kfp.compiler.Compiler().compile(train_until_good_pipeline, __file__ + '.yaml') -------------------------------------------------------------------------------- /first_project/first.yml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: Workflow 3 | metadata: 4 | generateName: simple-sci-kit-kf-pipeline- 5 | annotations: {pipelines.kubeflow.org/kfp_sdk_version: 1.5.0-rc.2, pipelines.kubeflow.org/pipeline_compilation_time: '2021-04-04T16:14:25.709465', 6 | pipelines.kubeflow.org/pipeline_spec: '{"description": "A simple end to end sci-kit 7 | seldon kf pipeline", "inputs": [{"default": "index.docker.io/seldonio", "name": 8 | "docker_org", "optional": true}, {"default": "0.2", "name": "train_container_version", 9 | "optional": true}, {"default": "0.1", "name": "serve_container_version", "optional": 10 | true}], "name": "Simple sci-kit KF Pipeline"}'} 11 | labels: {pipelines.kubeflow.org/kfp_sdk_version: 1.5.0-rc.2} 12 | spec: 13 | entrypoint: simple-sci-kit-kf-pipeline 14 | templates: 15 | - name: create-pvc 16 | resource: 17 | action: create 18 | manifest: | 19 | apiVersion: v1 20 | kind: PersistentVolumeClaim 21 | metadata: 22 | name: '{{workflow.name}}-nfs-1' 23 | spec: 24 | accessModes: 25 | - ReadWriteOnce 26 | resources: 27 | requests: 28 | storage: 10G 29 | outputs: 30 | parameters: 31 | - name: create-pvc-manifest 32 | valueFrom: {jsonPath: '{}'} 33 | - name: create-pvc-name 34 | valueFrom: {jsonPath: '{.metadata.name}'} 35 | - name: create-pvc-size 36 | valueFrom: {jsonPath: '{.status.capacity.storage}'} 37 | - name: serve 38 | resource: 39 | action: create 40 | successCondition: status.state == Available 41 | manifest: | 42 | apiVersion: machinelearning.seldon.io/v1alpha2 43 | kind: SeldonDeployment 44 | metadata: 45 | labels: 46 | app: seldon 47 | name: mnist-classifier 48 | spec: 49 | annotations: 50 | deployment_version: v1 51 | project_name: MNIST Example 52 | name: mnist-classifier 53 | predictors: 54 | - annotations: 55 | predictor_version: v1 56 | componentSpecs: 57 | - spec: 58 | containers: 59 | - image: '{{inputs.parameters.docker_org}}/skmnistclassifier_runtime:{{inputs.parameters.serve_container_version}}' 60 | imagePullPolicy: Always 61 | name: mnist-classifier 62 | volumeMounts: 63 | - mountPath: /data 64 | name: persistent-storage 65 | terminationGracePeriodSeconds: 1 66 | volumes: 67 | - name: persistent-storage 68 | 
persistentVolumeClaim: 69 | claimName: '{{inputs.parameters.create-pvc-name}}' 70 | graph: 71 | children: [] 72 | endpoint: 73 | type: REST 74 | name: mnist-classifier 75 | type: MODEL 76 | name: mnist-classifier 77 | replicas: 1 78 | inputs: 79 | parameters: 80 | - {name: create-pvc-name} 81 | - {name: docker_org} 82 | - {name: serve_container_version} 83 | outputs: 84 | parameters: 85 | - name: serve-manifest 86 | valueFrom: {jsonPath: '{}'} 87 | - name: serve-name 88 | valueFrom: {jsonPath: '{.metadata.name}'} 89 | - name: simple-sci-kit-kf-pipeline 90 | inputs: 91 | parameters: 92 | - {name: docker_org} 93 | - {name: serve_container_version} 94 | - {name: train_container_version} 95 | dag: 96 | tasks: 97 | - {name: create-pvc, template: create-pvc} 98 | - name: serve 99 | template: serve 100 | dependencies: [create-pvc, sk-train] 101 | arguments: 102 | parameters: 103 | - {name: create-pvc-name, value: '{{tasks.create-pvc.outputs.parameters.create-pvc-name}}'} 104 | - {name: docker_org, value: '{{inputs.parameters.docker_org}}'} 105 | - {name: serve_container_version, value: '{{inputs.parameters.serve_container_version}}'} 106 | - name: sk-train 107 | template: sk-train 108 | dependencies: [create-pvc] 109 | arguments: 110 | parameters: 111 | - {name: create-pvc-name, value: '{{tasks.create-pvc.outputs.parameters.create-pvc-name}}'} 112 | - {name: docker_org, value: '{{inputs.parameters.docker_org}}'} 113 | - {name: train_container_version, value: '{{inputs.parameters.train_container_version}}'} 114 | - name: sk-train 115 | container: 116 | image: '{{inputs.parameters.docker_org}}/skmnistclassifier_trainer:{{inputs.parameters.train_container_version}}' 117 | volumeMounts: 118 | - {mountPath: /data, name: create-pvc} 119 | inputs: 120 | parameters: 121 | - {name: create-pvc-name} 122 | - {name: docker_org} 123 | - {name: train_container_version} 124 | volumes: 125 | - name: create-pvc 126 | persistentVolumeClaim: {claimName: '{{inputs.parameters.create-pvc-name}}'} 127 | arguments: 128 | parameters: 129 | - {name: docker_org, value: index.docker.io/seldonio} 130 | - {name: train_container_version, value: '0.2'} 131 | - {name: serve_container_version, value: '0.1'} 132 | serviceAccountName: pipeline-runner 133 | -------------------------------------------------------------------------------- /lesson6_data_passing/data_passing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2020 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # %% [markdown] 17 | # # Data passing tutorial 18 | # Data passing is the most important aspect of Pipelines. 19 | # 20 | # In Kubeflow Pipelines, pipeline authors compose pipelines by creating component instances (tasks) and connecting them together. 21 | # 22 | # Components have inputs and outputs. They can consume and produce arbitrary data.
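# For example, a data-preparation task can output a CSV file that a training task takes as
# input, and the model the trainer outputs can in turn be passed to an evaluation task.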
23 | # 24 | # Pipeline authors establish connections between component tasks by connecting their data inputs and outputs - by passing the output of one task as an argument to another task's input. 25 | # 26 | # The system takes care of storing the data produced by components and later passing that data to other components for consumption as instructed by the pipeline. 27 | # 28 | # This tutorial shows how to create Python components that produce, consume and transform data. 29 | # It shows how to create data passing pipelines by instantiating components and connecting them together. 30 | 31 | # %% 32 | from typing import NamedTuple 33 | 34 | import kfp 35 | from kfp.components import func_to_container_op, InputPath, OutputPath 36 | 37 | # %% [markdown] 38 | # ## Small data 39 | # 40 | # Small data is the data that you'll be comfortable passing as a program's command-line argument. Small data size should not exceed a few kilobytes. 41 | # 42 | # Some examples of typical types of small data are: number, URL, small string (e.g. column name). 43 | # 44 | # Small lists, dictionaries and JSON structures are fine, but keep an eye on the size and consider switching to file-based data passing methods that are more suitable for bigger data (more than several kilobytes) or binary data. 45 | # 46 | # All small data outputs will be at some point serialized to strings and all small data input values will be at some point deserialized from strings (passed as command-line arguments). There are built-in serializers and deserializers for several common types (e.g. `str`, `int`, `float`, `bool`, `list`, `dict`). All other types of data need to be serialized manually before returning the data. Make sure to properly specify type annotations, otherwise there would be no automatic deserialization and the component function will receive strings instead of deserialized objects. 47 | 48 | # %% [markdown] 49 | # ## Bigger data (files) 50 | # 51 | # Bigger data should be read from files and written to files. 52 | # 53 | # The paths for the input and output files are chosen by the system and are passed into the function (as strings). 54 | # 55 | # Use the `InputPath` parameter annotation to tell the system that the function wants to consume the corresponding input data as a file. The system will download the data, write it to a local file and then pass the **path** of that file to the function. 56 | # 57 | # Use the `OutputPath` parameter annotation to tell the system that the function wants to produce the corresponding output data as a file. The system will prepare and pass the **path** of a file where the function should write the output data. After the function exits, the system will upload the data to the storage system so that it can be passed to downstream components. 58 | # 59 | # You can specify the type of the consumed/produced data by specifying the type argument to `InputPath` and `OutputPath`. The type can be a Python type or an arbitrary type name string. `OutputPath('TFModel')` means that the function states that the data it has written to a file has type 'TFModel'. `InputPath('TFModel')` means that the function states that it expects the data it reads from a file to have type 'TFModel'. When the pipeline author connects inputs to outputs, the system checks whether the types match.
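# As a minimal sketch (hypothetical component, not part of this lesson), the same annotations
# drive both this type checking and the small-data (de)serialization described above:
#
#   @func_to_container_op
#   def add(a: float, b: float) -> float:
#       return a + b
#
# Because `a` and `b` are annotated, the string command-line arguments are converted to floats
# before add() runs, and wiring an output with a different declared type into them is reported
# as a type mismatch when the pipeline is constructed. Untyped `InputPath()`/`OutputPath()`
# (as in print_text below) opts out of the check and accepts any data.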
60 | # 61 | # Note on input/output names: When the function is converted to component, the input and output names generally follow the parameter names, but the "\_path" and "\_file" suffixes are stripped from file/path inputs and outputs. E.g. the `number_file_path: InputPath(int)` parameter becomes the `number: int` input. This makes the argument passing look more natural: `number=42` instead of `number_file_path=42`. 62 | # %% [markdown] 63 | # 64 | # ### Writing and reading bigger data 65 | 66 | # %% 67 | # Writing bigger data 68 | @func_to_container_op 69 | def repeat_line(line: str, output_text_path: OutputPath(str), count: int = 10): 70 | '''Repeat the line specified number of times''' 71 | with open(output_text_path, 'w') as writer: 72 | for i in range(count): 73 | writer.write(line + '\n') 74 | 75 | 76 | # Reading bigger data 77 | @func_to_container_op 78 | def print_text(text_path: InputPath()): # The "text" input is untyped so that any data can be printed 79 | '''Print text''' 80 | with open(text_path, 'r') as reader: 81 | for line in reader: 82 | print(line, end = '') 83 | 84 | def print_repeating_lines_pipeline(): 85 | repeat_lines_task = repeat_line(line='Hello', count=5000) 86 | print_text(repeat_lines_task.output) # Don't forget .output ! 87 | 88 | # Submit the pipeline for execution: 89 | #kfp.Client(host=kfp_endpoint).create_run_from_pipeline_func(print_repeating_lines_pipeline, arguments={}) 90 | 91 | # %% [markdown] 92 | # ### Processing bigger data 93 | 94 | # %% 95 | @func_to_container_op 96 | def split_text_lines(source_path: InputPath(str), odd_lines_path: OutputPath(str), even_lines_path: OutputPath(str)): 97 | with open(source_path, 'r') as reader: 98 | with open(odd_lines_path, 'w') as odd_writer: 99 | with open(even_lines_path, 'w') as even_writer: 100 | while True: 101 | line = reader.readline() 102 | if line == "": 103 | break 104 | odd_writer.write(line) 105 | line = reader.readline() 106 | if line == "": 107 | break 108 | even_writer.write(line) 109 | 110 | def text_splitting_pipeline(): 111 | text = '\n'.join(['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten']) 112 | split_text_task = split_text_lines(text) 113 | print_text(split_text_task.outputs['odd_lines']) 114 | print_text(split_text_task.outputs['even_lines']) 115 | 116 | # Submit the pipeline for execution: 117 | #kfp.Client(host=kfp_endpoint).create_run_from_pipeline_func(text_splitting_pipeline, arguments={}) 118 | 119 | 120 | # %% [markdown] 121 | # ### Example: Pipeline that generates then sums many numbers 122 | 123 | # %% 124 | # Writing many numbers 125 | @func_to_container_op 126 | def write_numbers(numbers_path: OutputPath(str), start: int = 0, count: int = 10): 127 | with open(numbers_path, 'w') as writer: 128 | for i in range(start, count): 129 | writer.write(str(i) + '\n') 130 | 131 | 132 | # Reading and summing many numbers 133 | @func_to_container_op 134 | def sum_numbers(numbers_path: InputPath(str)) -> int: 135 | sum = 0 136 | with open(numbers_path, 'r') as reader: 137 | for line in reader: 138 | sum = sum + int(line) 139 | return sum 140 | 141 | 142 | 143 | # Pipeline to sum 100000 numbers 144 | def sum_pipeline(count: int = 100000): 145 | numbers_task = write_numbers(count=count) 146 | print_text(numbers_task.output) 147 | 148 | sum_task = sum_numbers(numbers_task.outputs['numbers']) 149 | print_text(sum_task.output) 150 | 151 | 152 | # Submit the pipeline for execution: 153 | #kfp.Client(host=kfp_endpoint).create_run_from_pipeline_func(sum_pipeline, 
arguments={}) 154 | 155 | # Combining all pipelines together in a single pipeline 156 | def file_passing_pipelines(): 157 | print_repeating_lines_pipeline() 158 | text_splitting_pipeline() 159 | sum_pipeline() 160 | 161 | 162 | if __name__ == '__main__': 163 | # Compiling the pipeline 164 | kfp.compiler.Compiler().compile(file_passing_pipelines, __file__ + '.yaml') 165 | -------------------------------------------------------------------------------- /lesson10_catboost/Train_regression/from_CSV/component.yaml: -------------------------------------------------------------------------------- 1 | name: Catboost train regression 2 | description: |- 3 | Train a CatBoost regression model. 4 | 5 | Args: 6 | training_data_path: Path for the training data in CSV format. 7 | model_path: Output path for the trained model in binary CatBoostModel format. 8 | starting_model_path: Path for the existing trained model to start from. 9 | label_column: Column containing the label data. 10 | 11 | loss_function: The metric to use in training and also selector of the machine learning 12 | problem to solve. Default = 'RMSE'. Possible values: 13 | 'RMSE', 'MAE', 'Quantile:alpha=value', 'LogLinQuantile:alpha=value', 'Poisson', 'MAPE', 'Lq:q=value' 14 | num_iterations: Number of trees to add to the ensemble. 15 | learning_rate: Step size shrinkage used in update to prevent overfitting. 16 | Default value is selected automatically for binary classification with other parameters set to default. 17 | In all other cases default is 0.03. 18 | depth: Depth of a tree. All trees are the same depth. Default = 6 19 | random_seed: Random number seed. Default = 0 20 | 21 | cat_features: A list of Categorical features (indices or names). 22 | additional_training_options: A dictionary with additional options to pass to CatBoostRegressor 23 | 24 | Outputs: 25 | model: Trained model in binary CatBoostModel format.
26 | 27 | Annotations: 28 | author: Alexey Volkov 29 | inputs: 30 | - {name: training_data, type: CSV} 31 | - {name: starting_model, type: CatBoostModel, optional: true} 32 | - {name: label_column, type: Integer, default: '0', optional: true} 33 | - {name: loss_function, type: String, default: RMSE, optional: true} 34 | - {name: num_iterations, type: Integer, default: '500', optional: true} 35 | - {name: learning_rate, type: Float, optional: true} 36 | - {name: depth, type: Integer, default: '6', optional: true} 37 | - {name: random_seed, type: Integer, default: '0', optional: true} 38 | - {name: cat_features, type: JsonArray, optional: true} 39 | - {name: additional_training_options, type: JsonObject, default: '{}', optional: true} 40 | outputs: 41 | - {name: model, type: CatBoostModel} 42 | implementation: 43 | container: 44 | image: python:3.7 45 | command: 46 | - sh 47 | - -c 48 | - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 49 | 'catboost==0.23' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet 50 | --no-warn-script-location 'catboost==0.23' --user) && "$0" "$@" 51 | - python3 52 | - -u 53 | - -c 54 | - | 55 | def _make_parent_dirs_and_return_path(file_path: str): 56 | import os 57 | os.makedirs(os.path.dirname(file_path), exist_ok=True) 58 | return file_path 59 | 60 | def catboost_train_regression( 61 | training_data_path, 62 | model_path, 63 | starting_model_path = None, 64 | label_column = 0, 65 | 66 | loss_function = 'RMSE', 67 | num_iterations = 500, 68 | learning_rate = None, 69 | depth = 6, 70 | random_seed = 0, 71 | 72 | cat_features = None, 73 | 74 | additional_training_options = {}, 75 | ): 76 | '''Train a CatBoost regression model. 77 | 78 | Args: 79 | training_data_path: Path for the training data in CSV format. 80 | model_path: Output path for the trained model in binary CatBoostModel format. 81 | starting_model_path: Path for the existing trained model to start from. 82 | label_column: Column containing the label data. 83 | 84 | loss_function: The metric to use in training and also selector of the machine learning 85 | problem to solve. Default = 'RMSE'. Possible values: 86 | 'RMSE', 'MAE', 'Quantile:alpha=value', 'LogLinQuantile:alpha=value', 'Poisson', 'MAPE', 'Lq:q=value' 87 | num_iterations: Number of trees to add to the ensemble. 88 | learning_rate: Step size shrinkage used in update to prevent overfitting. 89 | Default value is selected automatically for binary classification with other parameters set to default. 90 | In all other cases default is 0.03. 91 | depth: Depth of a tree. All trees are the same depth. Default = 6 92 | random_seed: Random number seed. Default = 0 93 | 94 | cat_features: A list of Categorical features (indices or names). 95 | additional_training_options: A dictionary with additional options to pass to CatBoostRegressor 96 | 97 | Outputs: 98 | model: Trained model in binary CatBoostModel format.
99 | 100 | Annotations: 101 | author: Alexey Volkov 102 | ''' 103 | import tempfile 104 | from pathlib import Path 105 | 106 | from catboost import CatBoostRegressor, Pool 107 | 108 | column_descriptions = {label_column: 'Label'} 109 | column_description_path = tempfile.NamedTemporaryFile(delete=False).name 110 | with open(column_description_path, 'w') as column_description_file: 111 | for idx, kind in column_descriptions.items(): 112 | column_description_file.write('{}\t{}\n'.format(idx, kind)) 113 | 114 | train_data = Pool( 115 | training_data_path, 116 | column_description=column_description_path, 117 | has_header=True, 118 | delimiter=',', 119 | ) 120 | 121 | model = CatBoostRegressor( 122 | iterations=num_iterations, 123 | depth=depth, 124 | learning_rate=learning_rate, 125 | loss_function=loss_function, 126 | random_seed=random_seed, 127 | verbose=True, 128 | **additional_training_options, 129 | ) 130 | 131 | model.fit( 132 | train_data, 133 | cat_features=cat_features, 134 | init_model=starting_model_path, 135 | #verbose=False, 136 | #plot=True, 137 | ) 138 | Path(model_path).parent.mkdir(parents=True, exist_ok=True) 139 | model.save_model(model_path) 140 | 141 | import json 142 | import argparse 143 | _parser = argparse.ArgumentParser(prog='Catboost train regression', description="Train a CatBoost classifier model.\n\n Args:\n training_data_path: Path for the training data in CSV format.\n model_path: Output path for the trained model in binary CatBoostModel format.\n starting_model_path: Path for the existing trained model to start from.\n label_column: Column containing the label data.\n\n loss_function: The metric to use in training and also selector of the machine learning\n problem to solve. Default = 'RMSE'. Possible values:\n 'RMSE', 'MAE', 'Quantile:alpha=value', 'LogLinQuantile:alpha=value', 'Poisson', 'MAPE', 'Lq:q=value'\n num_iterations: Number of trees to add to the ensemble.\n learning_rate: Step size shrinkage used in update to prevents overfitting.\n Default value is selected automatically for binary classification with other parameters set to default.\n In all other cases default is 0.03.\n depth: Depth of a tree. All trees are the same depth. Default = 6\n random_seed: Random number seed. 
Default = 0\n\n cat_features: A list of Categorical features (indices or names).\n additional_training_options: A dictionary with additional options to pass to CatBoostRegressor\n\n Outputs:\n model: Trained model in binary CatBoostModel format.\n\n Annotations:\n author: Alexey Volkov ") 144 | _parser.add_argument("--training-data", dest="training_data_path", type=str, required=True, default=argparse.SUPPRESS) 145 | _parser.add_argument("--starting-model", dest="starting_model_path", type=str, required=False, default=argparse.SUPPRESS) 146 | _parser.add_argument("--label-column", dest="label_column", type=int, required=False, default=argparse.SUPPRESS) 147 | _parser.add_argument("--loss-function", dest="loss_function", type=str, required=False, default=argparse.SUPPRESS) 148 | _parser.add_argument("--num-iterations", dest="num_iterations", type=int, required=False, default=argparse.SUPPRESS) 149 | _parser.add_argument("--learning-rate", dest="learning_rate", type=float, required=False, default=argparse.SUPPRESS) 150 | _parser.add_argument("--depth", dest="depth", type=int, required=False, default=argparse.SUPPRESS) 151 | _parser.add_argument("--random-seed", dest="random_seed", type=int, required=False, default=argparse.SUPPRESS) 152 | _parser.add_argument("--cat-features", dest="cat_features", type=json.loads, required=False, default=argparse.SUPPRESS) 153 | _parser.add_argument("--additional-training-options", dest="additional_training_options", type=json.loads, required=False, default=argparse.SUPPRESS) 154 | _parser.add_argument("--model", dest="model_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) 155 | _parsed_args = vars(_parser.parse_args()) 156 | 157 | _outputs = catboost_train_regression(**_parsed_args) 158 | args: 159 | - --training-data 160 | - {inputPath: training_data} 161 | - if: 162 | cond: {isPresent: starting_model} 163 | then: 164 | - --starting-model 165 | - {inputPath: starting_model} 166 | - if: 167 | cond: {isPresent: label_column} 168 | then: 169 | - --label-column 170 | - {inputValue: label_column} 171 | - if: 172 | cond: {isPresent: loss_function} 173 | then: 174 | - --loss-function 175 | - {inputValue: loss_function} 176 | - if: 177 | cond: {isPresent: num_iterations} 178 | then: 179 | - --num-iterations 180 | - {inputValue: num_iterations} 181 | - if: 182 | cond: {isPresent: learning_rate} 183 | then: 184 | - --learning-rate 185 | - {inputValue: learning_rate} 186 | - if: 187 | cond: {isPresent: depth} 188 | then: 189 | - --depth 190 | - {inputValue: depth} 191 | - if: 192 | cond: {isPresent: random_seed} 193 | then: 194 | - --random-seed 195 | - {inputValue: random_seed} 196 | - if: 197 | cond: {isPresent: cat_features} 198 | then: 199 | - --cat-features 200 | - {inputValue: cat_features} 201 | - if: 202 | cond: {isPresent: additional_training_options} 203 | then: 204 | - --additional-training-options 205 | - {inputValue: additional_training_options} 206 | - --model 207 | - {outputPath: model} 208 | -------------------------------------------------------------------------------- /lesson10_catboost/Train_classifier/from_CSV/component.yaml: -------------------------------------------------------------------------------- 1 | name: Catboost train classifier 2 | description: |- 3 | Train a CatBoost classifier model. 4 | 5 | Args: 6 | training_data_path: Path for the training data in CSV format. 7 | model_path: Output path for the trained model in binary CatBoostModel format. 
8 | starting_model_path: Path for the existing trained model to start from. 9 | label_column: Column containing the label data. 10 | 11 | loss_function: The metric to use in training and also selector of the machine learning 12 | problem to solve. Default = 'Logloss' 13 | num_iterations: Number of trees to add to the ensemble. 14 | learning_rate: Step size shrinkage used in update to prevent overfitting. 15 | Default value is selected automatically for binary classification with other parameters set to default. 16 | In all other cases default is 0.03. 17 | depth: Depth of a tree. All trees are the same depth. Default = 6 18 | random_seed: Random number seed. Default = 0 19 | 20 | cat_features: A list of Categorical features (indices or names). 21 | text_features: A list of Text features (indices or names). 22 | additional_training_options: A dictionary with additional options to pass to CatBoostClassifier 23 | 24 | Outputs: 25 | model: Trained model in binary CatBoostModel format. 26 | 27 | Annotations: 28 | author: Alexey Volkov 29 | inputs: 30 | - {name: training_data, type: CSV} 31 | - {name: starting_model, type: CatBoostModel, optional: true} 32 | - {name: label_column, type: Integer, default: '0', optional: true} 33 | - {name: loss_function, type: String, default: Logloss, optional: true} 34 | - {name: num_iterations, type: Integer, default: '500', optional: true} 35 | - {name: learning_rate, type: Float, optional: true} 36 | - {name: depth, type: Integer, default: '6', optional: true} 37 | - {name: random_seed, type: Integer, default: '0', optional: true} 38 | - {name: cat_features, type: JsonArray, optional: true} 39 | - {name: text_features, type: JsonArray, optional: true} 40 | - {name: additional_training_options, type: JsonObject, default: '{}', optional: true} 41 | outputs: 42 | - {name: model, type: CatBoostModel} 43 | implementation: 44 | container: 45 | image: python:3.7 46 | command: 47 | - sh 48 | - -c 49 | - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 50 | 'catboost==0.23' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet 51 | --no-warn-script-location 'catboost==0.23' --user) && "$0" "$@" 52 | - python3 53 | - -u 54 | - -c 55 | - | 56 | def _make_parent_dirs_and_return_path(file_path: str): 57 | import os 58 | os.makedirs(os.path.dirname(file_path), exist_ok=True) 59 | return file_path 60 | 61 | def catboost_train_classifier( 62 | training_data_path, 63 | model_path, 64 | starting_model_path = None, 65 | label_column = 0, 66 | 67 | loss_function = 'Logloss', 68 | num_iterations = 500, 69 | learning_rate = None, 70 | depth = 6, 71 | random_seed = 0, 72 | 73 | cat_features = None, 74 | text_features = None, 75 | 76 | additional_training_options = {}, 77 | ): 78 | '''Train a CatBoost classifier model. 79 | 80 | Args: 81 | training_data_path: Path for the training data in CSV format. 82 | model_path: Output path for the trained model in binary CatBoostModel format. 83 | starting_model_path: Path for the existing trained model to start from. 84 | label_column: Column containing the label data. 85 | 86 | loss_function: The metric to use in training and also selector of the machine learning 87 | problem to solve. Default = 'Logloss' 88 | num_iterations: Number of trees to add to the ensemble. 89 | learning_rate: Step size shrinkage used in update to prevent overfitting. 90 | Default value is selected automatically for binary classification with other parameters set to default.
91 | In all other cases default is 0.03. 92 | depth: Depth of a tree. All trees are the same depth. Default = 6 93 | random_seed: Random number seed. Default = 0 94 | 95 | cat_features: A list of Categorical features (indices or names). 96 | text_features: A list of Text features (indices or names). 97 | additional_training_options: A dictionary with additional options to pass to CatBoostClassifier 98 | 99 | Outputs: 100 | model: Trained model in binary CatBoostModel format. 101 | 102 | Annotations: 103 | author: Alexey Volkov 104 | ''' 105 | import tempfile 106 | from pathlib import Path 107 | 108 | from catboost import CatBoostClassifier, Pool 109 | 110 | column_descriptions = {label_column: 'Label'} 111 | column_description_path = tempfile.NamedTemporaryFile(delete=False).name 112 | with open(column_description_path, 'w') as column_description_file: 113 | for idx, kind in column_descriptions.items(): 114 | column_description_file.write('{}\t{}\n'.format(idx, kind)) 115 | 116 | train_data = Pool( 117 | training_data_path, 118 | column_description=column_description_path, 119 | has_header=True, 120 | delimiter=',', 121 | ) 122 | 123 | model = CatBoostClassifier( 124 | iterations=num_iterations, 125 | depth=depth, 126 | learning_rate=learning_rate, 127 | loss_function=loss_function, 128 | random_seed=random_seed, 129 | verbose=True, 130 | **additional_training_options, 131 | ) 132 | 133 | model.fit( 134 | train_data, 135 | cat_features=cat_features, 136 | text_features=text_features, 137 | init_model=starting_model_path, 138 | #verbose=False, 139 | #plot=True, 140 | ) 141 | Path(model_path).parent.mkdir(parents=True, exist_ok=True) 142 | model.save_model(model_path) 143 | 144 | import json 145 | import argparse 146 | _parser = argparse.ArgumentParser(prog='Catboost train classifier', description="Train a CatBoost classifier model.\n\n Args:\n training_data_path: Path for the training data in CSV format.\n model_path: Output path for the trained model in binary CatBoostModel format.\n starting_model_path: Path for the existing trained model to start from.\n label_column: Column containing the label data.\n\n loss_function: The metric to use in training and also selector of the machine learning\n problem to solve. Default = 'Logloss'\n num_iterations: Number of trees to add to the ensemble.\n learning_rate: Step size shrinkage used in update to prevents overfitting.\n Default value is selected automatically for binary classification with other parameters set to default.\n In all other cases default is 0.03.\n depth: Depth of a tree. All trees are the same depth. Default = 6\n random_seed: Random number seed. 
Default = 0\n\n cat_features: A list of Categorical features (indices or names).\n text_features: A list of Text features (indices or names).\n additional_training_options: A dictionary with additional options to pass to CatBoostClassifier\n\n Outputs:\n model: Trained model in binary CatBoostModel format.\n\n Annotations:\n author: Alexey Volkov ") 147 | _parser.add_argument("--training-data", dest="training_data_path", type=str, required=True, default=argparse.SUPPRESS) 148 | _parser.add_argument("--starting-model", dest="starting_model_path", type=str, required=False, default=argparse.SUPPRESS) 149 | _parser.add_argument("--label-column", dest="label_column", type=int, required=False, default=argparse.SUPPRESS) 150 | _parser.add_argument("--loss-function", dest="loss_function", type=str, required=False, default=argparse.SUPPRESS) 151 | _parser.add_argument("--num-iterations", dest="num_iterations", type=int, required=False, default=argparse.SUPPRESS) 152 | _parser.add_argument("--learning-rate", dest="learning_rate", type=float, required=False, default=argparse.SUPPRESS) 153 | _parser.add_argument("--depth", dest="depth", type=int, required=False, default=argparse.SUPPRESS) 154 | _parser.add_argument("--random-seed", dest="random_seed", type=int, required=False, default=argparse.SUPPRESS) 155 | _parser.add_argument("--cat-features", dest="cat_features", type=json.loads, required=False, default=argparse.SUPPRESS) 156 | _parser.add_argument("--text-features", dest="text_features", type=json.loads, required=False, default=argparse.SUPPRESS) 157 | _parser.add_argument("--additional-training-options", dest="additional_training_options", type=json.loads, required=False, default=argparse.SUPPRESS) 158 | _parser.add_argument("--model", dest="model_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) 159 | _parsed_args = vars(_parser.parse_args()) 160 | 161 | _outputs = catboost_train_classifier(**_parsed_args) 162 | args: 163 | - --training-data 164 | - {inputPath: training_data} 165 | - if: 166 | cond: {isPresent: starting_model} 167 | then: 168 | - --starting-model 169 | - {inputPath: starting_model} 170 | - if: 171 | cond: {isPresent: label_column} 172 | then: 173 | - --label-column 174 | - {inputValue: label_column} 175 | - if: 176 | cond: {isPresent: loss_function} 177 | then: 178 | - --loss-function 179 | - {inputValue: loss_function} 180 | - if: 181 | cond: {isPresent: num_iterations} 182 | then: 183 | - --num-iterations 184 | - {inputValue: num_iterations} 185 | - if: 186 | cond: {isPresent: learning_rate} 187 | then: 188 | - --learning-rate 189 | - {inputValue: learning_rate} 190 | - if: 191 | cond: {isPresent: depth} 192 | then: 193 | - --depth 194 | - {inputValue: depth} 195 | - if: 196 | cond: {isPresent: random_seed} 197 | then: 198 | - --random-seed 199 | - {inputValue: random_seed} 200 | - if: 201 | cond: {isPresent: cat_features} 202 | then: 203 | - --cat-features 204 | - {inputValue: cat_features} 205 | - if: 206 | cond: {isPresent: text_features} 207 | then: 208 | - --text-features 209 | - {inputValue: text_features} 210 | - if: 211 | cond: {isPresent: additional_training_options} 212 | then: 213 | - --additional-training-options 214 | - {inputValue: additional_training_options} 215 | - --model 216 | - {outputPath: model} 217 | -------------------------------------------------------------------------------- /lesson5_control_structure/control.yaml: -------------------------------------------------------------------------------- 1 | 
apiVersion: argoproj.io/v1alpha1 2 | kind: Workflow 3 | metadata: 4 | generateName: conditional-execution-pipeline- 5 | annotations: {pipelines.kubeflow.org/kfp_sdk_version: 1.5.0-rc.2, pipelines.kubeflow.org/pipeline_compilation_time: '2021-04-04T17:39:09.256839', 6 | pipelines.kubeflow.org/pipeline_spec: '{"description": "Shows how to use dsl.Condition().", 7 | "name": "Conditional execution pipeline"}'} 8 | labels: {pipelines.kubeflow.org/kfp_sdk_version: 1.5.0-rc.2} 9 | spec: 10 | entrypoint: conditional-execution-pipeline 11 | templates: 12 | - name: condition-1 13 | dag: 14 | tasks: 15 | - name: condition-2 16 | template: condition-2 17 | when: '{{tasks.get-random-int-op.outputs.parameters.get-random-int-op-Output}} 18 | > 5' 19 | dependencies: [get-random-int-op] 20 | arguments: 21 | parameters: 22 | - {name: get-random-int-op-Output, value: '{{tasks.get-random-int-op.outputs.parameters.get-random-int-op-Output}}'} 23 | - name: condition-3 24 | template: condition-3 25 | when: '{{tasks.get-random-int-op.outputs.parameters.get-random-int-op-Output}} 26 | <= 5' 27 | dependencies: [get-random-int-op] 28 | arguments: 29 | parameters: 30 | - {name: get-random-int-op-Output, value: '{{tasks.get-random-int-op.outputs.parameters.get-random-int-op-Output}}'} 31 | - {name: get-random-int-op, template: get-random-int-op} 32 | - name: condition-2 33 | inputs: 34 | parameters: 35 | - {name: get-random-int-op-Output} 36 | dag: 37 | tasks: 38 | - name: print-op 39 | template: print-op 40 | arguments: 41 | parameters: 42 | - {name: get-random-int-op-Output, value: '{{inputs.parameters.get-random-int-op-Output}}'} 43 | - name: condition-3 44 | inputs: 45 | parameters: 46 | - {name: get-random-int-op-Output} 47 | dag: 48 | tasks: 49 | - name: print-op-2 50 | template: print-op-2 51 | arguments: 52 | parameters: 53 | - {name: get-random-int-op-Output, value: '{{inputs.parameters.get-random-int-op-Output}}'} 54 | - name: condition-4 55 | dag: 56 | tasks: 57 | - name: condition-5 58 | template: condition-5 59 | when: '{{tasks.get-random-int-op-2.outputs.parameters.get-random-int-op-2-Output}} 60 | > 15' 61 | dependencies: [get-random-int-op-2] 62 | arguments: 63 | parameters: 64 | - {name: get-random-int-op-2-Output, value: '{{tasks.get-random-int-op-2.outputs.parameters.get-random-int-op-2-Output}}'} 65 | - name: condition-6 66 | template: condition-6 67 | when: '{{tasks.get-random-int-op-2.outputs.parameters.get-random-int-op-2-Output}} 68 | <= 15' 69 | dependencies: [get-random-int-op-2] 70 | arguments: 71 | parameters: 72 | - {name: get-random-int-op-2-Output, value: '{{tasks.get-random-int-op-2.outputs.parameters.get-random-int-op-2-Output}}'} 73 | - {name: get-random-int-op-2, template: get-random-int-op-2} 74 | - name: condition-5 75 | inputs: 76 | parameters: 77 | - {name: get-random-int-op-2-Output} 78 | dag: 79 | tasks: 80 | - name: print-op-3 81 | template: print-op-3 82 | arguments: 83 | parameters: 84 | - {name: get-random-int-op-2-Output, value: '{{inputs.parameters.get-random-int-op-2-Output}}'} 85 | - name: condition-6 86 | inputs: 87 | parameters: 88 | - {name: get-random-int-op-2-Output} 89 | dag: 90 | tasks: 91 | - name: print-op-4 92 | template: print-op-4 93 | arguments: 94 | parameters: 95 | - {name: get-random-int-op-2-Output, value: '{{inputs.parameters.get-random-int-op-2-Output}}'} 96 | - name: conditional-execution-pipeline 97 | dag: 98 | tasks: 99 | - name: condition-1 100 | template: condition-1 101 | when: '"{{tasks.flip-coin-op.outputs.parameters.flip-coin-op-Output}}" == 
102 | "heads"' 103 | dependencies: [flip-coin-op] 104 | - name: condition-4 105 | template: condition-4 106 | when: '"{{tasks.flip-coin-op.outputs.parameters.flip-coin-op-Output}}" == 107 | "tails"' 108 | dependencies: [flip-coin-op] 109 | - {name: flip-coin-op, template: flip-coin-op} 110 | - name: flip-coin-op 111 | container: 112 | args: ['----output-paths', /tmp/outputs/Output/data] 113 | command: 114 | - sh 115 | - -ec 116 | - | 117 | program_path=$(mktemp) 118 | printf "%s" "$0" > "$program_path" 119 | python3 -u "$program_path" "$@" 120 | - | 121 | def flip_coin_op(): 122 | """Flip a coin and output heads or tails randomly.""" 123 | import random 124 | result = random.choice(['heads', 'tails']) 125 | print(result) 126 | return result 127 | 128 | def _serialize_str(str_value: str) -> str: 129 | if not isinstance(str_value, str): 130 | raise TypeError('Value "{}" has type "{}" instead of str.'.format(str(str_value), str(type(str_value)))) 131 | return str_value 132 | 133 | import argparse 134 | _parser = argparse.ArgumentParser(prog='Flip coin op', description='Flip a coin and output heads or tails randomly.') 135 | _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1) 136 | _parsed_args = vars(_parser.parse_args()) 137 | _output_files = _parsed_args.pop("_output_paths", []) 138 | 139 | _outputs = flip_coin_op(**_parsed_args) 140 | 141 | _outputs = [_outputs] 142 | 143 | _output_serializers = [ 144 | _serialize_str, 145 | 146 | ] 147 | 148 | import os 149 | for idx, output_file in enumerate(_output_files): 150 | try: 151 | os.makedirs(os.path.dirname(output_file)) 152 | except OSError: 153 | pass 154 | with open(output_file, 'w') as f: 155 | f.write(_output_serializers[idx](_outputs[idx])) 156 | image: python:3.7 157 | outputs: 158 | parameters: 159 | - name: flip-coin-op-Output 160 | valueFrom: {path: /tmp/outputs/Output/data} 161 | artifacts: 162 | - {name: flip-coin-op-Output, path: /tmp/outputs/Output/data} 163 | metadata: 164 | annotations: {pipelines.kubeflow.org/component_spec: '{"description": "Flip 165 | a coin and output heads or tails randomly.", "implementation": {"container": 166 | {"args": ["----output-paths", {"outputPath": "Output"}], "command": ["sh", 167 | "-ec", "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > \"$program_path\"\npython3 168 | -u \"$program_path\" \"$@\"\n", "def flip_coin_op():\n \"\"\"Flip a coin 169 | and output heads or tails randomly.\"\"\"\n import random\n result 170 | = random.choice([''heads'', ''tails''])\n print(result)\n return result\n\ndef 171 | _serialize_str(str_value: str) -> str:\n if not isinstance(str_value, 172 | str):\n raise TypeError(''Value \"{}\" has type \"{}\" instead of 173 | str.''.format(str(str_value), str(type(str_value))))\n return str_value\n\nimport 174 | argparse\n_parser = argparse.ArgumentParser(prog=''Flip coin op'', description=''Flip 175 | a coin and output heads or tails randomly.'')\n_parser.add_argument(\"----output-paths\", 176 | dest=\"_output_paths\", type=str, nargs=1)\n_parsed_args = vars(_parser.parse_args())\n_output_files 177 | = _parsed_args.pop(\"_output_paths\", [])\n\n_outputs = flip_coin_op(**_parsed_args)\n\n_outputs 178 | = [_outputs]\n\n_output_serializers = [\n _serialize_str,\n\n]\n\nimport 179 | os\nfor idx, output_file in enumerate(_output_files):\n try:\n os.makedirs(os.path.dirname(output_file))\n except 180 | OSError:\n pass\n with open(output_file, ''w'') as f:\n f.write(_output_serializers[idx](_outputs[idx]))\n"], 181 | "image": "python:3.7"}}, "name": 
"Flip coin op", "outputs": [{"name": "Output", 182 | "type": "String"}]}', pipelines.kubeflow.org/component_ref: '{}'} 183 | - name: get-random-int-op 184 | container: 185 | args: [--minimum, '0', --maximum, '9', '----output-paths', /tmp/outputs/Output/data] 186 | command: 187 | - sh 188 | - -ec 189 | - | 190 | program_path=$(mktemp) 191 | printf "%s" "$0" > "$program_path" 192 | python3 -u "$program_path" "$@" 193 | - | 194 | def get_random_int_op(minimum, maximum): 195 | """Generate a random number between minimum and maximum (inclusive).""" 196 | import random 197 | result = random.randint(minimum, maximum) 198 | print(result) 199 | return result 200 | 201 | def _serialize_int(int_value: int) -> str: 202 | if isinstance(int_value, str): 203 | return int_value 204 | if not isinstance(int_value, int): 205 | raise TypeError('Value "{}" has type "{}" instead of int.'.format(str(int_value), str(type(int_value)))) 206 | return str(int_value) 207 | 208 | import argparse 209 | _parser = argparse.ArgumentParser(prog='Get random int op', description='Generate a random number between minimum and maximum (inclusive).') 210 | _parser.add_argument("--minimum", dest="minimum", type=int, required=True, default=argparse.SUPPRESS) 211 | _parser.add_argument("--maximum", dest="maximum", type=int, required=True, default=argparse.SUPPRESS) 212 | _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1) 213 | _parsed_args = vars(_parser.parse_args()) 214 | _output_files = _parsed_args.pop("_output_paths", []) 215 | 216 | _outputs = get_random_int_op(**_parsed_args) 217 | 218 | _outputs = [_outputs] 219 | 220 | _output_serializers = [ 221 | _serialize_int, 222 | 223 | ] 224 | 225 | import os 226 | for idx, output_file in enumerate(_output_files): 227 | try: 228 | os.makedirs(os.path.dirname(output_file)) 229 | except OSError: 230 | pass 231 | with open(output_file, 'w') as f: 232 | f.write(_output_serializers[idx](_outputs[idx])) 233 | image: python:3.7 234 | outputs: 235 | parameters: 236 | - name: get-random-int-op-Output 237 | valueFrom: {path: /tmp/outputs/Output/data} 238 | artifacts: 239 | - {name: get-random-int-op-Output, path: /tmp/outputs/Output/data} 240 | metadata: 241 | annotations: {pipelines.kubeflow.org/component_spec: '{"description": "Generate 242 | a random number between minimum and maximum (inclusive).", "implementation": 243 | {"container": {"args": ["--minimum", {"inputValue": "minimum"}, "--maximum", 244 | {"inputValue": "maximum"}, "----output-paths", {"outputPath": "Output"}], 245 | "command": ["sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > 246 | \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", "def get_random_int_op(minimum, 247 | maximum):\n \"\"\"Generate a random number between minimum and maximum 248 | (inclusive).\"\"\"\n import random\n result = random.randint(minimum, 249 | maximum)\n print(result)\n return result\n\ndef _serialize_int(int_value: 250 | int) -> str:\n if isinstance(int_value, str):\n return int_value\n if 251 | not isinstance(int_value, int):\n raise TypeError(''Value \"{}\" 252 | has type \"{}\" instead of int.''.format(str(int_value), str(type(int_value))))\n return 253 | str(int_value)\n\nimport argparse\n_parser = argparse.ArgumentParser(prog=''Get 254 | random int op'', description=''Generate a random number between minimum 255 | and maximum (inclusive).'')\n_parser.add_argument(\"--minimum\", dest=\"minimum\", 256 | type=int, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--maximum\", 257 
| dest=\"maximum\", type=int, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"----output-paths\", 258 | dest=\"_output_paths\", type=str, nargs=1)\n_parsed_args = vars(_parser.parse_args())\n_output_files 259 | = _parsed_args.pop(\"_output_paths\", [])\n\n_outputs = get_random_int_op(**_parsed_args)\n\n_outputs 260 | = [_outputs]\n\n_output_serializers = [\n _serialize_int,\n\n]\n\nimport 261 | os\nfor idx, output_file in enumerate(_output_files):\n try:\n os.makedirs(os.path.dirname(output_file))\n except 262 | OSError:\n pass\n with open(output_file, ''w'') as f:\n f.write(_output_serializers[idx](_outputs[idx]))\n"], 263 | "image": "python:3.7"}}, "inputs": [{"name": "minimum", "type": "Integer"}, 264 | {"name": "maximum", "type": "Integer"}], "name": "Get random int op", "outputs": 265 | [{"name": "Output", "type": "Integer"}]}', pipelines.kubeflow.org/component_ref: '{}', 266 | pipelines.kubeflow.org/arguments.parameters: '{"maximum": "9", "minimum": 267 | "0"}'} 268 | - name: get-random-int-op-2 269 | container: 270 | args: [--minimum, '10', --maximum, '19', '----output-paths', /tmp/outputs/Output/data] 271 | command: 272 | - sh 273 | - -ec 274 | - | 275 | program_path=$(mktemp) 276 | printf "%s" "$0" > "$program_path" 277 | python3 -u "$program_path" "$@" 278 | - | 279 | def get_random_int_op(minimum, maximum): 280 | """Generate a random number between minimum and maximum (inclusive).""" 281 | import random 282 | result = random.randint(minimum, maximum) 283 | print(result) 284 | return result 285 | 286 | def _serialize_int(int_value: int) -> str: 287 | if isinstance(int_value, str): 288 | return int_value 289 | if not isinstance(int_value, int): 290 | raise TypeError('Value "{}" has type "{}" instead of int.'.format(str(int_value), str(type(int_value)))) 291 | return str(int_value) 292 | 293 | import argparse 294 | _parser = argparse.ArgumentParser(prog='Get random int op', description='Generate a random number between minimum and maximum (inclusive).') 295 | _parser.add_argument("--minimum", dest="minimum", type=int, required=True, default=argparse.SUPPRESS) 296 | _parser.add_argument("--maximum", dest="maximum", type=int, required=True, default=argparse.SUPPRESS) 297 | _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1) 298 | _parsed_args = vars(_parser.parse_args()) 299 | _output_files = _parsed_args.pop("_output_paths", []) 300 | 301 | _outputs = get_random_int_op(**_parsed_args) 302 | 303 | _outputs = [_outputs] 304 | 305 | _output_serializers = [ 306 | _serialize_int, 307 | 308 | ] 309 | 310 | import os 311 | for idx, output_file in enumerate(_output_files): 312 | try: 313 | os.makedirs(os.path.dirname(output_file)) 314 | except OSError: 315 | pass 316 | with open(output_file, 'w') as f: 317 | f.write(_output_serializers[idx](_outputs[idx])) 318 | image: python:3.7 319 | outputs: 320 | parameters: 321 | - name: get-random-int-op-2-Output 322 | valueFrom: {path: /tmp/outputs/Output/data} 323 | artifacts: 324 | - {name: get-random-int-op-2-Output, path: /tmp/outputs/Output/data} 325 | metadata: 326 | annotations: {pipelines.kubeflow.org/component_spec: '{"description": "Generate 327 | a random number between minimum and maximum (inclusive).", "implementation": 328 | {"container": {"args": ["--minimum", {"inputValue": "minimum"}, "--maximum", 329 | {"inputValue": "maximum"}, "----output-paths", {"outputPath": "Output"}], 330 | "command": ["sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > 331 | \"$program_path\"\npython3 
-u \"$program_path\" \"$@\"\n", "def get_random_int_op(minimum, 332 | maximum):\n \"\"\"Generate a random number between minimum and maximum 333 | (inclusive).\"\"\"\n import random\n result = random.randint(minimum, 334 | maximum)\n print(result)\n return result\n\ndef _serialize_int(int_value: 335 | int) -> str:\n if isinstance(int_value, str):\n return int_value\n if 336 | not isinstance(int_value, int):\n raise TypeError(''Value \"{}\" 337 | has type \"{}\" instead of int.''.format(str(int_value), str(type(int_value))))\n return 338 | str(int_value)\n\nimport argparse\n_parser = argparse.ArgumentParser(prog=''Get 339 | random int op'', description=''Generate a random number between minimum 340 | and maximum (inclusive).'')\n_parser.add_argument(\"--minimum\", dest=\"minimum\", 341 | type=int, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--maximum\", 342 | dest=\"maximum\", type=int, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"----output-paths\", 343 | dest=\"_output_paths\", type=str, nargs=1)\n_parsed_args = vars(_parser.parse_args())\n_output_files 344 | = _parsed_args.pop(\"_output_paths\", [])\n\n_outputs = get_random_int_op(**_parsed_args)\n\n_outputs 345 | = [_outputs]\n\n_output_serializers = [\n _serialize_int,\n\n]\n\nimport 346 | os\nfor idx, output_file in enumerate(_output_files):\n try:\n os.makedirs(os.path.dirname(output_file))\n except 347 | OSError:\n pass\n with open(output_file, ''w'') as f:\n f.write(_output_serializers[idx](_outputs[idx]))\n"], 348 | "image": "python:3.7"}}, "inputs": [{"name": "minimum", "type": "Integer"}, 349 | {"name": "maximum", "type": "Integer"}], "name": "Get random int op", "outputs": 350 | [{"name": "Output", "type": "Integer"}]}', pipelines.kubeflow.org/component_ref: '{}', 351 | pipelines.kubeflow.org/arguments.parameters: '{"maximum": "19", "minimum": 352 | "10"}'} 353 | - name: print-op 354 | container: 355 | args: [--message, 'heads and {{inputs.parameters.get-random-int-op-Output}} 356 | > 5!'] 357 | command: 358 | - sh 359 | - -ec 360 | - | 361 | program_path=$(mktemp) 362 | printf "%s" "$0" > "$program_path" 363 | python3 -u "$program_path" "$@" 364 | - | 365 | def print_op(message): 366 | """Print a message.""" 367 | print(message) 368 | 369 | import argparse 370 | _parser = argparse.ArgumentParser(prog='Print op', description='Print a message.') 371 | _parser.add_argument("--message", dest="message", type=str, required=True, default=argparse.SUPPRESS) 372 | _parsed_args = vars(_parser.parse_args()) 373 | 374 | _outputs = print_op(**_parsed_args) 375 | image: python:3.7 376 | inputs: 377 | parameters: 378 | - {name: get-random-int-op-Output} 379 | metadata: 380 | annotations: {pipelines.kubeflow.org/component_spec: '{"description": "Print 381 | a message.", "implementation": {"container": {"args": ["--message", {"inputValue": 382 | "message"}], "command": ["sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" 383 | \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", "def 384 | print_op(message):\n \"\"\"Print a message.\"\"\"\n print(message)\n\nimport 385 | argparse\n_parser = argparse.ArgumentParser(prog=''Print op'', description=''Print 386 | a message.'')\n_parser.add_argument(\"--message\", dest=\"message\", type=str, 387 | required=True, default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\n_outputs 388 | = print_op(**_parsed_args)\n"], "image": "python:3.7"}}, "inputs": [{"name": 389 | "message", "type": "String"}], "name": "Print op"}', 
pipelines.kubeflow.org/component_ref: '{}', 390 | pipelines.kubeflow.org/arguments.parameters: '{"message": "heads and {{inputs.parameters.get-random-int-op-Output}} 391 | > 5!"}'} 392 | - name: print-op-2 393 | container: 394 | args: [--message, 'heads and {{inputs.parameters.get-random-int-op-Output}} 395 | <= 5!'] 396 | command: 397 | - sh 398 | - -ec 399 | - | 400 | program_path=$(mktemp) 401 | printf "%s" "$0" > "$program_path" 402 | python3 -u "$program_path" "$@" 403 | - | 404 | def print_op(message): 405 | """Print a message.""" 406 | print(message) 407 | 408 | import argparse 409 | _parser = argparse.ArgumentParser(prog='Print op', description='Print a message.') 410 | _parser.add_argument("--message", dest="message", type=str, required=True, default=argparse.SUPPRESS) 411 | _parsed_args = vars(_parser.parse_args()) 412 | 413 | _outputs = print_op(**_parsed_args) 414 | image: python:3.7 415 | inputs: 416 | parameters: 417 | - {name: get-random-int-op-Output} 418 | metadata: 419 | annotations: {pipelines.kubeflow.org/component_spec: '{"description": "Print 420 | a message.", "implementation": {"container": {"args": ["--message", {"inputValue": 421 | "message"}], "command": ["sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" 422 | \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", "def 423 | print_op(message):\n \"\"\"Print a message.\"\"\"\n print(message)\n\nimport 424 | argparse\n_parser = argparse.ArgumentParser(prog=''Print op'', description=''Print 425 | a message.'')\n_parser.add_argument(\"--message\", dest=\"message\", type=str, 426 | required=True, default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\n_outputs 427 | = print_op(**_parsed_args)\n"], "image": "python:3.7"}}, "inputs": [{"name": 428 | "message", "type": "String"}], "name": "Print op"}', pipelines.kubeflow.org/component_ref: '{}', 429 | pipelines.kubeflow.org/arguments.parameters: '{"message": "heads and {{inputs.parameters.get-random-int-op-Output}} 430 | <= 5!"}'} 431 | - name: print-op-3 432 | container: 433 | args: [--message, 'tails and {{inputs.parameters.get-random-int-op-2-Output}} 434 | > 15!'] 435 | command: 436 | - sh 437 | - -ec 438 | - | 439 | program_path=$(mktemp) 440 | printf "%s" "$0" > "$program_path" 441 | python3 -u "$program_path" "$@" 442 | - | 443 | def print_op(message): 444 | """Print a message.""" 445 | print(message) 446 | 447 | import argparse 448 | _parser = argparse.ArgumentParser(prog='Print op', description='Print a message.') 449 | _parser.add_argument("--message", dest="message", type=str, required=True, default=argparse.SUPPRESS) 450 | _parsed_args = vars(_parser.parse_args()) 451 | 452 | _outputs = print_op(**_parsed_args) 453 | image: python:3.7 454 | inputs: 455 | parameters: 456 | - {name: get-random-int-op-2-Output} 457 | metadata: 458 | annotations: {pipelines.kubeflow.org/component_spec: '{"description": "Print 459 | a message.", "implementation": {"container": {"args": ["--message", {"inputValue": 460 | "message"}], "command": ["sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" 461 | \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", "def 462 | print_op(message):\n \"\"\"Print a message.\"\"\"\n print(message)\n\nimport 463 | argparse\n_parser = argparse.ArgumentParser(prog=''Print op'', description=''Print 464 | a message.'')\n_parser.add_argument(\"--message\", dest=\"message\", type=str, 465 | required=True, default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\n_outputs 466 | = 
print_op(**_parsed_args)\n"], "image": "python:3.7"}}, "inputs": [{"name": 467 | "message", "type": "String"}], "name": "Print op"}', pipelines.kubeflow.org/component_ref: '{}', 468 | pipelines.kubeflow.org/arguments.parameters: '{"message": "tails and {{inputs.parameters.get-random-int-op-2-Output}} 469 | > 15!"}'} 470 | - name: print-op-4 471 | container: 472 | args: [--message, 'tails and {{inputs.parameters.get-random-int-op-2-Output}} 473 | <= 15!'] 474 | command: 475 | - sh 476 | - -ec 477 | - | 478 | program_path=$(mktemp) 479 | printf "%s" "$0" > "$program_path" 480 | python3 -u "$program_path" "$@" 481 | - | 482 | def print_op(message): 483 | """Print a message.""" 484 | print(message) 485 | 486 | import argparse 487 | _parser = argparse.ArgumentParser(prog='Print op', description='Print a message.') 488 | _parser.add_argument("--message", dest="message", type=str, required=True, default=argparse.SUPPRESS) 489 | _parsed_args = vars(_parser.parse_args()) 490 | 491 | _outputs = print_op(**_parsed_args) 492 | image: python:3.7 493 | inputs: 494 | parameters: 495 | - {name: get-random-int-op-2-Output} 496 | metadata: 497 | annotations: {pipelines.kubeflow.org/component_spec: '{"description": "Print 498 | a message.", "implementation": {"container": {"args": ["--message", {"inputValue": 499 | "message"}], "command": ["sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" 500 | \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", "def 501 | print_op(message):\n \"\"\"Print a message.\"\"\"\n print(message)\n\nimport 502 | argparse\n_parser = argparse.ArgumentParser(prog=''Print op'', description=''Print 503 | a message.'')\n_parser.add_argument(\"--message\", dest=\"message\", type=str, 504 | required=True, default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\n_outputs 505 | = print_op(**_parsed_args)\n"], "image": "python:3.7"}}, "inputs": [{"name": 506 | "message", "type": "String"}], "name": "Print op"}', pipelines.kubeflow.org/component_ref: '{}', 507 | pipelines.kubeflow.org/arguments.parameters: '{"message": "tails and {{inputs.parameters.get-random-int-op-2-Output}} 508 | <= 15!"}'} 509 | arguments: 510 | parameters: [] 511 | serviceAccountName: pipeline-runner 512 | -------------------------------------------------------------------------------- /train_until_good/train_until_good.py.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: Workflow 3 | metadata: 4 | generateName: train-until-good-pipeline- 5 | annotations: {pipelines.kubeflow.org/kfp_sdk_version: 1.4.0, pipelines.kubeflow.org/pipeline_compilation_time: '2021-03-25T08:14:34.624430', 6 | pipelines.kubeflow.org/pipeline_spec: '{"name": "Train until good pipeline"}'} 7 | labels: {pipelines.kubeflow.org/kfp_sdk_version: 1.4.0} 8 | spec: 9 | entrypoint: train-until-good-pipeline 10 | templates: 11 | - name: calculate-regression-metrics-from-csv 12 | container: 13 | args: [--true-values, /tmp/inputs/true_values/data, --predicted-values, /tmp/inputs/predicted_values/data, 14 | '----output-paths', /tmp/outputs/max_absolute_error/data, /tmp/outputs/mean_absolute_error/data, 15 | /tmp/outputs/mean_squared_error/data, /tmp/outputs/root_mean_squared_error/data] 16 | command: 17 | - sh 18 | - -c 19 | - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 20 | 'numpy==1.19.0' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install 21 | --quiet --no-warn-script-location 
'numpy==1.19.0' --user) && "$0" "$@" 22 | - python3 23 | - -u 24 | - -c 25 | - | 26 | def calculate_regression_metrics_from_csv( 27 | true_values_path, 28 | predicted_values_path, 29 | ): 30 | '''Calculates regression metrics. 31 | 32 | Annotations: 33 | author: Alexey Volkov 34 | ''' 35 | import math 36 | import numpy 37 | 38 | true_values = numpy.loadtxt(true_values_path, dtype=numpy.float64) 39 | predicted_values = numpy.loadtxt(predicted_values_path, dtype=numpy.float64) 40 | 41 | if len(predicted_values.shape) != 1: 42 | raise NotImplementedError('Only single prediction values are supported.') 43 | if len(true_values.shape) != 1: 44 | raise NotImplementedError('Only single true values are supported.') 45 | 46 | if predicted_values.shape != true_values.shape: 47 | raise ValueError('Input shapes are different: {} != {}'.format(predicted_values.shape, true_values.shape)) 48 | 49 | num_true_values = true_values 50 | errors = (true_values - predicted_values) 51 | abs_errors = numpy.abs(errors) 52 | squared_errors = errors ** 2 53 | max_absolute_error = numpy.max(abs_errors) 54 | mean_absolute_error = numpy.average(abs_errors) 55 | mean_squared_error = numpy.average(squared_errors) 56 | root_mean_squared_error = math.sqrt(mean_squared_error) 57 | 58 | return ( 59 | max_absolute_error, 60 | mean_absolute_error, 61 | mean_squared_error, 62 | root_mean_squared_error, 63 | ) 64 | 65 | def _serialize_float(float_value: float) -> str: 66 | if isinstance(float_value, str): 67 | return float_value 68 | if not isinstance(float_value, (float, int)): 69 | raise TypeError('Value "{}" has type "{}" instead of float.'.format(str(float_value), str(type(float_value)))) 70 | return str(float_value) 71 | 72 | import argparse 73 | _parser = argparse.ArgumentParser(prog='Calculate regression metrics from csv', description='Calculates regression metrics.\n\n Annotations:\n author: Alexey Volkov ') 74 | _parser.add_argument("--true-values", dest="true_values_path", type=str, required=True, default=argparse.SUPPRESS) 75 | _parser.add_argument("--predicted-values", dest="predicted_values_path", type=str, required=True, default=argparse.SUPPRESS) 76 | _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=4) 77 | _parsed_args = vars(_parser.parse_args()) 78 | _output_files = _parsed_args.pop("_output_paths", []) 79 | 80 | _outputs = calculate_regression_metrics_from_csv(**_parsed_args) 81 | 82 | _output_serializers = [ 83 | _serialize_float, 84 | _serialize_float, 85 | _serialize_float, 86 | _serialize_float, 87 | 88 | ] 89 | 90 | import os 91 | for idx, output_file in enumerate(_output_files): 92 | try: 93 | os.makedirs(os.path.dirname(output_file)) 94 | except OSError: 95 | pass 96 | with open(output_file, 'w') as f: 97 | f.write(_output_serializers[idx](_outputs[idx])) 98 | image: python:3.7 99 | inputs: 100 | artifacts: 101 | - {name: xgboost-predict-predictions, path: /tmp/inputs/predicted_values/data} 102 | - {name: remove-header-table, path: /tmp/inputs/true_values/data} 103 | outputs: 104 | parameters: 105 | - name: calculate-regression-metrics-from-csv-mean_squared_error 106 | valueFrom: {path: /tmp/outputs/mean_squared_error/data} 107 | artifacts: 108 | - {name: calculate-regression-metrics-from-csv-max_absolute_error, path: /tmp/outputs/max_absolute_error/data} 109 | - {name: calculate-regression-metrics-from-csv-mean_absolute_error, path: /tmp/outputs/mean_absolute_error/data} 110 | - {name: calculate-regression-metrics-from-csv-mean_squared_error, path: 
/tmp/outputs/mean_squared_error/data} 111 | - {name: calculate-regression-metrics-from-csv-root_mean_squared_error, path: /tmp/outputs/root_mean_squared_error/data} 112 | metadata: 113 | annotations: {pipelines.kubeflow.org/component_spec: '{"description": "Calculates 114 | regression metrics.\n\n Annotations:\n author: Alexey Volkov ", 115 | "implementation": {"container": {"args": ["--true-values", {"inputPath": 116 | "true_values"}, "--predicted-values", {"inputPath": "predicted_values"}, 117 | "----output-paths", {"outputPath": "max_absolute_error"}, {"outputPath": 118 | "mean_absolute_error"}, {"outputPath": "mean_squared_error"}, {"outputPath": 119 | "root_mean_squared_error"}], "command": ["sh", "-c", "(PIP_DISABLE_PIP_VERSION_CHECK=1 120 | python3 -m pip install --quiet --no-warn-script-location ''numpy==1.19.0'' 121 | || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 122 | ''numpy==1.19.0'' --user) && \"$0\" \"$@\"", "python3", "-u", "-c", "def 123 | calculate_regression_metrics_from_csv(\n true_values_path,\n predicted_values_path,\n):\n ''''''Calculates 124 | regression metrics.\n\n Annotations:\n author: Alexey Volkov \n ''''''\n import 125 | math\n import numpy\n\n true_values = numpy.loadtxt(true_values_path, 126 | dtype=numpy.float64)\n predicted_values = numpy.loadtxt(predicted_values_path, 127 | dtype=numpy.float64)\n\n if len(predicted_values.shape) != 1:\n raise 128 | NotImplemented(''Only single prediction values are supported.'')\n if 129 | len(true_values.shape) != 1:\n raise NotImplemented(''Only single 130 | true values are supported.'')\n\n if predicted_values.shape != true_values.shape:\n raise 131 | ValueError(''Input shapes are different: {} != {}''.format(predicted_values.shape, 132 | true_values.shape))\n\n num_true_values = true_values\n errors = (true_values 133 | - predicted_values)\n abs_errors = numpy.abs(errors)\n squared_errors 134 | = errors ** 2\n max_absolute_error = numpy.max(abs_errors)\n mean_absolute_error 135 | = numpy.average(abs_errors)\n mean_squared_error = numpy.average(squared_errors)\n root_mean_squared_error 136 | = math.sqrt(mean_squared_error)\n\n return (\n max_absolute_error,\n mean_absolute_error,\n mean_squared_error,\n root_mean_squared_error,\n )\n\ndef 137 | _serialize_float(float_value: float) -> str:\n if isinstance(float_value, 138 | str):\n return float_value\n if not isinstance(float_value, (float, 139 | int)):\n raise TypeError(''Value \"{}\" has type \"{}\" instead of 140 | float.''.format(str(float_value), str(type(float_value))))\n return str(float_value)\n\nimport 141 | argparse\n_parser = argparse.ArgumentParser(prog=''Calculate regression 142 | metrics from csv'', description=''Calculates regression metrics.\\n\\n Annotations:\\n author: 143 | Alexey Volkov '')\n_parser.add_argument(\"--true-values\", 144 | dest=\"true_values_path\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--predicted-values\", 145 | dest=\"predicted_values_path\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"----output-paths\", 146 | dest=\"_output_paths\", type=str, nargs=4)\n_parsed_args = vars(_parser.parse_args())\n_output_files 147 | = _parsed_args.pop(\"_output_paths\", [])\n\n_outputs = calculate_regression_metrics_from_csv(**_parsed_args)\n\n_output_serializers 148 | = [\n _serialize_float,\n _serialize_float,\n _serialize_float,\n _serialize_float,\n\n]\n\nimport 149 | os\nfor idx, output_file in enumerate(_output_files):\n try:\n 
os.makedirs(os.path.dirname(output_file))\n except 150 | OSError:\n pass\n with open(output_file, ''w'') as f:\n f.write(_output_serializers[idx](_outputs[idx]))\n"], 151 | "image": "python:3.7"}}, "inputs": [{"name": "true_values"}, {"name": "predicted_values"}], 152 | "name": "Calculate regression metrics from csv", "outputs": [{"name": "max_absolute_error", 153 | "type": "Float"}, {"name": "mean_absolute_error", "type": "Float"}, {"name": 154 | "mean_squared_error", "type": "Float"}, {"name": "root_mean_squared_error", 155 | "type": "Float"}]}', pipelines.kubeflow.org/component_ref: '{"digest": "f326bddad865f292b6e67b0edc485649b13f5fa74b1546584974274c2bced3e1", 156 | "url": "https://raw.githubusercontent.com/kubeflow/pipelines/616542ac0f789914f4eb53438da713dd3004fba4/components/ml_metrics/Calculate_regression_metrics/from_CSV/component.yaml"}'} 157 | - name: chicago-taxi-trips-dataset 158 | container: 159 | args: [] 160 | command: 161 | - sh 162 | - -c 163 | - | 164 | set -e -x -o pipefail 165 | output_path="$0" 166 | select="$1" 167 | where="$2" 168 | limit="$3" 169 | format="$4" 170 | mkdir -p "$(dirname "$output_path")" 171 | curl --get 'https://data.cityofchicago.org/resource/wrvz-psew.'"${format}" \ 172 | --data-urlencode '$limit='"${limit}" \ 173 | --data-urlencode '$where='"${where}" \ 174 | --data-urlencode '$select='"${select}" \ 175 | | tr -d '"' > "$output_path" # Removing unneeded quotes around all numbers 176 | - /tmp/outputs/Table/data 177 | - tips,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tolls,extras,trip_total 178 | - trip_start_timestamp >= "2019-01-01" AND trip_start_timestamp < "2019-02-01" 179 | - '10000' 180 | - csv 181 | image: curlimages/curl 182 | outputs: 183 | artifacts: 184 | - {name: chicago-taxi-trips-dataset-Table, path: /tmp/outputs/Table/data} 185 | metadata: 186 | annotations: {author: Alexey Volkov , pipelines.kubeflow.org/component_spec: '{"description": 187 | "City of Chicago Taxi Trips dataset: https://data.cityofchicago.org/Transportation/Taxi-Trips/wrvz-psew\n\nThe 188 | input parameters configure the SQL query to the database.\nThe dataset is 189 | pretty big, so limit the number of results using the `Limit` or `Where` 190 | parameters.\nRead [Socrata dev](https://dev.socrata.com/docs/queries/) for 191 | the advanced query syntax\n", "implementation": {"container": {"command": 192 | ["sh", "-c", "set -e -x -o pipefail\noutput_path=\"$0\"\nselect=\"$1\"\nwhere=\"$2\"\nlimit=\"$3\"\nformat=\"$4\"\nmkdir 193 | -p \"$(dirname \"$output_path\")\"\ncurl --get ''https://data.cityofchicago.org/resource/wrvz-psew.''\"${format}\" 194 | \\\n --data-urlencode ''$limit=''\"${limit}\" \\\n --data-urlencode 195 | ''$where=''\"${where}\" \\\n --data-urlencode ''$select=''\"${select}\" 196 | \\\n | tr -d ''\"'' > \"$output_path\" # Removing unneeded quotes around 197 | all numbers\n", {"outputPath": "Table"}, {"inputValue": "Select"}, {"inputValue": 198 | "Where"}, {"inputValue": "Limit"}, {"inputValue": "Format"}], "image": "curlimages/curl"}}, 199 | "inputs": [{"default": "trip_start_timestamp>=\"1900-01-01\" AND trip_start_timestamp<\"2100-01-01\"", 200 | "name": "Where", "type": "String"}, {"default": "1000", "description": "Number 201 | of rows to return. 
The rows are randomly sampled.", "name": "Limit", "type": 202 | "Integer"}, {"default": "trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_census_tract,dropoff_census_tract,pickup_community_area,dropoff_community_area,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,pickup_centroid_location,dropoff_centroid_latitude,dropoff_centroid_longitude,dropoff_centroid_location", 203 | "name": "Select", "type": "String"}, {"default": "csv", "description": "Output 204 | data format. Supports csv,tsv,xml,rdf,json", "name": "Format", "type": "String"}], 205 | "metadata": {"annotations": {"author": "Alexey Volkov "}}, 206 | "name": "Chicago Taxi Trips dataset", "outputs": [{"description": "Result 207 | type depends on format. CSV and TSV have header.", "name": "Table"}]}', 208 | pipelines.kubeflow.org/component_ref: '{"digest": "ecf2f2840c57bd9cb2778c8f529da9b938b81f59294b3f7271cb23b363640343", 209 | "url": "https://raw.githubusercontent.com/kubeflow/pipelines/e3337b8bdcd63636934954e592d4b32c95b49129/components/datasets/Chicago%20Taxi/component.yaml"}', 210 | pipelines.kubeflow.org/arguments.parameters: '{"Format": "csv", "Limit": "10000", 211 | "Select": "tips,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tolls,extras,trip_total", 212 | "Where": "trip_start_timestamp >= \"2019-01-01\" AND trip_start_timestamp 213 | < \"2019-02-01\""}'} 214 | - name: condition-2 215 | inputs: 216 | artifacts: 217 | - {name: chicago-taxi-trips-dataset-Table} 218 | - {name: remove-header-table} 219 | - {name: xgboost-train-2-model} 220 | dag: 221 | tasks: 222 | - name: graph-train-until-low-error-1 223 | template: graph-train-until-low-error-1 224 | arguments: 225 | artifacts: 226 | - {name: chicago-taxi-trips-dataset-Table, from: '{{inputs.artifacts.chicago-taxi-trips-dataset-Table}}'} 227 | - {name: remove-header-table, from: '{{inputs.artifacts.remove-header-table}}'} 228 | - {name: xgboost-train-model, from: '{{inputs.artifacts.xgboost-train-2-model}}'} 229 | - name: graph-train-until-low-error-1 230 | inputs: 231 | artifacts: 232 | - {name: chicago-taxi-trips-dataset-Table} 233 | - {name: remove-header-table} 234 | - {name: xgboost-train-model} 235 | dag: 236 | tasks: 237 | - name: calculate-regression-metrics-from-csv 238 | template: calculate-regression-metrics-from-csv 239 | dependencies: [xgboost-predict] 240 | arguments: 241 | artifacts: 242 | - {name: remove-header-table, from: '{{inputs.artifacts.remove-header-table}}'} 243 | - {name: xgboost-predict-predictions, from: '{{tasks.xgboost-predict.outputs.artifacts.xgboost-predict-predictions}}'} 244 | - name: condition-2 245 | template: condition-2 246 | when: '{{tasks.calculate-regression-metrics-from-csv.outputs.parameters.calculate-regression-metrics-from-csv-mean_squared_error}} 247 | > 0.01' 248 | dependencies: [calculate-regression-metrics-from-csv, xgboost-train-2] 249 | arguments: 250 | artifacts: 251 | - {name: chicago-taxi-trips-dataset-Table, from: '{{inputs.artifacts.chicago-taxi-trips-dataset-Table}}'} 252 | - {name: remove-header-table, from: '{{inputs.artifacts.remove-header-table}}'} 253 | - {name: xgboost-train-2-model, from: '{{tasks.xgboost-train-2.outputs.artifacts.xgboost-train-2-model}}'} 254 | - name: xgboost-predict 255 | template: xgboost-predict 256 | dependencies: [xgboost-train-2] 257 | arguments: 258 | artifacts: 259 | - {name: chicago-taxi-trips-dataset-Table, from: 
'{{inputs.artifacts.chicago-taxi-trips-dataset-Table}}'} 260 | - {name: xgboost-train-2-model, from: '{{tasks.xgboost-train-2.outputs.artifacts.xgboost-train-2-model}}'} 261 | - name: xgboost-train-2 262 | template: xgboost-train-2 263 | arguments: 264 | artifacts: 265 | - {name: chicago-taxi-trips-dataset-Table, from: '{{inputs.artifacts.chicago-taxi-trips-dataset-Table}}'} 266 | - {name: xgboost-train-model, from: '{{inputs.artifacts.xgboost-train-model}}'} 267 | - name: pandas-transform-dataframe-in-csv-format 268 | container: 269 | args: [--table, /tmp/inputs/table/data, --transform-code, 'df = df[["tips"]]', 270 | --transformed-table, /tmp/outputs/transformed_table/data] 271 | command: 272 | - sh 273 | - -c 274 | - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 275 | 'pandas==1.0.4' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install 276 | --quiet --no-warn-script-location 'pandas==1.0.4' --user) && "$0" "$@" 277 | - python3 278 | - -u 279 | - -c 280 | - | 281 | def _make_parent_dirs_and_return_path(file_path: str): 282 | import os 283 | os.makedirs(os.path.dirname(file_path), exist_ok=True) 284 | return file_path 285 | 286 | def Pandas_Transform_DataFrame_in_CSV_format( 287 | table_path, 288 | transformed_table_path, 289 | transform_code, 290 | ): 291 | '''Transform DataFrame loaded from a CSV file. 292 | 293 | Inputs: 294 | table: Table to transform. 295 | transform_code: Transformation code. Code is written in Python and can consist of multiple lines. 296 | The DataFrame variable is called "df". 297 | Examples: 298 | - `df['prod'] = df['X'] * df['Y']` 299 | - `df = df[['X', 'prod']]` 300 | - `df.insert(0, "is_positive", df["X"] > 0)` 301 | 302 | Outputs: 303 | transformed_table: Transformed table. 304 | 305 | Annotations: 306 | author: Alexey Volkov 307 | ''' 308 | import pandas 309 | 310 | df = pandas.read_csv( 311 | table_path, 312 | ) 313 | # The namespace is needed so that the code can replace `df`. For example df = df[['X']] 314 | namespace = locals() 315 | exec(transform_code, namespace) 316 | namespace['df'].to_csv( 317 | transformed_table_path, 318 | index=False, 319 | ) 320 | 321 | import argparse 322 | _parser = argparse.ArgumentParser(prog='Pandas Transform DataFrame in CSV format', description='Transform DataFrame loaded from a CSV file.\n\n Inputs:\n table: Table to transform.\n transform_code: Transformation code. 
Code is written in Python and can consist of multiple lines.\n The DataFrame variable is called "df".\n Examples:\n - `df[\'prod\'] = df[\'X\'] * df[\'Y\']`\n - `df = df[[\'X\', \'prod\']]`\n - `df.insert(0, "is_positive", df["X"] > 0)`\n\n Outputs:\n transformed_table: Transformed table.\n\n Annotations:\n author: Alexey Volkov ') 323 | _parser.add_argument("--table", dest="table_path", type=str, required=True, default=argparse.SUPPRESS) 324 | _parser.add_argument("--transform-code", dest="transform_code", type=str, required=True, default=argparse.SUPPRESS) 325 | _parser.add_argument("--transformed-table", dest="transformed_table_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) 326 | _parsed_args = vars(_parser.parse_args()) 327 | 328 | _outputs = Pandas_Transform_DataFrame_in_CSV_format(**_parsed_args) 329 | image: python:3.7 330 | inputs: 331 | artifacts: 332 | - {name: chicago-taxi-trips-dataset-Table, path: /tmp/inputs/table/data} 333 | outputs: 334 | artifacts: 335 | - {name: pandas-transform-dataframe-in-csv-format-transformed_table, path: /tmp/outputs/transformed_table/data} 336 | metadata: 337 | annotations: {pipelines.kubeflow.org/component_spec: '{"description": "Transform 338 | DataFrame loaded from a CSV file.\n\n Inputs:\n table: Table to 339 | transform.\n transform_code: Transformation code. Code is written 340 | in Python and can consist of multiple lines.\n The DataFrame 341 | variable is called \"df\".\n Examples:\n - `df[''prod''] 342 | = df[''X''] * df[''Y'']`\n - `df = df[[''X'', ''prod'']]`\n - 343 | `df.insert(0, \"is_positive\", df[\"X\"] > 0)`\n\n Outputs:\n transformed_table: 344 | Transformed table.\n\n Annotations:\n author: Alexey Volkov ", 345 | "implementation": {"container": {"args": ["--table", {"inputPath": "table"}, 346 | "--transform-code", {"inputValue": "transform_code"}, "--transformed-table", 347 | {"outputPath": "transformed_table"}], "command": ["sh", "-c", "(PIP_DISABLE_PIP_VERSION_CHECK=1 348 | python3 -m pip install --quiet --no-warn-script-location ''pandas==1.0.4'' 349 | || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 350 | ''pandas==1.0.4'' --user) && \"$0\" \"$@\"", "python3", "-u", "-c", "def 351 | _make_parent_dirs_and_return_path(file_path: str):\n import os\n os.makedirs(os.path.dirname(file_path), 352 | exist_ok=True)\n return file_path\n\ndef Pandas_Transform_DataFrame_in_CSV_format(\n table_path,\n transformed_table_path,\n transform_code,\n):\n ''''''Transform 353 | DataFrame loaded from a CSV file.\n\n Inputs:\n table: Table to 354 | transform.\n transform_code: Transformation code. Code is written 355 | in Python and can consist of multiple lines.\n The DataFrame 356 | variable is called \"df\".\n Examples:\n - `df[''prod''] 357 | = df[''X''] * df[''Y'']`\n - `df = df[[''X'', ''prod'']]`\n - 358 | `df.insert(0, \"is_positive\", df[\"X\"] > 0)`\n\n Outputs:\n transformed_table: 359 | Transformed table.\n\n Annotations:\n author: Alexey Volkov \n ''''''\n import 360 | pandas\n\n df = pandas.read_csv(\n table_path,\n )\n # The 361 | namespace is needed so that the code can replace `df`. 
For example df = 362 | df[[''X'']]\n namespace = locals()\n exec(transform_code, namespace)\n namespace[''df''].to_csv(\n transformed_table_path,\n index=False,\n )\n\nimport 363 | argparse\n_parser = argparse.ArgumentParser(prog=''Pandas Transform DataFrame 364 | in CSV format'', description=''Transform DataFrame loaded from a CSV file.\\n\\n Inputs:\\n table: 365 | Table to transform.\\n transform_code: Transformation code. Code 366 | is written in Python and can consist of multiple lines.\\n The 367 | DataFrame variable is called \"df\".\\n Examples:\\n - 368 | `df[\\''prod\\''] = df[\\''X\\''] * df[\\''Y\\'']`\\n - `df = 369 | df[[\\''X\\'', \\''prod\\'']]`\\n - `df.insert(0, \"is_positive\", 370 | df[\"X\"] > 0)`\\n\\n Outputs:\\n transformed_table: Transformed 371 | table.\\n\\n Annotations:\\n author: Alexey Volkov '')\n_parser.add_argument(\"--table\", 372 | dest=\"table_path\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--transform-code\", 373 | dest=\"transform_code\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--transformed-table\", 374 | dest=\"transformed_table_path\", type=_make_parent_dirs_and_return_path, 375 | required=True, default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\n_outputs 376 | = Pandas_Transform_DataFrame_in_CSV_format(**_parsed_args)\n"], "image": 377 | "python:3.7"}}, "inputs": [{"name": "table", "type": "CSV"}, {"name": "transform_code", 378 | "type": "PythonCode"}], "name": "Pandas Transform DataFrame in CSV format", 379 | "outputs": [{"name": "transformed_table", "type": "CSV"}]}', pipelines.kubeflow.org/component_ref: '{"digest": 380 | "58dc88349157bf128021708c316ce4eb60bc1de0a5a7dd3af45fabac3276d510", "url": 381 | "https://raw.githubusercontent.com/kubeflow/pipelines/6162d55998b176b50267d351241100bb0ee715bc/components/pandas/Transform_DataFrame/in_CSV_format/component.yaml"}', 382 | pipelines.kubeflow.org/arguments.parameters: '{"transform_code": "df = df[[\"tips\"]]"}'} 383 | - name: remove-header 384 | container: 385 | args: [] 386 | command: 387 | - sh 388 | - -exc 389 | - | 390 | mkdir -p "$(dirname "$1")" 391 | tail -n +2 <"$0" >"$1" 392 | - /tmp/inputs/table/data 393 | - /tmp/outputs/table/data 394 | image: alpine 395 | inputs: 396 | artifacts: 397 | - {name: pandas-transform-dataframe-in-csv-format-transformed_table, path: /tmp/inputs/table/data} 398 | outputs: 399 | artifacts: 400 | - {name: remove-header-table, path: /tmp/outputs/table/data} 401 | metadata: 402 | annotations: {author: Alexey Volkov , pipelines.kubeflow.org/component_spec: '{"description": 403 | "Remove the header line from CSV and TSV data (unconditionally)", "implementation": 404 | {"container": {"command": ["sh", "-exc", "mkdir -p \"$(dirname \"$1\")\"\ntail 405 | -n +2 <\"$0\" >\"$1\"\n", {"inputPath": "table"}, {"outputPath": "table"}], 406 | "image": "alpine"}}, "inputs": [{"name": "table"}], "metadata": {"annotations": 407 | {"author": "Alexey Volkov "}}, "name": "Remove 408 | header", "outputs": [{"name": "table"}]}', pipelines.kubeflow.org/component_ref: '{"digest": 409 | "ba35ffea863855b956c3c50aefa0420ba3823949a6c059e6e3971cde960dc5a3", "url": 410 | "https://raw.githubusercontent.com/kubeflow/pipelines/02c9638287468c849632cf9f7885b51de4c66f86/components/tables/Remove_header/component.yaml"}'} 411 | - name: train-until-good-pipeline 412 | dag: 413 | tasks: 414 | - {name: chicago-taxi-trips-dataset, template: chicago-taxi-trips-dataset} 415 | - name: graph-train-until-low-error-1 416 
| template: graph-train-until-low-error-1 417 | dependencies: [chicago-taxi-trips-dataset, remove-header, xgboost-train] 418 | arguments: 419 | artifacts: 420 | - {name: chicago-taxi-trips-dataset-Table, from: '{{tasks.chicago-taxi-trips-dataset.outputs.artifacts.chicago-taxi-trips-dataset-Table}}'} 421 | - {name: remove-header-table, from: '{{tasks.remove-header.outputs.artifacts.remove-header-table}}'} 422 | - {name: xgboost-train-model, from: '{{tasks.xgboost-train.outputs.artifacts.xgboost-train-model}}'} 423 | - name: pandas-transform-dataframe-in-csv-format 424 | template: pandas-transform-dataframe-in-csv-format 425 | dependencies: [chicago-taxi-trips-dataset] 426 | arguments: 427 | artifacts: 428 | - {name: chicago-taxi-trips-dataset-Table, from: '{{tasks.chicago-taxi-trips-dataset.outputs.artifacts.chicago-taxi-trips-dataset-Table}}'} 429 | - name: remove-header 430 | template: remove-header 431 | dependencies: [pandas-transform-dataframe-in-csv-format] 432 | arguments: 433 | artifacts: 434 | - {name: pandas-transform-dataframe-in-csv-format-transformed_table, from: '{{tasks.pandas-transform-dataframe-in-csv-format.outputs.artifacts.pandas-transform-dataframe-in-csv-format-transformed_table}}'} 435 | - name: xgboost-train 436 | template: xgboost-train 437 | dependencies: [chicago-taxi-trips-dataset] 438 | arguments: 439 | artifacts: 440 | - {name: chicago-taxi-trips-dataset-Table, from: '{{tasks.chicago-taxi-trips-dataset.outputs.artifacts.chicago-taxi-trips-dataset-Table}}'} 441 | - name: xgboost-predict 442 | container: 443 | args: [--data, /tmp/inputs/data/data, --model, /tmp/inputs/model/data, --label-column, 444 | '0', --predictions, /tmp/outputs/predictions/data] 445 | command: 446 | - sh 447 | - -c 448 | - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 449 | 'xgboost==1.1.1' 'pandas==1.0.5' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 450 | -m pip install --quiet --no-warn-script-location 'xgboost==1.1.1' 'pandas==1.0.5' 451 | --user) && "$0" "$@" 452 | - python3 453 | - -u 454 | - -c 455 | - | 456 | def _make_parent_dirs_and_return_path(file_path: str): 457 | import os 458 | os.makedirs(os.path.dirname(file_path), exist_ok=True) 459 | return file_path 460 | 461 | def xgboost_predict( 462 | data_path, # Also supports LibSVM 463 | model_path, 464 | predictions_path, 465 | label_column = None, 466 | ): 467 | '''Make predictions using a trained XGBoost model. 468 | 469 | Args: 470 | data_path: Path for the feature data in CSV format. 471 | model_path: Path for the trained model in binary XGBoost format. 472 | predictions_path: Output path for the predictions. 473 | label_column: Column containing the label data. 
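            If set, the column is dropped from the feature data before prediction (e.g. label_column=0 drops the first CSV column).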
474 | 475 | Annotations: 476 | author: Alexey Volkov 477 | ''' 478 | from pathlib import Path 479 | 480 | import numpy 481 | import pandas 482 | import xgboost 483 | 484 | df = pandas.read_csv( 485 | data_path, 486 | ) 487 | 488 | if label_column is not None: 489 | df = df.drop(columns=[df.columns[label_column]]) 490 | 491 | testing_data = xgboost.DMatrix( 492 | data=df, 493 | ) 494 | 495 | model = xgboost.Booster(model_file=model_path) 496 | 497 | predictions = model.predict(testing_data) 498 | 499 | Path(predictions_path).parent.mkdir(parents=True, exist_ok=True) 500 | numpy.savetxt(predictions_path, predictions) 501 | 502 | import argparse 503 | _parser = argparse.ArgumentParser(prog='Xgboost predict', description='Make predictions using a trained XGBoost model.\n\n Args:\n data_path: Path for the feature data in CSV format.\n model_path: Path for the trained model in binary XGBoost format.\n predictions_path: Output path for the predictions.\n label_column: Column containing the label data.\n\n Annotations:\n author: Alexey Volkov ') 504 | _parser.add_argument("--data", dest="data_path", type=str, required=True, default=argparse.SUPPRESS) 505 | _parser.add_argument("--model", dest="model_path", type=str, required=True, default=argparse.SUPPRESS) 506 | _parser.add_argument("--label-column", dest="label_column", type=int, required=False, default=argparse.SUPPRESS) 507 | _parser.add_argument("--predictions", dest="predictions_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) 508 | _parsed_args = vars(_parser.parse_args()) 509 | 510 | _outputs = xgboost_predict(**_parsed_args) 511 | image: python:3.7 512 | inputs: 513 | artifacts: 514 | - {name: chicago-taxi-trips-dataset-Table, path: /tmp/inputs/data/data} 515 | - {name: xgboost-train-2-model, path: /tmp/inputs/model/data} 516 | outputs: 517 | artifacts: 518 | - {name: xgboost-predict-predictions, path: /tmp/outputs/predictions/data} 519 | metadata: 520 | annotations: {pipelines.kubeflow.org/component_spec: '{"description": "Make 521 | predictions using a trained XGBoost model.\n\n Args:\n data_path: 522 | Path for the feature data in CSV format.\n model_path: Path for the 523 | trained model in binary XGBoost format.\n predictions_path: Output 524 | path for the predictions.\n label_column: Column containing the label 525 | data.\n\n Annotations:\n author: Alexey Volkov ", 526 | "implementation": {"container": {"args": ["--data", {"inputPath": "data"}, 527 | "--model", {"inputPath": "model"}, {"if": {"cond": {"isPresent": "label_column"}, 528 | "then": ["--label-column", {"inputValue": "label_column"}]}}, "--predictions", 529 | {"outputPath": "predictions"}], "command": ["sh", "-c", "(PIP_DISABLE_PIP_VERSION_CHECK=1 530 | python3 -m pip install --quiet --no-warn-script-location ''xgboost==1.1.1'' 531 | ''pandas==1.0.5'' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install 532 | --quiet --no-warn-script-location ''xgboost==1.1.1'' ''pandas==1.0.5'' --user) 533 | && \"$0\" \"$@\"", "python3", "-u", "-c", "def _make_parent_dirs_and_return_path(file_path: 534 | str):\n import os\n os.makedirs(os.path.dirname(file_path), exist_ok=True)\n return 535 | file_path\n\ndef xgboost_predict(\n data_path, # Also supports LibSVM\n model_path,\n predictions_path,\n label_column 536 | = None,\n):\n ''''''Make predictions using a trained XGBoost model.\n\n Args:\n data_path: 537 | Path for the feature data in CSV format.\n model_path: Path for the 538 | trained model in binary XGBoost format.\n predictions_path: 
Output 539 | path for the predictions.\n label_column: Column containing the label 540 | data.\n\n Annotations:\n author: Alexey Volkov \n ''''''\n from 541 | pathlib import Path\n\n import numpy\n import pandas\n import xgboost\n\n df 542 | = pandas.read_csv(\n data_path,\n )\n\n if label_column is 543 | not None:\n df = df.drop(columns=[df.columns[label_column]])\n\n testing_data 544 | = xgboost.DMatrix(\n data=df,\n )\n\n model = xgboost.Booster(model_file=model_path)\n\n predictions 545 | = model.predict(testing_data)\n\n Path(predictions_path).parent.mkdir(parents=True, 546 | exist_ok=True)\n numpy.savetxt(predictions_path, predictions)\n\nimport 547 | argparse\n_parser = argparse.ArgumentParser(prog=''Xgboost predict'', description=''Make 548 | predictions using a trained XGBoost model.\\n\\n Args:\\n data_path: 549 | Path for the feature data in CSV format.\\n model_path: Path for 550 | the trained model in binary XGBoost format.\\n predictions_path: 551 | Output path for the predictions.\\n label_column: Column containing 552 | the label data.\\n\\n Annotations:\\n author: Alexey Volkov '')\n_parser.add_argument(\"--data\", 553 | dest=\"data_path\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--model\", 554 | dest=\"model_path\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--label-column\", 555 | dest=\"label_column\", type=int, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--predictions\", 556 | dest=\"predictions_path\", type=_make_parent_dirs_and_return_path, required=True, 557 | default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\n_outputs 558 | = xgboost_predict(**_parsed_args)\n"], "image": "python:3.7"}}, "inputs": 559 | [{"name": "data", "type": "CSV"}, {"name": "model", "type": "XGBoostModel"}, 560 | {"name": "label_column", "optional": true, "type": "Integer"}], "name": 561 | "Xgboost predict", "outputs": [{"name": "predictions", "type": "Text"}]}', 562 | pipelines.kubeflow.org/component_ref: '{"digest": "ecdfaf32cff15b6abc3d0dd80365ce00577f1a19a058fbe201f515431cea1357", 563 | "url": "https://raw.githubusercontent.com/kubeflow/pipelines/567c04c51ff00a1ee525b3458425b17adbe3df61/components/XGBoost/Predict/component.yaml"}', 564 | pipelines.kubeflow.org/arguments.parameters: '{"label_column": "0"}'} 565 | - name: xgboost-train 566 | container: 567 | args: [--training-data, /tmp/inputs/training_data/data, --label-column, '0', 568 | --num-iterations, '100', --objective, 'reg:squarederror', --model, /tmp/outputs/model/data, 569 | --model-config, /tmp/outputs/model_config/data] 570 | command: 571 | - sh 572 | - -c 573 | - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 574 | 'xgboost==1.1.1' 'pandas==1.0.5' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 575 | -m pip install --quiet --no-warn-script-location 'xgboost==1.1.1' 'pandas==1.0.5' 576 | --user) && "$0" "$@" 577 | - python3 578 | - -u 579 | - -c 580 | - | 581 | def _make_parent_dirs_and_return_path(file_path: str): 582 | import os 583 | os.makedirs(os.path.dirname(file_path), exist_ok=True) 584 | return file_path 585 | 586 | def xgboost_train( 587 | training_data_path, # Also supports LibSVM 588 | model_path, 589 | model_config_path, 590 | starting_model_path = None, 591 | 592 | label_column = 0, 593 | num_iterations = 10, 594 | booster_params = None, 595 | 596 | # Booster parameters 597 | objective = 'reg:squarederror', 598 | booster = 'gbtree', 599 | learning_rate = 0.3, 600 | 
min_split_loss = 0, 601 | max_depth = 6, 602 | ): 603 | '''Train an XGBoost model. 604 | 605 | Args: 606 | training_data_path: Path for the training data in CSV format. 607 | model_path: Output path for the trained model in binary XGBoost format. 608 | model_config_path: Output path for the internal parameter configuration of Booster as a JSON string. 609 | starting_model_path: Path for the existing trained model to start from. 610 | label_column: Column containing the label data. 611 | num_iterations: Number of boosting iterations. 612 | booster_params: Parameters for the booster. See https://xgboost.readthedocs.io/en/latest/parameter.html 613 | objective: The learning task and the corresponding learning objective. 614 | See https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters 615 | The most common values are: 616 | "reg:squarederror" - Regression with squared loss (default). 617 | "reg:logistic" - Logistic regression. 618 | "binary:logistic" - Logistic regression for binary classification, output probability. 619 | "binary:logitraw" - Logistic regression for binary classification, output score before logistic transformation 620 | "rank:pairwise" - Use LambdaMART to perform pairwise ranking where the pairwise loss is minimized 621 | "rank:ndcg" - Use LambdaMART to perform list-wise ranking where Normalized Discounted Cumulative Gain (NDCG) is maximized 622 | 623 | Annotations: 624 | author: Alexey Volkov 625 | ''' 626 | import pandas 627 | import xgboost 628 | 629 | df = pandas.read_csv( 630 | training_data_path, 631 | ) 632 | 633 | training_data = xgboost.DMatrix( 634 | data=df.drop(columns=[df.columns[label_column]]), 635 | label=df[df.columns[label_column]], 636 | ) 637 | 638 | booster_params = booster_params or {} 639 | booster_params.setdefault('objective', objective) 640 | booster_params.setdefault('booster', booster) 641 | booster_params.setdefault('learning_rate', learning_rate) 642 | booster_params.setdefault('min_split_loss', min_split_loss) 643 | booster_params.setdefault('max_depth', max_depth) 644 | 645 | starting_model = None 646 | if starting_model_path: 647 | starting_model = xgboost.Booster(model_file=starting_model_path) 648 | 649 | model = xgboost.train( 650 | params=booster_params, 651 | dtrain=training_data, 652 | num_boost_round=num_iterations, 653 | xgb_model=starting_model 654 | ) 655 | 656 | # Saving the model in binary format 657 | model.save_model(model_path) 658 | 659 | model_config_str = model.save_config() 660 | with open(model_config_path, 'w') as model_config_file: 661 | model_config_file.write(model_config_str) 662 | 663 | import json 664 | import argparse 665 | _parser = argparse.ArgumentParser(prog='Xgboost train', description='Train an XGBoost model.\n\n Args:\n training_data_path: Path for the training data in CSV format.\n model_path: Output path for the trained model in binary XGBoost format.\n model_config_path: Output path for the internal parameter configuration of Booster as a JSON string.\n starting_model_path: Path for the existing trained model to start from.\n label_column: Column containing the label data.\n num_iterations: Number of boosting iterations.\n booster_params: Parameters for the booster. 
See https://xgboost.readthedocs.io/en/latest/parameter.html\n objective: The learning task and the corresponding learning objective.\n See https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters\n The most common values are:\n "reg:squarederror" - Regression with squared loss (default).\n "reg:logistic" - Logistic regression.\n "binary:logistic" - Logistic regression for binary classification, output probability.\n "binary:logitraw" - Logistic regression for binary classification, output score before logistic transformation\n "rank:pairwise" - Use LambdaMART to perform pairwise ranking where the pairwise loss is minimized\n "rank:ndcg" - Use LambdaMART to perform list-wise ranking where Normalized Discounted Cumulative Gain (NDCG) is maximized\n\n Annotations:\n author: Alexey Volkov ') 666 | _parser.add_argument("--training-data", dest="training_data_path", type=str, required=True, default=argparse.SUPPRESS) 667 | _parser.add_argument("--starting-model", dest="starting_model_path", type=str, required=False, default=argparse.SUPPRESS) 668 | _parser.add_argument("--label-column", dest="label_column", type=int, required=False, default=argparse.SUPPRESS) 669 | _parser.add_argument("--num-iterations", dest="num_iterations", type=int, required=False, default=argparse.SUPPRESS) 670 | _parser.add_argument("--booster-params", dest="booster_params", type=json.loads, required=False, default=argparse.SUPPRESS) 671 | _parser.add_argument("--objective", dest="objective", type=str, required=False, default=argparse.SUPPRESS) 672 | _parser.add_argument("--booster", dest="booster", type=str, required=False, default=argparse.SUPPRESS) 673 | _parser.add_argument("--learning-rate", dest="learning_rate", type=float, required=False, default=argparse.SUPPRESS) 674 | _parser.add_argument("--min-split-loss", dest="min_split_loss", type=float, required=False, default=argparse.SUPPRESS) 675 | _parser.add_argument("--max-depth", dest="max_depth", type=int, required=False, default=argparse.SUPPRESS) 676 | _parser.add_argument("--model", dest="model_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) 677 | _parser.add_argument("--model-config", dest="model_config_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) 678 | _parsed_args = vars(_parser.parse_args()) 679 | 680 | _outputs = xgboost_train(**_parsed_args) 681 | image: python:3.7 682 | inputs: 683 | artifacts: 684 | - {name: chicago-taxi-trips-dataset-Table, path: /tmp/inputs/training_data/data} 685 | outputs: 686 | artifacts: 687 | - {name: xgboost-train-model, path: /tmp/outputs/model/data} 688 | - {name: xgboost-train-model_config, path: /tmp/outputs/model_config/data} 689 | metadata: 690 | annotations: {pipelines.kubeflow.org/component_spec: '{"description": "Train 691 | an XGBoost model.\n\n Args:\n training_data_path: Path for the 692 | training data in CSV format.\n model_path: Output path for the trained 693 | model in binary XGBoost format.\n model_config_path: Output path 694 | for the internal parameter configuration of Booster as a JSON string.\n starting_model_path: 695 | Path for the existing trained model to start from.\n label_column: 696 | Column containing the label data.\n num_boost_rounds: Number of boosting 697 | iterations.\n booster_params: Parameters for the booster. 
See https://xgboost.readthedocs.io/en/latest/parameter.html\n objective: 698 | The learning task and the corresponding learning objective.\n See 699 | https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters\n The 700 | most common values are:\n \"reg:squarederror\" - Regression with 701 | squared loss (default).\n \"reg:logistic\" - Logistic regression.\n \"binary:logistic\" 702 | - Logistic regression for binary classification, output probability.\n \"binary:logitraw\" 703 | - Logistic regression for binary classification, output score before logistic 704 | transformation\n \"rank:pairwise\" - Use LambdaMART to perform 705 | pairwise ranking where the pairwise loss is minimized\n \"rank:ndcg\" 706 | - Use LambdaMART to perform list-wise ranking where Normalized Discounted 707 | Cumulative Gain (NDCG) is maximized\n\n Annotations:\n author: 708 | Alexey Volkov ", "implementation": {"container": 709 | {"args": ["--training-data", {"inputPath": "training_data"}, {"if": {"cond": 710 | {"isPresent": "starting_model"}, "then": ["--starting-model", {"inputPath": 711 | "starting_model"}]}}, {"if": {"cond": {"isPresent": "label_column"}, "then": 712 | ["--label-column", {"inputValue": "label_column"}]}}, {"if": {"cond": {"isPresent": 713 | "num_iterations"}, "then": ["--num-iterations", {"inputValue": "num_iterations"}]}}, 714 | {"if": {"cond": {"isPresent": "booster_params"}, "then": ["--booster-params", 715 | {"inputValue": "booster_params"}]}}, {"if": {"cond": {"isPresent": "objective"}, 716 | "then": ["--objective", {"inputValue": "objective"}]}}, {"if": {"cond": 717 | {"isPresent": "booster"}, "then": ["--booster", {"inputValue": "booster"}]}}, 718 | {"if": {"cond": {"isPresent": "learning_rate"}, "then": ["--learning-rate", 719 | {"inputValue": "learning_rate"}]}}, {"if": {"cond": {"isPresent": "min_split_loss"}, 720 | "then": ["--min-split-loss", {"inputValue": "min_split_loss"}]}}, {"if": 721 | {"cond": {"isPresent": "max_depth"}, "then": ["--max-depth", {"inputValue": 722 | "max_depth"}]}}, "--model", {"outputPath": "model"}, "--model-config", {"outputPath": 723 | "model_config"}], "command": ["sh", "-c", "(PIP_DISABLE_PIP_VERSION_CHECK=1 724 | python3 -m pip install --quiet --no-warn-script-location ''xgboost==1.1.1'' 725 | ''pandas==1.0.5'' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install 726 | --quiet --no-warn-script-location ''xgboost==1.1.1'' ''pandas==1.0.5'' --user) 727 | && \"$0\" \"$@\"", "python3", "-u", "-c", "def _make_parent_dirs_and_return_path(file_path: 728 | str):\n import os\n os.makedirs(os.path.dirname(file_path), exist_ok=True)\n return 729 | file_path\n\ndef xgboost_train(\n training_data_path, # Also supports 730 | LibSVM\n model_path,\n model_config_path,\n starting_model_path 731 | = None,\n\n label_column = 0,\n num_iterations = 10,\n booster_params 732 | = None,\n\n # Booster parameters\n objective = ''reg:squarederror'',\n booster 733 | = ''gbtree'',\n learning_rate = 0.3,\n min_split_loss = 0,\n max_depth 734 | = 6,\n):\n ''''''Train an XGBoost model.\n\n Args:\n training_data_path: 735 | Path for the training data in CSV format.\n model_path: Output path 736 | for the trained model in binary XGBoost format.\n model_config_path: 737 | Output path for the internal parameter configuration of Booster as a JSON 738 | string.\n starting_model_path: Path for the existing trained model 739 | to start from.\n label_column: Column containing the label data.\n num_boost_rounds: 740 | Number of boosting iterations.\n booster_params: 
Parameters for the 741 | booster. See https://xgboost.readthedocs.io/en/latest/parameter.html\n objective: 742 | The learning task and the corresponding learning objective.\n See 743 | https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters\n The 744 | most common values are:\n \"reg:squarederror\" - Regression with 745 | squared loss (default).\n \"reg:logistic\" - Logistic regression.\n \"binary:logistic\" 746 | - Logistic regression for binary classification, output probability.\n \"binary:logitraw\" 747 | - Logistic regression for binary classification, output score before logistic 748 | transformation\n \"rank:pairwise\" - Use LambdaMART to perform 749 | pairwise ranking where the pairwise loss is minimized\n \"rank:ndcg\" 750 | - Use LambdaMART to perform list-wise ranking where Normalized Discounted 751 | Cumulative Gain (NDCG) is maximized\n\n Annotations:\n author: 752 | Alexey Volkov \n ''''''\n import pandas\n import 753 | xgboost\n\n df = pandas.read_csv(\n training_data_path,\n )\n\n training_data 754 | = xgboost.DMatrix(\n data=df.drop(columns=[df.columns[label_column]]),\n label=df[df.columns[label_column]],\n )\n\n booster_params 755 | = booster_params or {}\n booster_params.setdefault(''objective'', objective)\n booster_params.setdefault(''booster'', 756 | booster)\n booster_params.setdefault(''learning_rate'', learning_rate)\n booster_params.setdefault(''min_split_loss'', 757 | min_split_loss)\n booster_params.setdefault(''max_depth'', max_depth)\n\n starting_model 758 | = None\n if starting_model_path:\n starting_model = xgboost.Booster(model_file=starting_model_path)\n\n model 759 | = xgboost.train(\n params=booster_params,\n dtrain=training_data,\n num_boost_round=num_iterations,\n xgb_model=starting_model\n )\n\n # 760 | Saving the model in binary format\n model.save_model(model_path)\n\n model_config_str 761 | = model.save_config()\n with open(model_config_path, ''w'') as model_config_file:\n model_config_file.write(model_config_str)\n\nimport 762 | json\nimport argparse\n_parser = argparse.ArgumentParser(prog=''Xgboost 763 | train'', description=''Train an XGBoost model.\\n\\n Args:\\n training_data_path: 764 | Path for the training data in CSV format.\\n model_path: Output path 765 | for the trained model in binary XGBoost format.\\n model_config_path: 766 | Output path for the internal parameter configuration of Booster as a JSON 767 | string.\\n starting_model_path: Path for the existing trained model 768 | to start from.\\n label_column: Column containing the label data.\\n num_boost_rounds: 769 | Number of boosting iterations.\\n booster_params: Parameters for 770 | the booster. 
See https://xgboost.readthedocs.io/en/latest/parameter.html\\n objective: 771 | The learning task and the corresponding learning objective.\\n See 772 | https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters\\n The 773 | most common values are:\\n \"reg:squarederror\" - Regression 774 | with squared loss (default).\\n \"reg:logistic\" - Logistic regression.\\n \"binary:logistic\" 775 | - Logistic regression for binary classification, output probability.\\n \"binary:logitraw\" 776 | - Logistic regression for binary classification, output score before logistic 777 | transformation\\n \"rank:pairwise\" - Use LambdaMART to perform 778 | pairwise ranking where the pairwise loss is minimized\\n \"rank:ndcg\" 779 | - Use LambdaMART to perform list-wise ranking where Normalized Discounted 780 | Cumulative Gain (NDCG) is maximized\\n\\n Annotations:\\n author: 781 | Alexey Volkov '')\n_parser.add_argument(\"--training-data\", 782 | dest=\"training_data_path\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--starting-model\", 783 | dest=\"starting_model_path\", type=str, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--label-column\", 784 | dest=\"label_column\", type=int, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--num-iterations\", 785 | dest=\"num_iterations\", type=int, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--booster-params\", 786 | dest=\"booster_params\", type=json.loads, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--objective\", 787 | dest=\"objective\", type=str, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--booster\", 788 | dest=\"booster\", type=str, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--learning-rate\", 789 | dest=\"learning_rate\", type=float, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--min-split-loss\", 790 | dest=\"min_split_loss\", type=float, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--max-depth\", 791 | dest=\"max_depth\", type=int, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--model\", 792 | dest=\"model_path\", type=_make_parent_dirs_and_return_path, required=True, 793 | default=argparse.SUPPRESS)\n_parser.add_argument(\"--model-config\", dest=\"model_config_path\", 794 | type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\n_parsed_args 795 | = vars(_parser.parse_args())\n\n_outputs = xgboost_train(**_parsed_args)\n"], 796 | "image": "python:3.7"}}, "inputs": [{"name": "training_data", "type": "CSV"}, 797 | {"name": "starting_model", "optional": true, "type": "XGBoostModel"}, {"default": 798 | "0", "name": "label_column", "optional": true, "type": "Integer"}, {"default": 799 | "10", "name": "num_iterations", "optional": true, "type": "Integer"}, {"name": 800 | "booster_params", "optional": true, "type": "JsonObject"}, {"default": "reg:squarederror", 801 | "name": "objective", "optional": true, "type": "String"}, {"default": "gbtree", 802 | "name": "booster", "optional": true, "type": "String"}, {"default": "0.3", 803 | "name": "learning_rate", "optional": true, "type": "Float"}, {"default": 804 | "0", "name": "min_split_loss", "optional": true, "type": "Float"}, {"default": 805 | "6", "name": "max_depth", "optional": true, "type": "Integer"}], "name": 806 | "Xgboost train", "outputs": [{"name": "model", "type": "XGBoostModel"}, 807 | {"name": "model_config", "type": 
"XGBoostModelConfig"}]}', pipelines.kubeflow.org/component_ref: '{"digest": 808 | "09b80053da29f8f51575b42e5d2e8ad4b7bdcc92a02c3744e189b1f597006b38", "url": 809 | "https://raw.githubusercontent.com/kubeflow/pipelines/567c04c51ff00a1ee525b3458425b17adbe3df61/components/XGBoost/Train/component.yaml"}', 810 | pipelines.kubeflow.org/arguments.parameters: '{"label_column": "0", "num_iterations": 811 | "100", "objective": "reg:squarederror"}'} 812 | - name: xgboost-train-2 813 | container: 814 | args: [--training-data, /tmp/inputs/training_data/data, --starting-model, /tmp/inputs/starting_model/data, 815 | --label-column, '0', --num-iterations, '50', --objective, 'reg:squarederror', 816 | --model, /tmp/outputs/model/data, --model-config, /tmp/outputs/model_config/data] 817 | command: 818 | - sh 819 | - -c 820 | - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 821 | 'xgboost==1.1.1' 'pandas==1.0.5' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 822 | -m pip install --quiet --no-warn-script-location 'xgboost==1.1.1' 'pandas==1.0.5' 823 | --user) && "$0" "$@" 824 | - python3 825 | - -u 826 | - -c 827 | - | 828 | def _make_parent_dirs_and_return_path(file_path: str): 829 | import os 830 | os.makedirs(os.path.dirname(file_path), exist_ok=True) 831 | return file_path 832 | 833 | def xgboost_train( 834 | training_data_path, # Also supports LibSVM 835 | model_path, 836 | model_config_path, 837 | starting_model_path = None, 838 | 839 | label_column = 0, 840 | num_iterations = 10, 841 | booster_params = None, 842 | 843 | # Booster parameters 844 | objective = 'reg:squarederror', 845 | booster = 'gbtree', 846 | learning_rate = 0.3, 847 | min_split_loss = 0, 848 | max_depth = 6, 849 | ): 850 | '''Train an XGBoost model. 851 | 852 | Args: 853 | training_data_path: Path for the training data in CSV format. 854 | model_path: Output path for the trained model in binary XGBoost format. 855 | model_config_path: Output path for the internal parameter configuration of Booster as a JSON string. 856 | starting_model_path: Path for the existing trained model to start from. 857 | label_column: Column containing the label data. 858 | num_boost_rounds: Number of boosting iterations. 859 | booster_params: Parameters for the booster. See https://xgboost.readthedocs.io/en/latest/parameter.html 860 | objective: The learning task and the corresponding learning objective. 861 | See https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters 862 | The most common values are: 863 | "reg:squarederror" - Regression with squared loss (default). 864 | "reg:logistic" - Logistic regression. 865 | "binary:logistic" - Logistic regression for binary classification, output probability. 
866 | "binary:logitraw" - Logistic regression for binary classification, output score before logistic transformation 867 | "rank:pairwise" - Use LambdaMART to perform pairwise ranking where the pairwise loss is minimized 868 | "rank:ndcg" - Use LambdaMART to perform list-wise ranking where Normalized Discounted Cumulative Gain (NDCG) is maximized 869 | 870 | Annotations: 871 | author: Alexey Volkov 872 | ''' 873 | import pandas 874 | import xgboost 875 | 876 | df = pandas.read_csv( 877 | training_data_path, 878 | ) 879 | 880 | training_data = xgboost.DMatrix( 881 | data=df.drop(columns=[df.columns[label_column]]), 882 | label=df[df.columns[label_column]], 883 | ) 884 | 885 | booster_params = booster_params or {} 886 | booster_params.setdefault('objective', objective) 887 | booster_params.setdefault('booster', booster) 888 | booster_params.setdefault('learning_rate', learning_rate) 889 | booster_params.setdefault('min_split_loss', min_split_loss) 890 | booster_params.setdefault('max_depth', max_depth) 891 | 892 | starting_model = None 893 | if starting_model_path: 894 | starting_model = xgboost.Booster(model_file=starting_model_path) 895 | 896 | model = xgboost.train( 897 | params=booster_params, 898 | dtrain=training_data, 899 | num_boost_round=num_iterations, 900 | xgb_model=starting_model 901 | ) 902 | 903 | # Saving the model in binary format 904 | model.save_model(model_path) 905 | 906 | model_config_str = model.save_config() 907 | with open(model_config_path, 'w') as model_config_file: 908 | model_config_file.write(model_config_str) 909 | 910 | import json 911 | import argparse 912 | _parser = argparse.ArgumentParser(prog='Xgboost train', description='Train an XGBoost model.\n\n Args:\n training_data_path: Path for the training data in CSV format.\n model_path: Output path for the trained model in binary XGBoost format.\n model_config_path: Output path for the internal parameter configuration of Booster as a JSON string.\n starting_model_path: Path for the existing trained model to start from.\n label_column: Column containing the label data.\n num_iterations: Number of boosting iterations.\n booster_params: Parameters for the booster.
See https://xgboost.readthedocs.io/en/latest/parameter.html\n objective: The learning task and the corresponding learning objective.\n See https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters\n The most common values are:\n "reg:squarederror" - Regression with squared loss (default).\n "reg:logistic" - Logistic regression.\n "binary:logistic" - Logistic regression for binary classification, output probability.\n "binary:logitraw" - Logistic regression for binary classification, output score before logistic transformation\n "rank:pairwise" - Use LambdaMART to perform pairwise ranking where the pairwise loss is minimized\n "rank:ndcg" - Use LambdaMART to perform list-wise ranking where Normalized Discounted Cumulative Gain (NDCG) is maximized\n\n Annotations:\n author: Alexey Volkov ') 913 | _parser.add_argument("--training-data", dest="training_data_path", type=str, required=True, default=argparse.SUPPRESS) 914 | _parser.add_argument("--starting-model", dest="starting_model_path", type=str, required=False, default=argparse.SUPPRESS) 915 | _parser.add_argument("--label-column", dest="label_column", type=int, required=False, default=argparse.SUPPRESS) 916 | _parser.add_argument("--num-iterations", dest="num_iterations", type=int, required=False, default=argparse.SUPPRESS) 917 | _parser.add_argument("--booster-params", dest="booster_params", type=json.loads, required=False, default=argparse.SUPPRESS) 918 | _parser.add_argument("--objective", dest="objective", type=str, required=False, default=argparse.SUPPRESS) 919 | _parser.add_argument("--booster", dest="booster", type=str, required=False, default=argparse.SUPPRESS) 920 | _parser.add_argument("--learning-rate", dest="learning_rate", type=float, required=False, default=argparse.SUPPRESS) 921 | _parser.add_argument("--min-split-loss", dest="min_split_loss", type=float, required=False, default=argparse.SUPPRESS) 922 | _parser.add_argument("--max-depth", dest="max_depth", type=int, required=False, default=argparse.SUPPRESS) 923 | _parser.add_argument("--model", dest="model_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) 924 | _parser.add_argument("--model-config", dest="model_config_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) 925 | _parsed_args = vars(_parser.parse_args()) 926 | 927 | _outputs = xgboost_train(**_parsed_args) 928 | image: python:3.7 929 | inputs: 930 | artifacts: 931 | - {name: xgboost-train-model, path: /tmp/inputs/starting_model/data} 932 | - {name: chicago-taxi-trips-dataset-Table, path: /tmp/inputs/training_data/data} 933 | outputs: 934 | artifacts: 935 | - {name: xgboost-train-2-model, path: /tmp/outputs/model/data} 936 | - {name: xgboost-train-2-model_config, path: /tmp/outputs/model_config/data} 937 | metadata: 938 | annotations: {pipelines.kubeflow.org/component_spec: '{"description": "Train 939 | an XGBoost model.\n\n Args:\n training_data_path: Path for the 940 | training data in CSV format.\n model_path: Output path for the trained 941 | model in binary XGBoost format.\n model_config_path: Output path 942 | for the internal parameter configuration of Booster as a JSON string.\n starting_model_path: 943 | Path for the existing trained model to start from.\n label_column: 944 | Column containing the label data.\n num_boost_rounds: Number of boosting 945 | iterations.\n booster_params: Parameters for the booster. 
See https://xgboost.readthedocs.io/en/latest/parameter.html\n objective: 946 | The learning task and the corresponding learning objective.\n See 947 | https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters\n The 948 | most common values are:\n \"reg:squarederror\" - Regression with 949 | squared loss (default).\n \"reg:logistic\" - Logistic regression.\n \"binary:logistic\" 950 | - Logistic regression for binary classification, output probability.\n \"binary:logitraw\" 951 | - Logistic regression for binary classification, output score before logistic 952 | transformation\n \"rank:pairwise\" - Use LambdaMART to perform 953 | pairwise ranking where the pairwise loss is minimized\n \"rank:ndcg\" 954 | - Use LambdaMART to perform list-wise ranking where Normalized Discounted 955 | Cumulative Gain (NDCG) is maximized\n\n Annotations:\n author: 956 | Alexey Volkov ", "implementation": {"container": 957 | {"args": ["--training-data", {"inputPath": "training_data"}, {"if": {"cond": 958 | {"isPresent": "starting_model"}, "then": ["--starting-model", {"inputPath": 959 | "starting_model"}]}}, {"if": {"cond": {"isPresent": "label_column"}, "then": 960 | ["--label-column", {"inputValue": "label_column"}]}}, {"if": {"cond": {"isPresent": 961 | "num_iterations"}, "then": ["--num-iterations", {"inputValue": "num_iterations"}]}}, 962 | {"if": {"cond": {"isPresent": "booster_params"}, "then": ["--booster-params", 963 | {"inputValue": "booster_params"}]}}, {"if": {"cond": {"isPresent": "objective"}, 964 | "then": ["--objective", {"inputValue": "objective"}]}}, {"if": {"cond": 965 | {"isPresent": "booster"}, "then": ["--booster", {"inputValue": "booster"}]}}, 966 | {"if": {"cond": {"isPresent": "learning_rate"}, "then": ["--learning-rate", 967 | {"inputValue": "learning_rate"}]}}, {"if": {"cond": {"isPresent": "min_split_loss"}, 968 | "then": ["--min-split-loss", {"inputValue": "min_split_loss"}]}}, {"if": 969 | {"cond": {"isPresent": "max_depth"}, "then": ["--max-depth", {"inputValue": 970 | "max_depth"}]}}, "--model", {"outputPath": "model"}, "--model-config", {"outputPath": 971 | "model_config"}], "command": ["sh", "-c", "(PIP_DISABLE_PIP_VERSION_CHECK=1 972 | python3 -m pip install --quiet --no-warn-script-location ''xgboost==1.1.1'' 973 | ''pandas==1.0.5'' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install 974 | --quiet --no-warn-script-location ''xgboost==1.1.1'' ''pandas==1.0.5'' --user) 975 | && \"$0\" \"$@\"", "python3", "-u", "-c", "def _make_parent_dirs_and_return_path(file_path: 976 | str):\n import os\n os.makedirs(os.path.dirname(file_path), exist_ok=True)\n return 977 | file_path\n\ndef xgboost_train(\n training_data_path, # Also supports 978 | LibSVM\n model_path,\n model_config_path,\n starting_model_path 979 | = None,\n\n label_column = 0,\n num_iterations = 10,\n booster_params 980 | = None,\n\n # Booster parameters\n objective = ''reg:squarederror'',\n booster 981 | = ''gbtree'',\n learning_rate = 0.3,\n min_split_loss = 0,\n max_depth 982 | = 6,\n):\n ''''''Train an XGBoost model.\n\n Args:\n training_data_path: 983 | Path for the training data in CSV format.\n model_path: Output path 984 | for the trained model in binary XGBoost format.\n model_config_path: 985 | Output path for the internal parameter configuration of Booster as a JSON 986 | string.\n starting_model_path: Path for the existing trained model 987 | to start from.\n label_column: Column containing the label data.\n num_boost_rounds: 988 | Number of boosting iterations.\n booster_params: 
Parameters for the 989 | booster. See https://xgboost.readthedocs.io/en/latest/parameter.html\n objective: 990 | The learning task and the corresponding learning objective.\n See 991 | https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters\n The 992 | most common values are:\n \"reg:squarederror\" - Regression with 993 | squared loss (default).\n \"reg:logistic\" - Logistic regression.\n \"binary:logistic\" 994 | - Logistic regression for binary classification, output probability.\n \"binary:logitraw\" 995 | - Logistic regression for binary classification, output score before logistic 996 | transformation\n \"rank:pairwise\" - Use LambdaMART to perform 997 | pairwise ranking where the pairwise loss is minimized\n \"rank:ndcg\" 998 | - Use LambdaMART to perform list-wise ranking where Normalized Discounted 999 | Cumulative Gain (NDCG) is maximized\n\n Annotations:\n author: 1000 | Alexey Volkov \n ''''''\n import pandas\n import 1001 | xgboost\n\n df = pandas.read_csv(\n training_data_path,\n )\n\n training_data 1002 | = xgboost.DMatrix(\n data=df.drop(columns=[df.columns[label_column]]),\n label=df[df.columns[label_column]],\n )\n\n booster_params 1003 | = booster_params or {}\n booster_params.setdefault(''objective'', objective)\n booster_params.setdefault(''booster'', 1004 | booster)\n booster_params.setdefault(''learning_rate'', learning_rate)\n booster_params.setdefault(''min_split_loss'', 1005 | min_split_loss)\n booster_params.setdefault(''max_depth'', max_depth)\n\n starting_model 1006 | = None\n if starting_model_path:\n starting_model = xgboost.Booster(model_file=starting_model_path)\n\n model 1007 | = xgboost.train(\n params=booster_params,\n dtrain=training_data,\n num_boost_round=num_iterations,\n xgb_model=starting_model\n )\n\n # 1008 | Saving the model in binary format\n model.save_model(model_path)\n\n model_config_str 1009 | = model.save_config()\n with open(model_config_path, ''w'') as model_config_file:\n model_config_file.write(model_config_str)\n\nimport 1010 | json\nimport argparse\n_parser = argparse.ArgumentParser(prog=''Xgboost 1011 | train'', description=''Train an XGBoost model.\\n\\n Args:\\n training_data_path: 1012 | Path for the training data in CSV format.\\n model_path: Output path 1013 | for the trained model in binary XGBoost format.\\n model_config_path: 1014 | Output path for the internal parameter configuration of Booster as a JSON 1015 | string.\\n starting_model_path: Path for the existing trained model 1016 | to start from.\\n label_column: Column containing the label data.\\n num_boost_rounds: 1017 | Number of boosting iterations.\\n booster_params: Parameters for 1018 | the booster. 
See https://xgboost.readthedocs.io/en/latest/parameter.html\\n objective: 1019 | The learning task and the corresponding learning objective.\\n See 1020 | https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters\\n The 1021 | most common values are:\\n \"reg:squarederror\" - Regression 1022 | with squared loss (default).\\n \"reg:logistic\" - Logistic regression.\\n \"binary:logistic\" 1023 | - Logistic regression for binary classification, output probability.\\n \"binary:logitraw\" 1024 | - Logistic regression for binary classification, output score before logistic 1025 | transformation\\n \"rank:pairwise\" - Use LambdaMART to perform 1026 | pairwise ranking where the pairwise loss is minimized\\n \"rank:ndcg\" 1027 | - Use LambdaMART to perform list-wise ranking where Normalized Discounted 1028 | Cumulative Gain (NDCG) is maximized\\n\\n Annotations:\\n author: 1029 | Alexey Volkov '')\n_parser.add_argument(\"--training-data\", 1030 | dest=\"training_data_path\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--starting-model\", 1031 | dest=\"starting_model_path\", type=str, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--label-column\", 1032 | dest=\"label_column\", type=int, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--num-iterations\", 1033 | dest=\"num_iterations\", type=int, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--booster-params\", 1034 | dest=\"booster_params\", type=json.loads, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--objective\", 1035 | dest=\"objective\", type=str, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--booster\", 1036 | dest=\"booster\", type=str, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--learning-rate\", 1037 | dest=\"learning_rate\", type=float, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--min-split-loss\", 1038 | dest=\"min_split_loss\", type=float, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--max-depth\", 1039 | dest=\"max_depth\", type=int, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--model\", 1040 | dest=\"model_path\", type=_make_parent_dirs_and_return_path, required=True, 1041 | default=argparse.SUPPRESS)\n_parser.add_argument(\"--model-config\", dest=\"model_config_path\", 1042 | type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\n_parsed_args 1043 | = vars(_parser.parse_args())\n\n_outputs = xgboost_train(**_parsed_args)\n"], 1044 | "image": "python:3.7"}}, "inputs": [{"name": "training_data", "type": "CSV"}, 1045 | {"name": "starting_model", "optional": true, "type": "XGBoostModel"}, {"default": 1046 | "0", "name": "label_column", "optional": true, "type": "Integer"}, {"default": 1047 | "10", "name": "num_iterations", "optional": true, "type": "Integer"}, {"name": 1048 | "booster_params", "optional": true, "type": "JsonObject"}, {"default": "reg:squarederror", 1049 | "name": "objective", "optional": true, "type": "String"}, {"default": "gbtree", 1050 | "name": "booster", "optional": true, "type": "String"}, {"default": "0.3", 1051 | "name": "learning_rate", "optional": true, "type": "Float"}, {"default": 1052 | "0", "name": "min_split_loss", "optional": true, "type": "Float"}, {"default": 1053 | "6", "name": "max_depth", "optional": true, "type": "Integer"}], "name": 1054 | "Xgboost train", "outputs": [{"name": "model", "type": "XGBoostModel"}, 1055 | 
{"name": "model_config", "type": "XGBoostModelConfig"}]}', pipelines.kubeflow.org/component_ref: '{"digest": 1056 | "09b80053da29f8f51575b42e5d2e8ad4b7bdcc92a02c3744e189b1f597006b38", "url": 1057 | "https://raw.githubusercontent.com/kubeflow/pipelines/567c04c51ff00a1ee525b3458425b17adbe3df61/components/XGBoost/Train/component.yaml"}', 1058 | pipelines.kubeflow.org/arguments.parameters: '{"label_column": "0", "num_iterations": 1059 | "50", "objective": "reg:squarederror"}'} 1060 | arguments: 1061 | parameters: [] 1062 | serviceAccountName: pipeline-runner 1063 | --------------------------------------------------------------------------------