├── .gitignore ├── Dockerfile.copy ├── Dockerfile.monitoring ├── Dockerfile.train ├── Dockerfile.validation ├── LICENSE ├── README.md ├── build_copy_container.sh ├── build_monitoring_component.sh ├── build_training_container.sh ├── build_validation_component.sh ├── data-extraction └── run_copy_merlin.sh ├── images └── merlin-kubeflow-arch.png ├── inference ├── criteo-inference-client.py ├── load-triton-ensemble.py ├── run_merlin_inference.sh └── triton │ ├── Chart.yaml │ ├── README.md │ ├── dashboard.json │ ├── run_triton.sh │ ├── templates │ ├── _helpers.tpl │ ├── deployment.yaml │ └── service.yaml │ └── values.yaml ├── merlin-pipeline.py ├── monitoring ├── .helmignore ├── Chart.yaml ├── csv_read_gcs_write.py ├── perf-monitor-test.py ├── perf-monitor.py ├── run_monitoring.sh ├── run_monitoring_and_live_data.sh ├── templates │ ├── _helpers.tpl │ └── deployment.yaml └── values.yaml ├── preprocess-train ├── dcn_files │ ├── dcn.json │ └── format_dcn.py ├── preprocess-train.sh ├── preprocessing │ ├── nvt-preprocess-incremental.py │ └── nvt-preprocess.py └── training │ ├── create-nvt-hugectr-ensemble.py │ ├── ensemble-config.json │ └── hugectr-train-criteo-dcn.py ├── run_all.sh ├── validation ├── generate-stats.py ├── run_validation.sh ├── train_stats │ └── stats.txt ├── val_stats │ └── stats.txt └── validate-stats.py └── yamls ├── Autoscaling_custom_metrics ├── 1_custom-metric-server-config.yaml ├── 2_custom-metric-server.yaml ├── 3_custom-metrics-server-rbac.yaml └── 4_triton-hpa.yaml ├── pv.yaml └── pvc.yaml /.gitignore: -------------------------------------------------------------------------------- 1 | *.tar 2 | *.tar.gz 3 | gcloud_key.json 4 | kfp_client_host_key.txt 5 | -------------------------------------------------------------------------------- /Dockerfile.copy: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | 17 | FROM google/cloud-sdk 18 | 19 | ARG gcloud_key 20 | 21 | # Install dependencies 22 | RUN apt-get update && \ 23 | apt-get install -y python3-pip vim curl 24 | 25 | RUN curl https://baltocdn.com/helm/signing.asc | apt-key add - && \ 26 | apt-get install apt-transport-https --yes && \ 27 | echo "deb https://baltocdn.com/helm/stable/debian/ all main" | tee /etc/apt/sources.list.d/helm-stable-debian.list && \ 28 | apt-get update && apt-get install helm 29 | 30 | RUN pip3 install kfp 31 | 32 | COPY $PWD/data-extraction/run_copy_merlin.sh /script/ 33 | COPY $PWD/inference /script 34 | COPY $gcloud_key /script 35 | 36 | WORKDIR /script 37 | -------------------------------------------------------------------------------- /Dockerfile.monitoring: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | 17 | FROM google/cloud-sdk 18 | 19 | ARG gcloud_key 20 | ARG project_id 21 | 22 | # Install dependencies 23 | RUN apt-get update && \ 24 | apt-get install -y python3-pip vim curl 25 | 26 | RUN pip3 install kfp 27 | RUN pip3 install --upgrade google-cloud-pubsub 28 | RUN pip3 install scikit-learn pandas pyarrow 29 | 30 | RUN curl https://baltocdn.com/helm/signing.asc | apt-key add - && \ 31 | apt-get install apt-transport-https --yes && \ 32 | echo "deb https://baltocdn.com/helm/stable/debian/ all main" | tee /etc/apt/sources.list.d/helm-stable-debian.list && \ 33 | apt-get update && apt-get install helm 34 | 35 | COPY $PWD/monitoring /script/ 36 | COPY $gcloud_key /script/ 37 | COPY $PWD/kfp_client_host_key.txt /script/ 38 | 39 | # Set Environment variables 40 | ENV GOOGLE_APPLICATION_CREDENTIALS /script/$gcloud_key 41 | ENV PROJECT $project_id 42 | # ENV PYTHONUNBUFFERED=0 43 | 44 | WORKDIR /script 45 | -------------------------------------------------------------------------------- /Dockerfile.train: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | 17 | FROM nvcr.io/nvidia/merlin/merlin-training:0.5.1 18 | 19 | ARG gcloud_key 20 | 21 | RUN apt-get update && \ 22 | apt-get install -y python3-pip vim curl 23 | 24 | RUN curl https://baltocdn.com/helm/signing.asc | apt-key add - && \ 25 | apt-get install apt-transport-https --yes && \ 26 | echo "deb https://baltocdn.com/helm/stable/debian/ all main" | tee /etc/apt/sources.list.d/helm-stable-debian.list && \ 27 | apt-get update && apt-get install helm 28 | 29 | RUN curl -sSL https://sdk.cloud.google.com | bash 30 | ENV PATH $PATH:/root/google-cloud-sdk/bin 31 | 32 | COPY $PWD/preprocess-train /script 33 | COPY $gcloud_key /script 34 | 35 | WORKDIR /script 36 | -------------------------------------------------------------------------------- /Dockerfile.validation: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | 17 | FROM google/cloud-sdk 18 | 19 | ARG gcloud_key 20 | ARG project_id 21 | 22 | RUN apt-get update && \ 23 | apt-get install -y python3-pip vim curl 24 | 25 | RUN pip3 install tensorflow-data-validation && \ 26 | pip3 install protobuf && \ 27 | pip3 install pandas ipython 28 | 29 | COPY $PWD/validation /script/ 30 | COPY $gcloud_key /script/ 31 | 32 | ENV GOOGLE_APPLICATION_CREDENTIALS /script/$gcloud_key 33 | ENV PROJECT $project_id 34 | ENV PYTHONUNBUFFERED=0 35 | 36 | WORKDIR /script 37 | 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Merlin - MLOps on GKE 2 | 3 | ## Introduction 4 | [NVIDIA Merlin](https://developer.nvidia.com/nvidia-merlin) is an open-source application framework that facilitates the development and deployment of large-scale deep recommender systems on GPUs. 
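The pipeline itself is defined in `merlin-pipeline.py` with the Kubeflow Pipelines (KFP) SDK. As a rough sketch of the mechanics only (the pipeline function, stages, endpoint, and arguments below are illustrative placeholders, not this repository's actual definitions), a KFP v1 pipeline is compiled and submitted like this:

```python
# Minimal sketch, assuming the KFP v1 SDK; all names here are illustrative.
import kfp
from kfp import dsl, compiler

@dsl.pipeline(name="merlin-demo", description="Illustrative placeholder pipeline")
def demo_pipeline():
    # In the real pipeline, each stage (data extraction, preprocessing and
    # training, validation, inference deployment, monitoring) is a container
    # op built from the images pushed by the build_*.sh scripts.
    dsl.ContainerOp(name="hello", image="alpine", command=["echo", "hello"])

# Compile the pipeline function into a package that KFP can run.
compiler.Compiler().compile(demo_pipeline, "pipeline.tar.gz")

# Submit a run to an existing Kubeflow Pipelines endpoint.
client = kfp.Client(host="http://<your-kfp-endpoint>")
client.create_run_from_pipeline_package("pipeline.tar.gz", arguments={})
```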
5 | 6 | The figure below shows the architecture of a recommendation system example using NVIDIA Merlin on a [Kubeflow pipeline](https://www.kubeflow.org/docs/components/pipelines/overview/pipelines-overview/). 7 | 8 | This example demonstrates an end-to-end reference architecture, from data preparation to model deployment, with features such as continuous and fast re-training, autoscaling, and model monitoring. 9 | 10 | ![Merlin-Kubeflow Architecture](images/merlin-kubeflow-arch.png) 11 | 12 | For this example, we use the [Criteo 1TB Click Logs](https://ailab.criteo.com/download-criteo-1tb-click-logs-dataset/) dataset, a large publicly available dataset for recommender systems. It contains feature values and click feedback for millions of display ads, and is divided into 24 files, each corresponding to one day of data. 13 | 14 | ## Running the example 15 | Please follow the User Guide available [here](https://docs.google.com/document/d/1P_BerGSP5CNzGjGbRqgMrPcNaCmQuKUyodFaG0jlu9I/edit?usp=sharing)! 16 | 17 | -------------------------------------------------------------------------------- /build_copy_container.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | 19 | PROJECT_ID=${1:-"dl-tme"} # Google Cloud project ID 20 | GCLOUD_KEY=${2:-"gcloud_key.json"} # Path to Google Cloud key 21 | 22 | image_name=gcr.io/$PROJECT_ID/google-nvidia-cloud-sdk # Specify the image name here 23 | image_tag=0.5.1 24 | 25 | full_image_name=${image_name}:${image_tag} 26 | 27 | docker build --build-arg gcloud_key=$GCLOUD_KEY -f Dockerfile.copy -t $full_image_name . 28 | 29 | printf "\n\nPushing the container on GCR..." 30 | docker push $full_image_name 31 | 32 | printf "\n\n\n\n<<< Unique ID of the container is below. Use this ID in the pipeline component >>>\n\n" 33 | docker inspect --format="{{index .RepoDigests 0}}" "${full_image_name}" 34 | printf "\n\n------------------------------------------------------------------------------------------\n\n" 35 | -------------------------------------------------------------------------------- /build_monitoring_component.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | 19 | PROJECT_ID=${1:-"dl-tme"} # Google Cloud project ID 20 | GCLOUD_KEY=${2:-"gcloud_key.json"} # Path to Google Cloud key 21 | 22 | image_name=gcr.io/$PROJECT_ID/monitoring # Specify the image name here 23 | image_tag=0.5.1 24 | 25 | full_image_name=${image_name}:${image_tag} 26 | 27 | docker build --build-arg gcloud_key=$GCLOUD_KEY --build-arg project_id=$PROJECT_ID -f Dockerfile.monitoring -t $full_image_name . 28 | 29 | printf "\n\nPushing the container on GCR..." 30 | docker push $full_image_name 31 | 32 | printf "\n\n\n\n<<< Unique ID of the container is below. Use this ID in the pipeline component >>>\n\n" 33 | docker inspect --format="{{index .RepoDigests 0}}" "${full_image_name}" 34 | printf "\n\n------------------------------------------------------------------------------------------\n\n" -------------------------------------------------------------------------------- /build_training_container.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | 19 | PROJECT_ID=${1:-"dl-tme"} # Google Cloud project ID 20 | GCLOUD_KEY=${2:-"gcloud_key.json"} # Path to Google Cloud key 21 | 22 | image_name=gcr.io/$PROJECT_ID/merlin/merlin-training # Specify the image name here 23 | image_tag=0.5.1 24 | 25 | full_image_name=${image_name}:${image_tag} 26 | 27 | docker build --build-arg gcloud_key=$GCLOUD_KEY -f Dockerfile.train -t $full_image_name . 28 | 29 | printf "\n\nPushing the container on GCR..." 30 | docker push $full_image_name 31 | 32 | printf "\n\n\n\n<<< Unique ID of the container is below. Use this ID in the pipeline component >>>\n\n" 33 | docker inspect --format="{{index .RepoDigests 0}}" "${full_image_name}" 34 | printf "\n\n------------------------------------------------------------------------------------------\n\n" 35 | 36 | -------------------------------------------------------------------------------- /build_validation_component.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | 19 | PROJECT_ID=${1:-"dl-tme"} # Google Cloud project ID 20 | GCLOUD_KEY=${2:-"gcloud_key.json"} # Path to Google Cloud key 21 | 22 | image_name=gcr.io/$PROJECT_ID/validation # Specify the image name here 23 | image_tag=0.5.1 24 | 25 | full_image_name=${image_name}:${image_tag} 26 | 27 | # docker build --build-arg gcloud_key=$GCLOUD_KEY --build-arg project_id=$PROJECT_ID --no-cache -f Dockerfile.validation -t $full_image_name . 28 | docker build --build-arg gcloud_key=$GCLOUD_KEY --build-arg project_id=$PROJECT_ID -f Dockerfile.validation -t $full_image_name . 29 | 30 | printf "\n\nPushing the container on GCR..." 31 | docker push $full_image_name 32 | 33 | printf "\n\n\n\n<<< Unique ID of the container is below. Use this ID in the pipeline component >>>\n\n" 34 | docker inspect --format="{{index .RepoDigests 0}}" "${full_image_name}" 35 | printf "\n\n------------------------------------------------------------------------------------------\n\n" -------------------------------------------------------------------------------- /data-extraction/run_copy_merlin.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | data_input_path=$1; 19 | data_local=$2; 20 | project_id=$3; 21 | new_data_path=$4; 22 | cluster=$5; 23 | zone=$6; 24 | 25 | gcloud auth activate-service-account --key-file=/script/gcloud_key.json 26 | gcloud container clusters get-credentials $cluster --zone $zone --project $project_id 27 | gcloud config set project $project_id 28 | 29 | triton_status=$(helm status triton 2>&1) 30 | if [[ "$triton_status" == "Error: release: not found" ]]; then 31 | if [ -d "$data_local" ]; then 32 | ### Take action if $data_local exists ### 33 | echo "Running first time..." 34 | echo "Directory $data_local exists. Copying files from GCS" 35 | gsutil list gs:// 36 | if ! [ -d "$data_local/criteo-data" ]; then 37 | echo "Making criteo-data" 38 | mkdir -p $data_local/criteo-data/crit_int_pq 39 | fi 40 | echo "Copying data..."
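# NOTE (assumption): $data_input_path is expected to be a GCS URI
# (e.g. gs://<bucket>/crit_int_pq) holding the converted Criteo
# day_*.parquet files; they are copied below onto the persistent volume at
# $data_local/criteo-data/crit_int_pq so downstream pipeline steps can
# read them locally.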
41 | gsutil cp -r $data_input_path $data_local/criteo-data/crit_int_pq 42 | echo "Copying done" 43 | 44 | for entry in "$data_local/criteo-data/crit_int_pq"/* 45 | do 46 | echo "$entry" 47 | done 48 | 49 | else 50 | ### Control will jump here if $data_local does NOT exist ### 51 | echo "Error: $data_local not found. Cannot continue." 52 | exit 1 53 | fi 54 | echo "copying done" 55 | else 56 | if [ -d "$data_local" ]; then 57 | ### Take action if $data_local exists ### 58 | echo "Recurrent run..." 59 | echo "Directory $data_local exists. Copying files from GCS" 60 | # gsutil list gs:// 61 | if ! [ -d "$data_local/criteo-data/new_data" ]; then 62 | echo "Making new_data" 63 | mkdir -p $data_local/criteo-data/new_data 64 | fi 65 | echo "Copying data..." 66 | gsutil cp -r $new_data_path $data_local/criteo-data/new_data 67 | echo "Copying done" 68 | 69 | for entry in "$data_local/criteo-data/new_data"/* 70 | do 71 | echo "$entry" 72 | done 73 | else 74 | ### Control will jump here if $data_local does NOT exist ### 75 | echo "Error: $data_local not found. Cannot continue." 76 | exit 1 77 | fi 78 | fi 79 | 80 | -------------------------------------------------------------------------------- /images/merlin-kubeflow-arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-Merlin/gcp-ml-ops/5fed2a218b605854fe7147576891e91871698359/images/merlin-kubeflow-arch.png -------------------------------------------------------------------------------- /inference/criteo-inference-client.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | import numpy as np 17 | import os 18 | import argparse 19 | import sys 20 | import warnings 21 | 22 | 23 | import tritonclient.http as httpclient 24 | import tritonclient.grpc as grpcclient 25 | from tritonclient.utils import * 26 | import cudf 27 | 28 | from sklearn import metrics 29 | 30 | 31 | if __name__ == "__main__": 32 | parser = argparse.ArgumentParser() 33 | 34 | parser.add_argument('-u', 35 | '--triton_grpc_url', 36 | type=str, 37 | required=False, 38 | default='localhost:8001', 39 | help='URL to Triton gRPC Endpoint') 40 | 41 | parser.add_argument('-m', 42 | '--model_name', 43 | type=str, 44 | required=False, 45 | default='dcn_ens', 46 | help='Name of the model ensemble to load') 47 | 48 | parser.add_argument('-d', 49 | '--test_data', 50 | type=str, 51 | required=False, 52 | default='/crit_int_pq/day_23.parquet', 53 | help='Path to a test .parquet file') 54 | 55 | parser.add_argument('-b', 56 | '--batch_size', 57 | type=int, 58 | required=False, 59 | default=64, 60 | help='Batch size. Max is 64 at the moment, but a larger maximum can be specified when creating the model and the ensemble.') 61 | 62 | parser.add_argument('-n', 63 | '--n_batches', 64 | type=int, 65 | required=False, 66 | default=1, 67 | help='Number of batches of data to send') 68 | 69 | parser.add_argument('-v', 70 | '--verbose', 71 | type=lambda s: str(s).lower() in ('true', '1', 'yes'), # plain type=bool would treat any non-empty string, including 'False', as True 72 | required=False, 73 | default=False, 74 | help='Verbosity, True or False') 75 | 76 | 77 | args = parser.parse_args() 78 | 79 | # warnings can be disabled 80 | if not sys.warnoptions: 81 | warnings.simplefilter("ignore") 82 | 83 | try: 84 | triton_client = grpcclient.InferenceServerClient(url=args.triton_grpc_url, verbose=args.verbose) 85 | print("Triton client created.") 86 | except Exception as e: 87 | print("channel creation failed: " + str(e)) 88 | sys.exit(1) # exit non-zero so the pipeline can detect the failure 89 | 90 | 91 | if not triton_client.is_model_ready(args.model_name): 92 | print(f"Model {args.model_name} is not ready!") 93 | sys.exit(1) 94 | else: 95 | print(f"Model {args.model_name} is ready!") 96 | 97 | 98 | 99 | 100 | # Load the dataset 101 | CATEGORICAL_COLUMNS = ['C' + str(x) for x in range(1,27)] 102 | CONTINUOUS_COLUMNS = ['I' + str(x) for x in range(1,14)] 103 | LABEL_COLUMNS = ['label'] 104 | col_names = CATEGORICAL_COLUMNS + CONTINUOUS_COLUMNS 105 | col_dtypes = [np.int32]*26 + [np.int64]*13 106 | 107 | 108 | 109 | print("Reading dataset..") 110 | batch_whole = cudf.read_parquet(args.test_data, num_rows=args.batch_size*args.n_batches) 111 | batch_features = batch_whole[col_names] 112 | batch_labels = batch_whole[LABEL_COLUMNS] 113 | 114 | 115 | 116 | results=[] 117 | 118 |
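    # Note (assumption about the deployed NVTabular/HugeCTR ensemble): each
    # feature is sent as its own named input tensor of shape (batch_size, 1),
    # with categorical columns C1..C26 as int32 and continuous columns
    # I1..I13 as int64, matching col_names/col_dtypes above. The loop below
    # builds one InferInput per column and requests the ensemble's "OUTPUT0"
    # predictions.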
119 | with grpcclient.InferenceServerClient(url=args.triton_grpc_url) as client: 120 | for batch in range(args.n_batches): 121 | print(f"Requesting inference for batch {batch}..") 122 | start_idx=batch*args.batch_size 123 | end_idx=(batch+1)*(args.batch_size) 124 | # convert the batch to triton inputs 125 | columns = [(col, batch_features[col][start_idx:end_idx]) for col in col_names] 126 | inputs = [] 127 | 128 | for i, (name, col) in enumerate(columns): 129 | d = col.values_host.astype(col_dtypes[i]) 130 | d = d.reshape(len(d), 1) 131 | inputs.append(grpcclient.InferInput(name, d.shape, np_to_triton_dtype(col_dtypes[i]))) 132 | inputs[i].set_data_from_numpy(d) 133 | 134 | outputs = [] 135 | outputs.append(grpcclient.InferRequestedOutput("OUTPUT0")) 136 | 137 | response = client.infer(args.model_name, inputs, request_id=str(1), outputs=outputs) 138 | 139 | results.extend(response.as_numpy("OUTPUT0")) 140 | 141 | 142 | print(f"ROC AUC Score: {metrics.roc_auc_score(batch_labels[LABEL_COLUMNS].values.tolist(), results)}") 143 | 144 | 145 | 146 | -------------------------------------------------------------------------------- /inference/load-triton-ensemble.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | 17 | 18 | import sys 19 | import argparse 20 | import logging 21 | 22 | import tritonclient.grpc as grpcclient 23 | from tritonclient.utils import InferenceServerException 24 | 25 | 26 | if __name__ == "__main__": 27 | parser = argparse.ArgumentParser() 28 | 29 | parser.add_argument('-u', 30 | '--triton_grpc_url', 31 | type=str, 32 | required=False, 33 | default='localhost:8001', 34 | help='URL to Triton gRPC Endpoint') 35 | 36 | parser.add_argument('-m', 37 | '--model_name', 38 | type=str, 39 | required=False, 40 | default='dcn_ens', 41 | help='Name of the model ensemble to load') 42 | 43 | parser.add_argument('-v', 44 | '--verbose', 45 | type=lambda s: str(s).lower() in ('true', '1', 'yes'), # plain type=bool would treat any non-empty string, including 'False', as True 46 | required=False, 47 | default=True, 48 | help='Verbosity, True or False') 49 | 50 | 51 | args = parser.parse_args() 52 | 53 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO, datefmt='%d-%m-%y %H:%M:%S') 54 | logging.info(f"Args: {args}") 55 | 56 | try: 57 | triton_client = grpcclient.InferenceServerClient(url=args.triton_grpc_url, verbose=args.verbose) 58 | logging.info("Triton client created.") 59 | except Exception as e: 60 | logging.error(f"channel creation failed: {str(e)}") 61 | sys.exit(1) # exit non-zero so the caller can detect the failure 62 | 63 | 64 | # Health 65 | if not triton_client.is_server_live(headers={'test': '1', 'dummy': '2'}): 66 | logging.error("FAILED : is_server_live") 67 | sys.exit(1) 68 | 69 | if not triton_client.is_server_ready(): 70 | logging.error("FAILED : is_server_ready") 71 | sys.exit(1) 72 | 73 | logging.info(f"Models available: {triton_client.get_model_repository_index()}") 74 |
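    # Note: run_triton.sh starts tritonserver with --model-control-mode=poll,
    # so models in the repository are loaded automatically. Depending on the
    # Triton version, the explicit load_model() call below may be rejected
    # while polling is enabled (assumption based on Triton's model-control
    # documentation), which is why the is_model_ready() check afterwards is
    # the deciding success test.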
    75 | # Load the ensemble model 76 | # TODO: Increase the timeout. Sometimes this times out with 8xGPUs because loading 77 | # the model takes longer. 78 | try: 79 | triton_client.load_model(model_name=args.model_name) 80 | except InferenceServerException as e: 81 | if "failed to load" in e.message(): 82 | logging.error(f"Model {args.model_name} failed to load!") 83 | 84 | if not triton_client.is_model_ready(args.model_name): 85 | logging.error(f"Model {args.model_name} is not ready!") 86 | sys.exit(1) 87 | else: 88 | logging.info(f"Model {args.model_name} is ready!") 89 | -------------------------------------------------------------------------------- /inference/run_merlin_inference.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License.
16 | # ============================================================================== 17 | 18 | PV_LOC=${1:-"/var/lib/data"} 19 | PROJECT_ID=${2:-"dl-tme"} 20 | GCLOUD_KEY=${3:-"/script/gcloud_key.json"} 21 | CLUSTER=${4:-"merlin-mlops"} 22 | ZONE=${5:-"us-central1-a"} 23 | 24 | gcloud auth activate-service-account --key-file=$GCLOUD_KEY 25 | gcloud container clusters get-credentials $CLUSTER --zone $ZONE --project $PROJECT_ID 26 | gcloud config set project $PROJECT_ID 27 | 28 | if ! [ -d $PV_LOC/inference ]; then 29 | mkdir $PV_LOC/inference 30 | fi 31 | 32 | triton_status=$(helm status triton 2>&1) 33 | echo "Triton status: " 34 | echo $triton_status 35 | if [[ "$triton_status" == "Error: release: not found" ]]; then 36 | cp /script/load-triton-ensemble.py $PV_LOC/inference/load-triton-ensemble.py 37 | cp /script/triton/run_triton.sh $PV_LOC/inference/run_triton.sh 38 | 39 | # helm install triton /script/triton/ --set image.repository=gcr.io/$PROJECT_ID/merlin/merlin-inference:v0.5 40 | helm install triton /script/triton/ --set image.repository=gcr.io/$PROJECT_ID/merlin/merlin-inference:0.5.1 41 | else 42 | echo "Triton running already, not deploying another instance." 43 | fi 44 | -------------------------------------------------------------------------------- /inference/triton/Chart.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
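# Usage note (illustrative): inference/run_merlin_inference.sh installs this
# chart with the Merlin inference image, roughly:
#   helm install triton /script/triton/ \
#     --set image.repository=gcr.io/<project-id>/merlin/merlin-inference:0.5.1
# The repository value and tag shown here mirror that script and may differ
# in your deployment.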
26 | 27 | apiVersion: v1 28 | appVersion: "2.0" 29 | description: Triton Inference Server 30 | name: triton-inference-server 31 | version: 1.0.0 -------------------------------------------------------------------------------- /inference/triton/README.md: -------------------------------------------------------------------------------- 1 | 28 | [![License](https://img.shields.io/badge/License-BSD3-lightgrey.svg)](https://opensource.org/licenses/BSD-3-Clause) 29 | 30 | # Kubernetes Deploy: Triton Inference Server Cluster 31 | 32 | **NOTE: The prometheus operator used in these instructions is not yet 33 | updated to work with 1.16.x versions of Google Kubernetes Engine 34 | (GKE). You must use a GKE 1.15.x version to avoid this issue.** 35 | 36 | A helm chart for installing a single cluster of Triton Inference 37 | Server is provided. By default the cluster contains a single instance 38 | of the inference server but the *replicaCount* configuration parameter 39 | can be set to create a cluster of any size, as described below. 40 | 41 | This guide assumes you already have a functional Kubernetes cluster 42 | and helm installed (see below for instructions on installing 43 | helm). Note the following requirements: 44 | 45 | * The helm chart deploys Prometheus and Grafana to collect and display 46 | Triton metrics. Your cluster must contain sufficient CPU resources to 47 | support these services. At a minimum you will likely require 2 CPU 48 | nodes with machine type of n1-standard-2 or greater. 49 | 50 | * If you want Triton Server to use GPUs for inferencing, your cluster 51 | must be configured to contain the desired number of GPU nodes with 52 | support for the NVIDIA driver and CUDA version required by the version 53 | of the inference server you are using. 54 | 55 | This helm chart is available from [Triton Inference Server 56 | GitHub](https://github.com/triton-inference-server/server) or from the 57 | [NVIDIA GPU Cloud (NGC)](https://ngc.nvidia.com). 58 | 59 | The steps below describe how to set up a model repository, use helm to 60 | launch the inference server, and then send inference requests to the 61 | running server. You can access a Grafana endpoint to see real-time 62 | metrics reported by the inference server. 63 | 64 | ## Installing Helm 65 | 66 | If you do not already have Helm installed in your Kubernetes cluster, 67 | executing the following steps from the [official helm install 68 | guide](https://helm.sh/docs/intro/install/) will 69 | give you a quick setup. 70 | 71 | ``` 72 | $ curl https://raw.githubusercontent.com/helm/helm/master/scripts/get | bash 73 | $ kubectl create serviceaccount -n kube-system tiller 74 | serviceaccount/tiller created 75 | $ kubectl create clusterrolebinding tiller-cluster-rule --clusterrole=cluster-admin --serviceaccount=kube-system:tiller 76 | $ helm init --service-account tiller --wait 77 | ``` 78 | 79 | ## Model Repository 80 | 81 | If you already have a model repository you may use that with this helm 82 | chart. If you do not have a model repository, you can check out a local 83 | copy of the inference server source repository to create an example 84 | model repository: 85 | 86 | ``` 87 | $ git clone https://github.com/triton-inference-server/server.git 88 | ``` 89 | 90 | Triton Server needs a repository of models that it will make available 91 | for inferencing. For this example you will place the model repository 92 | in a Google Cloud Storage bucket.
94 | 95 | ``` 96 | $ gsutil mb gs://triton-inference-server-repository 97 | ``` 98 | 99 | Following the [QuickStart](../../docs/quickstart.md) download the 100 | example model repository to your system and copy it into the GCS 101 | bucket. 102 | 103 | ``` 104 | $ gsutil cp -r docs/examples/model_repository gs://triton-inference-server-repository/model_repository 105 | ``` 106 | 107 | ### GCS Permissions 108 | 109 | Make sure the bucket permissions are set so that the inference server 110 | can access the model repository. If the bucket is public then no 111 | additional changes are needed and you can proceed to the "Deploy 112 | Prometheus and Grafana" section. 113 | 114 | If bucket permissions need to be set with the 115 | GOOGLE_APPLICATION_CREDENTIALS environment variable then perform the 116 | following steps: 117 | 118 | * Generate Google service account JSON with proper permissions called 119 | *gcp-creds.json*. 120 | 121 | * Create a Kubernetes secret from *gcp-creds.json*: 122 | 123 | ``` 124 | $ kubectl create configmap gcpcreds --from-literal "project-id=myproject" 125 | $ kubectl create secret generic gcpcreds --from-file gcp-creds.json 126 | ``` 127 | 128 | * Modify templates/deployment.yaml to include the 129 | GOOGLE_APPLICATION_CREDENTIALS environment variable: 130 | 131 | ``` 132 | env: 133 | - name: GOOGLE_APPLICATION_CREDENTIALS 134 | value: /secret/gcp-creds.json 135 | ``` 136 | 137 | * Modify templates/deployment.yaml to mount the secret in a volume at 138 | /secret: 139 | 140 | ``` 141 | volumeMounts: 142 | - name: vsecret 143 | mountPath: "/secret" 144 | readOnly: true 145 | ... 146 | volumes: 147 | - name: vsecret 148 | secret: 149 | secretName: gcpcreds 150 | ``` 151 | 152 | ## Deploy Prometheus and Grafana 153 | 154 | The inference server metrics are collected by Prometheus and viewable 155 | by Grafana. The inference server helm chart assumes that Prometheus 156 | and Grafana are available so this step must be followed even if you 157 | don't want to use Grafana. 158 | 159 | Use the prometheus-operator to install these components. The 160 | *serviceMonitorSelectorNilUsesHelmValues* flag is needed so that 161 | Prometheus can find the inference server metrics in the *example* 162 | release deployed below. 163 | 164 | ``` 165 | $ helm install --name example-metrics --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false stable/prometheus-operator 166 | ``` 167 | 168 | Then port-forward to the Grafana service so you can access it from 169 | your local browser. 170 | 171 | ``` 172 | $ kubectl port-forward service/example-metrics-grafana 8080:80 173 | ``` 174 | 175 | Now you should be able to navigate in your browser to localhost:8080 176 | and see the Grafana login page. Use username=admin and 177 | password=prom-operator to log in. 178 | 179 | An example Grafana dashboard is available in dashboard.json. Use the 180 | import function in Grafana to import and view this dashboard. 181 | 182 | ## Deploy the Inference Server 183 | 184 | Deploy the inference server using the default configuration with the 185 | following commands. 186 | 187 | ``` 188 | $ cd <directory containing Chart.yaml> 189 | $ helm install --name example . 190 | ``` 191 | 192 | Use kubectl to see status and wait until the inference server pods are 193 | running.
194 | 195 | ``` 196 | $ kubectl get pods 197 | NAME READY STATUS RESTARTS AGE 198 | example-triton-inference-server-5f74b55885-n6lt7 1/1 Running 0 2m21s 199 | ``` 200 | 201 | There are several ways of overriding the default configuration as 202 | described in this [helm 203 | documentation](https://helm.sh/docs/using_helm/#customizing-the-chart-before-installing). 204 | 205 | You can edit the values.yaml file directly or you can use the *--set* 206 | option to override a single parameter with the CLI. For example, to 207 | deploy a cluster of four inference servers use *--set* to set the 208 | replicaCount parameter. 209 | 210 | ``` 211 | $ helm install --name example --set replicaCount=4 . 212 | ``` 213 | 214 | You can also write your own "config.yaml" file with the values you 215 | want to override and pass it to helm. 216 | 217 | ``` 218 | $ cat << EOF > config.yaml 219 | namespace: MyCustomNamespace 220 | image: 221 | imageName: nvcr.io/nvidia/tritonserver:custom-tag 222 | modelRepositoryPath: gs://my_model_repository 223 | EOF 224 | $ helm install --name example -f config.yaml . 225 | ``` 226 | 227 | ## Using Triton Inference Server 228 | 229 | Now that the inference server is running you can send HTTP or GRPC 230 | requests to it to perform inferencing. By default, the inferencing 231 | service is exposed with a LoadBalancer service type. Use the following 232 | to find the external IP for the inference server. In this case it is 233 | 34.83.9.133. 234 | 235 | ``` 236 | $ kubectl get services 237 | NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE 238 | ... 239 | example-triton-inference-server LoadBalancer 10.18.13.28 34.83.9.133 8000:30249/TCP,8001:30068/TCP,8002:32723/TCP 47m 240 | ``` 241 | 242 | The inference server exposes an HTTP endpoint on port 8000, and GRPC 243 | endpoint on port 8001 and a Prometheus metrics endpoint on 244 | port 8002. You can use curl to get the meta-data of the inference server 245 | from the HTTP endpoint. 246 | 247 | ``` 248 | $ curl 34.83.9.133:8000/v2 249 | ``` 250 | 251 | Follow the [QuickStart](../../docs/quickstart.md) to get the example 252 | image classification client that can be used to perform inferencing 253 | using image classification models being served by the inference 254 | server. For example, 255 | 256 | ``` 257 | $ image_client -u 34.83.9.133:8000 -m inception_graphdef -s INCEPTION -c3 mug.jpg 258 | Request 0, batch size 1 259 | Image 'images/mug.jpg': 260 | 504 (COFFEE MUG) = 0.723992 261 | 968 (CUP) = 0.270953 262 | 967 (ESPRESSO) = 0.00115997 263 | ``` 264 | 265 | ## Cleanup 266 | 267 | Once you've finished using the inference server you should use helm to 268 | delete the deployment. 
269 | 270 | ``` 271 | $ helm list 272 | NAME REVISION UPDATED STATUS CHART APP VERSION NAMESPACE 273 | example 1 Wed Feb 27 22:16:55 2019 DEPLOYED triton-inference-server-1.0.0 1.0 default 274 | example-metrics 1 Tue Jan 21 12:24:07 2020 DEPLOYED prometheus-operator-6.18.0 0.32.0 default 275 | 276 | $ helm delete --purge example 277 | $ helm delete --purge example-metrics 278 | ``` 279 | 280 | For the Prometheus and Grafana services you should [explicitly delete 281 | CRDs](https://github.com/helm/charts/tree/master/stable/prometheus-operator#uninstalling-the-chart): 282 | 283 | ``` 284 | $ kubectl delete crd alertmanagers.monitoring.coreos.com servicemonitors.monitoring.coreos.com podmonitors.monitoring.coreos.com prometheuses.monitoring.coreos.com prometheusrules.monitoring.coreos.com 285 | ``` 286 | 287 | You may also want to delete the GCS bucket you created to hold the 288 | model repository. 289 | 290 | ``` 291 | $ gsutil rm -r gs://triton-inference-server-repository 292 | ``` 293 | -------------------------------------------------------------------------------- /inference/triton/dashboard.json: -------------------------------------------------------------------------------- 1 | { 2 | "__inputs": [ 3 | { 4 | "name": "DS_PROMETHEUS", 5 | "label": "Prometheus", 6 | "description": "", 7 | "type": "datasource", 8 | "pluginId": "prometheus", 9 | "pluginName": "Prometheus" 10 | } 11 | ], 12 | "__requires": [ 13 | { 14 | "type": "grafana", 15 | "id": "grafana", 16 | "name": "Grafana", 17 | "version": "6.3.5" 18 | }, 19 | { 20 | "type": "panel", 21 | "id": "graph", 22 | "name": "Graph", 23 | "version": "" 24 | }, 25 | { 26 | "type": "panel", 27 | "id": "heatmap", 28 | "name": "Heatmap", 29 | "version": "" 30 | }, 31 | { 32 | "type": "datasource", 33 | "id": "prometheus", 34 | "name": "Prometheus", 35 | "version": "1.0.0" 36 | } 37 | ], 38 | "annotations": { 39 | "list": [ 40 | { 41 | "builtIn": 1, 42 | "datasource": "-- Grafana --", 43 | "enable": true, 44 | "hide": true, 45 | "iconColor": "rgba(0, 211, 255, 1)", 46 | "name": "Annotations & Alerts", 47 | "type": "dashboard" 48 | } 49 | ] 50 | }, 51 | "editable": true, 52 | "gnetId": null, 53 | "graphTooltip": 0, 54 | "id": null, 55 | "links": [], 56 | "panels": [ 57 | { 58 | "aliasColors": {}, 59 | "bars": false, 60 | "dashLength": 10, 61 | "dashes": false, 62 | "datasource": "${DS_PROMETHEUS}", 63 | "fill": 1, 64 | "fillGradient": 0, 65 | "gridPos": { 66 | "h": 9, 67 | "w": 12, 68 | "x": 0, 69 | "y": 0 70 | }, 71 | "id": 2, 72 | "legend": { 73 | "avg": false, 74 | "current": false, 75 | "max": false, 76 | "min": false, 77 | "show": true, 78 | "total": false, 79 | "values": false 80 | }, 81 | "lines": true, 82 | "linewidth": 1, 83 | "nullPointMode": "null", 84 | "options": { 85 | "dataLinks": [] 86 | }, 87 | "percentage": false, 88 | "pointradius": 2, 89 | "points": false, 90 | "renderer": "flot", 91 | "seriesOverrides": [], 92 | "spaceLength": 10, 93 | "stack": false, 94 | "steppedLine": false, 95 | "targets": [ 96 | { 97 | "expr": "nv_inference_request_success", 98 | "legendFormat": "Success {{instance}}", 99 | "refId": "A" 100 | }, 101 | { 102 | "expr": "nv_inference_request_failure", 103 | "legendFormat": "Failure {{instance}}", 104 | "refId": "B" 105 | } 106 | ], 107 | "thresholds": [], 108 | "timeFrom": null, 109 | "timeRegions": [], 110 | "timeShift": null, 111 | "title": "Cumulative Inference Requests", 112 | "tooltip": { 113 | "shared": true, 114 | "sort": 0, 115 | "value_type": "individual" 116 | }, 117 | "type": "graph", 118 | 
"xaxis": { 119 | "buckets": null, 120 | "mode": "time", 121 | "name": null, 122 | "show": true, 123 | "values": [] 124 | }, 125 | "yaxes": [ 126 | { 127 | "format": "short", 128 | "label": null, 129 | "logBase": 1, 130 | "max": null, 131 | "min": null, 132 | "show": true 133 | }, 134 | { 135 | "format": "short", 136 | "label": null, 137 | "logBase": 1, 138 | "max": null, 139 | "min": null, 140 | "show": false 141 | } 142 | ], 143 | "yaxis": { 144 | "align": false, 145 | "alignLevel": null 146 | } 147 | }, 148 | { 149 | "cards": { 150 | "cardPadding": null, 151 | "cardRound": null 152 | }, 153 | "color": { 154 | "cardColor": "#b4ff00", 155 | "colorScale": "sqrt", 156 | "colorScheme": "interpolateReds", 157 | "exponent": 0.5, 158 | "mode": "spectrum" 159 | }, 160 | "dataFormat": "timeseries", 161 | "gridPos": { 162 | "h": 9, 163 | "w": 12, 164 | "x": 12, 165 | "y": 0 166 | }, 167 | "heatmap": {}, 168 | "hideZeroBuckets": false, 169 | "highlightCards": true, 170 | "id": 7, 171 | "legend": { 172 | "show": false 173 | }, 174 | "options": {}, 175 | "reverseYBuckets": false, 176 | "targets": [ 177 | { 178 | "expr": "sum(increase(nv_inference_load_ratio_bucket[1m])) by (le)", 179 | "legendFormat": "", 180 | "refId": "A" 181 | } 182 | ], 183 | "timeFrom": null, 184 | "timeShift": null, 185 | "title": "Load Ratio (Total Time / Compute Time)", 186 | "tooltip": { 187 | "show": true, 188 | "showHistogram": false 189 | }, 190 | "type": "heatmap", 191 | "xAxis": { 192 | "show": true 193 | }, 194 | "xBucketNumber": null, 195 | "xBucketSize": null, 196 | "yAxis": { 197 | "decimals": null, 198 | "format": "short", 199 | "logBase": 1, 200 | "max": null, 201 | "min": null, 202 | "show": true, 203 | "splitFactor": null 204 | }, 205 | "yBucketBound": "auto", 206 | "yBucketNumber": null, 207 | "yBucketSize": null 208 | }, 209 | { 210 | "aliasColors": {}, 211 | "bars": false, 212 | "dashLength": 10, 213 | "dashes": false, 214 | "datasource": "${DS_PROMETHEUS}", 215 | "fill": 1, 216 | "fillGradient": 0, 217 | "gridPos": { 218 | "h": 8, 219 | "w": 12, 220 | "x": 0, 221 | "y": 9 222 | }, 223 | "id": 4, 224 | "legend": { 225 | "avg": false, 226 | "current": false, 227 | "max": false, 228 | "min": false, 229 | "show": true, 230 | "total": false, 231 | "values": false 232 | }, 233 | "lines": true, 234 | "linewidth": 1, 235 | "nullPointMode": "null", 236 | "options": { 237 | "dataLinks": [] 238 | }, 239 | "percentage": false, 240 | "pointradius": 2, 241 | "points": false, 242 | "renderer": "flot", 243 | "seriesOverrides": [], 244 | "spaceLength": 10, 245 | "stack": false, 246 | "steppedLine": false, 247 | "targets": [ 248 | { 249 | "expr": "rate(nv_inference_queue_duration_us[30s]) / 1000", 250 | "legendFormat": "{{instance}}", 251 | "refId": "A" 252 | } 253 | ], 254 | "thresholds": [], 255 | "timeFrom": null, 256 | "timeRegions": [], 257 | "timeShift": null, 258 | "title": "Queue Time (milliseconds)", 259 | "tooltip": { 260 | "shared": true, 261 | "sort": 0, 262 | "value_type": "individual" 263 | }, 264 | "type": "graph", 265 | "xaxis": { 266 | "buckets": null, 267 | "mode": "time", 268 | "name": null, 269 | "show": true, 270 | "values": [] 271 | }, 272 | "yaxes": [ 273 | { 274 | "format": "short", 275 | "label": "Queue Time (ms)", 276 | "logBase": 1, 277 | "max": null, 278 | "min": null, 279 | "show": true 280 | }, 281 | { 282 | "format": "short", 283 | "label": null, 284 | "logBase": 1, 285 | "max": null, 286 | "min": null, 287 | "show": false 288 | } 289 | ], 290 | "yaxis": { 291 | "align": false, 292 | 
"alignLevel": null 293 | } 294 | }, 295 | { 296 | "aliasColors": {}, 297 | "bars": false, 298 | "dashLength": 10, 299 | "dashes": false, 300 | "datasource": "${DS_PROMETHEUS}", 301 | "fill": 1, 302 | "fillGradient": 0, 303 | "gridPos": { 304 | "h": 8, 305 | "w": 12, 306 | "x": 12, 307 | "y": 9 308 | }, 309 | "id": 5, 310 | "legend": { 311 | "avg": false, 312 | "current": false, 313 | "max": false, 314 | "min": false, 315 | "show": true, 316 | "total": false, 317 | "values": false 318 | }, 319 | "lines": true, 320 | "linewidth": 1, 321 | "nullPointMode": "null", 322 | "options": { 323 | "dataLinks": [] 324 | }, 325 | "percentage": false, 326 | "pointradius": 2, 327 | "points": false, 328 | "renderer": "flot", 329 | "seriesOverrides": [], 330 | "spaceLength": 10, 331 | "stack": false, 332 | "steppedLine": false, 333 | "targets": [ 334 | { 335 | "expr": "rate(nv_inference_compute_duration_us[30s]) / 1000", 336 | "legendFormat": "{{instance}}", 337 | "refId": "A" 338 | } 339 | ], 340 | "thresholds": [], 341 | "timeFrom": null, 342 | "timeRegions": [], 343 | "timeShift": null, 344 | "title": "Compute Time (milliseconds)", 345 | "tooltip": { 346 | "shared": true, 347 | "sort": 0, 348 | "value_type": "individual" 349 | }, 350 | "type": "graph", 351 | "xaxis": { 352 | "buckets": null, 353 | "mode": "time", 354 | "name": null, 355 | "show": true, 356 | "values": [] 357 | }, 358 | "yaxes": [ 359 | { 360 | "format": "short", 361 | "label": "Compute Time (ms)", 362 | "logBase": 1, 363 | "max": null, 364 | "min": null, 365 | "show": true 366 | }, 367 | { 368 | "format": "short", 369 | "label": null, 370 | "logBase": 1, 371 | "max": null, 372 | "min": null, 373 | "show": false 374 | } 375 | ], 376 | "yaxis": { 377 | "align": false, 378 | "alignLevel": null 379 | } 380 | } 381 | ], 382 | "refresh": "5s", 383 | "schemaVersion": 19, 384 | "style": "dark", 385 | "tags": [], 386 | "templating": { 387 | "list": [] 388 | }, 389 | "time": { 390 | "from": "now-15m", 391 | "to": "now" 392 | }, 393 | "timepicker": { 394 | "refresh_intervals": [ 395 | "5s", 396 | "10s", 397 | "30s", 398 | "1m", 399 | "5m", 400 | "15m", 401 | "30m", 402 | "1h", 403 | "2h", 404 | "1d" 405 | ] 406 | }, 407 | "timezone": "", 408 | "title": "Triton Inference Server", 409 | "uid": "slEY4dsZk", 410 | "version": 8 411 | } 412 | -------------------------------------------------------------------------------- /inference/triton/run_triton.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # ============================================================================== 17 | 18 | MODELS_DIR=${1:-"/model/models"} 19 | 20 | set -m 21 | 22 | tritonserver --model-repository=$MODELS_DIR --backend-config=hugectr,dcn=$MODELS_DIR/dcn/1/dcn.json --backend-config=hugectr,supportlonglong=true --model-control-mode=poll --repository-poll-secs=10 & 23 | 24 | sleep 120 25 | 26 | echo "starting script" 27 | python3 /model/inference/load-triton-ensemble.py --triton_grpc_url localhost:8001 --model_name dcn_ens --verbose False 28 | 29 | fg %1 30 | -------------------------------------------------------------------------------- /inference/triton/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* 2 | # Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions 6 | # are met: 7 | # * Redistributions of source code must retain the above copyright 8 | # notice, this list of conditions and the following disclaimer. 9 | # * Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # * Neither the name of NVIDIA CORPORATION nor the names of its 13 | # contributors may be used to endorse or promote products derived 14 | # from this software without specific prior written permission. 15 | # 16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | */}} 28 | 29 | {{/* vim: set filetype=mustache: */}} 30 | {{/* 31 | Create inference server name. 32 | */}} 33 | {{- define "triton-inference-server.name" -}} 34 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} 35 | {{- end -}} 36 | 37 | {{/* 38 | Create a default fully qualified app name. 39 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 40 | If release name contains chart name it will be used as a full name. 41 | */}} 42 | {{- define "triton-inference-server.fullname" -}} 43 | {{- if .Values.fullnameOverride -}} 44 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} 45 | {{- else -}} 46 | {{- $name := default .Chart.Name .Values.nameOverride -}} 47 | {{- if contains $name .Release.Name -}} 48 | {{- .Release.Name | trunc 63 | trimSuffix "-" -}} 49 | {{- else -}} 50 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} 51 | {{- end -}} 52 | {{- end -}} 53 | {{- end -}} 54 | 55 | {{/* 56 | Create inference server metrics service name and fullname derived from above and 57 | truncated appropriately. 
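The base name is truncated to 55 characters before the "-metrics" suffix
(8 characters) is appended, and to 47 characters before "-metrics-monitor"
(16 characters), so the final names stay within the 63-character limit that
Kubernetes imposes on DNS-1123 names. For example, assuming a release named
"example", "triton-inference-server.fullname" yields
"example-triton-inference-server" and the metrics variant yields
"example-triton-inference-server-metrics".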
58 | */}} 59 | {{- define "triton-inference-server-metrics.name" -}} 60 | {{- $basename := include "triton-inference-server.name" . -}} 61 | {{- $basename_trimmed := $basename | trunc 55 | trimSuffix "-" -}} 62 | {{- printf "%s-%s" $basename_trimmed "metrics" -}} 63 | {{- end -}} 64 | 65 | {{- define "triton-inference-server-metrics.fullname" -}} 66 | {{- $basename := include "triton-inference-server.fullname" . -}} 67 | {{- $basename_trimmed := $basename | trunc 55 | trimSuffix "-" -}} 68 | {{- printf "%s-%s" $basename_trimmed "metrics" -}} 69 | {{- end -}} 70 | 71 | {{/* 72 | Create inference server metrics monitor name and fullname derived from 73 | above and truncated appropriately. 74 | */}} 75 | {{- define "triton-inference-server-metrics-monitor.name" -}} 76 | {{- $basename := include "triton-inference-server.name" . -}} 77 | {{- $basename_trimmed := $basename | trunc 47 | trimSuffix "-" -}} 78 | {{- printf "%s-%s" $basename_trimmed "metrics-monitor" -}} 79 | {{- end -}} 80 | 81 | {{- define "triton-inference-server-metrics-monitor.fullname" -}} 82 | {{- $basename := include "triton-inference-server.fullname" . -}} 83 | {{- $basename_trimmed := $basename | trunc 47 | trimSuffix "-" -}} 84 | {{- printf "%s-%s" $basename_trimmed "metrics-monitor" -}} 85 | {{- end -}} 86 | 87 | {{/* 88 | Create chart name and version as used by the chart label. 89 | */}} 90 | {{- define "triton-inference-server.chart" -}} 91 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} 92 | {{- end -}} 93 | -------------------------------------------------------------------------------- /inference/triton/templates/deployment.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | apiVersion: apps/v1 28 | kind: Deployment 29 | metadata: 30 | name: {{ template "triton-inference-server.fullname" . 
}} 31 | namespace: {{ .Release.Namespace }} 32 | labels: 33 | app: {{ template "triton-inference-server.name" . }} 34 | chart: {{ template "triton-inference-server.chart" . }} 35 | release: {{ .Release.Name }} 36 | heritage: {{ .Release.Service }} 37 | spec: 38 | replicas: {{ .Values.replicaCount }} 39 | selector: 40 | matchLabels: 41 | app: {{ template "triton-inference-server.name" . }} 42 | release: {{ .Release.Name }} 43 | template: 44 | metadata: 45 | labels: 46 | app: {{ template "triton-inference-server.name" . }} 47 | release: {{ .Release.Name }} 48 | 49 | spec: 50 | nodeSelector: 51 | cloud.google.com/gke-nodepool: a100-pool 52 | # cloud.google.com/gke-accelerator: nvidia-tesla-a100 53 | # nodeSelector: 54 | # cloud.google.com/gke-gpu-partition-size: {{ .Values.migPartition}} 55 | containers: 56 | - name: {{ .Chart.Name }} 57 | image: "{{ .Values.image.imageName }}" 58 | imagePullPolicy: {{ .Values.image.pullPolicy }} 59 | volumeMounts: 60 | - name: "storage" 61 | mountPath: /model 62 | 63 | resources: 64 | limits: 65 | nvidia.com/gpu: {{ .Values.image.numGpus }} 66 | 67 | command: ["/bin/sh","-c"] 68 | args: ["bash /model/inference/run_triton.sh {{ .Values.image.modelRepositoryPath }}"] 69 | 70 | ports: 71 | - containerPort: 8000 72 | name: http 73 | - containerPort: 8001 74 | name: grpc 75 | - containerPort: 8002 76 | name: metrics 77 | livenessProbe: 78 | httpGet: 79 | path: /v2/health/live 80 | port: http 81 | initialDelaySeconds: 100 82 | periodSeconds: 30 83 | readinessProbe: 84 | initialDelaySeconds: 100 85 | periodSeconds: 30 86 | httpGet: 87 | path: /v2/health/ready 88 | port: http 89 | volumes: 90 | - name: "storage" 91 | persistentVolumeClaim: 92 | claimName: my-volume-claim -------------------------------------------------------------------------------- /inference/triton/templates/service.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | apiVersion: v1 28 | kind: Service 29 | metadata: 30 | name: {{ template "triton-inference-server.fullname" . }} 31 | namespace: {{ .Release.Namespace }} 32 | labels: 33 | app: {{ template "triton-inference-server.name" . }} 34 | chart: {{ template "triton-inference-server.chart" . }} 35 | release: {{ .Release.Name }} 36 | heritage: {{ .Release.Service }} 37 | spec: 38 | type: {{ .Values.service.type }} 39 | ports: 40 | - port: 8000 41 | targetPort: http 42 | name: http-inference-server 43 | - port: 8001 44 | targetPort: grpc 45 | name: grpc-inference-server 46 | - port: 8002 47 | targetPort: metrics 48 | name: metrics-inference-server 49 | selector: 50 | app: {{ template "triton-inference-server.name" . }} 51 | release: {{ .Release.Name }} 52 | --- 53 | apiVersion: v1 54 | kind: Service 55 | metadata: 56 | name: {{ template "triton-inference-server-metrics.fullname" . }} 57 | namespace: {{ .Release.Namespace }} 58 | labels: 59 | app: {{ template "triton-inference-server-metrics.name" . }} 60 | chart: {{ template "triton-inference-server.chart" . }} 61 | release: {{ .Release.Name }} 62 | heritage: {{ .Release.Service }} 63 | annotations: 64 | alpha.monitoring.coreos.com/non-namespaced: "true" 65 | spec: 66 | ports: 67 | - name: metrics 68 | port: 8080 69 | targetPort: metrics 70 | protocol: TCP 71 | selector: 72 | app: {{ template "triton-inference-server.name" . }} 73 | release: {{ .Release.Name }} 74 | --- 75 | apiVersion: monitoring.coreos.com/v1 76 | kind: ServiceMonitor 77 | metadata: 78 | name: {{ template "triton-inference-server-metrics-monitor.fullname" . }} 79 | namespace: {{ .Release.Namespace }} 80 | labels: 81 | app: {{ template "triton-inference-server-metrics-monitor.name" . }} 82 | chart: {{ template "triton-inference-server.chart" . }} 83 | release: {{ .Release.Name }} 84 | heritage: {{ .Release.Service }} 85 | spec: 86 | selector: 87 | matchLabels: 88 | app: {{ template "triton-inference-server-metrics.name" . }} 89 | endpoints: 90 | - port: metrics 91 | interval: 10s -------------------------------------------------------------------------------- /inference/triton/values.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 
11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | replicaCount: 1 28 | 29 | migPartition: 3g.20gb 30 | 31 | image: 32 | imageName: gcr.io/dl-tme/merlin/merlin-inference:0.5.1 33 | pullPolicy: Always 34 | modelRepositoryPath: /model/models 35 | numGpus: 1 36 | 37 | service: 38 | type: LoadBalancer 39 | -------------------------------------------------------------------------------- /merlin-pipeline.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
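#
# This module defines the five-stage Kubeflow pipeline used throughout this
# repo: data extraction from GCS -> data validation -> preprocessing and
# HugeCTR training -> Triton deployment -> monitoring. All stages share one
# PersistentVolumeClaim and are pinned to a GPU node pool. Compiling it
# produces merlin-pipeline.py.tar.gz for upload to Kubeflow Pipelines; a
# hypothetical invocation (the image tags below are placeholders) looks like:
#   python3 merlin-pipeline.py \
#     -dex gcr.io/<project>/copy:0.5.1 -vc gcr.io/<project>/validation:0.5.1 \
#     -tc gcr.io/<project>/train:0.5.1 -dc gcr.io/<project>/inference:0.5.1 \
#     -mc gcr.io/<project>/monitoring:0.5.1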
14 | # ============================================================================== 15 | 16 | 17 | import os 18 | import kfp.dsl as dsl 19 | import kfp.gcp as gcp 20 | import kfp.components as comp 21 | 22 | import datetime 23 | 24 | from kubernetes import client as k8s_client 25 | import argparse 26 | import logging 27 | 28 | logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(name)s %(levelname)s:%(message)s') 29 | logger = logging.getLogger(__name__) 30 | 31 | args = None 32 | 33 | @dsl.pipeline( 34 | name="Merlin pipeline", 35 | description="HugeCTR training to deployment" 36 | ) 37 | def merlin_pipeline( 38 | accelerator: str = 'nvidia-tesla-a100', 39 | node_pool: str = 'gpu-pool', 40 | high_mem_node: str = 'none', 41 | data_dir: 'GCSPath' = 'gs://tme-criteo/dummy_data/*', 42 | new_data_dir: 'GCSPath' = 'gs://tme-criteo/new_data/*', 43 | gcs_bucket_head: str = 'tme-criteo', 44 | local_data_dir: str = '/var/lib/data', 45 | project_id: str = 'dl-tme', 46 | pipeline_name: str = 'merlin-pipeline', 47 | new_data_collection: str = 'new_data', 48 | do_data_validation: str = 'False', 49 | pubsub_sub_id: str = 'mlops-test-sub', 50 | cluster: str = 'merlin-mlops', 51 | zone: str = 'us-central1-a'): 52 | 53 | global args 54 | 55 | # Persistent volume variables 56 | persistent_volume_name = 'my-file-server' 57 | persistent_volume_claim_name = 'my-volume-claim' 58 | persistent_volume_path = '/var/lib/data' 59 | 60 | # First component - Copy data from GCS to PV 61 | copy_data = dsl.ContainerOp( 62 | name="data-extraction", 63 | image=args.data_extraction, 64 | command=["bash", "/script/run_copy_merlin.sh"], 65 | arguments=[data_dir, local_data_dir, project_id, new_data_dir, cluster, zone] 66 | ) 67 | 68 | # Second component - Data validation 69 | data_validation = dsl.ContainerOp( 70 | name="validate-data", 71 | image=args.validate_container, 72 | command=["bash", "/script/run_validation.sh"], 73 | arguments=[local_data_dir, do_data_validation] 74 | ) 75 | 76 | # Third component - Preprocess and Train 77 | preprocess_train = dsl.ContainerOp( 78 | name="merlin-preprocess-train", 79 | image=args.preprocess_train_container, 80 | command=["bash", "/script/preprocess-train.sh"], 81 | arguments=[local_data_dir, project_id, cluster, zone] 82 | ) 83 | 84 | # Fourth component - Model deployment 85 | deploy_triton = dsl.ContainerOp( 86 | name="triton-inference", 87 | image=args.deploy_container, 88 | command=["bash", "/script/run_merlin_inference.sh"], 89 | arguments=[local_data_dir, project_id, "/script/gcloud_key.json", cluster, zone] 90 | ) 91 | 92 | # Fifth component - Monitoring 93 | monitoring = dsl.ContainerOp( 94 | name="data-monitoring", 95 | image=args.monitor_container, 96 | command=["bash", "/script/run_monitoring.sh"], 97 | arguments=[project_id, args.monitor_container, pipeline_name, gcs_bucket_head, new_data_collection, "{}{}{}".format(local_data_dir, "/", new_data_collection), cluster, zone] 98 | ).set_gpu_limit(1).add_node_selector_constraint('cloud.google.com/gke-accelerator', accelerator).add_node_selector_constraint('cloud.google.com/gke-nodepool', node_pool) 99 | 100 | 101 | # Adding PV, PVC, GPU constraints to the components 102 | copy_data.add_volume(k8s_client.V1Volume(name=persistent_volume_name, 103 | persistent_volume_claim=k8s_client.V1PersistentVolumeClaimVolumeSource( 104 | claim_name=persistent_volume_claim_name))).add_volume_mount(k8s_client.V1VolumeMount( 105 | 
mount_path=persistent_volume_path, name=persistent_volume_name)).set_gpu_limit(1).add_node_selector_constraint('cloud.google.com/gke-accelerator', accelerator).add_node_selector_constraint('cloud.google.com/gke-nodepool', node_pool) 106 | 107 | data_validation.add_volume(k8s_client.V1Volume(name=persistent_volume_name, 108 | persistent_volume_claim=k8s_client.V1PersistentVolumeClaimVolumeSource( 109 | claim_name=persistent_volume_claim_name))).add_volume_mount(k8s_client.V1VolumeMount( 110 | mount_path=persistent_volume_path, name=persistent_volume_name)).set_gpu_limit(1).add_node_selector_constraint('cloud.google.com/gke-accelerator', accelerator).add_node_selector_constraint('cloud.google.com/gke-nodepool', node_pool) 111 | 112 | 113 | preprocess_train.add_volume(k8s_client.V1Volume(name=persistent_volume_name, 114 | persistent_volume_claim=k8s_client.V1PersistentVolumeClaimVolumeSource( 115 | claim_name=persistent_volume_claim_name))).add_volume_mount(k8s_client.V1VolumeMount( 116 | mount_path=persistent_volume_path, name=persistent_volume_name)).set_gpu_limit(1).add_node_selector_constraint('cloud.google.com/gke-accelerator', accelerator).add_node_selector_constraint('cloud.google.com/gke-nodepool', node_pool) 117 | 118 | deploy_triton.add_volume(k8s_client.V1Volume(name=persistent_volume_name, 119 | persistent_volume_claim=k8s_client.V1PersistentVolumeClaimVolumeSource( 120 | claim_name=persistent_volume_claim_name))).add_volume_mount(k8s_client.V1VolumeMount( 121 | mount_path=persistent_volume_path, name=persistent_volume_name)).set_gpu_limit(1).add_node_selector_constraint('cloud.google.com/gke-accelerator', accelerator).add_node_selector_constraint('cloud.google.com/gke-nodepool', node_pool) 122 | 123 | # Sequencing the components 124 | data_validation.after(copy_data) 125 | preprocess_train.after(data_validation) 126 | deploy_triton.after(preprocess_train) 127 | monitoring.after(deploy_triton) 128 | 129 | if __name__ == '__main__': 130 | parser = argparse.ArgumentParser() 131 | 132 | # Parse command line arguments 133 | parser.add_argument("-vc", 134 | "--validate_container", 135 | type=str, 136 | required=False, 137 | help="pass validate data container") 138 | 139 | parser.add_argument("-dex", 140 | "--data_extraction", 141 | type=str, 142 | required=True, 143 | help="pass copy container") 144 | 145 | parser.add_argument("-tc", 146 | "--preprocess_train_container", 147 | type=str, 148 | required=True, 149 | help="pass preprocess-train container") 150 | 151 | parser.add_argument("-dc", 152 | "--deploy_container", 153 | type=str, 154 | required=True, 155 | help="pass deploy container") 156 | 157 | parser.add_argument("-mc", 158 | "--monitor_container", 159 | type=str, 160 | required=True, 161 | help="pass monitoring container") 162 | 163 | args = parser.parse_args() 164 | 165 | logger.info("Data extraction container: " + args.data_extraction) 166 | logger.info("Validate container: " + str(args.validate_container)) 167 | logger.info("Preprocess-train container: " + args.preprocess_train_container) 168 | logger.info("Deploy container: " + args.deploy_container) 169 | logger.info("Monitor container: " + args.monitor_container) 170 | 171 | 172 | import kfp.compiler as compiler 173 | # Export pipeline as .tar.gz 174 | compiler.Compiler().compile(merlin_pipeline, __file__ + '.tar.gz') 175 | 176 | -------------------------------------------------------------------------------- /monitoring/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore
when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | -------------------------------------------------------------------------------- /monitoring/Chart.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | apiVersion: v1 28 | appVersion: "2.0" 29 | description: Monitoring Module 30 | name: monitoring-module 31 | version: 1.0.0 32 | -------------------------------------------------------------------------------- /monitoring/csv_read_gcs_write.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
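#
# This worker watches a directory on the shared PersistentVolume for the files
# that the performance monitor writes, uploads each one to a path in a GCS
# bucket, and then deletes the local copy. A hypothetical invocation (the
# bucket and path below are the repo's defaults, not requirements) looks like:
#   python3 csv_read_gcs_write.py --pv_dir /var/lib/data/new_data \
#     --sleep_time 10 --bucket criteo-data --bucket_path new_data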
14 | # ============================================================================== 15 | 16 | import pandas as pd 17 | import numpy as np 18 | from pyarrow import csv, parquet 19 | from glob import glob 20 | import os 21 | from datetime import datetime 22 | import re 23 | import sys 24 | from google.cloud import storage 25 | import argparse 26 | from time import sleep 27 | 28 | PATH = 'dummy' 29 | 30 | def get_local_files(path): 31 | local_files = glob(path+"/*") 32 | return local_files 33 | 34 | def files_to_data_frames(local_files): 35 | data_frames = [] 36 | for local_file in local_files: 37 | df = pd.read_csv(local_file) 38 | data_frames.append(df) 39 | del df 40 | return data_frames 41 | 42 | def files_to_data_frames_parquet(local_files): 43 | data_frames = [] 44 | for local_file in local_files: 45 | df = pd.read_parquet(local_file, engine='pyarrow') 46 | data_frames.append(df) 47 | del df 48 | return data_frames 49 | 50 | def one_giant_data_frame(data_frames): 51 | big_un = pd.concat(data_frames, copy=False) 52 | return big_un 53 | 54 | def file_to_data_frame_to_parquet(data_frame, parquet_file): 55 | # table = csv.read_csv(local_file) 56 | # parquet.write_table(table, parquet_file) 57 | data_frame.to_parquet(parquet_file, engine='pyarrow') 58 | 59 | class GCSStore: 60 | def __init__(self, bucket_name, bucket_path): 61 | self.bucket_name = bucket_name 62 | self.bucket_path = bucket_path 63 | # Create a Cloud Storage client. 64 | self.gcs = storage.Client() 65 | 66 | # Get the bucket that the file will be uploaded to. 67 | self.bucket = self.gcs.get_bucket(self.bucket_name) 68 | 69 | 70 | def list_bucket(self, limit=sys.maxsize): 71 | a_bucket = self.gcs.lookup_bucket(self.bucket_name) 72 | bucket_iterator = a_bucket.list_blobs(prefix=self.bucket_path) 73 | for resource in bucket_iterator: 74 | print(resource.name) 75 | limit = limit - 1 76 | if limit <= 0: 77 | break 78 | 79 | def upload_to_bucket(self, input_file_name, output_file_name): 80 | blob2 = self.bucket.blob(self.bucket_path + "/" + output_file_name) 81 | blob2.upload_from_filename(filename=input_file_name) 82 | 83 | 84 | if __name__=='__main__': 85 | parser = argparse.ArgumentParser() 86 | 87 | print("In read-write csv to parquet") 88 | 89 | parser.add_argument("--pv_dir", 90 | type=str, 91 | required=True, 92 | default="/var/lib/data/new_data", 93 | help="Path to new data in PV") 94 | 95 | parser.add_argument("--sleep_time", 96 | type=int, 97 | required=True, 98 | default=1, 99 | help="Sleep time in seconds") 100 | 101 | parser.add_argument("--bucket", 102 | type=str, 103 | required=True, 104 | default="criteo-data", 105 | help="Name of GCS bucket") 106 | 107 | parser.add_argument("--bucket_path", 108 | type=str, 109 | required=True, 110 | default="new_data", 111 | help="Path of directory to store files on GCS bucket") 112 | 113 | args = parser.parse_args() 114 | 115 | sleep_time = args.sleep_time 116 | gcs_store = GCSStore(args.bucket, args.bucket_path) 117 | 118 | while True: 119 | sleep(sleep_time) 120 | local_files = get_local_files(args.pv_dir) 121 | if len(local_files) == 0: 122 | print("No files to process. Sleeping for {} secs".format(sleep_time)) 123 | continue 124 | 125 | print("New files found. Pushing to GCS...") 126 | for each_file in local_files: 127 | print("pushing {} to {}".format(each_file, args.bucket + "/" + args.bucket_path + "/" +os.path.basename(each_file))) 128 | gcs_store.upload_to_bucket(each_file, os.path.basename(each_file)) 129 | print("Uploaded {} to {} at {}. 
Deleting {} from PV".format(each_file, 130 | args.bucket + "/" + args.bucket_path + "/" +os.path.basename(each_file), 131 | datetime.now(), each_file)) 132 | os.remove(each_file) 133 | 134 | 135 | -------------------------------------------------------------------------------- /monitoring/perf-monitor-test.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | import numpy as np 17 | import os 18 | import logging 19 | import argparse 20 | import sys 21 | import warnings 22 | import sys 23 | import time 24 | import json 25 | 26 | import cudf 27 | from sklearn import metrics 28 | import pandas as pd 29 | 30 | import tritonclient.http as httpclient 31 | import tritonclient.grpc as grpcclient 32 | from tritonclient.utils import * 33 | 34 | from google.cloud import pubsub_v1 35 | from google.protobuf.json_format import MessageToJson 36 | from google.pubsub_v1.types import Encoding 37 | 38 | 39 | 40 | def publish_batch(project_id, topic_id, current_batch, pred_label): 41 | # Initialize a Publisher client. 42 | client = pubsub_v1.PublisherClient() 43 | topic_path = client.topic_path(project_id, topic_id) 44 | 45 | batch_size = len(pred_label) 46 | df = current_batch.to_pandas() 47 | 48 | for i in range(batch_size): 49 | row = df.iloc[i] 50 | 51 | frame = { 52 | "input0": row[CONTINUOUS_COLUMNS].values.tolist(), 53 | "input1": row[CATEGORICAL_COLUMNS].values.tolist(), 54 | "trueval": row['label'], 55 | "predval": response.as_numpy("OUTPUT0")[i].astype('float64') 56 | } 57 | 58 | payload = json.dumps(frame).encode('utf-8') 59 | 60 | # When you publish a message, the client returns a future. 61 | api_future = client.publish(topic_path, data=''.encode(), payload=payload) 62 | 63 | 64 | if __name__ == "__main__": 65 | 66 | parser = argparse.ArgumentParser() 67 | 68 | parser.add_argument('-u', 69 | '--triton_grpc_url', 70 | type=str, 71 | required=False, 72 | default='localhost:8001', 73 | help='URL to Triton gRPC Endpoint') 74 | 75 | parser.add_argument('-m', 76 | '--model_name', 77 | type=str, 78 | required=False, 79 | default='dcn_ens', 80 | help='Name of the model ensemble to load') 81 | 82 | parser.add_argument('-d', 83 | '--test_data', 84 | type=str, 85 | required=False, 86 | default='/crit_int_pq/day_23.parquet', 87 | help='Path to a test .parquet file. Default') 88 | 89 | parser.add_argument('-b', 90 | '--batch_size', 91 | type=int, 92 | required=False, 93 | default=64, 94 | help='Batch size. 
Max is 64 at the moment, but this max size could be specified when create the model and the ensemble.') 95 | 96 | parser.add_argument('-n', 97 | '--n_batches', 98 | type=int, 99 | required=False, 100 | default=1, 101 | help='Number of batches of data to send') 102 | 103 | parser.add_argument('-v', 104 | '--verbose', 105 | type=bool, 106 | required=False, 107 | default=False, 108 | help='Verbosity, True or False') 109 | 110 | parser.add_argument("--project_id", 111 | type=str, 112 | required=True, 113 | default="dl-tme", 114 | help="Google Cloud project ID") 115 | 116 | parser.add_argument("--topic_id", 117 | type=str, 118 | required=True, 119 | default="pubsub", 120 | help="Pub/Sub topic ID") 121 | 122 | 123 | args = parser.parse_args() 124 | 125 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO, datefmt='%d-%m-%y %H:%M:%S') 126 | logging.info(f"Args: {args}") 127 | 128 | 129 | # warnings can be disabled 130 | if not sys.warnoptions: 131 | warnings.simplefilter("ignore") 132 | 133 | try: 134 | triton_client = grpcclient.InferenceServerClient(url=args.triton_grpc_url, verbose=args.verbose) 135 | logging.info("Triton client created.") 136 | 137 | triton_client.is_model_ready(args.model_name) 138 | logging.info(f"Model {args.model_name} is ready!") 139 | except Exception as e: 140 | logging.error(f"Channel creation failed: {str(e)}") 141 | sys.exit() 142 | 143 | # Load the dataset 144 | CATEGORICAL_COLUMNS = ['C' + str(x) for x in range(1,27)] 145 | CONTINUOUS_COLUMNS = ['I' + str(x) for x in range(1,14)] 146 | LABEL_COLUMNS = ['label'] 147 | col_names = CATEGORICAL_COLUMNS + CONTINUOUS_COLUMNS 148 | col_dtypes = [np.int32]*26 + [np.int64]*13 149 | 150 | logging.info("Reading dataset..") 151 | all_batches = cudf.read_parquet(args.test_data, num_rows=args.batch_size*args.n_batches) 152 | 153 | results=[] 154 | 155 | with grpcclient.InferenceServerClient(url=args.triton_grpc_url) as client: 156 | for batch in range(args.n_batches): 157 | 158 | logging.info(f"Requesting inference for batch {batch}..") 159 | start_idx = batch*args.batch_size 160 | end_idx = (batch+1)*(args.batch_size) 161 | 162 | # Convert the batch to a triton inputs 163 | current_batch = all_batches[start_idx:end_idx] 164 | columns = [(col, current_batch[col]) for col in col_names] 165 | inputs = [] 166 | 167 | for i, (name, col) in enumerate(columns): 168 | d = col.values_host.astype(col_dtypes[i]) 169 | d = d.reshape(len(d), 1) 170 | inputs.append(grpcclient.InferInput(name, d.shape, np_to_triton_dtype(col_dtypes[i]))) 171 | inputs[i].set_data_from_numpy(d) 172 | 173 | outputs = [] 174 | outputs.append(grpcclient.InferRequestedOutput("OUTPUT0")) 175 | 176 | response = client.infer(args.model_name, inputs, request_id=str(1), outputs=outputs) 177 | 178 | results.extend(response.as_numpy("OUTPUT0")) 179 | 180 | publish_batch(args.project_id, args.topic_id, 181 | current_batch, 182 | response.as_numpy("OUTPUT0")) 183 | 184 | logging.info(f"ROC AUC Score: {metrics.roc_auc_score(all_batches[LABEL_COLUMNS].values.tolist(), results)}") -------------------------------------------------------------------------------- /monitoring/perf-monitor.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | import os 17 | import logging 18 | from time import time, sleep 19 | from queue import Queue 20 | from threading import Thread 21 | 22 | import argparse 23 | from google.cloud import pubsub_v1 24 | import json 25 | import collections 26 | 27 | from sklearn import metrics 28 | import pandas as pd 29 | import numpy as np 30 | 31 | import kfp 32 | import datetime 33 | 34 | # logging.basicConfig(level=logging.INFO, 35 | # format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') 36 | # logger = logging.getLogger(__name__) 37 | 38 | # client = kfp.Client(host='https://320d47d67af4e8cf-dot-us-central1.pipelines.googleusercontent.com') 39 | 40 | def get_pipeline_id(name, client): 41 | pl_id = None 42 | page_size = 100 43 | page_token = '' 44 | while True: 45 | res = client.list_pipelines(page_size=page_size, page_token=page_token) 46 | pl_list = res.pipelines 47 | for pl in pl_list: 48 | if pl.name == name: 49 | pl_id = pl.id 50 | return pl_id 51 | page_token = res.next_page_token 52 | if not page_token: 53 | break 54 | return pl_id 55 | 56 | def get_pipeline_info(input_name, client_key): 57 | page_size = 200 58 | page_token = '' 59 | pipeline_runs = [] 60 | 61 | client = kfp.Client(host=client_key) 62 | 63 | res = client.list_runs(page_size=page_size, page_token=page_token) 64 | for runs in res.runs: 65 | if runs.resource_references[1].name == input_name: 66 | pipeline_runs.append(runs) 67 | 68 | if len(pipeline_runs) !=0: 69 | for prun in pipeline_runs: 70 | if prun.status == 'Running': 71 | return None 72 | 73 | # if prun.status == 'Succeeded': 74 | tmp = { 'pipelineID': prun.resource_references[1].key.id, 75 | 'experimentID': prun.resource_references[0].key.id, 76 | 'status': prun.status, 77 | 'new_run_name': 'triggered_'+str(datetime.datetime.now())} 78 | 79 | return tmp 80 | # pid = get_pipeline_id(input_name,client) 81 | # print("pid: ", name) 82 | 83 | return None 84 | 85 | def trigger_kfp(pipeline_name, client_key): 86 | logging.warning("Triggering Kubeflow Pipeline...") 87 | 88 | # If pipeline is already running --> False 89 | # Else -> True 90 | try: 91 | pipeline_info = get_pipeline_info(pipeline_name, client_key) 92 | except Exception as e: 93 | logging.error(f"Triggering pipeline error: {e}") 94 | return False 95 | 96 | logging.info(f"Pipeline info: {pipeline_info}") 97 | 98 | if pipeline_info != None: 99 | print("Using pipeline ID: ", pipeline_info['pipelineID'], " triggering ", pipeline_info['new_run_name'], " at: ", datetime.datetime.now()) 100 | client = kfp.Client(host=client_key) 101 | res = client.run_pipeline(pipeline_info['experimentID'], pipeline_info['new_run_name'], pipeline_id=pipeline_info['pipelineID']) 102 | return True 103 | else: 104 | logging.info("Did not trigger the pipeline") 105 | return False 106 | 107 | 108 | class AccMonitor: 109 | def __init__(self, project_id, subscription_id, timeout, evaluate_period=500, 110 | acc_threshold=0.5, min_trigger_len=0.5, pipeline_name='merlin-pipeline', 111 | min_log_length=320, 
log_time_delta=60, pv_location='/var/lib/data/', client_host=None): 112 | self.evaluate_period = evaluate_period 113 | self.pipeline_name = pipeline_name 114 | self.pv_location = pv_location 115 | self.client_host_key = client_host 116 | # Thread safe Queues where each item is a request 117 | self.request_queue = Queue(maxsize=self.evaluate_period) 118 | 119 | self.project_id = project_id 120 | self.subscription_id = subscription_id 121 | self.timeout = timeout 122 | self.acc_threshold = acc_threshold 123 | 124 | # Minimum number of results in the circular buffer to initiate a monitoring-based trigger 125 | self.min_trigger_len = min_trigger_len * self.evaluate_period 126 | # print("Min trigger length", self.min_trigger_len) 127 | 128 | # Logging configs 129 | self.min_log_length = min_log_length 130 | self.log_time_delta = datetime.timedelta(seconds=log_time_delta) 131 | 132 | # Circular buffer to store results in a rolling manner 133 | self.label_queue = collections.deque(maxlen=self.evaluate_period) 134 | self.pred_queue = collections.deque(maxlen=self.evaluate_period) 135 | 136 | def run(self): 137 | 138 | def enqueue_request(self): 139 | """ 140 | Receives messages from a Pub/Sub subscription and adds each request to a queue. 141 | 142 | The idea is to decouple message processing from message reception so that 143 | if there are a large number of messages at once, processing does not cause delays in the 144 | thread receiving messages. 145 | """ 146 | 147 | # Initialize a Subscriber client 148 | subscriber_client = pubsub_v1.SubscriberClient() 149 | 150 | # Create a fully qualified identifier in the form of 151 | # `projects/{project_id}/subscriptions/{subscription_id}` 152 | subscription_path = subscriber_client.subscription_path(self.project_id, self.subscription_id) 153 | 154 | def callback(message): 155 | # Acknowledge the message. Unack'ed messages will be redelivered. 156 | message.ack() 157 | # print("JSON of message:", json.loads(message.attributes)) 158 | # print(f"Acknowledged {message.message_id}.") 159 | 160 | payload = json.loads(message.attributes['payload']) 161 | 162 | # If the queue is at its max size, this blocks until items are consumed. 163 | # In case the dequeuing thread is slower, this will block the subscriber 164 | # from receiving more messages from the broker. The broker should 165 | # still have those messages so that they don't get lost. 166 | self.request_queue.put(payload) 167 | 168 | streaming_pull_future = subscriber_client.subscribe( 169 | subscription_path, callback=callback 170 | ) 171 | logging.info(f"Listening for messages on {subscription_path}..\n") 172 | 173 | try: 174 | # Calling result() on StreamingPullFuture keeps the main thread from 175 | # exiting while messages get processed in the callbacks. 176 | streaming_pull_future.result(timeout=self.timeout) 177 | except: 178 | streaming_pull_future.cancel() 179 | 180 | subscriber_client.close() 181 | 182 | # Start the enqueue thread as a daemon 183 | enqueue = Thread(target=enqueue_request, args=(self,)) 184 | enqueue.daemon = True 185 | enqueue.start() 186 | 187 | """ 188 | Fetches requests from a queue and calculates the rolling accuracy over the last N requests. 189 | If the rolling accuracy is below a pre-specified threshold, raises an alarm 190 | 191 | - We have access to the features here. 
We save the requests into a .parquet file 192 | in batches 193 | 194 | - PubSub usually does not guarantee in-order delivery of messages 195 | """ 196 | 197 | # Initialization 198 | rolling_acc = 1.0 199 | 200 | CATEGORICAL_COLUMNS = ['C' + str(x) for x in range(1,27)] 201 | CONTINUOUS_COLUMNS = ['I' + str(x) for x in range(1,14)] 202 | LABEL_COLUMNS = ['label'] 203 | col_names = LABEL_COLUMNS + CONTINUOUS_COLUMNS + CATEGORICAL_COLUMNS 204 | DATETIME_FORMAT = '%d_%m_%Y-%H-%M-%S' 205 | last_log_time = datetime.datetime.strptime('01_01_1970-00-00-00', DATETIME_FORMAT) 206 | 207 | # Create an empty dataframe 208 | df_temp = pd.DataFrame(columns = col_names) 209 | 210 | while True: 211 | while self.request_queue.empty(): 212 | # sleep so .get doesn't eat CPU cycles if the queue is empty 213 | sleep(0.1) 214 | 215 | # Fetch the payload 216 | payload = self.request_queue.get() 217 | 218 | # TODO: put checks for payload 219 | request = np.concatenate((np.array([payload["trueval"]], float), 220 | np.array(payload["input0"]), 221 | np.array(payload["input1"]))) 222 | 223 | # Append new request to the dataframe 224 | df_temp = df_temp.append(pd.DataFrame([request], columns=col_names)) 225 | 226 | # Write to a file if there are a minimum number of samples available, 227 | # and if a minimum amount of time has passed since the last write 228 | # TOFIX: This is problematic if no new request comes for a while and 229 | # there are many requests in the dataframe ready to be written already 230 | current_time = datetime.datetime.now() 231 | if (df_temp.shape[0] >= self.min_log_length) and \ 232 | (current_time - last_log_time >= self.log_time_delta): 233 | filename = current_time.strftime(DATETIME_FORMAT) + ".parquet" 234 | logging.info(f"Writing {df_temp.shape[0]} records to {self.pv_location}/{filename}...") 235 | # print(f"Writing {df_temp.shape[0]} records to {self.pv_location+filename}...") 236 | df_temp.reset_index(inplace=True, drop=True) 237 | df_temp.to_parquet(self.pv_location+"/"+filename) 238 | 239 | # Clear the dataframe 240 | df_temp = pd.DataFrame(columns = col_names) 241 | last_log_time = current_time 242 | 243 | # Circular buffer of size evaluate_period 244 | self.label_queue.append(payload["trueval"]) 245 | self.pred_queue.append(payload["predval"]) 246 | 247 | try: 248 | # This will fail if there is only one class in label_queue; catch and pass 249 | # in that case 250 | rolling_acc = metrics.roc_auc_score(self.label_queue, self.pred_queue) 251 | logging.info(f"Rolling AUC score: {rolling_acc}") 252 | except ValueError: 253 | pass 254 | 255 | 256 | if (rolling_acc < self.acc_threshold) and (len(self.label_queue) > self.min_trigger_len): 257 | success = trigger_kfp(self.pipeline_name, self.client_host_key) 258 | # If the pipeline has triggered, refresh the result circular buffer 259 | # and calculate fresh metrics. 
Ideally we need a better mechanism to 260 | # check whether the pipeline is already running and, if so, not retrigger 261 | if success: 262 | self.label_queue.clear() 263 | self.pred_queue.clear() 264 | rolling_acc = 1.0 265 | sleep(5) 266 | 267 | 268 | if __name__ == "__main__": 269 | parser = argparse.ArgumentParser() 270 | 271 | print("In Performance monitoring module") 272 | 273 | parser.add_argument("--project_id", 274 | type=str, 275 | required=True, 276 | default="dl-tme", 277 | help="Google Cloud project ID") 278 | 279 | parser.add_argument("--subscription_id", 280 | type=str, 281 | required=True, 282 | default="sub_id", 283 | help="Pub/Sub subscription ID") 284 | 285 | parser.add_argument("--timeout", 286 | type=int, 287 | required=False, 288 | default=None, 289 | help="Timeout for Streaming Pull") 290 | 291 | parser.add_argument("--evaluate_period", 292 | type=int, 293 | required=False, 294 | default=500, 295 | help="Evaluate over the last evaluate_period samples") 296 | 297 | parser.add_argument("--min_trigger_len", 298 | type=float, 299 | required=False, 300 | default=0.5, 301 | help="Minimum number of samples in queue before a monitoring-based trigger, \ 302 | as a fraction of evaluate_period") 303 | 304 | parser.add_argument("--acc_threshold", 305 | type=float, 306 | required=False, 307 | default=0.5, 308 | help="AUC ROC threshold for trigger. Default 0.5") 309 | 310 | parser.add_argument("--pipeline_name", 311 | type=str, 312 | required=False, 313 | default='merlin-pipeline', 314 | help="Name of the original pipeline") 315 | 316 | parser.add_argument("--min_log_length", 317 | type=int, 318 | required=False, 319 | default=320, 320 | help="Minimum number of requests in each .parquet file that is created") 321 | 322 | parser.add_argument("--log_time_delta", 323 | type=int, 324 | required=False, 325 | default=60, 326 | help="Minimum time delta (in secs) between two subsequent .parquet files") 327 | 328 | parser.add_argument("--PV_loc", 329 | type=str, 330 | required=False, 331 | default='/var/lib/data/new_data/', 332 | help="Location of PV to write the files") 333 | 334 | 335 | args = parser.parse_args() 336 | 337 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO, datefmt='%d-%m-%y %H:%M:%S') 338 | logging.info(f"Args: {args}") 339 | 340 | logging.info("Starting accuracy monitor...") 341 | 342 | client_host_key = None 343 | 344 | with open('/script/kfp_client_host_key.txt','r') as f: 345 | client_host_key = f.read() 346 | 347 | 348 | # TODO: Add better error handling, and move configs to a .json 349 | am = AccMonitor(project_id=args.project_id, 350 | subscription_id=args.subscription_id, 351 | timeout=args.timeout, 352 | evaluate_period=args.evaluate_period, 353 | acc_threshold=args.acc_threshold, 354 | min_trigger_len=args.min_trigger_len, 355 | pipeline_name=args.pipeline_name, 356 | min_log_length=args.min_log_length, 357 | log_time_delta=args.log_time_delta, 358 | pv_location=args.PV_loc, 359 | client_host=client_host_key) 360 | 361 | am.run() -------------------------------------------------------------------------------- /monitoring/run_monitoring.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | PROJECT=${1:-"dl-tme"} 19 | DOCKER_IMG=${2:-"gcr.io/${PROJECT}/monitoring:0.5.1"} 20 | PIPELINE=${3:-"merlin-pipeline"} 21 | GCS_BUCKET=${4:-"criteo-data"} 22 | BUCKET_PATH=${5:-"new_data"} 23 | LOCAL=${6:-"/var/lib/data/new_data"} 24 | PUBSUB=${7:-"mlops-test-sub"} 25 | CLUSTER=${8:-"merlin-mlops"} 26 | ZONE=${9:-"us-central1-a"} 27 | 28 | 29 | gcloud auth activate-service-account --key-file=/script/gcloud_key.json 30 | gcloud container clusters get-credentials $CLUSTER --zone $ZONE --project $PROJECT 31 | 32 | monitoring_status=$(helm status monitoring 2>&1) 33 | echo "monitoring status: " 34 | echo $monitoring_status 35 | if [[ "$monitoring_status" == "Error: release: not found" ]]; then 36 | helm install monitoring --set project_id=$PROJECT --set image.repository=$DOCKER_IMG --set pipeline=$PIPELINE --set gcs_bucket=$GCS_BUCKET --set bucket_path=$BUCKET_PATH --set local=$LOCAL --set pubsub=$PUBSUB /script 37 | else 38 | echo "Monitoring module running already, not deploying another instance" 39 | fi 40 | -------------------------------------------------------------------------------- /monitoring/run_monitoring_and_live_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================== 16 | 17 | PROJECT_ID=${1:-"dl-tme"} # Google Cloud project ID 18 | GCS_BUCKET=${2:-"criteo-data"} 19 | BUCKET_PATH=${3:-"new_data"} 20 | LOCAL=${4:-"/var/lib/data/new_data"} 21 | PIPELINE=${5:-"merlin-pipeline"} 22 | PUBSUB=${6:-"mlops-test-sub"} 23 | 24 | echo "perf monitor" 25 | python3 -u /script/perf-monitor.py --PV_loc $LOCAL --project_id $PROJECT_ID --subscription_id $PUBSUB --evaluate_period 200 --min_trigger_len 0.5 --acc_threshold 0.8 --pipeline_name $PIPELINE & 26 | 27 | echo "gcs" 28 | python3 -u /script/csv_read_gcs_write.py --pv_dir $LOCAL --sleep_time 10 --bucket $GCS_BUCKET --bucket_path $BUCKET_PATH 29 | 30 | echo "done" -------------------------------------------------------------------------------- /monitoring/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* 2 | Expand the name of the chart. 
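The name defaults to .Chart.Name ("monitoring-module" in this chart) unless
.Values.nameOverride is set, truncated to 63 characters to stay within the
Kubernetes DNS naming limit.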
3 | */}} 4 | {{- define "monitoring-module.name" -}} 5 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} 6 | {{- end }} 7 | 8 | {{/* 9 | Create a default fully qualified app name. 10 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 11 | If release name contains chart name it will be used as a full name. 12 | */}} 13 | {{- define "monitoring-module.fullname" -}} 14 | {{- if .Values.fullnameOverride }} 15 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} 16 | {{- else }} 17 | {{- $name := default .Chart.Name .Values.nameOverride }} 18 | {{- if contains $name .Release.Name }} 19 | {{- .Release.Name | trunc 63 | trimSuffix "-" }} 20 | {{- else }} 21 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} 22 | {{- end }} 23 | {{- end }} 24 | {{- end }} 25 | 26 | {{/* 27 | Create chart name and version as used by the chart label. 28 | */}} 29 | {{- define "monitoring-module.chart" -}} 30 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} 31 | {{- end }} 32 | 33 | {{/* 34 | Common labels 35 | */}} 36 | {{- define "monitoring-module.labels" -}} 37 | helm.sh/chart: {{ include "monitoring-module.chart" . }} 38 | {{ include "monitoring-module.selectorLabels" . }} 39 | {{- if .Chart.AppVersion }} 40 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} 41 | {{- end }} 42 | app.kubernetes.io/managed-by: {{ .Release.Service }} 43 | {{- end }} 44 | 45 | {{/* 46 | Selector labels 47 | */}} 48 | {{- define "monitoring-module.selectorLabels" -}} 49 | app.kubernetes.io/name: {{ include "monitoring-module.name" . }} 50 | app.kubernetes.io/instance: {{ .Release.Name }} 51 | {{- end }} 52 | 53 | {{/* 54 | Create the name of the service account to use 55 | */}} 56 | {{- define "monitoring-module.serviceAccountName" -}} 57 | {{- if .Values.serviceAccount.create }} 58 | {{- default (include "monitoring-module.fullname" .) .Values.serviceAccount.name }} 59 | {{- else }} 60 | {{- default "default" .Values.serviceAccount.name }} 61 | {{- end }} 62 | {{- end }} 63 | -------------------------------------------------------------------------------- /monitoring/templates/deployment.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | apiVersion: apps/v1 17 | kind: Deployment 18 | metadata: 19 | name: {{ include "monitoring-module.fullname" . }} 20 | labels: 21 | {{- include "monitoring-module.labels" . | nindent 4 }} 22 | spec: 23 | selector: 24 | matchLabels: 25 | {{- include "monitoring-module.selectorLabels" . | nindent 6 }} 26 | template: 27 | metadata: 28 | {{- with .Values.podAnnotations }} 29 | annotations: 30 | {{- toYaml . 
| nindent 8 }} 31 | {{- end }} 32 | labels: 33 | {{- include "monitoring-module.selectorLabels" . | nindent 8 }} 34 | spec: 35 | containers: 36 | - name: {{ .Chart.Name }} 37 | image: "{{ .Values.image.repository }}" 38 | imagePullPolicy: {{ .Values.image.pullPolicy }} 39 | command: ["/bin/sh","-c"] 40 | args: ["bash run_monitoring_and_live_data.sh {{ .Values.project_id }} {{ .Values.gcs_bucket }} {{ .Values.bucket_path }} {{ .Values.local }} {{ .Values.pipeline }} {{ .Values.pubsub }}"] 41 | volumeMounts: 42 | - name: "tmp-data-storage" 43 | mountPath: {{ .Values.local }} 44 | volumes: 45 | - name: "tmp-data-storage" 46 | persistentVolumeClaim: 47 | claimName: tmp-data-storage-claim 48 | -------------------------------------------------------------------------------- /monitoring/values.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | 17 | # Default values for monitoring-module. 18 | # This is a YAML-formatted file. 19 | # Declare variables to be passed into your templates. 20 | 21 | replicaCount: 1 22 | 23 | image: 24 | repository: nginx 25 | pullPolicy: Always 26 | # Overrides the image tag whose default is the chart appVersion. 
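# For reference (assuming the chart is installed from /script, as run_monitoring.sh does),
# every value below can be overridden at install time, e.g.:
#   helm install monitoring /script --set project_id=my-project --set pubsub=my-sub
# where my-project and my-sub are placeholder names.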
27 | 28 | project_id: "dl-tme" 29 | pipeline: "merlin-pipeline" 30 | gcs_bucket: "criteo-data" 31 | bucket_path: "new_data" 32 | local: "/var/lib/data/new_data" 33 | pubsub: "mlops-test-sub" -------------------------------------------------------------------------------- /preprocess-train/dcn_files/dcn.json: -------------------------------------------------------------------------------- 1 | { 2 | "inference": { 3 | "max_batchsize": 64, 4 | "hit_rate_threshold": 0.6, 5 | "dense_model_file": "/model/models/dcn/1/_dense_500.model", 6 | "sparse_model_file": "/model/models/dcn/1/0_sparse_500.model", 7 | "label": 1, 8 | "input_key_type": "I64" 9 | }, 10 | "layers": [ 11 | { 12 | "name": "data", 13 | "type": "Data", 14 | "check": "None", 15 | "label": { 16 | "label_dim": 1 17 | }, 18 | "dense": { 19 | "top": "dense", 20 | "dense_dim": 13 21 | }, 22 | "sparse": [ 23 | { 24 | "top": "data1", 25 | "type": "DistributedSlot", 26 | "max_feature_num_per_sample": 30, 27 | "slot_num": 26 28 | } 29 | ] 30 | }, 31 | { 32 | "name": "sparse_embedding1", 33 | "type": "DistributedSlotSparseEmbeddingHash", 34 | "bottom": "data1", 35 | "top": "sparse_embedding1", 36 | "sparse_embedding_hparam": { 37 | "max_vocabulary_size_per_gpu": 88656602, 38 | "embedding_vec_size": 16, 39 | "combiner": 0 40 | } 41 | }, 42 | { 43 | "name": "reshape1", 44 | "type": "Reshape", 45 | "bottom": "sparse_embedding1", 46 | "top": "reshape1", 47 | "leading_dim": 416 48 | }, 49 | { 50 | "name": "concat1", 51 | "type": "Concat", 52 | "bottom": ["reshape1","dense"], 53 | "top": "concat1" 54 | }, 55 | { 56 | "name": "slice1", 57 | "type": "Slice", 58 | "bottom": "concat1", 59 | "ranges": [[0,429], [0,429]], 60 | "top": ["slice11", "slice12"] 61 | }, 62 | { 63 | "name": "multicross1", 64 | "type": "MultiCross", 65 | "bottom": "slice11", 66 | "top": "multicross1", 67 | "mc_param": { 68 | "num_layers": 6 69 | } 70 | }, 71 | { 72 | "name": "fc1", 73 | "type": "InnerProduct", 74 | "bottom": "slice12", 75 | "top": "fc1", 76 | "fc_param": { 77 | "num_output": 1024 78 | } 79 | }, 80 | { 81 | "name": "relu1", 82 | "type": "ReLU", 83 | "bottom": "fc1", 84 | "top": "relu1" 85 | }, 86 | 87 | { 88 | "name": "dropout1", 89 | "type": "Dropout", 90 | "rate": 0.5, 91 | "bottom": "relu1", 92 | "top": "dropout1" 93 | }, 94 | { 95 | "name": "fc2", 96 | "type": "InnerProduct", 97 | "bottom": "dropout1", 98 | "top": "fc2", 99 | "fc_param": { 100 | "num_output": 1024 101 | } 102 | }, 103 | { 104 | "name": "relu2", 105 | "type": "ReLU", 106 | "bottom": "fc2", 107 | "top": "relu2" 108 | }, 109 | { 110 | "name": "dropout2", 111 | "type": "Dropout", 112 | "rate": 0.5, 113 | "bottom": "relu2", 114 | "top": "dropout2" 115 | }, 116 | 117 | { 118 | "name": "concat2", 119 | "type": "Concat", 120 | "bottom": ["dropout2","multicross1"], 121 | "top": "concat2" 122 | }, 123 | 124 | { 125 | "name": "fc4", 126 | "type": "InnerProduct", 127 | "bottom": "concat2", 128 | "top": "fc4", 129 | "fc_param": { 130 | "num_output": 1 131 | } 132 | }, 133 | 134 | { 135 | "name": "sigmoid", 136 | "type": "Sigmoid", 137 | "bottom": "fc4", 138 | "top": "sigmoid" 139 | } 140 | ] 141 | } 142 | -------------------------------------------------------------------------------- /preprocess-train/dcn_files/format_dcn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | 17 | import json, sys, argparse, os 18 | 19 | 20 | if __name__=='__main__': 21 | 22 | parser = argparse.ArgumentParser() 23 | 24 | parser.add_argument("--model_version", 25 | type=int, 26 | required=True, 27 | default=1, 28 | help="Provide model version") 29 | 30 | parser.add_argument("--dcn_path", 31 | type=str, 32 | required=True, 33 | default="/var/lib/data/script/dcn_files/dcn.json", 34 | help="Path of original DCN") 35 | 36 | 37 | args = parser.parse_args() 38 | 39 | dcn = os.path.basename(args.dcn_path) 40 | dir_path = os.path.dirname(args.dcn_path) 41 | obj = None 42 | with open(args.dcn_path, "r") as f: 43 | obj = json.load(f) 44 | obj["inference"]["dense_model_file"] = "/model/models/dcn/" + str(args.model_version) + "/_dense_500.model" 45 | obj["inference"]["sparse_model_file"] = "/model/models/dcn/" + str(args.model_version) + "/0_sparse_500.model" 46 | # print(obj["inference"]["dense_model_file"]) 47 | updated_json = dir_path+"/dcn" + str(args.model_version) + ".json" 48 | with open(updated_json,"w") as f: 49 | json.dump(obj, f) 50 | 51 | -------------------------------------------------------------------------------- /preprocess-train/preprocess-train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | set -e 18 | 19 | PV_LOC=${1:-"/var/lib/data"} 20 | PROJECT=${2:-"dl-tme"} 21 | cluster=${3:-"merlin-mlops"} 22 | zone=${4:-"us-central1-a"} 23 | 24 | cp -r /script $PV_LOC 25 | 26 | #echo "Preprocessing..." 27 | cd $PV_LOC 28 | echo $PV_LOC 29 | 30 | gcloud auth activate-service-account --key-file=/script/gcloud_key.json 31 | gcloud container clusters get-credentials $cluster --zone $zone --project $PROJECT 32 | gcloud config set project $PROJECT 33 | 34 | # Check if triton is deployed 35 | triton_status=$(helm status triton 2>&1) 36 | echo "Triton status: " 37 | echo $triton_status 38 | if [[ "$triton_status" == "Error: release: not found" ]]; then 39 | echo "Triton is not running. This is first deployment." 40 | echo "Preprocessing...." 
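# First deployment (no Triton release found): preprocess the raw Criteo parquet
# days with NVTabular, train the DCN from scratch with HugeCTR, and export a
# Triton model ensemble (dcn, dcn_nvt, dcn_ens) under $PV_LOC/models/.
# The else branch below handles triggered runs on incremental data instead.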
41 | ls -al $PV_LOC/criteo-data/crit_int_pq 42 | python3 -u $PV_LOC/script/preprocessing/nvt-preprocess.py -d $PV_LOC/criteo-data/crit_int_pq -o $PV_LOC/criteo-data/ -t 1 -v 1 -g 0 43 | 44 | echo "Training..." 45 | python3 -u $PV_LOC/script/training/hugectr-train-criteo-dcn.py --input_train $PV_LOC/criteo-data/test_dask/output/train/_file_list.txt --input_val $PV_LOC/criteo-data/test_dask/output/valid/_file_list.txt --max_iter 600 --snapshot 500 --num_gpus 0 46 | 47 | mkdir -p $PV_LOC/model/criteo_hugectr/1/ 48 | mv $PV_LOC/*.model $PV_LOC/model/criteo_hugectr/1/ 49 | 50 | mkdir -p $PV_LOC/models/ 51 | 52 | echo "Create ensemble" 53 | python3 -u $PV_LOC/script/training/create-nvt-hugectr-ensemble.py --nvt_workflow_path $PV_LOC/criteo-data/test_dask/output/workflow/ --hugectr_model_path $PV_LOC/model/criteo_hugectr/1/ --ensemble_output_path $PV_LOC/models/ --ensemble_config $PV_LOC/script/training/ensemble-config.json 54 | 55 | echo "Copy dcn.json" 56 | cp $PV_LOC/script/dcn_files/dcn.json $PV_LOC/models/dcn/1 57 | 58 | else 59 | echo "Triton is running. This is triggered run. Running incremental pre-processing" 60 | echo "Incremental preprocessing..." 61 | ls -al $PV_LOC/criteo-data/new_data 62 | python3 -u $PV_LOC/script/preprocessing/nvt-preprocess-incremental.py --input_train_dir $PV_LOC/criteo-data/new_data/ --output_dir $PV_LOC/criteo-data/output --workflow_dir $PV_LOC/criteo-data/test_dask/output/workflow/ --dask_workdir $PV_LOC/criteo-data/test_dask/workdir --num_gpus 0 63 | 64 | previous_version=$(ls $PV_LOC/model/criteo_hugectr/ -v | tail -n1) 65 | 66 | echo "Incremental Training..." 67 | python3 -u $PV_LOC/script/training/hugectr-train-criteo-dcn.py --input_train $PV_LOC/criteo-data/test_dask/output/train/_file_list.txt --input_val $PV_LOC/criteo-data/test_dask/output/valid/_file_list.txt --max_iter 600 --snapshot 500 --num_gpus 0 --dense_model_file $PV_LOC/model/criteo_hugectr/$previous_version/_dense_500.model --sparse_model_files $PV_LOC/model/criteo_hugectr/$previous_version/0_sparse_500.model 68 | 69 | new_version="$(($previous_version + 1))" 70 | 71 | mkdir -p $PV_LOC/model/criteo_hugectr/$new_version/ 72 | 73 | mv $PV_LOC/*.model $PV_LOC/model/criteo_hugectr/$new_version/ 74 | 75 | mkdir -p $PV_LOC/models_recurrent_runs 76 | 77 | echo "Incremental Create ensemble" 78 | python3 -u $PV_LOC/script/training/create-nvt-hugectr-ensemble.py --nvt_workflow_path $PV_LOC/criteo-data/test_dask/output/workflow/ --hugectr_model_path $PV_LOC/model/criteo_hugectr/$new_version/ --ensemble_output_path $PV_LOC/models_recurrent_runs --ensemble_config $PV_LOC/script/training/ensemble-config.json 79 | 80 | python3 -u $PV_LOC/script/dcn_files/format_dcn.py --model_version $new_version --dcn_path $PV_LOC/script/dcn_files/dcn.json 81 | 82 | mv $PV_LOC/models_recurrent_runs/dcn/1 $PV_LOC/models/dcn/$new_version 83 | mv $PV_LOC/models_recurrent_runs/dcn/config.pbtxt $PV_LOC/models/dcn/ 84 | cp $PV_LOC/script/dcn_files/dcn$new_version.json $PV_LOC/models/dcn/$new_version/dcn.json 85 | 86 | mv $PV_LOC/models_recurrent_runs/dcn_ens/1 $PV_LOC/models/dcn_ens/$new_version 87 | mv $PV_LOC/models_recurrent_runs/dcn_nvt/1 $PV_LOC/models/dcn_nvt/$new_version 88 | 89 | rm -rf $PV_LOC/models_recurrent_runs 90 | fi 91 | -------------------------------------------------------------------------------- /preprocess-train/preprocessing/nvt-preprocess-incremental.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | 17 | 18 | # Standard Libraries 19 | import os 20 | from time import time 21 | import re 22 | import shutil 23 | import glob 24 | import warnings 25 | import sys 26 | import argparse 27 | import logging 28 | 29 | # External Dependencies 30 | import numpy as np 31 | import pandas as pd 32 | import cupy as cp 33 | import cudf 34 | import dask_cudf 35 | from dask_cuda import LocalCUDACluster 36 | from dask.distributed import Client 37 | from dask.utils import parse_bytes 38 | from dask.delayed import delayed 39 | import rmm 40 | 41 | import nvtabular as nvt 42 | from nvtabular.utils import _pynvml_mem_size, device_mem_size 43 | 44 | def run_preprocessing(input_train_path, workflow_path, output_path, dask_workdir, num_gpus): 45 | fname = '{}.parquet' 46 | train_files = [i for i in os.listdir(input_train_path) if re.match(fname.format('.*'), i) is not None] 47 | train_paths = [os.path.join(input_train_path, filename) for filename in train_files] 48 | 49 | # Deploy a Dask Distributed Cluster 50 | # Single-Machine Multi-GPU Cluster 51 | protocol = "tcp" # "tcp" or "ucx" 52 | visible_devices = ",".join([str(n) for n in num_gpus]) # Select devices to place workers 53 | device_limit_frac = 0.4 # Spill GPU-Worker memory to host at this limit. 54 | device_pool_frac = 0.5 55 | part_mem_frac = 0.05 56 | 57 | # Use total device size to calculate args.device_limit_frac 58 | device_size = device_mem_size(kind="total") 59 | part_size = int(part_mem_frac * device_size) 60 | logging.info(f"Partition size: {part_size}") 61 | 62 | # Deploy Dask Distributed cluster only if asked for multiple GPUs 63 | if len(num_gpus) > 1: 64 | logging.info("Deploy Dask Distributed cluster...") 65 | 66 | device_limit = int(device_limit_frac * device_size) 67 | device_pool_size = int(device_pool_frac * device_size) 68 | 69 | logging.info("Checking if any device memory is already occupied...") 70 | # Check if any device memory is already occupied 71 | for dev in visible_devices.split(","): 72 | fmem = _pynvml_mem_size(kind="free", index=int(dev)) 73 | used = (device_size - fmem) / 1e9 74 | if used > 1.0: 75 | warnings.warn(f"BEWARE - {used} GB is already occupied on device {int(dev)}!") 76 | 77 | cluster = None # (Optional) Specify existing scheduler port 78 | if cluster is None: 79 | cluster = LocalCUDACluster( 80 | protocol = protocol, 81 | n_workers=len(visible_devices.split(",")), 82 | CUDA_VISIBLE_DEVICES = visible_devices, 83 | device_memory_limit = device_limit, 84 | local_directory=dask_workdir 85 | ) 86 | 87 | logging.info("Create the distributed client...") 88 | # Create the distributed client 89 | client = Client(cluster) 90 | 91 | logging.info("Initialize memory pools...") 92 | # Initialize RMM pool on ALL workers 93 | def _rmm_pool(): 94 | rmm.reinitialize( 95 | # RMM may require the pool size to be a multiple of 256.
96 | pool_allocator=True, 97 | initial_pool_size=(device_pool_size // 256) * 256, # Round down to a multiple of 256 98 | ) 99 | 100 | client.run(_rmm_pool) 101 | 102 | 103 | # Import the incremental .parquet data 104 | logging.info("Importing Data...") 105 | test_dataset = nvt.Dataset(train_paths, engine='parquet', part_size=part_size) 106 | 107 | logging.info("Loading workflow object...") 108 | workflow = nvt.Workflow.load(workflow_path) 109 | 110 | # Specify the column IDs: these must exactly match the columns used when preprocessing the train and valid datasets 111 | CONTINUOUS_COLUMNS = ['I' + str(x) for x in range(1,14)] 112 | CATEGORICAL_COLUMNS = ['C' + str(x) for x in range(1,27)] 113 | LABEL_COLUMNS = ['label'] 114 | dict_dtypes={} 115 | 116 | for col in CATEGORICAL_COLUMNS: 117 | dict_dtypes[col] = np.int64 118 | 119 | for col in CONTINUOUS_COLUMNS: 120 | dict_dtypes[col] = np.float32 121 | 122 | for col in LABEL_COLUMNS: 123 | dict_dtypes[col] = np.float32 124 | 125 | # Create output directory for the transformed data 126 | output_test_dir = os.path.join(output_path, 'train/') 127 | 128 | if not os.path.exists(output_test_dir): 129 | logging.info(f"Creating train/ directory at: {output_test_dir}") 130 | os.makedirs(output_test_dir) 131 | 132 | logging.info("Preprocessing Data...") 133 | workflow.transform(test_dataset).to_parquet(output_path=output_test_dir, 134 | dtypes=dict_dtypes, 135 | cats=CATEGORICAL_COLUMNS, 136 | conts=CONTINUOUS_COLUMNS, 137 | labels=LABEL_COLUMNS) 138 | 139 | logging.info("Done!") 140 | 141 | 142 | if __name__ == '__main__': 143 | parser = argparse.ArgumentParser() 144 | parser.add_argument('-t', 145 | '--input_train_dir', 146 | type=str, 147 | required=False, 148 | default='/crit_int_pq', 149 | help='Path to the input .parquet data dir. Default is /crit_int_pq') 150 | 151 | parser.add_argument('-o', 152 | '--output_dir', 153 | type=str, 154 | required=False, 155 | default='./test_dask/output/', 156 | help='Path for Output directory. Default is ./test_dask/output/') 157 | 158 | parser.add_argument('-w', 159 | '--workflow_dir', 160 | type=str, 161 | required=False, 162 | default='./test_dask/output/workflow/', 163 | help='Path to Saved Workflow object. This should be obtained from Preprocessing Training data. Default is ./test_dask/output/workflow') 164 | 165 | parser.add_argument('-e', 166 | '--dask_workdir', 167 | type=str, 168 | required=False, 169 | default='./test_dask/workdir', 170 | help='Working directory for Dask. Default is ./test_dask/workdir') 171 | 172 | parser.add_argument('-g', 173 | '--num_gpus', 174 | nargs='+', 175 | type=int, 176 | required=False, 177 | default=[0,1], 178 | help='GPU devices to use for Preprocessing') 179 | 180 | args = parser.parse_args() 181 | 182 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO, datefmt='%d-%m-%y %H:%M:%S') 183 | 184 | logging.info(f"Args: {args}") 185 | 186 | run_preprocessing(input_train_path=args.input_train_dir, 187 | workflow_path=args.workflow_dir, 188 | output_path=args.output_dir, 189 | dask_workdir=args.dask_workdir, 190 | num_gpus=args.num_gpus) 191 | -------------------------------------------------------------------------------- /preprocess-train/preprocessing/nvt-preprocess.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | # Standard Libraries 17 | import os 18 | from time import time 19 | import re 20 | import shutil 21 | import glob 22 | import warnings 23 | import argparse 24 | import logging 25 | 26 | # External Dependencies 27 | import numpy as np 28 | import cupy as cp 29 | import cudf 30 | import dask_cudf 31 | from dask_cuda import LocalCUDACluster 32 | from dask.distributed import Client 33 | from dask.utils import parse_bytes 34 | from dask.delayed import delayed 35 | import rmm 36 | 37 | # NVTabular 38 | import nvtabular as nvt 39 | from nvtabular.ops import Categorify, Clip, FillMissing, HashBucket, LambdaOp, LogOp, Rename, get_embedding_sizes, Normalize 40 | from nvtabular.io import Shuffle 41 | from nvtabular.utils import _pynvml_mem_size, device_mem_size 42 | 43 | 44 | def run_preprocessing(input_path, base_dir, num_train_days, num_val_days, num_gpus): 45 | 46 | # Define paths to save artifacts 47 | dask_workdir = os.path.join(base_dir, "test_dask/workdir") 48 | output_path = os.path.join(base_dir, "test_dask/output") 49 | stats_path = os.path.join(base_dir, "test_dask/stats") 50 | 51 | logging.info(f"Dask Workdir: {dask_workdir}") 52 | logging.info(f"Output Path: {output_path}") 53 | 54 | # Make sure we have a clean worker space for Dask 55 | if os.path.isdir(dask_workdir): 56 | shutil.rmtree(dask_workdir) 57 | os.makedirs(dask_workdir) 58 | 59 | # Make sure we have a clean stats space for Dask 60 | if os.path.isdir(stats_path): 61 | shutil.rmtree(stats_path) 62 | os.mkdir(stats_path) 63 | 64 | # Make sure we have a clean output path 65 | if os.path.isdir(output_path): 66 | shutil.rmtree(output_path) 67 | os.mkdir(output_path) 68 | 69 | logging.info("Created output directories..") 70 | 71 | # This requires the data files to follow the naming pattern day_<N>.parquet, e.g. day_0.parquet, day_1.parquet, etc. 72 | fname = 'day_{}.parquet' 73 | num_days = len([i for i in os.listdir(input_path) if re.match(fname.format('[0-9]{1,2}'), i) is not None]) 74 | train_paths = [os.path.join(input_path, fname.format(day)) for day in range(num_train_days)] 75 | valid_paths = [os.path.join(input_path, fname.format(day)) for day in range(num_train_days, num_train_days + num_val_days)] 76 | 77 | logging.info(f"Training data: {train_paths}") 78 | logging.info(f"Validation data: {valid_paths}") 79 | 80 | # Deploy a Dask Distributed Cluster 81 | # Single-Machine Multi-GPU Cluster 82 | protocol = "tcp" # "tcp" or "ucx" 83 | visible_devices = ",".join([str(n) for n in num_gpus]) # Select devices to place workers 84 | device_limit_frac = 0.4 # Spill GPU-Worker memory to host at this limit. 85 | device_pool_frac = 0.5 86 | part_mem_frac = 0.05 # Desired maximum size of each partition as a fraction of total GPU memory.
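    # Worked example of the fractions above (illustrative, assuming a single
    # GPU with ~16e9 bytes of memory): part_size caps each partition at about
    # 0.8 GB, workers start spilling to host memory past a device_limit of
    # about 6.4 GB, and each worker's RMM pool is about 8 GB.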
87 | 88 | # Use total device size to calculate args.device_limit_frac 89 | device_size = device_mem_size(kind="total") 90 | part_size = int(part_mem_frac * device_size) 91 | logging.info(f"Partition size: {part_size}") 92 | 93 | # Deploy Dask Distributed cluster only if asked for multiple GPUs 94 | if len(num_gpus) > 1: 95 | 96 | device_limit = int(device_limit_frac * device_size) 97 | device_pool_size = int(device_pool_frac * device_size) 98 | 99 | logging.info("Checking if any device memory is already occupied..") 100 | # Check if any device memory is already occupied 101 | for dev in visible_devices.split(","): 102 | fmem = _pynvml_mem_size(kind="free", index=int(dev)) 103 | used = (device_size - fmem) / 1e9 104 | if used > 1.0: 105 | warnings.warn(f"BEWARE - {used} GB is already occupied on device {int(dev)}!") 106 | 107 | cluster = None # (Optional) Specify existing scheduler port 108 | if cluster is None: 109 | cluster = LocalCUDACluster( 110 | protocol = protocol, 111 | n_workers=len(visible_devices.split(",")), 112 | CUDA_VISIBLE_DEVICES = visible_devices, 113 | device_memory_limit = device_limit, 114 | local_directory=dask_workdir 115 | ) 116 | 117 | logging.info("Create the distributed client..") 118 | # Create the distributed client 119 | client = Client(cluster) 120 | 121 | logging.info("Initialize memory pools..") 122 | # Initialize RMM pool on ALL workers 123 | def _rmm_pool(): 124 | rmm.reinitialize( 125 | # RMM may require the pool size to be a multiple of 256. 126 | pool_allocator=True, 127 | initial_pool_size=(device_pool_size // 256) * 256, 128 | ) 129 | 130 | client.run(_rmm_pool) 131 | 132 | # Preprocessing 133 | CONTINUOUS_COLUMNS = ['I' + str(x) for x in range(1,14)] 134 | CATEGORICAL_COLUMNS = ['C' + str(x) for x in range(1,27)] 135 | LABEL_COLUMNS = ['label'] 136 | COLUMNS = CONTINUOUS_COLUMNS + CATEGORICAL_COLUMNS + LABEL_COLUMNS 137 | 138 | cat_features = CATEGORICAL_COLUMNS >> Categorify(out_path=stats_path) 139 | cont_features = CONTINUOUS_COLUMNS >> FillMissing() >> Clip(min_value=0) >> Normalize() 140 | features = cat_features + cont_features + LABEL_COLUMNS 141 | 142 | logging.info("Defining a workflow object..") 143 | if len(num_gpus) > 1: 144 | workflow = nvt.Workflow(features, client=client) 145 | else: 146 | workflow = nvt.Workflow(features) 147 | 148 | dict_dtypes={} 149 | 150 | for col in CATEGORICAL_COLUMNS: 151 | dict_dtypes[col] = np.int64 152 | 153 | for col in CONTINUOUS_COLUMNS: 154 | dict_dtypes[col] = np.float32 155 | 156 | for col in LABEL_COLUMNS: 157 | dict_dtypes[col] = np.float32 158 | 159 | 160 | train_dataset = nvt.Dataset(train_paths, engine='parquet', part_size=part_size) 161 | valid_dataset = nvt.Dataset(valid_paths, engine='parquet', part_size=part_size) 162 | 163 | output_train_dir = os.path.join(output_path, 'train/') 164 | logging.info(f"Creating train/ directory at: {output_train_dir}") 165 | if not os.path.exists(output_train_dir): 166 | os.makedirs(output_train_dir) 167 | 168 | output_valid_dir = os.path.join(output_path, 'valid/') 169 | logging.info(f"Creating valid/ directory at: {output_valid_dir}") 170 | if not os.path.exists(output_valid_dir): 171 | os.makedirs(output_valid_dir) 172 | 173 | logging.info("Workflow Fit..") 174 | workflow.fit(train_dataset) 175 | 176 | logging.info("Transform Training data..") 177 | workflow.transform(train_dataset).to_parquet(output_path=output_train_dir, 178 | shuffle=nvt.io.Shuffle.PER_PARTITION, 179 | dtypes=dict_dtypes, 180 | cats=CATEGORICAL_COLUMNS, 181 | conts=CONTINUOUS_COLUMNS, 182 
| labels=LABEL_COLUMNS) 183 | 184 | logging.info("Transform Validation data..") 185 | workflow.transform(valid_dataset).to_parquet(output_path=output_valid_dir, 186 | dtypes=dict_dtypes, 187 | cats=CATEGORICAL_COLUMNS, 188 | conts=CONTINUOUS_COLUMNS, 189 | labels=LABEL_COLUMNS) 190 | 191 | 192 | # Use the printed cardinalities list as the "slot_size_array" in the HugeCTR training config, e.g. "dcn_parquet.json" 193 | cardinalities = [] 194 | for col in CATEGORICAL_COLUMNS: 195 | cardinalities.append(nvt.ops.get_embedding_sizes(workflow)[col][0]) 196 | 197 | logging.info(f"Cardinalities for configuring slot_size_array: {cardinalities}") 198 | 199 | logging.info(f"Saving workflow object at: {output_path + '/workflow'}") 200 | workflow.save(output_path + '/workflow') 201 | 202 | logging.info("Done!") 203 | 204 | 205 | if __name__ == '__main__': 206 | 207 | parser = argparse.ArgumentParser() 208 | parser.add_argument('-d', 209 | '--input_data_dir', 210 | type=str, 211 | required=False, 212 | default='/crit_int_pq', 213 | help='Path to the input Criteo parquet data dir. Default is /crit_int_pq') 214 | 215 | parser.add_argument('-o', 216 | '--output_dir', 217 | type=str, 218 | required=False, 219 | default='/var/lib/data/criteo-data/', 220 | help='Path for Output directory. It will create a directory "test_dask" to store artifacts. Default is /var/lib/data/criteo-data/') 221 | 222 | parser.add_argument('-t', 223 | '--n_train_days', 224 | type=int, 225 | required=False, 226 | default=1, 227 | help='Number of Criteo data days to use for training dataset. Default is 1. Keep n_train_days + n_val_days<=24') 228 | 229 | parser.add_argument('-v', 230 | '--n_val_days', 231 | type=int, 232 | required=False, 233 | default=1, 234 | help='Number of Criteo data days to take for validation set after n_train_days. Default is 1. Keep n_train_days + n_val_days<=24.') 235 | 236 | parser.add_argument('-g', 237 | '--num_gpus', 238 | nargs='+', 239 | type=int, 240 | required=False, 241 | default=[0,1,2,3,4,5,6,7], 242 | help='GPU devices to use for Preprocessing') 243 | 244 | args = parser.parse_args() 245 | 246 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO, datefmt='%d-%m-%y %H:%M:%S') 247 | 248 | logging.info(f"Args: {args}") 249 | 250 | run_preprocessing(input_path=args.input_data_dir, 251 | base_dir=args.output_dir, 252 | num_train_days=args.n_train_days, 253 | num_val_days=args.n_val_days, 254 | num_gpus=args.num_gpus) 255 | -------------------------------------------------------------------------------- /preprocess-train/training/create-nvt-hugectr-ensemble.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | # ============================================================================== 15 | 16 | import os 17 | import argparse 18 | import logging 19 | import json 20 | 21 | import nvtabular as nvt 22 | from nvtabular.inference.triton import export_hugectr_ensemble 23 | from nvtabular.ops import get_embedding_sizes 24 | 25 | 26 | def create_ensemble(workflow_path, hugectr_model_path, ensemble_output_path, ensemble_config_file): 27 | """ 28 | Creates an ensemble of NVTabular and HugeCTR model. 29 | 30 | This enables preprocessing at the time of inference, allowing the 31 | user to send raw data directly to the inference server. 32 | """ 33 | 34 | # Load the workflow object 35 | workflow = nvt.Workflow.load(workflow_path) 36 | 37 | # Verify that the workflow is loaded 38 | embeddings = get_embedding_sizes(workflow) 39 | logging.info(f"Embedding sizes for categorical features: {embeddings}") 40 | 41 | with open(ensemble_config_file, "r") as jsonfile: 42 | ensemble_config = json.load(jsonfile) 43 | 44 | hugectr_params = ensemble_config["hugectr_params"] 45 | 46 | # We override the config param to update the model version 47 | # Get the model version for updating the config accordingly 48 | model_version = hugectr_model_path.split('/')[-2] 49 | logging.info(f"Model version: {model_version}") 50 | model_json_path = hugectr_params["config"].split(os.sep) # "/model/models/dcn/1/dcn.json" -> ['', 'model', 'models', 'dcn', '1', 'dcn.json'] 51 | model_json_path[-2] = model_version # ['', 'model', 'models', 'dcn', '1', 'dcn.json'] -> ['', 'model', 'models', 'dcn', '2', 'dcn.json'] 52 | hugectr_params["config"] = os.sep + os.path.join(*model_json_path) # '/' + 'model/models/dcn/2/dcn.json' 53 | 54 | logging.info(f"HugeCTR configs: {hugectr_params}") 55 | 56 | categorical_cols = ensemble_config["categorical_cols"] 57 | continuous_cols = ensemble_config["continuous_cols"] 58 | label_cols = ensemble_config["label_cols"] 59 | 60 | logging.info(f"Categorical Columns: {categorical_cols}") 61 | logging.info(f"Continuous Columns: {continuous_cols}") 62 | logging.info(f"Label Columns: {label_cols}") 63 | 64 | logging.info(f"Generating the ensemble at directory: {ensemble_output_path}") 65 | export_hugectr_ensemble(workflow=workflow, 66 | hugectr_model_path=hugectr_model_path, 67 | hugectr_params=hugectr_params, 68 | name=ensemble_config["name"], 69 | output_path=ensemble_output_path, 70 | label_columns=label_cols, 71 | cats=categorical_cols, 72 | conts=continuous_cols, 73 | max_batch_size=ensemble_config["max_batch_size"]) 74 | 75 | if __name__ == '__main__': 76 | 77 | parser = argparse.ArgumentParser() 78 | parser.add_argument('-w', 79 | '--nvt_workflow_path', 80 | type=str, 81 | required=False, 82 | default='./test_dask/output/workflow', 83 | help='Path to Workflow Dir. Default is ./test_dask/output/workflow') 84 | 85 | parser.add_argument('-m', 86 | '--hugectr_model_path', 87 | type=str, 88 | required=False, 89 | default='/model/criteo_hugectr/1/', 90 | help='Path to where your .model files and inference .json is stored. Default is /model/criteo_hugectr/1/') 91 | 92 | parser.add_argument('-o', 93 | '--ensemble_output_path', 94 | type=str, 95 | required=False, 96 | default='/model/models/', 97 | help='Path to where your ensemble output must be stored. 
Default is /model/models') 98 | 99 | parser.add_argument('-c', 100 | '--ensemble_config', 101 | type=str, 102 | required=False, 103 | default='./ensemble-config.json', 104 | help='Path to where ensemble config .json') 105 | 106 | 107 | args = parser.parse_args() 108 | 109 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO, datefmt='%d-%m-%y %H:%M:%S') 110 | 111 | logging.info(f"Args: {args}") 112 | 113 | create_ensemble(workflow_path=args.nvt_workflow_path, 114 | hugectr_model_path=args.hugectr_model_path, 115 | ensemble_output_path=args.ensemble_output_path, 116 | ensemble_config_file=args.ensemble_config 117 | ) 118 | -------------------------------------------------------------------------------- /preprocess-train/training/ensemble-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "dcn", 3 | "categorical_cols": ["C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10", "C11", "C12", "C13", "C14", "C15", "C16", "C17", "C18", "C19", "C20", "C21", "C22", "C23", "C24", "C25", "C26"], 4 | "continuous_cols": ["I1", "I2", "I3", "I4", "I5", "I6", "I7", "I8", "I9", "I10", "I11", "I12", "I13"], 5 | "label_cols": ["label"], 6 | "max_batch_size": 64, 7 | "hugectr_params": { 8 | "config": "/model/models/dcn/1/dcn.json", 9 | "slots": 26, 10 | "max_nnz": 1, 11 | "embedding_vector_size": 16, 12 | "n_outputs": 1 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /preprocess-train/training/hugectr-train-criteo-dcn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | 16 | # Standard Libraries 17 | import argparse 18 | import logging 19 | 20 | import hugectr 21 | from mpi4py import MPI 22 | 23 | 24 | def train(input_train, input_val, max_iter, 25 | batchsize, snapshot, num_gpus, eval_interval, 26 | dense_model_file, sparse_model_files): 27 | 28 | logging.info(f"GPU Devices: {num_gpus}") 29 | 30 | # Configure and define the HugeCTR model 31 | solver = hugectr.solver_parser_helper(num_epochs = 0, 32 | max_iter = max_iter, 33 | max_eval_batches = 100, 34 | batchsize_eval = batchsize, 35 | batchsize = batchsize, 36 | model_file = dense_model_file, 37 | embedding_files = sparse_model_files, 38 | display = 200, 39 | eval_interval = eval_interval, 40 | i64_input_key = True, 41 | use_mixed_precision = False, 42 | repeat_dataset = True, 43 | snapshot = snapshot, 44 | vvgpu = [num_gpus], 45 | use_cuda_graph = False 46 | ) 47 | 48 | optimizer = hugectr.optimizer.CreateOptimizer(optimizer_type = hugectr.Optimizer_t.Adam, 49 | use_mixed_precision = False) 50 | model = hugectr.Model(solver, optimizer) 51 | 52 | # The slot_size_array are the cardinalities of each categorical feature after NVTabular preprocessing 53 | model.add(hugectr.Input(data_reader_type = hugectr.DataReaderType_t.Parquet, 54 | source = input_train, 55 | eval_source = input_val, 56 | check_type = hugectr.Check_t.Non, 57 | label_dim = 1, label_name = "label", 58 | dense_dim = 13, dense_name = "dense", 59 | slot_size_array = [18576837, 29428, 15128, 7296, 19902, 4, 6466, 1311, 62, 11700067, 622921, 219557, 11, 2209, 9780, 71, 4, 964, 15, 22022124, 4384510, 15960286, 290588, 10830, 96, 35], 60 | data_reader_sparse_param_array = 61 | [hugectr.DataReaderSparseParam(hugectr.DataReaderSparse_t.Distributed, 30, 1, 26)], 62 | sparse_names = ["data1"])) 63 | 64 | # Sparse Embedding Layer 65 | model.add(hugectr.SparseEmbedding(embedding_type = hugectr.Embedding_t.DistributedSlotSparseEmbeddingHash, 66 | max_vocabulary_size_per_gpu = 88656602, 67 | embedding_vec_size = 16, 68 | combiner = 0, 69 | sparse_embedding_name = "sparse_embedding1", 70 | bottom_name = "data1")) 71 | model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Reshape, 72 | bottom_names = ["sparse_embedding1"], 73 | top_names = ["reshape1"], 74 | leading_dim=416)) 75 | 76 | # Concatenate sparse embedding and dense input 77 | model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Concat, 78 | bottom_names = ["reshape1", "dense"], top_names = ["concat1"])) 79 | model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Slice, 80 | bottom_names = ["concat1"], 81 | top_names = ["slice11", "slice12"], 82 | ranges=[(0,429),(0,429)])) 83 | 84 | model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.MultiCross, 85 | bottom_names = ["slice11"], 86 | top_names = ["multicross1"], 87 | num_layers=6)) 88 | 89 | model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.InnerProduct, 90 | bottom_names = ["slice12"], 91 | top_names = ["fc1"], 92 | num_output=1024)) 93 | model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.ReLU, 94 | bottom_names = ["fc1"], 95 | top_names = ["relu1"])) 96 | model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Dropout, 97 | bottom_names = ["relu1"], 98 | top_names = ["dropout1"], 99 | dropout_rate=0.5)) 100 | model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.InnerProduct, 101 | bottom_names = ["dropout1"], 102 | top_names = ["fc2"], 103 | num_output=1024)) 104 | model.add(hugectr.DenseLayer(layer_type = 
hugectr.Layer_t.ReLU, 105 | bottom_names = ["fc2"], 106 | top_names = ["relu2"])) 107 | model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Dropout, 108 | bottom_names = ["relu2"], 109 | top_names = ["dropout2"], 110 | dropout_rate=0.5)) 111 | 112 | model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Concat, 113 | bottom_names = ["dropout2", "multicross1"], 114 | top_names = ["concat2"])) 115 | model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.InnerProduct, 116 | bottom_names = ["concat2"], 117 | top_names = ["fc3"], 118 | num_output=1)) 119 | model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.BinaryCrossEntropyLoss, 120 | bottom_names = ["fc3", "label"], 121 | top_names = ["loss"])) 122 | model.compile() 123 | model.summary() 124 | model.fit() 125 | 126 | 127 | if __name__ == '__main__': 128 | 129 | parser = argparse.ArgumentParser() 130 | parser.add_argument('-t', 131 | '--input_train', 132 | type=str, 133 | required=False, 134 | default='/mlops/scripts/test-script/test_dask/output/train/_file_list.txt', 135 | help='Path to training data _file_list.txt') 136 | 137 | parser.add_argument('-v', 138 | '--input_val', 139 | type=str, 140 | required=False, 141 | default='/mlops/scripts/test-script/test_dask/output/valid/_file_list.txt', 142 | help='Path to validation data _file_list.txt') 143 | 144 | parser.add_argument('-i', 145 | '--max_iter', 146 | type=int, 147 | required=False, 148 | default=20000, 149 | help='Number of training iterations') 150 | 151 | parser.add_argument('-b', 152 | '--batchsize', 153 | type=int, 154 | required=False, 155 | default=2048, 156 | help='Batch size') 157 | 158 | parser.add_argument('-s', 159 | '--snapshot', 160 | type=int, 161 | required=False, 162 | default=10000, 163 | help='Saves a model snapshot after the given number of iterations') 164 | 165 | parser.add_argument('-g', 166 | '--num_gpus', 167 | nargs='+', 168 | type=int, 169 | required=False, 170 | default=[0,1], 171 | help='GPU devices to use for training') 172 | 173 | parser.add_argument('-r', 174 | '--eval_interval', 175 | type=int, 176 | required=False, 177 | default=1000, 178 | help='Run evaluation after the given number of iterations') 179 | 180 | parser.add_argument('-d', 181 | '--dense_model_file', 182 | type=str, 183 | required=False, 184 | default=None, 185 | help='Path to an existing dense model. If provided, resumes training from here. E.g. ./_dense_19500.model ') 186 | 187 | parser.add_argument('-m', 188 | '--sparse_model_files', 189 | type=str, 190 | nargs='+', 191 | required=False, 192 | default=None, 193 | help='Paths to existing sparse snapshots. If provided, resumes training from here. E.g.
--sparse_model_files ./model-snapshot/0_sparse_19500.model ./model-snapshot/0_sparse_19500.model') 194 | 195 | args = parser.parse_args() 196 | 197 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO, datefmt='%d-%m-%y %H:%M:%S') 198 | 199 | logging.info(f"Args: {args}") 200 | 201 | # Both the dense and sparse model files should be provided if either one is provided 202 | if args.dense_model_file and args.sparse_model_files: 203 | logging.info("Training from previously saved model...") 204 | logging.info(f"Dense model file: {args.dense_model_file}") 205 | logging.info(f"Sparse model file: {args.sparse_model_files}") 206 | dense_model_file = args.dense_model_file 207 | sparse_model_files = args.sparse_model_files 208 | elif (args.dense_model_file and args.sparse_model_files is None) or \ 209 | (args.sparse_model_files and args.dense_model_file is None): 210 | parser.error("--dense_model_file and --sparse_model_files both need to be provided together.") 211 | else: 212 | logging.info("No previous checkpoint/model provided. Training from scratch. ") 213 | dense_model_file = "" 214 | sparse_model_files = [] 215 | 216 | train(input_train=args.input_train, 217 | input_val=args.input_val, 218 | max_iter=args.max_iter, 219 | batchsize=args.batchsize, 220 | snapshot=args.snapshot, 221 | eval_interval=args.eval_interval, 222 | num_gpus=args.num_gpus, 223 | dense_model_file=dense_model_file, 224 | sparse_model_files=sparse_model_files 225 | ) -------------------------------------------------------------------------------- /run_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # ============================================================================== 17 | 18 | 19 | PROJECT_ID=${1:-"dl-tme"} 20 | GCLOUD_KEY=${2:-"gcloud_key.json"} 21 | 22 | COPY_CONTAINER=gcr.io/$PROJECT_ID/google-nvidia-cloud-sdk:0.5.1 23 | TRAIN_CONTAINER=gcr.io/$PROJECT_ID/merlin/merlin-training:0.5.1 24 | MONITOR_COMPONENT=gcr.io/$PROJECT_ID/monitoring:0.5.1 25 | VALIDATE_CONTAINER=gcr.io/$PROJECT_ID/validation:0.5.1 26 | 27 | bash build_copy_container.sh $PROJECT_ID $GCLOUD_KEY 28 | COPY_CONTAINER=$(docker inspect --format="{{index .RepoDigests 0}}" gcr.io/$PROJECT_ID/google-nvidia-cloud-sdk:0.5.1) 29 | DEPLOY_CONTAINER=$COPY_CONTAINER 30 | 31 | bash build_validation_component.sh $PROJECT_ID $GCLOUD_KEY 32 | VALIDATE_CONTAINER=$(docker inspect --format="{{index .RepoDigests 0}}" gcr.io/$PROJECT_ID/validation:0.5.1) 33 | 34 | bash build_training_container.sh $PROJECT_ID $GCLOUD_KEY 35 | TRAIN_CONTAINER=$(docker inspect --format="{{index .RepoDigests 0}}" gcr.io/$PROJECT_ID/merlin/merlin-training:0.5.1) 36 | 37 | bash build_monitoring_component.sh $PROJECT_ID $GCLOUD_KEY 38 | MONITOR_COMPONENT=$(docker inspect --format="{{index .RepoDigests 0}}" gcr.io/$PROJECT_ID/monitoring:0.5.1) 39 | 40 | 41 | source activate mlpipeline 42 | python3 merlin-pipeline.py -vc $VALIDATE_CONTAINER -dex $COPY_CONTAINER -tc $TRAIN_CONTAINER -dc $DEPLOY_CONTAINER -mc $MONITOR_COMPONENT -------------------------------------------------------------------------------- /validation/generate-stats.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | 17 | 18 | import os 19 | import argparse 20 | 21 | import pandas as pd 22 | import tensorflow_data_validation as tfdv 23 | from google.protobuf.json_format import MessageToDict 24 | 25 | 26 | 27 | if __name__ == "__main__": 28 | parser = argparse.ArgumentParser() 29 | 30 | parser.add_argument('-d', 31 | '--data_dir', 32 | type=str, 33 | required=False, 34 | default='/crit_int_pq/day_23.parquet', 35 | help='Path to a data .parquet file. 
Default is /crit_int_pq/day_23.parquet') 36 | 37 | parser.add_argument('-o', 38 | '--output_dir', 39 | type=str, 40 | required=False, 41 | default='./output', 42 | help='Path to where stats must be saved') 43 | 44 | parser.add_argument('-f', 45 | '--file_name', 46 | type=str, 47 | required=False, 48 | default='stats.txt', 49 | help='Name of the stats file') 50 | 51 | 52 | args = parser.parse_args() 53 | 54 | 55 | # tfdv doesn't support generating stats directly from parquet, 56 | # so read through the pandas parquet reader. 57 | # Ideally, this should be an accelerated parquet reader, and stats 58 | # computation should happen on the GPU 59 | df = pd.read_parquet(args.data_dir) 60 | 61 | stats = tfdv.generate_statistics_from_dataframe(df) 62 | 63 | if not os.path.exists(args.output_dir): 64 | os.makedirs(args.output_dir) 65 | 66 | output_path = os.path.join(args.output_dir, args.file_name) 67 | 68 | tfdv.write_stats_text(stats, output_path=output_path) 69 | -------------------------------------------------------------------------------- /validation/run_validation.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | 19 | PV_LOC=${1:-"/var/lib/data"} 20 | VALIDATION=${2:-'False'} 21 | 22 | if [ -d "$PV_LOC/stats" ] && [ -f "$PV_LOC/stats/stats.txt" ]; then 23 | previous_version=$(ls $PV_LOC/stats/ -v | tail -n1 | tr -dc '0-9') 24 | new_version="$((${previous_version:-1} + 1))" 25 | new_file="$(ls $PV_LOC/criteo-data/new_data/ | shuf -n 1)" 26 | 27 | echo "Generating stats for new data..." 28 | python3 -u /script/generate-stats.py --data_dir $PV_LOC/criteo-data/new_data/$new_file --output_dir $PV_LOC/stats/ --file_name "stats"$new_version".txt" 29 | 30 | echo "Validating stats..." 31 | python3 -u /script/validate-stats.py --stats_file_1 $PV_LOC/stats/stats.txt --stats_file_2 $PV_LOC/stats/"stats"$new_version".txt" 32 | 33 | else 34 | if [[ "$VALIDATION" == 'True' ]]; then 35 | mkdir -p $PV_LOC/stats/ 36 | 37 | echo "Generating stats for training data..." 38 | python3 -u /script/generate-stats.py --data_dir $PV_LOC/criteo-data/crit_int_pq/day_0.parquet --output_dir $PV_LOC/stats/ --file_name "stats.txt" 39 | else 40 | echo "Not generating stats..." 41 | fi 42 | 43 | fi 44 | -------------------------------------------------------------------------------- /validation/validate-stats.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | 17 | import os 18 | import argparse 19 | import logging 20 | 21 | import pandas as pd 22 | import tensorflow_data_validation as tfdv 23 | from google.protobuf.json_format import MessageToDict 24 | 25 | 26 | if __name__ == "__main__": 27 | parser = argparse.ArgumentParser() 28 | 29 | parser.add_argument('-t', 30 | '--stats_file_1', 31 | type=str, 32 | required=False, 33 | default='./train_stats/stats.txt', 34 | help='Path to the training/reference stats .txt file ') 35 | 36 | parser.add_argument('-v', 37 | '--stats_file_2', 38 | type=str, 39 | required=False, 40 | default='./val_stats/stats.txt', 41 | help='Path to the validation stats .txt file ') 42 | 43 | 44 | args = parser.parse_args() 45 | 46 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO, datefmt='%d-%m-%y %H:%M:%S') 47 | logging.info(f"Args: {args}") 48 | 49 | 50 | stats1 = tfdv.load_stats_text(input_path=args.stats_file_1) 51 | stats2 = tfdv.load_stats_text(input_path=args.stats_file_2) 52 | 53 | schema1 = tfdv.infer_schema(statistics=stats1) 54 | 55 | # Custom rules, tweak this as required. This is just an example 56 | tfdv.get_feature(schema1, 'I1').drift_comparator.jensen_shannon_divergence.threshold = 0.06 57 | 58 | # Calculate drift between the reference stats stats1, and the statistics from new data in stats2 59 | drift_anomalies = tfdv.validate_statistics(statistics=stats2, 60 | schema=schema1, 61 | previous_statistics=stats1) 62 | 63 | # Convert the .pb2 to dict 64 | drift = MessageToDict(drift_anomalies) 65 | 66 | value = drift['driftSkewInfo'][0]['driftMeasurements'][0]['value'] 67 | threshold = drift['driftSkewInfo'][0]['driftMeasurements'][0]['threshold'] 68 | logging.info(f"JS divergence value: {value}, and JS divergence threshold: {threshold}") 69 | drift_detected = True 70 | if value < threshold: 71 | drift_detected = False 72 | logging.info(f"Drift detected: {drift_detected}") 73 | -------------------------------------------------------------------------------- /yamls/Autoscaling_custom_metrics/1_custom-metric-server-config.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | 16 | 17 | apiVersion: v1 18 | kind: ConfigMap 19 | metadata: 20 | name: adapter-config 21 | namespace: custom-metrics 22 | data: 23 | adapter-config-data: | 24 | rules: 25 | - seriesQuery: 'nv_inference_queue_duration_us{namespace="default"}' 26 | resources: 27 | overrides: 28 | namespace: 29 | resource: "namespace" 30 | pod: 31 | resource: pod 32 | name: 33 | matches: "nv_inference_queue_duration_us" 34 | as: "avg_time_queue_ms" 35 | metricsQuery: 'avg(delta(nv_inference_queue_duration_us{<<.LabelMatchers>>}[30s])/(1+delta(nv_inference_request_success{<<.LabelMatchers>>}[30s]))/1000) by (<<.GroupBy>>)' 36 | -------------------------------------------------------------------------------- /yamls/Autoscaling_custom_metrics/2_custom-metric-server.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | 17 | apiVersion: apps/v1 18 | kind: Deployment 19 | metadata: 20 | name: custom-metrics-apiserver 21 | namespace: custom-metrics 22 | labels: 23 | app: custom-metrics-apiserver 24 | spec: 25 | replicas: 1 26 | selector: 27 | matchLabels: 28 | app: custom-metrics-apiserver 29 | template: 30 | metadata: 31 | labels: 32 | app: custom-metrics-apiserver 33 | spec: 34 | containers: 35 | - name: custom-metrics-server 36 | #image: quay.io/coreos/k8s-prometheus-adapter-amd64:v0.4.1 37 | image: directxman12/k8s-prometheus-adapter-amd64 38 | args: 39 | #- --prometheus-url=http://kube-prometheus-stack-1616-prometheus:9090 40 | - --prometheus-url=http://10.4.6.5:9090 41 | - --metrics-relist-interval=30s 42 | - --v=10 43 | - --config=/etc/config/adapter-config.yaml 44 | ports: 45 | - containerPort: 443 46 | volumeMounts: 47 | - name: config-volume 48 | mountPath: /etc/config 49 | securityContext: 50 | runAsUser: 0 51 | volumes: 52 | - name: config-volume 53 | configMap: 54 | name: adapter-config 55 | items: 56 | - key: adapter-config-data 57 | path: adapter-config.yaml 58 | 59 | --- 60 | apiVersion: v1 61 | kind: Service 62 | metadata: 63 | name: api 64 | namespace: custom-metrics 65 | spec: 66 | selector: 67 | app: custom-metrics-apiserver 68 | ports: 69 | - port: 443 70 | targetPort: 443 71 | --- 72 | apiVersion: apiregistration.k8s.io/v1 73 | kind: APIService 74 | metadata: 75 | name: v1beta1.custom.metrics.k8s.io 76 | spec: 77 | insecureSkipTLSVerify: true 78 | group: custom.metrics.k8s.io 79 | groupPriorityMinimum: 1000 80 | versionPriority: 5 81 | service: 82 | name: api 83 | namespace: custom-metrics 84 | version: v1beta1 85 | --- -------------------------------------------------------------------------------- /yamls/Autoscaling_custom_metrics/3_custom-metrics-server-rbac.yaml: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | 17 | kind: ServiceAccount 18 | apiVersion: v1 19 | metadata: 20 | name: custom-metrics-apiserver 21 | namespace: custom-metrics 22 | --- 23 | apiVersion: rbac.authorization.k8s.io/v1 24 | kind: ClusterRoleBinding 25 | metadata: 26 | name: custom-metrics:system:auth-delegator 27 | roleRef: 28 | apiGroup: rbac.authorization.k8s.io 29 | kind: ClusterRole 30 | name: system:auth-delegator 31 | subjects: 32 | - kind: ServiceAccount 33 | name: custom-metrics-apiserver 34 | namespace: custom-metrics 35 | --- 36 | apiVersion: rbac.authorization.k8s.io/v1 37 | kind: RoleBinding 38 | metadata: 39 | name: custom-metrics-auth-reader 40 | namespace: kube-system 41 | roleRef: 42 | apiGroup: rbac.authorization.k8s.io 43 | kind: Role 44 | name: extension-apiserver-authentication-reader 45 | subjects: 46 | - kind: ServiceAccount 47 | name: custom-metrics-apiserver 48 | namespace: custom-metrics 49 | --- 50 | apiVersion: rbac.authorization.k8s.io/v1 51 | kind: ClusterRoleBinding 52 | metadata: 53 | name: custom-metrics-resource-reader 54 | roleRef: 55 | apiGroup: rbac.authorization.k8s.io 56 | kind: ClusterRole 57 | name: custom-metrics-resource-reader 58 | subjects: 59 | - kind: ServiceAccount 60 | name: custom-metrics-apiserver 61 | namespace: custom-metrics 62 | --- 63 | apiVersion: rbac.authorization.k8s.io/v1 64 | kind: ClusterRole 65 | metadata: 66 | name: custom-metrics-server-resources 67 | rules: 68 | - apiGroups: 69 | - custom.metrics.k8s.io 70 | resources: ["*"] 71 | verbs: ["*"] 72 | --- 73 | apiVersion: rbac.authorization.k8s.io/v1 74 | kind: ClusterRole 75 | metadata: 76 | name: custom-metrics-resource-reader 77 | rules: 78 | - apiGroups: 79 | - "" 80 | resources: 81 | - namespaces 82 | - pods 83 | - services 84 | verbs: 85 | - get 86 | - list 87 | --- 88 | apiVersion: rbac.authorization.k8s.io/v1 89 | kind: ClusterRoleBinding 90 | metadata: 91 | name: hpa-controller-custom-metrics 92 | roleRef: 93 | apiGroup: rbac.authorization.k8s.io 94 | kind: ClusterRole 95 | name: custom-metrics-server-resources 96 | subjects: 97 | - kind: ServiceAccount 98 | name: horizontal-pod-autoscaler 99 | namespace: kube-system 100 | --- -------------------------------------------------------------------------------- /yamls/Autoscaling_custom_metrics/4_triton-hpa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | 17 | apiVersion: autoscaling/v2beta1 18 | kind: HorizontalPodAutoscaler 19 | metadata: 20 | name: trtis-metrics-app-hpa 21 | spec: 22 | scaleTargetRef: 23 | apiVersion: apps/v1beta1 24 | kind: Deployment 25 | name: triton-triton-inference-server 26 | minReplicas: 1 27 | maxReplicas: 2 28 | metrics: 29 | - type: Object 30 | object: 31 | target: 32 | kind: Namespace 33 | name: default 34 | metricName: avg_time_queue_ms 35 | targetValue: 200m -------------------------------------------------------------------------------- /yamls/pv.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | 17 | apiVersion: v1 18 | kind: PersistentVolume 19 | metadata: 20 | name: my-file-server 21 | spec: 22 | capacity: 23 | storage: 1T 24 | accessModes: 25 | - ReadWriteMany 26 | nfs: 27 | path: /myVolume 28 | server: 10.0.0.2 -------------------------------------------------------------------------------- /yamls/pvc.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | apiVersion: v1 17 | kind: PersistentVolumeClaim 18 | metadata: 19 | name: my-volume-claim 20 | spec: 21 | accessModes: 22 | - ReadWriteMany 23 | resources: 24 | requests: 25 | storage: 1T 26 | storageClassName: "" 27 | --------------------------------------------------------------------------------
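A deployment sketch (illustrative; the repo does not ship this as a script): the storage and autoscaling manifests above are applied with kubectl before the pipeline runs, for example:

    kubectl apply -f yamls/pv.yaml
    kubectl apply -f yamls/pvc.yaml
    kubectl create namespace custom-metrics
    kubectl apply -f yamls/Autoscaling_custom_metrics/
    # Confirm the adapter serves Triton's queue-time metric to the HPA:
    kubectl get --raw "/apis/custom.metrics.k8s.io/v1beta1/namespaces/default/metrics/avg_time_queue_ms"

Note that the NFS server address in pv.yaml (10.0.0.2) and the Prometheus URL hard-coded in 2_custom-metric-server.yaml (http://10.4.6.5:9090) must be adjusted to the target environment.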