├── .gitignore ├── Dockerfile.copy ├── Dockerfile.monitoring ├── Dockerfile.train ├── Dockerfile.validation ├── LICENSE ├── README.md ├── build_copy_container.sh ├── build_monitoring_component.sh ├── build_training_container.sh ├── build_validation_component.sh ├── data-extraction └── run_copy_merlin.sh ├── images └── merlin-kubeflow-arch.png ├── inference ├── criteo-inference-client.py ├── load-triton-ensemble.py ├── run_merlin_inference.sh └── triton │ ├── Chart.yaml │ ├── README.md │ ├── dashboard.json │ ├── run_triton.sh │ ├── templates │ ├── _helpers.tpl │ ├── deployment.yaml │ └── service.yaml │ └── values.yaml ├── merlin-pipeline.py ├── monitoring ├── .helmignore ├── Chart.yaml ├── csv_read_gcs_write.py ├── perf-monitor-test.py ├── perf-monitor.py ├── run_monitoring.sh ├── run_monitoring_and_live_data.sh ├── templates │ ├── _helpers.tpl │ └── deployment.yaml └── values.yaml ├── preprocess-train ├── dcn_files │ ├── dcn.json │ └── format_dcn.py ├── preprocess-train.sh ├── preprocessing │ ├── nvt-preprocess-incremental.py │ └── nvt-preprocess.py └── training │ ├── create-nvt-hugectr-ensemble.py │ ├── ensemble-config.json │ └── hugectr-train-criteo-dcn.py ├── run_all.sh ├── validation ├── generate-stats.py ├── run_validation.sh ├── train_stats │ └── stats.txt ├── val_stats │ └── stats.txt └── validate-stats.py └── yamls ├── Autoscaling_custom_metrics ├── 1_custom-metric-server-config.yaml ├── 2_custom-metric-server.yaml ├── 3_custom-metrics-server-rbac.yaml └── 4_triton-hpa.yaml ├── pv.yaml └── pvc.yaml /.gitignore: -------------------------------------------------------------------------------- 1 | *.tar 2 | *.tar.gz 3 | gcloud_key.json 4 | kfp_client_host_key.txt 5 | -------------------------------------------------------------------------------- /Dockerfile.copy: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | 17 | FROM google/cloud-sdk 18 | 19 | ARG gcloud_key 20 | 21 | # Install dependencies 22 | RUN apt-get update && \ 23 | apt-get install -y python3-pip vim curl 24 | 25 | RUN curl https://baltocdn.com/helm/signing.asc | apt-key add - && \ 26 | apt-get install apt-transport-https --yes && \ 27 | echo "deb https://baltocdn.com/helm/stable/debian/ all main" | tee /etc/apt/sources.list.d/helm-stable-debian.list && \ 28 | apt-get update && apt-get install helm 29 | 30 | RUN pip3 install kfp 31 | 32 | COPY $PWD/data-extraction/run_copy_merlin.sh /script/ 33 | COPY $PWD/inference /script 34 | COPY $gcloud_key /script 35 | 36 | WORKDIR /script 37 | -------------------------------------------------------------------------------- /Dockerfile.monitoring: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | 17 | FROM google/cloud-sdk 18 | 19 | ARG gcloud_key 20 | ARG project_id 21 | 22 | # Install dependencies 23 | RUN apt-get update && \ 24 | apt-get install -y python3-pip vim curl 25 | 26 | RUN pip3 install kfp 27 | RUN pip3 install --upgrade google-cloud-pubsub 28 | RUN pip3 install scikit-learn pandas pyarrow 29 | 30 | RUN curl https://baltocdn.com/helm/signing.asc | apt-key add - && \ 31 | apt-get install apt-transport-https --yes && \ 32 | echo "deb https://baltocdn.com/helm/stable/debian/ all main" | tee /etc/apt/sources.list.d/helm-stable-debian.list && \ 33 | apt-get update && apt-get install helm 34 | 35 | COPY $PWD/monitoring /script/ 36 | COPY $gcloud_key /script/ 37 | COPY $PWD/kfp_client_host_key.txt /script/ 38 | 39 | # Set Environment variables 40 | ENV GOOGLE_APPLICATION_CREDENTIALS /script/$gcloud_key 41 | ENV PROJECT $project_id 42 | # ENV PYTHONUNBUFFERED=0 43 | 44 | WORKDIR /script 45 | -------------------------------------------------------------------------------- /Dockerfile.train: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | 17 | FROM nvcr.io/nvidia/merlin/merlin-training:0.5.1 18 | 19 | ARG gcloud_key 20 | 21 | RUN apt-get update && \ 22 | apt-get install -y python3-pip vim curl 23 | 24 | RUN curl https://baltocdn.com/helm/signing.asc | apt-key add - && \ 25 | apt-get install apt-transport-https --yes && \ 26 | echo "deb https://baltocdn.com/helm/stable/debian/ all main" | tee /etc/apt/sources.list.d/helm-stable-debian.list && \ 27 | apt-get update && apt-get install helm 28 | 29 | RUN curl -sSL https://sdk.cloud.google.com | bash 30 | ENV PATH $PATH:/root/google-cloud-sdk/bin 31 | 32 | COPY $PWD/preprocess-train /script 33 | COPY $gcloud_key /script 34 | 35 | WORKDIR /script 36 | -------------------------------------------------------------------------------- /Dockerfile.validation: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | 17 | FROM google/cloud-sdk 18 | 19 | ARG gcloud_key 20 | ARG project_id 21 | 22 | RUN apt-get update && \ 23 | apt-get install -y python3-pip vim curl 24 | 25 | RUN pip3 install tensorflow-data-validation && \ 26 | pip3 install protobuf && \ 27 | pip3 install pandas ipython 28 | 29 | COPY $PWD/validation /script/ 30 | COPY $gcloud_key /script/ 31 | 32 | ENV GOOGLE_APPLICATION_CREDENTIALS /script/$gcloud_key 33 | ENV PROJECT $project_id 34 | ENV PYTHONUNBUFFERED=0 35 | 36 | WORKDIR /script 37 | 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Merlin - MLOps on GKE 2 | 3 | ## Introduction 4 | [NVIDIA Merlin](https://developer.nvidia.com/nvidia-merlin) is an open-source application framework that facilitates the development and deployment of large-scale deep recommender systems on GPUs. 
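The pipeline itself is defined in `merlin-pipeline.py` with the Kubeflow Pipelines (KFP) SDK. As a rough sketch of the mechanics only (the pipeline function, stages, endpoint, and arguments below are illustrative placeholders, not this repository's actual definitions), a KFP v1 pipeline is compiled and submitted like this:

```python
# Minimal sketch, assuming the KFP v1 SDK; all names here are illustrative.
import kfp
from kfp import dsl, compiler

@dsl.pipeline(name="merlin-demo", description="Illustrative placeholder pipeline")
def demo_pipeline():
    # In the real pipeline, each stage (data extraction, preprocessing and
    # training, validation, inference deployment, monitoring) is a container
    # op built from the images pushed by the build_*.sh scripts.
    dsl.ContainerOp(name="hello", image="alpine", command=["echo", "hello"])

# Compile the pipeline function into a package that KFP can run.
compiler.Compiler().compile(demo_pipeline, "pipeline.tar.gz")

# Submit a run to an existing Kubeflow Pipelines endpoint.
client = kfp.Client(host="http://<your-kfp-endpoint>")
client.create_run_from_pipeline_package("pipeline.tar.gz", arguments={})
```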
5 | 6 | The figure below shows the architecture of a recommendation system example using NVIDIA Merlin on a [Kubeflow pipeline](https://www.kubeflow.org/docs/components/pipelines/overview/pipelines-overview/). 7 | 8 | This example demonstrates an end-to-end reference architecture, from data preparation to model deployment, with features such as continuous and fast re-training, autoscaling, and model monitoring. 9 | 10 | ![Merlin-Kubeflow Architecture](images/merlin-kubeflow-arch.png) 11 | 12 | For this example, we use the [Criteo 1TB Click Logs](https://ailab.criteo.com/download-criteo-1tb-click-logs-dataset/) dataset, a large publicly available dataset for recommender systems. It contains feature values and click feedback for millions of display ads, and is divided into 24 files, each corresponding to one day of data. 13 | 14 | ## Running the example 15 | Please follow the User Guide available [here](https://docs.google.com/document/d/1P_BerGSP5CNzGjGbRqgMrPcNaCmQuKUyodFaG0jlu9I/edit?usp=sharing)! 16 | 17 | -------------------------------------------------------------------------------- /build_copy_container.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | 19 | PROJECT_ID=${1:-"dl-tme"} # Google Cloud project ID 20 | GCLOUD_KEY=${2:-"gcloud_key.json"} # Path to Google Cloud key 21 | 22 | image_name=gcr.io/$PROJECT_ID/google-nvidia-cloud-sdk # Specify the image name here 23 | image_tag=0.5.1 24 | 25 | full_image_name=${image_name}:${image_tag} 26 | 27 | docker build --build-arg gcloud_key=$GCLOUD_KEY -f Dockerfile.copy -t $full_image_name . 28 | 29 | printf "\n\nPushing the container on GCR..." 30 | docker push $full_image_name 31 | 32 | printf "\n\n\n\n<<< Unique ID of the container is below. Use this ID in the pipeline component >>>\n\n" 33 | docker inspect --format="{{index .RepoDigests 0}}" "${full_image_name}" 34 | printf "\n\n------------------------------------------------------------------------------------------\n\n" 35 | -------------------------------------------------------------------------------- /build_monitoring_component.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | 19 | PROJECT_ID=${1:-"dl-tme"} # Google Cloud project ID 20 | GCLOUD_KEY=${2:-"gcloud_key.json"} # Path to Google Cloud key 21 | 22 | image_name=gcr.io/$PROJECT_ID/monitoring # Specify the image name here 23 | image_tag=0.5.1 24 | 25 | full_image_name=${image_name}:${image_tag} 26 | 27 | docker build --build-arg gcloud_key=$GCLOUD_KEY --build-arg project_id=$PROJECT_ID -f Dockerfile.monitoring -t $full_image_name . 28 | 29 | printf "\n\nPushing the container on GCR..." 30 | docker push $full_image_name 31 | 32 | printf "\n\n\n\n<<< Unique ID of the container is below. Use this ID in the pipeline component >>>\n\n" 33 | docker inspect --format="{{index .RepoDigests 0}}" "${full_image_name}" 34 | printf "\n\n------------------------------------------------------------------------------------------\n\n" -------------------------------------------------------------------------------- /build_training_container.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | 19 | PROJECT_ID=${1:-"dl-tme"} # Google Cloud project ID 20 | GCLOUD_KEY=${2:-"gcloud_key.json"} # Path to Google Cloud key 21 | 22 | image_name=gcr.io/$PROJECT_ID/merlin/merlin-training # Specify the image name here 23 | image_tag=0.5.1 24 | 25 | full_image_name=${image_name}:${image_tag} 26 | 27 | docker build --build-arg gcloud_key=$GCLOUD_KEY -f Dockerfile.train -t $full_image_name . 28 | 29 | printf "\n\nPushing the container on GCR..." 30 | docker push $full_image_name 31 | 32 | printf "\n\n\n\n<<< Unique ID of the container is below. Use this ID in the pipeline component >>>\n\n" 33 | docker inspect --format="{{index .RepoDigests 0}}" "${full_image_name}" 34 | printf "\n\n------------------------------------------------------------------------------------------\n\n" 35 | 36 | -------------------------------------------------------------------------------- /build_validation_component.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | 19 | PROJECT_ID=${1:-"dl-tme"} # Google Cloud project ID 20 | GCLOUD_KEY=${2:-"gcloud_key.json"} # Path to Google Cloud key 21 | 22 | image_name=gcr.io/$PROJECT_ID/validation # Specify the image name here 23 | image_tag=0.5.1 24 | 25 | full_image_name=${image_name}:${image_tag} 26 | 27 | # docker build --build-arg gcloud_key=$GCLOUD_KEY --build-arg project_id=$PROJECT_ID --no-cache -f Dockerfile.validation -t $full_image_name . 28 | docker build --build-arg gcloud_key=$GCLOUD_KEY --build-arg project_id=$PROJECT_ID -f Dockerfile.validation -t $full_image_name . 29 | 30 | printf "\n\nPushing the container on GCR..." 31 | docker push $full_image_name 32 | 33 | printf "\n\n\n\n<<< Unique ID of the container is below. Use this ID in the pipeline component >>>\n\n" 34 | docker inspect --format="{{index .RepoDigests 0}}" "${full_image_name}" 35 | printf "\n\n------------------------------------------------------------------------------------------\n\n" -------------------------------------------------------------------------------- /data-extraction/run_copy_merlin.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | data_input_path=$1; 19 | data_local=$2; 20 | project_id=$3; 21 | new_data_path=$4; 22 | cluster=$5; 23 | zone=$6; 24 | 25 | gcloud auth activate-service-account --key-file=/script/gcloud_key.json 26 | gcloud container clusters get-credentials $cluster --zone $zone --project $project_id 27 | gcloud config set project $project_id 28 | 29 | triton_status=$(helm status triton 2>&1) 30 | if [[ "$triton_status" == "Error: release: not found" ]]; then 31 | if [ -d "$data_local" ]; then 32 | ### Take action if $data_local exists ### 33 | echo "Running first time..." 34 | echo "Directory $data_local exists. Copying files from GCS" 35 | gsutil list gs:// 36 | if ! [ -d "$data_local/criteo-data" ]; then 37 | echo "Making criteo-data" 38 | mkdir -p $data_local/criteo-data/crit_int_pq 39 | fi 40 | echo "Copying data..."
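# NOTE (assumption): $data_input_path is expected to be a GCS URI
# (e.g. gs://<bucket>/crit_int_pq) holding the converted Criteo
# day_*.parquet files; they are copied below onto the persistent volume at
# $data_local/criteo-data/crit_int_pq so downstream pipeline steps can
# read them locally.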
41 | gsutil cp -r $data_input_path $data_local/criteo-data/crit_int_pq 42 | echo "Copying done" 43 | 44 | for entry in "$data_local/criteo-data/crit_int_pq"/* 45 | do 46 | echo "$entry" 47 | done 48 | 49 | else 50 | ### Control will jump here if $data_local does NOT exist ### 51 | echo "Error: $data_local not found. Cannot continue." 52 | exit 1 53 | fi 54 | echo "copying done" 55 | else 56 | if [ -d "$data_local" ]; then 57 | ### Take action if $data_local exists ### 58 | echo "Recurrent run..." 59 | echo "Directory $data_local exists. Copying files from GCS" 60 | # gsutil list gs:// 61 | if ! [ -d "$data_local/criteo-data/new_data" ]; then 62 | echo "Making new_data" 63 | mkdir -p $data_local/criteo-data/new_data 64 | fi 65 | echo "Copying data..." 66 | gsutil cp -r $new_data_path $data_local/criteo-data/new_data 67 | echo "Copying done" 68 | 69 | for entry in "$data_local/criteo-data/new_data"/* 70 | do 71 | echo "$entry" 72 | done 73 | else 74 | ### Control will jump here if $data_local does NOT exist ### 75 | echo "Error: $data_local not found. Cannot continue." 76 | exit 1 77 | fi 78 | fi 79 | 80 | -------------------------------------------------------------------------------- /images/merlin-kubeflow-arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-Merlin/gcp-ml-ops/5fed2a218b605854fe7147576891e91871698359/images/merlin-kubeflow-arch.png -------------------------------------------------------------------------------- /inference/criteo-inference-client.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | import numpy as np 17 | import os 18 | import argparse 19 | import sys 20 | import warnings 21 | 22 | 23 | import tritonclient.http as httpclient 24 | import tritonclient.grpc as grpcclient 25 | from tritonclient.utils import * 26 | import cudf 27 | 28 | from sklearn import metrics 29 | 30 | 31 | if __name__ == "__main__": 32 | parser = argparse.ArgumentParser() 33 | 34 | parser.add_argument('-u', 35 | '--triton_grpc_url', 36 | type=str, 37 | required=False, 38 | default='localhost:8001', 39 | help='URL to Triton gRPC Endpoint') 40 | 41 | parser.add_argument('-m', 42 | '--model_name', 43 | type=str, 44 | required=False, 45 | default='dcn_ens', 46 | help='Name of the model ensemble to load') 47 | 48 | parser.add_argument('-d', 49 | '--test_data', 50 | type=str, 51 | required=False, 52 | default='/crit_int_pq/day_23.parquet', 53 | help='Path to a test .parquet file') 54 | 55 | parser.add_argument('-b', 56 | '--batch_size', 57 | type=int, 58 | required=False, 59 | default=64, 60 | help='Batch size. Max is 64 at the moment, but a larger maximum can be specified when creating the model and the ensemble.') 61 | 62 | parser.add_argument('-n', 63 | '--n_batches', 64 | type=int, 65 | required=False, 66 | default=1, 67 | help='Number of batches of data to send') 68 | 69 | parser.add_argument('-v', 70 | '--verbose', 71 | type=lambda s: str(s).lower() in ('true', '1', 'yes'), # plain type=bool would treat any non-empty string, including 'False', as True 72 | required=False, 73 | default=False, 74 | help='Verbosity, True or False') 75 | 76 | 77 | args = parser.parse_args() 78 | 79 | # warnings can be disabled 80 | if not sys.warnoptions: 81 | warnings.simplefilter("ignore") 82 | 83 | try: 84 | triton_client = grpcclient.InferenceServerClient(url=args.triton_grpc_url, verbose=args.verbose) 85 | print("Triton client created.") 86 | except Exception as e: 87 | print("channel creation failed: " + str(e)) 88 | sys.exit(1) # exit non-zero so the pipeline can detect the failure 89 | 90 | 91 | if not triton_client.is_model_ready(args.model_name): 92 | print(f"Model {args.model_name} is not ready!") 93 | sys.exit(1) 94 | else: 95 | print(f"Model {args.model_name} is ready!") 96 | 97 | 98 | 99 | 100 | # Load the dataset 101 | CATEGORICAL_COLUMNS = ['C' + str(x) for x in range(1,27)] 102 | CONTINUOUS_COLUMNS = ['I' + str(x) for x in range(1,14)] 103 | LABEL_COLUMNS = ['label'] 104 | col_names = CATEGORICAL_COLUMNS + CONTINUOUS_COLUMNS 105 | col_dtypes = [np.int32]*26 + [np.int64]*13 106 | 107 | 108 | 109 | print("Reading dataset..") 110 | batch_whole = cudf.read_parquet(args.test_data, num_rows=args.batch_size*args.n_batches) 111 | batch_features = batch_whole[col_names] 112 | batch_labels = batch_whole[LABEL_COLUMNS] 113 | 114 | 115 | 116 | results=[] 117 | 118 |
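    # Note (assumption about the deployed NVTabular/HugeCTR ensemble): each
    # feature is sent as its own named input tensor of shape (batch_size, 1),
    # with categorical columns C1..C26 as int32 and continuous columns
    # I1..I13 as int64, matching col_names/col_dtypes above. The loop below
    # builds one InferInput per column and requests the ensemble's "OUTPUT0"
    # predictions.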
119 | with grpcclient.InferenceServerClient(url=args.triton_grpc_url) as client: 120 | for batch in range(args.n_batches): 121 | print(f"Requesting inference for batch {batch}..") 122 | start_idx=batch*args.batch_size 123 | end_idx=(batch+1)*(args.batch_size) 124 | # convert the batch to triton inputs 125 | columns = [(col, batch_features[col][start_idx:end_idx]) for col in col_names] 126 | inputs = [] 127 | 128 | for i, (name, col) in enumerate(columns): 129 | d = col.values_host.astype(col_dtypes[i]) 130 | d = d.reshape(len(d), 1) 131 | inputs.append(grpcclient.InferInput(name, d.shape, np_to_triton_dtype(col_dtypes[i]))) 132 | inputs[i].set_data_from_numpy(d) 133 | 134 | outputs = [] 135 | outputs.append(grpcclient.InferRequestedOutput("OUTPUT0")) 136 | 137 | response = client.infer(args.model_name, inputs, request_id=str(1), outputs=outputs) 138 | 139 | results.extend(response.as_numpy("OUTPUT0")) 140 | 141 | 142 | print(f"ROC AUC Score: {metrics.roc_auc_score(batch_labels[LABEL_COLUMNS].values.tolist(), results)}") 143 | 144 | 145 | 146 | -------------------------------------------------------------------------------- /inference/load-triton-ensemble.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | 17 | 18 | import sys 19 | import argparse 20 | import logging 21 | 22 | import tritonclient.grpc as grpcclient 23 | from tritonclient.utils import InferenceServerException 24 | 25 | 26 | if __name__ == "__main__": 27 | parser = argparse.ArgumentParser() 28 | 29 | parser.add_argument('-u', 30 | '--triton_grpc_url', 31 | type=str, 32 | required=False, 33 | default='localhost:8001', 34 | help='URL to Triton gRPC Endpoint') 35 | 36 | parser.add_argument('-m', 37 | '--model_name', 38 | type=str, 39 | required=False, 40 | default='dcn_ens', 41 | help='Name of the model ensemble to load') 42 | 43 | parser.add_argument('-v', 44 | '--verbose', 45 | type=lambda s: str(s).lower() in ('true', '1', 'yes'), # plain type=bool would treat any non-empty string, including 'False', as True 46 | required=False, 47 | default=True, 48 | help='Verbosity, True or False') 49 | 50 | 51 | args = parser.parse_args() 52 | 53 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO, datefmt='%d-%m-%y %H:%M:%S') 54 | logging.info(f"Args: {args}") 55 | 56 | try: 57 | triton_client = grpcclient.InferenceServerClient(url=args.triton_grpc_url, verbose=args.verbose) 58 | logging.info("Triton client created.") 59 | except Exception as e: 60 | logging.error(f"channel creation failed: {str(e)}") 61 | sys.exit(1) # exit non-zero so the caller can detect the failure 62 | 63 | 64 | # Health 65 | if not triton_client.is_server_live(headers={'test': '1', 'dummy': '2'}): 66 | logging.error("FAILED : is_server_live") 67 | sys.exit(1) 68 | 69 | if not triton_client.is_server_ready(): 70 | logging.error("FAILED : is_server_ready") 71 | sys.exit(1) 72 | 73 | logging.info(f"Models available: {triton_client.get_model_repository_index()}") 74 |
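    # Note: run_triton.sh starts tritonserver with --model-control-mode=poll,
    # so models in the repository are loaded automatically. Depending on the
    # Triton version, the explicit load_model() call below may be rejected
    # while polling is enabled (assumption based on Triton's model-control
    # documentation), which is why the is_model_ready() check afterwards is
    # the deciding success test.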
    75 | # Load the ensemble model 76 | # TODO: Increase the timeout. Sometimes this times out with 8xGPUs because loading 77 | # the model takes longer. 78 | try: 79 | triton_client.load_model(model_name=args.model_name) 80 | except InferenceServerException as e: 81 | if "failed to load" in e.message(): 82 | logging.error(f"Model {args.model_name} failed to load!") 83 | 84 | if not triton_client.is_model_ready(args.model_name): 85 | logging.error(f"Model {args.model_name} is not ready!") 86 | sys.exit(1) 87 | else: 88 | logging.info(f"Model {args.model_name} is ready!") 89 | -------------------------------------------------------------------------------- /inference/run_merlin_inference.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License.
16 | # ============================================================================== 17 | 18 | PV_LOC=${1:-"/var/lib/data"} 19 | PROJECT_ID=${2:-"dl-tme"} 20 | GCLOUD_KEY=${3:-"/script/gcloud_key.json"} 21 | CLUSTER=${4:-"merlin-mlops"} 22 | ZONE=${5:-"us-central1-a"} 23 | 24 | gcloud auth activate-service-account --key-file=$GCLOUD_KEY 25 | gcloud container clusters get-credentials $CLUSTER --zone $ZONE --project $PROJECT_ID 26 | gcloud config set project $PROJECT_ID 27 | 28 | if ! [ -d $PV_LOC/inference ]; then 29 | mkdir $PV_LOC/inference 30 | fi 31 | 32 | triton_status=$(helm status triton 2>&1) 33 | echo "Triton status: " 34 | echo $triton_status 35 | if [[ "$triton_status" == "Error: release: not found" ]]; then 36 | cp /script/load-triton-ensemble.py $PV_LOC/inference/load-triton-ensemble.py 37 | cp /script/triton/run_triton.sh $PV_LOC/inference/run_triton.sh 38 | 39 | # helm install triton /script/triton/ --set image.repository=gcr.io/$PROJECT_ID/merlin/merlin-inference:v0.5 40 | helm install triton /script/triton/ --set image.repository=gcr.io/$PROJECT_ID/merlin/merlin-inference:0.5.1 41 | else 42 | echo "Triton running already, not deploying another instance." 43 | fi 44 | -------------------------------------------------------------------------------- /inference/triton/Chart.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
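# Usage note (illustrative): inference/run_merlin_inference.sh installs this
# chart with the Merlin inference image, roughly:
#   helm install triton /script/triton/ \
#     --set image.repository=gcr.io/<project-id>/merlin/merlin-inference:0.5.1
# The repository value and tag shown here mirror that script and may differ
# in your deployment.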
26 | 27 | apiVersion: v1 28 | appVersion: "2.0" 29 | description: Triton Inference Server 30 | name: triton-inference-server 31 | version: 1.0.0 -------------------------------------------------------------------------------- /inference/triton/README.md: -------------------------------------------------------------------------------- 1 | 28 | [![License](https://img.shields.io/badge/License-BSD3-lightgrey.svg)](https://opensource.org/licenses/BSD-3-Clause) 29 | 30 | # Kubernetes Deploy: Triton Inference Server Cluster 31 | 32 | **NOTE: The prometheus operator used in these instructions is not yet 33 | updated to work with 1.16.x versions of Google Kubernetes Engine 34 | (GKE). You must use a GKE 1.15.x version to avoid this issue.** 35 | 36 | A helm chart for installing a single cluster of Triton Inference 37 | Server is provided. By default the cluster contains a single instance 38 | of the inference server but the *replicaCount* configuration parameter 39 | can be set to create a cluster of any size, as described below. 40 | 41 | This guide assumes you already have a functional Kubernetes cluster 42 | and helm installed (see below for instructions on installing 43 | helm). Note the following requirements: 44 | 45 | * The helm chart deploys Prometheus and Grafana to collect and display 46 | Triton metrics. Your cluster must contain sufficient CPU resources to 47 | support these services. At a minimum you will likely require 2 CPU 48 | nodes with machine type of n1-standard-2 or greater. 49 | 50 | * If you want Triton Server to use GPUs for inferencing, your cluster 51 | must be configured to contain the desired number of GPU nodes with 52 | support for the NVIDIA driver and CUDA version required by the version 53 | of the inference server you are using. 54 | 55 | This helm chart is available from [Triton Inference Server 56 | GitHub](https://github.com/triton-inference-server/server) or from the 57 | [NVIDIA GPU Cloud (NGC)](https://ngc.nvidia.com). 58 | 59 | The steps below describe how to set up a model repository, use helm to 60 | launch the inference server, and then send inference requests to the 61 | running server. You can access a Grafana endpoint to see real-time 62 | metrics reported by the inference server. 63 | 64 | ## Installing Helm 65 | 66 | If you do not already have Helm installed in your Kubernetes cluster, 67 | executing the following steps from the [official helm install 68 | guide](https://helm.sh/docs/intro/install/) will 69 | give you a quick setup. 70 | 71 | ``` 72 | $ curl https://raw.githubusercontent.com/helm/helm/master/scripts/get | bash 73 | $ kubectl create serviceaccount -n kube-system tiller 74 | serviceaccount/tiller created 75 | $ kubectl create clusterrolebinding tiller-cluster-rule --clusterrole=cluster-admin --serviceaccount=kube-system:tiller 76 | $ helm init --service-account tiller --wait 77 | ``` 78 | 79 | ## Model Repository 80 | 81 | If you already have a model repository you may use that with this helm 82 | chart. If you do not have a model repository, you can check out a local 83 | copy of the inference server source repository to create an example 84 | model repository: 85 | 86 | ``` 87 | $ git clone https://github.com/triton-inference-server/server.git 88 | ``` 89 | 90 | Triton Server needs a repository of models that it will make available 91 | for inferencing. For this example you will place the model repository 92 | in a Google Cloud Storage bucket.
94 | 95 | ``` 96 | $ gsutil mb gs://triton-inference-server-repository 97 | ``` 98 | 99 | Following the [QuickStart](../../docs/quickstart.md) download the 100 | example model repository to your system and copy it into the GCS 101 | bucket. 102 | 103 | ``` 104 | $ gsutil cp -r docs/examples/model_repository gs://triton-inference-server-repository/model_repository 105 | ``` 106 | 107 | ### GCS Permissions 108 | 109 | Make sure the bucket permissions are set so that the inference server 110 | can access the model repository. If the bucket is public then no 111 | additional changes are needed and you can proceed to the "Deploy 112 | Prometheus and Grafana" section. 113 | 114 | If bucket permissions need to be set with the 115 | GOOGLE_APPLICATION_CREDENTIALS environment variable then perform the 116 | following steps: 117 | 118 | * Generate Google service account JSON with proper permissions called 119 | *gcp-creds.json*. 120 | 121 | * Create a Kubernetes secret from *gcp-creds.json*: 122 | 123 | ``` 124 | $ kubectl create configmap gcpcreds --from-literal "project-id=myproject" 125 | $ kubectl create secret generic gcpcreds --from-file gcp-creds.json 126 | ``` 127 | 128 | * Modify templates/deployment.yaml to include the 129 | GOOGLE_APPLICATION_CREDENTIALS environment variable: 130 | 131 | ``` 132 | env: 133 | - name: GOOGLE_APPLICATION_CREDENTIALS 134 | value: /secret/gcp-creds.json 135 | ``` 136 | 137 | * Modify templates/deployment.yaml to mount the secret in a volume at 138 | /secret: 139 | 140 | ``` 141 | volumeMounts: 142 | - name: vsecret 143 | mountPath: "/secret" 144 | readOnly: true 145 | ... 146 | volumes: 147 | - name: vsecret 148 | secret: 149 | secretName: gcpcreds 150 | ``` 151 | 152 | ## Deploy Prometheus and Grafana 153 | 154 | The inference server metrics are collected by Prometheus and viewable 155 | by Grafana. The inference server helm chart assumes that Prometheus 156 | and Grafana are available so this step must be followed even if you 157 | don't want to use Grafana. 158 | 159 | Use the prometheus-operator to install these components. The 160 | *serviceMonitorSelectorNilUsesHelmValues* flag is needed so that 161 | Prometheus can find the inference server metrics in the *example* 162 | release deployed below. 163 | 164 | ``` 165 | $ helm install --name example-metrics --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false stable/prometheus-operator 166 | ``` 167 | 168 | Then port-forward to the Grafana service so you can access it from 169 | your local browser. 170 | 171 | ``` 172 | $ kubectl port-forward service/example-metrics-grafana 8080:80 173 | ``` 174 | 175 | Now you should be able to navigate in your browser to localhost:8080 176 | and see the Grafana login page. Use username=admin and 177 | password=prom-operator to log in. 178 | 179 | An example Grafana dashboard is available in dashboard.json. Use the 180 | import function in Grafana to import and view this dashboard. 181 | 182 | ## Deploy the Inference Server 183 | 184 | Deploy the inference server using the default configuration with the 185 | following commands. 186 | 187 | ``` 188 | $ cd <directory containing Chart.yaml> 189 | $ helm install --name example . 190 | ``` 191 | 192 | Use kubectl to see status and wait until the inference server pods are 193 | running.
194 | 195 | ``` 196 | $ kubectl get pods 197 | NAME READY STATUS RESTARTS AGE 198 | example-triton-inference-server-5f74b55885-n6lt7 1/1 Running 0 2m21s 199 | ``` 200 | 201 | There are several ways of overriding the default configuration as 202 | described in this [helm 203 | documentation](https://helm.sh/docs/using_helm/#customizing-the-chart-before-installing). 204 | 205 | You can edit the values.yaml file directly or you can use the *--set* 206 | option to override a single parameter with the CLI. For example, to 207 | deploy a cluster of four inference servers use *--set* to set the 208 | replicaCount parameter. 209 | 210 | ``` 211 | $ helm install --name example --set replicaCount=4 . 212 | ``` 213 | 214 | You can also write your own "config.yaml" file with the values you 215 | want to override and pass it to helm. 216 | 217 | ``` 218 | $ cat << EOF > config.yaml 219 | namespace: MyCustomNamespace 220 | image: 221 | imageName: nvcr.io/nvidia/tritonserver:custom-tag 222 | modelRepositoryPath: gs://my_model_repository 223 | EOF 224 | $ helm install --name example -f config.yaml . 225 | ``` 226 | 227 | ## Using Triton Inference Server 228 | 229 | Now that the inference server is running you can send HTTP or GRPC 230 | requests to it to perform inferencing. By default, the inferencing 231 | service is exposed with a LoadBalancer service type. Use the following 232 | to find the external IP for the inference server. In this case it is 233 | 34.83.9.133. 234 | 235 | ``` 236 | $ kubectl get services 237 | NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE 238 | ... 239 | example-triton-inference-server LoadBalancer 10.18.13.28 34.83.9.133 8000:30249/TCP,8001:30068/TCP,8002:32723/TCP 47m 240 | ``` 241 | 242 | The inference server exposes an HTTP endpoint on port 8000, and GRPC 243 | endpoint on port 8001 and a Prometheus metrics endpoint on 244 | port 8002. You can use curl to get the meta-data of the inference server 245 | from the HTTP endpoint. 246 | 247 | ``` 248 | $ curl 34.83.9.133:8000/v2 249 | ``` 250 | 251 | Follow the [QuickStart](../../docs/quickstart.md) to get the example 252 | image classification client that can be used to perform inferencing 253 | using image classification models being served by the inference 254 | server. For example, 255 | 256 | ``` 257 | $ image_client -u 34.83.9.133:8000 -m inception_graphdef -s INCEPTION -c3 mug.jpg 258 | Request 0, batch size 1 259 | Image 'images/mug.jpg': 260 | 504 (COFFEE MUG) = 0.723992 261 | 968 (CUP) = 0.270953 262 | 967 (ESPRESSO) = 0.00115997 263 | ``` 264 | 265 | ## Cleanup 266 | 267 | Once you've finished using the inference server you should use helm to 268 | delete the deployment. 
269 | 270 | ``` 271 | $ helm list 272 | NAME REVISION UPDATED STATUS CHART APP VERSION NAMESPACE 273 | example 1 Wed Feb 27 22:16:55 2019 DEPLOYED triton-inference-server-1.0.0 1.0 default 274 | example-metrics 1 Tue Jan 21 12:24:07 2020 DEPLOYED prometheus-operator-6.18.0 0.32.0 default 275 | 276 | $ helm delete --purge example 277 | $ helm delete --purge example-metrics 278 | ``` 279 | 280 | For the Prometheus and Grafana services you should [explicitly delete 281 | CRDs](https://github.com/helm/charts/tree/master/stable/prometheus-operator#uninstalling-the-chart): 282 | 283 | ``` 284 | $ kubectl delete crd alertmanagers.monitoring.coreos.com servicemonitors.monitoring.coreos.com podmonitors.monitoring.coreos.com prometheuses.monitoring.coreos.com prometheusrules.monitoring.coreos.com 285 | ``` 286 | 287 | You may also want to delete the GCS bucket you created to hold the 288 | model repository. 289 | 290 | ``` 291 | $ gsutil rm -r gs://triton-inference-server-repository 292 | ``` 293 | -------------------------------------------------------------------------------- /inference/triton/dashboard.json: -------------------------------------------------------------------------------- 1 | { 2 | "__inputs": [ 3 | { 4 | "name": "DS_PROMETHEUS", 5 | "label": "Prometheus", 6 | "description": "", 7 | "type": "datasource", 8 | "pluginId": "prometheus", 9 | "pluginName": "Prometheus" 10 | } 11 | ], 12 | "__requires": [ 13 | { 14 | "type": "grafana", 15 | "id": "grafana", 16 | "name": "Grafana", 17 | "version": "6.3.5" 18 | }, 19 | { 20 | "type": "panel", 21 | "id": "graph", 22 | "name": "Graph", 23 | "version": "" 24 | }, 25 | { 26 | "type": "panel", 27 | "id": "heatmap", 28 | "name": "Heatmap", 29 | "version": "" 30 | }, 31 | { 32 | "type": "datasource", 33 | "id": "prometheus", 34 | "name": "Prometheus", 35 | "version": "1.0.0" 36 | } 37 | ], 38 | "annotations": { 39 | "list": [ 40 | { 41 | "builtIn": 1, 42 | "datasource": "-- Grafana --", 43 | "enable": true, 44 | "hide": true, 45 | "iconColor": "rgba(0, 211, 255, 1)", 46 | "name": "Annotations & Alerts", 47 | "type": "dashboard" 48 | } 49 | ] 50 | }, 51 | "editable": true, 52 | "gnetId": null, 53 | "graphTooltip": 0, 54 | "id": null, 55 | "links": [], 56 | "panels": [ 57 | { 58 | "aliasColors": {}, 59 | "bars": false, 60 | "dashLength": 10, 61 | "dashes": false, 62 | "datasource": "${DS_PROMETHEUS}", 63 | "fill": 1, 64 | "fillGradient": 0, 65 | "gridPos": { 66 | "h": 9, 67 | "w": 12, 68 | "x": 0, 69 | "y": 0 70 | }, 71 | "id": 2, 72 | "legend": { 73 | "avg": false, 74 | "current": false, 75 | "max": false, 76 | "min": false, 77 | "show": true, 78 | "total": false, 79 | "values": false 80 | }, 81 | "lines": true, 82 | "linewidth": 1, 83 | "nullPointMode": "null", 84 | "options": { 85 | "dataLinks": [] 86 | }, 87 | "percentage": false, 88 | "pointradius": 2, 89 | "points": false, 90 | "renderer": "flot", 91 | "seriesOverrides": [], 92 | "spaceLength": 10, 93 | "stack": false, 94 | "steppedLine": false, 95 | "targets": [ 96 | { 97 | "expr": "nv_inference_request_success", 98 | "legendFormat": "Success {{instance}}", 99 | "refId": "A" 100 | }, 101 | { 102 | "expr": "nv_inference_request_failure", 103 | "legendFormat": "Failure {{instance}}", 104 | "refId": "B" 105 | } 106 | ], 107 | "thresholds": [], 108 | "timeFrom": null, 109 | "timeRegions": [], 110 | "timeShift": null, 111 | "title": "Cumulative Inference Requests", 112 | "tooltip": { 113 | "shared": true, 114 | "sort": 0, 115 | "value_type": "individual" 116 | }, 117 | "type": "graph", 118 | 
"xaxis": { 119 | "buckets": null, 120 | "mode": "time", 121 | "name": null, 122 | "show": true, 123 | "values": [] 124 | }, 125 | "yaxes": [ 126 | { 127 | "format": "short", 128 | "label": null, 129 | "logBase": 1, 130 | "max": null, 131 | "min": null, 132 | "show": true 133 | }, 134 | { 135 | "format": "short", 136 | "label": null, 137 | "logBase": 1, 138 | "max": null, 139 | "min": null, 140 | "show": false 141 | } 142 | ], 143 | "yaxis": { 144 | "align": false, 145 | "alignLevel": null 146 | } 147 | }, 148 | { 149 | "cards": { 150 | "cardPadding": null, 151 | "cardRound": null 152 | }, 153 | "color": { 154 | "cardColor": "#b4ff00", 155 | "colorScale": "sqrt", 156 | "colorScheme": "interpolateReds", 157 | "exponent": 0.5, 158 | "mode": "spectrum" 159 | }, 160 | "dataFormat": "timeseries", 161 | "gridPos": { 162 | "h": 9, 163 | "w": 12, 164 | "x": 12, 165 | "y": 0 166 | }, 167 | "heatmap": {}, 168 | "hideZeroBuckets": false, 169 | "highlightCards": true, 170 | "id": 7, 171 | "legend": { 172 | "show": false 173 | }, 174 | "options": {}, 175 | "reverseYBuckets": false, 176 | "targets": [ 177 | { 178 | "expr": "sum(increase(nv_inference_load_ratio_bucket[1m])) by (le)", 179 | "legendFormat": "", 180 | "refId": "A" 181 | } 182 | ], 183 | "timeFrom": null, 184 | "timeShift": null, 185 | "title": "Load Ratio (Total Time / Compute Time)", 186 | "tooltip": { 187 | "show": true, 188 | "showHistogram": false 189 | }, 190 | "type": "heatmap", 191 | "xAxis": { 192 | "show": true 193 | }, 194 | "xBucketNumber": null, 195 | "xBucketSize": null, 196 | "yAxis": { 197 | "decimals": null, 198 | "format": "short", 199 | "logBase": 1, 200 | "max": null, 201 | "min": null, 202 | "show": true, 203 | "splitFactor": null 204 | }, 205 | "yBucketBound": "auto", 206 | "yBucketNumber": null, 207 | "yBucketSize": null 208 | }, 209 | { 210 | "aliasColors": {}, 211 | "bars": false, 212 | "dashLength": 10, 213 | "dashes": false, 214 | "datasource": "${DS_PROMETHEUS}", 215 | "fill": 1, 216 | "fillGradient": 0, 217 | "gridPos": { 218 | "h": 8, 219 | "w": 12, 220 | "x": 0, 221 | "y": 9 222 | }, 223 | "id": 4, 224 | "legend": { 225 | "avg": false, 226 | "current": false, 227 | "max": false, 228 | "min": false, 229 | "show": true, 230 | "total": false, 231 | "values": false 232 | }, 233 | "lines": true, 234 | "linewidth": 1, 235 | "nullPointMode": "null", 236 | "options": { 237 | "dataLinks": [] 238 | }, 239 | "percentage": false, 240 | "pointradius": 2, 241 | "points": false, 242 | "renderer": "flot", 243 | "seriesOverrides": [], 244 | "spaceLength": 10, 245 | "stack": false, 246 | "steppedLine": false, 247 | "targets": [ 248 | { 249 | "expr": "rate(nv_inference_queue_duration_us[30s]) / 1000", 250 | "legendFormat": "{{instance}}", 251 | "refId": "A" 252 | } 253 | ], 254 | "thresholds": [], 255 | "timeFrom": null, 256 | "timeRegions": [], 257 | "timeShift": null, 258 | "title": "Queue Time (milliseconds)", 259 | "tooltip": { 260 | "shared": true, 261 | "sort": 0, 262 | "value_type": "individual" 263 | }, 264 | "type": "graph", 265 | "xaxis": { 266 | "buckets": null, 267 | "mode": "time", 268 | "name": null, 269 | "show": true, 270 | "values": [] 271 | }, 272 | "yaxes": [ 273 | { 274 | "format": "short", 275 | "label": "Queue Time (ms)", 276 | "logBase": 1, 277 | "max": null, 278 | "min": null, 279 | "show": true 280 | }, 281 | { 282 | "format": "short", 283 | "label": null, 284 | "logBase": 1, 285 | "max": null, 286 | "min": null, 287 | "show": false 288 | } 289 | ], 290 | "yaxis": { 291 | "align": false, 292 | 
"alignLevel": null 293 | } 294 | }, 295 | { 296 | "aliasColors": {}, 297 | "bars": false, 298 | "dashLength": 10, 299 | "dashes": false, 300 | "datasource": "${DS_PROMETHEUS}", 301 | "fill": 1, 302 | "fillGradient": 0, 303 | "gridPos": { 304 | "h": 8, 305 | "w": 12, 306 | "x": 12, 307 | "y": 9 308 | }, 309 | "id": 5, 310 | "legend": { 311 | "avg": false, 312 | "current": false, 313 | "max": false, 314 | "min": false, 315 | "show": true, 316 | "total": false, 317 | "values": false 318 | }, 319 | "lines": true, 320 | "linewidth": 1, 321 | "nullPointMode": "null", 322 | "options": { 323 | "dataLinks": [] 324 | }, 325 | "percentage": false, 326 | "pointradius": 2, 327 | "points": false, 328 | "renderer": "flot", 329 | "seriesOverrides": [], 330 | "spaceLength": 10, 331 | "stack": false, 332 | "steppedLine": false, 333 | "targets": [ 334 | { 335 | "expr": "rate(nv_inference_compute_duration_us[30s]) / 1000", 336 | "legendFormat": "{{instance}}", 337 | "refId": "A" 338 | } 339 | ], 340 | "thresholds": [], 341 | "timeFrom": null, 342 | "timeRegions": [], 343 | "timeShift": null, 344 | "title": "Compute Time (milliseconds)", 345 | "tooltip": { 346 | "shared": true, 347 | "sort": 0, 348 | "value_type": "individual" 349 | }, 350 | "type": "graph", 351 | "xaxis": { 352 | "buckets": null, 353 | "mode": "time", 354 | "name": null, 355 | "show": true, 356 | "values": [] 357 | }, 358 | "yaxes": [ 359 | { 360 | "format": "short", 361 | "label": "Compute Time (ms)", 362 | "logBase": 1, 363 | "max": null, 364 | "min": null, 365 | "show": true 366 | }, 367 | { 368 | "format": "short", 369 | "label": null, 370 | "logBase": 1, 371 | "max": null, 372 | "min": null, 373 | "show": false 374 | } 375 | ], 376 | "yaxis": { 377 | "align": false, 378 | "alignLevel": null 379 | } 380 | } 381 | ], 382 | "refresh": "5s", 383 | "schemaVersion": 19, 384 | "style": "dark", 385 | "tags": [], 386 | "templating": { 387 | "list": [] 388 | }, 389 | "time": { 390 | "from": "now-15m", 391 | "to": "now" 392 | }, 393 | "timepicker": { 394 | "refresh_intervals": [ 395 | "5s", 396 | "10s", 397 | "30s", 398 | "1m", 399 | "5m", 400 | "15m", 401 | "30m", 402 | "1h", 403 | "2h", 404 | "1d" 405 | ] 406 | }, 407 | "timezone": "", 408 | "title": "Triton Inference Server", 409 | "uid": "slEY4dsZk", 410 | "version": 8 411 | } 412 | -------------------------------------------------------------------------------- /inference/triton/run_triton.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # ============================================================================== 17 | 18 | MODELS_DIR=${1:-"/model/models"} 19 | 20 | set -m 21 | 22 | tritonserver --model-repository=$MODELS_DIR --backend-config=hugectr,dcn=$MODELS_DIR/dcn/1/dcn.json --backend-config=hugectr,supportlonglong=true --model-control-mode=poll --repository-poll-secs=10 & 23 | 24 | sleep 120 25 | 26 | echo "starting script" 27 | python3 /model/inference/load-triton-ensemble.py --triton_grpc_url localhost:8001 --model_name dcn_ens --verbose False 28 | 29 | fg %1 30 | -------------------------------------------------------------------------------- /inference/triton/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* 2 | # Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions 6 | # are met: 7 | # * Redistributions of source code must retain the above copyright 8 | # notice, this list of conditions and the following disclaimer. 9 | # * Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # * Neither the name of NVIDIA CORPORATION nor the names of its 13 | # contributors may be used to endorse or promote products derived 14 | # from this software without specific prior written permission. 15 | # 16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | */}} 28 | 29 | {{/* vim: set filetype=mustache: */}} 30 | {{/* 31 | Create inference server name. 32 | */}} 33 | {{- define "triton-inference-server.name" -}} 34 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} 35 | {{- end -}} 36 | 37 | {{/* 38 | Create a default fully qualified app name. 39 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 40 | If release name contains chart name it will be used as a full name. 41 | */}} 42 | {{- define "triton-inference-server.fullname" -}} 43 | {{- if .Values.fullnameOverride -}} 44 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} 45 | {{- else -}} 46 | {{- $name := default .Chart.Name .Values.nameOverride -}} 47 | {{- if contains $name .Release.Name -}} 48 | {{- .Release.Name | trunc 63 | trimSuffix "-" -}} 49 | {{- else -}} 50 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} 51 | {{- end -}} 52 | {{- end -}} 53 | {{- end -}} 54 | 55 | {{/* 56 | Create inference server metrics service name and fullname derived from above and 57 | truncated appropriately. 
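The base name is truncated to 55 characters before the "-metrics" suffix
(8 characters) is appended, and to 47 characters before "-metrics-monitor"
(16 characters), so the final names stay within the 63-character limit that
Kubernetes imposes on DNS-1123 names. For example, assuming a release named
"example", "triton-inference-server.fullname" yields
"example-triton-inference-server" and the metrics variant yields
"example-triton-inference-server-metrics".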
58 | */}} 59 | {{- define "triton-inference-server-metrics.name" -}} 60 | {{- $basename := include "triton-inference-server.name" . -}} 61 | {{- $basename_trimmed := $basename | trunc 55 | trimSuffix "-" -}} 62 | {{- printf "%s-%s" $basename_trimmed "metrics" -}} 63 | {{- end -}} 64 | 65 | {{- define "triton-inference-server-metrics.fullname" -}} 66 | {{- $basename := include "triton-inference-server.fullname" . -}} 67 | {{- $basename_trimmed := $basename | trunc 55 | trimSuffix "-" -}} 68 | {{- printf "%s-%s" $basename_trimmed "metrics" -}} 69 | {{- end -}} 70 | 71 | {{/* 72 | Create inference server metrics monitor name and fullname derived from 73 | above and truncated appropriately. 74 | */}} 75 | {{- define "triton-inference-server-metrics-monitor.name" -}} 76 | {{- $basename := include "triton-inference-server.name" . -}} 77 | {{- $basename_trimmed := $basename | trunc 47 | trimSuffix "-" -}} 78 | {{- printf "%s-%s" $basename_trimmed "metrics-monitor" -}} 79 | {{- end -}} 80 | 81 | {{- define "triton-inference-server-metrics-monitor.fullname" -}} 82 | {{- $basename := include "triton-inference-server.fullname" . -}} 83 | {{- $basename_trimmed := $basename | trunc 47 | trimSuffix "-" -}} 84 | {{- printf "%s-%s" $basename_trimmed "metrics-monitor" -}} 85 | {{- end -}} 86 | 87 | {{/* 88 | Create chart name and version as used by the chart label. 89 | */}} 90 | {{- define "triton-inference-server.chart" -}} 91 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} 92 | {{- end -}} 93 | -------------------------------------------------------------------------------- /inference/triton/templates/deployment.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | apiVersion: apps/v1 28 | kind: Deployment 29 | metadata: 30 | name: {{ template "triton-inference-server.fullname" . 
}} 31 | namespace: {{ .Release.Namespace }} 32 | labels: 33 | app: {{ template "triton-inference-server.name" . }} 34 | chart: {{ template "triton-inference-server.chart" . }} 35 | release: {{ .Release.Name }} 36 | heritage: {{ .Release.Service }} 37 | spec: 38 | replicas: {{ .Values.replicaCount }} 39 | selector: 40 | matchLabels: 41 | app: {{ template "triton-inference-server.name" . }} 42 | release: {{ .Release.Name }} 43 | template: 44 | metadata: 45 | labels: 46 | app: {{ template "triton-inference-server.name" . }} 47 | release: {{ .Release.Name }} 48 | 49 | spec: 50 | nodeSelector: 51 | cloud.google.com/gke-nodepool: a100-pool 52 | # cloud.google.com/gke-accelerator: nvidia-tesla-a100 53 | # nodeSelector: 54 | # cloud.google.com/gke-gpu-partition-size: {{ .Values.migPartition}} 55 | containers: 56 | - name: {{ .Chart.Name }} 57 | image: "{{ .Values.image.imageName }}" 58 | imagePullPolicy: {{ .Values.image.pullPolicy }} 59 | volumeMounts: 60 | - name: "storage" 61 | mountPath: /model 62 | 63 | resources: 64 | limits: 65 | nvidia.com/gpu: {{ .Values.image.numGpus }} 66 | 67 | command: ["/bin/sh","-c"] 68 | args: ["bash /model/inference/run_triton.sh {{ .Values.image.modelRepositoryPath }}"] 69 | 70 | ports: 71 | - containerPort: 8000 72 | name: http 73 | - containerPort: 8001 74 | name: grpc 75 | - containerPort: 8002 76 | name: metrics 77 | livenessProbe: 78 | httpGet: 79 | path: /v2/health/live 80 | port: http 81 | initialDelaySeconds: 100 82 | periodSeconds: 30 83 | readinessProbe: 84 | initialDelaySeconds: 100 85 | periodSeconds: 30 86 | httpGet: 87 | path: /v2/health/ready 88 | port: http 89 | volumes: 90 | - name: "storage" 91 | persistentVolumeClaim: 92 | claimName: my-volume-claim -------------------------------------------------------------------------------- /inference/triton/templates/service.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | apiVersion: v1 28 | kind: Service 29 | metadata: 30 | name: {{ template "triton-inference-server.fullname" . }} 31 | namespace: {{ .Release.Namespace }} 32 | labels: 33 | app: {{ template "triton-inference-server.name" . }} 34 | chart: {{ template "triton-inference-server.chart" . }} 35 | release: {{ .Release.Name }} 36 | heritage: {{ .Release.Service }} 37 | spec: 38 | type: {{ .Values.service.type }} 39 | ports: 40 | - port: 8000 41 | targetPort: http 42 | name: http-inference-server 43 | - port: 8001 44 | targetPort: grpc 45 | name: grpc-inference-server 46 | - port: 8002 47 | targetPort: metrics 48 | name: metrics-inference-server 49 | selector: 50 | app: {{ template "triton-inference-server.name" . }} 51 | release: {{ .Release.Name }} 52 | --- 53 | apiVersion: v1 54 | kind: Service 55 | metadata: 56 | name: {{ template "triton-inference-server-metrics.fullname" . }} 57 | namespace: {{ .Release.Namespace }} 58 | labels: 59 | app: {{ template "triton-inference-server-metrics.name" . }} 60 | chart: {{ template "triton-inference-server.chart" . }} 61 | release: {{ .Release.Name }} 62 | heritage: {{ .Release.Service }} 63 | annotations: 64 | alpha.monitoring.coreos.com/non-namespaced: "true" 65 | spec: 66 | ports: 67 | - name: metrics 68 | port: 8080 69 | targetPort: metrics 70 | protocol: TCP 71 | selector: 72 | app: {{ template "triton-inference-server.name" . }} 73 | release: {{ .Release.Name }} 74 | --- 75 | apiVersion: monitoring.coreos.com/v1 76 | kind: ServiceMonitor 77 | metadata: 78 | name: {{ template "triton-inference-server-metrics-monitor.fullname" . }} 79 | namespace: {{ .Release.Namespace }} 80 | labels: 81 | app: {{ template "triton-inference-server-metrics-monitor.name" . }} 82 | chart: {{ template "triton-inference-server.chart" . }} 83 | release: {{ .Release.Name }} 84 | heritage: {{ .Release.Service }} 85 | spec: 86 | selector: 87 | matchLabels: 88 | app: {{ template "triton-inference-server-metrics.name" . }} 89 | endpoints: 90 | - port: metrics 91 | interval: 10s -------------------------------------------------------------------------------- /inference/triton/values.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 
11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | replicaCount: 1 28 | 29 | migPartition: 3g.20gb 30 | 31 | image: 32 | imageName: gcr.io/dl-tme/merlin/merlin-inference:0.5.1 33 | pullPolicy: Always 34 | modelRepositoryPath: /model/models 35 | numGpus: 1 36 | 37 | service: 38 | type: LoadBalancer 39 | -------------------------------------------------------------------------------- /merlin-pipeline.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
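#
# This module defines the five-stage Kubeflow pipeline used throughout this
# repo: data extraction from GCS -> data validation -> preprocessing and
# HugeCTR training -> Triton deployment -> monitoring. All stages share one
# PersistentVolumeClaim and are pinned to a GPU node pool. Compiling it
# produces merlin-pipeline.py.tar.gz for upload to Kubeflow Pipelines; a
# hypothetical invocation (the image tags below are placeholders) looks like:
#   python3 merlin-pipeline.py \
#     -dex gcr.io/<project>/copy:0.5.1 -vc gcr.io/<project>/validation:0.5.1 \
#     -tc gcr.io/<project>/train:0.5.1 -dc gcr.io/<project>/inference:0.5.1 \
#     -mc gcr.io/<project>/monitoring:0.5.1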
14 | # ============================================================================== 15 | 16 | 17 | import os 18 | import kfp.dsl as dsl 19 | import kfp.gcp as gcp 20 | import kfp.components as comp 21 | 22 | import datetime 23 | 24 | from kubernetes import client as k8s_client 25 | import argparse 26 | import logging 27 | 28 | logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(name)s %(levelname)s:%(message)s') 29 | logger = logging.getLogger(__name__) 30 | 31 | args = None 32 | 33 | @dsl.pipeline( 34 | name="Merlin pipeline", 35 | description="HugeCTR training to deployment" 36 | ) 37 | def merlin_pipeline( 38 | accelerator: str = 'nvidia-tesla-a100', 39 | node_pool: str = 'gpu-pool', 40 | high_mem_node: str = 'none', 41 | data_dir: 'GCSPath' = 'gs://tme-criteo/dummy_data/*', 42 | new_data_dir: 'GCSPath' = 'gs://tme-criteo/new_data/*', 43 | gcs_bucket_head: str = 'tme-criteo', 44 | local_data_dir: str = '/var/lib/data', 45 | project_id: str = 'dl-tme', 46 | pipeline_name: str = 'merlin-pipeline', 47 | new_data_collection: str = 'new_data', 48 | do_data_validation: str = 'False', 49 | pubsub_sub_id: str = 'mlops-test-sub', 50 | cluster: str = 'merlin-mlops', 51 | zone: str = 'us-central1-a'): 52 | 53 | global args 54 | 55 | # Persistent volume variables 56 | persistent_volume_name = 'my-file-server' 57 | persistent_volume_claim_name = 'my-volume-claim' 58 | persistent_volume_path = '/var/lib/data' 59 | 60 | # First component - Copy data from GCS to PV 61 | copy_data = dsl.ContainerOp( 62 | name="data-extraction", 63 | image=args.data_extraction, 64 | command=["bash", "/script/run_copy_merlin.sh"], 65 | arguments=[data_dir, local_data_dir, project_id, new_data_dir, cluster, zone] 66 | ) 67 | 68 | # Second component - Data validation 69 | data_validation = dsl.ContainerOp( 70 | name="validate-data", 71 | image=args.validate_container, 72 | command=["bash", "/script/run_validation.sh"], 73 | arguments=[local_data_dir, do_data_validation] 74 | ) 75 | 76 | # Third component - Preprocess and Train 77 | preprocess_train = dsl.ContainerOp( 78 | name="merlin-preprocess-train", 79 | image=args.preprocess_train_container, 80 | command=["bash", "/script/preprocess-train.sh"], 81 | arguments=[local_data_dir, project_id, cluster, zone] 82 | ) 83 | 84 | # Fourth component - Model deployment 85 | deploy_triton = dsl.ContainerOp( 86 | name="triton-inference", 87 | image=args.deploy_container, 88 | command=["bash", "/script/run_merlin_inference.sh"], 89 | arguments=[local_data_dir, project_id, "/script/gcloud_key.json", cluster, zone] 90 | ) 91 | 92 | # Fifth component - Monitoring 93 | monitoring = dsl.ContainerOp( 94 | name="data-monitoring", 95 | image=args.monitor_container, 96 | command=["bash", "/script/run_monitoring.sh"], 97 | arguments=[project_id, args.monitor_container, pipeline_name, gcs_bucket_head, new_data_collection, "{}{}{}".format(local_data_dir, "/", new_data_collection), cluster, zone] 98 | ).set_gpu_limit(1).add_node_selector_constraint('cloud.google.com/gke-accelerator', accelerator).add_node_selector_constraint('cloud.google.com/gke-nodepool', node_pool) 99 | 100 | 101 | # Adding PV, PVC, GPU constraints to the components 102 | copy_data.add_volume(k8s_client.V1Volume(name=persistent_volume_name, 103 | persistent_volume_claim=k8s_client.V1PersistentVolumeClaimVolumeSource( 104 | claim_name=persistent_volume_claim_name))).add_volume_mount(k8s_client.V1VolumeMount( 105 | 
mount_path=persistent_volume_path, name=persistent_volume_name)).set_gpu_limit(1).add_node_selector_constraint('cloud.google.com/gke-accelerator', accelerator).add_node_selector_constraint('cloud.google.com/gke-nodepool', node_pool) 106 | 107 | data_validation.add_volume(k8s_client.V1Volume(name=persistent_volume_name, 108 | persistent_volume_claim=k8s_client.V1PersistentVolumeClaimVolumeSource( 109 | claim_name=persistent_volume_claim_name))).add_volume_mount(k8s_client.V1VolumeMount( 110 | mount_path=persistent_volume_path, name=persistent_volume_name)).set_gpu_limit(1).add_node_selector_constraint('cloud.google.com/gke-accelerator', accelerator).add_node_selector_constraint('cloud.google.com/gke-nodepool', node_pool) 111 | 112 | 113 | preprocess_train.add_volume(k8s_client.V1Volume(name=persistent_volume_name, 114 | persistent_volume_claim=k8s_client.V1PersistentVolumeClaimVolumeSource( 115 | claim_name=persistent_volume_claim_name))).add_volume_mount(k8s_client.V1VolumeMount( 116 | mount_path=persistent_volume_path, name=persistent_volume_name)).set_gpu_limit(1).add_node_selector_constraint('cloud.google.com/gke-accelerator', accelerator).add_node_selector_constraint('cloud.google.com/gke-nodepool', node_pool) 117 | 118 | deploy_triton.add_volume(k8s_client.V1Volume(name=persistent_volume_name, 119 | persistent_volume_claim=k8s_client.V1PersistentVolumeClaimVolumeSource( 120 | claim_name=persistent_volume_claim_name))).add_volume_mount(k8s_client.V1VolumeMount( 121 | mount_path=persistent_volume_path, name=persistent_volume_name)).set_gpu_limit(1).add_node_selector_constraint('cloud.google.com/gke-accelerator', accelerator).add_node_selector_constraint('cloud.google.com/gke-nodepool', node_pool) 122 | 123 | # Sequencing the components 124 | data_validation.after(copy_data) 125 | preprocess_train.after(data_validation) 126 | deploy_triton.after(preprocess_train) 127 | monitoring.after(deploy_triton) 128 | 129 | if __name__ == '__main__': 130 | parser = argparse.ArgumentParser() 131 | 132 | # Parse command line arguments 133 | parser.add_argument("-vc", 134 | "--validate_container", 135 | type=str, 136 | required=False, 137 | help="pass validate data container") 138 | 139 | parser.add_argument("-dex", 140 | "--data_extraction", 141 | type=str, 142 | required=True, 143 | help="pass copy container") 144 | 145 | parser.add_argument("-tc", 146 | "--preprocess_train_container", 147 | type=str, 148 | required=True, 149 | help="pass preprocess-train container") 150 | 151 | parser.add_argument("-dc", 152 | "--deploy_container", 153 | type=str, 154 | required=True, 155 | help="pass deploy container") 156 | 157 | parser.add_argument("-mc", 158 | "--monitor_container", 159 | type=str, 160 | required=True, 161 | help="pass monitoring container") 162 | 163 | args = parser.parse_args() 164 | 165 | logger.info("Data extraction container: " + args.data_extraction) 166 | logger.info("Validate container: " + str(args.validate_container)) 167 | logger.info("Preprocess-train container: " + args.preprocess_train_container) 168 | logger.info("Deploy container: " + args.deploy_container) 169 | logger.info("Monitor container: " + args.monitor_container) 170 | 171 | 172 | import kfp.compiler as compiler 173 | # Export pipeline as .tar.gz 174 | compiler.Compiler().compile(merlin_pipeline, __file__ + '.tar.gz') 175 | 176 | -------------------------------------------------------------------------------- /monitoring/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore
when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | -------------------------------------------------------------------------------- /monitoring/Chart.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | apiVersion: v1 28 | appVersion: "2.0" 29 | description: Monitoring Module 30 | name: monitoring-module 31 | version: 1.0.0 32 | -------------------------------------------------------------------------------- /monitoring/csv_read_gcs_write.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
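#
# This worker watches a directory on the shared PersistentVolume for the files
# that the performance monitor writes, uploads each one to a path in a GCS
# bucket, and then deletes the local copy. A hypothetical invocation (the
# bucket and path below are the repo's defaults, not requirements) looks like:
#   python3 csv_read_gcs_write.py --pv_dir /var/lib/data/new_data \
#     --sleep_time 10 --bucket criteo-data --bucket_path new_data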
14 | # ============================================================================== 15 | 16 | import pandas as pd 17 | import numpy as np 18 | from pyarrow import csv, parquet 19 | from glob import glob 20 | import os 21 | from datetime import datetime 22 | import re 23 | import sys 24 | from google.cloud import storage 25 | import argparse 26 | from time import sleep 27 | 28 | PATH = 'dummy' 29 | 30 | def get_local_files(path): 31 | local_files = glob(path+"/*") 32 | return local_files 33 | 34 | def files_to_data_frames(local_files): 35 | data_frames = [] 36 | for local_file in local_files: 37 | df = pd.read_csv(local_file) 38 | data_frames.append(df) 39 | del df 40 | return data_frames 41 | 42 | def files_to_data_frames_parquet(local_files): 43 | data_frames = [] 44 | for local_file in local_files: 45 | df = pd.read_parquet(local_file, engine='pyarrow') 46 | data_frames.append(df) 47 | del df 48 | return data_frames 49 | 50 | def one_giant_data_frame(data_frames): 51 | big_un = pd.concat(data_frames, copy=False) 52 | return big_un 53 | 54 | def file_to_data_frame_to_parquet(data_frame, parquet_file): 55 | # table = csv.read_csv(local_file) 56 | # parquet.write_table(table, parquet_file) 57 | data_frame.to_parquet(parquet_file, engine='pyarrow') 58 | 59 | class GCSStore: 60 | def __init__(self, bucket_name, bucket_path): 61 | self.bucket_name = bucket_name 62 | self.bucket_path = bucket_path 63 | # Create a Cloud Storage client. 64 | self.gcs = storage.Client() 65 | 66 | # Get the bucket that the file will be uploaded to. 67 | self.bucket = self.gcs.get_bucket(self.bucket_name) 68 | 69 | 70 | def list_bucket(self, limit=sys.maxsize): 71 | a_bucket = self.gcs.lookup_bucket(self.bucket_name) 72 | bucket_iterator = a_bucket.list_blobs(prefix=self.bucket_path) 73 | for resource in bucket_iterator: 74 | print(resource.name) 75 | limit = limit - 1 76 | if limit <= 0: 77 | break 78 | 79 | def upload_to_bucket(self, input_file_name, output_file_name): 80 | blob2 = self.bucket.blob(self.bucket_path + "/" + output_file_name) 81 | blob2.upload_from_filename(filename=input_file_name) 82 | 83 | 84 | if __name__=='__main__': 85 | parser = argparse.ArgumentParser() 86 | 87 | print("In read-write csv to parquet") 88 | 89 | parser.add_argument("--pv_dir", 90 | type=str, 91 | required=True, 92 | default="/var/lib/data/new_data", 93 | help="Path to new data in PV") 94 | 95 | parser.add_argument("--sleep_time", 96 | type=int, 97 | required=True, 98 | default=1, 99 | help="Sleep time in seconds") 100 | 101 | parser.add_argument("--bucket", 102 | type=str, 103 | required=True, 104 | default="criteo-data", 105 | help="Name of GCS bucket") 106 | 107 | parser.add_argument("--bucket_path", 108 | type=str, 109 | required=True, 110 | default="new_data", 111 | help="Path of directory to store files on GCS bucket") 112 | 113 | args = parser.parse_args() 114 | 115 | sleep_time = args.sleep_time 116 | gcs_store = GCSStore(args.bucket, args.bucket_path) 117 | 118 | while True: 119 | sleep(sleep_time) 120 | local_files = get_local_files(args.pv_dir) 121 | if len(local_files) == 0: 122 | print("No files to process. Sleeping for {} secs".format(sleep_time)) 123 | continue 124 | 125 | print("New files found. Pushing to GCS...") 126 | for each_file in local_files: 127 | print("pushing {} to {}".format(each_file, args.bucket + "/" + args.bucket_path + "/" +os.path.basename(each_file))) 128 | gcs_store.upload_to_bucket(each_file, os.path.basename(each_file)) 129 | print("Uploaded {} to {} at {}. 
Deleting {} from PV".format(each_file, 130 | args.bucket + "/" + args.bucket_path + "/" +os.path.basename(each_file), 131 | datetime.now(), each_file)) 132 | os.remove(each_file) 133 | 134 | 135 | -------------------------------------------------------------------------------- /monitoring/perf-monitor-test.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | import numpy as np 17 | import os 18 | import logging 19 | import argparse 20 | import sys 21 | import warnings 22 | import sys 23 | import time 24 | import json 25 | 26 | import cudf 27 | from sklearn import metrics 28 | import pandas as pd 29 | 30 | import tritonclient.http as httpclient 31 | import tritonclient.grpc as grpcclient 32 | from tritonclient.utils import * 33 | 34 | from google.cloud import pubsub_v1 35 | from google.protobuf.json_format import MessageToJson 36 | from google.pubsub_v1.types import Encoding 37 | 38 | 39 | 40 | def publish_batch(project_id, topic_id, current_batch, pred_label): 41 | # Initialize a Publisher client. 42 | client = pubsub_v1.PublisherClient() 43 | topic_path = client.topic_path(project_id, topic_id) 44 | 45 | batch_size = len(pred_label) 46 | df = current_batch.to_pandas() 47 | 48 | for i in range(batch_size): 49 | row = df.iloc[i] 50 | 51 | frame = { 52 | "input0": row[CONTINUOUS_COLUMNS].values.tolist(), 53 | "input1": row[CATEGORICAL_COLUMNS].values.tolist(), 54 | "trueval": row['label'], 55 | "predval": response.as_numpy("OUTPUT0")[i].astype('float64') 56 | } 57 | 58 | payload = json.dumps(frame).encode('utf-8') 59 | 60 | # When you publish a message, the client returns a future. 61 | api_future = client.publish(topic_path, data=''.encode(), payload=payload) 62 | 63 | 64 | if __name__ == "__main__": 65 | 66 | parser = argparse.ArgumentParser() 67 | 68 | parser.add_argument('-u', 69 | '--triton_grpc_url', 70 | type=str, 71 | required=False, 72 | default='localhost:8001', 73 | help='URL to Triton gRPC Endpoint') 74 | 75 | parser.add_argument('-m', 76 | '--model_name', 77 | type=str, 78 | required=False, 79 | default='dcn_ens', 80 | help='Name of the model ensemble to load') 81 | 82 | parser.add_argument('-d', 83 | '--test_data', 84 | type=str, 85 | required=False, 86 | default='/crit_int_pq/day_23.parquet', 87 | help='Path to a test .parquet file. Default') 88 | 89 | parser.add_argument('-b', 90 | '--batch_size', 91 | type=int, 92 | required=False, 93 | default=64, 94 | help='Batch size. 
Max is 64 at the moment, but this max size could be specified when create the model and the ensemble.') 95 | 96 | parser.add_argument('-n', 97 | '--n_batches', 98 | type=int, 99 | required=False, 100 | default=1, 101 | help='Number of batches of data to send') 102 | 103 | parser.add_argument('-v', 104 | '--verbose', 105 | type=bool, 106 | required=False, 107 | default=False, 108 | help='Verbosity, True or False') 109 | 110 | parser.add_argument("--project_id", 111 | type=str, 112 | required=True, 113 | default="dl-tme", 114 | help="Google Cloud project ID") 115 | 116 | parser.add_argument("--topic_id", 117 | type=str, 118 | required=True, 119 | default="pubsub", 120 | help="Pub/Sub topic ID") 121 | 122 | 123 | args = parser.parse_args() 124 | 125 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO, datefmt='%d-%m-%y %H:%M:%S') 126 | logging.info(f"Args: {args}") 127 | 128 | 129 | # warnings can be disabled 130 | if not sys.warnoptions: 131 | warnings.simplefilter("ignore") 132 | 133 | try: 134 | triton_client = grpcclient.InferenceServerClient(url=args.triton_grpc_url, verbose=args.verbose) 135 | logging.info("Triton client created.") 136 | 137 | triton_client.is_model_ready(args.model_name) 138 | logging.info(f"Model {args.model_name} is ready!") 139 | except Exception as e: 140 | logging.error(f"Channel creation failed: {str(e)}") 141 | sys.exit() 142 | 143 | # Load the dataset 144 | CATEGORICAL_COLUMNS = ['C' + str(x) for x in range(1,27)] 145 | CONTINUOUS_COLUMNS = ['I' + str(x) for x in range(1,14)] 146 | LABEL_COLUMNS = ['label'] 147 | col_names = CATEGORICAL_COLUMNS + CONTINUOUS_COLUMNS 148 | col_dtypes = [np.int32]*26 + [np.int64]*13 149 | 150 | logging.info("Reading dataset..") 151 | all_batches = cudf.read_parquet(args.test_data, num_rows=args.batch_size*args.n_batches) 152 | 153 | results=[] 154 | 155 | with grpcclient.InferenceServerClient(url=args.triton_grpc_url) as client: 156 | for batch in range(args.n_batches): 157 | 158 | logging.info(f"Requesting inference for batch {batch}..") 159 | start_idx = batch*args.batch_size 160 | end_idx = (batch+1)*(args.batch_size) 161 | 162 | # Convert the batch to a triton inputs 163 | current_batch = all_batches[start_idx:end_idx] 164 | columns = [(col, current_batch[col]) for col in col_names] 165 | inputs = [] 166 | 167 | for i, (name, col) in enumerate(columns): 168 | d = col.values_host.astype(col_dtypes[i]) 169 | d = d.reshape(len(d), 1) 170 | inputs.append(grpcclient.InferInput(name, d.shape, np_to_triton_dtype(col_dtypes[i]))) 171 | inputs[i].set_data_from_numpy(d) 172 | 173 | outputs = [] 174 | outputs.append(grpcclient.InferRequestedOutput("OUTPUT0")) 175 | 176 | response = client.infer(args.model_name, inputs, request_id=str(1), outputs=outputs) 177 | 178 | results.extend(response.as_numpy("OUTPUT0")) 179 | 180 | publish_batch(args.project_id, args.topic_id, 181 | current_batch, 182 | response.as_numpy("OUTPUT0")) 183 | 184 | logging.info(f"ROC AUC Score: {metrics.roc_auc_score(all_batches[LABEL_COLUMNS].values.tolist(), results)}") -------------------------------------------------------------------------------- /monitoring/perf-monitor.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | import os 17 | import logging 18 | from time import time, sleep 19 | from queue import Queue 20 | from threading import Thread 21 | 22 | import argparse 23 | from google.cloud import pubsub_v1 24 | import json 25 | import collections 26 | 27 | from sklearn import metrics 28 | import pandas as pd 29 | import numpy as np 30 | 31 | import kfp 32 | import datetime 33 | 34 | # logging.basicConfig(level=logging.INFO, 35 | # format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') 36 | # logger = logging.getLogger(__name__) 37 | 38 | # client = kfp.Client(host='https://320d47d67af4e8cf-dot-us-central1.pipelines.googleusercontent.com') 39 | 40 | def get_pipeline_id(name, client): 41 | pl_id = None 42 | page_size = 100 43 | page_token = '' 44 | while True: 45 | res = client.list_pipelines(page_size=page_size, page_token=page_token) 46 | pl_list = res.pipelines 47 | for pl in pl_list: 48 | if pl.name == name: 49 | pl_id = pl.id 50 | return pl_id 51 | page_token = res.next_page_token 52 | if not page_token: 53 | break 54 | return pl_id 55 | 56 | def get_pipeline_info(input_name, client_key): 57 | page_size = 200 58 | page_token = '' 59 | pipeline_runs = [] 60 | 61 | client = kfp.Client(host=client_key) 62 | 63 | res = client.list_runs(page_size=page_size, page_token=page_token) 64 | for runs in res.runs: 65 | if runs.resource_references[1].name == input_name: 66 | pipeline_runs.append(runs) 67 | 68 | if len(pipeline_runs) !=0: 69 | for prun in pipeline_runs: 70 | if prun.status == 'Running': 71 | return None 72 | 73 | # if prun.status == 'Succeeded': 74 | tmp = { 'pipelineID': prun.resource_references[1].key.id, 75 | 'experimentID': prun.resource_references[0].key.id, 76 | 'status': prun.status, 77 | 'new_run_name': 'triggered_'+str(datetime.datetime.now())} 78 | 79 | return tmp 80 | # pid = get_pipeline_id(input_name,client) 81 | # print("pid: ", name) 82 | 83 | return None 84 | 85 | def trigger_kfp(pipeline_name, client_key): 86 | logging.warning("Triggering Kubeflow Pipeline...") 87 | 88 | # If pipeline is already running --> False 89 | # Else -> True 90 | try: 91 | pipeline_info = get_pipeline_info(pipeline_name, client_key) 92 | except Exception as e: 93 | logging.error(f"Triggering pipeline error: {e}") 94 | return False 95 | 96 | logging.info(f"Pipeline info: {pipeline_info}") 97 | 98 | if pipeline_info != None: 99 | print("Using pipeline ID: ", pipeline_info['pipelineID'], " triggering ", pipeline_info['new_run_name'], " at: ", datetime.datetime.now()) 100 | client = kfp.Client(host=client_key) 101 | res = client.run_pipeline(pipeline_info['experimentID'], pipeline_info['new_run_name'], pipeline_id=pipeline_info['pipelineID']) 102 | return True 103 | else: 104 | logging.info("Did not trigger the pipeline") 105 | return False 106 | 107 | 108 | class AccMonitor: 109 | def __init__(self, project_id, subscription_id, timeout, evaluate_period=500, 110 | acc_threshold=0.5, min_trigger_len=0.5, pipeline_name='merlin-pipeline', 111 | min_log_length=320, 
log_time_delta=60, pv_location='/var/lib/data/', client_host=None): 112 | self.evaluate_period = evaluate_period 113 | self.pipeline_name = pipeline_name 114 | self.pv_location = pv_location 115 | self.client_host_key = client_host 116 | # Thread safe Queues where each item is a request 117 | self.request_queue = Queue(maxsize=self.evaluate_period) 118 | 119 | self.project_id = project_id 120 | self.subscription_id = subscription_id 121 | self.timeout = timeout 122 | self.acc_threshold = acc_threshold 123 | 124 | # Minimum number of results in the circular buffer to initiate a monitoring-based trigger 125 | self.min_trigger_len = min_trigger_len * self.evaluate_period 126 | # print("Min trigger length", self.min_trigger_len) 127 | 128 | # Logging configs 129 | self.min_log_length = min_log_length 130 | self.log_time_delta = datetime.timedelta(seconds=log_time_delta) 131 | 132 | # Circular buffer to store results in a rolling manner 133 | self.label_queue = collections.deque(maxlen=self.evaluate_period) 134 | self.pred_queue = collections.deque(maxlen=self.evaluate_period) 135 | 136 | def run(self): 137 | 138 | def enqueue_request(self): 139 | """ 140 | Receives messages from a Pub/Sub subscription and adds each request to a queue. 141 | 142 | The idea is to decouple message processing from message reception so that 143 | if there are a large number of messages at once, processing does not cause delays in the 144 | thread receiving messages. 145 | """ 146 | 147 | # Initialize a Subscriber client 148 | subscriber_client = pubsub_v1.SubscriberClient() 149 | 150 | # Create a fully qualified identifier in the form of 151 | # `projects/{project_id}/subscriptions/{subscription_id}` 152 | subscription_path = subscriber_client.subscription_path(self.project_id, self.subscription_id) 153 | 154 | def callback(message): 155 | # Acknowledge the message. Unack'ed messages will be redelivered. 156 | message.ack() 157 | # print("JSON of message:", json.loads(message.attributes)) 158 | # print(f"Acknowledged {message.message_id}.") 159 | 160 | payload = json.loads(message.attributes['payload']) 161 | 162 | # If the queue is at its max size, this blocks until items are consumed. 163 | # In case the dequeuing thread is slower, this will block the subscriber 164 | # from receiving more messages from the broker. The broker should 165 | # still have those messages so that they don't get lost. 166 | self.request_queue.put(payload) 167 | 168 | streaming_pull_future = subscriber_client.subscribe( 169 | subscription_path, callback=callback 170 | ) 171 | logging.info(f"Listening for messages on {subscription_path}..\n") 172 | 173 | try: 174 | # Calling result() on StreamingPullFuture keeps the main thread from 175 | # exiting while messages get processed in the callbacks. 176 | streaming_pull_future.result(timeout=self.timeout) 177 | except: 178 | streaming_pull_future.cancel() 179 | 180 | subscriber_client.close() 181 | 182 | # Start the enqueue thread as a daemon 183 | enqueue = Thread(target=enqueue_request, args=(self,)) 184 | enqueue.daemon = True 185 | enqueue.start() 186 | 187 | """ 188 | Fetches requests from a queue and calculates the rolling accuracy over the last N requests. 189 | If the rolling accuracy is below a pre-specified threshold, raises an alarm 190 | 191 | - We have access to the features here. 
We save the requests into a .parquet file 192 | in batches 193 | 194 | - PubSub usually does not guarantee in-order delivery of messages 195 | """ 196 | 197 | # Initialization 198 | rolling_acc = 1.0 199 | 200 | CATEGORICAL_COLUMNS = ['C' + str(x) for x in range(1,27)] 201 | CONTINUOUS_COLUMNS = ['I' + str(x) for x in range(1,14)] 202 | LABEL_COLUMNS = ['label'] 203 | col_names = LABEL_COLUMNS + CONTINUOUS_COLUMNS + CATEGORICAL_COLUMNS 204 | DATETIME_FORMAT = '%d_%m_%Y-%H-%M-%S' 205 | last_log_time = datetime.datetime.strptime('01_01_1970-00-00-00', DATETIME_FORMAT) 206 | 207 | # Create an empty dataframe 208 | df_temp = pd.DataFrame(columns = col_names) 209 | 210 | while True: 211 | while self.request_queue.empty(): 212 | # sleep so .get doesn't eat CPU cycles if the queue is empty 213 | sleep(0.1) 214 | 215 | # Fetch the payload 216 | payload = self.request_queue.get() 217 | 218 | # TODO: put checks for payload 219 | request = np.concatenate((np.array([payload["trueval"]], float), 220 | np.array(payload["input0"]), 221 | np.array(payload["input1"]))) 222 | 223 | # Append new request to the dataframe 224 | df_temp = df_temp.append(pd.DataFrame([request], columns=col_names)) 225 | 226 | # Write to a file if there are a minimum number of samples available, 227 | # and if a minimum amount of time has passed since the last write 228 | # TOFIX: This is problematic if no new request comes for a while and 229 | # there are many requests in the dataframe ready to be written already 230 | current_time = datetime.datetime.now() 231 | if (df_temp.shape[0] >= self.min_log_length) and \ 232 | (current_time - last_log_time >= self.log_time_delta): 233 | filename = current_time.strftime(DATETIME_FORMAT) + ".parquet" 234 | logging.info(f"Writing {df_temp.shape[0]} records to {self.pv_location}/{filename}...") 235 | # print(f"Writing {df_temp.shape[0]} records to {self.pv_location+filename}...") 236 | df_temp.reset_index(inplace=True, drop=True) 237 | df_temp.to_parquet(self.pv_location+"/"+filename) 238 | 239 | # Clear the dataframe 240 | df_temp = pd.DataFrame(columns = col_names) 241 | last_log_time = current_time 242 | 243 | # Circular buffer of size evaluate_period 244 | self.label_queue.append(payload["trueval"]) 245 | self.pred_queue.append(payload["predval"]) 246 | 247 | try: 248 | # This will fail if there is only one class in label_queue; catch and pass 249 | # in that case 250 | rolling_acc = metrics.roc_auc_score(self.label_queue, self.pred_queue) 251 | logging.info(f"Rolling AUC score: {rolling_acc}") 252 | except ValueError: 253 | pass 254 | 255 | 256 | if (rolling_acc < self.acc_threshold) and (len(self.label_queue) > self.min_trigger_len): 257 | success = trigger_kfp(self.pipeline_name, self.client_host_key) 258 | # If the pipeline has triggered, refresh the result circular buffer 259 | # and calculate fresh metrics. 
Ideally we need a better mechanism to 260 | # check whether the pipeline is already running and, if so, not retrigger 261 | if success: 262 | self.label_queue.clear() 263 | self.pred_queue.clear() 264 | rolling_acc = 1.0 265 | sleep(5) 266 | 267 | 268 | if __name__ == "__main__": 269 | parser = argparse.ArgumentParser() 270 | 271 | print("In Performance monitoring module") 272 | 273 | parser.add_argument("--project_id", 274 | type=str, 275 | required=True, 276 | default="dl-tme", 277 | help="Google Cloud project ID") 278 | 279 | parser.add_argument("--subscription_id", 280 | type=str, 281 | required=True, 282 | default="sub_id", 283 | help="Pub/Sub subscription ID") 284 | 285 | parser.add_argument("--timeout", 286 | type=int, 287 | required=False, 288 | default=None, 289 | help="Timeout for Streaming Pull") 290 | 291 | parser.add_argument("--evaluate_period", 292 | type=int, 293 | required=False, 294 | default=500, 295 | help="Evaluate over the last evaluate_period samples") 296 | 297 | parser.add_argument("--min_trigger_len", 298 | type=float, 299 | required=False, 300 | default=0.5, 301 | help="Minimum number of samples in queue before a monitoring-based trigger, \ 302 | as a fraction of evaluate_period") 303 | 304 | parser.add_argument("--acc_threshold", 305 | type=float, 306 | required=False, 307 | default=0.5, 308 | help="AUC ROC threshold for trigger. Default 0.5") 309 | 310 | parser.add_argument("--pipeline_name", 311 | type=str, 312 | required=False, 313 | default='merlin-pipeline', 314 | help="Name of the original pipeline") 315 | 316 | parser.add_argument("--min_log_length", 317 | type=int, 318 | required=False, 319 | default=320, 320 | help="Minimum number of requests in each .parquet file that is created") 321 | 322 | parser.add_argument("--log_time_delta", 323 | type=int, 324 | required=False, 325 | default=60, 326 | help="Minimum time delta (in secs) between two subsequent .parquet files") 327 | 328 | parser.add_argument("--PV_loc", 329 | type=str, 330 | required=False, 331 | default='/var/lib/data/new_data/', 332 | help="Location of PV to write the files") 333 | 334 | 335 | args = parser.parse_args() 336 | 337 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO, datefmt='%d-%m-%y %H:%M:%S') 338 | logging.info(f"Args: {args}") 339 | 340 | logging.info("Starting accuracy monitor...") 341 | 342 | client_host_key = None 343 | 344 | with open('/script/kfp_client_host_key.txt','r') as f: 345 | client_host_key = f.read() 346 | 347 | 348 | # TODO: Add better error handling, and move configs to a .json 349 | am = AccMonitor(project_id=args.project_id, 350 | subscription_id=args.subscription_id, 351 | timeout=args.timeout, 352 | evaluate_period=args.evaluate_period, 353 | acc_threshold=args.acc_threshold, 354 | min_trigger_len=args.min_trigger_len, 355 | pipeline_name=args.pipeline_name, 356 | min_log_length=args.min_log_length, 357 | log_time_delta=args.log_time_delta, 358 | pv_location=args.PV_loc, 359 | client_host=client_host_key) 360 | 361 | am.run() -------------------------------------------------------------------------------- /monitoring/run_monitoring.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | PROJECT=${1:-"dl-tme"} 19 | DOCKER_IMG=${2:-"gcr.io/${PROJECT}/monitoring:0.5.1"} 20 | PIPELINE=${3:-"merlin-pipeline"} 21 | GCS_BUCKET=${4:-"criteo-data"} 22 | BUCKET_PATH=${5:-"new_data"} 23 | LOCAL=${6:-"/var/lib/data/new_data"} 24 | PUBSUB=${7:-"mlops-test-sub"} 25 | CLUSTER=${8:-"merlin-mlops"} 26 | ZONE=${9:-"us-central1-a"} 27 | 28 | 29 | gcloud auth activate-service-account --key-file=/script/gcloud_key.json 30 | gcloud container clusters get-credentials $CLUSTER --zone $ZONE --project $PROJECT 31 | 32 | monitoring_status=$(helm status monitoring 2>&1) 33 | echo "monitoring status: " 34 | echo $monitoring_status 35 | if [[ "$monitoring_status" == "Error: release: not found" ]]; then 36 | helm install monitoring --set project_id=$PROJECT --set image.repository=$DOCKER_IMG --set pipeline=$PIPELINE --set gcs_bucket=$GCS_BUCKET --set bucket_path=$BUCKET_PATH --set local=$LOCAL --set pubsub=$PUBSUB /script 37 | else 38 | echo "Monitoring module running already, not deploying another instance" 39 | fi 40 | -------------------------------------------------------------------------------- /monitoring/run_monitoring_and_live_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================== 16 | 17 | PROJECT_ID=${1:-"dl-tme"} # Google Cloud project ID 18 | GCS_BUCKET=${2:-"criteo-data"} 19 | BUCKET_PATH=${3:-"new_data"} 20 | LOCAL=${4:-"/var/lib/data/new_data"} 21 | PIPELINE=${5:-"merlin-pipeline"} 22 | PUBSUB=${6:-"mlops-test-sub"} 23 | 24 | echo "perf monitor" 25 | python3 -u /script/perf-monitor.py --PV_loc $LOCAL --project_id $PROJECT_ID --subscription_id $PUBSUB --evaluate_period 200 --min_trigger_len 0.5 --acc_threshold 0.8 --pipeline_name $PIPELINE & 26 | 27 | echo "gcs" 28 | python3 -u /script/csv_read_gcs_write.py --pv_dir $LOCAL --sleep_time 10 --bucket $GCS_BUCKET --bucket_path $BUCKET_PATH 29 | 30 | echo "done" -------------------------------------------------------------------------------- /monitoring/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* 2 | Expand the name of the chart. 
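The name defaults to .Chart.Name ("monitoring-module" in this chart) unless
.Values.nameOverride is set, truncated to 63 characters to stay within the
Kubernetes DNS naming limit.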
3 | */}} 4 | {{- define "monitoring-module.name" -}} 5 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} 6 | {{- end }} 7 | 8 | {{/* 9 | Create a default fully qualified app name. 10 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 11 | If release name contains chart name it will be used as a full name. 12 | */}} 13 | {{- define "monitoring-module.fullname" -}} 14 | {{- if .Values.fullnameOverride }} 15 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} 16 | {{- else }} 17 | {{- $name := default .Chart.Name .Values.nameOverride }} 18 | {{- if contains $name .Release.Name }} 19 | {{- .Release.Name | trunc 63 | trimSuffix "-" }} 20 | {{- else }} 21 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} 22 | {{- end }} 23 | {{- end }} 24 | {{- end }} 25 | 26 | {{/* 27 | Create chart name and version as used by the chart label. 28 | */}} 29 | {{- define "monitoring-module.chart" -}} 30 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} 31 | {{- end }} 32 | 33 | {{/* 34 | Common labels 35 | */}} 36 | {{- define "monitoring-module.labels" -}} 37 | helm.sh/chart: {{ include "monitoring-module.chart" . }} 38 | {{ include "monitoring-module.selectorLabels" . }} 39 | {{- if .Chart.AppVersion }} 40 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} 41 | {{- end }} 42 | app.kubernetes.io/managed-by: {{ .Release.Service }} 43 | {{- end }} 44 | 45 | {{/* 46 | Selector labels 47 | */}} 48 | {{- define "monitoring-module.selectorLabels" -}} 49 | app.kubernetes.io/name: {{ include "monitoring-module.name" . }} 50 | app.kubernetes.io/instance: {{ .Release.Name }} 51 | {{- end }} 52 | 53 | {{/* 54 | Create the name of the service account to use 55 | */}} 56 | {{- define "monitoring-module.serviceAccountName" -}} 57 | {{- if .Values.serviceAccount.create }} 58 | {{- default (include "monitoring-module.fullname" .) .Values.serviceAccount.name }} 59 | {{- else }} 60 | {{- default "default" .Values.serviceAccount.name }} 61 | {{- end }} 62 | {{- end }} 63 | -------------------------------------------------------------------------------- /monitoring/templates/deployment.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | apiVersion: apps/v1 17 | kind: Deployment 18 | metadata: 19 | name: {{ include "monitoring-module.fullname" . }} 20 | labels: 21 | {{- include "monitoring-module.labels" . | nindent 4 }} 22 | spec: 23 | selector: 24 | matchLabels: 25 | {{- include "monitoring-module.selectorLabels" . | nindent 6 }} 26 | template: 27 | metadata: 28 | {{- with .Values.podAnnotations }} 29 | annotations: 30 | {{- toYaml . 
| nindent 8 }} 31 | {{- end }} 32 | labels: 33 | {{- include "monitoring-module.selectorLabels" . | nindent 8 }} 34 | spec: 35 | containers: 36 | - name: {{ .Chart.Name }} 37 | image: "{{ .Values.image.repository }}" 38 | imagePullPolicy: {{ .Values.image.pullPolicy }} 39 | command: ["/bin/sh","-c"] 40 | args: ["bash run_monitoring_and_live_data.sh {{ .Values.project_id }} {{ .Values.gcs_bucket }} {{ .Values.bucket_path }} {{ .Values.local }} {{ .Values.pipeline }} {{ .Values.pubsub }}"] 41 | volumeMounts: 42 | - name: "tmp-data-storage" 43 | mountPath: {{ .Values.local }} 44 | volumes: 45 | - name: "tmp-data-storage" 46 | persistentVolumeClaim: 47 | claimName: tmp-data-storage-claim 48 | -------------------------------------------------------------------------------- /monitoring/values.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | 17 | # Default values for monitoring-module. 18 | # This is a YAML-formatted file. 19 | # Declare variables to be passed into your templates. 20 | 21 | replicaCount: 1 22 | 23 | image: 24 | repository: nginx 25 | pullPolicy: Always 26 | # Overrides the image tag whose default is the chart appVersion. 
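# For reference (assuming the chart is installed from /script, as run_monitoring.sh does),
# every value below can be overridden at install time, e.g.:
#   helm install monitoring /script --set project_id=my-project --set pubsub=my-sub
# where my-project and my-sub are placeholder names.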
27 | 28 | project_id: "dl-tme" 29 | pipeline: "merlin-pipeline" 30 | gcs_bucket: "criteo-data" 31 | bucket_path: "new_data" 32 | local: "/var/lib/data/new_data" 33 | pubsub: "mlops-test-sub" -------------------------------------------------------------------------------- /preprocess-train/dcn_files/dcn.json: -------------------------------------------------------------------------------- 1 | { 2 | "inference": { 3 | "max_batchsize": 64, 4 | "hit_rate_threshold": 0.6, 5 | "dense_model_file": "/model/models/dcn/1/_dense_500.model", 6 | "sparse_model_file": "/model/models/dcn/1/0_sparse_500.model", 7 | "label": 1, 8 | "input_key_type": "I64" 9 | }, 10 | "layers": [ 11 | { 12 | "name": "data", 13 | "type": "Data", 14 | "check": "None", 15 | "label": { 16 | "label_dim": 1 17 | }, 18 | "dense": { 19 | "top": "dense", 20 | "dense_dim": 13 21 | }, 22 | "sparse": [ 23 | { 24 | "top": "data1", 25 | "type": "DistributedSlot", 26 | "max_feature_num_per_sample": 30, 27 | "slot_num": 26 28 | } 29 | ] 30 | }, 31 | { 32 | "name": "sparse_embedding1", 33 | "type": "DistributedSlotSparseEmbeddingHash", 34 | "bottom": "data1", 35 | "top": "sparse_embedding1", 36 | "sparse_embedding_hparam": { 37 | "max_vocabulary_size_per_gpu": 88656602, 38 | "embedding_vec_size": 16, 39 | "combiner": 0 40 | } 41 | }, 42 | { 43 | "name": "reshape1", 44 | "type": "Reshape", 45 | "bottom": "sparse_embedding1", 46 | "top": "reshape1", 47 | "leading_dim": 416 48 | }, 49 | { 50 | "name": "concat1", 51 | "type": "Concat", 52 | "bottom": ["reshape1","dense"], 53 | "top": "concat1" 54 | }, 55 | { 56 | "name": "slice1", 57 | "type": "Slice", 58 | "bottom": "concat1", 59 | "ranges": [[0,429], [0,429]], 60 | "top": ["slice11", "slice12"] 61 | }, 62 | { 63 | "name": "multicross1", 64 | "type": "MultiCross", 65 | "bottom": "slice11", 66 | "top": "multicross1", 67 | "mc_param": { 68 | "num_layers": 6 69 | } 70 | }, 71 | { 72 | "name": "fc1", 73 | "type": "InnerProduct", 74 | "bottom": "slice12", 75 | "top": "fc1", 76 | "fc_param": { 77 | "num_output": 1024 78 | } 79 | }, 80 | { 81 | "name": "relu1", 82 | "type": "ReLU", 83 | "bottom": "fc1", 84 | "top": "relu1" 85 | }, 86 | 87 | { 88 | "name": "dropout1", 89 | "type": "Dropout", 90 | "rate": 0.5, 91 | "bottom": "relu1", 92 | "top": "dropout1" 93 | }, 94 | { 95 | "name": "fc2", 96 | "type": "InnerProduct", 97 | "bottom": "dropout1", 98 | "top": "fc2", 99 | "fc_param": { 100 | "num_output": 1024 101 | } 102 | }, 103 | { 104 | "name": "relu2", 105 | "type": "ReLU", 106 | "bottom": "fc2", 107 | "top": "relu2" 108 | }, 109 | { 110 | "name": "dropout2", 111 | "type": "Dropout", 112 | "rate": 0.5, 113 | "bottom": "relu2", 114 | "top": "dropout2" 115 | }, 116 | 117 | { 118 | "name": "concat2", 119 | "type": "Concat", 120 | "bottom": ["dropout2","multicross1"], 121 | "top": "concat2" 122 | }, 123 | 124 | { 125 | "name": "fc4", 126 | "type": "InnerProduct", 127 | "bottom": "concat2", 128 | "top": "fc4", 129 | "fc_param": { 130 | "num_output": 1 131 | } 132 | }, 133 | 134 | { 135 | "name": "sigmoid", 136 | "type": "Sigmoid", 137 | "bottom": "fc4", 138 | "top": "sigmoid" 139 | } 140 | ] 141 | } 142 | -------------------------------------------------------------------------------- /preprocess-train/dcn_files/format_dcn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | 17 | import json, sys, argparse, os 18 | 19 | 20 | if __name__=='__main__': 21 | 22 | parser = argparse.ArgumentParser() 23 | 24 | parser.add_argument("--model_version", 25 | type=int, 26 | required=True, 27 | default=1, 28 | help="Provide model version") 29 | 30 | parser.add_argument("--dcn_path", 31 | type=str, 32 | required=True, 33 | default="/var/lib/data/script/dcn_files/dcn.json", 34 | help="Path of original DCN") 35 | 36 | 37 | args = parser.parse_args() 38 | 39 | dcn = os.path.basename(args.dcn_path) 40 | dir_path = os.path.dirname(args.dcn_path) 41 | obj = None 42 | with open(args.dcn_path, "r") as f: 43 | obj = json.load(f) 44 | obj["inference"]["dense_model_file"] = "/model/models/dcn/" + str(args.model_version) + "/_dense_500.model" 45 | obj["inference"]["sparse_model_file"] = "/model/models/dcn/" + str(args.model_version) + "/0_sparse_500.model" 46 | # print(obj["inference"]["dense_model_file"]) 47 | updated_json = dir_path+"/dcn" + str(args.model_version) + ".json" 48 | with open(updated_json,"w") as f: 49 | json.dump(obj, f) 50 | 51 | -------------------------------------------------------------------------------- /preprocess-train/preprocess-train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | set -e 18 | 19 | PV_LOC=${1:-"/var/lib/data"} 20 | PROJECT=${2:-"dl-tme"} 21 | cluster=${3:-"merlin-mlops"} 22 | zone=${4:-"us-central1-a"} 23 | 24 | cp -r /script $PV_LOC 25 | 26 | #echo "Preprocessing..." 27 | cd $PV_LOC 28 | echo $PV_LOC 29 | 30 | gcloud auth activate-service-account --key-file=/script/gcloud_key.json 31 | gcloud container clusters get-credentials $cluster --zone $zone --project $PROJECT 32 | gcloud config set project $PROJECT 33 | 34 | # Check if triton is deployed 35 | triton_status=$(helm status triton 2>&1) 36 | echo "Triton status: " 37 | echo $triton_status 38 | if [[ "$triton_status" == "Error: release: not found" ]]; then 39 | echo "Triton is not running. This is first deployment." 40 | echo "Preprocessing...." 
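# First deployment (no Triton release found): preprocess the raw Criteo parquet
# days with NVTabular, train the DCN from scratch with HugeCTR, and export a
# Triton model ensemble (dcn, dcn_nvt, dcn_ens) under $PV_LOC/models/.
# The else branch below handles triggered runs on incremental data instead.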
41 | ls -al $PV_LOC/criteo-data/crit_int_pq 42 | python3 -u $PV_LOC/script/preprocessing/nvt-preprocess.py -d $PV_LOC/criteo-data/crit_int_pq -o $PV_LOC/criteo-data/ -t 1 -v 1 -g 0 43 | 44 | echo "Training..." 45 | python3 -u $PV_LOC/script/training/hugectr-train-criteo-dcn.py --input_train $PV_LOC/criteo-data/test_dask/output/train/_file_list.txt --input_val $PV_LOC/criteo-data/test_dask/output/valid/_file_list.txt --max_iter 600 --snapshot 500 --num_gpus 0 46 | 47 | mkdir -p $PV_LOC/model/criteo_hugectr/1/ 48 | mv $PV_LOC/*.model $PV_LOC/model/criteo_hugectr/1/ 49 | 50 | mkdir -p $PV_LOC/models/ 51 | 52 | echo "Create ensemble" 53 | python3 -u $PV_LOC/script/training/create-nvt-hugectr-ensemble.py --nvt_workflow_path $PV_LOC/criteo-data/test_dask/output/workflow/ --hugectr_model_path $PV_LOC/model/criteo_hugectr/1/ --ensemble_output_path $PV_LOC/models/ --ensemble_config $PV_LOC/script/training/ensemble-config.json 54 | 55 | echo "Copy dcn.json" 56 | cp $PV_LOC/script/dcn_files/dcn.json $PV_LOC/models/dcn/1 57 | 58 | else 59 | echo "Triton is running. This is triggered run. Running incremental pre-processing" 60 | echo "Incremental preprocessing..." 61 | ls -al $PV_LOC/criteo-data/new_data 62 | python3 -u $PV_LOC/script/preprocessing/nvt-preprocess-incremental.py --input_train_dir $PV_LOC/criteo-data/new_data/ --output_dir $PV_LOC/criteo-data/output --workflow_dir $PV_LOC/criteo-data/test_dask/output/workflow/ --dask_workdir $PV_LOC/criteo-data/test_dask/workdir --num_gpus 0 63 | 64 | previous_version=$(ls $PV_LOC/model/criteo_hugectr/ -v | tail -n1) 65 | 66 | echo "Incremental Training..." 67 | python3 -u $PV_LOC/script/training/hugectr-train-criteo-dcn.py --input_train $PV_LOC/criteo-data/test_dask/output/train/_file_list.txt --input_val $PV_LOC/criteo-data/test_dask/output/valid/_file_list.txt --max_iter 600 --snapshot 500 --num_gpus 0 --dense_model_file $PV_LOC/model/criteo_hugectr/$previous_version/_dense_500.model --sparse_model_files $PV_LOC/model/criteo_hugectr/$previous_version/0_sparse_500.model 68 | 69 | new_version="$(($previous_version + 1))" 70 | 71 | mkdir -p $PV_LOC/model/criteo_hugectr/$new_version/ 72 | 73 | mv $PV_LOC/*.model $PV_LOC/model/criteo_hugectr/$new_version/ 74 | 75 | mkdir -p $PV_LOC/models_recurrent_runs 76 | 77 | echo "Incremental Create ensemble" 78 | python3 -u $PV_LOC/script/training/create-nvt-hugectr-ensemble.py --nvt_workflow_path $PV_LOC/criteo-data/test_dask/output/workflow/ --hugectr_model_path $PV_LOC/model/criteo_hugectr/$new_version/ --ensemble_output_path $PV_LOC/models_recurrent_runs --ensemble_config $PV_LOC/script/training/ensemble-config.json 79 | 80 | python3 -u $PV_LOC/script/dcn_files/format_dcn.py --model_version $new_version --dcn_path $PV_LOC/script/dcn_files/dcn.json 81 | 82 | mv $PV_LOC/models_recurrent_runs/dcn/1 $PV_LOC/models/dcn/$new_version 83 | mv $PV_LOC/models_recurrent_runs/dcn/config.pbtxt $PV_LOC/models/dcn/ 84 | cp $PV_LOC/script/dcn_files/dcn$new_version.json $PV_LOC/models/dcn/$new_version/dcn.json 85 | 86 | mv $PV_LOC/models_recurrent_runs/dcn_ens/1 $PV_LOC/models/dcn_ens/$new_version 87 | mv $PV_LOC/models_recurrent_runs/dcn_nvt/1 $PV_LOC/models/dcn_nvt/$new_version 88 | 89 | rm -rf $PV_LOC/models_recurrent_runs 90 | fi 91 | -------------------------------------------------------------------------------- /preprocess-train/preprocessing/nvt-preprocess-incremental.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | 17 | 18 | # Standard Libraries 19 | import os 20 | from time import time 21 | import re 22 | import shutil 23 | import glob 24 | import warnings 25 | import sys 26 | import argparse 27 | import logging 28 | 29 | # External Dependencies 30 | import numpy as np 31 | import pandas as pd 32 | import cupy as cp 33 | import cudf 34 | import dask_cudf 35 | from dask_cuda import LocalCUDACluster 36 | from dask.distributed import Client 37 | from dask.utils import parse_bytes 38 | from dask.delayed import delayed 39 | import rmm 40 | 41 | import nvtabular as nvt 42 | from nvtabular.utils import _pynvml_mem_size, device_mem_size 43 | 44 | def run_preprocessing(input_train_path, workflow_path, output_path, dask_workdir, num_gpus): 45 | fname = '{}.parquet' 46 | train_files = [i for i in os.listdir(input_train_path) if re.match(fname.format('.*'), i) is not None] 47 | train_paths = [os.path.join(input_train_path, filename) for filename in train_files] 48 | 49 | # Deploy a Dask Distributed Cluster 50 | # Single-Machine Multi-GPU Cluster 51 | protocol = "tcp" # "tcp" or "ucx" 52 | visible_devices = ",".join([str(n) for n in num_gpus]) # Select devices to place workers 53 | device_limit_frac = 0.4 # Spill GPU-Worker memory to host at this limit. 54 | device_pool_frac = 0.5 55 | part_mem_frac = 0.05 56 | 57 | # Use total device size to calculate args.device_limit_frac 58 | device_size = device_mem_size(kind="total") 59 | part_size = int(part_mem_frac * device_size) 60 | logging.info(f"Partition size: {part_size}") 61 | 62 | # Deploy Dask Distributed cluster only if asked for multiple GPUs 63 | if len(num_gpus) > 1: 64 | logging.info("Deploy Dask Distributed cluster...") 65 | 66 | device_limit = int(device_limit_frac * device_size) 67 | device_pool_size = int(device_pool_frac * device_size) 68 | 69 | logging.info("Checking if any device memory is already occupied...") 70 | # Check if any device memory is already occupied 71 | for dev in visible_devices.split(","): 72 | fmem = _pynvml_mem_size(kind="free", index=int(dev)) 73 | used = (device_size - fmem) / 1e9 74 | if used > 1.0: 75 | warnings.warn(f"BEWARE - {used} GB is already occupied on device {int(dev)}!") 76 | 77 | cluster = None # (Optional) Specify existing scheduler port 78 | if cluster is None: 79 | cluster = LocalCUDACluster( 80 | protocol = protocol, 81 | n_workers=len(visible_devices.split(",")), 82 | CUDA_VISIBLE_DEVICES = visible_devices, 83 | device_memory_limit = device_limit, 84 | local_directory=dask_workdir 85 | ) 86 | 87 | logging.info("Create the distributed client...") 88 | # Create the distributed client 89 | client = Client(cluster) 90 | 91 | logging.info("Initialize memory pools...") 92 | # Initialize RMM pool on ALL workers 93 | def _rmm_pool(): 94 | rmm.reinitialize( 95 | # RMM may require the pool size to be a multiple of 256.
96 | pool_allocator=True, 97 | initial_pool_size=(device_pool_size // 256) * 256, # Round down to a multiple of 256 98 | ) 99 | 100 | client.run(_rmm_pool) 101 | 102 | 103 | # Import the incremental .parquet data 104 | logging.info("Importing Data...") 105 | test_dataset = nvt.Dataset(train_paths, engine='parquet', part_size=part_size) 106 | 107 | logging.info("Loading workflow object...") 108 | workflow = nvt.Workflow.load(workflow_path) 109 | 110 | # Specify the column IDs: these must exactly match the columns used when preprocessing the train and valid datasets 111 | CONTINUOUS_COLUMNS = ['I' + str(x) for x in range(1,14)] 112 | CATEGORICAL_COLUMNS = ['C' + str(x) for x in range(1,27)] 113 | LABEL_COLUMNS = ['label'] 114 | dict_dtypes={} 115 | 116 | for col in CATEGORICAL_COLUMNS: 117 | dict_dtypes[col] = np.int64 118 | 119 | for col in CONTINUOUS_COLUMNS: 120 | dict_dtypes[col] = np.float32 121 | 122 | for col in LABEL_COLUMNS: 123 | dict_dtypes[col] = np.float32 124 | 125 | # Create output directory for the transformed data 126 | output_test_dir = os.path.join(output_path, 'train/') 127 | 128 | if not os.path.exists(output_test_dir): 129 | logging.info(f"Creating train/ directory at: {output_test_dir}") 130 | os.makedirs(output_test_dir) 131 | 132 | logging.info("Preprocessing Data...") 133 | workflow.transform(test_dataset).to_parquet(output_path=output_test_dir, 134 | dtypes=dict_dtypes, 135 | cats=CATEGORICAL_COLUMNS, 136 | conts=CONTINUOUS_COLUMNS, 137 | labels=LABEL_COLUMNS) 138 | 139 | logging.info("Done!") 140 | 141 | 142 | if __name__ == '__main__': 143 | parser = argparse.ArgumentParser() 144 | parser.add_argument('-t', 145 | '--input_train_dir', 146 | type=str, 147 | required=False, 148 | default='/crit_int_pq', 149 | help='Path to the input .parquet data dir. Default is /crit_int_pq') 150 | 151 | parser.add_argument('-o', 152 | '--output_dir', 153 | type=str, 154 | required=False, 155 | default='./test_dask/output/', 156 | help='Path for Output directory. Default is ./test_dask/output/') 157 | 158 | parser.add_argument('-w', 159 | '--workflow_dir', 160 | type=str, 161 | required=False, 162 | default='./test_dask/output/workflow/', 163 | help='Path to Saved Workflow object. This should be obtained from Preprocessing Training data. Default is ./test_dask/output/workflow') 164 | 165 | parser.add_argument('-e', 166 | '--dask_workdir', 167 | type=str, 168 | required=False, 169 | default='./test_dask/workdir', 170 | help='Working directory for Dask. Default is ./test_dask/workdir') 171 | 172 | parser.add_argument('-g', 173 | '--num_gpus', 174 | nargs='+', 175 | type=int, 176 | required=False, 177 | default=[0,1], 178 | help='GPU devices to use for Preprocessing') 179 | 180 | args = parser.parse_args() 181 | 182 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO, datefmt='%d-%m-%y %H:%M:%S') 183 | 184 | logging.info(f"Args: {args}") 185 | 186 | run_preprocessing(input_train_path=args.input_train_dir, 187 | workflow_path=args.workflow_dir, 188 | output_path=args.output_dir, 189 | dask_workdir=args.dask_workdir, 190 | num_gpus=args.num_gpus) 191 | -------------------------------------------------------------------------------- /preprocess-train/preprocessing/nvt-preprocess.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | # Standard Libraries 17 | import os 18 | from time import time 19 | import re 20 | import shutil 21 | import glob 22 | import warnings 23 | import argparse 24 | import logging 25 | 26 | # External Dependencies 27 | import numpy as np 28 | import cupy as cp 29 | import cudf 30 | import dask_cudf 31 | from dask_cuda import LocalCUDACluster 32 | from dask.distributed import Client 33 | from dask.utils import parse_bytes 34 | from dask.delayed import delayed 35 | import rmm 36 | 37 | # NVTabular 38 | import nvtabular as nvt 39 | from nvtabular.ops import Categorify, Clip, FillMissing, HashBucket, LambdaOp, LogOp, Rename, get_embedding_sizes, Normalize 40 | from nvtabular.io import Shuffle 41 | from nvtabular.utils import _pynvml_mem_size, device_mem_size 42 | 43 | 44 | def run_preprocessing(input_path, base_dir, num_train_days, num_val_days, num_gpus): 45 | 46 | # Define paths to save artifacts 47 | dask_workdir = os.path.join(base_dir, "test_dask/workdir") 48 | output_path = os.path.join(base_dir, "test_dask/output") 49 | stats_path = os.path.join(base_dir, "test_dask/stats") 50 | 51 | logging.info(f"Dask Workdir: {dask_workdir}") 52 | logging.info(f"Output Path: {output_path}") 53 | 54 | # Make sure we have a clean worker space for Dask 55 | if os.path.isdir(dask_workdir): 56 | shutil.rmtree(dask_workdir) 57 | os.makedirs(dask_workdir) 58 | 59 | # Make sure we have a clean stats space for Dask 60 | if os.path.isdir(stats_path): 61 | shutil.rmtree(stats_path) 62 | os.mkdir(stats_path) 63 | 64 | # Make sure we have a clean output path 65 | if os.path.isdir(output_path): 66 | shutil.rmtree(output_path) 67 | os.mkdir(output_path) 68 | 69 | logging.info("Created output directories..") 70 | 71 | # This requires the data files to follow the naming pattern day_<N>.parquet, e.g. day_0.parquet, day_1.parquet, etc. 72 | fname = 'day_{}.parquet' 73 | num_days = len([i for i in os.listdir(input_path) if re.match(fname.format('[0-9]{1,2}'), i) is not None]) 74 | train_paths = [os.path.join(input_path, fname.format(day)) for day in range(num_train_days)] 75 | valid_paths = [os.path.join(input_path, fname.format(day)) for day in range(num_train_days, num_train_days + num_val_days)] 76 | 77 | logging.info(f"Training data: {train_paths}") 78 | logging.info(f"Validation data: {valid_paths}") 79 | 80 | # Deploy a Dask Distributed Cluster 81 | # Single-Machine Multi-GPU Cluster 82 | protocol = "tcp" # "tcp" or "ucx" 83 | visible_devices = ",".join([str(n) for n in num_gpus]) # Select devices to place workers 84 | device_limit_frac = 0.4 # Spill GPU-Worker memory to host at this limit. 85 | device_pool_frac = 0.5 86 | part_mem_frac = 0.05 # Desired maximum size of each partition as a fraction of total GPU memory.
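    # Worked example of the fractions above (illustrative, assuming a single
    # GPU with ~16e9 bytes of memory): part_size caps each partition at about
    # 0.8 GB, workers start spilling to host memory past a device_limit of
    # about 6.4 GB, and each worker's RMM pool is about 8 GB.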
87 | 88 | # Use total device size to calculate args.device_limit_frac 89 | device_size = device_mem_size(kind="total") 90 | part_size = int(part_mem_frac * device_size) 91 | logging.info(f"Partition size: {part_size}") 92 | 93 | # Deploy Dask Distributed cluster only if asked for multiple GPUs 94 | if len(num_gpus) > 1: 95 | 96 | device_limit = int(device_limit_frac * device_size) 97 | device_pool_size = int(device_pool_frac * device_size) 98 | 99 | logging.info("Checking if any device memory is already occupied..") 100 | # Check if any device memory is already occupied 101 | for dev in visible_devices.split(","): 102 | fmem = _pynvml_mem_size(kind="free", index=int(dev)) 103 | used = (device_size - fmem) / 1e9 104 | if used > 1.0: 105 | warnings.warn(f"BEWARE - {used} GB is already occupied on device {int(dev)}!") 106 | 107 | cluster = None # (Optional) Specify existing scheduler port 108 | if cluster is None: 109 | cluster = LocalCUDACluster( 110 | protocol = protocol, 111 | n_workers=len(visible_devices.split(",")), 112 | CUDA_VISIBLE_DEVICES = visible_devices, 113 | device_memory_limit = device_limit, 114 | local_directory=dask_workdir 115 | ) 116 | 117 | logging.info("Create the distributed client..") 118 | # Create the distributed client 119 | client = Client(cluster) 120 | 121 | logging.info("Initialize memory pools..") 122 | # Initialize RMM pool on ALL workers 123 | def _rmm_pool(): 124 | rmm.reinitialize( 125 | # RMM may require the pool size to be a multiple of 256. 126 | pool_allocator=True, 127 | initial_pool_size=(device_pool_size // 256) * 256, 128 | ) 129 | 130 | client.run(_rmm_pool) 131 | 132 | # Preprocessing 133 | CONTINUOUS_COLUMNS = ['I' + str(x) for x in range(1,14)] 134 | CATEGORICAL_COLUMNS = ['C' + str(x) for x in range(1,27)] 135 | LABEL_COLUMNS = ['label'] 136 | COLUMNS = CONTINUOUS_COLUMNS + CATEGORICAL_COLUMNS + LABEL_COLUMNS 137 | 138 | cat_features = CATEGORICAL_COLUMNS >> Categorify(out_path=stats_path) 139 | cont_features = CONTINUOUS_COLUMNS >> FillMissing() >> Clip(min_value=0) >> Normalize() 140 | features = cat_features + cont_features + LABEL_COLUMNS 141 | 142 | logging.info("Defining a workflow object..") 143 | if len(num_gpus) > 1: 144 | workflow = nvt.Workflow(features, client=client) 145 | else: 146 | workflow = nvt.Workflow(features) 147 | 148 | dict_dtypes={} 149 | 150 | for col in CATEGORICAL_COLUMNS: 151 | dict_dtypes[col] = np.int64 152 | 153 | for col in CONTINUOUS_COLUMNS: 154 | dict_dtypes[col] = np.float32 155 | 156 | for col in LABEL_COLUMNS: 157 | dict_dtypes[col] = np.float32 158 | 159 | 160 | train_dataset = nvt.Dataset(train_paths, engine='parquet', part_size=part_size) 161 | valid_dataset = nvt.Dataset(valid_paths, engine='parquet', part_size=part_size) 162 | 163 | output_train_dir = os.path.join(output_path, 'train/') 164 | logging.info(f"Creating train/ directory at: {output_train_dir}") 165 | if not os.path.exists(output_train_dir): 166 | os.makedirs(output_train_dir) 167 | 168 | output_valid_dir = os.path.join(output_path, 'valid/') 169 | logging.info(f"Creating valid/ directory at: {output_valid_dir}") 170 | if not os.path.exists(output_valid_dir): 171 | os.makedirs(output_valid_dir) 172 | 173 | logging.info("Workflow Fit..") 174 | workflow.fit(train_dataset) 175 | 176 | logging.info("Transform Training data..") 177 | workflow.transform(train_dataset).to_parquet(output_path=output_train_dir, 178 | shuffle=nvt.io.Shuffle.PER_PARTITION, 179 | dtypes=dict_dtypes, 180 | cats=CATEGORICAL_COLUMNS, 181 | conts=CONTINUOUS_COLUMNS, 182 
| labels=LABEL_COLUMNS) 183 | 184 | logging.info("Transform Validation data..") 185 | workflow.transform(valid_dataset).to_parquet(output_path=output_valid_dir, 186 | dtypes=dict_dtypes, 187 | cats=CATEGORICAL_COLUMNS, 188 | conts=CONTINUOUS_COLUMNS, 189 | labels=LABEL_COLUMNS) 190 | 191 | 192 | # Use the printed cardinalities list as the "slot_size_array" in the HugeCTR training config, e.g. "dcn_parquet.json" 193 | cardinalities = [] 194 | for col in CATEGORICAL_COLUMNS: 195 | cardinalities.append(nvt.ops.get_embedding_sizes(workflow)[col][0]) 196 | 197 | logging.info(f"Cardinalities for configuring slot_size_array: {cardinalities}") 198 | 199 | logging.info(f"Saving workflow object at: {output_path + '/workflow'}") 200 | workflow.save(output_path + '/workflow') 201 | 202 | logging.info("Done!") 203 | 204 | 205 | if __name__ == '__main__': 206 | 207 | parser = argparse.ArgumentParser() 208 | parser.add_argument('-d', 209 | '--input_data_dir', 210 | type=str, 211 | required=False, 212 | default='/crit_int_pq', 213 | help='Path to the input Criteo parquet data dir. Default is /crit_int_pq') 214 | 215 | parser.add_argument('-o', 216 | '--output_dir', 217 | type=str, 218 | required=False, 219 | default='/var/lib/data/criteo-data/', 220 | help='Path for Output directory. It will create a directory "test_dask" to store artifacts. Default is /var/lib/data/criteo-data/') 221 | 222 | parser.add_argument('-t', 223 | '--n_train_days', 224 | type=int, 225 | required=False, 226 | default=1, 227 | help='Number of Criteo data days to use for training dataset. Default is 1. Keep n_train_days + n_val_days<=24') 228 | 229 | parser.add_argument('-v', 230 | '--n_val_days', 231 | type=int, 232 | required=False, 233 | default=1, 234 | help='Number of Criteo data days to take for validation set after n_train_days. Default is 1. Keep n_train_days + n_val_days<=24.') 235 | 236 | parser.add_argument('-g', 237 | '--num_gpus', 238 | nargs='+', 239 | type=int, 240 | required=False, 241 | default=[0,1,2,3,4,5,6,7], 242 | help='GPU devices to use for Preprocessing') 243 | 244 | args = parser.parse_args() 245 | 246 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO, datefmt='%d-%m-%y %H:%M:%S') 247 | 248 | logging.info(f"Args: {args}") 249 | 250 | run_preprocessing(input_path=args.input_data_dir, 251 | base_dir=args.output_dir, 252 | num_train_days=args.n_train_days, 253 | num_val_days=args.n_val_days, 254 | num_gpus=args.num_gpus) 255 | -------------------------------------------------------------------------------- /preprocess-train/training/create-nvt-hugectr-ensemble.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | # ============================================================================== 15 | 16 | import os 17 | import argparse 18 | import logging 19 | import json 20 | 21 | import nvtabular as nvt 22 | from nvtabular.inference.triton import export_hugectr_ensemble 23 | from nvtabular.ops import get_embedding_sizes 24 | 25 | 26 | def create_ensemble(workflow_path, hugectr_model_path, ensemble_output_path, ensemble_config_file): 27 | """ 28 | Creates an ensemble of NVTabular and HugeCTR model. 29 | 30 | This enables preprocessing at the time of inference, allowing the 31 | user to send raw data directly to the inference server. 32 | """ 33 | 34 | # Load the workflow object 35 | workflow = nvt.Workflow.load(workflow_path) 36 | 37 | # Verify that the workflow is loaded 38 | embeddings = get_embedding_sizes(workflow) 39 | logging.info(f"Embedding sizes for categorical features: {embeddings}") 40 | 41 | with open(ensemble_config_file, "r") as jsonfile: 42 | ensemble_config = json.load(jsonfile) 43 | 44 | hugectr_params = ensemble_config["hugectr_params"] 45 | 46 | # We override the config param to update the model version 47 | # Get the model version for updating the config accordingly 48 | model_version = hugectr_model_path.split('/')[-2] 49 | logging.info(f"Model version: {model_version}") 50 | model_json_path = hugectr_params["config"].split(os.sep) # "/model/models/dcn/1/dcn.json" -> ['', 'model', 'models', 'dcn', '1', 'dcn.json'] 51 | model_json_path[-2] = model_version # ['', 'model', 'models', 'dcn', '1', 'dcn.json'] -> ['', 'model', 'models', 'dcn', '2', 'dcn.json'] 52 | hugectr_params["config"] = os.sep + os.path.join(*model_json_path) # '/' + 'model/models/dcn/2/dcn.json' 53 | 54 | logging.info(f"HugeCTR configs: {hugectr_params}") 55 | 56 | categorical_cols = ensemble_config["categorical_cols"] 57 | continuous_cols = ensemble_config["continuous_cols"] 58 | label_cols = ensemble_config["label_cols"] 59 | 60 | logging.info(f"Categorical Columns: {categorical_cols}") 61 | logging.info(f"Continuous Columns: {continuous_cols}") 62 | logging.info(f"Label Columns: {label_cols}") 63 | 64 | logging.info(f"Generating the ensemble at directory: {ensemble_output_path}") 65 | export_hugectr_ensemble(workflow=workflow, 66 | hugectr_model_path=hugectr_model_path, 67 | hugectr_params=hugectr_params, 68 | name=ensemble_config["name"], 69 | output_path=ensemble_output_path, 70 | label_columns=label_cols, 71 | cats=categorical_cols, 72 | conts=continuous_cols, 73 | max_batch_size=ensemble_config["max_batch_size"]) 74 | 75 | if __name__ == '__main__': 76 | 77 | parser = argparse.ArgumentParser() 78 | parser.add_argument('-w', 79 | '--nvt_workflow_path', 80 | type=str, 81 | required=False, 82 | default='./test_dask/output/workflow', 83 | help='Path to Workflow Dir. Default is ./test_dask/output/workflow') 84 | 85 | parser.add_argument('-m', 86 | '--hugectr_model_path', 87 | type=str, 88 | required=False, 89 | default='/model/criteo_hugectr/1/', 90 | help='Path to where your .model files and inference .json is stored. Default is /model/criteo_hugectr/1/') 91 | 92 | parser.add_argument('-o', 93 | '--ensemble_output_path', 94 | type=str, 95 | required=False, 96 | default='/model/models/', 97 | help='Path to where your ensemble output must be stored. 
Default is /model/models') 98 | 99 | parser.add_argument('-c', 100 | '--ensemble_config', 101 | type=str, 102 | required=False, 103 | default='./ensemble-config.json', 104 | help='Path to where ensemble config .json') 105 | 106 | 107 | args = parser.parse_args() 108 | 109 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO, datefmt='%d-%m-%y %H:%M:%S') 110 | 111 | logging.info(f"Args: {args}") 112 | 113 | create_ensemble(workflow_path=args.nvt_workflow_path, 114 | hugectr_model_path=args.hugectr_model_path, 115 | ensemble_output_path=args.ensemble_output_path, 116 | ensemble_config_file=args.ensemble_config 117 | ) 118 | -------------------------------------------------------------------------------- /preprocess-train/training/ensemble-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "dcn", 3 | "categorical_cols": ["C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10", "C11", "C12", "C13", "C14", "C15", "C16", "C17", "C18", "C19", "C20", "C21", "C22", "C23", "C24", "C25", "C26"], 4 | "continuous_cols": ["I1", "I2", "I3", "I4", "I5", "I6", "I7", "I8", "I9", "I10", "I11", "I12", "I13"], 5 | "label_cols": ["label"], 6 | "max_batch_size": 64, 7 | "hugectr_params": { 8 | "config": "/model/models/dcn/1/dcn.json", 9 | "slots": 26, 10 | "max_nnz": 1, 11 | "embedding_vector_size": 16, 12 | "n_outputs": 1 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /preprocess-train/training/hugectr-train-criteo-dcn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | 16 | # Standard Libraries 17 | import argparse 18 | import logging 19 | 20 | import hugectr 21 | from mpi4py import MPI 22 | 23 | 24 | def train(input_train, input_val, max_iter, 25 | batchsize, snapshot, num_gpus, eval_interval, 26 | dense_model_file, sparse_model_files): 27 | 28 | logging.info(f"GPU Devices: {num_gpus}") 29 | 30 | # Configure and define the HugeCTR model 31 | solver = hugectr.solver_parser_helper(num_epochs = 0, 32 | max_iter = max_iter, 33 | max_eval_batches = 100, 34 | batchsize_eval = batchsize, 35 | batchsize = batchsize, 36 | model_file = dense_model_file, 37 | embedding_files = sparse_model_files, 38 | display = 200, 39 | eval_interval = eval_interval, 40 | i64_input_key = True, 41 | use_mixed_precision = False, 42 | repeat_dataset = True, 43 | snapshot = snapshot, 44 | vvgpu = [num_gpus], 45 | use_cuda_graph = False 46 | ) 47 | 48 | optimizer = hugectr.optimizer.CreateOptimizer(optimizer_type = hugectr.Optimizer_t.Adam, 49 | use_mixed_precision = False) 50 | model = hugectr.Model(solver, optimizer) 51 | 52 | # The slot_size_array are the cardinalities of each categorical feature after NVTabular preprocessing 53 | model.add(hugectr.Input(data_reader_type = hugectr.DataReaderType_t.Parquet, 54 | source = input_train, 55 | eval_source = input_val, 56 | check_type = hugectr.Check_t.Non, 57 | label_dim = 1, label_name = "label", 58 | dense_dim = 13, dense_name = "dense", 59 | slot_size_array = [18576837, 29428, 15128, 7296, 19902, 4, 6466, 1311, 62, 11700067, 622921, 219557, 11, 2209, 9780, 71, 4, 964, 15, 22022124, 4384510, 15960286, 290588, 10830, 96, 35], 60 | data_reader_sparse_param_array = 61 | [hugectr.DataReaderSparseParam(hugectr.DataReaderSparse_t.Distributed, 30, 1, 26)], 62 | sparse_names = ["data1"])) 63 | 64 | # Sparse Embedding Layer 65 | model.add(hugectr.SparseEmbedding(embedding_type = hugectr.Embedding_t.DistributedSlotSparseEmbeddingHash, 66 | max_vocabulary_size_per_gpu = 88656602, 67 | embedding_vec_size = 16, 68 | combiner = 0, 69 | sparse_embedding_name = "sparse_embedding1", 70 | bottom_name = "data1")) 71 | model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Reshape, 72 | bottom_names = ["sparse_embedding1"], 73 | top_names = ["reshape1"], 74 | leading_dim=416)) 75 | 76 | # Concatenate sparse embedding and dense input 77 | model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Concat, 78 | bottom_names = ["reshape1", "dense"], top_names = ["concat1"])) 79 | model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Slice, 80 | bottom_names = ["concat1"], 81 | top_names = ["slice11", "slice12"], 82 | ranges=[(0,429),(0,429)])) 83 | 84 | model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.MultiCross, 85 | bottom_names = ["slice11"], 86 | top_names = ["multicross1"], 87 | num_layers=6)) 88 | 89 | model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.InnerProduct, 90 | bottom_names = ["slice12"], 91 | top_names = ["fc1"], 92 | num_output=1024)) 93 | model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.ReLU, 94 | bottom_names = ["fc1"], 95 | top_names = ["relu1"])) 96 | model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Dropout, 97 | bottom_names = ["relu1"], 98 | top_names = ["dropout1"], 99 | dropout_rate=0.5)) 100 | model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.InnerProduct, 101 | bottom_names = ["dropout1"], 102 | top_names = ["fc2"], 103 | num_output=1024)) 104 | model.add(hugectr.DenseLayer(layer_type = 
hugectr.Layer_t.ReLU, 105 | bottom_names = ["fc2"], 106 | top_names = ["relu2"])) 107 | model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Dropout, 108 | bottom_names = ["relu2"], 109 | top_names = ["dropout2"], 110 | dropout_rate=0.5)) 111 | 112 | model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Concat, 113 | bottom_names = ["dropout2", "multicross1"], 114 | top_names = ["concat2"])) 115 | model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.InnerProduct, 116 | bottom_names = ["concat2"], 117 | top_names = ["fc3"], 118 | num_output=1)) 119 | model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.BinaryCrossEntropyLoss, 120 | bottom_names = ["fc3", "label"], 121 | top_names = ["loss"])) 122 | model.compile() 123 | model.summary() 124 | model.fit() 125 | 126 | 127 | if __name__ == '__main__': 128 | 129 | parser = argparse.ArgumentParser() 130 | parser.add_argument('-t', 131 | '--input_train', 132 | type=str, 133 | required=False, 134 | default='/mlops/scripts/test-script/test_dask/output/train/_file_list.txt', 135 | help='Path to training data _file_list.txt') 136 | 137 | parser.add_argument('-v', 138 | '--input_val', 139 | type=str, 140 | required=False, 141 | default='/mlops/scripts/test-script/test_dask/output/valid/_file_list.txt', 142 | help='Path to validation data _file_list.txt') 143 | 144 | parser.add_argument('-i', 145 | '--max_iter', 146 | type=int, 147 | required=False, 148 | default=20000, 149 | help='Number of training iterations') 150 | 151 | parser.add_argument('-b', 152 | '--batchsize', 153 | type=int, 154 | required=False, 155 | default=2048, 156 | help='Batch size') 157 | 158 | parser.add_argument('-s', 159 | '--snapshot', 160 | type=int, 161 | required=False, 162 | default=10000, 163 | help='Saves a model snapshot after the given number of iterations') 164 | 165 | parser.add_argument('-g', 166 | '--num_gpus', 167 | nargs='+', 168 | type=int, 169 | required=False, 170 | default=[0,1], 171 | help='GPU devices to use for training') 172 | 173 | parser.add_argument('-r', 174 | '--eval_interval', 175 | type=int, 176 | required=False, 177 | default=1000, 178 | help='Run evaluation after the given number of iterations') 179 | 180 | parser.add_argument('-d', 181 | '--dense_model_file', 182 | type=str, 183 | required=False, 184 | default=None, 185 | help='Path to an existing dense model. If provided, resumes training from here. E.g. ./_dense_19500.model ') 186 | 187 | parser.add_argument('-m', 188 | '--sparse_model_files', 189 | type=str, 190 | nargs='+', 191 | required=False, 192 | default=None, 193 | help='Paths to existing sparse snapshots. If provided, resumes training from here. E.g.
--sparse_model_files ./model-snapshot/0_sparse_19500.model ./model-snapshot/0_sparse_19500.model') 194 | 195 | args = parser.parse_args() 196 | 197 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO, datefmt='%d-%m-%y %H:%M:%S') 198 | 199 | logging.info(f"Args: {args}") 200 | 201 | # Both the dense and sparse model files should be provided if either one is provided 202 | if args.dense_model_file and args.sparse_model_files: 203 | logging.info("Training from previously saved model...") 204 | logging.info(f"Dense model file: {args.dense_model_file}") 205 | logging.info(f"Sparse model file: {args.sparse_model_files}") 206 | dense_model_file = args.dense_model_file 207 | sparse_model_files = args.sparse_model_files 208 | elif (args.dense_model_file and args.sparse_model_files is None) or \ 209 | (args.sparse_model_files and args.dense_model_file is None): 210 | parser.error("--dense_model_file and --sparse_model_files both need to be provided together.") 211 | else: 212 | logging.info("No previous checkpoint/model provided. Training from scratch. ") 213 | dense_model_file = "" 214 | sparse_model_files = [] 215 | 216 | train(input_train=args.input_train, 217 | input_val=args.input_val, 218 | max_iter=args.max_iter, 219 | batchsize=args.batchsize, 220 | snapshot=args.snapshot, 221 | eval_interval=args.eval_interval, 222 | num_gpus=args.num_gpus, 223 | dense_model_file=dense_model_file, 224 | sparse_model_files=sparse_model_files 225 | ) -------------------------------------------------------------------------------- /run_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # ============================================================================== 17 | 18 | 19 | PROJECT_ID=${1:-"dl-tme"} 20 | GCLOUD_KEY=${2:-"gcloud_key.json"} 21 | 22 | COPY_CONTAINER=gcr.io/$PROJECT_ID/google-nvidia-cloud-sdk:0.5.1 23 | TRAIN_CONTAINER=gcr.io/$PROJECT_ID/merlin/merlin-training:0.5.1 24 | MONITOR_COMPONENT=gcr.io/$PROJECT_ID/monitoring:0.5.1 25 | VALIDATE_CONTAINER=gcr.io/$PROJECT_ID/validation:0.5.1 26 | 27 | bash build_copy_container.sh $PROJECT_ID $GCLOUD_KEY 28 | COPY_CONTAINER=$(docker inspect --format="{{index .RepoDigests 0}}" gcr.io/$PROJECT_ID/google-nvidia-cloud-sdk:0.5.1) 29 | DEPLOY_CONTAINER=$COPY_CONTAINER 30 | 31 | bash build_validation_component.sh $PROJECT_ID $GCLOUD_KEY 32 | VALIDATE_CONTAINER=$(docker inspect --format="{{index .RepoDigests 0}}" gcr.io/$PROJECT_ID/validation:0.5.1) 33 | 34 | bash build_training_container.sh $PROJECT_ID $GCLOUD_KEY 35 | TRAIN_CONTAINER=$(docker inspect --format="{{index .RepoDigests 0}}" gcr.io/$PROJECT_ID/merlin/merlin-training:0.5.1) 36 | 37 | bash build_monitoring_component.sh $PROJECT_ID $GCLOUD_KEY 38 | MONITOR_COMPONENT=$(docker inspect --format="{{index .RepoDigests 0}}" gcr.io/$PROJECT_ID/monitoring:0.5.1) 39 | 40 | 41 | source activate mlpipeline 42 | python3 merlin-pipeline.py -vc $VALIDATE_CONTAINER -dex $COPY_CONTAINER -tc $TRAIN_CONTAINER -dc $DEPLOY_CONTAINER -mc $MONITOR_COMPONENT -------------------------------------------------------------------------------- /validation/generate-stats.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | 17 | 18 | import os 19 | import argparse 20 | 21 | import pandas as pd 22 | import tensorflow_data_validation as tfdv 23 | from google.protobuf.json_format import MessageToDict 24 | 25 | 26 | 27 | if __name__ == "__main__": 28 | parser = argparse.ArgumentParser() 29 | 30 | parser.add_argument('-d', 31 | '--data_dir', 32 | type=str, 33 | required=False, 34 | default='/crit_int_pq/day_23.parquet', 35 | help='Path to a data .parquet file. 
Default is /crit_int_pq/day_23.parquet') 36 | 37 | parser.add_argument('-o', 38 | '--output_dir', 39 | type=str, 40 | required=False, 41 | default='./output', 42 | help='Path to where stats must be saved') 43 | 44 | parser.add_argument('-f', 45 | '--file_name', 46 | type=str, 47 | required=False, 48 | default='stats.txt', 49 | help='Name of the stats file') 50 | 51 | 52 | args = parser.parse_args() 53 | 54 | 55 | # tfdv doesn't support generating stats directly from parquet, 56 | # so read through the pandas parquet reader. 57 | # Ideally, this should be an accelerated parquet reader, and stats 58 | # computation should happen on the GPU 59 | df = pd.read_parquet(args.data_dir) 60 | 61 | stats = tfdv.generate_statistics_from_dataframe(df) 62 | 63 | if not os.path.exists(args.output_dir): 64 | os.makedirs(args.output_dir) 65 | 66 | output_path = os.path.join(args.output_dir, args.file_name) 67 | 68 | tfdv.write_stats_text(stats, output_path=output_path) 69 | -------------------------------------------------------------------------------- /validation/run_validation.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | 19 | PV_LOC=${1:-"/var/lib/data"} 20 | VALIDATION=${2:-'False'} 21 | 22 | if [ -d "$PV_LOC/stats" ] && [ -f "$PV_LOC/stats/stats.txt" ]; then 23 | previous_version=$(ls $PV_LOC/stats/ -v | tail -n1 | tr -dc '0-9') 24 | new_version="$((${previous_version:-1} + 1))" 25 | new_file="$(ls $PV_LOC/criteo-data/new_data/ | shuf -n 1)" 26 | 27 | echo "Generating stats for new data..." 28 | python3 -u /script/generate-stats.py --data_dir $PV_LOC/criteo-data/new_data/$new_file --output_dir $PV_LOC/stats/ --file_name "stats"$new_version".txt" 29 | 30 | echo "Validating stats..." 31 | python3 -u /script/validate-stats.py --stats_file_1 $PV_LOC/stats/stats.txt --stats_file_2 $PV_LOC/stats/"stats"$new_version".txt" 32 | 33 | else 34 | if [[ "$VALIDATION" == 'True' ]]; then 35 | mkdir -p $PV_LOC/stats/ 36 | 37 | echo "Generating stats for training data..." 38 | python3 -u /script/generate-stats.py --data_dir $PV_LOC/criteo-data/crit_int_pq/day_0.parquet --output_dir $PV_LOC/stats/ --file_name "stats.txt" 39 | else 40 | echo "Not generating stats..." 41 | fi 42 | 43 | fi 44 | -------------------------------------------------------------------------------- /validation/validate-stats.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | 17 | import os 18 | import argparse 19 | import logging 20 | 21 | import pandas as pd 22 | import tensorflow_data_validation as tfdv 23 | from google.protobuf.json_format import MessageToDict 24 | 25 | 26 | if __name__ == "__main__": 27 | parser = argparse.ArgumentParser() 28 | 29 | parser.add_argument('-t', 30 | '--stats_file_1', 31 | type=str, 32 | required=False, 33 | default='./train_stats/stats.txt', 34 | help='Path to the training/reference stats .txt file ') 35 | 36 | parser.add_argument('-v', 37 | '--stats_file_2', 38 | type=str, 39 | required=False, 40 | default='./val_stats/stats.txt', 41 | help='Path to the validation stats .txt file ') 42 | 43 | 44 | args = parser.parse_args() 45 | 46 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO, datefmt='%d-%m-%y %H:%M:%S') 47 | logging.info(f"Args: {args}") 48 | 49 | 50 | stats1 = tfdv.load_stats_text(input_path=args.stats_file_1) 51 | stats2 = tfdv.load_stats_text(input_path=args.stats_file_2) 52 | 53 | schema1 = tfdv.infer_schema(statistics=stats1) 54 | 55 | # Custom rules, tweak this as required. This is just an example 56 | tfdv.get_feature(schema1, 'I1').drift_comparator.jensen_shannon_divergence.threshold = 0.06 57 | 58 | # Calculate drift between the reference stats stats1, and the statistics from new data in stats2 59 | drift_anomalies = tfdv.validate_statistics(statistics=stats2, 60 | schema=schema1, 61 | previous_statistics=stats1) 62 | 63 | # Convert the .pb2 to dict 64 | drift = MessageToDict(drift_anomalies) 65 | 66 | value = drift['driftSkewInfo'][0]['driftMeasurements'][0]['value'] 67 | threshold = drift['driftSkewInfo'][0]['driftMeasurements'][0]['threshold'] 68 | logging.info(f"JS divergence value: {value}, and JS divergence threshold: {threshold}") 69 | drift_detected = True 70 | if value < threshold: 71 | drift_detected = False 72 | logging.info(f"Drift detected: {drift_detected}") 73 | -------------------------------------------------------------------------------- /yamls/Autoscaling_custom_metrics/1_custom-metric-server-config.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | 16 | 17 | apiVersion: v1 18 | kind: ConfigMap 19 | metadata: 20 | name: adapter-config 21 | namespace: custom-metrics 22 | data: 23 | adapter-config-data: | 24 | rules: 25 | - seriesQuery: 'nv_inference_queue_duration_us{namespace="default"}' 26 | resources: 27 | overrides: 28 | namespace: 29 | resource: "namespace" 30 | pod: 31 | resource: pod 32 | name: 33 | matches: "nv_inference_queue_duration_us" 34 | as: "avg_time_queue_ms" 35 | metricsQuery: 'avg(delta(nv_inference_queue_duration_us{<<.LabelMatchers>>}[30s])/(1+delta(nv_inference_request_success{<<.LabelMatchers>>}[30s]))/1000) by (<<.GroupBy>>)' 36 | -------------------------------------------------------------------------------- /yamls/Autoscaling_custom_metrics/2_custom-metric-server.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | 17 | apiVersion: apps/v1 18 | kind: Deployment 19 | metadata: 20 | name: custom-metrics-apiserver 21 | namespace: custom-metrics 22 | labels: 23 | app: custom-metrics-apiserver 24 | spec: 25 | replicas: 1 26 | selector: 27 | matchLabels: 28 | app: custom-metrics-apiserver 29 | template: 30 | metadata: 31 | labels: 32 | app: custom-metrics-apiserver 33 | spec: 34 | containers: 35 | - name: custom-metrics-server 36 | #image: quay.io/coreos/k8s-prometheus-adapter-amd64:v0.4.1 37 | image: directxman12/k8s-prometheus-adapter-amd64 38 | args: 39 | #- --prometheus-url=http://kube-prometheus-stack-1616-prometheus:9090 40 | - --prometheus-url=http://10.4.6.5:9090 41 | - --metrics-relist-interval=30s 42 | - --v=10 43 | - --config=/etc/config/adapter-config.yaml 44 | ports: 45 | - containerPort: 443 46 | volumeMounts: 47 | - name: config-volume 48 | mountPath: /etc/config 49 | securityContext: 50 | runAsUser: 0 51 | volumes: 52 | - name: config-volume 53 | configMap: 54 | name: adapter-config 55 | items: 56 | - key: adapter-config-data 57 | path: adapter-config.yaml 58 | 59 | --- 60 | apiVersion: v1 61 | kind: Service 62 | metadata: 63 | name: api 64 | namespace: custom-metrics 65 | spec: 66 | selector: 67 | app: custom-metrics-apiserver 68 | ports: 69 | - port: 443 70 | targetPort: 443 71 | --- 72 | apiVersion: apiregistration.k8s.io/v1 73 | kind: APIService 74 | metadata: 75 | name: v1beta1.custom.metrics.k8s.io 76 | spec: 77 | insecureSkipTLSVerify: true 78 | group: custom.metrics.k8s.io 79 | groupPriorityMinimum: 1000 80 | versionPriority: 5 81 | service: 82 | name: api 83 | namespace: custom-metrics 84 | version: v1beta1 85 | --- -------------------------------------------------------------------------------- /yamls/Autoscaling_custom_metrics/3_custom-metrics-server-rbac.yaml: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | 17 | kind: ServiceAccount 18 | apiVersion: v1 19 | metadata: 20 | name: custom-metrics-apiserver 21 | namespace: custom-metrics 22 | --- 23 | apiVersion: rbac.authorization.k8s.io/v1 24 | kind: ClusterRoleBinding 25 | metadata: 26 | name: custom-metrics:system:auth-delegator 27 | roleRef: 28 | apiGroup: rbac.authorization.k8s.io 29 | kind: ClusterRole 30 | name: system:auth-delegator 31 | subjects: 32 | - kind: ServiceAccount 33 | name: custom-metrics-apiserver 34 | namespace: custom-metrics 35 | --- 36 | apiVersion: rbac.authorization.k8s.io/v1 37 | kind: RoleBinding 38 | metadata: 39 | name: custom-metrics-auth-reader 40 | namespace: kube-system 41 | roleRef: 42 | apiGroup: rbac.authorization.k8s.io 43 | kind: Role 44 | name: extension-apiserver-authentication-reader 45 | subjects: 46 | - kind: ServiceAccount 47 | name: custom-metrics-apiserver 48 | namespace: custom-metrics 49 | --- 50 | apiVersion: rbac.authorization.k8s.io/v1 51 | kind: ClusterRoleBinding 52 | metadata: 53 | name: custom-metrics-resource-reader 54 | roleRef: 55 | apiGroup: rbac.authorization.k8s.io 56 | kind: ClusterRole 57 | name: custom-metrics-resource-reader 58 | subjects: 59 | - kind: ServiceAccount 60 | name: custom-metrics-apiserver 61 | namespace: custom-metrics 62 | --- 63 | apiVersion: rbac.authorization.k8s.io/v1 64 | kind: ClusterRole 65 | metadata: 66 | name: custom-metrics-server-resources 67 | rules: 68 | - apiGroups: 69 | - custom.metrics.k8s.io 70 | resources: ["*"] 71 | verbs: ["*"] 72 | --- 73 | apiVersion: rbac.authorization.k8s.io/v1 74 | kind: ClusterRole 75 | metadata: 76 | name: custom-metrics-resource-reader 77 | rules: 78 | - apiGroups: 79 | - "" 80 | resources: 81 | - namespaces 82 | - pods 83 | - services 84 | verbs: 85 | - get 86 | - list 87 | --- 88 | apiVersion: rbac.authorization.k8s.io/v1 89 | kind: ClusterRoleBinding 90 | metadata: 91 | name: hpa-controller-custom-metrics 92 | roleRef: 93 | apiGroup: rbac.authorization.k8s.io 94 | kind: ClusterRole 95 | name: custom-metrics-server-resources 96 | subjects: 97 | - kind: ServiceAccount 98 | name: horizontal-pod-autoscaler 99 | namespace: kube-system 100 | --- -------------------------------------------------------------------------------- /yamls/Autoscaling_custom_metrics/4_triton-hpa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | 17 | apiVersion: autoscaling/v2beta1 18 | kind: HorizontalPodAutoscaler 19 | metadata: 20 | name: trtis-metrics-app-hpa 21 | spec: 22 | scaleTargetRef: 23 | apiVersion: apps/v1beta1 24 | kind: Deployment 25 | name: triton-triton-inference-server 26 | minReplicas: 1 27 | maxReplicas: 2 28 | metrics: 29 | - type: Object 30 | object: 31 | target: 32 | kind: Namespace 33 | name: default 34 | metricName: avg_time_queue_ms 35 | targetValue: 200m -------------------------------------------------------------------------------- /yamls/pv.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | 17 | apiVersion: v1 18 | kind: PersistentVolume 19 | metadata: 20 | name: my-file-server 21 | spec: 22 | capacity: 23 | storage: 1T 24 | accessModes: 25 | - ReadWriteMany 26 | nfs: 27 | path: /myVolume 28 | server: 10.0.0.2 -------------------------------------------------------------------------------- /yamls/pvc.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | apiVersion: v1 17 | kind: PersistentVolumeClaim 18 | metadata: 19 | name: my-volume-claim 20 | spec: 21 | accessModes: 22 | - ReadWriteMany 23 | resources: 24 | requests: 25 | storage: 1T 26 | storageClassName: "" 27 | --------------------------------------------------------------------------------
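A deployment sketch (illustrative; the repo does not ship this as a script): the storage and autoscaling manifests above are applied with kubectl before the pipeline runs, for example:

    kubectl apply -f yamls/pv.yaml
    kubectl apply -f yamls/pvc.yaml
    kubectl create namespace custom-metrics
    kubectl apply -f yamls/Autoscaling_custom_metrics/
    # Confirm the adapter serves Triton's queue-time metric to the HPA:
    kubectl get --raw "/apis/custom.metrics.k8s.io/v1beta1/namespaces/default/metrics/avg_time_queue_ms"

Note that the NFS server address in pv.yaml (10.0.0.2) and the Prometheus URL hard-coded in 2_custom-metric-server.yaml (http://10.4.6.5:9090) must be adjusted to the target environment.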