├── Dockerfile
├── LICENSE
├── README.md
├── cluster_crd.yaml
├── glue-cluster.yaml
├── glue-operator.yaml
├── glue_benchmark.py
├── kill_actor.py
├── requirements.txt
├── simple_check_s3.py
└── template-s3-creds.yaml


/Dockerfile:
--------------------------------------------------------------------------------
 1 | # docker build -f Dockerfile -t glue-runtime:ray-1.12.0 ./
 2 | 
 3 | FROM rayproject/ray:1.12.0-gpu
 4 | 
 5 | RUN sudo apt-get update && sudo apt-get install -y \
 6 |     build-essential iperf \
 7 |     && sudo rm -rf /var/lib/apt/lists/* \
 8 |     && sudo apt-get clean
 9 | 
10 | RUN mkdir /home/ray/glue
11 | WORKDIR /home/ray/glue
12 | 
13 | COPY requirements.txt ./
14 | RUN pip install --no-cache-dir -r requirements.txt && rm requirements.txt
15 | RUN pip install --no-cache-dir torch==1.7.1+cu110 -f https://download.pytorch.org/whl/torch_stable.html
16 | 
17 | # change group permissions for running in OCP
18 | RUN sudo chgrp 0 /home/ray/glue
19 | RUN chmod g+w /home/ray/glue
20 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Scaling transfer learning tasks using CodeFlare on OpenShift Container Platform (OCP)
 2 | 
 3 | Foundation models (e.g., BERT, GPT-3, RoBERTa) are trained on a large corpus of data and enable a wide variety of downstream tasks such as sentiment analysis, Q&A, and classification. This repository demonstrates how an enterprise can take a foundation model and run downstream tasks in a parallel manner on a Hybrid Cloud platform.
 4 | 
 5 | We use RoBERTa as our base model and run the [GLUE benchmark](https://gluebenchmark.com) that consists of 10 downstream tasks, each with 10 seeds. Each of these tasks is transformed to a [`ray` task](https://docs.ray.io/en/latest/walkthrough.html) by using the `@ray.remote` annotation with a single GPU allocated for each task.
 6 | 
 7 | ## Setting up an OpenShift cluster
 8 | 
 9 | We assume that the user of this repoistory has an [OpenShift](https://www.redhat.com/en/technologies/cloud-computing/openshift) cluster setup with the [GPU operator](https://docs.nvidia.com/datacenter/cloud-native/). We also assume that the end user has [OpenShift CLI](https://docs.openshift.com/container-platform/4.2/cli_reference/openshift_cli/getting-started-cli.html#cli-installing-cli_cli-developer-commands) installed and have their data in an S3 compatible object storage. Python scripts for downloading all GLUE data are avaible [here](https://github.com/nyu-mll/GLUE-baselines#downloading-glue).
10 | 
11 | ## Creating the S3 objects for roberta-base and glue_data 
12 | 
13 | Both objects should be placed into the same S3 bucket.  
14 | 
15 | Create the RoBERTa base model S3 object with key="roberta-base" and contents=roberta-base.tgz
16 | ```
17 | - git clone https://huggingface.co/roberta-base
18 | - tar -czf roberta-base.tgz roberta-base
19 | ```
20 | 
21 | Create the S3 object for the GLUE datasets with key=glue_data and contents=glue_data.tgz  
22 | ```
23 | - python download_glue_data.py --data_dir glue_data --tasks all
24 | - tar -czf glue_data.tgz glue_data
25 | ```
26 | 
27 | 
28 | ## Running glue_benchmark
29 | 
30 | 1. Log into OCP using the `oc login` command (On IBM Cloud, one can go to the menu under IAM#<your username/email>, then "Copy Login Command").  
31 | 
32 | 2. Use `oc project` to confirm your namespace is as desired. You can switch to your desired namespace by:
33 | ```
34 | $ oc project {your-namespace}
35 | ```
36 | 
37 | 3. Use provided `template-s3-creds.yaml` and create a personal `yaml` secrets file with your namespace and S3 credentials. Note that to use AWS S3 storage, the value for ENDPOINT_URL should be empty. The program `simple_check_s3.py` can be used to validate S3 access from the head node.  
38 | Then register the secrets:
39 | ```
40 | $ oc create -f {your-handle}-s3-creds.yaml
41 | ```
42 | 
43 | 4. [Required only once] Check if Ray CRD is installed.
44 | ```
45 | $ oc get crd | grep ray
46 | ```
47 | You can install the Ray CRD using:
48 | ```
49 | $ oc apply -f cluster_crd.yaml  
50 | ```
51 | 
52 | 5. Create a `ray` operator in your namespace:
53 | ```
54 | $ oc apply -f glue-operator.yaml
55 | ```
56 | 
57 | 6. Create a `ray` cluster in your namespace. Change the `min` and `max` number of workers as needed (around line 100)
58 | ```
59 |  $ oc apply -f glue-cluster.yaml 
60 | ```
61 | 
62 | 7. If the container images are not cached on OCP nodes they will be pulled; this can take 5-10 minutes or more. When the `ray` cluster head and worker pods are in ready state, copy the application driver to the head node:
63 | ```
64 | $ oc get po --watch
65 | $ oc cp glue_benchmark.py glue-cluster-head-XXXXX:/home/ray/glue/
66 | ```
67 | 
68 | 8. Exec into the head node and run the application. For example:
69 | ```
70 | $ oc exec -it glue-cluster-head-cjgzk -- /bin/bash
71 | (base) 1000650000@glue-cluster-head-cjgzk:~/glue$ nohup ./glue_benchmark -b {bucket-name} -m roberta-base -t WNLI -M &
72 | ```
73 | This will run the GLUE benchmark, a set of downstream tasks on RoBERTa base model against the WNLI task with 10 different seeds, and save the model from the seed with best score. Before the compution starts, GLUE datasets and base model must be loaded into each worker node. Data loading is a two step process: first the S3 objects are pulled and cached locally in plasma, and then each worker pulls the data from plasma and unpacks it in its local filesystem. Additional processing with the same cluster will reuse the local data.
74 | 
75 | 9. Monitor progress using `nohup.out`. The evaluation results, along with the remote consoles in log.log files, will be in `/tmp/summary`.
76 | 
77 | 10. When finished, clean up the active resources in your project:
78 | ```
79 | $ oc delete -f glue-cluster.yaml
80 | $ oc delete -f glue-operator.yaml
81 | ```
82 | 
83 | ## Conclusion
84 | 
85 | This demonstrates how we can run downstream fine tuning tasks in parallel on a GPU enabled OpenShift cluster. Users can take arbitrary fine tuning tasks written by data scientists and following the pattern in this repository scale it out on their Hybrid Cloud environment. The data will never leave the user's environment and all the GPUs can be leveraged during the transfer learning process. In our experiments, we observed that all the 8 GPUs on four nodes were leveraged for training the various downstream tasks.
86 | 


--------------------------------------------------------------------------------
/glue-cluster.yaml:
--------------------------------------------------------------------------------
  1 | apiVersion: cluster.ray.io/v1
  2 | kind: RayCluster
  3 | metadata:
  4 |   name: glue-cluster
  5 | spec:
  6 |   # The maximum number of workers nodes to launch in addition to the head node.
  7 |   maxWorkers: 10
  8 |   # The autoscaler will scale up the cluster faster with higher upscaling speed.
  9 |   # E.g., if the task requires adding more nodes then autoscaler will gradually
 10 |   # scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
 11 |   # This number should be > 0.
 12 |   upscalingSpeed: 0.0
 13 |   # If a node is idle for this many minutes, it will be removed.
 14 |   idleTimeoutMinutes: 99999
 15 |   # Specify the pod type for the ray head node (as configured below).
 16 |   headPodType: head-node
 17 |   # Specify the allowed pod types for this ray cluster and the resources they provide.
 18 |   podTypes:
 19 |   - name: head-node
 20 |     # Minimum number of Ray workers of this Pod type.
 21 |     minWorkers: 0
 22 |     # Maximum number of Ray workers of this Pod type. Takes precedence over minWorkers.
 23 |     maxWorkers: 0
 24 |     rayResources: {"GPU": 0}
 25 |     podConfig:
 26 |       apiVersion: v1
 27 |       kind: Pod
 28 |       metadata:
 29 |         # Automatically generates a name for the pod with this prefix.
 30 |         generateName: head-
 31 |       spec:
 32 |         restartPolicy: Never
 33 | 
 34 |         # This volume allocates shared memory for Ray to use for its plasma
 35 |         # object store. If you do not provide this, Ray will fall back to
 36 |         # /tmp which cause slowdowns if is not a shared memory volume.
 37 |         volumes:
 38 |         - name: dshm
 39 |           emptyDir:
 40 |             medium: Memory
 41 |         containers:
 42 |         - name: ray-node
 43 |           image: projectcodeflare/codeflare-glue:latest
 44 |           env:
 45 |           - name: AWS_ACCESS_KEY_ID
 46 |             valueFrom:
 47 |               secretKeyRef:
 48 |                 name: glue-s3-creds
 49 |                 key: AWS_ACCESS_KEY_ID
 50 |           - name: AWS_SECRET_ACCESS_KEY
 51 |             valueFrom:
 52 |               secretKeyRef:
 53 |                 name: glue-s3-creds
 54 |                 key: AWS_SECRET_ACCESS_KEY
 55 |           - name: ENDPOINT_URL
 56 |             valueFrom:
 57 |               secretKeyRef:
 58 |                 name: glue-s3-creds
 59 |                 key: ENDPOINT_URL
 60 |           # Do not change this command - it keeps the pod alive until it is
 61 |           # explicitly killed.
 62 |           command: ["/bin/bash", "-c", "--"]
 63 |           args: ['trap : TERM INT; sleep infinity & wait;']
 64 |           ports:
 65 |           - containerPort: 6379  # Redis port
 66 |           - containerPort: 10001  # Used by Ray Client
 67 |           - containerPort: 8265  # Used by Ray Dashboard
 68 |           - containerPort: 5001  # Used by iperf
 69 | 
 70 |           # This volume allocates shared memory for Ray to use for its plasma
 71 |           # object store. If you do not provide this, Ray will fall back to
 72 |           # /tmp which cause slowdowns if is not a shared memory volume.
 73 |           volumeMounts:
 74 |           - mountPath: /dev/shm
 75 |             name: dshm
 76 |           resources:
 77 |             requests:
 78 |               cpu: "2"
 79 |               memory: "32G"
 80 |               ephemeral-storage: "60G"
 81 |               nvidia.com/gpu: "0"
 82 |             limits:
 83 |               # The maximum memory that this pod is allowed to use. The
 84 |               # limit will be detected by ray and split to use 10% for
 85 |               # redis, 30% for the shared memory object store, and the
 86 |               # rest for application memory. If this limit is not set and
 87 |               # the object store size is not set manually, ray will
 88 |               # allocate a very large object store in each pod that may
 89 |               # cause problems for other pods.
 90 |               cpu: "2"
 91 |               memory: "32G"
 92 |               ephemeral-storage: "60G"
 93 |               nvidia.com/gpu: "0"
 94 |   - name: worker-node
 95 |     # Minimum number of Ray workers of this Pod type.
 96 |     minWorkers: 8
 97 |     # Maximum number of Ray workers of this Pod type. Takes precedence over minWorkers.
 98 |     maxWorkers: 8
 99 |     # User-specified custom resources for use by Ray.
100 |     # (Ray detects CPU and GPU from pod spec resource requests and limits, so no need to fill those here.)
101 |     rayResources: {"foo": 1, "bar": 0}
102 |     podConfig:
103 |       apiVersion: v1
104 |       kind: Pod
105 |       metadata:
106 |         # Automatically generates a name for the pod with this prefix.
107 |         generateName: worker-
108 |       spec:
109 |         restartPolicy: Never
110 |         volumes:
111 |         - name: dshm
112 |           emptyDir:
113 |             medium: Memory
114 |         containers:
115 |         - name: ray-node
116 |           imagePullPolicy: Always
117 |           image: projectcodeflare/codeflare-glue:latest
118 |           env:
119 |           - name: AWS_ACCESS_KEY_ID
120 |             valueFrom:
121 |               secretKeyRef:
122 |                 name: glue-s3-creds
123 |                 key: AWS_ACCESS_KEY_ID
124 |           - name: AWS_SECRET_ACCESS_KEY
125 |             valueFrom:
126 |               secretKeyRef:
127 |                 name: glue-s3-creds
128 |                 key: AWS_SECRET_ACCESS_KEY
129 |           - name: ENDPOINT_URL
130 |             valueFrom:
131 |               secretKeyRef:
132 |                 name: glue-s3-creds
133 |                 key: ENDPOINT_URL
134 |           command: ["/bin/bash", "-c", "--"]
135 |           args: ["trap : TERM INT; sleep infinity & wait;"]
136 |           # This volume allocates shared memory for Ray to use for its plasma
137 |           # object store. If you do not provide this, Ray will fall back to
138 |           # /tmp which cause slowdowns if is not a shared memory volume.
139 |           volumeMounts:
140 |           - mountPath: /dev/shm
141 |             name: dshm
142 |           resources:
143 |             requests:
144 |               cpu: "8"
145 |               memory: "16G"
146 |               nvidia.com/gpu: "1"
147 |             limits:
148 |               # The maximum memory that this pod is allowed to use. The
149 |               # limit will be detected by ray and split to use 10% for
150 |               # redis, 30% for the shared memory object store, and the
151 |               # rest for application memory. If this limit is not set and
152 |               # the object store size is not set manually, ray will
153 |               # allocate a very large object store in each pod that may
154 |               # cause problems for other pods.
155 |               cpu: "8"
156 |               memory: "16G"
157 |               nvidia.com/gpu: "1"
158 |   # Commands to start Ray on the head node. You don't need to change this.
159 |   # Note dashboard-host is set to 0.0.0.0 so that Kubernetes can port forward.
160 |   headStartRayCommands:
161 |     - ray stop
162 |     - ulimit -n 65536; ray start --head --no-monitor --dashboard-host 0.0.0.0
163 |   # Commands to start Ray on worker nodes. You don't need to change this.
164 |   workerStartRayCommands:
165 |     - ray stop
166 |     - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379
167 | 


--------------------------------------------------------------------------------
/glue-operator.yaml:
--------------------------------------------------------------------------------
 1 | operator_role:
 2 | apiVersion: v1
 3 | kind: ServiceAccount
 4 | metadata:
 5 |   name: ray-operator-serviceaccount
 6 | ---
 7 | kind: Role
 8 | apiVersion: rbac.authorization.k8s.io/v1
 9 | metadata:
10 |   name: ray-operator-role
11 | rules:
12 | - apiGroups: ["", "cluster.ray.io"]
13 |   resources: ["rayclusters", "rayclusters/finalizers", "rayclusters/status", "pods", "pods/exec", "services"]
14 |   verbs: ["get", "watch", "list", "create", "delete", "patch", "update"]
15 | ---
16 | apiVersion: rbac.authorization.k8s.io/v1
17 | kind: RoleBinding
18 | metadata:
19 |   name: ray-operator-rolebinding
20 | subjects:
21 | - kind: ServiceAccount
22 |   name: ray-operator-serviceaccount
23 | roleRef:
24 |   kind: Role
25 |   name: ray-operator-role
26 |   apiGroup: rbac.authorization.k8s.io
27 | ---
28 | apiVersion: v1
29 | kind: Pod
30 | metadata:
31 |   name: glue-operator-pod
32 | spec:
33 |   serviceAccountName: ray-operator-serviceaccount
34 |   containers:
35 |   - name: ray
36 |     imagePullPolicy: Always
37 |     image: rayproject/ray:1.12.0-py37
38 |     command: ["ray-operator"]
39 |     env:
40 |     - name: RAY_OPERATOR_POD_NAMESPACE
41 |       valueFrom:
42 |         fieldRef:
43 |           fieldPath: metadata.namespace
44 |     resources:
45 |       requests:
46 |         cpu: 1
47 |         memory: 1Gi
48 |       limits:
49 |         memory: 2Gi
50 | 


--------------------------------------------------------------------------------
/glue_benchmark.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | # Copyright 2021 IBM Corp.
  4 | 
  5 | # Licensed under the Apache License, Version 2.0 (the "License");
  6 | # you may not use this file except in compliance with the License.
  7 | # You may obtain a copy of the License at
  8 | 
  9 | #     http://www.apache.org/licenses/LICENSE-2.0
 10 | 
 11 | # Unless required by applicable law or agreed to in writing, software
 12 | # distributed under the License is distributed on an "AS IS" BASIS,
 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 | # See the License for the specific language governing permissions and
 15 | # limitations under the License.
 16 | 
 17 | import os
 18 | import sys
 19 | import time
 20 | import datetime
 21 | import tempfile
 22 | import boto3
 23 | import tarfile
 24 | import subprocess
 25 | import ray
 26 | import json
 27 | import argparse
 28 | from glob import glob
 29 | import logging
 30 | import socket
 31 | import re
 32 | 
 33 | 
 34 | # ------------ validate S3 -----------
 35 | # Hard to diagnose without these checks
 36 | 
 37 | def Validate_S3(logger,bucket,model,gluedata):
 38 |   param = os.environ.get('AWS_ACCESS_KEY_ID')
 39 |   if param == None:
 40 |     logger.warning("AWS_ACCESS_KEY_ID is missing from environment")
 41 |     return False
 42 |   param = os.environ.get('AWS_SECRET_ACCESS_KEY')
 43 |   if param == None:
 44 |     logger.warning("AWS_SECRET_ACCESS_KEY is missing from environment")
 45 |     return False
 46 |   param = os.environ.get('ENDPOINT_URL')
 47 |   if param == "":
 48 |     logger.warning("ENDPOINT_URL is empty, assuming AWS object store")
 49 |     client = boto3.client(
 50 |       's3',
 51 |       aws_access_key_id = os.environ.get('AWS_ACCESS_KEY_ID'),
 52 |       aws_secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY')
 53 |     )
 54 |   else:
 55 |     client = boto3.client(
 56 |       's3',
 57 |       aws_access_key_id = os.environ.get('AWS_ACCESS_KEY_ID'),
 58 |       aws_secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY'),
 59 |       endpoint_url = os.environ.get('ENDPOINT_URL')
 60 |     )
 61 | 
 62 |   try:
 63 |     check = client.head_bucket(Bucket=bucket)
 64 |   except Exception as e:
 65 |     logger.warning(f"bucket={bucket} not found")
 66 |     return False
 67 | 
 68 |   try:
 69 |     check = client.head_object(Bucket=bucket, Key=model)
 70 |   except Exception as e:
 71 |     logger.warning(f"key={model} not found in bucket={bucket}")
 72 |     return False
 73 | 
 74 |   try:
 75 |     check = client.head_object(Bucket=bucket, Key=gluedata)
 76 |   except Exception as e:
 77 |     logger.warning(f"key={gluedata} not found in bucket={bucket}")
 78 |     return False
 79 | 
 80 |   return True
 81 | 
 82 | 
 83 | # ------------ detached ray actor: DataRefs -----------
 84 | # pulls data from S3 and caches in Plasma for local scaleout
 85 | # returns objref for data previously cached
 86 | # S3 credentials must be defined in the env
 87 | 
 88 | @ray.remote
 89 | class DataRefs:
 90 |   def __init__(self,bucket):
 91 |     self.state = {}
 92 |     self.refs = {}
 93 |     self.bucket = bucket
 94 |     param = os.environ.get('ENDPOINT_URL')
 95 |     if param == "":
 96 |       self.client = boto3.client(
 97 |         's3',
 98 |         aws_access_key_id = os.environ.get('AWS_ACCESS_KEY_ID'),
 99 |         aws_secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY')
100 |       )
101 |     else:
102 |       self.client = boto3.client(
103 |         's3',
104 |         aws_access_key_id = os.environ.get('AWS_ACCESS_KEY_ID'),
105 |         aws_secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY'),
106 |         endpoint_url = os.environ.get('ENDPOINT_URL')
107 |       )
108 | 
109 |   # check if data for key is already cached
110 |   # if not, try to get data from s3 and put it in plasma
111 |   def Get_dataref(self,key):
112 |     if key in self.state:
113 |       if self.state[key] == 'Cached':
114 |         return self.refs[key]
115 |     print(f"  try to get {key} from s3")
116 |     try:
117 |       dataobject = self.client.get_object(Bucket=self.bucket, Key=key)
118 |       data = dataobject['Body'].read()
119 |       print(f"  try to put {key} data into plasma")
120 |       self.refs[key] = ray.put(data)
121 |       self.state[key] = 'Cached'
122 |       return self.refs[key]
123 |     except Exception as e:
124 |       print("Unable to retrieve/put object contents: {0}\n\n".format(e))
125 |       self.state[key] = 'Failed'
126 |       return None
127 | 
128 |   def Get_state(self):
129 |     return self.state
130 | 
131 | 
132 | # ------------ Fetch dataref into plasma -----------
133 | # Calls actor to get objref of S3 data cached in Plasma
134 | def Fetch_data_to_cache(logger,dataRefs,key):
135 |   try:
136 |     st = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')
137 |     logger.info(f"{st} Get {key} data reference from data actor")
138 |     ref = ray.get(dataRefs.Get_dataref.remote(key))
139 |     if ref == None:
140 |       logger.warning(f"Could not get {key} data reference from data actor")
141 |       return False
142 |     return True
143 | 
144 |   except Exception as e:
145 |     logger.warning(f"Unable to retrieve {key} dataset: {0}".format(e))
146 |     return False
147 | 
148 | # ------------ Fetch data to local dir -----------
149 | # pulls data from Plasma and unpack in local directory
150 | def Fetch_data_to_local_dir(logger,dataRefs,key):
151 |   if not Fetch_data_to_cache(logger,dataRefs,key):
152 |     return False
153 |   try:
154 |     time_start = time.time()
155 |     ref = ray.get(dataRefs.Get_dataref.remote(key))
156 |     if ref == None:
157 |       logger.warning(f"Could not get {key} data reference from data actor")
158 |       return False
159 | 
160 |     dataset = ray.get(ref)
161 |     time_done = time.time()
162 |     st = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')
163 |     logger.info(f"{st} getting data length={len(dataset)} took {time_done-time_start:.2f}s")
164 |     tmpdata = f"/tmp/{key}.tgz"
165 |     f = open(tmpdata, "wb")
166 |     f.write(dataset)
167 |     f.close
168 |     time_start = time.time()
169 |     file = tarfile.open(tmpdata)
170 |     file.extractall('./')
171 |     file.close()
172 |     time_done = time.time()
173 |     st = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')
174 |     logger.info(f"{st} unpacking {key} tarfile took {time_done-time_start:.2f}s")
175 |     return True
176 | 
177 |   except Exception as e:
178 |     logger.warning(f"Unable to retrieve/unpack {key} dataset: {0}".format(e))
179 |     return False
180 | 
181 | 
182 | # -------------------- Process_task -----------------
183 | # process_task first checks if the glue datasets and the model to test are present
184 | #   if not, it requests the data to be fetched from plasma and unpacked locally
185 | # Two log streams are created: a debug level stream to stdout and an info level to file
186 | # The results are packed into a python hashmap and returned
187 | 
188 | @ray.remote(num_gpus=1)
189 | def Process_task(dataRefs,bucket,model,gluedata,task,seed,LR,savemodel):
190 |   # clean and recreate result directory
191 |   resultdir = ResultDir(model,task,seed,LR)
192 |   subprocess.run(['rm', '-rf', resultdir])
193 |   subprocess.run(['mkdir', '-p', resultdir])
194 | 
195 |   # create console handler at DEBUG and logfile hander at INFO
196 |   logger = logging.getLogger(__name__)
197 |   logger.setLevel(logging.DEBUG)
198 |   consoleHandler = logging.StreamHandler(sys.stdout)
199 |   consoleHandler.setLevel(logging.DEBUG)
200 |   logger.addHandler(consoleHandler)
201 |   fileHandler = logging.FileHandler(f"{resultdir}/log.log")
202 |   fileHandler.setLevel(logging.INFO)
203 |   logger.addHandler(fileHandler)
204 | 
205 |   # Reuse local glue data directory or try to create it
206 |   if not os.path.isdir('./'+gluedata):
207 |     if not Fetch_data_to_local_dir(logger, dataRefs, gluedata):
208 |       return ['ERROR',f"Fetch_data_to_local_dir for {gluedata} failed"]
209 |   else:
210 |     logger.info("Reusing previous existing glue-dataset")
211 | 
212 |   # Reuse local model directory or try to create it
213 |   if not os.path.isdir('./'+model):
214 |     if not Fetch_data_to_local_dir(logger, dataRefs, model):
215 |       return ['ERROR',f"Fetch_data_to_local_dir for {model} failed"]
216 |   else:
217 |     logger.info(f"Reusing {model} directory")
218 | 
219 |   logger.info(f"Processing task {task} seed {seed} with model {model}")
220 | 
221 |   # Pull run_glue.py into local pod
222 |   # This code version must match the transformer version being used
223 |   if not os.path.isfile('./run_glue.py'):
224 |     subprocess.run(['wget', 'https://raw.githubusercontent.com/huggingface/transformers/b0892fa0e8df02d683e05e625b3903209bff362d/examples/text-classification/run_glue.py'])
225 | 
226 |   # change location of transformer cache to a writable directory
227 |   os.environ['TRANSFORMERS_CACHE'] = '/tmp/cache/'
228 | 
229 |   runargs = ["python","./run_glue.py"]
230 |   runargs.extend(["--model_name_or_path",model])
231 |   runargs.extend(["--task_name",task])
232 |   runargs.extend(["--do_train","--do_eval"])
233 |   runargs.extend(["--data_dir",f"{gluedata}/{task}"])
234 |   runargs.extend(["--max_seq_length","128"])
235 |   runargs.extend(["--per_device_train_batch_size","32"])
236 |   runargs.extend(["--learning_rate",LR])
237 |   runargs.extend(["--num_train_epochs","3.0"])
238 |   runargs.extend(["--save_steps","50000"])
239 |   runargs.extend(["--save_total_limit","0"])
240 |   runargs.extend(["--seed",seed])
241 |   runargs.extend(["--overwrite_output_dir","--output_dir",resultdir])
242 | 
243 |   # use this regex to exclude debug content from logfile
244 |   p = re.compile(r".*(Epoch|Iteration|Evaluation): .*(s/it|it/s)].*")
245 | 
246 |   # finally, do the work
247 |   time_start = time.time()
248 |   proc = subprocess.Popen(runargs,stdout=subprocess.PIPE, stderr=subprocess.STDOUT,universal_newlines=True)
249 |   for line in proc.stdout:
250 |     if re.match(p,line) is None:
251 |       if not line == "\n":
252 |         logger.info(line.rstrip())
253 |     else:
254 |       logger.debug(line.rstrip())
255 |   proc.wait()
256 |   time_proc = time.time()-time_start
257 | 
258 |   # flush logfile
259 |   logger.removeHandler(consoleHandler)
260 |   logger.removeHandler(fileHandler)
261 |   del logger, consoleHandler, fileHandler
262 | 
263 |   results = PackResults(model,task,seed,LR,time_proc,savemodel)
264 | 
265 |   # clean up local result directory
266 |   subprocess.run(['rm', '-rf', resultdir])
267 | 
268 |   return results
269 | 
270 | 
271 | # ------------------ Return remote result directory name
272 | def ResultDir(model,task,seed,LR):
273 |   taskl = task.lower()
274 |   return f"result/{model}/{task}/lr-{LR}/{taskl}_seed-{seed}_lr-{LR}_TBATCH-32"
275 | 
276 | 
277 | # ------------------ PackResults
278 | # Puts selected info, files, and optionally a reference to the generated subtask model, into a python hashmap
279 | 
280 | def PackResults(model,task,seed,LR,time,savemodel):
281 |   dir = ResultDir(model,task,seed,LR)
282 |   files = glob(os.path.join(dir, f"eval_results_*.txt"))
283 |   files.append(os.path.join(dir, "log.log"))
284 |   taskres = {}
285 |   taskres["model"] = model
286 |   taskres["LR"] = LR
287 |   taskres["task"] = task
288 |   taskres["seed"] = seed
289 |   taskres["time"] = time
290 |   taskres["hostname"] = socket.gethostname()
291 |   for f in files:
292 |     with open(f, "rb") as afile:
293 |       data = afile.read()
294 |     taskres[os.path.basename(f)] = data
295 | 
296 |   # put the model in plasma and reference in hashmap
297 |   if savemodel:
298 |     f = os.path.join(dir, "pytorch_model.bin")
299 |     if os.path.isfile(f):
300 |       with open(f, "rb") as afile:
301 |         data = afile.read()
302 |       taskres["pytorch_model.bin"] = ray.put(data)
303 | 
304 |   return taskres
305 | 
306 | 
307 | # ------------------ Return local result directory name
308 | def SummaryDir(model,LR,task,seed):
309 |   if seed == None:
310 |     return f"/tmp/summary/{model}_lr-{LR}/{task}"
311 |   else:
312 |     return f"/tmp/summary/{model}_lr-{LR}/{task}/seed-{seed}"
313 | 
314 | 
315 | # ------------------ Best_model ----------------
316 | # checks if this is best model yet for task. If so delete last model and return eval score
317 | def Best_model(model,LR,task,seed):
318 |   # per task metric for evaluating best model (from Masayasu Muraoka)
319 |   eval_metric = {
320 |     "cola": "mcc", "mnli": "mnli/acc", "sst-2": "acc", "sts-b": "corr",
321 |     "qqp": "acc_and_f1", "qnli": "acc",  "rte": "acc", "wnli": "acc",
322 |     "mrpc": "f1"
323 |   }
324 |   subtasks_dir = SummaryDir(model,LR,task,None)
325 |   new_subtask_dir = SummaryDir(model,LR,task,seed)
326 |   metric = eval_metric[task.lower()]
327 |   grppr = "eval_"+metric+" = "
328 |   best_score = 0
329 |   bin_dirs = []
330 |   # scan all subtasks for this task, get new score and best previous score
331 |   for f in os.listdir(subtasks_dir):
332 |     if os.path.exists(f"{subtasks_dir}/{f}/pytorch_model.bin"):
333 |       bin_dirs.append(f"{subtasks_dir}/{f}/pytorch_model.bin")
334 | 
335 |     with open(f"{subtasks_dir}/{f}/eval_results_{task.lower()}.txt") as fp:
336 |       for line in fp:
337 |         if line.startswith(grppr):
338 |           score = float(line.split(grppr)[1])
339 |     if f"{subtasks_dir}/{f}" == new_subtask_dir:
340 |       new_score = score
341 |     else:
342 |       if score > best_score:
343 |         best_score = score
344 | 
345 |   if new_score <= best_score:
346 |     return False, 0
347 |   # remove previous best model
348 |   for f in bin_dirs:
349 |     os.remove(f)
350 |   return True, new_score
351 | 
352 | 
353 | # ------------------ Save models = true, Check for previous saved models ----------------
354 | # checks if there are any specified tasks having previous subtasks with no models saved
355 | # if this is the case no new score may be the best score and no new model would be saved
356 | def Check_for_previous_models(model,LR,tasks):
357 |   for task in tasks:
358 |     subtasks_dir = SummaryDir(model,LR,task,None)
359 |     if not os.path.exists(subtasks_dir):
360 |       continue
361 |     # scan all subtasks for this task and see if there are completed subtasks but no models saved
362 |     if any (os.path.exists(f"{subtasks_dir}/{f}/eval_results_{task.lower()}.txt") for f in os.listdir(subtasks_dir)):
363 |       if not any (os.path.exists(f"{subtasks_dir}/{f}/pytorch_model.bin") for f in os.listdir(subtasks_dir)):
364 |         logger.warning(f"WARNING: completed subtasks for {task} exist but no previous models saved. May not save best/any model.")
365 |   return
366 | 
367 | 
368 | # -------------------- MAIN ------------------
369 | parser = argparse.ArgumentParser(description='Driver for run_glue')
370 | parser.add_argument('-m',"--model", required=True,
371 |                     help="S3 Key and local directory name of base model, e.g. roberta-base")
372 | parser.add_argument('-g',"--gluedata", default="glue_data",
373 |                     help="S3 key and local directory name of glue dataset (Default=glue_data)")
374 | parser.add_argument('-b',"--bucket", required=True, help="S3 bucket name")
375 | parser.add_argument('-t','--tasks', nargs='+',
376 |                     # required MRPC data missing from public download
377 |                     # help="tasks to run, e.g. -t WNLI CoLA (Default=WNLI STS-B CoLA RTE MRPC SST-2 MNLI QNLI QQP)",
378 |                     # default=['WNLI','STS-B','CoLA','RTE','MRPC','SST-2','MNLI','QNLI','QQP'], action='store')
379 |                     help="tasks to run, e.g. -t WNLI CoLA (Default=WNLI STS-B CoLA RTE SST-2 MNLI QNLI QQP)",
380 |                     default=['WNLI','STS-B','CoLA','RTE','SST-2','MNLI','QNLI','QQP'], action='store')
381 | parser.add_argument('-s','--seeds', nargs='+', default=list(range(38,48)), action='store',
382 |                     help="seeds to run, e.g. -s 38 39  (Default=38 39 40 41 42 43 44 45 46 47)")
383 | parser.add_argument('-l',"--learning_rate", default="2e-5",help="Learning Rate (Default=2e-5)")
384 | parser.add_argument('-M',"--savemodel", action='store_true',help="Save best scoring model for each task (Default=False)")
385 | parser.add_argument('-r',"--ray", default="glue-cluster-ray-head:10001",help="ray_service:port")
386 | parser.add_argument('-v',"--verbose", action='store_true',help="show remote consoles (Default=False)")
387 | args = parser.parse_args()
388 | 
389 | model=args.model
390 | gluedata=args.gluedata
391 | bucket=args.bucket
392 | tasks=args.tasks
393 | seeds=[str(x) for x in args.seeds]
394 | LR=args.learning_rate
395 | savemodel=args.savemodel
396 | ray_service=args.ray
397 | verbose=args.verbose
398 | 
399 | # create logger for driver stdout and logfile
400 | logger = logging.getLogger(__name__)
401 | logger.setLevel(logging.INFO)
402 | consoleHandler = logging.StreamHandler(sys.stdout)
403 | consoleHandler.setLevel(logging.INFO)
404 | logger.addHandler(consoleHandler)
405 | fileHandler = logging.FileHandler("/tmp/gluejob.console")
406 | fileHandler.setLevel(logging.INFO)
407 | logger.addHandler(fileHandler)
408 | 
409 | st = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')
410 | logger.info(f"\n{st} Starting Glue benchmark ---------------")
411 | logger.info(f"model: {model}")
412 | logger.info(f"gluedata: {gluedata}")
413 | logger.info(f"bucket: {bucket}")
414 | logger.info(f"tasks: {' '.join(tasks)}")
415 | logger.info(f"seeds: {' '.join(seeds)}")
416 | logger.info(f"learning_rate: {float(LR)}")
417 | logger.info(f"savemodel: {savemodel}")
418 | logger.info(f"ray_service: {ray_service}")
419 | 
420 | # if savemodel=True, check if there are saved subtasks with no saved model and warn user
421 | if savemodel == True:
422 |   Check_for_previous_models(model,LR,tasks)
423 | 
424 | # connect to ray cluster
425 | ray.init("ray://"+ray_service,log_to_driver=verbose,namespace="ibm-glue")
426 | 
427 | # check if S3 credentials are set and objects look accessible
428 | if not Validate_S3(logger,bucket,model,gluedata):
429 |   logger.error(f"Fatal error verifying S3 access to specified objects")
430 |   sys.exit()
431 | 
432 | # create data actor if not yet exists
433 | # namespace is required to find a previously persisted actor instance
434 | data_actor_name = 'DataRefsActor'
435 | names = ray.util.list_named_actors()
436 | if any(x == data_actor_name for x in names):
437 |   dataRefs = ray.get_actor(data_actor_name)
438 |   state = ray.get(dataRefs.Get_state.remote())
439 |   logger.info(f" Found actor={data_actor_name} with state {state}")
440 | else:
441 |   logger.info(f"  actor={data_actor_name} not found ... deploy it")
442 |   dataRefs = DataRefs.options(name=data_actor_name,lifetime="detached").remote(bucket)
443 | 
444 | # make sure required datasets are cached in actor
445 | actorstate = ray.get(dataRefs.Get_state.remote())
446 | gluecached = modelcached = True
447 | if not actorstate.get(gluedata) == 'Cached':
448 |   gluecached = Fetch_data_to_cache(logger,dataRefs,gluedata)
449 | if not actorstate.get(model) == 'Cached':
450 |   modelcached = Fetch_data_to_cache(logger,dataRefs,model)
451 | if not gluecached or not modelcached:
452 |   logger.error(f"Fatal error caching dataset from S3")
453 |   sys.exit()
454 | 
455 | # submit all subtasks at the same time
456 | tasks = [Process_task.remote(dataRefs,bucket,model,gluedata,task,str(seed),LR,savemodel) for task in tasks for seed in seeds]
457 | st = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')
458 | logger.info(f"{st} Submitted {len(tasks)} subtasks")
459 | 
460 | # wait for all to be done, one at a time
461 | # TODO handle remote processing exceptions
462 | incomplete = tasks
463 | complete = []
464 | while len(complete) < len(tasks):
465 |   onedone, incomplete = ray.wait(incomplete, num_returns=1, timeout=None)
466 |   results = ray.get(onedone)
467 |   complete.append(onedone)
468 |   taskres = results[0]
469 | 
470 |   st = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')
471 |   if "ERROR" in taskres:
472 |     logger.error(f"{st} Fatal error: {taskres['ERROR']}")
473 |     sys.exit()
474 | 
475 |   # check for valid result
476 |   if any(x.startswith('eval_results') for x in taskres):
477 |     logger.info(f"{st} {taskres['model']} lr-{taskres['LR']} {taskres['task']} seed-{taskres['seed']}"+
478 |                 f" took {taskres['time']:.1f}s on {taskres['hostname']} ... {len(complete)} of {len(tasks)} subtasks done")
479 |   else:
480 |     logger.error(f"{st} {taskres['model']} lr-{taskres['LR']} {taskres['task']} seed-{taskres['seed']}"+
481 |                 f" returned ERROR ... {len(complete)} of {len(tasks)} subtasks done")
482 | 
483 |   # copy results to a known place for access from outside pod; Remove any leftover files
484 |   outfolder = SummaryDir(taskres['model'],taskres['LR'],taskres['task'],taskres['seed'])
485 |   subprocess.run(['mkdir', '-p', outfolder])
486 |   subprocess.run(['rm', '-rf', outfolder+"/*"])
487 | 
488 |   for key in taskres.keys():
489 |     if key == 'model' or key == 'LR' or key == 'task' or key == 'seed' or key == 'time' or key == 'hostname':
490 |       continue
491 |     if not key == 'pytorch_model.bin':
492 |       f = open(outfolder+'/'+key, "wb")
493 |       f.write(taskres[key])
494 |       f.close
495 |     else:
496 |       # check if this subtask model should be saved
497 |       save,score = Best_model(taskres['model'],taskres['LR'],taskres['task'],taskres['seed'])
498 |       if save:
499 |         # get model from plasma and store locally
500 |         time_start = time.time()
501 |         plasobj = taskres[key]
502 |         modelbin = ray.get(plasobj)
503 |         del (plasobj)
504 |         time_pull = time.time()-time_start
505 |         st = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')
506 |         logger.info(f"{st}   eval={score}, model pull took {time_pull:.1f}s for length={len(modelbin)}")
507 |         f = open(outfolder+'/'+key, "wb")
508 |         f.write(modelbin)
509 |         f.close
510 | 


--------------------------------------------------------------------------------
/kill_actor.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | # Copyright 2021 IBM Corp.
 4 | 
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | 
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | 
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | import os
18 | import sys
19 | import ray
20 | import argparse
21 | 
22 | parser = argparse.ArgumentParser(description='ray actor killer')
23 | parser.add_argument('-r',"--ray", default="glue-cluster-ray-head:10001",help="ray_service:port")
24 | parser.add_argument('-n',"--namespace", default="ibm-glue",help="Default=ibm-glue")
25 | parser.add_argument('-a','--actor_name', default="DataRefsActor",help="Default=DataRefsActor")
26 | args = parser.parse_args()
27 | 
28 | print("trying to kill actor",args.actor_name,"in namespace",args.namespace)
29 | # connect to ray cluster
30 | ray.init("ray://"+args.ray,namespace=args.namespace)
31 | 
32 | namespace=ray.get_runtime_context().namespace
33 | if not namespace == args.namespace:
34 |   print("namespace",args.namespace,"not found")
35 |   sys.exit(0)
36 | try:
37 |   actor = ray.get_actor(args.actor_name)
38 | #  if actor == None:
39 | #    print("actor",args.actor_name,"not found")
40 | #    sys.exit(0)
41 | except Exception as e:
42 |   print(f"Actor '{args.actor_name}' not found in namespace '{args.namespace}'")
43 |   sys.exit(0)
44 | 
45 | print("killing actor",actor)
46 | ray.kill(actor)
47 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers==3.0.2
2 | tqdm==4.48.2
3 | numpy==1.17.3
4 | ptvsd==4.3.2
5 | tensorboardX==2.2
6 | tensorflow_datasets==4.2.0
7 | scikit_learn==0.24.1
8 | 


--------------------------------------------------------------------------------
/simple_check_s3.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | # Copyright 2021 IBM Corp.
 4 | 
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | 
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | 
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | import os
18 | import sys
19 | import boto3
20 | import argparse
21 | 
22 | parser = argparse.ArgumentParser(description='S3 object checker')
23 | parser.add_argument('-b',"--bucket", required=True, help="S3 bucket name")
24 | parser.add_argument('-k',"--key", help="S3 Key of object in bucket")
25 | args = parser.parse_args()
26 | 
27 | bucket=args.bucket
28 | objkey=args.key
29 | 
30 | param = os.environ.get('AWS_ACCESS_KEY_ID')
31 | if param == None:
32 |   print("AWS_ACCESS_KEY_ID is missing from environment")
33 |   sys.exit()
34 | param = os.environ.get('AWS_SECRET_ACCESS_KEY')
35 | if param == None:
36 |   print("AWS_SECRET_ACCESS_KEY is missing from environment")
37 |   sys.exit()
38 | 
39 | # if "ENDPOINT_URL" in os.environ:
40 | #     param = os.environ.get('ENDPOINT_URL')
41 | #     print("found endpoint in env with value=|"+param+"|")
42 | #     print(param == "")
43 | # sys.exit()
44 | 
45 | param = os.environ.get('ENDPOINT_URL')
46 | if param == "":
47 |   print("ENDPOINT_URL is empty, assuming AWS object store")
48 |   client = boto3.client(
49 |     's3',
50 |     aws_access_key_id = os.environ.get('AWS_ACCESS_KEY_ID'),
51 |     aws_secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY')
52 |   )
53 | else:
54 |   client = boto3.client(
55 |     's3',
56 |     aws_access_key_id = os.environ.get('AWS_ACCESS_KEY_ID'),
57 |     aws_secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY'),
58 |     endpoint_url = os.environ.get('ENDPOINT_URL')
59 |   )
60 | 
61 | try:
62 |   check = client.head_bucket(Bucket=bucket)
63 |   print(f"found bucket={bucket}")
64 | except Exception as e:
65 |   print("bucket="+bucket+" not found: {0}\n".format(e))
66 |   sys.exit()
67 | 
68 | if objkey == None:
69 |   sys.exit()
70 | 
71 | try:
72 |   check = client.head_object(Bucket=bucket, Key=objkey)
73 |   print(f"found key={objkey} with length={check['ContentLength']}")
74 | except Exception as e:
75 |   print("key="+objkey+" not found in bucket="+bucket+": {0}\n".format(e))
76 |   sys.exit()
77 | 


--------------------------------------------------------------------------------
/template-s3-creds.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Secret
 3 | metadata:
 4 |   name: glue-s3-creds
 5 |   namespace: {your-ocp-project}
 6 | type: Opaque
 7 | data:
 8 |   AWS_ACCESS_KEY_ID: {printf $REAL_AWS_ACCESS_KEY_ID | base64 -w 0}
 9 |   AWS_SECRET_ACCESS_KEY: {printf $REAL_AWS_SECRET_ACCESS_KEY | base64 -w 0}
10 | #if the object store is on AWS, leave the ENDPOINT_URL value empty
11 | stringData:
12 |   ENDPOINT_URL: {REAL_ENDPOINT_URL}
13 | 


--------------------------------------------------------------------------------