├── .gitignore
├── Dockerfile
├── README.md
├── build.sh
├── get_pods.py
├── head.yml
├── test_cluster.py
└── worker.yml

/.gitignore:
--------------------------------------------------------------------------------
*.swp
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
FROM ubuntu:xenial
# NOTE: the `alias python=python3` below only lasts for its own RUN layer;
# it has no effect in the running container.
RUN apt-get update \
    && apt-get install -y vim git wget \
    && apt-get install -y cmake build-essential autoconf curl libtool libboost-all-dev unzip \
    && apt-get install -y python3-pip \
    && alias python=python3
RUN pip3 install --upgrade pip
RUN git clone https://github.com/jhpenger/ray-kubernetes.git
RUN pip install numpy
# `ray` itself is required for the `ray start` commands in head.yml and worker.yml
RUN pip install ray

# install additional dependencies
RUN apt-get update \
    && apt-get -y install flex \
    && apt-get -y install bison \
    && apt-get install -y nano \
    && apt-get install -y openssh-server \
    && apt install -y python-opencv \
    && apt-get install -y pssh \
    && pip install cython \
    && pip install pyarrow \
    && apt-get install -y pkg-config \
    && pip install modin \
    && pip install tensorflow \
    && pip install gym \
    && pip install scipy \
    && pip install opencv-python \
    && pip install bokeh \
    && pip install ipywidgets==6.0.0 \
    && pip install jupyter \
    && pip install lz4

# set with ENV so the locale persists into the running container (an `export` in RUN does not)
ENV LC_ALL=C.UTF-8 LANG=C.UTF-8

RUN ssh-keygen -f /root/.ssh/id_rsa -P "" \
    && echo "StrictHostKeyChecking no" >> /etc/ssh/ssh_config
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Running Ray Cluster on Kubernetes
### Disclaimer:
Many thanks to Robert and Charles for their instructions, found [here](https://github.com/robertnishihara/ray-kubernetes/blob/instructions/README.md).
## Instructions
**For the most up-to-date setup instructions:** [Medium Blog Post](https://medium.com/@apengjh/4634a51effc9)

The ray-project repository is [here](https://github.com/ray-project/ray).
#### Setup Google Cloud and Install Cloud SDK
Follow Google's instructions [here](https://cloud.google.com/sdk/docs/quickstart-debian-ubuntu) to install the Cloud SDK for Ubuntu.

Create a new project on the [Google Cloud Console](https://console.cloud.google.com) and start a Kubernetes cluster in that project.

Run `gcloud init` and follow the prompts.

Install `kubectl` on your local machine, either with `gcloud components install kubectl` or by following [these instructions](https://kubernetes.io/docs/tasks/tools/install-kubectl/).

On the [GCloud Console](https://console.cloud.google.com), go to your project -> Kubernetes Engine -> Clusters and click `Connect`. Paste the provided command into your command line to give `kubectl` access to your cluster.
#### Build and Push Docker Image

Clone the [repository](https://github.com/jhpenger/ray-kubernetes): `git clone https://github.com/jhpenger/ray-kubernetes.git`

Edit `build.sh`, `head.yml`, and `worker.yml`: replace `tutorial-218804` with your project ID.

Run:
```sh
bash build.sh
```
```
docker push gcr.io/<your-project-ID>/kube-cluster-demo
```
#### Deploying Pods on Kubernetes Cluster
In the cloned repository:
```
kubectl create -f head.yml
```
Wait for the `ray-head` pod to be fully running.
You can check the pods' status with `kubectl get pods`. If your head pod crashes, run `kubectl logs ray-head` to debug.

Obtain `ray-head`'s public key by either:
1. `kubectl logs ray-head` (the key will be near the bottom of the output)
2. `kubectl exec -it ray-head bash`; then run `more ~/.ssh/id_rsa.pub`

Edit `worker.yml`: paste `ray-head`'s public key between the quotes in the `echo '' >> ~/.ssh/authorized_keys` line.

If you are inside `ray-head`, exit back to your local machine and run:
```
kubectl create -f worker.yml
```

#### Testing Your Cluster
I included a simple Python script (modified from `exercise04.ipynb`, found [here](https://github.com/ray-project/tutorial/blob/master/exercises/exercise04.ipynb)) to test the cluster.
First, get into `ray-head` with `kubectl exec -it ray-head bash`. Then run:
```
python /ray-kubernetes/test_cluster.py $MY_POD_IP:6379
```
You can set the number of actors by passing an additional parameter (e.g. `python /ray-kubernetes/test_cluster.py $MY_POD_IP:6379 8888`); the default is 136. The number of actors should be no more than the total number of CPU cores in your cluster (cores, not physical CPUs).
The expected run time is about 2.5 seconds, but it may be slower if you hit the cluster's maximum CPU capacity.

# Notes
These instructions were set up for the Koç Lab at the University of California, Santa Barbara.

We are trying to use a Ray cluster for reinforcement learning in the [Gibson Environment](http://gibsonenv.stanford.edu/).

We are currently in the very early stages of exploring `ray` and `Gibson`, and would greatly appreciate guidance from anyone with:
* experience running reinforcement learning simulations on large clusters using `GCloud` or `AWS`.
* experience using Google's `Preemptible VM Instances` with `ray` to cut costs, specifically with handling preemptible instance restarts.

Contact [Sam](mailto:samgreen@gmail.com) and [Jun](mailto:peng00@cs.ucsb.edu).
Thanks!
--------------------------------------------------------------------------------
/build.sh:
--------------------------------------------------------------------------------
docker build . -t gcr.io/tutorial-218804/kube-cluster-demo
--------------------------------------------------------------------------------
/get_pods.py:
--------------------------------------------------------------------------------
import ray
import time
import sys

if len(sys.argv) != 3:
    print("Usage: python get_pods.py <redis-address> <output-file>")
    exit()

ray.init(redis_address=sys.argv[1])

@ray.remote
def f():
    time.sleep(0.01)
    return ray.services.get_node_ip_address()

# Get a list of the IP addresses of the nodes that have joined the cluster.
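# Each remote task may be scheduled on any node in the cluster, so launching
# many short tasks and collecting the IP each one reports enumerates the
# nodes with high probability.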
pods = set(ray.get([f.remote() for _ in range(1000)]))
print(pods)
print("Size of this cluster = %d" % len(pods))
# Drop the head node itself: strip the ":6379" port suffix from the redis
# address to recover the head's IP, leaving only the workers' IPs.
pods.remove(sys.argv[1][:-5])
workerFile = open(sys.argv[2], "w")
for pod in pods:
    workerFile.write(pod + "\n")
workerFile.close()
print("Workers' IPs saved to %s" % sys.argv[2])
--------------------------------------------------------------------------------
/head.yml:
--------------------------------------------------------------------------------
---
apiVersion: v1
kind: Service
metadata:
  name: ray-head
spec:
  type: "LoadBalancer"
  ports:
    - name: redis
      port: 6379
      targetPort: 6379
    - name: plasma
      port: 12345
      targetPort: 12345
  selector:
    app: ray-head
---
apiVersion: v1
kind: Pod
metadata:
  name: ray-head
  labels:
    app: ray-head
spec:
  containers:
    - name: ray-head
      image: gcr.io/tutorial-218804/kube-cluster-demo
      env:
        - name: MY_POD_IP
          valueFrom:
            fieldRef:
              fieldPath: status.podIP
      ports:
        - containerPort: 6379
        - containerPort: 12345
      command: [ "/bin/bash", "-c", "--" ]
      args:
        - "ray start --head --redis-port 6379 --object-manager-port 12345 --node-ip-address $MY_POD_IP &&
          echo id_rsa.pub is -;
          more ~/.ssh/id_rsa.pub
          && while true; do sleep 30; done;"
--------------------------------------------------------------------------------
/test_cluster.py:
--------------------------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import ray
import time
import sys

if len(sys.argv) < 2 or len(sys.argv) > 3:
    print("Usage: python test_cluster.py <redis-address> <#-of-ray-actors>")
    print("<#-of-ray-actors> is OPTIONAL, default = 136")
    exit()

ray.init(redis_address=sys.argv[1])

@ray.remote
class Foo(object):
    def __init__(self):
        self.counter = 0

    def reset(self):
        self.counter = 0

    def increment(self):
        time.sleep(0.5)
        self.counter += 1
        return self.counter

try:
    num_of_actors = int(sys.argv[2])
except (IndexError, ValueError):
    num_of_actors = 68 * 2
Foos = [Foo.remote() for _ in range(num_of_actors)]

time.sleep(2.0)

# Reset the actor state so that we can run this cell multiple times without
# changing the results.
for f in Foos:
    f.reset.remote()

# We want to parallelize this code. However, it is not straightforward to
# make "increment" a remote function, because state is shared (the value of
# "self.counter") between subsequent calls to "increment". In this case, it
# makes sense to use actors.
results = []
start_time = time.time()
for _ in range(5):
    for f in Foos:
        results.append(f.increment.remote())

results = ray.get(results)
end_time = time.time()
duration = end_time - start_time

#assert results == [1, 1, 2, 2, 3, 3, 4, 4, 5, 5]

#assert duration < 3, ('The experiments ran in {} seconds. This is too '
#                      'slow.'.format(duration))
#assert duration > 2.5, ('The experiments ran in {} seconds. This is too '
#                        'fast.'.format(duration))

print(results)
print("Usage: python test_cluster.py <redis-address> <#-of-ray-actors>\n<#-of-ray-actors> is OPTIONAL, default = 136")
print('Success! The example took {} seconds.'.format(duration))
print('Num of ray actors = %d' % num_of_actors)
--------------------------------------------------------------------------------
/worker.yml:
--------------------------------------------------------------------------------
---
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  name: ray-worker
  labels:
    app: ray-worker
spec:
  replicas: 3
  template:
    metadata:
      labels:
        app: ray-worker
    spec:
      containers:
        - name: ray-worker
          image: gcr.io/tutorial-218804/kube-cluster-demo
          ports:
            - containerPort: 12345
          env:
            - name: MY_POD_IP
              valueFrom:
                fieldRef:
                  fieldPath: status.podIP
          command: [ "/bin/bash", "-c", "--" ]
          # paste ray-head's public key between the quotes of the `echo ''` line below (see README)
          args:
            - "ray start --node-ip-address $MY_POD_IP --redis-address $(python -c 'import socket;import sys; sys.stdout.write(socket.gethostbyname(\"ray-head.default.svc.cluster.local\"));sys.stdout.flush()'):6379 --object-manager-port 12345;
              mkdir /root/.ssh/;
              touch /root/.ssh/authorized_keys;
              sleep 3;
              echo '' >> ~/.ssh/authorized_keys;
              service ssh restart;
              sleep 3;
              service ssh restart;
              echo This worker IP $MY_POD_IP
              && while true; do sleep 30; done;"

#no
--------------------------------------------------------------------------------
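
The README above notes that the number of actors passed to `test_cluster.py` should not exceed the cluster's total CPU core count. As a rough way to check that number from inside the `ray-head` pod, here is a minimal sketch that reuses the same node-discovery trick as `get_pods.py`. It is not a file in this repository: the name `check_cores.py` is hypothetical, and `multiprocessing.cpu_count()` is only an approximation of what Ray will actually schedule on.
```python
# check_cores.py -- hypothetical helper, not part of the repository above.
# Estimates the total CPU cores across the cluster by running many tiny tasks
# and asking each one which node it landed on and how many cores it sees.
import sys
import time
import multiprocessing

import ray

# e.g. python check_cores.py $MY_POD_IP:6379, run inside the ray-head pod
ray.init(redis_address=sys.argv[1])

@ray.remote
def node_cores():
    time.sleep(0.01)
    # cpu_count() reports the cores visible to this task's node
    return ray.services.get_node_ip_address(), multiprocessing.cpu_count()

# Deduplicate by node IP, then sum the per-node core counts.
per_node = dict(ray.get([node_cores.remote() for _ in range(1000)]))
print("Nodes seen: %d" % len(per_node))
print("Approximate total CPU cores: %d" % sum(per_node.values()))
```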