├── package_example
│   ├── trainer
│   │   ├── __init__.py
│   │   ├── task.py
│   │   └── mnist_autoencoder_deconv_simple.py
│   ├── setup.py
│   └── README.md
├── .gitignore
├── requirements.txt
├── trainer_configs
│   └── trainer_config.yaml
├── submit_model.sh
├── submit_model_gpu.sh
├── gce_scripts
│   ├── gce_ml_custom_startup.sh
│   └── create_gce_ml_instance.sh
└── README.md

/package_example/trainer/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | *.swp
3 | __pycache__
4 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | # example requirements file for your VM instance
2 | pyYaml>=3.12
3 | keras>=2.0
4 | scikit-learn
5 | h5py
6 | scipy
7 | scikit-image >= 0.13.0
8 | pandas
9 | Pillow
10 | matplotlib>=2.0
11 | oauth2client
12 | nltk
13 | seaborn
14 | gensim
15 | statsmodels
16 | requests>=2.11.1
17 | html
18 | opencv-python
19 |
--------------------------------------------------------------------------------
/trainer_configs/trainer_config.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | # Example trainer configuration for GCE_ML. Keys must match the keyword arguments of train().
3 |
4 | score_metric : "mse" # mean squared error
5 | loss: "binary_crossentropy"
6 | learning_rate : 0.001 # learning rate (key must match the `learning_rate` kwarg of train())
7 | lr_decay: 0.001
8 | optimizer_name: "adam"
9 | n_epochs: 5 # num of epochs to train
10 | patience: 5 # early stopping epochs
11 | pool_method: "average"
12 | use_transposed_conv : True
13 |
14 | ...
15 |
--------------------------------------------------------------------------------
/package_example/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | @author: Ming Zhao
5 | """
6 |
7 | # setup.py
8 | from setuptools import setup, find_packages
9 |
10 | REQUIRED_PACKAGES = [
11 |     'pyYaml>=3.12',
12 |     'keras>=2.0',
13 |     'matplotlib',
14 |     'h5py'#,
15 |     #'tensorflow>=1.2'  # do not include if the boot image has a custom-compiled version
16 | ]
17 |
18 |
19 | setup(
20 |     name='mnist_ae_trainer',
21 |     version='0.1',
22 |     install_requires=REQUIRED_PACKAGES,
23 |     packages=find_packages(),
24 |     #package_data={'sample': ['package_data.txt']}, # OPTIONAL
25 |     include_package_data=True,
26 |     description='Example trainer package for MNIST convolutional autoencoder',
27 |     author='mz',
28 |     author_email='ming.zhao@nytimes',
29 |     zip_safe=True,
30 |     url=' ' # Required. You will get a warning about missing metadata if not provided.
31 | )
32 |
--------------------------------------------------------------------------------
/submit_model.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | new_instance_name="test-cpu1" # unique name for new instance
4 | gce_username='my_gce_username' # replace with your account username for gce instance
5 | project="my-proj-dev" # project-id
6 | boot_image_name="test-image-2cpu-6gb" # pre-created boot image
7 | image_project=$project
8 |
9 | # VM Instance configurations. Use GCE console's `Create Instance` for reference
10 | machine_type="custom-2-5120" # 2 CPUs 5.120Gb RAM. Use Cloud Console's Instance Create tool to customize and get your machine_type
11 | maintenance_policy="MIGRATE" # MIGRATE for cpu; TERMINATE for gpu
12 | min_cpu_platform='Intel Broadwell'
13 | boot_disk_size="50" # 50 Gb
14 | zone="us-east1-c"
15 |
16 |
17 | # Trainer Variables
18 | job_id=$new_instance_name # unique job id
19 | job_dir="gs://my-proj/test_dir" # parent dir on GCS
20 | trainer_module="trainer.task"
21 | trainer_package_path="./package_example/"
22 | trainer_config="./trainer_configs/trainer_config.yaml"
23 | train_data_path="gs://my-proj/" # data needs to be pre-uploaded to GCS
24 | keep_alive=700 # seconds to stay alive for debugging once training completes
25 |
26 |
27 |
28 | ./gce_scripts/create_gce_ml_instance.sh --instance "$new_instance_name" --image "$boot_image_name" --gce_username "$gce_username" \
29 | --image-project "$image_project" --machine-type "$machine_type" --maintenance-policy "$maintenance_policy" \
30 | --min-cpu-platform "$min_cpu_platform" --boot-disk-size "$boot_disk_size" --zone "$zone" \
31 | --job_id "$job_id" --job_dir "$job_dir" --trainer_module "$trainer_module" \
32 | --trainer_package_path "$trainer_package_path" --trainer_config "$trainer_config" \
33 | --train_data_path "$train_data_path" --project "$project" --keep_alive $keep_alive
34 |
--------------------------------------------------------------------------------
/submit_model_gpu.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | new_instance_name="test-gpu1"
4 | gce_username="my_gce_username" # account username for your GCE instance
5 | project="my-proj-dev" # project-id
6 | boot_image_name="gpu1-cpu6-ram30gb-250gb-tensorflow" # pre-created boot image
7 | image_project=$project # usually the same as project-id
8 |
9 | # VM Instance configurations. Use GCE console's `Create Instance` for reference
10 | machine_type="custom-6-24576" # 6 CPUs, 24576Mb RAM. Use Cloud Console's Instance Create tool to customize and get your machine_type
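# Tip: GPU availability varies by zone. If you are not sure what values are valid,
# `gcloud compute accelerator-types list` and `gcloud compute machine-types list --zones us-east1-c`
# can help you pick a working accelerator / zone / machine-type combination for this script.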
11 | maintenance_policy="TERMINATE" # gpu cannot be migrated
12 | accelerator="type=nvidia-tesla-k80,count=1" # GPU config
13 | min_cpu_platform='Intel Broadwell' # CPU types
14 | boot_disk_size="250" # 250 Gb
15 | zone="us-east1-c" # recommend "us-east1-c"
16 |
17 | # Trainer Variables
18 | job_id=$new_instance_name # unique job id
19 | job_dir="gs://my-proj/test_dir" # parent dir for your job on GCS
20 | trainer_module="trainer.task" # module that actually does the training
21 | trainer_package_path="./package_example/" # package on local drive
22 | trainer_config="./trainer_configs/trainer_config.yaml"
23 | train_data_path="gs://my-proj/" # data needs to be pre-uploaded to GCS
24 | keep_alive=500 # seconds to keep alive (for debugging only) once training completes
25 |
26 |
27 |
28 | ./gce_scripts/create_gce_ml_instance.sh --instance "$new_instance_name" --image "$boot_image_name" --gce_username "$gce_username" \
29 | --image-project "$image_project" --machine-type "$machine_type" --maintenance-policy "$maintenance_policy" \
30 | --accelerator $accelerator --min-cpu-platform "$min_cpu_platform" --boot-disk-size "$boot_disk_size" \
31 | --job_id "$job_id" --job_dir "$job_dir" --trainer_module "$trainer_module" \
32 | --trainer_package_path "$trainer_package_path" --trainer_config "$trainer_config" \
33 | --train_data_path "$train_data_path" --project "$project" --zone "$zone" --keep_alive $keep_alive
34 |
--------------------------------------------------------------------------------
/package_example/README.md:
--------------------------------------------------------------------------------
1 | ### Example Python package to train Keras models with gce_ML
2 | ___
3 | This example shows how to train a convolutional autoencoder using Keras and a GPU on GCE ML.
4 | It also shows how to use Tensorboard with Keras and how to save files to Cloud Storage.
5 |
6 | The example also lets you explore "deconv" layers (i.e., transposed convolution) and dilated (atrous) convolution, and compare batch normalization placed before versus after the non-linear activation layer.
7 |
8 | #### Prerequisite
9 | - Set up gce_ML by following this [link](https://github.com/astromz/gce_ml).
10 | - Similar to Cloud ML, **please follow the file structure of this example strictly to make the whole gce_ML package work**. Specifically, you will need:
11 |   1. A `setup.py` file for your python package, written in the same way as for `setuptools`.
12 |   2. A folder called `trainer` (or another name of your choice) that contains the code.
13 |   3. A `task.py` that wraps your model training code (`mnist_autoencoder_deconv_simple.py` in this example) and passes arguments and configurations to the model.
14 |   4. Your model code that actually does the training (e.g., `mnist_autoencoder_deconv_simple.py`).
15 |   5. A configuration `yaml` file that contains all model-specific input variables (e.g., learning rate, loss function, number of epochs to train, and all hyper-parameters for your model). This file should reside outside the code package.
16 |
17 |
18 | #### Training
19 | - Configure your GCE submission script (`submit_model.sh` or `submit_model_gpu.sh` in the repository root). Then run:
20 |
21 |       $ ./submit_model.sh
22 |
23 |   + Now follow the instructions printed in the command line to either stream your log or check your instance in the GCE console.
24 |
25 |
26 | That is it! You should now have successfully trained your convolutional autoencoder using the gce_ML scripts.
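- Optional: you can also run the trainer locally as a quick smoke test before paying for a GCE instance. The sketch below makes a few assumptions about your local setup: the package's dependencies (TensorFlow, Keras, PyYAML, h5py, matplotlib) are installed, and the commands are run from inside `package_example/`. The `--data_path` value is only passed through; this example model downloads MNIST itself via `keras.datasets`.

      $ cd package_example
      $ mkdir -p tmp    # local stand-in for the GCS job_dir
      $ python -m trainer.task --job_dir ./tmp/ --job_id local_test \
            --config_file ../trainer_configs/trainer_config.yaml --data_path ./

  If this completes, the same package should run unchanged on the GCE instance.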
27 |
28 | #### Check your results
29 | - The code should automatically save two PDF figures, either on local disk or in your GCS bucket. You can check the learning curves there.
30 | - Alternatively, you can use `tensorboard` to examine your neural net graph, variables, and more. Type the following and follow the instructions.
31 |
32 |       $ tensorboard --logdir=gs://my_project_name/my_bucket/job_id
33 |
--------------------------------------------------------------------------------
/package_example/trainer/task.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Wrapper that passes command-line inputs to the actual trainer, for either Cloud ML or GCE ML.
5 | Supports both python2 and python3.
6 |
7 | Configure `trainer_configs/trainer_config.yaml` before running the model.
8 | All parameters and the network architecture are configured in that file!
9 |
10 | ----------
11 | Created on Mon Oct 19 2017
12 | @author: Ming Zhao
13 | """
14 | from __future__ import absolute_import, division, print_function
15 | import argparse
16 | import time, os
17 | import yaml
18 | from tensorflow.python.lib.io import file_io
19 | from tensorflow import __version__ as tf_version
20 |
21 | from trainer import mnist_autoencoder_deconv_simple
22 |
23 |
24 | if __name__ == '__main__':
25 |     # test write permission by writing something to disk
26 |     with open('testing_output.txt', 'w+') as f:
27 |         f.write('Hello World!\n')
28 |         f.write('This is a test written by the custom startup script!\n')
29 |         f.write('Success. Yay!\n')
30 |         f.write('Current time = {}'.format(time.ctime()))
31 |     dir_path = os.path.dirname(os.path.realpath(__file__))
32 |     print('Current path: {}'.format(dir_path))
33 |
34 |     # Parse the command-line arguments passed in by the startup script
35 |     parser = argparse.ArgumentParser()
36 |
37 |     # Required Arguments
38 |     parser.add_argument(
39 |         '--job_dir', help='GCS location to write checkpoints and export models', required=True)
40 |
41 |     parser.add_argument(
42 |         '--job_id', help='ID for the training job, passed from instance creation', required=True)
43 |
44 |     parser.add_argument(
45 |         '--config_file', help='A YAML config file that contains all other trainer input parameters', required=True)
46 |
47 |     parser.add_argument(
48 |         '--data_path', help='GS path for training data', required=True)
49 |
50 |
51 |     args = parser.parse_args()
52 |     arguments = args.__dict__
53 |
54 |     ########### Load config file and config parameters ################
55 |     if args.config_file is None:
56 |         raise ValueError('config YAML file must not be None!!!')
57 |     if file_io.file_exists(args.config_file) is not True:
58 |         # use tf's file_io so both GS and local files work
59 |         raise ValueError('config file does not exist!!! {}'.format(args.config_file))
60 |
61 |     with file_io.FileIO(args.config_file, 'r') as f:  # This reads BOTH local files and GS bucket files!!!
62 |         config = yaml.safe_load(f)  # safe_load avoids executing arbitrary YAML tags
63 |
64 |     # the actual training happens here
65 |     mnist_autoencoder_deconv_simple.train(job_dir=args.job_dir, job_id=args.job_id,
66 |                                           data_path=args.data_path, **config)
67 |
--------------------------------------------------------------------------------
/gce_scripts/gce_ml_custom_startup.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 |
3 | sleep 30
4 | echo "Running startup script ..."
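# Note: every metadata value read below was attached to the instance by
# gce_scripts/create_gce_ml_instance.sh (its --metadata key=value flags) and is served
# back here by the Compute Engine metadata server. This startup script runs as root at
# boot, which is why the setup instructions require all Python packages to be installed as root.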
5 | 6 | ### Metadata specification 7 | # All this metadata is pulled from the Compute Engine instance metadata server 8 | 9 | # Your account username on the GCE instance 10 | GCE_USER=$(curl http://metadata.google.internal/computeMetadata/v1/instance/attributes/gce_user -H "Metadata-Flavor: Google") 11 | 12 | # Trainer Job name or ID in meta data 13 | export JOB_ID=$(curl http://metadata.google.internal/computeMetadata/v1/instance/attributes/job_id -H "Metadata-Flavor: Google") 14 | 15 | # DIR where all JOB data will reside 16 | export JOB_DIR=$(curl http://metadata.google.internal/computeMetadata/v1/instance/attributes/job_dir -H "Metadata-Flavor: Google") 17 | 18 | # trainer module name 19 | export TRAINER_MODULE_NAME=$(curl http://metadata.google.internal/computeMetadata/v1/instance/attributes/trainer_module_name -H "Metadata-Flavor: Google") 20 | 21 | # TRAINER PACKAGE_PATH 22 | export PACKAGE_PATH=$(curl http://metadata.google.internal/computeMetadata/v1/instance/attributes/package_path -H "Metadata-Flavor: Google") 23 | 24 | # TRAINER_CONFIG_FILE 25 | export TRAINER_CONFIG_FILE=$(curl http://metadata.google.internal/computeMetadata/v1/instance/attributes/trainer_config_file -H "Metadata-Flavor: Google") 26 | 27 | # TRAIN_DATA_PATH 28 | export TRAIN_DATA_PATH=$(curl http://metadata.google.internal/computeMetadata/v1/instance/attributes/train_data_path -H "Metadata-Flavor: Google") 29 | 30 | # KEEP_ALIVE=True then instance won't shut down after training is complete 31 | KEEP_ALIVE=$(curl http://metadata.google.internal/computeMetadata/v1/instance/attributes/keep_alive -H "Metadata-Flavor: Google") 32 | 33 | 34 | cd "/home/$GCE_USER" 35 | pwd 36 | echo "Current path: $(pwd)" 37 | echo "Python path: $(which python)" 38 | echo "Python Version:" 39 | echo "$(python -V)" 40 | 41 | # Install pip if not installed 42 | if (which pip | grep pip &> /dev/null) ;then 43 | echo "Found pip" 44 | else 45 | echo "pip not found! Installing pip" 46 | apt-get --assume-yes install python-pip python-dev build-essential 47 | fi 48 | 49 | 50 | # Download trainer package from gs:// 51 | echo "Downloading package: ${JOB_DIR}/${JOB_ID}/package/" 52 | gsutil cp -r ${JOB_DIR}/${JOB_ID}/package/ ./ 53 | 54 | # Download config file 55 | echo "Downloading config file: ${JOB_DIR}/${JOB_ID}/config/${TRAINER_CONFIG_FILE}" 56 | gsutil cp ${JOB_DIR}/${JOB_ID}/config/${TRAINER_CONFIG_FILE} ./ 57 | # send log to GS 58 | gsutil cp -r /var/log/syslog ${JOB_DIR}/${JOB_ID}/logs/ 59 | 60 | sleep 3 61 | pip install ./package/dist/*.tar.gz 62 | 63 | 64 | 65 | ######### Now run the python trainer job ############## 66 | echo "Now running custom trainer job : ${TRAINER_MODULE_NAME} ..." 67 | # send log to GS 68 | gsutil cp -r /var/log/syslog ${JOB_DIR}/${JOB_ID}/logs/ 69 | 70 | sudo -u $GCE_USER python -m $TRAINER_MODULE_NAME --job_dir ${JOB_DIR}/${JOB_ID}/ --job_id ${JOB_ID} --config_file ${TRAINER_CONFIG_FILE} --data_path ${TRAIN_DATA_PATH} 71 | 72 | echo "Training job finished! " 73 | echo 74 | 75 | 76 | ### Once the job has completed, keep alive for $KEEP_ALIVE seconds, 77 | ### then shut down the Compute Engine instance 78 | echo "Sleeping for $KEEP_ALIVE seconds, then shut down." 
79 | # send log to GS 80 | gsutil cp -r /var/log/syslog ${JOB_DIR}/${JOB_ID}/logs/ 81 | 82 | sleep $KEEP_ALIVE 83 | 84 | # send log to GS 85 | gsutil cp -r /var/log/syslog ${JOB_DIR}/${JOB_ID}/logs/ 86 | 87 | sudo shutdown -h now 88 | -------------------------------------------------------------------------------- /gce_scripts/create_gce_ml_instance.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | POSITIONAL=() 4 | while [[ $# -gt 0 ]] 5 | do 6 | key="$1" 7 | 8 | case $key in 9 | ####### Instance parameters ####### 10 | --gce_username) 11 | GCE_USER="$2" 12 | shift # past argument 13 | shift # past value 14 | ;; 15 | --project) 16 | PROJ_NAME="$2" 17 | shift # past argument 18 | shift # past value 19 | ;; 20 | --instance) 21 | INSTANCE_NAME="$2" 22 | shift # past argument 23 | shift # past value 24 | ;; 25 | --image) 26 | BOOT_IMAGE="$2" 27 | shift # past argument 28 | shift # past value 29 | ;; 30 | --image-project) 31 | image_project="$2" 32 | shift # past argument 33 | shift # past value 34 | ;; 35 | --machine-type) 36 | machine_type="$2" 37 | shift # past argument 38 | shift # past value 39 | ;; 40 | --maintenance-policy) 41 | maintenance_policy="$2" 42 | shift # past argument 43 | shift # past value 44 | ;; 45 | --accelerator) 46 | accelerator="$2" 47 | shift # past argument 48 | shift # past value 49 | ;; 50 | --min-cpu-platform) 51 | min_cpu_platform="$2" 52 | shift # past argument 53 | shift # past value 54 | ;; 55 | --tags) 56 | tags="$2" 57 | shift # past argument 58 | shift # past value 59 | ;; 60 | --boot-disk-size) 61 | boot_disk_size="$2" 62 | shift # past argument 63 | shift # past value 64 | ;; 65 | --job_id) 66 | job_id="$2" 67 | shift # past argument 68 | shift # past value 69 | ;; 70 | --job_dir) 71 | job_dir="$2" 72 | shift # past argument 73 | shift # past value 74 | ;; 75 | --trainer_module) 76 | trainer_module_name="$2" 77 | shift # past argument 78 | shift # past value 79 | ;; 80 | --trainer_package_path) 81 | trainer_package_path="$2" 82 | shift # past argument 83 | shift # past value 84 | ;; 85 | --zone) # Optional 86 | ZONE="$2" 87 | shift # past argument 88 | shift # past value 89 | ;; 90 | --keep_alive) # Optional 91 | KEEP_ALIVE="$2" 92 | shift # past argument 93 | ;; 94 | ####### model parameters ####### 95 | --trainer_config) 96 | trainer_config_file="$2" 97 | shift # past argument 98 | shift # past value 99 | ;; 100 | --train_data_path) 101 | train_data_path="$2" 102 | shift # past argument 103 | shift # past value 104 | ;; 105 | *) # unknown option 106 | POSITIONAL+=("$1") # save it in an array for later 107 | shift # past argument 108 | ;; 109 | esac 110 | done 111 | set -- "${POSITIONAL[@]}" # restore positional parameters 112 | 113 | ZONE="${ZONE:-us-east1-c}" 114 | 115 | KEEP_ALIVE="${KEEP_ALIVE:-0}" 116 | 117 | 118 | echo 119 | echo "Input GCE Parameters:" 120 | echo "-------------------------------------------------" 121 | echo "New Instance Name = ${INSTANCE_NAME}" 122 | echo "Unique Job ID = ${job_id}" 123 | echo "Job Dir on GS = ${job_dir}" 124 | echo "GCE USER NAME = ${GCE_USER} (username for your account)" 125 | echo "PROJ_NAME = ${PROJ_NAME}" 126 | echo "Boot Image Name = ${BOOT_IMAGE}" 127 | echo "Boot Image Project = ${image_project}" 128 | echo "Machine Type = ${machine_type}" 129 | echo "GPU (accelerator) = ${accelerator}" 130 | echo "Boot Disk Size = ${boot_disk_size}" 131 | echo "min_cpu_platform = ${min_cpu_platform}" 132 | echo "ZONE = ${ZONE} (us-east1-c is recommended)" 133 | 
echo "KEEP_ALIVE when done = ${KEEP_ALIVE} (seconds to keep alive when done, then shutdown)" 134 | echo 135 | echo 136 | echo "Input Trainer Parameters:" 137 | echo "-------------------------------------------------" 138 | echo "trainer_module_name = ${trainer_module_name} (trainer module to execute)" 139 | echo "trainer_package_path = ${trainer_package_path} (local path)" 140 | echo "train_config_file = ${trainer_config_file}" 141 | echo "data_path on GS = ${train_data_path}" 142 | echo 143 | 144 | while true; do 145 | read -p "Please check your inputs. Ready to proceed? (yes/no, y/n)" yn 146 | case $yn in 147 | [Yy]* ) break;; 148 | [Nn]* ) exit;; 149 | * ) echo "Please answer yes or no.";; 150 | esac 151 | done 152 | echo 153 | 154 | 155 | 156 | ############# CHECKING AND SETTING PARAMETERS ##################### 157 | 158 | # Check startup.sh script 159 | STARTUP="./gce_scripts/gce_ml_custom_startup.sh" 160 | if !ls ${STARTUP} &> /dev/null 161 | then 162 | echo "STARTUP.sh NOT FOUND! Make sure you have '$STARTUP'. EXIT" 163 | exit 3 # local file not found error 164 | fi 165 | 166 | # set default compute zone. Best choice = us-east1-c (as of 12/2017) 167 | gcloud config set compute/zone $ZONE &> /dev/null 168 | 169 | # Original project the current console was in 170 | original_proj=$(gcloud config get-value project) 171 | echo "Current Project = $original_proj" 172 | if [ "$original_proj" != "$PROJ_NAME" ] 173 | then 174 | # set project id for instance 175 | gcloud config set project $PROJ_NAME &> /dev/null 176 | echo "Switching to project = $PROJ_NAME" 177 | fi 178 | 179 | # Check network and subnet 180 | if gcloud compute networks list --filter="$PROJ_NAME-net" | grep "$PROJ_NAME-net" &> /dev/null 181 | then 182 | network=$PROJ_NAME-net 183 | echo "Found network: $network" 184 | else 185 | echo "Network NOT found: $network! EXIT" ; 186 | exit 5 # network not found 187 | fi 188 | 189 | if gcloud compute networks subnets list --filter="network:$PROJ_NAME-net" | grep "$PROJ_NAME" &> /dev/null 190 | then 191 | subnet=$(gcloud compute networks subnets list --filter="network:$PROJ_NAME-net" | grep -m 1 "$PROJ_NAME" | cut -d' ' -f 1 | head -1) 192 | echo "Found subnet: $subnet" 193 | else 194 | echo "Subnet NOT found! EXIT" ; 195 | exit 5 # network not found 196 | fi 197 | 198 | 199 | # Check image Source 200 | if gcloud compute --project "$image_project" images list --filter=$BOOT_IMAGE | grep $BOOT_IMAGE &> /dev/null 201 | then echo "Found boot image: $BOOT_IMAGE" 202 | else 203 | echo "BOOT IMAGE $BOOT_IMAGE DOES NOT EXIST IN PROJECT $image_project !!!" 
204 | exit 1 # boot image not found error 205 | fi 206 | 207 | # check data path on GS 208 | if gsutil ls $train_data_path &> /dev/null 209 | then echo "Found data path: $train_data_path" 210 | else 211 | echo "GS DATA OR PATH NOT FOUND: $train_data_path" 212 | exit 2 # GS file not found error 213 | fi 214 | 215 | # check trainer package 216 | if ls ${trainer_package_path} &> /dev/null 217 | then echo "Found trainer package in: $trainer_package_path " 218 | else 219 | echo "TRAINER PACKAGE NOT FOUND: $trainer_package_path" 220 | exit 3 # local file not found error 221 | fi 222 | 223 | # check trainer config file 224 | if (ls $trainer_config_file) &> /dev/null 225 | then echo "Found trainer config file : $trainer_config_file" 226 | else 227 | echo "TRAINER CONFIG FILE NOT FOUND : $trainer_config_file" 228 | exit 3 # local file not found 229 | fi 230 | 231 | # If given instance is existant and running, then stop; elif existant then restart; else create instances 232 | if (gcloud compute instances list --filter="name=$INSTANCE_NAME AND -status=TERMINATED" | grep $INSTANCE_NAME &> /dev/null) &> /dev/null 233 | then 234 | echo "INSTANCE $INSTANCE_NAME ALREADY EXISTS AND IS ACTIVE. EXIT." 235 | exit 4 # Instance exists and is already active 236 | #restart=true 237 | #create=false 238 | elif (gcloud compute instances list --filter="name=$INSTANCE_NAME AND status=TERMINATED" | grep $INSTANCE_NAME &> /dev/null) &> /dev/null 239 | then 240 | echo "INSTANCE ALREADY EXISTS BUT TERMINATED : $INSTANCE_NAME " 241 | echo "Solutions: change the instance name, or delete your existing instance using 'gcloud compute instances delete $INSTANCE_NAME' " 242 | echo 243 | exit 244 | 245 | restart=true 246 | create=false 247 | else 248 | echo "INSTANCE DOES NOT EXIST YET. WILL CREATE : $INSTANCE_NAME" 249 | create=true 250 | restart=false 251 | fi 252 | 253 | 254 | 255 | ################# Build and upload package ##################### 256 | echo 257 | echo 'Building and uploading package ...' 258 | pushd $trainer_package_path &> /dev/null 259 | python setup.py -q sdist --formats=gztar 260 | 261 | gsutil cp -r dist/ ${job_dir}/${job_id}/package/ 262 | 263 | rm -rf dist *egg-info 264 | 265 | popd &> /dev/null # go back to parent dir 266 | echo 267 | 268 | # upload config yaml file, and trim path from filename for GCE 269 | gsutil cp ${trainer_config_file} ${job_dir}/${job_id}/config/ 270 | trainer_config_file_base=$(basename $trainer_config_file) 271 | echo 272 | 273 | 274 | 275 | ################# Create or restart instance ################## 276 | 277 | #if [ $create == false ] && [ $restart == true ] 278 | #then 279 | # Adding or update metadata 280 | # echo '--> Updating metadata ... ' 281 | # gcloud compute instances add-metadata $INSTANCE_NAME \ 282 | #--metadata job_id=$job_id,\ 283 | #job_dir=$job_dir,\ 284 | #trainer_module_name=$trainer_module_name,\ 285 | #trainer_package_path=$trainer_package_path,\ 286 | #train_config_file=$trainer_config_file,\ 287 | #train_data_path=$train_data_path,\ 288 | #keep_alive=$KEEP_ALIVE \ 289 | #--metadata-from-file startup-script=startup.sh # can't have space at the beginning for broken lines 290 | 291 | # echo '--> Restarting instance ...' 292 | # gcloud compute instances start $INSTANCE_NAME 293 | 294 | if [ $create == true ] && [ $restart == false ] 295 | then 296 | if [ -z ${accelerator+x} ] # if var accelerator is set 297 | then 298 | echo '--> Creating new instance WITHOUT GPU...' 
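# The --metadata pairs below (job_id, job_dir, trainer_module_name, package_path,
# trainer_config_file, train_data_path, gce_user, keep_alive) are exactly what
# gce_ml_custom_startup.sh reads back from the metadata server on the new instance;
# the startup script itself is attached via --metadata-from-file and runs at boot.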
299 | 300 | gcloud compute --project "$PROJ_NAME" instances create "$INSTANCE_NAME" --image "$BOOT_IMAGE" \ 301 | --network "$network" --subnet "$subnet" --zone "$ZONE" \ 302 | --scopes "https://www.googleapis.com/auth/cloud-platform" \ 303 | --maintenance-policy "$maintenance_policy" --tags "https-server" --image-project=$image_project \ 304 | --machine-type "$machine_type" --min-cpu-platform "$min_cpu_platform" \ 305 | --boot-disk-size="$boot_disk_size" --boot-disk-type="pd-standard" \ 306 | --boot-disk-device-name="$INSTANCE_NAME" \ 307 | --metadata job_id=$job_id,\ 308 | job_dir=$job_dir,\ 309 | trainer_module_name=$trainer_module_name,\ 310 | package_path=$trainer_package_path,\ 311 | trainer_config_file=$trainer_config_file_base,\ 312 | train_data_path=$train_data_path,\ 313 | gce_user=$GCE_USER,\ 314 | keep_alive=$KEEP_ALIVE \ 315 | --metadata-from-file startup-script=$STARTUP # can't have space at the beginning for broken lines 316 | 317 | else 318 | echo '--> Creating new instance with GPU ...' 319 | 320 | gcloud compute --project "$PROJ_NAME" instances create "$INSTANCE_NAME" --image "$BOOT_IMAGE" \ 321 | --network "$network" --subnet "$subnet" --zone "$ZONE" \ 322 | --scopes "https://www.googleapis.com/auth/cloud-platform" \ 323 | --maintenance-policy "$maintenance_policy" --tags "https-server" --image-project=$image_project \ 324 | --machine-type "$machine_type" --min-cpu-platform "$min_cpu_platform" \ 325 | --boot-disk-size="$boot_disk_size" --boot-disk-type="pd-standard" \ 326 | --boot-disk-device-name="$INSTANCE_NAME" \ 327 | --accelerator $accelerator \ 328 | --metadata job_id=$job_id,\ 329 | job_dir=$job_dir,\ 330 | trainer_module_name=$trainer_module_name,\ 331 | package_path=$trainer_package_path,\ 332 | trainer_config_file=$trainer_config_file_base,\ 333 | train_data_path=$train_data_path,\ 334 | gce_user=$GCE_USER,\ 335 | keep_alive=$KEEP_ALIVE \ 336 | --metadata-from-file startup-script=$STARTUP # can't have space at the beginning for broken lines 337 | 338 | fi 339 | fi 340 | 341 | echo 342 | echo "For degbugging, try 'gcloud compute ssh $INSTANCE_NAME' to log in to the created instance. \ 343 | You must debug as root by typing 'sudo -s'. Startup logs can be found with 'cat /var/log/syslog | grep startup-script' " 344 | echo "To stream startup logs, try 'gcloud compute instances tail-serial-port-output $INSTANCE_NAME --port 1' " 345 | echo 346 | 347 | # set project for instance 348 | #gcloud config set project $original_proj 349 | #echo "Switching BACK to original project = $PROJoriginal_proj_NAME" 350 | -------------------------------------------------------------------------------- /package_example/trainer/mnist_autoencoder_deconv_simple.py: -------------------------------------------------------------------------------- 1 | '''This script demonstrates how to build a convolutional autoencoder 2 | with Keras and deconvolution layers. It can be run on google's Cloud ML. 
3 | 4 | This code borrowed and extended the example from: https://blog.keras.io/building-autoencoders-in-keras.html 5 | 6 | Ming Zhao, August 10, 2017 7 | ''' 8 | import time, argparse 9 | import numpy as np 10 | 11 | from keras.layers import Input, Dense, Lambda, Reshape 12 | from keras.layers import Conv2D, Conv2DTranspose, AveragePooling2D, UpSampling2D, BatchNormalization, Activation, MaxPooling2D, Cropping2D 13 | from keras.models import Model 14 | 15 | from keras.optimizers import Adam, SGD 16 | from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard 17 | 18 | from keras.datasets import mnist 19 | from tensorflow.python.lib.io import file_io 20 | 21 | import matplotlib 22 | matplotlib.use('PDF') 23 | from matplotlib import pyplot as plt 24 | 25 | 26 | #%% 27 | 28 | def conv_block(x, n_channels, kernel_size=3, padding='same', activation='relu', 29 | dilation_rate=1, batch_norm=True, use_transposed_conv=False): 30 | '''A 2D convolution block with conv2D (or deconv), relu, batchnorm. 31 | In principle batch_norm should be applied before non-linear activation. However, it 32 | has become a trend to have a batch norm layer after an activation layer 33 | (see: https://github.com/ducha-aiki/caffenet-benchmark/blob/master/batchnorm.md , 34 | https://stackoverflow.com/questions/34716454/where-do-i-call-the-batchnormalization-function-in-keras , 35 | https://github.com/fchollet/keras/issues/1802 ) 36 | ''' 37 | global do_batch_norm_before_activation 38 | 39 | if use_transposed_conv is True: 40 | conv = Conv2DTranspose 41 | else: 42 | conv = Conv2D 43 | 44 | if batch_norm is True: 45 | # use_bias = True if uses batch_norm 46 | if do_batch_norm_before_activation is True: 47 | conv_layer = conv(n_channels, kernel_size=kernel_size, use_bias=False, 48 | padding=padding, dilation_rate = dilation_rate)(x) 49 | conv_layer = BatchNormalization()(conv_layer) 50 | conv_layer = Activation(activation=activation)(conv_layer) 51 | 52 | else: 53 | conv_layer = conv(n_channels, kernel_size=kernel_size, activation=activation, use_bias=False, 54 | padding=padding, dilation_rate = dilation_rate)(x) 55 | conv_layer = BatchNormalization()(conv_layer) 56 | else: 57 | conv_layer = conv(n_channels, kernel_size=kernel_size, activation=activation, use_bias=True, 58 | padding=padding, dilation_rate = dilation_rate)(x) 59 | return conv_layer 60 | 61 | 62 | 63 | def pool_layer(x, method='max', pool_size=(2,2), padding='same'): 64 | if method == 'max': 65 | return MaxPooling2D(pool_size, padding=padding)(x) 66 | else: 67 | return AveragePooling2D(pool_size, padding=padding)(x) 68 | 69 | 70 | 71 | def train(job_dir=None, job_id=None, 72 | use_transposed_conv=True, score_metric='mse', loss='binary_crossentropy', 73 | learning_rate = 0.001, lr_decay=0.001, optimizer_name='adam', n_epochs=100, 74 | patience=5, batch_norm_before_activation=True, pool_method='max', **kwargs): 75 | '''main training function''' 76 | 77 | global do_batch_norm_before_activation 78 | do_batch_norm_before_activation = batch_norm_before_activation 79 | print('--> batch_norm_before_activation== {}!!!\n'.format(do_batch_norm_before_activation)) 80 | 81 | # input image dimensions 82 | img_rows, img_cols, img_chns = 28, 28, 1 83 | original_img_size = (img_rows, img_cols, img_chns) 84 | 85 | n_filters = 8 # number of convolutional filters to use 86 | kernel_size = 3 # convolution kernel size 87 | batch_size = 1000 88 | 89 | lr = learning_rate 90 | decay = lr_decay 91 | opt = optimizer_name 92 | 93 | if job_dir is None: 94 | job_dir = './tmp/' 
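        # NOTE: when running locally, this relative directory is assumed to exist already
        # (e.g., create it with `mkdir tmp`); otherwise the figure saves further down will fail.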
95 | 96 | if job_id is None: 97 | ctime = time.ctime().split() 98 | time_str = ctime[4]+ctime[1]+ctime[2]+"_"+''.join(ctime[3].split(":")[0:2]) 99 | job_id = time_str 100 | checkpoint_filename = 'mnist_autoencoder_checkpoint_{}.hdf5'.format(job_id) 101 | else: 102 | checkpoint_filename = '{}.hdf5'.format(job_id) 103 | 104 | if use_transposed_conv is True: 105 | print('--> use_transposed_conv is True!!!\n') 106 | job_id = 'transposed_conv_' + job_id 107 | 108 | 109 | ############ Encoder ############### 110 | x = Input(shape=original_img_size) # reshape to: (100, 28, 28, 1) 111 | 112 | conv1 = conv_block(x, n_filters, kernel_size) 113 | conv1 = conv_block(conv1, n_filters, kernel_size) 114 | conv1 = pool_layer(conv1, method=pool_method, pool_size=(2,2), padding='same') 115 | 116 | conv2 = conv_block(conv1, n_filters*2, kernel_size) 117 | conv2 = conv_block(conv2, n_filters*2, kernel_size) 118 | conv2 = pool_layer(conv2, method=pool_method, pool_size=(2,2), padding='same') 119 | 120 | conv3 = conv_block(conv2, n_filters*4, kernel_size) 121 | conv3 = conv_block(conv3, n_filters*4, kernel_size) 122 | encoded = pool_layer(conv3, method=pool_method, pool_size=(2,2), padding='same') 123 | 124 | # End of encoder. The compressed representation is (4, 4, 8) 125 | 126 | conv4 = conv_block(encoded, n_filters*4, kernel_size, use_transposed_conv=use_transposed_conv) 127 | conv4 = conv_block(conv4, n_filters*4, kernel_size, use_transposed_conv=use_transposed_conv) 128 | conv4 = UpSampling2D((2, 2))(conv4) 129 | 130 | conv5 = conv_block(conv4, n_filters*2, kernel_size, use_transposed_conv=use_transposed_conv) 131 | conv5 = conv_block(conv5, n_filters*2, kernel_size, use_transposed_conv=use_transposed_conv) 132 | conv5 = UpSampling2D((2, 2))(conv5) 133 | 134 | conv6 = conv_block(conv5, n_filters, kernel_size, use_transposed_conv=use_transposed_conv) 135 | conv6 = conv_block(conv6, n_filters, kernel_size, use_transposed_conv=use_transposed_conv) 136 | conv6 = UpSampling2D((2, 2))(conv6) 137 | 138 | decoded = conv_block(conv6, 1, kernel_size=kernel_size, activation='sigmoid', padding='same', 139 | batch_norm=False, use_transposed_conv=use_transposed_conv) # the activation here is sigmoid b/c the pixel values are bounded b/w 0-1, and there are lots of 0s 140 | decoded = Cropping2D(cropping=((2, 2), (2, 2)))(decoded) # crop 2 on each side of the img to get 28x28 141 | 142 | # Put all layers together into a model graph 143 | autoencoder = Model(x, decoded) 144 | 145 | ######### End of decoder ################### 146 | ######### Now config models for training and logging ########## 147 | 148 | if opt =='adam': 149 | optimizer = Adam(lr=lr, decay=decay) 150 | elif opt =='sgd': 151 | optimizer = SGD(lr=lr, momentum=0.9, decay=decay, nesterov=True) 152 | 153 | autoencoder.compile(optimizer=optimizer, loss = loss, metrics = [score_metric]) 154 | autoencoder.summary() 155 | 156 | 157 | # data from MNIST digits 158 | (x_train, _), (x_test, y_test) = mnist.load_data() 159 | 160 | # reshape data to (data_size, n_pix, n_pix, n_channels) 161 | x_train = x_train.astype('float32') / 255. 162 | x_train = x_train.reshape((x_train.shape[0],) + original_img_size) 163 | x_test = x_test.astype('float32') / 255. 
164 | x_test = x_test.reshape((x_test.shape[0],) + original_img_size) 165 | 166 | print('x_train.shape:', x_train.shape) 167 | 168 | callbacks = [EarlyStopping(monitor='val_loss',patience=5,verbose=2, mode='min', min_delta=0.0005), 169 | ModelCheckpoint(checkpoint_filename, monitor='val_loss', verbose=2, save_best_only=True), 170 | TensorBoard(log_dir=job_dir)] 171 | 172 | history = autoencoder.fit(x=x_train, y=x_train, shuffle=True, epochs=n_epochs, batch_size=batch_size, callbacks=callbacks, 173 | verbose=2, validation_data=(x_test, x_test)) 174 | 175 | test_score = autoencoder.evaluate(x_test, x_test, verbose=0, batch_size=batch_size) 176 | print('Final test score:', test_score) 177 | 178 | if score_metric == 'mae': 179 | history_key_validation = 'val_mean_absolute_error' 180 | history_key_train = 'mean_absolute_error' 181 | if score_metric == 'mse': 182 | history_key_validation = 'val_mean_squared_error' 183 | history_key_train = 'mean_squared_error' 184 | 185 | validation_history = history.history[history_key_validation] 186 | training_history = history.history[history_key_train] 187 | 188 | 189 | # Save model to gs 190 | if 'gs://' in job_dir: 191 | # Save model hdf5 to google storage 192 | with file_io.FileIO(checkpoint_filename, mode='rb') as input_f: 193 | with file_io.FileIO(job_dir + checkpoint_filename, mode='w') as output_f: 194 | output_f.write(input_f.read()) 195 | 196 | 197 | #%% Plot a learning curve 198 | fig_name = 'lr_{}.pdf'.format(job_id) # 199 | if 'gs://' not in job_dir: 200 | fig_name = job_dir + fig_name 201 | 202 | f, axes = plt.subplots(2, sharex=True, figsize=(8,7)) 203 | axes[0].plot(training_history) 204 | axes[0].set_ylabel('Training score ({})'.format(score_metric)) 205 | axes[0].set_title('Final test score ({0}) = {1:2.4f}\n LR={2}, decay={3}, optimizer={4}, pool_method={5}\n \ 206 | use_transposed_conv={6}, loss={7}, batch_norm_before_activation={8}'.format(score_metric, 207 | test_score[1], lr, decay, opt, pool_method, use_transposed_conv, loss, do_batch_norm_before_activation), fontsize=9) 208 | 209 | axes[1].plot(validation_history) 210 | axes[1].set_xlabel('Epochs') 211 | axes[1].set_ylabel('Validation score ({})'.format(score_metric)) 212 | #f.suptitle('Config file :{}'.format(train_config_file), fontsize=10) 213 | f.subplots_adjust(hspace=0.05) 214 | f.savefig(fig_name) 215 | 216 | if 'gs://' in job_dir: 217 | #Save figure to GS 218 | with file_io.FileIO(fig_name, mode='rb') as input_f: 219 | with file_io.FileIO(job_dir + fig_name, mode='w') as output_f: 220 | output_f.write(input_f.read()) 221 | 222 | 223 | #%% Sample a few test images and compare with reconstructed ones 224 | n_imgs_to_show = 30 225 | x_test_sub = np.random.permutation(x_test)[0:n_imgs_to_show] 226 | 227 | reconstructed_test = autoencoder.predict(x_test_sub, batch_size=n_imgs_to_show) 228 | #reconstructed_train = autoencoder.predict(x_train[0:10000].reshape((10000, 28, 28, 1)), batch_size=batch_size) 229 | 230 | # plot reconstructed images and compare 231 | fig_name = 'compare_{}.pdf'.format(job_id) 232 | if 'gs://' not in job_dir: 233 | fig_name = job_dir + fig_name 234 | 235 | n_rows = 3 # split orignal images into 2 rows 236 | n_cols = n_imgs_to_show//n_rows 237 | f, axes = plt.subplots(n_rows*2, n_cols, sharey=True, figsize=(10,10)) 238 | 239 | for i in range(n_imgs_to_show): 240 | axes[i//n_cols * 2, i % n_cols].imshow(x_test_sub[i,:,:,0]) 241 | axes[i//n_cols * 2, 0].set_ylabel('Original') 242 | axes[i//n_cols * 2 +1, i % n_cols].imshow(reconstructed_test[i,:,:,0]) 243 | 
axes[i//n_cols * 2 +1, 0].set_ylabel('Reconstructed') 244 | f.savefig(fig_name) 245 | 246 | if 'gs://' in job_dir: 247 | #Save figure to GS 248 | with file_io.FileIO(fig_name, mode='rb') as input_f: 249 | with file_io.FileIO(job_dir + fig_name, mode='w') as output_f: 250 | output_f.write(input_f.read()) 251 | 252 | 253 | #%% 254 | 255 | if __name__ == '__main__': 256 | 257 | parser = argparse.ArgumentParser() 258 | 259 | # This argument is required by GC 260 | parser.add_argument( 261 | '--job_dir', help='GCS location to write checkpoints and export models', default=None) 262 | 263 | parser.add_argument('--job_id', help='Job ID to tag models', default=None) 264 | 265 | parser.add_argument('--use_transposed_conv', help='Use "deconv" layers or transposed conv layers for "deconv"', 266 | action='store_true', default=False) 267 | 268 | parser.add_argument('--score_metric', help='Metric for scoring: mse, mae, binary_entropy, etc.', 269 | default='mse') 270 | 271 | parser.add_argument('--loss', help='Loss function: mse, mae, binary_crossentropy, etc.', 272 | default='binary_crossentropy') 273 | 274 | parser.add_argument('-lr', '--learning_rate', help='learning rate', 275 | default=0.001, type=float) 276 | 277 | parser.add_argument('--lr_decay', help='Learning rate decay (e.g., linear decay for Adma)', 278 | default=0.001, type=float) 279 | 280 | parser.add_argument('-opt','--optimizer_name', help='optimizer function name', 281 | default='adam') 282 | 283 | parser.add_argument('--n_epochs', help='Number of epochs', 284 | default=100, type=int) 285 | 286 | parser.add_argument('--patience', help='Number of epochs to wait before early stopping', 287 | default=5, type=int) 288 | 289 | parser.add_argument('--pool_method', help='Pooling method, either "max" or "average"', 290 | default="max") 291 | 292 | args = parser.parse_args() 293 | arguments = args.__dict__ 294 | 295 | train(**arguments) 296 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # gce_ML 2 | This small package makes ML model training with Google Compute Engine (GCE) easy with simple model submission and automatic VM instance management. It is very similar to Google's [Cloud ML Engine](https://cloud.google.com/ml-engine/docs/technical-overview) but offers more flexibility for customization (e.g., GPU, CPU, and RAM configurations) and debugging. 3 | 4 | ### Why use this package? 5 | There are three primary advantages: 6 | 7 | 1. You can *pre-build your own image* with all the libraries you need, so *what you have in the cloud is what you have locally*. This could save you some headaches with different runtime versions provided by the Cloud ML Engine, and also shortens your instance startup time and the debugging cycle. 8 | 9 | 2. *Debugging is more straightforward than Cloud ML.* You can log in to the instance to debug and diagnose any problems in the cloud environment (e.g., with specific GPUs and memory and so on), instead of having to debug locally, re-submit your job, wait for 6-10 min for a new instance to start, and iterate. 10 | 11 | 3. *More affordable GPU power and VM resources.* Using GCE's [Preemptible VM](https://cloud.google.com/preemptible-vms/) instances can significantly reduce cost (50% off), as long as you save your model frequently and do not need your instances to run continuously. 
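Note that `gce_scripts/create_gce_ml_instance.sh` does not request preemptible VMs by default. If you want to try them, one possible tweak (not something the scripts currently do for you) is to add gcloud's `--preemptible` flag to the two `gcloud compute instances create` calls in that script. Because a preemptible VM can be stopped at any time, make sure your trainer checkpoints frequently (the example model saves a checkpoint whenever the validation loss improves).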
Even better, Google just announced [Preemptible GPUs](https://cloudplatform.googleblog.com/2018/01/introducing-preemptible-gpus-50-off.html), which will make GPUs more affordable as well. According to Google: "You can now attach NVIDIA K80 and NVIDIA P100 GPUs to Preemptible VMs for $0.22 and $0.73 per GPU hour, respectively. This is 50% cheaper than GPUs attached to on-demand instances, ..." 12 | 13 | Setting up the package takes some time, but once you correctly configure it, you can submit and train your models in the same way as in Cloud ML. 14 | 15 | ### Prerequisite 16 | + Google Cloud Platform account and billing enabled. Follow this [link](https://cloud.google.com/ml-engine/docs/command-line) 17 | - Select or create a project on Google Cloud Platform 18 | - For first time users, enable billing. You can sign up for a free trial with $300 credits. 19 | - Install the [Google Cloud SDK](https://cloud.google.com/sdk/docs/quickstart-mac-os-x#before-you-begin). We will use the `gcloud` CLI for our tasks. 20 | - Initialize your gcloud environment at command line: ** `gcloud init` ** 21 | + set up your email account and region (us-east is among the cheapest). 22 | - For more details: here is an [overview of the Cloud ML Engine](https://cloud.google.com/ml-engine/docs/concepts/technical-overview) 23 | - **Note**: Please follow the exact folder structure when making your own cloud ML package after trying this example. Change `setup.py` and `trainer.task.py` accordingly. For details, check [here](https://cloud.google.com/ml-engine/docs/images/recommended-project-structure.png) and [here](https://cloud.google.com/ml-engine/docs/how-tos/packaging-trainer) 24 | 25 | 26 | ### Create a GCE instance and then a custom boot image 27 | We will create a new GCE virtual machine instance using a public image and customize it. You can then create GCE instances using pre-existing custom images later. *This instruction is based on the example and steps from [here (Compute Engine survival training)](https://github.com/GoogleCloudPlatform/ml-on-gcp/tree/master/gce/survival-training) with modifications and a few more details.* 28 | 29 | 1. Create your instance. The easiest way is to use the Cloud Console [here](https://console.cloud.google.com/compute/). Follow this [link](https://cloud.google.com/compute/docs/instances/create-start-instance). 30 | - Feel free to play with the customization of CPUs, memory, disk storage, etc. For GPUs, see below. 31 | - For `Boot disk` image, choose `Ubuntu 16.04 LTS` for this exercise. Other images may cause compatibility issues. A disk size of >=20Gb should be enough. 32 | - To be able to use GPUs, you may need to enable your GPU quota [here](https://console.cloud.google.com/compute/quotas?). 33 | - For `Access scopes`, choose `Allow full access to all Cloud APIs` so your instance can read/write to Cloud Storage. 34 | - For `Firewall`, choose `Allow HTTPS traffic`. 35 | - **Before clicking the `Create` button**, you can click the blue `command line` link located below the `Create` button to see what the full `gcloud` command is for this instance you just configured. **I found this CLI feature a very convenient debugging tool**. 36 | - Now click `Create` to create your VM instance. 37 | - To check your instance: 38 | 39 | $ gcloud compute instances list 40 | 41 | - To log in to your instance: 42 | 43 | $ gcloud compute ssh my_instance --zone=us-east1-c 44 | 45 | 46 | 2. 
Add GPU(s) to your instance
47 |    - As a beginner, try the cheapest GPU configuration by selecting the `1x Tesla K80` GPU.
48 |    - Follow this [link](https://cloud.google.com/compute/docs/gpus/add-gpus) to add GPUs.
49 |    - Now, first initialize your gcloud:
50 |          $ gcloud init
51 |    - Then, log in to your instance and install CUDA drivers:
52 |      + Log in:
53 |            $ gcloud config set project [my-project-id]
54 |            $ gcloud compute ssh [my_instance] --zone=us-east1-c
55 |
56 |      + Follow [this link](https://cloud.google.com/compute/docs/gpus/add-gpus#install-driver-script) to install drivers. Click `UBUNTU` to see the script for `Ubuntu 16.04 LTS - CUDA 8` (TensorFlow does not support other driver versions yet!).
57 |    - Note the [Optimizing GPU performance](https://cloud.google.com/compute/docs/gpus/add-gpus#gpu-performance) section.
58 |    - Next, add the CUDA path to the `LD_LIBRARY_PATH` environment variable in the `~/.bashrc` file on the VM, as per the TensorFlow instructions (also [outlined here](https://github.com/GoogleCloudPlatform/ml-on-gcp/blob/master/gce/survival-training/README-tf-estimator.md#cuda-drivers)). Add the following lines to the end of your `.bashrc`:
59 |
60 |          export CUDA_HOME=/usr/local/cuda-8.0
61 |          export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CUDA_HOME/lib64
62 |
63 |      With that done, run:
64 |
65 |          $ source ~/.bashrc
66 |
67 |      And to verify that the change took effect:
68 |
69 |          $ echo $CUDA_HOME $LD_LIBRARY_PATH
70 |
71 |    - Next, finish the rest of the steps below (also outlined in this [instruction](https://github.com/GoogleCloudPlatform/ml-on-gcp/blob/master/gce/survival-training/README-tf-estimator.md#cudnn-library)):
72 |      + Download cuDNN v6 for CUDA 8.0 from [here](https://developer.nvidia.com/compute/machine-learning/cudnn/secure/v6/prod/8.0_20170307/cudnn-8.0-linux-x64-v6.0-tgz) as required for tensorflow (other versions are not supported). You may need to register an account with NVIDIA first.
73 |      + Upload your downloaded `.tgz` file to your VM instance:
74 |
75 |            $ gcloud compute scp ~/Downloads/cudnn-8.0-linux-x64-v6.0.tgz [your-instance-name]:~/
76 |
77 |      + Next, in your VM instance's terminal:
78 |
79 |            $ tar xvfz cudnn-8.0-linux-x64-v6.0.tgz
80 |            $ sudo cp cuda/include/cudnn.h /usr/local/cuda-8.0/include
81 |            $ sudo cp cuda/lib64/* $CUDA_HOME/lib64
82 |            $ sudo chmod a+r /usr/local/cuda-8.0/include/cudnn.h $CUDA_HOME/lib64/libcudnn*
83 |            $ sudo apt-get install libcupti-dev
84 |
85 |
86 | 3. Create a pre-configured boot image
87 |
88 |    - Custom images allow you to create new instances with the same state you configured, so you do not have to re-install CUDA drivers, Python packages, etc. That is, you only need to do these configurations and setups once.
89 |
90 |    - First, install all required **Python (2.7)** packages and libraries in your instance (e.g., pip, numpy, pandas, matplotlib, scipy, sklearn, etc.).
91 |
92 |      + To do this, we must install everything under `root` to make the automatic model submission and training feature work, as the startup script is executed by `sudo`. Now, log in as `root` in your instance by typing:
93 |
94 |            $ sudo -s
95 |
96 |        Now you can install `pip` and other developer tools:
97 |
98 |            $ apt-get --assume-yes install python-pip python-dev build-essential
99 |
100 |        Now, install `tensorflow-gpu` for your GPU-enabled instance:
101 |
102 |            $ pip install tensorflow-gpu==1.4.0
103 |
104 |        Then, you can install other Python libraries one by one, or you can install them all together using a `requirements.txt` file.
105 | 106 | $ pip install -r requirements.txt 107 | 108 | + Once done, enter a `Python` interpreter and verify that your installed packages work: 109 | 110 | > # This is your python environment 111 | > import tensorflow as tf 112 | > tf.Session() 113 | > # you should see messages like: 114 | > # Creating TensorFlow device (/device:GPU:0) -> (device: 0, name: Tesla K80, pci bus id: 0000:00:04.0, compute capability: 3.7) 115 | 116 | 117 | + **NOTE**: here we use the default **Python 2.7** available as `root`. You can install your own Python version as long as you do it under root and it does not require running a `.bashrc` in shell (e.g., **Anaconda's Python distribution requires sourcing a bash script and thus won't work under sudo**). 118 | You can check your set up and python version using: 119 | 120 | $ sudo python -V 121 | 122 | - Now, your instance is fully configured. Exit your instance (`exit`) and get back to your terminal. Stop it to create a boot image. 123 | 124 | $ gcloud compute instances stop my-instance 125 | $ gcloud compute images create my-boot-image --source-disk my-instance-name --source-disk-zone us-east1-c 126 | 127 | That is it! You now have a boot image to create other instances with exactly the same state (GPU configurations, python libraries, etc.). Also, you can [share your image among other projects](https://cloud.google.com/compute/docs/images/sharing-images-across-projects). 128 | 129 | 4. Set up Cloud Storage (GCS) for all your data and files 130 | - Like Cloud ML, we store all data, logs, and model checkpoints on Cloud Storage buckets. You will need to create a bucket for this project beforehand (if you have not). Just follow this [link](https://cloud.google.com/storage/docs/creating-buckets) and create a bucket (e.g., `my_bucket`). 131 | - Upload your data to your GCS bucket for later access. You can upload data to a new bucket or the bucket just created. 132 | - Now, the `JOB_DIR` variable that you will need later is: `gs://project_name/my_bucket/`. Each submitted model automatically creates a subfolder inside it. 133 | 134 | 135 | ### Install and set up gce_ml, then train your models at scale 136 | Finally, we are ready to set up this package and train your models. 137 | 138 | 1. Clone this package to your local directory. 139 | 140 | 2. Set up your model in python 141 | - Use the autoencoder-decoder model in `package_example/` as an example. 142 | - Your training package should be constructed in pretty much the same way as in Cloud ML instances. Follow the exact folder structure listed [here](https://cloud.google.com/ml-engine/docs/images/recommended-project-structure.png) and [here](https://cloud.google.com/ml-engine/docs/how-tos/packaging-trainer) when making your own cloud ML package. Change `setup.py` and `trainer.task.py` accordingly. 143 | - **NOTE 1:** There is, however, one small difference between this package and Cloud ML -- your model input variables are supplied by an external `YAML` configuration file instead of using `bash` commands. This approach actually makes your training easier to manage, as once you set up your gce_ml submission script, the only things you need to change are the new instance name (actually not necessary if using timestamp as instance name) and the `YAML` configuration file (for different model parameters). 
144 |    - **NOTE 2:** You can rename your package to your liking, but make sure the folder `gce_scripts/` exists and resides at the same level as your `submit_model.sh` script (again, follow the directory structure of this package).
145 |
146 | 3. Model training with **gce_ML**
147 |    - Now you have gone through all the setup steps and are finally ready to submit a model for training. Follow the example in `submit_model_gpu.sh` or `submit_model.sh` to configure your new instance.
148 |    - To submit your model to a new GCE instance, just run:
149 |
150 |          $ ./submit_model.sh
151 |
152 |    - You can monitor the instance by going to the [GCE console](https://console.cloud.google.com/compute/instances?project=) and clicking the newly created instance. Alternatively, you can follow the instructions shown in the shell prompt to stream your `syslog` to your terminal.
153 |
154 |          $ gcloud compute instances tail-serial-port-output my-instance-with-gpu --port 1
155 |
156 |    - Your instance **will automatically shut down** once training completes and can be restarted later. If needed, you can keep it alive for a given number of seconds so you can log in and debug.
157 |    - Finally, **remember to delete your instance** from either the [Cloud Console](https://console.cloud.google.com/compute/instances?project=) or the command line. It shuts down automatically but does not delete itself from the cloud. Each instance (and its associated job) is supposed to have a unique name, so you will not need it afterward (reusing it will not save resources).
158 |
159 |          $ gcloud compute instances delete my-instance
160 |
161 |    - Don't worry: your models, checkpoints, and `syslog` are all saved in your job's unique directory on GCS (again, like Cloud ML). You can find all relevant information on GCS as long as you save your model outputs there by following the example.
162 | --------------------------------------------------------------------------------