├── example ├── MXNet │ ├── documents │ │ └── paper.pdf │ ├── datasets │ │ └── manifest.json │ ├── tuxiv.conf │ └── mnist.py ├── TensorFlow │ ├── documents │ │ └── paper.pdf │ ├── datasets │ │ └── manifest.json │ ├── configurations │ │ ├── citynet.sh │ │ ├── conda.yaml │ │ └── run.slurm │ ├── run.sh │ ├── tuxiv.conf │ ├── tensorflow_on_slurm.py │ └── mnist.py ├── helloworld │ ├── main.py │ └── tuxiv.conf ├── PyTorch │ ├── tuxiv.conf │ └── mnist.py └── README.md ├── static └── workflow.png ├── .gitignore ├── docs └── user_dataset.md ├── FAQ.md ├── tuxiv.conf.md └── README.md /example/MXNet/documents/paper.pdf: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /example/MXNet/datasets/manifest.json: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /example/TensorFlow/documents/paper.pdf: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /example/TensorFlow/datasets/manifest.json: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /example/TensorFlow/configurations/citynet.sh: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /static/workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turingaicloud/quickstart/HEAD/static/workflow.png -------------------------------------------------------------------------------- /example/TensorFlow/configurations/conda.yaml: -------------------------------------------------------------------------------- 1 | name: tf 2 | channels: 3 | dependencies: 4 | - tensorflow=1.15 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | *.DS_Store 3 | .DS_Store? 4 | *.vscode 5 | *.tcloud 6 | package.json 7 | package-lock.json 8 | vendor/ 9 | pkg/ 10 | bin/* -------------------------------------------------------------------------------- /example/TensorFlow/configurations/run.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --nodes=2 3 | export TACC_WORKDIR=/mnt/sharefs/home/testuser/WORKDIR/TensorFlow 4 | export TACC_USERDIR=/mnt/sharefs/home/testuser/USERDIR 5 | srun /mnt/sharefs/home/testuser/WORKDIR/TensorFlow/run.sh 6 | -------------------------------------------------------------------------------- /example/TensorFlow/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source /mnt/sharefs/home/testuser/WORKDIR/miniconda3/etc/profile.d/conda.sh 3 | conda activate tf 4 | 5 | python ${TACC_WORKDIR}/mnist.py \ 6 | --task_index=0 \ 7 | --data_dir=${TACC_WORKDIR}/datasets/mnist_data \ 8 | --batch_size=1 \ 9 | -------------------------------------------------------------------------------- /example/helloworld/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | WORKDIR = os.environ.get('TACC_WORKDIR') 5 | USERDIR = os.environ.get('TACC_USERDIR') 6 | 7 | os.system('tree -L 2 {}'.format(USERDIR)) 8 | 9 | shutil.copytree(WORKDIR, "{}/helloworld".format(USERDIR)) 10 | print("Hello World") -------------------------------------------------------------------------------- /example/MXNet/tuxiv.conf: -------------------------------------------------------------------------------- 1 | entrypoint: 2 | - python ${TACC_WORKDIR}/mnist.py 3 | environment: 4 | name: mxnet-env 5 | dependencies: 6 | - mxnet=1.5.0 7 | job: 8 | name: test 9 | general: 10 | - nodes=2 11 | - output=${TACC_SLURM_USERLOG}/mxnet.log 12 | -------------------------------------------------------------------------------- /example/helloworld/tuxiv.conf: -------------------------------------------------------------------------------- 1 | entrypoint: 2 | - python ${TACC_WORKDIR}/main.py 3 | environment: 4 | name: hello 5 | channels: 6 | - conda-forge 7 | dependencies: 8 | - python=3.6.9 9 | - opencv 10 | job: 11 | general: 12 | - nodes=1 13 | - ntasks=1 14 | - cpus-per-task=1 15 | - output=${TACC_SLURM_USERLOG}/hello.log -------------------------------------------------------------------------------- /example/TensorFlow/tuxiv.conf: -------------------------------------------------------------------------------- 1 | entrypoint: 2 | - python ${TACC_WORKDIR}/mnist.py 3 | - --task_index=0 4 | - --data_dir=${TACC_WORKDIR}/datasets/mnist_data 5 | - --batch_size=1 6 | environment: 7 | name: tf 8 | dependencies: 9 | - tensorflow=1.15 10 | job: 11 | name: test 12 | general: 13 | - nodes=2 14 | - output=${TACC_SLURM_USERLOG}/tensorflow.log -------------------------------------------------------------------------------- /example/PyTorch/tuxiv.conf: -------------------------------------------------------------------------------- 1 | entrypoint: 2 | - CUDA_VISIBLE_DEVICES="0,1,2,3" python ${TACC_WORKDIR}/mnist.py --datasetDir=mnt/data/mnist 3 | environment: 4 | name: torch-env 5 | channels: 6 | - pytorch 7 | - nvidia 8 | dependencies: 9 | - python=3.6.9 10 | - pytorch=1.9.0 11 | - torchvision=0.10.0 12 | - tensorboard=1.15.0 13 | - cudatoolkit=11.1.74 14 | - torchaudio=0.9.0 15 | job: 16 | name: test 17 | general: 18 | - nodes=2 19 | - ntasks-per-node=2 20 | - cpus-per-task=10 21 | - gres=gpu:2 -------------------------------------------------------------------------------- /docs/user_dataset.md: -------------------------------------------------------------------------------- 1 | # Upload dataset 2 | You may upload your dataset with `tcloud upload` command: 3 | ``` 4 | tcloud upload [-c] [] 5 | ``` 6 | The `tcloud upload` command helps upload your dataset to ${TACC_USERDIR}. 7 | 8 | Note that by default tcloud incrementally upload your dataset to TACC, if you want to **delete previous version** and re-upload the dataset, you may add "-c" flag in your command. 9 | 10 | # Specify dataset in code 11 | After uploading your own dataset, you must add several codes to specify the location of the dataset. Below is an example in PyTorch: 12 | 13 | ~~~python 14 | workdir = os.environ.get('TACC_WORKDIR') 15 | userdir = os.environ.get('TACC_USERDIR') 16 | 17 | ... 18 | 19 | train_dataset = torchvision.datasets.MNIST('{}/{}'.format(userdir, ), train=True, download=False, 20 | transform=transforms.Compose([ 21 | transforms.ToTensor(), 22 | transforms.Normalize((0.1307,), (0.3081,)) 23 | ])) 24 | train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset, num_replicas=world_size, rank=rank) 25 | train_loader = torch.utils.data.DataLoader( 26 | train_dataset, 27 | batch_size=args.batch_size, 28 | sampler=train_sampler, 29 | **kwargs) 30 | ~~~ -------------------------------------------------------------------------------- /FAQ.md: -------------------------------------------------------------------------------- 1 | # FAQ 2 | 3 | ### 1. How to use tcloud? 4 | - It's easy to use. Try helloworld example first. 5 | ### 2. Can I use my own dataset? 6 | - Of course, you can upload your dataset to a specific directory via tcloud upload. 7 | ### 3. What is the current tcloud strategy? 8 | - Now, there are a total of 8 nodes in TACC, and each node has 4 GTX3090 and 40 CPU cores. 9 | ### 4. Can I ssh to the tcloud server? 10 | - All users cannot ssh to our server. If you have special needs, please feel free to contact us. 11 | ### 5. Can I keep using tacc without running an experiment? 12 | - Can not. We will regularly kill processes that occupy nodes for a long time but do not run experiments. 13 | ### 6. How to cancel a tcloud job? 14 | - You can use `tcloud cancel -j [JOBID]` command. 15 | ### 7. Why did I submit a job but there is no log? 16 | - It's hard to say. This is most likely an error generated before the job is assigned to tcloud to run. Please carefully check the tuxiv.conf file. 17 | ### 8. The dependency I need is not provided by conda. 18 | - You can wirte your download command in tuxiv.conf entrypoint with your execute command. (example will provide after test) 19 | ### 9. Why does my job submission process take a long time? 20 | - We will retrieve the dependencies section in your tuxic.conf file. If the dependencies are different from the last time, tcloud will do more process. In order to save time, please confirm the required dependencies as soon as possible. -------------------------------------------------------------------------------- /example/TensorFlow/tensorflow_on_slurm.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | 5 | import tensorflow as tf 6 | import re 7 | import os 8 | 9 | def tf_config_from_slurm(ps_number, port_number=12222): 10 | """ 11 | Creates configuration for a distributed tensorflow session 12 | from environment variables provided by the Slurm cluster 13 | management system. 14 | 15 | @param: ps_number number of parameter servers to run 16 | @param: port_number port number to be used for communication 17 | @return: a tuple containing cluster with fields cluster_spec, 18 | task_name and task_id 19 | """ 20 | 21 | nodelist = os.environ["SLURM_JOB_NODELIST"] 22 | nodename = os.environ["SLURMD_NODENAME"] 23 | nodelist = _expand_nodelist(nodelist) 24 | num_nodes = int(os.getenv("SLURM_JOB_NUM_NODES")) 25 | 26 | if len(nodelist) != num_nodes: 27 | raise ValueError("Number of slurm nodes {} not equal to {}".format(len(nodelist), num_nodes)) 28 | 29 | if nodename not in nodelist: 30 | raise ValueError("Nodename({}) not in nodelist({}). This should not happen! ".format(nodename,nodelist)) 31 | 32 | ps_nodes = [node for i, node in enumerate(nodelist) if i < ps_number] 33 | worker_nodes = [node for i, node in enumerate(nodelist) if i >= ps_number] 34 | 35 | if nodename in ps_nodes: 36 | my_job_name = "ps" 37 | my_task_index = ps_nodes.index(nodename) 38 | else: 39 | my_job_name = "worker" 40 | my_task_index = worker_nodes.index(nodename) 41 | 42 | worker_sockets = [":".join([node, str(port_number)]) for node in worker_nodes] 43 | ps_sockets = [":".join([node, str(port_number)]) for node in ps_nodes] 44 | cluster = {"worker": worker_sockets, "ps" : ps_sockets} 45 | return cluster, my_job_name, my_task_index 46 | 47 | def _pad_zeros(iterable, length): 48 | return (str(t).rjust(length, '0') for t in iterable) 49 | 50 | def _expand_ids(ids): 51 | ids = ids.split(',') 52 | result = [] 53 | for id in ids: 54 | if '-' in id: 55 | begin, end = [int(token) for token in id.split('-')] 56 | result.extend(_pad_zeros(range(begin, end+1), len(id.split('-')[0]))) 57 | # print(begin, end) 58 | else: 59 | result.append(id) 60 | # result.append(id) 61 | return result 62 | 63 | def _expand_nodelist(nodelist): 64 | prefix, ids = re.findall("(.*)\[(.*)\]", nodelist)[0] 65 | ids = _expand_ids(ids) 66 | result = [prefix + str(id) for id in ids] 67 | return result 68 | 69 | def _worker_task_id(nodelist, nodename): 70 | return nodelist.index(nodename) 71 | -------------------------------------------------------------------------------- /example/MXNet/mnist.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | # pylint: skip-file 19 | from __future__ import print_function 20 | 21 | import argparse 22 | import logging 23 | logging.basicConfig(level=logging.DEBUG) 24 | 25 | import numpy as np 26 | import mxnet as mx 27 | from mxnet import gluon, autograd 28 | from mxnet.gluon import nn 29 | 30 | # Parse CLI arguments 31 | 32 | parser = argparse.ArgumentParser(description='MXNet Gluon MNIST Example') 33 | parser.add_argument('--batch-size', type=int, default=100, 34 | help='batch size for training and testing (default: 100)') 35 | parser.add_argument('--epochs', type=int, default=10, 36 | help='number of epochs to train (default: 10)') 37 | parser.add_argument('--lr', type=float, default=0.1, 38 | help='learning rate (default: 0.1)') 39 | parser.add_argument('--momentum', type=float, default=0.9, 40 | help='SGD momentum (default: 0.9)') 41 | parser.add_argument('--cuda', action='store_true', default=False, 42 | help='Train on GPU with CUDA') 43 | parser.add_argument('--log-interval', type=int, default=100, metavar='N', 44 | help='how many batches to wait before logging training status') 45 | opt = parser.parse_args() 46 | 47 | 48 | # define network 49 | 50 | net = nn.Sequential() 51 | net.add(nn.Dense(128, activation='relu')) 52 | net.add(nn.Dense(64, activation='relu')) 53 | net.add(nn.Dense(10)) 54 | 55 | # data 56 | 57 | def transformer(data, label): 58 | data = data.reshape((-1,)).astype(np.float32)/255 59 | return data, label 60 | 61 | train_data = gluon.data.DataLoader( 62 | gluon.data.vision.MNIST('./data', train=True).transform(transformer), 63 | batch_size=opt.batch_size, shuffle=True, last_batch='discard') 64 | 65 | val_data = gluon.data.DataLoader( 66 | gluon.data.vision.MNIST('./data', train=False).transform(transformer), 67 | batch_size=opt.batch_size, shuffle=False) 68 | 69 | # train 70 | 71 | def test(ctx): 72 | metric = mx.metric.Accuracy() 73 | for data, label in val_data: 74 | data = data.as_in_context(ctx) 75 | label = label.as_in_context(ctx) 76 | output = net(data) 77 | metric.update([label], [output]) 78 | 79 | return metric.get() 80 | 81 | 82 | def train(epochs, ctx): 83 | # Collect all parameters from net and its children, then initialize them. 84 | net.initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx) 85 | # Trainer is for updating parameters with gradient. 86 | trainer = gluon.Trainer(net.collect_params(), 'sgd', 87 | {'learning_rate': opt.lr, 'momentum': opt.momentum}) 88 | metric = mx.metric.Accuracy() 89 | loss = gluon.loss.SoftmaxCrossEntropyLoss() 90 | 91 | for epoch in range(epochs): 92 | # reset data iterator and metric at begining of epoch. 93 | metric.reset() 94 | for i, (data, label) in enumerate(train_data): 95 | # Copy data to ctx if necessary 96 | data = data.as_in_context(ctx) 97 | label = label.as_in_context(ctx) 98 | # Start recording computation graph with record() section. 99 | # Recorded graphs can then be differentiated with backward. 100 | with autograd.record(): 101 | output = net(data) 102 | L = loss(output, label) 103 | L.backward() 104 | # take a gradient step with batch_size equal to data.shape[0] 105 | trainer.step(data.shape[0]) 106 | # update metric at last. 107 | metric.update([label], [output]) 108 | 109 | if i % opt.log_interval == 0 and i > 0: 110 | name, acc = metric.get() 111 | print('[Epoch %d Batch %d] Training: %s=%f'%(epoch, i, name, acc)) 112 | 113 | name, acc = metric.get() 114 | print('[Epoch %d] Training: %s=%f'%(epoch, name, acc)) 115 | 116 | name, val_acc = test(ctx) 117 | print('[Epoch %d] Validation: %s=%f'%(epoch, name, val_acc)) 118 | 119 | net.save_parameters('mnist.params') 120 | 121 | 122 | if __name__ == '__main__': 123 | if opt.cuda: 124 | ctx = mx.gpu(0) 125 | else: 126 | ctx = mx.cpu() 127 | train(opt.epochs, ctx) 128 | -------------------------------------------------------------------------------- /tuxiv.conf.md: -------------------------------------------------------------------------------- 1 | #### ENVIRONMENT VARIABLES 2 | 3 | + `TACC_WORKDIR`: TACC job workspace directory. Each job has a different workspace directory. 4 | + `TACC_USERDIR`: TACC User directory. 5 | + `TACC_SLURM_USERLOG`: Slurm log directory. The default value is `${TACC_USERDIR}/slurm_log`. 6 | - tip: These ENVIRONMENT VARIABLES can be used in `tuxiv.conf` or read in `python` code. 7 | 8 | There are four parts in `tuxiv.conf` that configure different parts of job submission. Noted that `tuxiv.conf` follows **yaml format**. 9 | 10 | #### Entrypoint 11 | 12 | In this section, you should input you shell commands to run your code line-by-line. The tcloud CLI will help run the job according to your commands. 13 | 14 | ~~~yaml 15 | entrypoint: 16 | - python ${TACC_WORKDIR}/mnist.py --epoch=3 17 | ~~~ 18 | 19 | #### Environment 20 | 21 | In this section, you can specify your software requirements, including the environment name, dependencies, source channels and so on. The tcloud CLI will help build your environment with *miniconda*. 22 | 23 | Notice: The environment name is *optional*. You can have the following two options. 24 | 1. Environment name set. 25 | 26 | In this case, tcloud will create a new environment when you change any of your dependencies configuration. 27 | ~~~yaml 28 | environment: 29 | dependencies: 30 | - pytorch=1.6.0 31 | - torchvision=0.7.0 32 | channels: pytorch 33 | ~~~ 34 | 2. Environment name unset. 35 | 36 | In this case, the environment will be persistent and tcloud will be updated the environment when you change any of your dependencies configuration (instead of creat a new environment). 37 | The environment configuration of tcloud is managed by conda, and you can follow conda to manage your environment. 38 | ~~~yaml 39 | environment: 40 | name: torch-env 41 | dependencies: 42 | - pytorch=1.6.0 43 | - torchvision=0.7.0 44 | channels: pytorch 45 | ~~~ 46 | + Check environment 47 | 48 | Check the existing environment with the `tcloud env ls` command. 49 | ``` 50 | ~ ❯ tcloud env ls 51 | # conda environments: 52 | # 53 | base * /mnt/home/username/.Miniconda3 54 | pytorch /mnt/home/username/.Miniconda3/envs/pytorch 55 | ``` 56 | Check installed dependencies in a sp environment 57 | the existing environment dependencies with the `tcloud env ls -n [ENV_NAME]` command. 58 | ``` 59 | ~ ❯ tcloud env ls -n base 60 | # packages in environment at /mnt/home/username/.Miniconda3: 61 | # 62 | # Name Version Build Channel 63 | _libgcc_mutex 0.1 main 64 | brotlipy 0.7.0 py38h27cfd23_1003 65 | ca-certificates 2020.10.14 0 66 | certifi 2020.6.20 pyhd3eb1b0_3 67 | cffi 1.14.3 py38h261ae71_2 68 | chardet 3.0.4 py38h06a4308_1003 69 | conda 4.9.2 py38h06a4308_0 70 | conda-package-handling 1.7.2 py38h03888b9_0 71 | ... 72 | ``` 73 | 74 | #### Job 75 | 76 | In this section, you can specify your configurations for cluster resources, including number of nodes, CPUs, GPUs, output file and so on. All the cluster configuration should be set in the general part. 77 | 78 | ~~~yaml 79 | job: 80 | name: test 81 | general: 82 | - nodes=2 # the number of nodes 83 | - ntasks-per-node=1 # the number of tasks per node 84 | - cpus-per-task=10 # the number of cpu per task 85 | - gres=gpu:2 # the number of gpu per node 86 | ~~~ 87 | 88 | **Note:** You can modify the output log path in Job section. For debugging purpose, we recommend you set the `output` value under `${TACC_USERDIR}` directory and check it using `tcloud ls` and `tcloud download`. 89 | 90 | #### Datasets 91 | - tcloud will help place the public datasets access in `TACC_USERDIR`. You can view the table of datasets at [Dataset Info](https://docs.google.com/spreadsheets/d/18qi2YpYvuXkWns7KY9pHYQclhS1Yyt5ysqgZ4plYcTg/edit#gid=0) or check the table below. 92 | 93 | - 94 | | | Dataset Name | 95 | | :------: | :------: | 96 | | 0 | imagenet | 97 | | 1 | mnist | 98 | | 2 | cifar-10 | 99 | | 3 | coco17 | 100 | | 4 | more datasets upon request | 101 | 102 | - to access the public dataset you need to add this command in your tuxiv.conf file: 103 | ~~~yaml 104 | datasets: 105 | - imagenet 106 | ~~~ 107 | - also use this path as a dataset directory: 108 | ~~~shell 109 | ${TACC_USERDIR}/DATASET_NAME 110 | ~~~ 111 | - User dataset: if you want to use your own dataset, you may **skip** this part and follow the [instructions](docs/user_dataset.md) to upload and use your dataset. 112 | 113 | -------------------------------------------------------------------------------- /example/TensorFlow/mnist.py: -------------------------------------------------------------------------------- 1 | import math 2 | import tensorflow as tf 3 | from tensorflow.examples.tutorials.mnist import input_data 4 | 5 | import os 6 | import shutil 7 | import sys 8 | from tensorflow_on_slurm import tf_config_from_slurm 9 | 10 | # Flags for defining the tf.train.ClusterSpec 11 | tf.app.flags.DEFINE_string("ps_hosts", "", 12 | "Comma-separated list of hostname:port pairs") 13 | tf.app.flags.DEFINE_string("worker_hosts", "", 14 | "Comma-separated list of hostname:port pairs") 15 | 16 | # Flags for defining the tf.train.Server 17 | tf.app.flags.DEFINE_string("job_name", "", "One of 'ps', 'worker'") 18 | tf.app.flags.DEFINE_integer("task_index", 0, "Index of task within the job") 19 | tf.app.flags.DEFINE_integer("hidden_units", 100, 20 | "Number of units in the hidden layer of the NN") 21 | tf.app.flags.DEFINE_string("data_dir", "/tmp/mnist-data", 22 | "Directory for storing mnist data") 23 | tf.app.flags.DEFINE_integer("batch_size", 100, "Training batch size") 24 | 25 | FLAGS = tf.app.flags.FLAGS 26 | 27 | IMAGE_PIXELS = 28 28 | 29 | def main(_): 30 | # # Create a cluster from the parameter server and worker hosts. 31 | if os.path.exists("/tmp/train_logs"): 32 | shutil.rmtree("/tmp/train_logs") 33 | cluster, my_job_name, my_task_index = tf_config_from_slurm(ps_number=1) 34 | cluster_spec = tf.train.ClusterSpec(cluster) 35 | 36 | # Create and start a server for the local task. 37 | server = tf.train.Server(cluster_spec, 38 | job_name=my_job_name, 39 | task_index=my_task_index) 40 | 41 | if my_job_name == "ps": 42 | server.join() 43 | elif my_job_name == "worker": 44 | 45 | # Assigns ops to the local worker by default. 46 | with tf.device(tf.train.replica_device_setter( 47 | worker_device="/job:worker/task:%d" % my_task_index, 48 | cluster=cluster_spec)): 49 | 50 | # Variables of the hidden layer 51 | hid_w = tf.Variable( 52 | tf.truncated_normal([IMAGE_PIXELS * IMAGE_PIXELS, FLAGS.hidden_units], 53 | stddev=1.0 / IMAGE_PIXELS), name="hid_w") 54 | hid_b = tf.Variable(tf.zeros([FLAGS.hidden_units]), name="hid_b") 55 | 56 | # Variables of the softmax layer 57 | sm_w = tf.Variable( 58 | tf.truncated_normal([FLAGS.hidden_units, 10], 59 | stddev=1.0 / math.sqrt(FLAGS.hidden_units)), 60 | name="sm_w") 61 | sm_b = tf.Variable(tf.zeros([10]), name="sm_b") 62 | 63 | x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS]) 64 | y_ = tf.placeholder(tf.float32, [None, 10]) 65 | 66 | hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b) 67 | hid = tf.nn.relu(hid_lin) 68 | 69 | y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b)) 70 | loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0))) 71 | 72 | global_step = tf.Variable(0) 73 | 74 | train_op = tf.train.AdagradOptimizer(0.01).minimize( 75 | loss, global_step=global_step) 76 | correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1)) 77 | accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) 78 | saver = tf.train.Saver() 79 | summary_op = tf.summary.merge_all() 80 | init_op = tf.initialize_all_variables() 81 | 82 | # Create a "supervisor", which oversees the training process. 83 | sv = tf.train.Supervisor(is_chief=(my_task_index == 0), 84 | logdir="/tmp/train_logs", 85 | init_op=init_op, 86 | saver=saver, 87 | summary_op=summary_op, 88 | global_step=global_step, 89 | save_model_secs=600) 90 | 91 | mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True) 92 | 93 | # The supervisor takes care of session initialization, restoring from 94 | # a checkpoint, and closing when done or an error occurs. 95 | with sv.managed_session(server.target) as sess: 96 | # Loop until the supervisor shuts down or 1000000 steps have completed. 97 | step = 0 98 | while not sv.should_stop() and step < 10000: 99 | # Run a training step asynchronously. 100 | # See `tf.train.SyncReplicasOptimizer` for additional details on how to 101 | # perform *synchronous* training. 102 | 103 | batch_xs, batch_ys = mnist.train.next_batch(FLAGS.batch_size) 104 | train_feed = {x: batch_xs, y_: batch_ys} 105 | 106 | _, step = sess.run([train_op, global_step], feed_dict=train_feed) 107 | if step % 1000 == 0: 108 | acc = sess.run([accuracy], feed_dict={x: mnist.test.images, y_: mnist.test.labels}) 109 | # Compute the accuracy of the model on test data set. 110 | print("step:" , step, "accuracy: ", acc) 111 | 112 | 113 | # Ask for all the services to stop. 114 | tf.logging.info("Training process finished") 115 | sv.stop() 116 | 117 | if __name__ == "__main__": 118 | tf.app.run() 119 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Turing AI Cloud Quick Start 2 | ## Workflow Overview 3 | 4 | ![Workflow](./static/workflow.png) 5 | 6 | The above picture illustrates the submission and debug workflows of TACC job. 7 | 8 | ## Creating a TACC account 9 | Before using tcloud SDK, please make sure that you have applied for a TACC account and submitted your public key to TACC. You may generate SSH public key according to the [steps](https://git-scm.com/book/en/v2/Git-on-the-Server-Generating-Your-SSH-Public-Key). 10 | To apply for a TACC account, please visit [our website ](https://turing.ust.hk/). 11 | 12 | ## Installing `tcloud` SDK 13 | - __Download tcloud SDK__ \ 14 | Download the latest tcloud SDK from [tags](https://github.com/turingaicloud/quickstart/tags). 15 | - __Install tcloud SDK__ \ 16 | Place `setup.sh` and `tcloud` in the same directory, and run `setup.sh`. 17 | 18 | ## Submitting Your First TACC Job 19 | ### CLI Tool Initialization 20 | + 21 | First, you need to configure your TACC credentials. You can do this by running the `tcloud config` command: 22 | ``` 23 | $ tcloud config [-u/--username] MYUSERNAME 24 | $ tcloud config [-f/--file] MYPRIVATEFILEPATH 25 | ``` 26 | + 27 | Then, run `tcloud init` command to obtain the latest cluster hardware information from TACC cluster. 28 | ``` 29 | PARTITION AVAIL TIMELIMIT NODES STATE NODELIST 30 | tacc* up infinite 5 alloc 10-0-7-[18-19],10-0-8-[18-19] 31 | tacc* up infinite 19 idle 10-0-2-[18-19],10-0-3-[10-13] 32 | ``` 33 | 34 | ### Download Sample Job 35 | You can use [this link](https://github.com/turingaicloud/quickstart/archive/refs/heads/master.zip) to download our example code. 36 | 37 | 38 | ### Submit a Job 39 | Each job requires a `main.py` with `tuxiv.conf` 40 | 41 | + 42 | main.py: Your machine learning training code. 43 | 44 | + 45 | tuxiv.conf: [Detail about tuxiv.conf](tuxiv.conf.md) 46 | 47 | 48 | After tcloud is configured correctly, you can try to submit your first job. 49 | 50 | 1. Go to the example folder in your terminal. 51 | 2. Run `tcloud submit` command. 52 | ``` 53 | ~/Dow/quickstart-master/example/helloworld ❯ tcloud submit 54 | Start parsing tuxiv.conf... 55 | building file list ... 56 | 8 files to consider 57 | helloworld/ 58 | helloworld/run.sh 59 | 151 100% 0.00kB/s 0:00:00 (xfer#1, to-check=5/8) 60 | helloworld/configurations/ 61 | helloworld/configurations/citynet.sh 62 | 12 100% 11.72kB/s 0:00:00 (xfer#2, to-check=2/8) 63 | helloworld/configurations/conda.yaml 64 | 107 100% 104.49kB/s 0:00:00 (xfer#3, to-check=1/8) 65 | helloworld/configurations/run.slurm 66 | 278 100% 271.48kB/s 0:00:00 (xfer#4, to-check=0/8) 67 | 68 | sent 429 bytes received 144 bytes 382.00 bytes/sec 69 | total size is 1071 speedup is 1.87 70 | Submitted batch job 2000 71 | Job helloworld submitted. 72 | ``` 73 | 74 | ### Retrive Your Job Status and Output 75 | In this section, we provide two methods to monitor the job log. 76 | 77 | After training, you can use `tcloud ls [filepath]` to find the output files 78 | + cat 79 | 80 | You can configure your log path in the `tuxiv.conf`. The default path is `slurm_log/slurm-jobid.out`. 81 | 82 | ``` 83 | tcloud cat slurm_log/slurm-jobid.out 84 | ``` 85 | In the helloworld example, the [tuxiv.conf](example/helloworld/tuxiv.conf) file specifies the log path as `slurm_log/hello.log` 86 | 87 | 88 | + download 89 | 90 | You can use `tcloud download [filepath]`. 91 | 92 | Note that you can only read and download files in `USERDIR`, and the files in `WORKDIR` may be removed after the job is finished. 93 | ``` 94 | tcloud download slurm_log/slurm-jobid.out 95 | ``` 96 | 97 | ### Manage your environment 98 | tcloud uses [Conda](https://docs.conda.io/projects/conda/en/latest/index.html) to manage your dependencies. All dependencies will be installed through conda. Please specify the required conda channel to meet the installation requirements. In tcloud, we offer two ways of environment management: 99 | 100 | 1. One-off Environment. A new environment with different dependencies will be created every time you submit a task to TACC. If you do not specify an environment name and your dependencies configuration does not change between two consecutive submissions in `tuxiv.conf`, we will reuse the previous environment to save time. This is the *default* behavior. 101 | ~~~yaml 102 | environment: 103 | # name: # do not specify environment name 104 | dependencies: 105 | - pytorch=1.6.0 106 | - torchvision=0.7.0 107 | channels: pytorch 108 | ~~~ 109 | 2. Persistent Environment. You can create a dedicated environment for each project. It needs to set a different environment name in `tuxiv.conf` for each project. When you change your dependencies configuration with an exist environment, tcloud will update this environment in stead of creating a new one. Learn how to do this in [tuxiv.conf documentation environment part.](tuxiv.conf.md#environment) 110 | ~~~yaml 111 | environment: 112 | name: torch-env # dedicated environment name 113 | dependencies: 114 | - pytorch=1.6.0 115 | - torchvision=0.7.0 116 | channels: pytorch 117 | ~~~ 118 | 119 | 120 | ## Demo video 121 | The following videos will help you use tcloud CLI to begin your TACC journey: [demo video](https://hkustconnect-my.sharepoint.com/:v:/g/personal/dsunak_connect_ust_hk/EUYW3f8IRwVLhBtCYP_ufs4BpQ7CaxrCUBiUexY7-nLX7w?e=O2gR2G). 122 | 123 | ## Examples 124 | Basic examples are provided under the [example](example) folder. These examples include: [HelloWorld](example/helloworld), [TensorFlow](example/TensorFlow), [PyTorch](example/PyTorch) and [MXNet](example/MXNet). 125 | 126 | ## FAQ 127 | [FAQ](FAQ.md) -------------------------------------------------------------------------------- /example/PyTorch/mnist.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import argparse 3 | import os 4 | import subprocess 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | import torch.optim as optim 9 | from torchvision import datasets, transforms 10 | from torch.utils.data import Dataset, DataLoader 11 | from torch.utils.data.distributed import DistributedSampler 12 | from torch.nn.parallel import DistributedDataParallel 13 | 14 | 15 | def dist_init(host_addr, rank, local_rank, world_size, port=23456): 16 | host_addr_full = 'tcp://' + host_addr + ':' + str(port) 17 | torch.distributed.init_process_group("gloo", init_method=host_addr_full, 18 | rank=rank, world_size=world_size) 19 | assert torch.distributed.is_initialized() 20 | 21 | class Net(nn.Module): 22 | def __init__(self): 23 | super(Net, self).__init__() 24 | self.conv1 = nn.Conv2d(1, 20, 5, 1) 25 | self.conv2 = nn.Conv2d(20, 50, 5, 1) 26 | self.fc1 = nn.Linear(4*4*50, 500) 27 | self.fc2 = nn.Linear(500, 10) 28 | 29 | def forward(self, x): 30 | x = F.relu(self.conv1(x)) 31 | x = F.max_pool2d(x, 2, 2) 32 | x = F.relu(self.conv2(x)) 33 | x = F.max_pool2d(x, 2, 2) 34 | x = x.view(-1, 4*4*50) 35 | x = F.relu(self.fc1(x)) 36 | x = self.fc2(x) 37 | return F.log_softmax(x, dim=1) 38 | 39 | def train(args, model, local_rank, train_loader, optimizer, epoch): 40 | model.train() 41 | for batch_idx, (data, target) in enumerate(train_loader): 42 | data, target = data.to(local_rank), target.to(local_rank) 43 | optimizer.zero_grad() 44 | output = model(data) 45 | loss = F.nll_loss(output, target) 46 | loss.backward() 47 | optimizer.step() 48 | if batch_idx % args.log_interval == 0: 49 | print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( 50 | epoch, batch_idx * len(data), len(train_loader.dataset), 51 | 100. * batch_idx / len(train_loader), loss.item())) 52 | 53 | def test(args, model, local_rank, test_loader, world_size): 54 | model.eval() 55 | test_loss = 0 56 | correct = 0 57 | length = len(test_loader.dataset)/world_size 58 | with torch.no_grad(): 59 | for data, target in test_loader: 60 | data, target = data.to(local_rank), target.to(local_rank) 61 | output = model(data) 62 | test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss 63 | pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability 64 | correct += pred.eq(target.view_as(pred)).sum().item() 65 | 66 | test_loss /= length 67 | 68 | print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( 69 | test_loss, correct, length, 70 | 100. * correct / length)) 71 | 72 | def main(): 73 | 74 | # Training settings 75 | parser = argparse.ArgumentParser(description='PyTorch MNIST Example') 76 | parser.add_argument('--batch-size', type=int, default=64, metavar='N', 77 | help='input batch size for training (default: 64)') 78 | parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', 79 | help='input batch size for testing (default: 1000)') 80 | parser.add_argument('--epochs', type=int, default=10, metavar='N', 81 | help='number of epochs to train (default: 10)') 82 | parser.add_argument('--lr', type=float, default=0.01, metavar='LR', 83 | help='learning rate (default: 0.01)') 84 | parser.add_argument('--momentum', type=float, default=0.5, metavar='M', 85 | help='SGD momentum (default: 0.5)') 86 | parser.add_argument('--no-cuda', action='store_true', default=False, 87 | help='disables CUDA training') 88 | parser.add_argument('--seed', type=int, default=1, metavar='S', 89 | help='random seed (default: 1)') 90 | parser.add_argument('--log-interval', type=int, default=10, metavar='N', 91 | help='how many batches to wait before logging training status') 92 | parser.add_argument('--save-model', action='store_true', default=False, 93 | help='For Saving the current Model') 94 | parser.add_argument('--datasetDir', 95 | help='Please add your dataset directory') 96 | 97 | args = parser.parse_args() 98 | 99 | # use_cuda = not args.no_cuda and torch.cuda.is_available() 100 | # torch.manual_seed(args.seed) 101 | # device = torch.device("cuda" if use_cuda else "cpu") 102 | # kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {} 103 | 104 | rank = int(os.environ['SLURM_PROCID']) 105 | local_rank = int(os.environ['SLURM_LOCALID']) 106 | world_size = int(os.environ['SLURM_NTASKS']) 107 | iplist = os.environ['SLURM_JOB_NODELIST'] 108 | ip = subprocess.getoutput(f"scontrol show hostname {iplist} | head -n1") 109 | 110 | dist_init(ip, rank, local_rank, world_size) 111 | train_dataset = datasets.MNIST(args.datasetDir, train=True, download=False, 112 | transform=transforms.Compose([ 113 | transforms.ToTensor(), 114 | transforms.Normalize((0.1307,), (0.3081,)) 115 | ])) 116 | train_sampler = DistributedSampler(train_dataset, num_replicas=world_size, rank=rank) 117 | train_loader = torch.utils.data.DataLoader( 118 | train_dataset, 119 | batch_size=args.batch_size, 120 | sampler=train_sampler 121 | ) 122 | test_dataset = datasets.MNIST(args.datasetDir, train=False, transform=transforms.Compose([ 123 | transforms.ToTensor(), 124 | transforms.Normalize((0.1307,), (0.3081,)) 125 | ])) 126 | test_sampler = DistributedSampler(test_dataset, num_replicas=world_size, rank=rank) 127 | 128 | test_loader = torch.utils.data.DataLoader( 129 | test_dataset, 130 | batch_size=args.test_batch_size, 131 | sampler=test_sampler 132 | ) 133 | 134 | model = Net().to(local_rank) 135 | model = DistributedDataParallel(model) 136 | optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum) 137 | 138 | for epoch in range(1, args.epochs + 1): 139 | train(args, model, local_rank, train_loader, optimizer, epoch) 140 | test(args, model, local_rank, test_loader, world_size) 141 | 142 | if (args.save_model): 143 | torch.save(model.state_dict(),"mnist_cnn.pt") 144 | if __name__ == "__main__": 145 | main() 146 | -------------------------------------------------------------------------------- /example/README.md: -------------------------------------------------------------------------------- 1 | # tcloud SDK Examples 2 | 3 | TACC supports multiple ML frameworks such as TensorFlow, PyTorch and MXNet. We will later support some specialized ML framework like FATE, etc. Here we list several job examples of different frameworks. 4 | 5 | ## HelloWorld 6 | 7 | + CityNet Dataset: OpenRoadMap 8 | + Task: basic usage of tcloud 9 | + Code: [main.py](helloworld/main.py) 10 | 11 | ### Getting started 12 | 13 | + Install tcloud CLI, and run `tcloud init` to pull the latest cluster configurations from remote. 14 | 15 | + Configuration 16 | 17 | + Configure user information using `tcloud config`. 18 | 19 | + TACC ENV 20 | 21 | ~~~shell 22 | TACC_WORKDIR # default repo directory 23 | TACC_USERDIR # user directory 24 | TACC_SLURM_USERLOG # slurm log directory default: ${TACC_USERDIR}/slurm_log 25 | ~~~ 26 | 27 | + TuXiv configuration 28 | 29 | ~~~yaml 30 | # tuxiv.conf 31 | entrypoint: 32 | - python ${TACC_WORKDIR}/main.py 33 | environment: 34 | name: hello 35 | dependencies: 36 | - python=3.6.9 37 | job: 38 | name: test 39 | general: 40 | - output=${TACC_SLURM_USERLOG}/hello.out 41 | - nodes=1 42 | - ntasks=1 43 | - cpus-per-task=1 44 | datasets: 45 | - OpenRoadMap 46 | ~~~ 47 | 48 | + Model code modification 49 | 50 | ~~~python 51 | import os 52 | import shutil 53 | # get variables from env 54 | WORKDIR = os.environ.get('TACC_WORKDIR') 55 | USERDIR = os.environ.get('TACC_USERDIR') 56 | # show the directory tree 57 | os.system('tree -L 2 {}'.format(USERDIR)) 58 | # basic copy operation 59 | shutil.copytree(WORKDIR, "{}/helloworld".format(USERDIR)) 60 | ~~~ 61 | 62 | ### Submit job 63 | 64 | + Enter the `helloworld` directory and follow the following steps. 65 | + Build environment and submit job: `tcloud submit` 66 | + Monitor job: `tcloud ps [-j] []` 67 | + Obtain log: `tcloud download helloworld/slurm_log/hello.out` 68 | + Cancel job: `tcloud cancel [-j] []` 69 | + View UserDir: `tcloud ls ` 70 | 71 | 72 | 73 | ## TensorFlow 74 | 75 | + Dataset: mnist 76 | + Task: image classification 77 | + Code: [mnist.py](TensorFlow/mnist.py) 78 | 79 | ### Getting started 80 | 81 | + Install tcloud CLI, and run `tcloud init` to pull cluster configurations from remote. 82 | 83 | + Configuration 84 | 85 | + Config user informations using `tcloud config`. 86 | 87 | + TACC ENV 88 | 89 | ~~~shell 90 | TACC_WORKDIR # default repo directory 91 | TACC_USERDIR # user directory 92 | TACC_SLURM_USERLOG # slurm log directory default: ${TACC_USERDIR}/slurm_log 93 | ~~~ 94 | 95 | + TuXiv configuration 96 | 97 | ~~~yaml 98 | # tuxiv.conf 99 | entrypoint: 100 | - python ${TACC_WORKDIR}/mnist.py 101 | - --task_index=0 102 | - --data_dir=${TACC_WORKDIR}/datasets/mnist_data 103 | - --batch_size=1 104 | environment: 105 | name: tf 106 | dependencies: 107 | - tensorflow=1.15 108 | job: 109 | name: mnist 110 | general: 111 | - nodes=2 112 | ~~~ 113 | 114 | + Model code modification 115 | 116 | Use ` tf.distribute.cluster_resolver.SlurmClusterResolver` instead of other resolvers. 117 | 118 | ### Training 119 | 120 | + Enter the `TensorFlow` directory and follow the following steps. 121 | + Build environment and submit job: `tcloud submit` 122 | + Monitor job: `tcloud ps [-j] []` 123 | + Cancel job: `tcloud cancel [-j] []` 124 | + View UserDir: `tcloud ls ` 125 | 126 | 127 | 128 | ## PyTorch 129 | 130 | + Dataset: mnist 131 | + Task: image classification 132 | + Code: [mnist.py](PyTorch/mnist.py) 133 | 134 | ### Getting started 135 | 136 | + Install tcloud CLI, and run `tcloud init` to pull cluster configurations from remote. 137 | 138 | + Configuration 139 | 140 | + Config user informations using `tcloud config`. 141 | 142 | + TACC ENV 143 | 144 | ~~~shell 145 | TACC_WORKDIR # default repo directory 146 | TACC_USERDIR # user directory 147 | TACC_SLURM_USERLOG # slurm log directory default: ${TACC_USERDIR}/slurm_log 148 | ~~~ 149 | 150 | + TuXiv configuration 151 | 152 | ~~~yaml 153 | # tuxiv.conf 154 | entrypoint: 155 | - python ${TACC_WORKDIR}/mnist.py --epoch=3 156 | environment: 157 | name: torch-env 158 | dependencies: 159 | - pytorch=1.6.0 160 | - torchvision=0.7.0 161 | channels: pytorch 162 | job: 163 | name: test 164 | general: 165 | - nodes=2 166 | ~~~ 167 | 168 | + Model code modification 169 | 170 | Obtain environment variables from slurm cluster, and set the parameters for initialize the cluster. 171 | 172 | ~~~python 173 | # example 174 | def dist_init(host_addr, rank, local_rank, world_size, port=23456): 175 | host_addr_full = 'tcp://' + host_addr + ':' + str(port) 176 | torch.distributed.init_process_group("gloo", init_method=host_addr_full, 177 | rank=rank, world_size=world_size) 178 | assert torch.distributed.is_initialized() 179 | 180 | def get_ip(iplist): 181 | ip = iplist.split('[')[0] + iplist.split('[')[1].split('-')[0] 182 | 183 | rank = int(os.environ['SLURM_PROCID']) 184 | local_rank = int(os.environ['SLURM_LOCALID']) 185 | world_size = int(os.environ['SLURM_NTASKS']) 186 | iplist = os.environ['SLURM_STEP_NODELIST'] 187 | ip = get_ip(iplist) # function get_ip() is depends on the format of nodelist 188 | dist_init(ip, rank, local_rank, world_size) 189 | ~~~ 190 | 191 | ### Training 192 | 193 | + Enter the `PyTorch` directory and follow the following steps. 194 | + Build environment and submit job: `tcloud submit` 195 | + Monitor job: `tcloud ps [-j] []` 196 | + Cancel job: `tcloud cancel [-j] []` 197 | + View UserDir: `tcloud ls ` 198 | 199 | 200 | 201 | ## MXNet 202 | 203 | + Dataset: mnist 204 | + Task: image classification 205 | + Code: [mnist.py](MXNet/mnist.py) 206 | 207 | ### Getting started 208 | 209 | + Install tcloud CLI, and run `tcloud init` to pull cluster configurations from remote. 210 | 211 | + Configuration 212 | 213 | + Config user informations using `tcloud config`. 214 | 215 | + TACC ENV 216 | 217 | ~~~shell 218 | TACC_WORKDIR # default repo directory 219 | TACC_USERDIR # user directory 220 | TACC_SLURM_USERLOG # slurm log directory default: ${TACC_USERDIR}/slurm_log 221 | ~~~ 222 | 223 | + TuXiv configuration 224 | 225 | ~~~yaml 226 | # tuxiv.conf 227 | entrypoint: 228 | - python ${TACC_WORKDIR}/mnist.py 229 | environment: 230 | name: mxnet-env 231 | dependencies: 232 | - mxnet=1.5.0 233 | job: 234 | name: test 235 | general: 236 | - nodes=2 237 | ~~~ 238 | 239 | + Model code modification 240 | 241 | Obtain environment variables from slurm cluster, and set the parameters for initialize the cluster. 242 | 243 | ### Training 244 | 245 | + Enter the `MXNet` directory and follow the following steps. 246 | + Build environment and submit job: `tcloud submit` 247 | + Monitor job: `tcloud ps [-j] []` 248 | + Cancel job: `tcloud cancel [-j] []` 249 | + View UserDir: `tcloud ls ` 250 | 251 | --------------------------------------------------------------------------------