├── example
    ├── MXNet
    │   ├── documents
    │   │   └── paper.pdf
    │   ├── datasets
    │   │   └── manifest.json
    │   ├── tuxiv.conf
    │   └── mnist.py
    ├── TensorFlow
    │   ├── documents
    │   │   └── paper.pdf
    │   ├── datasets
    │   │   └── manifest.json
    │   ├── configurations
    │   │   ├── citynet.sh
    │   │   ├── conda.yaml
    │   │   └── run.slurm
    │   ├── run.sh
    │   ├── tuxiv.conf
    │   ├── tensorflow_on_slurm.py
    │   └── mnist.py
    ├── helloworld
    │   ├── main.py
    │   └── tuxiv.conf
    ├── PyTorch
    │   ├── tuxiv.conf
    │   └── mnist.py
    └── README.md
├── static
    └── workflow.png
├── .gitignore
├── docs
    └── user_dataset.md
├── FAQ.md
├── tuxiv.conf.md
└── README.md


/example/MXNet/documents/paper.pdf:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/example/MXNet/datasets/manifest.json:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/example/TensorFlow/documents/paper.pdf:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/example/TensorFlow/datasets/manifest.json:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/example/TensorFlow/configurations/citynet.sh:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/static/workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/turingaicloud/quickstart/HEAD/static/workflow.png


--------------------------------------------------------------------------------
/example/TensorFlow/configurations/conda.yaml:
--------------------------------------------------------------------------------
1 | name: tf
2 | channels:
3 | dependencies:
4 |   - tensorflow=1.15
5 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.swp
 2 | *.DS_Store
 3 | .DS_Store?
 4 | *.vscode
 5 | *.tcloud
 6 | package.json
 7 | package-lock.json
 8 | vendor/
 9 | pkg/
10 | bin/*


--------------------------------------------------------------------------------
/example/TensorFlow/configurations/run.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --nodes=2
3 | export TACC_WORKDIR=/mnt/sharefs/home/testuser/WORKDIR/TensorFlow
4 | export TACC_USERDIR=/mnt/sharefs/home/testuser/USERDIR
5 | srun /mnt/sharefs/home/testuser/WORKDIR/TensorFlow/run.sh
6 | 


--------------------------------------------------------------------------------
/example/TensorFlow/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | source /mnt/sharefs/home/testuser/WORKDIR/miniconda3/etc/profile.d/conda.sh
3 | conda activate tf
4 | 
5 | python ${TACC_WORKDIR}/mnist.py \
6 | --task_index=0 \
7 | --data_dir=${TACC_WORKDIR}/datasets/mnist_data \
8 | --batch_size=1 \
9 | 


--------------------------------------------------------------------------------
/example/helloworld/main.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import shutil
 3 | 
 4 | WORKDIR = os.environ.get('TACC_WORKDIR')
 5 | USERDIR = os.environ.get('TACC_USERDIR')
 6 | 
 7 | os.system('tree -L 2 {}'.format(USERDIR))
 8 | 
 9 | shutil.copytree(WORKDIR, "{}/helloworld".format(USERDIR))
10 | print("Hello World")


--------------------------------------------------------------------------------
/example/MXNet/tuxiv.conf:
--------------------------------------------------------------------------------
 1 | entrypoint:
 2 |     - python ${TACC_WORKDIR}/mnist.py
 3 | environment:
 4 |     name: mxnet-env 
 5 |     dependencies:
 6 |         - mxnet=1.5.0
 7 | job:
 8 |     name: test
 9 |     general:
10 |         - nodes=2
11 |         - output=${TACC_SLURM_USERLOG}/mxnet.log
12 | 


--------------------------------------------------------------------------------
/example/helloworld/tuxiv.conf:
--------------------------------------------------------------------------------
 1 | entrypoint:
 2 |   - python ${TACC_WORKDIR}/main.py
 3 | environment:
 4 |     name: hello
 5 |     channels:
 6 |       - conda-forge
 7 |     dependencies:
 8 |       - python=3.6.9
 9 |       - opencv
10 | job:
11 |     general:
12 |       - nodes=1
13 |       - ntasks=1
14 |       - cpus-per-task=1
15 |       - output=${TACC_SLURM_USERLOG}/hello.log


--------------------------------------------------------------------------------
/example/TensorFlow/tuxiv.conf:
--------------------------------------------------------------------------------
 1 | entrypoint:
 2 |     - python ${TACC_WORKDIR}/mnist.py
 3 |     - --task_index=0
 4 |     - --data_dir=${TACC_WORKDIR}/datasets/mnist_data
 5 |     - --batch_size=1
 6 | environment:
 7 |     name: tf
 8 |     dependencies:
 9 |         - tensorflow=1.15
10 | job:
11 |     name: test
12 |     general:
13 |         - nodes=2
14 |         - output=${TACC_SLURM_USERLOG}/tensorflow.log


--------------------------------------------------------------------------------
/example/PyTorch/tuxiv.conf:
--------------------------------------------------------------------------------
 1 | entrypoint:
 2 |   - CUDA_VISIBLE_DEVICES="0,1,2,3" python ${TACC_WORKDIR}/mnist.py --datasetDir=mnt/data/mnist
 3 | environment:
 4 |     name: torch-env
 5 |     channels: 
 6 |       - pytorch
 7 |       - nvidia
 8 |     dependencies:
 9 |       - python=3.6.9
10 |       - pytorch=1.9.0
11 |       - torchvision=0.10.0
12 |       - tensorboard=1.15.0
13 |       - cudatoolkit=11.1.74
14 |       - torchaudio=0.9.0
15 | job:
16 |     name: test
17 |     general:
18 |       - nodes=2
19 |       - ntasks-per-node=2
20 |       - cpus-per-task=10
21 |       - gres=gpu:2


--------------------------------------------------------------------------------
/docs/user_dataset.md:
--------------------------------------------------------------------------------
 1 | # Upload dataset
 2 | You may upload your dataset with `tcloud upload` command:
 3 | ```
 4 | tcloud upload [-c] <local_dirpath> [<remote_dirpath>]
 5 | ```
 6 | The `tcloud upload` command helps upload your dataset to ${TACC_USERDIR}. 
 7 | 
 8 | Note that by default tcloud uploads your dataset to TACC incrementally. If you want to **delete the previous version** and re-upload the dataset, add the `-c` flag to your command.
 9 | 
10 | # Specify dataset in code
11 | After uploading your own dataset, you must add a few lines of code to specify the location of the dataset. Below is an example in PyTorch:
12 | 
13 | ~~~python
14 | workdir = os.environ.get('TACC_WORKDIR')
15 | userdir = os.environ.get('TACC_USERDIR')
16 | 
17 | ...
18 | 
19 | train_dataset = torchvision.datasets.MNIST('{}/{}'.format(userdir, <YOUR_DATASET_PATH>), train=True, download=False,
20 |                        transform=transforms.Compose([
21 |                            transforms.ToTensor(),
22 |                            transforms.Normalize((0.1307,), (0.3081,))
23 |                        ]))
24 | train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset, num_replicas=world_size, rank=rank)
25 | train_loader = torch.utils.data.DataLoader(
26 |     train_dataset,
27 |     batch_size=args.batch_size,
28 |     sampler=train_sampler,
29 |     **kwargs)
30 | ~~~


--------------------------------------------------------------------------------
/FAQ.md:
--------------------------------------------------------------------------------
 1 | # FAQ
 2 | 
 3 | ### 1. How to use tcloud?
 4 | - It's easy to use. Try helloworld example first.
 5 | ### 2. Can I use my own dataset?
 6 | - Of course, you can upload your dataset to a specific directory via tcloud upload.
 7 | ### 3. What is the current tcloud strategy?
 8 | - Currently, there are a total of 8 nodes in TACC, and each node has 4 RTX 3090 GPUs and 40 CPU cores.
 9 | ### 4. Can I ssh to the tcloud server?
10 | - All users cannot ssh to our server. If you have special needs, please feel free to contact us.
11 | ### 5. Can I keep using tacc without running an experiment?
12 | - No. We regularly kill processes that occupy nodes for a long time without running experiments.
13 | ### 6. How to cancel a tcloud job?
14 | - You can use `tcloud cancel -j [JOBID]` command.
15 | ### 7. Why did I submit a job but there is no log?
16 | - It's hard to say. This is most likely an error generated before the job is assigned to tcloud to run. Please carefully check the tuxiv.conf file.
17 | ### 8. The dependency I need is not provided by conda.
18 | - You can write your download/install command in the `entrypoint` section of tuxiv.conf, alongside the command that runs your job. (An example will be provided after testing.)
19 | ### 9. Why does my job submission process take a long time?
20 | - We read the dependencies section of your tuxiv.conf file. If the dependencies differ from those of your previous submission, tcloud has to do extra work to prepare the environment. To save time, please settle on the required dependencies as early as possible.


--------------------------------------------------------------------------------
/example/TensorFlow/tensorflow_on_slurm.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | from __future__ import absolute_import
 3 | from __future__ import division
 4 | 
 5 | import tensorflow as tf
 6 | import re
 7 | import os
 8 | 
 9 | def tf_config_from_slurm(ps_number, port_number=12222):
10 |     """
11 |     Creates configuration for a distributed tensorflow session 
12 |     from environment variables  provided by the Slurm cluster
13 |     management system.
14 |     
15 |     @param: ps_number number of parameter servers to run
16 |     @param: port_number port number to be used for communication
17 |     @return: a tuple containing cluster with fields cluster_spec,
18 |              task_name and task_id 
19 |     """
20 |     
21 |     nodelist = os.environ["SLURM_JOB_NODELIST"]  # raises KeyError when the variable is not set
22 |     nodename = os.environ["SLURMD_NODENAME"]  # name of the node this process runs on
23 |     nodelist = _expand_nodelist(nodelist)  # "prefix[ids]" string -> explicit list of node names
24 |     num_nodes = int(os.getenv("SLURM_JOB_NUM_NODES"))  # NOTE(review): os.getenv returns None when unset, so int() raises TypeError
25 |     
26 |     if len(nodelist) != num_nodes:  # sanity check: expanded list must match the node count reported by Slurm
27 |         raise ValueError("Number of slurm nodes {} not equal to {}".format(len(nodelist), num_nodes))
28 |     
29 |     if nodename not in nodelist:
30 |         raise ValueError("Nodename({}) not in nodelist({}). This should not happen! ".format(nodename,nodelist))
31 |     
32 |     ps_nodes = [node for i, node in enumerate(nodelist) if i < ps_number]  # the first ps_number nodes are parameter servers
33 |     worker_nodes = [node for i, node in enumerate(nodelist) if i >= ps_number]  # every remaining node is a worker
34 |     
35 |     if nodename in ps_nodes:
36 |         my_job_name = "ps"
37 |         my_task_index = ps_nodes.index(nodename)  # task index = position within the node's own role list
38 |     else:
39 |         my_job_name = "worker"
40 |         my_task_index = worker_nodes.index(nodename)
41 |     
42 |     worker_sockets = [":".join([node, str(port_number)]) for node in worker_nodes]  # "node:port" strings
43 |     ps_sockets = [":".join([node, str(port_number)]) for node in ps_nodes]
44 |     cluster = {"worker": worker_sockets, "ps" : ps_sockets}  # dict with one address list per job name
45 |     return cluster, my_job_name, my_task_index
46 | 
47 | def _pad_zeros(iterable, length):  # lazily yield each item as a string left-padded with '0' to `length` characters
48 |     return (str(t).rjust(length, '0') for t in iterable)
49 |     
50 | def _expand_ids(ids):  # expand a comma-separated id spec, e.g. "01-03,07" -> ['01', '02', '03', '07']
51 |     ids = ids.split(',')
52 |     result = []
53 |     for id in ids:
54 |         if '-' in id:
55 |             begin, end = [int(token) for token in id.split('-')]  # "begin-end" is an inclusive numeric range
56 |             result.extend(_pad_zeros(range(begin, end+1), len(id.split('-')[0])))  # pad to the width of the range's first token
57 |             # print(begin, end)
58 |         else:
59 |             result.append(id)  # single id: kept verbatim
60 |         # result.append(id)
61 |     return result
62 | 
63 | def _expand_nodelist(nodelist):  # turn a "prefix[ids]" node list string into a list of full node names
64 |     prefix, ids = re.findall("(.*)\[(.*)\]", nodelist)[0]  # NOTE(review): IndexError if there is no "[...]" part; the greedy pattern only handles one bracket group
65 |     ids = _expand_ids(ids)
66 |     result = [prefix + str(id) for id in ids]  # re-attach the shared prefix to every expanded id
67 |     return result
68 | 
69 | def _worker_task_id(nodelist, nodename):  # position of nodename within nodelist
70 |     return nodelist.index(nodename)  # list.index raises ValueError when nodename is absent
71 | 


--------------------------------------------------------------------------------
/example/MXNet/mnist.py:
--------------------------------------------------------------------------------
  1 | # Licensed to the Apache Software Foundation (ASF) under one
  2 | # or more contributor license agreements.  See the NOTICE file
  3 | # distributed with this work for additional information
  4 | # regarding copyright ownership.  The ASF licenses this file
  5 | # to you under the Apache License, Version 2.0 (the
  6 | # "License"); you may not use this file except in compliance
  7 | # with the License.  You may obtain a copy of the License at
  8 | #
  9 | #   http://www.apache.org/licenses/LICENSE-2.0
 10 | #
 11 | # Unless required by applicable law or agreed to in writing,
 12 | # software distributed under the License is distributed on an
 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 14 | # KIND, either express or implied.  See the License for the
 15 | # specific language governing permissions and limitations
 16 | # under the License.
 17 | 
 18 | # pylint: skip-file
 19 | from __future__ import print_function
 20 | 
 21 | import argparse
 22 | import logging
 23 | logging.basicConfig(level=logging.DEBUG)
 24 | 
 25 | import numpy as np
 26 | import mxnet as mx
 27 | from mxnet import gluon, autograd
 28 | from mxnet.gluon import nn
 29 | 
 30 | # Parse CLI arguments
 31 | 
 32 | parser = argparse.ArgumentParser(description='MXNet Gluon MNIST Example')
 33 | parser.add_argument('--batch-size', type=int, default=100,
 34 |                     help='batch size for training and testing (default: 100)')
 35 | parser.add_argument('--epochs', type=int, default=10,
 36 |                     help='number of epochs to train (default: 10)')
 37 | parser.add_argument('--lr', type=float, default=0.1,
 38 |                     help='learning rate (default: 0.1)')
 39 | parser.add_argument('--momentum', type=float, default=0.9,
 40 |                     help='SGD momentum (default: 0.9)')
 41 | parser.add_argument('--cuda', action='store_true', default=False,
 42 |                     help='Train on GPU with CUDA')
 43 | parser.add_argument('--log-interval', type=int, default=100, metavar='N',
 44 |                     help='how many batches to wait before logging training status')
 45 | opt = parser.parse_args()
 46 | 
 47 | 
 48 | # define network
 49 | 
 50 | net = nn.Sequential()
 51 | net.add(nn.Dense(128, activation='relu'))
 52 | net.add(nn.Dense(64, activation='relu'))
 53 | net.add(nn.Dense(10))
 54 | 
 55 | # data
 56 | 
 57 | def transformer(data, label):  # per-sample transform applied to both MNIST splits below
 58 |     data = data.reshape((-1,)).astype(np.float32)/255  # flatten to 1-D, cast to float32 and divide by 255
 59 |     return data, label  # label is passed through unchanged
 60 | 
 61 | train_data = gluon.data.DataLoader(
 62 |     gluon.data.vision.MNIST('./data', train=True).transform(transformer),
 63 |     batch_size=opt.batch_size, shuffle=True, last_batch='discard')
 64 | 
 65 | val_data = gluon.data.DataLoader(
 66 |     gluon.data.vision.MNIST('./data', train=False).transform(transformer),
 67 |     batch_size=opt.batch_size, shuffle=False)
 68 | 
 69 | # train
 70 | 
 71 | def test(ctx):  # evaluate the module-level `net` on `val_data`; returns the result of metric.get()
 72 |     metric = mx.metric.Accuracy()
 73 |     for data, label in val_data:
 74 |         data = data.as_in_context(ctx)  # copy the batch to the target context if necessary
 75 |         label = label.as_in_context(ctx)
 76 |         output = net(data)
 77 |         metric.update([label], [output])  # accumulate accuracy over all validation batches
 78 | 
 79 |     return metric.get()
 80 | 
 81 | 
 82 | def train(epochs, ctx):  # train the module-level `net` for `epochs` epochs on context `ctx`, then save its parameters
 83 |     # Collect all parameters from net and its children, then initialize them.
 84 |     net.initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx)
 85 |     # Trainer is for updating parameters with gradient.
 86 |     trainer = gluon.Trainer(net.collect_params(), 'sgd',
 87 |                             {'learning_rate': opt.lr, 'momentum': opt.momentum})
 88 |     metric = mx.metric.Accuracy()
 89 |     loss = gluon.loss.SoftmaxCrossEntropyLoss()
 90 | 
 91 |     for epoch in range(epochs):
 92 |         # reset the metric at the beginning of each epoch.
 93 |         metric.reset()
 94 |         for i, (data, label) in enumerate(train_data):
 95 |             # Copy data to ctx if necessary
 96 |             data = data.as_in_context(ctx)
 97 |             label = label.as_in_context(ctx)
 98 |             # Start recording computation graph with record() section.
 99 |             # Recorded graphs can then be differentiated with backward.
100 |             with autograd.record():
101 |                 output = net(data)
102 |                 L = loss(output, label)
103 |                 L.backward()
104 |             # take a gradient step with batch_size equal to data.shape[0]
105 |             trainer.step(data.shape[0])
106 |             # update metric at last.
107 |             metric.update([label], [output])
108 | 
109 |             if i % opt.log_interval == 0 and i > 0:  # periodic progress log; batch 0 is skipped
110 |                 name, acc = metric.get()
111 |                 print('[Epoch %d Batch %d] Training: %s=%f'%(epoch, i, name, acc))
112 | 
113 |         name, acc = metric.get()
114 |         print('[Epoch %d] Training: %s=%f'%(epoch, name, acc))
115 | 
116 |         name, val_acc = test(ctx)  # validation accuracy after every epoch
117 |         print('[Epoch %d] Validation: %s=%f'%(epoch, name, val_acc))
118 | 
119 |     net.save_parameters('mnist.params')  # written relative to the current working directory
120 | 
121 | 
122 | if __name__ == '__main__':
123 |     if opt.cuda:
124 |         ctx = mx.gpu(0)
125 |     else:
126 |         ctx = mx.cpu()
127 |     train(opt.epochs, ctx)
128 | 


--------------------------------------------------------------------------------
/tuxiv.conf.md:
--------------------------------------------------------------------------------
  1 | #### ENVIRONMENT VARIABLES
  2 | 
  3 | + `TACC_WORKDIR`: TACC job workspace directory. Each job has a different workspace directory.
  4 | + `TACC_USERDIR`: TACC User directory.
  5 | + `TACC_SLURM_USERLOG`: Slurm log directory. The default value is `${TACC_USERDIR}/slurm_log`.
  6 | - tip: These ENVIRONMENT VARIABLES can be used in `tuxiv.conf` or read in `python` code.
  7 | 
  8 | There are four parts in `tuxiv.conf`, each configuring a different aspect of job submission. Note that `tuxiv.conf` follows the **YAML format**.
  9 | 
 10 | #### Entrypoint
 11 | 
 12 |   In this section, you should enter the shell commands that run your code, one per line. The tcloud CLI will run the job according to your commands.
 13 | 
 14 |   ~~~yaml
 15 |   entrypoint:
 16 |       - python ${TACC_WORKDIR}/mnist.py --epoch=3 
 17 |   ~~~
 18 | 
 19 | #### Environment
 20 | 
 21 |   In this section, you can specify your software requirements, including the environment name, dependencies, source channels and so on. The tcloud CLI will help build your environment with *miniconda*.
 22 | 
 23 |   Notice: The environment name is *optional*. You can have the following two options.
 24 |   1. Environment name unset. 
 25 |       
 26 |       In this case, tcloud will create a new environment when you change any of your dependencies configuration.
 27 |       ~~~yaml
 28 |       environment:
 29 |         dependencies:
 30 |             - pytorch=1.6.0
 31 |             - torchvision=0.7.0
 32 |         channels: pytorch
 33 |       ~~~
 34 |   2. Environment name set. 
 35 | 
 36 |       In this case, the environment will be persistent, and tcloud will update the environment when you change any of your dependencies configuration (instead of creating a new environment).
 37 |       The environment configuration of tcloud is managed by conda, and you can follow conda to manage your environment.
 38 |       ~~~yaml
 39 |       environment:
 40 |           name: torch-env
 41 |           dependencies:
 42 |               - pytorch=1.6.0
 43 |               - torchvision=0.7.0
 44 |           channels: pytorch
 45 |       ~~~
 46 |   + Check environment
 47 | 
 48 |     Check the existing environment with the `tcloud env ls` command.
 49 |     ```
 50 |     ~ ❯ tcloud env ls
 51 |     # conda environments:
 52 |     #
 53 |     base                  *  /mnt/home/username/.Miniconda3
 54 |     pytorch                  /mnt/home/username/.Miniconda3/envs/pytorch
 55 |     ```
 56 |     Check the dependencies installed in a specific environment
 57 |     with the `tcloud env ls -n [ENV_NAME]` command.
 58 |     ```
 59 |     ~ ❯ tcloud env ls -n base                                                                        
 60 |     # packages in environment at /mnt/home/username/.Miniconda3:
 61 |     #
 62 |     # Name                    Version                   Build  Channel
 63 |     _libgcc_mutex             0.1                        main
 64 |     brotlipy                  0.7.0           py38h27cfd23_1003
 65 |     ca-certificates           2020.10.14                    0
 66 |     certifi                   2020.6.20          pyhd3eb1b0_3
 67 |     cffi                      1.14.3           py38h261ae71_2
 68 |     chardet                   3.0.4           py38h06a4308_1003
 69 |     conda                     4.9.2            py38h06a4308_0
 70 |     conda-package-handling    1.7.2            py38h03888b9_0
 71 |     ...
 72 |     ```
 73 | 
 74 | #### Job
 75 | 
 76 |   In this section, you can specify your configurations for cluster resources, including number of nodes, CPUs, GPUs, output file and so on. All the cluster configuration should be set in the general part.
 77 | 
 78 |   ~~~yaml
 79 |   job:
 80 |     name: test
 81 |     general:
 82 |       - nodes=2                # the number of nodes
 83 |       - ntasks-per-node=1      # the number of tasks per node
 84 |       - cpus-per-task=10       # the number of cpu per task
 85 |       - gres=gpu:2             # the number of gpu per node
 86 |   ~~~
 87 | 
 88 |   **Note:** You can modify the output log path in Job section. For debugging purpose, we recommend you set the `output` value under `${TACC_USERDIR}` directory and check it using `tcloud ls` and `tcloud download`.
 89 | 
 90 | #### Datasets
 91 |   - tcloud makes the public datasets accessible under `TACC_USERDIR`. You can view the list of datasets at [Dataset Info](https://docs.google.com/spreadsheets/d/18qi2YpYvuXkWns7KY9pHYQclhS1Yyt5ysqgZ4plYcTg/edit#gid=0) or check the table below.
 92 | 
 93 |       - 
 94 |         |  | Dataset Name |
 95 |         | :------: | :------: |
 96 |         | 0 | imagenet |
 97 |         | 1 | mnist |
 98 |         | 2 | cifar-10 |
 99 |         | 3 | coco17 |
100 |         | 4 | more datasets upon request |
101 | 
102 |     - to access the public dataset you need to add this command in your tuxiv.conf file:
103 |       ~~~yaml
104 |       datasets:
105 |         - imagenet
106 |       ~~~
107 |     - also use this path as a dataset directory:
108 |       ~~~shell
109 |       ${TACC_USERDIR}/DATASET_NAME
110 |       ~~~
111 |   - User dataset: if you want to use your own dataset, you may **skip** this part and follow the [instructions](docs/user_dataset.md) to upload and use your dataset.
112 | 
113 |   


--------------------------------------------------------------------------------
/example/TensorFlow/mnist.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import tensorflow as tf
  3 | from tensorflow.examples.tutorials.mnist import input_data
  4 | 
  5 | import os
  6 | import shutil
  7 | import sys
  8 | from tensorflow_on_slurm import tf_config_from_slurm
  9 | 
 10 | # Flags for defining the tf.train.ClusterSpec
 11 | tf.app.flags.DEFINE_string("ps_hosts", "",
 12 |                            "Comma-separated list of hostname:port pairs")
 13 | tf.app.flags.DEFINE_string("worker_hosts", "",
 14 |                            "Comma-separated list of hostname:port pairs")
 15 | 
 16 | # Flags for defining the tf.train.Server
 17 | tf.app.flags.DEFINE_string("job_name", "", "One of 'ps', 'worker'")
 18 | tf.app.flags.DEFINE_integer("task_index", 0, "Index of task within the job")
 19 | tf.app.flags.DEFINE_integer("hidden_units", 100,
 20 |                             "Number of units in the hidden layer of the NN")
 21 | tf.app.flags.DEFINE_string("data_dir", "/tmp/mnist-data",
 22 |                            "Directory for storing mnist data")
 23 | tf.app.flags.DEFINE_integer("batch_size", 100, "Training batch size")
 24 | 
 25 | FLAGS = tf.app.flags.FLAGS
 26 | 
 27 | IMAGE_PIXELS = 28
 28 | 
 29 | def main(_):
 30 |   # Remove stale training logs, then build the cluster description from Slurm environment variables.
 31 |   if os.path.exists("/tmp/train_logs"):
 32 |     shutil.rmtree("/tmp/train_logs")
 33 |   cluster, my_job_name, my_task_index = tf_config_from_slurm(ps_number=1)
 34 |   cluster_spec = tf.train.ClusterSpec(cluster)
 35 | 
 36 |   # Create and start a server for the local task.
 37 |   server = tf.train.Server(cluster_spec,
 38 |                            job_name=my_job_name,
 39 |                            task_index=my_task_index)
 40 | 
 41 |   if my_job_name == "ps":
 42 |     server.join()
 43 |   elif my_job_name == "worker":
 44 | 
 45 |     # Assigns ops to the local worker by default.
 46 |     with tf.device(tf.train.replica_device_setter(
 47 |         worker_device="/job:worker/task:%d" % my_task_index,
 48 |         cluster=cluster_spec)):
 49 | 
 50 |       # Variables of the hidden layer
 51 |       hid_w = tf.Variable(
 52 |           tf.truncated_normal([IMAGE_PIXELS * IMAGE_PIXELS, FLAGS.hidden_units],
 53 |                               stddev=1.0 / IMAGE_PIXELS), name="hid_w")
 54 |       hid_b = tf.Variable(tf.zeros([FLAGS.hidden_units]), name="hid_b")
 55 | 
 56 |       # Variables of the softmax layer
 57 |       sm_w = tf.Variable(
 58 |           tf.truncated_normal([FLAGS.hidden_units, 10],
 59 |                               stddev=1.0 / math.sqrt(FLAGS.hidden_units)),
 60 |           name="sm_w")
 61 |       sm_b = tf.Variable(tf.zeros([10]), name="sm_b")
 62 | 
 63 |       x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS])
 64 |       y_ = tf.placeholder(tf.float32, [None, 10])
 65 | 
 66 |       hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
 67 |       hid = tf.nn.relu(hid_lin)
 68 | 
 69 |       y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))
 70 |       loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))  # clip so tf.log never receives 0
 71 | 
 72 |       global_step = tf.Variable(0)
 73 | 
 74 |       train_op = tf.train.AdagradOptimizer(0.01).minimize(
 75 |           loss, global_step=global_step)
 76 |       correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
 77 |       accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
 78 |       saver = tf.train.Saver()
 79 |       summary_op = tf.summary.merge_all()
 80 |       init_op = tf.initialize_all_variables()
 81 | 
 82 |     # Create a "supervisor", which oversees the training process.
 83 |     sv = tf.train.Supervisor(is_chief=(my_task_index == 0),
 84 |                              logdir="/tmp/train_logs",
 85 |                              init_op=init_op,
 86 |                              saver=saver,
 87 |                              summary_op=summary_op,
 88 |                              global_step=global_step,
 89 |                              save_model_secs=600)
 90 | 
 91 |     mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)
 92 | 
 93 |     # The supervisor takes care of session initialization, restoring from
 94 |     # a checkpoint, and closing when done or an error occurs.
 95 |     with sv.managed_session(server.target) as sess:
 96 |       # Loop until the supervisor shuts down or 10000 steps have completed.
 97 |       step = 0
 98 |       while not sv.should_stop() and step < 10000:
 99 |         # Run a training step asynchronously.
100 |         # See `tf.train.SyncReplicasOptimizer` for additional details on how to
101 |         # perform *synchronous* training.
102 | 
103 |         batch_xs, batch_ys = mnist.train.next_batch(FLAGS.batch_size)
104 |         train_feed = {x: batch_xs, y_: batch_ys}
105 | 
106 |         _, step = sess.run([train_op, global_step], feed_dict=train_feed)  # step is overwritten with the fetched global_step value
107 |         if step % 1000 == 0:
108 |             acc = sess.run([accuracy], feed_dict={x: mnist.test.images, y_: mnist.test.labels})
109 |             # Compute the accuracy of the model on test data set.
110 |             print("step:" , step,  "accuracy: ",  acc)
111 | 
112 | 
113 |     # Ask for all the services to stop.
114 |     tf.logging.info("Training process finished")
115 |     sv.stop()
116 | 
117 | if __name__ == "__main__":
118 |   tf.app.run()
119 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Turing AI Cloud Quick Start
  2 | ## Workflow Overview
  3 | 
  4 | ![Workflow](./static/workflow.png)
  5 | 
  6 | The above picture illustrates the submission and debug workflows of TACC job.
  7 | 
  8 | ## Creating a TACC account
  9 | Before using tcloud SDK, please make sure that you have applied for a TACC account and submitted your public key to TACC. You may generate SSH public key according to the [steps](https://git-scm.com/book/en/v2/Git-on-the-Server-Generating-Your-SSH-Public-Key).
 10 | To apply for a TACC account, please visit [our website](https://turing.ust.hk/).
 11 | 
 12 | ## Installing `tcloud` SDK
 13 | - __Download tcloud SDK__ \
 14 | Download the latest tcloud SDK from [tags](https://github.com/turingaicloud/quickstart/tags).
 15 | - __Install tcloud SDK__ \
 16 | Place `setup.sh` and `tcloud` in the same directory, and run `setup.sh`.
 17 | 
 18 | ## Submitting Your First TACC Job
 19 | ### CLI Tool Initialization
 20 | + 
 21 |   First, you need to configure your TACC credentials. You can do this by running the `tcloud config` command:
 22 |   ```
 23 |   $ tcloud config [-u/--username] MYUSERNAME
 24 |   $ tcloud config [-f/--file] MYPRIVATEFILEPATH
 25 |   ```
 26 | + 
 27 |   Then, run `tcloud init` command to obtain the latest cluster hardware information from TACC cluster.
 28 |   ```
 29 |   PARTITION AVAIL  TIMELIMIT  NODES  STATE NODELIST
 30 |   tacc*        up   infinite      5  alloc 10-0-7-[18-19],10-0-8-[18-19]
 31 |   tacc*        up   infinite     19   idle 10-0-2-[18-19],10-0-3-[10-13]
 32 |   ```
 33 | 
 34 | ### Download Sample Job
 35 | You can use [this link](https://github.com/turingaicloud/quickstart/archive/refs/heads/master.zip) to download our example code.
 36 | 
 37 | 
 38 | ### Submit a Job
 39 | Each job requires a `main.py` with `tuxiv.conf`
 40 | 
 41 | +
 42 |   main.py: Your machine learning training code.
 43 | 
 44 | +
 45 |   tuxiv.conf: [Detail about tuxiv.conf](tuxiv.conf.md)
 46 | 
 47 |   
 48 | After tcloud is configured correctly, you can try to submit your first job. 
 49 | 
 50 | 1. Go to the example folder in your terminal.
 51 | 2. Run `tcloud submit` command.
 52 |     ```
 53 |     ~/Dow/quickstart-master/example/helloworld ❯ tcloud submit
 54 |     Start parsing tuxiv.conf...
 55 |     building file list ...
 56 |     8 files to consider
 57 |     helloworld/
 58 |     helloworld/run.sh
 59 |             151 100%    0.00kB/s    0:00:00 (xfer#1, to-check=5/8)
 60 |     helloworld/configurations/
 61 |     helloworld/configurations/citynet.sh
 62 |               12 100%   11.72kB/s    0:00:00 (xfer#2, to-check=2/8)
 63 |     helloworld/configurations/conda.yaml
 64 |             107 100%  104.49kB/s    0:00:00 (xfer#3, to-check=1/8)
 65 |     helloworld/configurations/run.slurm
 66 |             278 100%  271.48kB/s    0:00:00 (xfer#4, to-check=0/8)
 67 | 
 68 |     sent 429 bytes  received 144 bytes  382.00 bytes/sec
 69 |     total size is 1071  speedup is 1.87
 70 |     Submitted batch job 2000
 71 |     Job helloworld submitted.
 72 |     ```
 73 | 
 74 | ### Retrieve Your Job Status and Output
 75 | In this section, we provide two methods to monitor the job log.
 76 | 
 77 | After training, you can use `tcloud ls [filepath]` to find the output files
 78 | + cat
 79 | 
 80 |   You can configure your log path in the `tuxiv.conf`. The default path is `slurm_log/slurm-jobid.out`.
 81 | 
 82 |   ```
 83 |   tcloud cat slurm_log/slurm-jobid.out
 84 |   ```
 85 |   In the helloworld example, the [tuxiv.conf](example/helloworld/tuxiv.conf) file specifies the log path as `slurm_log/hello.log`
 86 | 
 87 | 
 88 | + download
 89 | 
 90 |   You can use `tcloud download [filepath]`. 
 91 |   
 92 |   Note that you can only read and download files in `USERDIR`, and the files in `WORKDIR` may be removed after the job is finished.
 93 |   ```
 94 |   tcloud download slurm_log/slurm-jobid.out
 95 |   ```
 96 | 
 97 | ### Manage your environment
 98 | tcloud uses [Conda](https://docs.conda.io/projects/conda/en/latest/index.html) to manage your dependencies. All dependencies will be installed through conda. Please specify the required conda channel to meet the installation requirements. In tcloud, we offer two ways of environment management:
 99 | 
100 | 1. One-off Environment. A new environment with different dependencies will be created every time you submit a task to TACC. If you do not specify an environment name and your dependencies configuration does not change between two consecutive submissions in `tuxiv.conf`, we will reuse the previous environment to save time. This is the *default* behavior.
101 |     ~~~yaml
102 |     environment:
103 |       # name:       # do not specify environment name
104 |       dependencies:
105 |           - pytorch=1.6.0
106 |           - torchvision=0.7.0
107 |       channels: pytorch
108 |     ~~~
109 | 2. Persistent Environment. You can create a dedicated environment for each project by setting a different environment name in `tuxiv.conf` for each project. When you change the dependency configuration of an existing environment, tcloud will update that environment instead of creating a new one. Learn how to do this in the [environment section of the tuxiv.conf documentation](tuxiv.conf.md#environment).
110 |     ~~~yaml
111 |     environment:
112 |       name: torch-env   # dedicated environment name
113 |       dependencies:
114 |           - pytorch=1.6.0
115 |           - torchvision=0.7.0
116 |       channels: pytorch
117 |     ~~~
118 | 
119 | 
120 | ## Demo video
121 | The following video will help you use the tcloud CLI to begin your TACC journey: [demo video](https://hkustconnect-my.sharepoint.com/:v:/g/personal/dsunak_connect_ust_hk/EUYW3f8IRwVLhBtCYP_ufs4BpQ7CaxrCUBiUexY7-nLX7w?e=O2gR2G).
122 | 
123 | ## Examples
124 | Basic examples are provided under the [example](example) folder. These examples include: [HelloWorld](example/helloworld), [TensorFlow](example/TensorFlow), [PyTorch](example/PyTorch) and [MXNet](example/MXNet).
125 | 
126 | ## FAQ
127 | [FAQ](FAQ.md)


--------------------------------------------------------------------------------
/example/PyTorch/mnist.py:
--------------------------------------------------------------------------------
  1 | from __future__ import print_function
  2 | import argparse
  3 | import os
  4 | import subprocess
  5 | import torch
  6 | import torch.nn as nn
  7 | import torch.nn.functional as F
  8 | import torch.optim as optim
  9 | from torchvision import datasets, transforms
 10 | from torch.utils.data import Dataset, DataLoader
 11 | from torch.utils.data.distributed import DistributedSampler
 12 | from torch.nn.parallel import DistributedDataParallel
 13 | 
 14 | 
 15 | def dist_init(host_addr, rank, local_rank, world_size, port=23456):
 16 |     host_addr_full = 'tcp://' + host_addr + ':' + str(port)
 17 |     torch.distributed.init_process_group("gloo", init_method=host_addr_full,
 18 |                                          rank=rank, world_size=world_size)
 19 |     assert torch.distributed.is_initialized()
 20 | 
 21 | class Net(nn.Module):
 22 |     def __init__(self):
 23 |         super(Net, self).__init__()
 24 |         self.conv1 = nn.Conv2d(1, 20, 5, 1)
 25 |         self.conv2 = nn.Conv2d(20, 50, 5, 1)
 26 |         self.fc1 = nn.Linear(4*4*50, 500)
 27 |         self.fc2 = nn.Linear(500, 10)
 28 | 
 29 |     def forward(self, x):
 30 |         x = F.relu(self.conv1(x))
 31 |         x = F.max_pool2d(x, 2, 2)
 32 |         x = F.relu(self.conv2(x))
 33 |         x = F.max_pool2d(x, 2, 2)
 34 |         x = x.view(-1, 4*4*50)
 35 |         x = F.relu(self.fc1(x))
 36 |         x = self.fc2(x)
 37 |         return F.log_softmax(x, dim=1)
 38 | 
 39 | def train(args, model, local_rank, train_loader, optimizer, epoch):
 40 |     model.train()
 41 |     for batch_idx, (data, target) in enumerate(train_loader):
 42 |         data, target = data.to(local_rank), target.to(local_rank)
 43 |         optimizer.zero_grad()
 44 |         output = model(data)
 45 |         loss = F.nll_loss(output, target)
 46 |         loss.backward()
 47 |         optimizer.step()
 48 |         if batch_idx % args.log_interval == 0:
 49 |             print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
 50 |                 epoch, batch_idx * len(data), len(train_loader.dataset),
 51 |                 100. * batch_idx / len(train_loader), loss.item()))
 52 | 
 53 | def test(args, model, local_rank, test_loader, world_size):
 54 |     model.eval()
 55 |     test_loss = 0
 56 |     correct = 0
 57 |     length = len(test_loader.dataset)/world_size
 58 |     with torch.no_grad():
 59 |         for data, target in test_loader:
 60 |             data, target = data.to(local_rank), target.to(local_rank)
 61 |             output = model(data)
 62 |             test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss
 63 |             pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability
 64 |             correct += pred.eq(target.view_as(pred)).sum().item()
 65 | 
 66 |     test_loss /= length
 67 | 
 68 |     print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
 69 |         test_loss, correct, length,
 70 |         100. * correct / length))
 71 | 
 72 | def main():
 73 | 
 74 |     # Training settings
 75 |     parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
 76 |     parser.add_argument('--batch-size', type=int, default=64, metavar='N',
 77 |                         help='input batch size for training (default: 64)')
 78 |     parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
 79 |                         help='input batch size for testing (default: 1000)')
 80 |     parser.add_argument('--epochs', type=int, default=10, metavar='N',
 81 |                         help='number of epochs to train (default: 10)')
 82 |     parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
 83 |                         help='learning rate (default: 0.01)')
 84 |     parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
 85 |                         help='SGD momentum (default: 0.5)')
 86 |     parser.add_argument('--no-cuda', action='store_true', default=False,
 87 |                         help='disables CUDA training')
 88 |     parser.add_argument('--seed', type=int, default=1, metavar='S',
 89 |                         help='random seed (default: 1)')
 90 |     parser.add_argument('--log-interval', type=int, default=10, metavar='N',
 91 |                         help='how many batches to wait before logging training status')
 92 |     parser.add_argument('--save-model', action='store_true', default=False,
 93 |                         help='For Saving the current Model')
 94 |     parser.add_argument('--datasetDir',
 95 |                         help='Please add your dataset directory')
 96 | 
 97 |     args = parser.parse_args()
 98 | 
 99 |     # use_cuda = not args.no_cuda and torch.cuda.is_available()
100 |     # torch.manual_seed(args.seed)
101 |     # device = torch.device("cuda" if use_cuda else "cpu")
102 |     # kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
103 | 
104 |     rank = int(os.environ['SLURM_PROCID'])
105 |     local_rank = int(os.environ['SLURM_LOCALID'])
106 |     world_size = int(os.environ['SLURM_NTASKS'])
107 |     iplist = os.environ['SLURM_JOB_NODELIST']
108 |     ip = subprocess.getoutput(f"scontrol show hostname {iplist} | head -n1")
109 | 
110 |     dist_init(ip, rank, local_rank, world_size)
111 |     train_dataset = datasets.MNIST(args.datasetDir, train=True, download=False,
112 |                        transform=transforms.Compose([
113 |                            transforms.ToTensor(),
114 |                            transforms.Normalize((0.1307,), (0.3081,))
115 |                        ]))
116 |     train_sampler = DistributedSampler(train_dataset, num_replicas=world_size, rank=rank)
117 |     train_loader = torch.utils.data.DataLoader(
118 |         train_dataset,
119 |         batch_size=args.batch_size,
120 |         sampler=train_sampler
121 |         )
122 |     test_dataset = datasets.MNIST(args.datasetDir, train=False, transform=transforms.Compose([
123 |                            transforms.ToTensor(),
124 |                            transforms.Normalize((0.1307,), (0.3081,))
125 |                        ]))
126 |     test_sampler = DistributedSampler(test_dataset, num_replicas=world_size, rank=rank)
127 | 
128 |     test_loader = torch.utils.data.DataLoader(
129 |         test_dataset,
130 |         batch_size=args.test_batch_size,
131 |         sampler=test_sampler
132 |         )
133 | 
134 |     model = Net().to(local_rank)
135 |     model = DistributedDataParallel(model)
136 |     optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)
137 | 
138 |     for epoch in range(1, args.epochs + 1):
139 |         train(args, model, local_rank, train_loader, optimizer, epoch)
140 |         test(args, model, local_rank, test_loader, world_size)
141 | 
142 |     if (args.save_model):
143 |         torch.save(model.state_dict(),"mnist_cnn.pt")
144 | if __name__ == "__main__":
145 |     main()
146 | 


--------------------------------------------------------------------------------
/example/README.md:
--------------------------------------------------------------------------------
  1 | # tcloud SDK Examples
  2 | 
  3 | TACC supports multiple ML frameworks such as TensorFlow, PyTorch and MXNet. We will later support specialized ML frameworks such as FATE. Here we list several job examples for different frameworks.
  4 | 
  5 | ## HelloWorld
  6 | 
  7 | + CityNet Dataset: OpenRoadMap
  8 | + Task: basic usage of tcloud
  9 | + Code: [main.py](helloworld/main.py)
 10 | 
 11 | ### Getting started
 12 | 
 13 | + Install tcloud CLI, and run `tcloud init` to pull the latest cluster configurations from remote.
 14 | 
 15 | + Configuration
 16 | 
 17 |   + Configure user information using `tcloud config`.
 18 | 
 19 |   + TACC ENV
 20 | 
 21 |     ~~~shell
 22 |     TACC_WORKDIR # default repo directory
 23 |     TACC_USERDIR # user directory
 24 |     TACC_SLURM_USERLOG # slurm log directory default: ${TACC_USERDIR}/slurm_log
 25 |     ~~~
 26 | 
 27 |   + TuXiv configuration
 28 | 
 29 |     ~~~yaml
 30 |     # tuxiv.conf
 31 |     entrypoint:
 32 |     - python ${TACC_WORKDIR}/main.py
 33 |     environment:
 34 |         name: hello 
 35 |         dependencies:
 36 |             - python=3.6.9
 37 |     job:
 38 |         name: test
 39 |         general:
 40 |             - output=${TACC_SLURM_USERLOG}/hello.out
 41 |             - nodes=1
 42 |             - ntasks=1
 43 |             - cpus-per-task=1
 44 |     datasets:
 45 |       - OpenRoadMap
 46 |     ~~~
 47 | 
 48 |   + Model code modification
 49 | 
 50 |     ~~~python
 51 |     import os
 52 |     import shutil
 53 |     # get variables from env
 54 |     WORKDIR = os.environ.get('TACC_WORKDIR')
 55 |     USERDIR = os.environ.get('TACC_USERDIR')
 56 |     # show the directory tree
 57 |     os.system('tree -L 2 {}'.format(USERDIR))
 58 |     # basic copy operation
 59 |     shutil.copytree(WORKDIR, "{}/helloworld".format(USERDIR))
 60 |     ~~~
 61 | 
 62 | ### Submit job
 63 | 
 64 | + Enter the `helloworld` directory and follow the steps below.
 65 | + Build environment and submit job: `tcloud submit`
 66 | + Monitor job: `tcloud ps [-j] [<JOB_ID>]`
 67 | + Obtain log: `tcloud download helloworld/slurm_log/hello.out`
 68 | + Cancel job: `tcloud cancel [-j] [<JOB_ID>]`
 69 | + View UserDir: `tcloud ls <PATH>`
 70 | 
 71 | 
 72 | 
 73 | ## TensorFlow
 74 | 
 75 | + Dataset: mnist
 76 | + Task: image classification
 77 | + Code: [mnist.py](TensorFlow/mnist.py)
 78 | 
 79 | ### Getting started
 80 | 
 81 | + Install tcloud CLI, and run `tcloud init` to pull cluster configurations from remote.
 82 | 
 83 | + Configuration
 84 | 
 85 |   + Configure user information using `tcloud config`.
 86 | 
 87 |   + TACC ENV
 88 | 
 89 |     ~~~shell
 90 |     TACC_WORKDIR # default repo directory
 91 |     TACC_USERDIR # user directory
 92 |     TACC_SLURM_USERLOG # slurm log directory default: ${TACC_USERDIR}/slurm_log
 93 |     ~~~
 94 | 
 95 |   + TuXiv configuration
 96 |   
 97 |     ~~~yaml
 98 |     # tuxiv.conf
 99 |     entrypoint:
100 |         - python ${TACC_WORKDIR}/mnist.py 
101 |         - --task_index=0
102 |         - --data_dir=${TACC_WORKDIR}/datasets/mnist_data
103 |         - --batch_size=1
104 |     environment:
105 |         name: tf 
106 |         dependencies:
107 |             - tensorflow=1.15
108 |     job:
109 |         name: mnist
110 |         general:
111 |           - nodes=2
112 |     ~~~
113 | 
114 |   + Model code modification
115 |   
116 |     Use ` tf.distribute.cluster_resolver.SlurmClusterResolver`  instead of other resolvers.
117 | 
118 | ### Training
119 | 
120 | + Enter the `TensorFlow` directory and follow the steps below.
121 | + Build environment and submit job: `tcloud submit`
122 | + Monitor job: `tcloud ps [-j] [<JOB_ID>]`
123 | + Cancel job: `tcloud cancel [-j] [<JOB_ID>]`
124 | + View UserDir: `tcloud ls <PATH>`
125 | 
126 | 
127 | 
128 | ## PyTorch
129 | 
130 | + Dataset: mnist
131 | + Task: image classification
132 | + Code: [mnist.py](PyTorch/mnist.py)
133 | 
134 | ### Getting started
135 | 
136 | + Install tcloud CLI, and run `tcloud init` to pull cluster configurations from remote.
137 | 
138 | + Configuration
139 | 
140 |   + Configure user information using `tcloud config`.
141 | 
142 |   + TACC ENV
143 | 
144 |     ~~~shell
145 |     TACC_WORKDIR # default repo directory
146 |     TACC_USERDIR # user directory
147 |     TACC_SLURM_USERLOG # slurm log directory default: ${TACC_USERDIR}/slurm_log
148 |     ~~~
149 | 
150 |   + TuXiv configuration
151 |   
152 |     ~~~yaml
153 |     # tuxiv.conf
154 |     entrypoint:
155 |         - python ${TACC_WORKDIR}/mnist.py --epoch=3
156 |     environment:
157 |         name: torch-env
158 |         dependencies:
159 |             - pytorch=1.6.0
160 |             - torchvision=0.7.0
161 |         channels: pytorch
162 |     job:
163 |         name: test
164 |         general:
165 |           - nodes=2
166 |     ~~~
167 | 
168 |   + Model code modification
169 | 
170 |     Obtain environment variables from the Slurm cluster, and set the parameters for initializing the process group.
171 |   
172 |     ~~~python
173 |     # example
174 |     def dist_init(host_addr, rank, local_rank, world_size, port=23456):
175 |         host_addr_full = 'tcp://' + host_addr + ':' + str(port)
176 |         torch.distributed.init_process_group("gloo", init_method=host_addr_full,
177 |                                              rank=rank, world_size=world_size)
178 |         assert torch.distributed.is_initialized()
179 |     
180 |     def get_ip(iplist):
181 |         return iplist.split('[')[0] + iplist.split('[')[1].split('-')[0]
182 |         
183 |     rank = int(os.environ['SLURM_PROCID'])
184 |     local_rank = int(os.environ['SLURM_LOCALID'])
185 |     world_size = int(os.environ['SLURM_NTASKS'])
186 |     iplist = os.environ['SLURM_STEP_NODELIST']
187 |     ip = get_ip(iplist) # get_ip() depends on the format of the nodelist
188 |     dist_init(ip, rank, local_rank, world_size)
189 |     ~~~
190 | 
191 | ### Training
192 | 
193 | + Enter the `PyTorch` directory and follow the steps below.
194 | + Build environment and submit job: `tcloud submit`
195 | + Monitor job: `tcloud ps [-j] [<JOB_ID>]`
196 | + Cancel job: `tcloud cancel [-j] [<JOB_ID>]`
197 | + View UserDir: `tcloud ls <PATH>`
198 | 
199 | 
200 | 
201 | ## MXNet
202 | 
203 | + Dataset: mnist
204 | + Task: image classification
205 | + Code: [mnist.py](MXNet/mnist.py)
206 | 
207 | ### Getting started
208 | 
209 | + Install tcloud CLI, and run `tcloud init` to pull cluster configurations from remote.
210 | 
211 | + Configuration
212 | 
213 |   + Configure user information using `tcloud config`.
214 | 
215 |   + TACC ENV
216 | 
217 |     ~~~shell
218 |     TACC_WORKDIR # default repo directory
219 |     TACC_USERDIR # user directory
220 |     TACC_SLURM_USERLOG # slurm log directory default: ${TACC_USERDIR}/slurm_log
221 |     ~~~
222 | 
223 |   + TuXiv configuration
224 |   
225 |     ~~~yaml
226 |     # tuxiv.conf
227 |     entrypoint:
228 |     - python ${TACC_WORKDIR}/mnist.py
229 |     environment:
230 |         name: mxnet-env 
231 |         dependencies:
232 |             - mxnet=1.5.0
233 |     job:
234 |         name: test
235 |         general:
236 |           - nodes=2
237 |     ~~~
238 | 
239 |   + Model code modification
240 |   
241 |     Obtain environment variables from the Slurm cluster, and set the parameters for initializing the cluster.
242 | 
243 | ### Training
244 | 
245 | + Enter the `MXNet` directory and follow the steps below.
246 | + Build environment and submit job: `tcloud submit`
247 | + Monitor job: `tcloud ps [-j] [<JOB_ID>]`
248 | + Cancel job: `tcloud cancel [-j] [<JOB_ID>]`
249 | + View UserDir: `tcloud ls <PATH>`
250 | 
251 | 


--------------------------------------------------------------------------------