├── cfg
│   ├── env
│   │   ├── hyrax.yml
│   │   └── euler.yml
│   └── exp
│       ├── exp.yml
│       ├── exp2.yml
│       └── tmp
│           └── exp.yml
├── modules
│   ├── utils
│   │   ├── __init__.py
│   │   └── loading.py
│   ├── lightning
│   │   ├── __init__.py
│   │   └── lightning.py
│   └── datasets
│       ├── __init__.py
│       └── random_dataset.py
├── ansible
│   ├── experiments.yml
│   └── queue_jobs.yml
├── docs
│   ├── login.png
│   └── cluster_web.jpeg
├── scripts
│   ├── copy_dataset.sh
│   └── submit.sh
├── main.py
├── conda
│   └── py38.yml
└── readme.md

/cfg/env/hyrax.yml:
--------------------------------------------------------------------------------
base: /home/jonfrey/results
--------------------------------------------------------------------------------
/modules/utils/__init__.py:
--------------------------------------------------------------------------------
from .loading import *
--------------------------------------------------------------------------------
/modules/lightning/__init__.py:
--------------------------------------------------------------------------------
from .lightning import *
--------------------------------------------------------------------------------
/ansible/experiments.yml:
--------------------------------------------------------------------------------
jobs:
  - exp: cfg/exp/exp.yml
--------------------------------------------------------------------------------
/modules/datasets/__init__.py:
--------------------------------------------------------------------------------
from .random_dataset import *
--------------------------------------------------------------------------------
/cfg/env/euler.yml:
--------------------------------------------------------------------------------
base: /cluster/work/riner/users/jonfrey/runs
--------------------------------------------------------------------------------
/docs/login.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JonasFrey96/ASL_leonhard_euler/HEAD/docs/login.png
--------------------------------------------------------------------------------
/docs/cluster_web.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JonasFrey96/ASL_leonhard_euler/HEAD/docs/cluster_web.jpeg
--------------------------------------------------------------------------------
/scripts/copy_dataset.sh:
--------------------------------------------------------------------------------
tar -xvf /cluster/work/riner/your/tarfile.tar -C $TMPDIR >/dev/null 2>&1
--------------------------------------------------------------------------------
/cfg/exp/exp.yml:
--------------------------------------------------------------------------------
name: test
timestamp: True

trainer:
  max_epochs: 1000
  accelerator: ddp
  gpus: -1
--------------------------------------------------------------------------------
/cfg/exp/exp2.yml:
--------------------------------------------------------------------------------
name: test_2
timestamp: True

trainer:
  max_epochs: 10
  accelerator: ddp
  gpus: -1
--------------------------------------------------------------------------------
/cfg/exp/tmp/exp.yml:
--------------------------------------------------------------------------------
name: /home/jonfrey/results/2021-03-14T15:30:39_test
timestamp: true
trainer:
  max_epochs: 10
  accelerator: ddp
  gpus: -1
--------------------------------------------------------------------------------
/modules/utils/loading.py:
--------------------------------------------------------------------------------
import os
import yaml
__all__ = ['file_path',
           'load_yaml']

def file_path(string):
    if os.path.isfile(string):
        return string
    else:
        raise FileNotFoundError(string)

def load_yaml(path):
    with open(path) as file:
        res = yaml.load(file, Loader=yaml.FullLoader)
    return res
--------------------------------------------------------------------------------
/modules/datasets/random_dataset.py:
--------------------------------------------------------------------------------
import torch
from torch.utils.data import Dataset
__all__ = ['RandomDataset']
class RandomDataset(Dataset):
    def __init__(self, size, length):
        self.len = length
        self.data = torch.randn(length, size)

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return self.len
--------------------------------------------------------------------------------
/scripts/submit.sh:
--------------------------------------------------------------------------------
# Loading correct modules
module list &> /dev/null || source /cluster/apps/modules/init/bash
module purge
module load legacy new gcc/6.3.0 hdf5 eth_proxy

# To export all environment variables
source ~/.bashrc
# Navigate to the working directory
cd $HOME/ASL_leonhard_euler
# Executing the script
/cluster/home/jonfrey/miniconda3/envs/py38/bin/python main.py $@
--------------------------------------------------------------------------------
/ansible/queue_jobs.yml:
--------------------------------------------------------------------------------
---

- name: Schedule Experiments
  hosts: euler
  vars:
    - project_dir: "{{ ansible_env.HOME }}/"
  tasks:
    - name: Sync
      synchronize:
        src: /home/jonfrey/ASL_leonhard_euler
        dest: "{{ project_dir }}"
        perms: True
    - name: Load experiments
      include_vars:
        file: /home/jonfrey/ASL_leonhard_euler/ansible/experiments.yml
        name: experiments
    - name: Schedule all experiments
      shell: >
        bsub -n 1 -W 0:10 -R "rusage[mem=5000,ngpus_excl_p=2]" -R "select[gpu_mtotal0>=10000]" -R "rusage[scratch=1000]" $HOME/ASL_leonhard_euler/scripts/submit.sh --exp={{ item.exp }}
      loop: "{{ experiments.jobs }}"
--------------------------------------------------------------------------------
/modules/lightning/lightning.py:
--------------------------------------------------------------------------------
import torch

from pytorch_lightning import LightningModule

__all__ = ['BoringModel']
class BoringModel(LightningModule):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def forward(self, x):
        return self.layer(x)

    def loss(self, batch, prediction):
        return torch.nn.functional.mse_loss(prediction, torch.ones_like(prediction))

    def step(self, x):
        x = self.layer(x)
        out = torch.nn.functional.mse_loss(x, torch.ones_like(x))
        return out

    def training_step(self, batch, batch_idx):
        output = self.layer(batch)
        loss = self.loss(batch, output)
        return {"loss": loss}

    def training_step_end(self, training_step_outputs):
        return training_step_outputs

    def training_epoch_end(self, outputs) -> None:
        torch.stack([x["loss"] for x in outputs]).mean()

    def validation_step(self, batch, batch_idx):
        output = self.layer(batch)
        loss = self.loss(batch, output)
        return {"x": loss}

    def validation_epoch_end(self, outputs) -> None:
        torch.stack([x['x'] for x in outputs]).mean()

    def test_step(self, batch, batch_idx):
        output = self.layer(batch)
        loss = self.loss(batch, output)
        return {"y": loss}

    def test_epoch_end(self, outputs) -> None:
        torch.stack([x["y"] for x in outputs]).mean()

    def configure_optimizers(self):
        optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1)
        lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1)
        return [optimizer], [lr_scheduler]
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import os
import sys
sys.path.insert(0, os.getcwd())
sys.path.append(os.path.join(os.getcwd() + '/modules/'))

import argparse
import torch
import yaml
import datetime
from pytorch_lightning import Trainer
import shutil
from pathlib import Path

from datasets import RandomDataset
from lightning import BoringModel
from utils import load_yaml


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--exp', default='cfg/exp/exp.yml',
                        help='The main experiment yaml file.')
    args = parser.parse_args()

    local_rank = int(os.environ.get('LOCAL_RANK', 0))

    # Load experiment settings and environment file
    exp_cfg_path = args.exp
    if local_rank != 0:
        # If this is not the main task, load the modified exp config file.
        # This allows all DDP tasks to use the same timestamped model folder.
        rm = exp_cfg_path.find('cfg/exp/') + len('cfg/exp/')
        exp_cfg_path = os.path.join(exp_cfg_path[:rm], 'tmp/', exp_cfg_path[rm:])
    exp = load_yaml(exp_cfg_path)

    env_cfg_path = os.path.join('cfg/env', os.environ['ENV_WORKSTATION_NAME'] + '.yml')
    env = load_yaml(env_cfg_path)

    # Create the model folder (only for the rank 0 ddp task)!
    if local_rank == 0:
        # Prepend the timestamp to the model folder name.
        if exp.get('timestamp', True):
            timestamp = datetime.datetime.now().replace(microsecond=0).isoformat()
            model_path = os.path.join(env['base'], exp['name'])
            p = model_path.split('/')
            model_path = os.path.join('/', *p[:-1], str(timestamp) + '_' + p[-1])
        else:
            model_path = os.path.join(env['base'], exp['name'])

        shutil.rmtree(model_path, ignore_errors=True)
        Path(model_path).mkdir(parents=True, exist_ok=True)

        exp_cfg_fn = os.path.split(exp_cfg_path)[-1]
        env_cfg_fn = os.path.split(env_cfg_path)[-1]
        print(f'Copy {exp_cfg_path} to {model_path}/{exp_cfg_fn}')
        shutil.copy(exp_cfg_path, f'{model_path}/{exp_cfg_fn}')
        shutil.copy(env_cfg_path, f'{model_path}/{env_cfg_fn}')
        exp['name'] = model_path
    else:
        # The correct model path has already been written to the yaml file.
        model_path = os.path.join(exp['name'], f'rank_{local_rank}')
        # Create the directory
        Path(model_path).mkdir(parents=True, exist_ok=True)

    # Write back the modified exp file
    if local_rank == 0:
        rm = exp_cfg_path.find('cfg/exp/') + len('cfg/exp/')
        exp_cfg_path = os.path.join(exp_cfg_path[:rm], 'tmp/', exp_cfg_path[rm:])
        Path(exp_cfg_path).parent.mkdir(parents=True, exist_ok=True)
        with open(exp_cfg_path, 'w+') as f:
            yaml.dump(exp, f, default_flow_style=False, sort_keys=False)


    # Fake data
    train_data = torch.utils.data.DataLoader(RandomDataset(32, 64))
    val_data = torch.utils.data.DataLoader(RandomDataset(32, 64))
    test_data = torch.utils.data.DataLoader(RandomDataset(32, 64))

    # Model
    model = BoringModel()

    # Train
    if (exp['trainer']).get('gpus', -1):
        nr = torch.cuda.device_count()
        exp['trainer']['gpus'] = nr
        print(f'Set GPU Count for Trainer to {nr}!')

    trainer = Trainer(
        **exp['trainer'],
        default_root_dir=model_path
    )
    trainer.fit(model, train_data, val_data)
--------------------------------------------------------------------------------
/conda/py38.yml:
--------------------------------------------------------------------------------
name: py38
channels:
  - anaconda
  - conda-forge
  - defaults
dependencies:
  - _libgcc_mutex=0.1=conda_forge
  - _openmp_mutex=4.5=1_llvm
  - attrs=20.3.0=pyhd3eb1b0_0
  - blas=1.0=mkl
  - blinker=1.4=py38h06a4308_0
  - bravado=11.0.3=pyhd8ed1ab_0
  - bravado-core=5.17.0=pyh9f0ad1d_0
  - brotlipy=0.7.0=py38h27cfd23_1003
  - bzip2=1.0.8=h7b6447c_0
  - ca-certificates=2021.1.19=h06a4308_1
  - cairo=1.16.0=h7979940_1007
  - certifi=2020.12.5=py38h06a4308_0
  - cffi=1.14.5=py38h261ae71_0
  - chardet=4.0.0=py38h06a4308_1003
  - click=7.1.2=pyhd3eb1b0_0
  - coloredlogs=15.0=py38h06a4308_0
  - cryptography=3.4.6=py38hd23ed53_0
  - cycler=0.10.0=py38_0
  - cython=0.29.22=py38h2531618_0
  - dbus=1.13.18=hb2f20db_0
  - expat=2.2.10=he6710b0_2
  - ffmpeg=4.3.1=hca11adc_2
  - fontconfig=2.13.1=h6c09931_0
  - freetype=2.10.4=h5ab3b9f_0
  - future=0.18.2=py38_1
  - gettext=0.19.8.1=h9b4dc7a_1
  - gitdb=4.0.5=py_0
  - gitpython=3.1.14=pyhd3eb1b0_1
  - glib=2.67.4=h36276a3_1
  - glib-tools=2.66.7=h9c3ff4c_1
  - gmp=6.2.1=h2531618_2
  - gnutls=3.6.13=h85f3911_1
  - graphite2=1.3.14=h23475e2_0
  - gst-plugins-base=1.14.0=h8213a91_2
  - gstreamer=1.14.0=h28cd5cc_2
  - h5py=2.10.0=py38hd6299e0_1
  - harfbuzz=2.7.4=h5cf4720_0
  - hdf5=1.10.6=hb1b8bf9_0
  - humanfriendly=9.1=py38h06a4308_0
  - icu=58.2=he6710b0_3
  - idna=2.10=pyhd3eb1b0_0
  - imageio=2.9.0=py_0
  - importlib-metadata=2.0.0=py_1
  - importlib_metadata=2.0.0=1
  - iniconfig=1.1.1=pyhd3eb1b0_0
  - intel-openmp=2020.2=254
  - jasper=1.900.1=hd497a04_4
  - joblib=1.0.1=pyhd3eb1b0_0
  - jpeg=9b=h024ee3a_2
  - jsonref=0.2=py_0
  - jsonschema=3.2.0=py_2
  - kiwisolver=1.3.1=py38h2531618_0
  - krb5=1.17.1=h173b8e3_0
  - lame=3.100=h7b6447c_0
  - lcms2=2.11=h396b838_0
  - ld_impl_linux-64=2.33.1=h53a641e_7
  - libblas=3.9.0=8_mkl
  - libcblas=3.9.0=8_mkl
  - libclang=11.1.0=default_ha53f305_0
  - libedit=3.1.20191231=h14c3975_1
  - libevent=2.1.10=hcdb4288_3
  - libffi=3.3=he6710b0_2
  - libgcc-ng=9.3.0=h2828fa1_18
  - libgfortran-ng=7.3.0=hdf63c60_0
  - libglib=2.66.7=h3e27bee_1
  - libiconv=1.16=h516909a_0
  - liblapack=3.9.0=8_mkl
  - liblapacke=3.9.0=8_mkl
  - libllvm11=11.1.0=hf817b99_0
  - libopencv=4.5.1=py38h703c3c0_0
  - libpng=1.6.37=hbc83047_0
  - libpq=12.3=h255efa7_3
  - libstdcxx-ng=9.1.0=hdf63c60_0
  - libtiff=4.2.0=h3942068_0
  - libuuid=1.0.3=h1bed415_2
  - libwebp-base=1.2.0=h27cfd23_0
  - libxcb=1.14=h7b6447c_0
  - libxkbcommon=1.0.3=he3ba5ed_0
  - libxml2=2.9.10=hb55368b_3
  - llvm-openmp=11.0.1=h4bd325d_0
  - lz4-c=1.9.3=h2531618_0
  - matplotlib=3.3.4=py38h06a4308_0
  - matplotlib-base=3.3.4=py38h62a2d02_0
  - mkl=2020.2=256
  - mkl-service=2.3.0=py38he904b0f_0
  - mkl_fft=1.3.0=py38h54f3939_0
  - mkl_random=1.1.1=py38h0573a6f_0
  - monotonic=1.5=py_0
  - more-itertools=8.7.0=pyhd3eb1b0_0
  - msgpack-python=1.0.2=py38hff7bd54_1
  - mysql-common=8.0.22=ha770c72_1
  - mysql-libs=8.0.22=h1fd7589_1
  - ncurses=6.2=he6710b0_1
  - neptune-client=0.5.1=pyh44b312d_0
  - nettle=3.6=he412f7d_0
  - nspr=4.29=h9c3ff4c_1
  - nss=3.62=hb5efdd6_0
  - numpy=1.19.2=py38h54aff64_0
  - numpy-base=1.19.2=py38hfa32c7d_0
  - oauthlib=3.1.0=py_0
  - olefile=0.46=py_0
  - openh264=2.1.1=h780b84a_0
  - openssl=1.1.1j=h27cfd23_0
  - packaging=20.9=pyhd3eb1b0_0
  - pandas=1.2.3=py38ha9443f7_0
  - pcre=8.44=he6710b0_0
  - pillow=8.1.2=py38he98fc37_0
  - pip=21.0.1=py38h06a4308_0
  - pixman=0.40.0=h7b6447c_0
  - pluggy=0.13.1=py38_0
  - psutil=5.7.2=py38h7b6447c_0
  - py=1.10.0=pyhd3eb1b0_0
  - py-opencv=4.5.1=py38h81c977d_0
  - py3nvml=0.2.5=py38h32f6830_2
  - pycocotools=2.0.2=py38h497a2fe_1
  - pycparser=2.20=py_2
  - pyjwt=1.7.1=py38_0
  - pyopenssl=20.0.1=pyhd3eb1b0_1
  - pyparsing=2.4.7=pyhd3eb1b0_0
  - pyqt=5.9.2=py38h05f1152_4
  - pyrsistent=0.17.3=py38h7b6447c_0
  - pysocks=1.7.1=py38h06a4308_0
  - pytest=6.2.2=py38h06a4308_2
  - python=3.8.5=h7579374_1
  - python-dateutil=2.8.1=pyhd3eb1b0_0
  - python_abi=3.8=1_cp38
  - pytz=2021.1=pyhd3eb1b0_0
  - qt=5.9.7=h5867ecd_1
  - readline=8.1=h27cfd23_0
  - requests=2.25.1=pyhd3eb1b0_0
  - requests-oauthlib=1.3.0=py_0
  - scikit-learn=0.24.1=py38ha9443f7_0
  - scipy=1.6.1=py38h91f5cce_0
  - setuptools=52.0.0=py38h06a4308_0
  - simplejson=3.17.2=py38h27cfd23_2
  - sip=4.19.13=py38he6710b0_0
  - six=1.15.0=py38h06a4308_0
  - smmap=3.0.5=pyhd3eb1b0_0
  - sqlite=3.33.0=h62c20be_0
  - swagger-spec-validator=2.7.3=pyh9f0ad1d_0
  - threadpoolctl=2.1.0=pyh5ca1d4c_0
  - tk=8.6.10=hbc83047_0
  - toml=0.10.1=py_0
  - tornado=6.1=py38h27cfd23_0
  - typing-extensions=3.7.4.3=hd3eb1b0_0
  - typing_extensions=3.7.4.3=pyh06a4308_0
  - urllib3=1.26.3=pyhd3eb1b0_0
  - websocket-client=0.58.0=py38h06a4308_4
  - wheel=0.36.2=pyhd3eb1b0_0
  - x264=1!161.3030=h7f98852_0
  - xmltodict=0.12.0=py_0
  - xorg-kbproto=1.0.7=h7f98852_1002
  - xorg-libice=1.0.10=h7f98852_0
  - xorg-libsm=1.2.2=h470a237_5
  - xorg-libx11=1.7.0=h7f98852_0
  - xorg-libxext=1.3.4=h7f98852_1
  - xorg-libxrender=0.9.10=h7f98852_1003
  - xorg-renderproto=0.11.1=h7f98852_1002
  - xorg-xextproto=7.3.0=h7f98852_1002
  - xorg-xproto=7.0.31=h27cfd23_1007
  - xz=5.2.5=h7b6447c_0
  - yaml=0.2.5=h7b6447c_0
  - zipp=3.4.0=pyhd3eb1b0_0
  - zlib=1.2.11=h7b6447c_3
  - zstd=1.4.5=h9ceee32_0
  - pip:
    - absl-py==0.12.0
    - aiohttp==3.7.4.post0
    - async-timeout==3.0.1
    - cachetools==4.2.1
    - fsspec==0.8.7
    - google-auth==1.27.1
    - google-auth-oauthlib==0.4.3
    - grpcio==1.36.1
    - jsonpointer==2.0
    - markdown==3.3.4
    - multidict==5.1.0
    - opencv-python==4.4.0.46
    - protobuf==3.15.6
    - pyasn1==0.4.8
    - pyasn1-modules==0.2.8
    - pytorch-lightning==1.2.3
    - pyyaml==5.3.1
    - rfc3987==1.3.8
    - rsa==4.7.2
    - strict-rfc3339==0.7
    - tensorboard==2.4.1
    - tensorboard-plugin-wit==1.8.0
    - torch==1.7.1+cu110
    - torchaudio==0.7.2
    - torchvision==0.8.2+cu110
    - tqdm==4.59.0
    - webcolors==1.11.1
    - werkzeug==1.0.1
    - yarl==1.6.3
prefix: /cluster/home/jonfrey/miniconda3/envs/py38
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
# Leonhard and Euler Guide

- [Leonhard and Euler Guide](#leonhard-and-euler-guide)
  - [Preliminary](#preliminary)
    - [Resources](#resources)
    - [Connecting to the cluster](#connecting-to-the-cluster)
  - [Modules](#modules)
  - [Python](#python)
    - [Using the pre-compiled binaries](#using-the-pre-compiled-binaries)
    - [Setup Miniconda](#setup-miniconda)
    - [Creating Conda Environment](#creating-conda-environment)
    - [Test your Python Installation](#test-your-python-installation)
  - [Using Jupyter Notebook for Visualizing Results](#using-jupyter-notebook-for-visualizing-results)
  - [Storing Data](#storing-data)
    - [General Procedure](#general-procedure)
    - [Implementation Commands](#implementation-commands)
      - [**Taring** a folder without compression](#taring-a-folder-without-compression)
      - [**Copying** a **folder from local computer** to the cluster](#copying-a-folder-from-local-computer-to-the-cluster)
      - [**Copying** a **folder** **from cluster** to your local computer](#copying-a-folder-from-cluster-to-your-local-computer)
      - [**Untaring** the tar file to the local storage of a node](#untaring-the-tar-file-to-the-local-storage-of-a-node)
      - [Accessing local scratch directory in Python](#accessing-local-scratch-directory-in-python)
    - [Performance](#performance)
  - [Scheduling Jobs](#scheduling-jobs)
    - [Interactive jobs](#interactive-jobs)
    - [Monitoring Jobs](#monitoring-jobs)
    - [Scheduling Python-Job Manual](#scheduling-python-job-manual)
    - [Python Debugging Tips](#python-debugging-tips)
    - [Template Environment](#template-environment)
  - [Template Project Overview](#template-project-overview)
  - [Setting up Environment Variables on the cluster](#setting-up-environment-variables-on-the-cluster)
  - [Using Ansible](#using-ansible)
    - [Installation](#installation)
    - [Configure](#configure)
    - [Testing the settings](#testing-the-settings)
    - [Scheduling Jobs Using Playbooks](#scheduling-jobs-using-playbooks)

## Preliminary

This is the readme copied from [JonasFrey96/ASL_leonhard_euler](https://github.com/JonasFrey96/ASL_leonhard_euler).
If you have found a nice tool or trick that helps other people use the cluster, feel free to open an issue!

If you would like to contribute, feel free to mail me: jonfrey@ethz.ch
Feel free to give the repository a **star** if it helps you!

What is coming:
- Example Project Tensorflow
- Example Project PyTorch
- Example Project PyTorch Lightning
- Neptune.ai Logging
- Dataset Examples

This repository is under construction.

### Resources
Read the Leonhard/Euler cluster guides:
[Getting_started_with_clusters](https://scicomp.ethz.ch/wiki/Getting_started_with_clusters)


### Connecting to the cluster
![Cluster Overview](https://github.com/JonasFrey96/ASL_leonhard_euler/blob/master/docs/cluster_web.jpeg?raw=true)


For this you can follow the guide on the official cluster website, which shows you how to generate your local ssh key and copy it to the cluster.

Steps in short:
1. Connect to the ETH network via VPN. Cisco AnyConnect is highly recommended (most stable).
2. Generate your local ssh key.
3. Copy your local ssh key to the cluster by running:
```
cat ~/.ssh/id_rsa.pub | ssh username@login.leonhard.ethz.ch "mkdir -p ~/.ssh && chmod 700 ~/.ssh && cat >> ~/.ssh/authorized_keys"
```
Here we assume you already created your ssh key at `~/.ssh/id_rsa.pub` on your local machine.

4. Try to connect:
`ssh username@login.leonhard.ethz.ch`


## Modules
When you connect to the cluster, you land on a login node.
A variety of modules comes pre-installed.

(https://scicomp.ethz.ch/wiki/Leonhard_applications_and_libraries)


At first make sure to use the new software stack with the following command:
```
env2lmod
```


You can list the currently loaded modules with:
```
module list
```

When you want to develop something in Python you can either use pre-compiled binaries by loading the correct Python module
```
module load gcc/6.3.0 python_gpu/3.7.4 cuda/10.1.243
module load gcc/6.3.0 python_gpu/3.8.5 cuda/11.0.3
```
or create your own Python installation.

The job execution nodes are not directly connected to the internet, but you can access the internet by loading the proxy module.
```
module load eth_proxy
```


## Python
### Using the pre-compiled binaries
You can take a look at the provided pre-compiled Python binaries here:
https://scicomp.ethz.ch/wiki/Python_on_Euler

In general we recommend setting up Miniconda to manage your Python environment.
This allows you to fully match the cluster and your local setup.

### Setup Miniconda
Use Miniconda to set up a custom Python environment.
(https://docs.conda.io/en/latest/miniconda.html)

To install Miniconda:
1. Connect to the cluster
2. Navigate to $HOME
3. Run the following:
```
cd ~ && wget https://repo.anaconda.com/miniconda/Miniconda3-py38_4.9.2-Linux-x86_64.sh
chmod +x Miniconda3-py38_4.9.2-Linux-x86_64.sh
./Miniconda3-py38_4.9.2-Linux-x86_64.sh
```

In general:

It is important to install the conda environment (which will contain a lot of small files) to your $HOME folder (/cluster/home/username/miniconda3). This directory is always copied to the compute node before a job runs. Your home folder is quite small (< 15GB) but perfect for storing your code and the Python environments.

4. Source the .bashrc file or open a new shell.
```
source ~/.bashrc
```
5. Verify your installation:
You should now see the currently loaded conda environment in brackets before your username: `(base) [username@login-node ~]$`


### Creating Conda Environment
Follow this guide on how to set up a new environment.
When using GPUs make sure to match the CUDA version.
You can load different CUDA versions with `module load`.
Also be aware of the GCC version.
We recommend GCC version 6.3.0 and CUDA 11.0.


[Guide on how to manage conda environments](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html)

Execute the following command to create your Python environment named myenv (you can change the name):
```
conda create -n myenv python=3.8.5
conda activate myenv
```

Install some packages:
Example PyTorch installation (here it's important to match the cudatoolkit version!):
```bash
conda install pytorch==1.7.1 \
              torchvision==0.8.2 \
              torchaudio==0.7.2 \
              cudatoolkit=11.0 -c pytorch
```

### Test your Python Installation
1. At first check your Python path:
Command:
```bash
which python
```
Result:
```bash
/cluster/home/username/miniconda3/envs/myenv/bin/python
```

If another path is shown, try to execute `conda deactivate`.
Reactivate your environment: `conda activate myenv`

2. Open an interactive Python shell:
```bash
python
```

```python
import torch
torch.__version__
```
This checks that you have installed the correct PyTorch version.
Exit the shell with `exit()`



## Using Jupyter Notebook for Visualizing Results
[Jupyter_on_Euler_and_Leonhard_Open](https://scicomp.ethz.ch/wiki/Jupyter_on_Euler_and_Leonhard_Open)


## Storing Data
### General Procedure
It's important to manage your data storage correctly on the cluster.
All large datasets should be stored under the `/cluster/work/riner` folder.
If your experiment results are large, store them under `/cluster/work/riner` as well.

It's important to avoid storing many small files. When you need to train your model on a large dataset, the workflow is the following:
1. Tar the dataset folder without compression (a Python alternative to the shell command is sketched below)!
2. Schedule the job and request SCRATCH storage (will be discussed in the jobs section)
3. Untar the dataset to the SCRATCH partition of the compute node, which is mounted under $TMPDIR.
4. Now you can access the small files individually very fast, given that they are on the SSD directly on the compute node and no network transfer is needed.

If you don't follow this procedure and try to access a lot of small files on a network storage (/cluster/work/riner), you will slow down the network and your bandwidth will be massively reduced once you hit a certain file-number limit.
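
If you prefer to script the archiving step (1.) from Python rather than the shell, the standard `tarfile` module can write the uncompressed archive. A minimal sketch; the dataset and archive paths are placeholders you need to adapt:

```python
import tarfile

# Mode 'w' writes an *uncompressed* tar, as recommended above
# ('w:gz' would compress, which we avoid for already-compressed jpgs/pngs).
with tarfile.open('/cluster/work/riner/datasets/dataset.tar', 'w') as tar:
    # arcname controls the folder name inside the archive.
    tar.add('/path/to/dataset_folder', arcname='dataset_folder')
```

The equivalent shell commands for each step follow in the next section.
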

### Implementation Commands

#### **Taring** a folder without compression
```
cd directory/containing/datasets
tar -cvf dataset.tar dataset_folder
```

#### **Copying** a **folder from local computer** to the cluster
Open a shell on your local computer:
```
scp -r ./path/to/local_folder username@login.leonhard.ethz.ch:/cluster/work/riner/some_folder
```
#### **Copying** a **folder** **from cluster** to your local computer
Open a shell on your local computer:
```
scp -r username@login.leonhard.ethz.ch:/cluster/work/riner/results ./path/to/local_results
```
#### **Untaring** the tar file to the local storage of a node
```
tar -xvf /cluster/work/riner/datasets.tar -C $TMPDIR
```

#### Accessing local scratch directory in Python

Given that the TMPDIR variable is automatically set, you can untar your dataset to this location from Python as follows:

```python
import os
tmpdir = os.getenv('TMPDIR')
os.system(f'tar -xf /cluster/work/riner/yourtarfile.tar -C {tmpdir}')
```

### Performance
Don't use compression if you already have compressed files such as images stored as jpgs or pngs.
HDF5 files are also handy to use.
If your dataset is small you can consider loading all files into RAM, given that you can request a huge amount of RAM; a sketch of such a dataset follows below.
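
A minimal sketch of this load-everything-into-RAM pattern for an image dataset, assuming the images were already untarred to $TMPDIR; the folder layout and the png glob are placeholder assumptions:

```python
import os
from pathlib import Path

import numpy as np
import torch
from PIL import Image
from torch.utils.data import Dataset


class InMemoryImageDataset(Dataset):
    """Decodes every image once at construction and keeps it in RAM."""

    def __init__(self, folder=None):
        # Default to the node-local scratch partition described above.
        folder = folder or os.environ['TMPDIR']
        self.images = []
        for p in sorted(Path(folder).glob('**/*.png')):
            # After this loop, __getitem__ never touches the disk again.
            self.images.append(torch.from_numpy(np.array(Image.open(p))))

    def __getitem__(self, index):
        return self.images[index]

    def __len__(self):
        return len(self.images)
```

Remember to request enough memory when scheduling the job (e.g. `rusage[mem=...]`) to hold the decoded dataset.
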

## Scheduling Jobs
Read the **Using the batch system** section of [Getting_started_with_clusters](https://scicomp.ethz.ch/wiki/Getting_started_with_clusters).

### Interactive jobs
At first let's start an interactive job running a shell:
```
bsub -n 16 -W 1:00 -R "rusage[mem=5000,ngpus_excl_p=2]" -R "select[gpu_mtotal0>=10000]" -R "rusage[scratch=10000]" -Is bash
```
This command will return an interactive bash session *(-Is)* with 16 cores *(-n 16)* that runs for 1 hour *(-W 1:00)* with 2 GPUs, each with more than 10GB of GPU memory.
A total RAM of 16x5000MB and a total SSD scratch space of 16x10000MB.

We can run `nvidia-smi` to see the GPU utilization and `htop` to see the CPU usage.

You can now simply activate the correct conda environment and run your Python code as on your local computer.
This is especially useful for debugging. If your code crashes it might happen that the terminal freezes and you have to submit a new interactive session (see also [Python Debugging Tips](#python-debugging-tips) below).

If you know a workaround for this freezing problem, please share it!


### Monitoring Jobs
You can see the running jobs with `bjobs`, or use `bbjobs` for more details.

You can use the JOB-IDs to stop or peek at a job:
```bash
bkill JOB-ID # Sends a stop signal to the selected job
bkill 0      # Sends stop signals to ALL jobs
bpeek JOB-ID # Prints the STDOUT of the selected job to the terminal
```

When you want to evaluate or debug certain problems, it's helpful to connect to the job-execution node directly:
```
bjob_connect JOB-ID
```
You will see in brackets how the node changes from a login node to the execution node.

### Scheduling Python-Job Manual

To schedule a Python job we will create the shell script `submit.sh`.
Don't forget to set the correct permissions for execution:
`chmod +x submit.sh`

```sh
# Always reload all modules before execution for consistency.
module list &> /dev/null || source /cluster/apps/modules/init/bash
module purge
module load legacy new gcc/6.3.0 hdf5 eth_proxy

# Navigate to the folder containing your python project.
cd $HOME/my_project/
# Use the python binary of your conda environment.
# $@ passes the arguments of this script through to the python file.
$HOME/miniconda3/envs/myenv/bin/python main.py $@
```

Scheduling the job:
```bash
bsub -I -n 4 -W 1:00 -R "rusage[mem=5000]" $HOME/submit.sh --env=hello --exp=world
```

`main.py`
```python
import argparse
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--exp', help='Some flag.')
    parser.add_argument('--env', help='Other flag.')
    args = parser.parse_args()
    print(args.exp, args.env)
```


### Python Debugging Tips
When using interactive bash sessions you may want to break the program with CTRL-C without freezing the terminal; it helps to explicitly catch the signal by adding the following to the main script:
```python
import signal
import sys

def signal_handler(signal, frame):
    print('exiting on CTRL-C')
    logger.experiment.stop()  # e.g. stop your experiment logger, if you use one
    sys.exit(0)

signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)
```

### Template Environment
Tested on Leonhard and Euler.
Machine learning and vision tasks.
Python 3.8.5 & GCC 6.3.0

Install:
```
conda env create -f ./conda/py38.yml
```

FRAMEWORKS:
```yaml
- torch=1.7.1+cu110
- scikit-learn=0.24
- scipy=1.6.1
- numpy=1.19.2
- pandas=1.2.3
- pytorch-lightning=1.2.3
- opencv=4.5.1
```
UTILS:
```yaml
- imageio=2.9.0
- pillow=8.1.2
- torchvision=0.8.2+cu110
- h5py=2.10.0
- matplotlib=3.3.4
```

MONITORING:
```yaml
- neptune-client=0.5.1
- tensorboard=2.4.1
```


## Template Project Overview

The template project in this repository ties the pieces above together: `cfg/env/` holds one yaml per machine (e.g. `euler.yml`, `hyrax.yml`) with machine-specific paths, `cfg/exp/` holds one yaml per experiment, `modules/` contains the dataset, model, and utility code, and `main.py` loads both config files and starts a PyTorch Lightning `Trainer`. `scripts/submit.sh` together with the ansible playbooks schedules `main.py` as a cluster job.

## Setting up Environment Variables on the cluster
Append the following lines to the end of your ~/.bashrc file:
`vi ~/.bashrc`

```
export NEPTUNE_API_TOKEN="token"
export ENV_WORKSTATION_NAME="leonhard"
```
Specify your neptune.ai key for logging (only necessary if you want to use neptune).
Specify the name of the cluster. This later allows you to access this variable from your Python script, so you can keep track of which cluster you're on. This variable is also used to load the environment yaml file with the same name, e.g. `/home/jonfrey/ASL_leonhard_euler/cfg/env/euler.yml`, where you are able to specify cluster-specific paths and settings.
This allows you to easily move between your workstation and the cluster.
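
For reference, a minimal sketch of how `main.py` in this template resolves and loads the environment file from this variable (mirroring `modules/utils/loading.py`):

```python
import os
import yaml

# main.py picks the machine-specific config from ENV_WORKSTATION_NAME,
# e.g. ENV_WORKSTATION_NAME=euler resolves to cfg/env/euler.yml.
env_cfg_path = os.path.join('cfg/env', os.environ['ENV_WORKSTATION_NAME'] + '.yml')
with open(env_cfg_path) as f:
    env = yaml.load(f, Loader=yaml.FullLoader)
print(env['base'])  # the machine-specific base folder for results
```
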
## Using Ansible

### Installation
Follow the installing-ansible-on-Ubuntu guide.

(https://docs.ansible.com/ansible/latest/installation_guide/intro_installation.html)

### Configure

Configure the ansible settings by modifying the following files.
1. `sudo vi /etc/ansible/ansible.cfg`
```
[defaults]
remote_user=username
host_key_checking = False
sudo_flags=-H -S
private_key_file = /home/jonfrey/.ssh/id_rsa

[ssh_connection]
pipelining = True
```

2. `sudo vi /etc/ansible/hosts`
```
[leonhard]
login.leonhard.ethz.ch ansible_ssh_user=username

[euler]
euler.ethz.ch ansible_ssh_user=username
```

Replace the username with your ETH email abbreviation.

### Testing the settings

You should now be able to ping the configured hosts:
Command:
```bash
sudo ansible all -m ping
```

Result:
```bash
euler.ethz.ch | SUCCESS => {
    "ansible_facts": {
        "discovered_interpreter_python": "/usr/bin/python"
    },
    "changed": false,
    "ping": "pong"
}
login.leonhard.ethz.ch | SUCCESS => {
    "ansible_facts": {
        "discovered_interpreter_python": "/usr/bin/python"
    },
    "changed": false,
    "ping": "pong"
}
```



### Scheduling Jobs Using Playbooks

At first have a look at the official documentation:
(https://docs.ansible.com/ansible/latest/user_guide/playbooks_intro.html)

Example playbook (`ansible/queue_jobs.yml`):
```yaml
---
- name: Schedule Experiments
  hosts: euler
  vars:
    - project_dir: "{{ ansible_env.HOME }}/"
  tasks:
    - name: Sync
      synchronize:
        src: /home/jonfrey/ASL_leonhard_euler
        dest: "{{ project_dir }}"

    - name: Load experiments
      include_vars:
        file: /home/jonfrey/ASL_leonhard_euler/ansible/experiments.yml
        name: experiments

    - name: Schedule all experiments
      shell: >
        bsub -n 1 -W 0:10 -R "rusage[mem=5000,ngpus_excl_p=2]" -R "select[gpu_mtotal0>=10000]" -R "rusage[scratch=1000]" $HOME/ASL_leonhard_euler/scripts/submit.sh --exp={{ item.exp }}
      loop: "{{ experiments.jobs }}"

```
Playbook explanation:
1. Specify the execution host:
   `hosts: euler`
   The available hosts can be found in the previously set up `/etc/ansible/hosts` file.
2. Synchronize your local code with the cluster:
   `synchronize`
   You can modify the `dest` and `src` paths as needed.
   It's also possible to use `rsync` here instead.
3. Load the experiments:
   Loads `ansible/experiments.yml`, where the paths to the experiment files are listed.
   Each of the entries in the `jobs` list will be handled separately.
   We will loop over the jobs list in the next task.
4. Scheduling:
   Schedules each job with the `bsub` command and sets the correct exp-file path for each experiment.
   The `scripts/submit.sh` file loads the correct modules and starts `main.py` with the template conda environment.
   The arguments passed to the script (`--exp=`) are forwarded to main.py.
   With the loop keyword, ansible knows it is supposed to loop over the list:
   `loop: "{{ experiments.jobs }}"`



Command:
```bash
sudo ansible-playbook ansible/queue_jobs.yml
```

Result:

```bash
PLAY [Schedule Experiments] *********************************************************************************

TASK [Gathering Facts] **************************************************************************************
ok: [euler.ethz.ch]

TASK [Sync] *************************************************************************************************
changed: [euler.ethz.ch]

TASK [Load experiments] *************************************************************************************
ok: [euler.ethz.ch]

TASK [Schedule all experiments] ******************************************************************************
changed: [euler.ethz.ch] => (item={u'exp': u'/home/jonfrey/ASL_leonhard_euler/cfg/exp/exp.yml'})
changed: [euler.ethz.ch] => (item={u'exp': u'/home/jonfrey/ASL_leonhard_euler/cfg/exp/exp.yml'})

PLAY RECAP **************************************************************************************************
euler.ethz.ch : ok=4 changed=2 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0
```

```
(base) [jonfrey@eu-login-11 ~]$ bjobs
JOBID      USER    STAT  QUEUE      FROM_HOST   EXEC_HOST   JOB_NAME   SUBMIT_TIME
165381072  jonfrey PEND  gpu.4h     eu-login-21             *p/exp.yml Mar 15 07:00
165381081  jonfrey PEND  gpu.4h     eu-login-21             *p/exp.yml Mar 15 07:00
```


```
TODO:
- git clone instructions and repository overview
- Workflow diagram ansible
- Add my paramiko script
- Extend ansible to use -o correctly
- neptune.ai

```

--------------------------------------------------------------------------------