├── .github
│   └── workflows
│       └── python-publish.yml
├── .gitignore
├── LICENSE
├── README.md
├── cw2
│   ├── __init__.py
│   ├── alternative_schedulers.py
│   ├── cli_parser.py
│   ├── cluster_work.py
│   ├── cw_config
│   │   ├── __init__.py
│   │   ├── conf_io.py
│   │   ├── conf_path.py
│   │   ├── conf_resolver.py
│   │   ├── conf_unfolder.py
│   │   ├── cw_conf_keys.py
│   │   └── cw_config.py
│   ├── cw_data
│   │   ├── __init__.py
│   │   ├── cw_loading.py
│   │   ├── cw_logging.py
│   │   ├── cw_pd_logger.py
│   │   └── cw_wandb_logger.py
│   ├── cw_error.py
│   ├── cw_slurm
│   │   ├── __init__.py
│   │   ├── cw_slurm.py
│   │   └── cw_slurm_keys.py
│   ├── default_sbatch.sh
│   ├── experiment.py
│   ├── job.py
│   ├── scheduler.py
│   └── util.py
├── doc
│   ├── 01_quickstart.md
│   ├── 02_experiment.md
│   ├── 03_config.md
│   ├── 04_slurm.md
│   ├── 05_files.md
│   ├── 06_code_copy.md
│   ├── 07_logging.md
│   ├── 08_loading.md
│   ├── 09_advanced.md
│   ├── 10_advanced_gpu.md
│   ├── 11_cli_args.md
│   └── README.md
├── polynom_tutorial
│   ├── external_conf.yml
│   ├── polynom_config.yml
│   ├── polynom_load.py
│   └── polynom_main.py
├── pyproject.toml
├── setup.py
├── templates
│   ├── abstract_config.yml
│   ├── abstract_main.py
│   ├── iterative_config.yml
│   ├── iterative_main.py
│   └── sbatch_template.sh
└── test
    ├── horeka_scheduler_test
    │   ├── __init__.py
    │   ├── horeka_config.yml
    │   └── test_experiment.py
    └── test_cw_config.py

/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
1 | # This workflow will upload a Python Package using Twine when a release is created
2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
3 |
4 | # This workflow uses actions that are not certified by GitHub.
5 | # They are provided by a third-party and are governed by
6 | # separate terms of service, privacy policy, and support
7 | # documentation.
8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | deploy: 20 | 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - uses: actions/checkout@v3 25 | - name: Set up Python 26 | uses: actions/setup-python@v3 27 | with: 28 | python-version: '3.x' 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install build 33 | - name: Build package 34 | run: python -m build 35 | - name: Publish package 36 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 37 | with: 38 | user: __token__ 39 | password: ${{ secrets.PYPI_API_TOKEN }} 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # File created using '.gitignore Generator' for Visual Studio Code: https://bit.ly/vscode-gig 2 | 3 | # Created by https://www.gitignore.io/api/windows,visualstudiocode,linux,python 4 | # Edit at https://www.gitignore.io/?templates=windows,visualstudiocode,linux,python 5 | 6 | ### Linux ### 7 | *~ 8 | 9 | # temporary files which can be created if a process still has a handle open of a deleted file 10 | .fuse_hidden* 11 | 12 | # KDE directory preferences 13 | .directory 14 | 15 | # Linux trash folder which might appear on any partition or disk 16 | .Trash-* 17 | 18 | # .nfs files are created when an open file is removed but is still being accessed 19 | .nfs* 20 | 21 | ### Python ### 22 | # Byte-compiled / optimized / DLL files 23 | __pycache__/ 24 | *.py[cod] 25 | *$py.class 26 | 27 | # C extensions 28 | *.so 29 | 30 | # Distribution / packaging 31 | .Python 32 | build/ 33 | develop-eggs/ 34 | dist/ 35 | downloads/ 36 | eggs/ 37 | .eggs/ 38 | lib/ 39 | lib64/ 40 | parts/ 41 | sdist/ 42 | var/ 43 | wheels/ 44 | pip-wheel-metadata/ 45 | share/python-wheels/ 46 | *.egg-info/ 47 | .installed.cfg 48 | *.egg 49 | MANIFEST 50 | 51 | # PyInstaller 52 | # Usually these files are written by a python script from a template 53 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 54 | *.manifest 55 | *.spec 56 | 57 | # Installer logs 58 | pip-log.txt 59 | pip-delete-this-directory.txt 60 | 61 | # Unit test / coverage reports 62 | htmlcov/ 63 | .tox/ 64 | .nox/ 65 | .coverage 66 | .coverage.* 67 | .cache 68 | nosetests.xml 69 | coverage.xml 70 | *.cover 71 | .hypothesis/ 72 | .pytest_cache/ 73 | 74 | # Translations 75 | *.mo 76 | *.pot 77 | 78 | # Scrapy stuff: 79 | .scrapy 80 | 81 | # Sphinx documentation 82 | docs/_build/ 83 | 84 | # PyBuilder 85 | target/ 86 | 87 | # pyenv 88 | .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 
95 | #Pipfile.lock 96 | 97 | # celery beat schedule file 98 | celerybeat-schedule 99 | 100 | # SageMath parsed files 101 | *.sage.py 102 | 103 | # Spyder project settings 104 | .spyderproject 105 | .spyproject 106 | 107 | # Rope project settings 108 | .ropeproject 109 | 110 | # Mr Developer 111 | .mr.developer.cfg 112 | .project 113 | .pydevproject 114 | 115 | # mkdocs documentation 116 | /site 117 | 118 | # mypy 119 | .mypy_cache/ 120 | .dmypy.json 121 | dmypy.json 122 | 123 | # Pyre type checker 124 | .pyre/ 125 | 126 | ### VisualStudioCode ### 127 | .vscode/* 128 | 129 | ### VisualStudioCode Patch ### 130 | # Ignore all local history of files 131 | .history 132 | 133 | ### Windows ### 134 | # Windows thumbnail cache files 135 | Thumbs.db 136 | Thumbs.db:encryptable 137 | ehthumbs.db 138 | ehthumbs_vista.db 139 | 140 | # Dump file 141 | *.stackdump 142 | 143 | # Folder config file 144 | [Dd]esktop.ini 145 | 146 | # Recycle Bin used on file shares 147 | $RECYCLE.BIN/ 148 | 149 | # Windows Installer files 150 | *.cab 151 | *.msi 152 | *.msix 153 | *.msm 154 | *.msp 155 | 156 | # Windows shortcuts 157 | *.lnk 158 | 159 | # End of https://www.gitignore.io/api/windows,visualstudiocode,linux,python 160 | 161 | # Custom rules (everything added below won't be overriden by 'Generate .gitignore File' if you use 'Update' option) 162 | 163 | exp_output 164 | polynom_tutorial/log 165 | 166 | # ignore ide files 167 | .idea 168 | venv 169 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Karlsruhe Institute of Technology (KIT) - Autonomous Learning Robots Lab (ALR) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # cw2 - ClusterWork 2 2 | 3 | [![Upload Python Package](https://github.com/ALRhub/cw2/actions/workflows/python-publish.yml/badge.svg)](https://github.com/ALRhub/cw2/actions/workflows/python-publish.yml) 4 | 5 | ClusterWork 2 is a python framework to manage experiments using YAML config files. It also enables users to easily deploy multiple experiments using different configurations on computing clusters, which support the [slurm workload manager](https://slurm.schedmd.com/documentation.html). 
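At a glance, a cw2 program couples a small Python entry point with a YAML config file. The sketch below is illustrative only: the file names (`minimal_main.py`, `minimal_config.yml`), the class `MyExperiment`, and all parameter values are hypothetical placeholders, and the authoritative `AbstractExperiment` interface is the one defined in `cw2/experiment.py` and described in [doc/02_experiment.md](doc/02_experiment.md).

```python
# minimal_main.py -- a minimal sketch, not a verbatim copy of the shipped templates
from cw2 import cluster_work, experiment
from cw2.cw_data import cw_logging


class MyExperiment(experiment.AbstractExperiment):
    """Placeholder experiment; replace the method bodies with your own code."""

    def initialize(self, cw_config: dict, rep: int, logger: cw_logging.LoggerArray) -> None:
        pass  # set up model / environment for repetition `rep`

    def run(self, cw_config: dict, rep: int, logger: cw_logging.LoggerArray) -> None:
        params = cw_config["params"]   # expanded grid/list parameters end up here
        logger.process({"loss": 0.0})  # hand results to all attached loggers

    def finalize(self, surrender=None, crash: bool = False):
        pass  # clean up, e.g. close files


if __name__ == "__main__":
    cw = cluster_work.ClusterWork(MyExperiment)
    cw.add_logger(cw_logging.Printer())  # attach any additional loggers here
    cw.run()
```

The matching config is a multi-document YAML file: an optional `DEFAULT` document holds shared settings, and each named experiment may declare a `grid` (cross product) or `list` (zipped) block that cw2 expands into separate runs. The keys below are cw2 config keys, but the values are made up for illustration:

```yaml
# minimal_config.yml -- illustrative values only
---
name: "DEFAULT"
path: "./exp_output"   # root folder for results; cw2 adds per-experiment and per-repetition subfolders
repetitions: 2

---
name: "my_experiment"
params:
  lr: 0.001
grid:
  batch_size: [32, 64]  # one run per value, combined with any other grid keys
```

Launched as shown under Program Execution below, cw2 unfolds the grid and repetitions into individual runs; adding `-s` submits the same jobs to SLURM instead of running them locally.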
6 | 7 | ## Installation 8 | ```bash 9 | pip install cw2 10 | ``` 11 | 12 | ## Quickstart 13 | Please refer to the [Quickstart Guide](doc/01_quickstart.md). 14 | 15 | ## Program Execution 16 | To start an experiment locally, e.g. for testing: 17 | ```bash 18 | python3 YOUR_MAIN.py YOUR_CONFIG.yml 19 | ``` 20 | 21 | To start an experiment on a slurm cluster: 22 | ```bash 23 | python3 YOUR_MAIN.py YOUR_CONFIG.yml -s 24 | ``` 25 | 26 | -------------------------------------------------------------------------------- /cw2/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ALRhub/cw2/7a7b8a235731e8576e1616a46a61f442cd616cd3/cw2/__init__.py -------------------------------------------------------------------------------- /cw2/alternative_schedulers.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import concurrent.futures as con 3 | import multiprocessing 4 | import os 5 | import queue 6 | from typing import List 7 | 8 | from joblib import Parallel, delayed 9 | 10 | from cw2 import cw_error, job 11 | from cw2.cw_config import cw_config 12 | from cw2.cw_slurm import cw_slurm 13 | from cw2.scheduler import GPUDistributingLocalScheduler 14 | 15 | 16 | class StarmapGPUDistributingLocalScheduler(GPUDistributingLocalScheduler): 17 | def run(self, overwrite: bool = False): 18 | print("Using StarmapGPUDistributingLocalScheduler") 19 | num_parallel = self.joblist[0].n_parallel 20 | for j in self.joblist: 21 | assert ( 22 | j.n_parallel == num_parallel 23 | ), "All jobs in list must have same n_parallel" 24 | assert j.n_parallel == self._queue_elements, ( 25 | "Mismatch between GPUs Queue Elements and Jobs executed in" 26 | "parallel. Fix for optimal resource usage!!" 27 | ) 28 | 29 | with multiprocessing.Pool(processes=num_parallel) as pool: 30 | # setup gpu resource queue 31 | m = multiprocessing.Manager() 32 | gpu_queue = m.Queue(maxsize=self._queue_elements) 33 | for i in range(self._queue_elements): 34 | gpu_queue.put(i) 35 | 36 | for j in self.joblist: 37 | args = [ 38 | (j, c, gpu_queue, self._gpus_per_rep, overwrite) for c in j.tasks 39 | ] 40 | pool.starmap_async( 41 | StarmapGPUDistributingLocalScheduler._execute_task, args 42 | ) 43 | pool.close() 44 | pool.join() 45 | 46 | @staticmethod 47 | def _execute_task( 48 | j: job.Job, 49 | c: dict, 50 | q: multiprocessing.Queue, 51 | gpus_per_job: int, 52 | overwrite: bool = False, 53 | ): 54 | gpu_idx = q.get() 55 | s = ("{}," * gpus_per_job).format( 56 | *[gpu_idx * gpus_per_job + i for i in range(gpus_per_job)] 57 | )[:-1] 58 | try: 59 | os.environ["CUDA_VISIBLE_DEVICES"] = s 60 | j.run_task(c, overwrite) 61 | except cw_error.ExperimentSurrender as _: 62 | return 63 | finally: 64 | q.put(gpu_idx) 65 | 66 | 67 | class ConcurrentGPUDistributingLocalScheduler(GPUDistributingLocalScheduler): 68 | def run(self, overwrite: bool = False): 69 | print("Using ConcurrentGPUDistributingLocalScheduler") 70 | num_parallel = self.joblist[0].n_parallel 71 | for j in self.joblist: 72 | assert ( 73 | j.n_parallel == num_parallel 74 | ), "All jobs in list must have same n_parallel" 75 | assert j.n_parallel == self._queue_elements, ( 76 | "Mismatch between GPUs Queue Elements and Jobs executed in" 77 | "parallel. Fix for optimal resource usage!!" 
78 | ) 79 | 80 | with con.ProcessPoolExecutor(max_workers=num_parallel) as pool: 81 | # setup gpu resource queue 82 | # gpu_queue = queue.Queue(maxsize=self._queue_elements) 83 | # for i in range(self._queue_elements): 84 | # gpu_queue.put(i) 85 | 86 | results = [] 87 | for j in self.joblist: 88 | for i, c in enumerate(j.tasks): 89 | results.append( 90 | pool.submit( 91 | ConcurrentGPUDistributingLocalScheduler._execute_task, 92 | j, 93 | c, 94 | i, 95 | self._gpus_per_rep, 96 | overwrite, 97 | ) 98 | ) 99 | for r in results: 100 | r.result() 101 | 102 | @staticmethod 103 | def _execute_task( 104 | j: job.Job, 105 | c: dict, 106 | idx: int, 107 | # q: multiprocessing.Queue, 108 | gpus_per_job: int, 109 | overwrite: bool = False, 110 | ): 111 | # gpu_idx = q.get() 112 | s = ("{}," * gpus_per_job).format( 113 | *[idx * gpus_per_job + i for i in range(gpus_per_job)] 114 | )[:-1] 115 | try: 116 | os.environ["CUDA_VISIBLE_DEVICES"] = s 117 | j.run_task(c, overwrite) 118 | except cw_error.ExperimentSurrender as _: 119 | return 120 | 121 | # finally: 122 | # q.put(gpu_idx) 123 | 124 | 125 | class JoblibGPUDistributingLocalScheduler(GPUDistributingLocalScheduler): 126 | def run(self, overwrite: bool = False): 127 | print("Using JoblibGPUDistributingLocalScheduler") 128 | for j in self.joblist: 129 | Parallel(n_jobs=j.n_parallel)( 130 | delayed(self.execute_task)(j, c, i, self._gpus_per_rep, overwrite) 131 | for i, c in enumerate(j.tasks) 132 | ) 133 | 134 | def execute_task( 135 | self, j: job.Job, c: dict, idx: int, gpus_per_job: int, overwrite: bool = False 136 | ): 137 | s = ("{}," * gpus_per_job).format( 138 | *[idx * gpus_per_job + i for i in range(gpus_per_job)] 139 | )[:-1] 140 | try: 141 | os.environ["CUDA_VISIBLE_DEVICES"] = s 142 | j.run_task(c, overwrite) 143 | except cw_error.ExperimentSurrender as _: 144 | return 145 | 146 | 147 | class RayGPUDistributingLocalScheduler(GPUDistributingLocalScheduler): 148 | def run(self, overwrite: bool = False): 149 | print("Using RayGPUDistributingLocalScheduler") 150 | 151 | import ray 152 | from ray.util.queue import Queue 153 | 154 | @ray.remote 155 | def _execute_task( 156 | j: job.Job, c: dict, q, gpus_per_job: int, overwrite: bool = False 157 | ): 158 | gpu_idx = q.get() 159 | print("I got gpu idx", gpu_idx) 160 | s = ("{}," * gpus_per_job).format( 161 | *[gpu_idx * gpus_per_job + i for i in range(gpus_per_job)] 162 | )[:-1] 163 | try: 164 | os.environ["CUDA_VISIBLE_DEVICES"] = s 165 | j.run_task(c, overwrite) 166 | except cw_error.ExperimentSurrender as _: 167 | return 168 | finally: 169 | print("giving back gpu idx", gpu_idx) 170 | q.put(gpu_idx) 171 | 172 | ray.init() 173 | num_parallel = self.joblist[0].n_parallel 174 | for j in self.joblist: 175 | assert ( 176 | j.n_parallel == num_parallel 177 | ), "All jobs in list must have same n_parallel" 178 | assert j.n_parallel == self._queue_elements, ( 179 | "Mismatch between GPUs Queue Elements and Jobs executed in" 180 | "parallel. Fix for optimal resource usage!!" 
181 | ) 182 | gpu_queue = Queue(maxsize=self._queue_elements) 183 | 184 | for i in range(self._queue_elements): 185 | gpu_queue.put(i) 186 | results = [] 187 | for j in self.joblist: 188 | for i, c in enumerate(j.tasks): 189 | results.append( 190 | _execute_task.remote(j, c, gpu_queue, self._gpus_per_rep, overwrite) 191 | ) 192 | ray.get(results) 193 | -------------------------------------------------------------------------------- /cw2/cli_parser.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | 4 | class Arguments: 5 | def __init__(self): 6 | p = argparse.ArgumentParser() 7 | p.add_argument("config", metavar="CONFIG.yml") 8 | p.add_argument( 9 | "-j", 10 | "--job", 11 | type=int, 12 | default=None, 13 | help="Run only the specified job. CAVEAT: Should only be used with slurm arrays.", 14 | ) 15 | 16 | # XXX: Disable delete for now 17 | # p.add_argument('-d', '--delete', action='store_true', 18 | # help='CAUTION deletes results of previous runs.') 19 | 20 | p.add_argument( 21 | "-e", 22 | "--experiments", 23 | nargs="+", 24 | default=None, 25 | help="Allows to specify which experiments should be run.", 26 | ) 27 | p.add_argument( 28 | "-s", 29 | "--slurm", 30 | action="store_true", 31 | help="Run using SLURM Workload Manager.", 32 | ) 33 | p.add_argument( 34 | "-o", "--overwrite", action="store_true", help="Overwrite existing results." 35 | ) 36 | p.add_argument( 37 | "-t", 38 | "--prefix-with-timestamp", 39 | dest="prefix_with_timestamp", 40 | action="store_true", 41 | default=False, 42 | help="If specified, prefix all started experiment runs with this timestamp. " 43 | "This can help with telling runs apart from one another. but will also modify the log " 44 | "directiories created. CAUTION: Only works with local schedulers (no SLURM etc.)", 45 | ) 46 | p.add_argument("--nocodecopy", action="store_true", help="Skip code copy.") 47 | p.add_argument( 48 | "--zip", action="store_true", help="Make a Zip Copy of the Code." 49 | ) 50 | p.add_argument( 51 | "--skipsizecheck", 52 | action="store_true", 53 | help="Skip check if code copy src < 200MByte", 54 | ) 55 | p.add_argument( 56 | "--multicopy", 57 | action="store_true", 58 | help="Create a code copy for each job seperately", 59 | ) 60 | p.add_argument( 61 | "--noconsolelog", 62 | action="store_true", 63 | help="Disables writing internal console log files", 64 | ) 65 | p.add_argument( 66 | "--debug", action="store_true", default=False, help="Enable debug mode." 67 | ) 68 | p.add_argument( 69 | "--debugall", 70 | action="store_true", 71 | default=False, 72 | help="Enable debug mode for arguments.", 73 | ) 74 | 75 | self.args = p.parse_args(namespace=self) 76 | if self.args.slurm and self.args.prefix_with_timestamp: 77 | raise ValueError( 78 | "Timestep prefixing (-t) only work on local schedulers, " 79 | "so cannot use args --slurm (-s) and --prefix-with-timestamp (-t) at the same time." 
80 | ) 81 | 82 | def get(self) -> dict: 83 | return vars(self.args) 84 | -------------------------------------------------------------------------------- /cw2/cluster_work.py: -------------------------------------------------------------------------------- 1 | from typing import List, Type 2 | 3 | from cw2 import cli_parser, experiment, job, scheduler 4 | from cw2.cw_config import cw_config 5 | from cw2.cw_data import cw_loading, cw_logging 6 | 7 | 8 | class ClusterWork: 9 | def __init__(self, exp_cls: Type[experiment.AbstractExperiment] = None): 10 | self.args = cli_parser.Arguments().get() 11 | self.exp_cls = exp_cls 12 | self.config = cw_config.Config( 13 | self.args["config"], 14 | self.args["experiments"], 15 | self.args["debug"], 16 | self.args["debugall"], 17 | self.args["prefix_with_timestamp"] 18 | ) 19 | 20 | self.logArray = cw_logging.LoggerArray() 21 | 22 | if not self.args["noconsolelog"]: 23 | self.add_logger(cw_logging.PythonLogger()) 24 | self.joblist = None 25 | 26 | def add_logger(self, logger: cw_logging.AbstractLogger) -> None: 27 | """add a logger to the ClusterWork pipeline 28 | 29 | Args: 30 | logger (cw_logging.AbstractLogger): logger object to be called during execution 31 | """ 32 | self.logArray.add(logger) 33 | 34 | def _get_jobs( 35 | self, delete: bool = False, root_dir: str = "", read_only: bool = False 36 | ) -> List[job.Job]: 37 | """private method. creates and returns all configured jobs. 38 | 39 | Args: 40 | delete (bool, optional): delete all old data inside the job directories. Defaults to False. 41 | root_dir (str, optional): [description]. Defaults to "". 42 | 43 | Returns: 44 | List[job.Job]: list of all configured job objects 45 | """ 46 | if self.joblist is None: 47 | factory = job.JobFactory( 48 | self.exp_cls, self.logArray, delete, root_dir, read_only 49 | ) 50 | self.joblist = factory.create_jobs(self.config.exp_configs) 51 | return self.joblist 52 | 53 | def run(self, root_dir: str = "", sch: scheduler.AbstractScheduler = None): 54 | """Run ClusterWork computations. 55 | 56 | Args: 57 | root_dir (str, optional): [description]. Defaults to "". 58 | """ 59 | if self.exp_cls is None: 60 | raise NotImplementedError( 61 | "Cannot run with missing experiment.AbstractExperiment Implementation." 62 | ) 63 | 64 | self.config.to_yaml(relpath=True) 65 | 66 | args = self.args 67 | 68 | # Handle SLURM execution 69 | if args["slurm"]: 70 | s = scheduler.SlurmScheduler(self.config) 71 | else: 72 | # Do Local execution 73 | if sch is None: 74 | if scheduler.GPUDistributingLocalScheduler.use_distributed_gpu_scheduling( 75 | self.config 76 | ): 77 | scheduler_cls = scheduler.get_gpu_scheduler_cls( 78 | self.config.slurm_config.get("scheduler", "mp") 79 | ) 80 | s = scheduler_cls(self.config) 81 | 82 | elif scheduler.CpuDistributingLocalScheduler.use_distributed_cpu_scheduling( 83 | self.config 84 | ): 85 | s = scheduler.CpuDistributingLocalScheduler(self.config) 86 | 87 | else: 88 | s = scheduler.LocalScheduler() 89 | else: 90 | s = sch 91 | 92 | self._run_scheduler(s, root_dir) 93 | 94 | def load(self, root_dir: str = ""): 95 | """Loads all saved information. 96 | 97 | Args: 98 | root_dir (str, optional): [description]. Defaults to "". 99 | 100 | Returns: 101 | pd.DataFrame: saved data in Dataframe form. 
102 | """ 103 | 104 | loader = cw_loading.Loader() 105 | 106 | return self._run_scheduler(loader, root_dir, True) 107 | 108 | def _run_scheduler( 109 | self, 110 | s: scheduler.AbstractScheduler, 111 | root_dir: str = "", 112 | read_only: bool = False, 113 | ): 114 | if self.logArray.is_empty(): 115 | cw_logging.getLogger().warning("No Logger has been added. Are you sure?") 116 | 117 | args = self.args 118 | job_list = self._get_jobs(False, root_dir, read_only) 119 | 120 | if args["job"] is not None: 121 | job_list = [job_list[args["job"]]] 122 | 123 | s.assign(job_list) 124 | return s.run(overwrite=args["overwrite"]) 125 | -------------------------------------------------------------------------------- /cw2/cw_config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ALRhub/cw2/7a7b8a235731e8576e1616a46a61f442cd616cd3/cw2/cw_config/__init__.py -------------------------------------------------------------------------------- /cw2/cw_config/conf_io.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import List, Tuple 3 | 4 | import yaml 5 | 6 | from cw2.cw_config import cw_conf_keys as KEY 7 | from cw2.cw_error import ExperimentNotFoundError, MissingConfigError 8 | 9 | 10 | def get_configs( 11 | config_path: str, experiment_selections: List[str] 12 | ) -> Tuple[dict, dict, List[dict]]: 13 | """reads and seperates the experiment configs from a yaml file 14 | 15 | Args: 16 | config_path (str): path to the yaml file 17 | experiment_selections (List[str]): a list of selected experiment names 18 | 19 | Returns: 20 | Tuple[dict, dict, List[dict]]: SLURM, DEFAULT, Experiment Configurations 21 | """ 22 | all_configs = read_yaml(config_path) 23 | return separate_configs(all_configs, experiment_selections) 24 | 25 | 26 | def read_yaml(config_path: str) -> List[dict]: 27 | """reads a YAML configuration file containing potentially multiple experiments 28 | 29 | Arguments: 30 | config_path {str}: path to the YAML config file 31 | 32 | Returns: 33 | List[dict]: all configs found in the yaml file 34 | """ 35 | if not os.path.exists(config_path): 36 | raise MissingConfigError("Could not find {}".format(config_path)) 37 | 38 | all_configs = [] 39 | 40 | with open(config_path, "r") as f: 41 | for exp_conf in yaml.load_all(f, yaml.FullLoader): 42 | if exp_conf is not None: 43 | all_configs.append(exp_conf) 44 | return all_configs 45 | 46 | 47 | def separate_configs( 48 | all_configs: List[dict], experiment_selections: List[str], suppress: bool = False 49 | ) -> Tuple[List[dict], dict, List[dict]]: 50 | """separates the list of individual configs into the 'special' SLURM, DEFAULT and normal experiment configs 51 | 52 | Arguments: 53 | all_configs (List[dict]): a list of all configurations 54 | experiment_selections (List[str], optional): List of specific experiments to run. If None runs all. Defaults to None. 
55 | 56 | Returns: 57 | Tuple[dict, dict, List[dict]]: SLURM, DEFAULT, Experiment Configurations, in this order 58 | """ 59 | default_config = None 60 | slurm_config = [] 61 | experiment_configs = [] 62 | 63 | for c in all_configs: 64 | name = c[KEY.NAME] 65 | 66 | if KEY.SLURM in name.lower(): 67 | slurm_config.append(c) 68 | elif name.lower() == KEY.DEFAULT: 69 | default_config = c 70 | else: 71 | if experiment_selections is None or name in experiment_selections: 72 | experiment_configs.append(c) 73 | 74 | if not suppress and len(experiment_configs) == 0: 75 | raise ExperimentNotFoundError("No selected experiment found in config file.") 76 | 77 | return slurm_config, default_config, experiment_configs 78 | 79 | 80 | def write_yaml(fpath, data): 81 | """write a yaml file 82 | 83 | Args: 84 | fpath : path 85 | data : payload 86 | """ 87 | os.makedirs(os.path.dirname(fpath), exist_ok=True) 88 | with open(fpath, "w") as f: 89 | yaml.dump_all(data, f, default_flow_style=False) 90 | -------------------------------------------------------------------------------- /cw2/cw_config/conf_path.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Any, Dict, List 3 | 4 | from cw2.cw_config import cw_conf_keys as KEY 5 | 6 | 7 | def normalize_expanded_paths( 8 | expanded_config_list: List[Dict[str, Any]] 9 | ) -> List[Dict[str, Any]]: 10 | """normalizes path key after expansion operation 11 | 12 | Args: 13 | expanded_config_list (List[Dict[str, Any]]): list fo expanded experiment configs 14 | 15 | Returns: 16 | List[Dict[str, Any]]: noramlized expanded experiment configs 17 | """ 18 | # Set Path and LogPath Args depending on the name 19 | for _config in expanded_config_list: 20 | _config[KEY.PATH] = os.path.join( 21 | _config[KEY.i_BASIC_PATH], _config[KEY.i_NEST_DIR], _config[KEY.i_EXP_NAME] 22 | ) 23 | _config[KEY.LOG_PATH] = os.path.join(_config[KEY.PATH], "log") 24 | return expanded_config_list 25 | 26 | 27 | def make_rel_paths(config: Dict[str, Any], base_path: str) -> Dict[str, Any]: 28 | """converts relevant paths of the config into relative paths 29 | 30 | Args: 31 | config (Dict[str, Any]): experiment config 32 | base_path (str): base path 33 | 34 | Returns: 35 | Dict[str, Any]: experiment config with paths relative to base_path 36 | """ 37 | c = config.copy() 38 | _basic_path = base_path 39 | c[KEY.LOG_PATH] = os.path.join(".", os.path.relpath(c[KEY.LOG_PATH], _basic_path)) 40 | c[KEY.i_REP_LOG_PATH] = os.path.join( 41 | ".", os.path.relpath(c[KEY.i_REP_LOG_PATH], _basic_path) 42 | ) 43 | c[KEY.PATH] = os.path.join(".", os.path.relpath(c[KEY.PATH], _basic_path)) 44 | c[KEY.i_BASIC_PATH] = os.path.join( 45 | ".", os.path.relpath(c[KEY.i_BASIC_PATH], _basic_path) 46 | ) 47 | return c 48 | -------------------------------------------------------------------------------- /cw2/cw_config/conf_resolver.py: -------------------------------------------------------------------------------- 1 | import os 2 | from copy import deepcopy 3 | from typing import List 4 | 5 | from cw2 import util 6 | from cw2.cw_config import conf_io 7 | from cw2.cw_config import cw_conf_keys as KEY 8 | from cw2.cw_error import ConfigKeyError, MissingConfigError 9 | 10 | 11 | def resolve_dependencies( 12 | default_config: dict, experiment_configs: List[dict], conf_path: str 13 | ) -> List[dict]: 14 | """resolves all internal (DEFAULT) and external (import) dependencies 15 | 16 | Args: 17 | default_config (dict): DEFAULT exp configuration 18 | experiment_configs 
(List[dict]): list of experiment configurations 19 | conf_path (str): path of the "calling" config file 20 | 21 | Returns: 22 | List[dict]: list of experiment configurations without unresolved dependencies 23 | """ 24 | experiment_configs = merge_default(default_config, experiment_configs) 25 | 26 | abs_path = os.path.abspath(conf_path) 27 | experiment_configs = import_external_yml(experiment_configs, abs_path) 28 | return experiment_configs 29 | 30 | 31 | def merge_default(default_config: dict, experiment_configs: List[dict]) -> List[dict]: 32 | """merges each individual experiment configuration with the default parameters 33 | 34 | Arguments: 35 | default_config {dict} -- default configuration parameters 36 | experiment_configs {List[dict]} -- a list of individual experiment configurations 37 | 38 | Returns: 39 | List[dict] -- a list of all experiment configurations 40 | """ 41 | if default_config is None: 42 | return experiment_configs 43 | 44 | expanded_exp_configs = [] 45 | for c in experiment_configs: 46 | merge_c = deepcopy(default_config) 47 | merge_c = util.deep_update(merge_c, c) 48 | expanded_exp_configs.append(merge_c) 49 | return expanded_exp_configs 50 | 51 | 52 | def import_external_yml( 53 | experiment_configs: List[dict], abs_path: str, traversal_dict: dict = None 54 | ) -> List[dict]: 55 | """recursively imports external yaml files 56 | The external yaml files are first merged with their own DEFAULT configuration, 57 | then their external dependencies get resolved. 58 | 59 | Args: 60 | experiment_configs (List[dict]): list of experiment configurations 61 | abs_path (str): Absolute file path of the YAML file which gets resolved.. 62 | traversal_dict (dict, optional): Dictionary(abs_path, exp_name) Serves as a failsafe to detect cyclic imports. 63 | Defaults to None. 64 | 65 | Raises: 66 | ConfigKeyError: if a cyclic import is attempted 67 | MissingConfigError: if the linked config cannot be found 68 | 69 | Returns: 70 | List[dict]: a list of resolved experiment configurations. 
71 | """ 72 | 73 | if traversal_dict is None: 74 | traversal_dict = {abs_path: []} 75 | 76 | resolved_configs = [] 77 | for config in experiment_configs: 78 | # SKIP 79 | if KEY.IMPORT_PATH not in config and KEY.IMPORT_EXP not in config: 80 | resolved_configs.append(config) 81 | continue 82 | 83 | # Record current step 84 | traversal_dict[abs_path].append(config[KEY.NAME]) 85 | 86 | import_yml = abs_path 87 | if KEY.IMPORT_PATH in config: 88 | import_yml = config[KEY.IMPORT_PATH] 89 | 90 | # Get absolute Path for import 91 | import_yml = os.path.abspath( 92 | os.path.join(os.path.dirname(abs_path), import_yml) 93 | ) 94 | 95 | all_external_configs = conf_io.read_yaml(import_yml) 96 | 97 | ext_exp_name = KEY.DEFAULT 98 | if custom_import_exp(config): 99 | ext_exp_name = config[KEY.IMPORT_EXP] 100 | 101 | # Recursion Anchor: 102 | if import_yml in traversal_dict and ext_exp_name in traversal_dict[import_yml]: 103 | raise ConfigKeyError( 104 | "Cyclic YML import with {} : {}".format(import_yml, ext_exp_name) 105 | ) 106 | 107 | # Default Merge External 108 | _, external, ext_selection = conf_io.separate_configs( 109 | all_external_configs, [ext_exp_name], suppress=True 110 | ) 111 | 112 | if custom_import_exp(config): 113 | if len(ext_selection) == 0: 114 | raise MissingConfigError( 115 | "Could not import {} from {}".format(ext_exp_name, import_yml) 116 | ) 117 | 118 | external = merge_default(external, ext_selection)[0] 119 | 120 | # Register new Anchor 121 | if import_yml not in traversal_dict: 122 | traversal_dict[import_yml] = [] 123 | traversal_dict[import_yml].append(ext_exp_name) 124 | 125 | # Recursion call 126 | ext_resolved_conf = import_external_yml([external], import_yml, traversal_dict)[ 127 | 0 128 | ] 129 | 130 | # Delete Anchor when coming back 131 | del traversal_dict[import_yml] 132 | 133 | resolved_conf = merge_default(ext_resolved_conf, [config])[0] 134 | resolved_conf = archive_import_keys(resolved_conf) 135 | resolved_configs.append(resolved_conf) 136 | return resolved_configs 137 | 138 | 139 | def custom_import_exp(config: dict) -> bool: 140 | """check if the config uses a custom import_exp 141 | 142 | Args: 143 | config (dict): experiment configuration 144 | 145 | Returns: 146 | bool: True if a custom import_exp key is defined 147 | """ 148 | if KEY.IMPORT_EXP not in config: 149 | return False 150 | if config[KEY.IMPORT_EXP].lower() == KEY.DEFAULT: 151 | return False 152 | return True 153 | 154 | 155 | def archive_import_keys(config: dict) -> dict: 156 | """ 157 | Args: 158 | config (dict): experiment configuration 159 | 160 | 161 | Returns: 162 | dict: experiment configuration with archived import keys 163 | """ 164 | removal_keys = [KEY.IMPORT_PATH, KEY.IMPORT_EXP] 165 | replacement_keys = [KEY.i_IMPORT_PATH_ARCHIVE, KEY.i_IMPORT_EXP_ARCHIVE] 166 | 167 | for removal, replacement in zip(removal_keys, replacement_keys): 168 | if removal in config: 169 | config[replacement] = config[removal] 170 | del config[removal] 171 | return config 172 | -------------------------------------------------------------------------------- /cw2/cw_config/conf_unfolder.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import os 3 | from collections import deque 4 | from copy import deepcopy 5 | from typing import List 6 | 7 | from cw2 import util 8 | from cw2.cw_config import conf_path 9 | from cw2.cw_config import cw_conf_keys as KEY 10 | from cw2.cw_data import cw_logging 11 | 12 | 13 | def unfold_exps(exp_configs: List[dict], 
debug: bool, debug_all: bool) -> List[dict]: 14 | """unfolds a list of experiment configurations into the different 15 | hyperparameter runs and repetitions 16 | 17 | Args: 18 | exp_configs (List[dict]): list of experiment configurations 19 | 20 | Returns: 21 | List[dict]: list of unfolded experiment configurations 22 | """ 23 | param_expansion = expand_experiments(exp_configs, debug, debug_all) 24 | unrolled = unroll_exp_reps(param_expansion) 25 | return unrolled 26 | 27 | 28 | def expand_experiments( 29 | _experiment_configs: List[dict], debug: bool, debug_all: bool 30 | ) -> List[dict]: 31 | """Expand the experiment configuration with concrete parameter instantiations 32 | 33 | Arguments: 34 | experiment_configs {List[dict]} -- List with experiment configs 35 | 36 | Returns: 37 | List[dict] -- List of experiment configs, with set parameters 38 | """ 39 | 40 | # get all options that are iteratable and build all combinations (grid) or tuples (list) 41 | experiment_configs = deque(deepcopy(_experiment_configs)) 42 | if debug or debug_all: 43 | for ec in experiment_configs: 44 | ec[KEY.REPS] = ec["iterations"] = ec[KEY.REPS_PARALL] = ec[ 45 | KEY.REPS_P_JOB 46 | ] = 1 47 | 48 | expanded_config_list = [] 49 | 50 | while len(experiment_configs) > 0: 51 | config = experiment_configs.popleft() 52 | 53 | # Set Default Values 54 | # save path argument from YML for grid modification 55 | if KEY.i_BASIC_PATH not in config: 56 | config[KEY.i_BASIC_PATH] = config.get(KEY.PATH) 57 | # save name argument from YML for grid modification 58 | if KEY.i_EXP_NAME not in config: 59 | config[KEY.i_EXP_NAME] = config.get(KEY.NAME) 60 | # add empty string for parent DIR in case of grid 61 | if KEY.i_NEST_DIR not in config: 62 | config[KEY.i_NEST_DIR] = "" 63 | # set debug flag 64 | config[KEY.i_DEBUG_FLAG] = debug or debug_all 65 | 66 | expansion = None 67 | for key in config: 68 | if key.startswith(KEY.GRID): 69 | expansion = params_combine(config, key, itertools.product) 70 | break 71 | if key.startswith(KEY.LIST): 72 | expansion = params_combine(config, key, zip) 73 | break 74 | if key.startswith(KEY.ABLATIVE): 75 | expansion = ablative_expand(config, key) 76 | break 77 | 78 | if expansion is not None: 79 | if debug and not debug_all: 80 | expansion = expansion[:1] 81 | experiment_configs.extend(expansion) 82 | else: 83 | expanded_config_list.append(config) 84 | 85 | return conf_path.normalize_expanded_paths(expanded_config_list) 86 | 87 | 88 | def params_combine(config: dict, key: str, iter_func) -> List[dict]: 89 | """combines experiment parameter with its list/grid combinations 90 | 91 | Args: 92 | config (dict): an single experiment configuration 93 | key (str): the combination key, e.g. 
'list' or 'grid' 94 | iter_func: itertool-like function for creating the combinations 95 | 96 | Returns: 97 | List[dict]: list of parameter-combined experiments 98 | """ 99 | if iter_func is None: 100 | return [config] 101 | 102 | combined_configs = [] 103 | # convert list/grid dictionary into flat dictionary, where the key is a tuple of the keys and the 104 | # value is the list of values 105 | tuple_dict = util.flatten_dict_to_tuple_keys(config[key]) 106 | _param_names = [".".join(t) for t in tuple_dict] 107 | 108 | param_lengths = map(len, tuple_dict.values()) 109 | if key.startswith(KEY.LIST) and len(set(param_lengths)) != 1: 110 | cw_logging.getLogger().warning( 111 | f'experiment "{config[KEY.NAME]}" list params [{key}] are not of equal length.'.format() 112 | ) 113 | 114 | # create a new config for each parameter setting 115 | for values in iter_func(*tuple_dict.values()): 116 | _config = deepcopy(config) 117 | 118 | # Remove Grid/List Argument 119 | del _config[key] 120 | 121 | if KEY.PARAMS not in _config: 122 | _config[KEY.PARAMS] = {} 123 | 124 | # Expand Grid/List Parameters 125 | for i, t in enumerate(tuple_dict.keys()): 126 | util.insert_deep_dictionary(d=_config.get(KEY.PARAMS), t=t, value=values[i]) 127 | 128 | _config = extend_config_name(_config, _param_names, values) 129 | combined_configs.append(_config) 130 | return combined_configs 131 | 132 | 133 | def ablative_expand(config: dict, key: str): 134 | tuple_dict = util.flatten_dict_to_tuple_keys(config[key]) 135 | _param_names = [".".join(t) for t in tuple_dict] 136 | combined_configs = [] 137 | for i, t in enumerate(tuple_dict.keys()): 138 | for val in tuple_dict[t]: 139 | _config = deepcopy(config) 140 | 141 | # Remove Grid/List Argument 142 | del _config[key] 143 | 144 | if KEY.PARAMS not in _config: 145 | _config[KEY.PARAMS] = {} 146 | util.insert_deep_dictionary(d=_config.get(KEY.PARAMS), t=t, value=val) 147 | # TODO: TEST 148 | _config = extend_config_name(_config, [_param_names[i]], [val]) 149 | 150 | combined_configs.append(_config) 151 | return combined_configs 152 | 153 | 154 | def extend_config_name(config: dict, param_names: list, values: list) -> dict: 155 | """extend an experiment name with a shorthand derived from the parameters and their values 156 | 157 | Args: 158 | config (dict): experiment config 159 | param_names (list): list of parameter names 160 | values (list): list of parameter values 161 | 162 | Returns: 163 | dict: experiment config with extended name 164 | """ 165 | # Rename and append 166 | _converted_name = util.convert_param_names(param_names, values) 167 | 168 | # Use __ only once as a seperator 169 | sep = "__" 170 | if KEY.i_EXP_NAME in config and sep in config.get(KEY.i_EXP_NAME): 171 | sep = "_" 172 | 173 | config[KEY.i_EXP_NAME] = config.get(KEY.i_EXP_NAME) + sep + _converted_name 174 | config[KEY.i_NEST_DIR] = config.get(KEY.NAME) 175 | return config 176 | 177 | 178 | def unroll_exp_reps(exp_configs: List[dict]) -> List[dict]: 179 | """unrolls experiment repetitions into their own configuration object 180 | 181 | Args: 182 | exp_configs (List[dict]): List of experiment configurations 183 | 184 | Returns: 185 | List[dict]: List of unrolled experiment configurations 186 | """ 187 | unrolled_exps = [] 188 | 189 | for config in exp_configs: 190 | if KEY.i_REP_IDX in config: 191 | unrolled_exps.append(config) 192 | continue 193 | 194 | for r in range(config[KEY.REPS]): 195 | c = deepcopy(config) 196 | c[KEY.i_REP_IDX] = r 197 | c[KEY.i_REP_LOG_PATH] = os.path.join( 198 | 
c.get(KEY.LOG_PATH), "rep_{:02d}".format(r) 199 | ) 200 | unrolled_exps.append(c) 201 | return unrolled_exps 202 | -------------------------------------------------------------------------------- /cw2/cw_config/cw_conf_keys.py: -------------------------------------------------------------------------------- 1 | # SECTIONS 2 | SLURM = "slurm" 3 | DEFAULT = "default" 4 | 5 | # EXP KEYS 6 | NAME = "name" 7 | PATH = "path" 8 | LOG_PATH = "log_path" 9 | 10 | IMPORT_PATH = "import_path" 11 | IMPORT_EXP = "import_exp" 12 | 13 | # REPS 14 | REPS = "repetitions" 15 | REPS_PARALL = "reps_in_parallel" 16 | REPS_P_JOB = "reps_per_job" 17 | 18 | # EXP PARAMS 19 | PARAMS = "params" 20 | GRID = "grid" 21 | LIST = "list" 22 | ABLATIVE = "ablative" 23 | 24 | # INTERNAL 25 | i_BASIC_PATH = "_basic_path" 26 | i_EXP_NAME = "_experiment_name" 27 | i_NEST_DIR = "_nested_dir" 28 | i_DEBUG_FLAG = "_debug" 29 | # INTERNAL REP 30 | i_REP_IDX = "_rep_idx" 31 | i_REP_LOG_PATH = "_rep_log_path" 32 | 33 | # INTERNAL IMPORT ARCHIVE 34 | i_IMPORT_PATH_ARCHIVE = "_import_path_archive" 35 | i_IMPORT_EXP_ARCHIVE = "_import_exp_archive" 36 | 37 | # CPU CORES ASSIGNMENT 38 | i_CPU_CORES = "cpu_cores" 39 | -------------------------------------------------------------------------------- /cw2/cw_config/cw_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import socket 3 | from typing import List, Tuple 4 | from datetime import datetime 5 | 6 | import cw2.cw_config.cw_conf_keys as KEY 7 | from cw2.cw_config import conf_io, conf_path, conf_resolver, conf_unfolder 8 | 9 | 10 | class Config: 11 | def __init__( 12 | self, 13 | config_path: str = None, 14 | experiment_selections: List[str] = None, 15 | debug: bool = False, 16 | debug_all: bool = False, 17 | prefix_with_timestamp: bool = False 18 | ): 19 | self.slurm_config = None 20 | self.exp_configs = None 21 | 22 | self.f_name = None 23 | self.config_path = config_path 24 | self.exp_selections = experiment_selections 25 | 26 | self.prefix_with_timestamp = prefix_with_timestamp 27 | 28 | if config_path is not None: 29 | self.load_config(config_path, experiment_selections, debug, debug_all) 30 | 31 | def load_config( 32 | self, 33 | config_path: str, 34 | experiment_selections: List[str] = None, 35 | debug: bool = False, 36 | debug_all: bool = False, 37 | ) -> None: 38 | """Loads config from YAML file 39 | The config can include multiple experiments, DEFAULT paramters and a SLURM configuration 40 | 41 | Arguments: 42 | config_path {str} -- path to a YAML configuraton file 43 | experiment_selections (List[str], optional): List of specific experiments to run. If None runs all. Defaults to None. 
44 | """ 45 | 46 | self.config_path = config_path 47 | self.f_name = os.path.basename(config_path) 48 | 49 | self.exp_selections = experiment_selections 50 | 51 | slurm_configs, self.exp_configs = self._parse_configs( 52 | config_path, experiment_selections, debug, debug_all 53 | ) 54 | self.slurm_config = self._filter_slurm_configs(slurm_configs) 55 | 56 | @staticmethod 57 | def _filter_slurm_configs(slurm_configs: List[dict]) -> dict: 58 | """Returns machine/cluster specific slurm conf (identified by hostname) 59 | if available, otherwise returns the default one (if available) 60 | 61 | Arguments: 62 | slurm_configs: (list[dict]) -- all slurm configurations found in the config file 63 | Returns: 64 | dict -- SLURM configuration to use for this machine 65 | """ 66 | default_conf = None 67 | specific_conf = None 68 | hostname = socket.gethostname().lower() 69 | print("Hostname: {}".format(hostname)) 70 | for c in slurm_configs: 71 | print("Found slurm config: {}".format(c[KEY.NAME])) 72 | if c[KEY.NAME].lower() == KEY.SLURM.lower(): 73 | print("Seeting default slurm config") 74 | default_conf = c 75 | elif c[KEY.NAME].split("_")[1].lower() in hostname: 76 | print("Setting specific slurm config: {}".format(c[KEY.NAME])) 77 | specific_conf = c 78 | specific_conf[KEY.NAME] = KEY.SLURM 79 | 80 | return specific_conf if specific_conf is not None else default_conf 81 | 82 | def _parse_configs( 83 | self, 84 | config_path: str, 85 | experiment_selections: List[str] = None, 86 | debug: bool = False, 87 | debug_all: bool = False, 88 | ) -> Tuple[List[dict], List[dict]]: 89 | """parse the config file, including separating the SLURM configuration and expanding grid / list search params 90 | 91 | Arguments: 92 | config_path {str} -- path to the configuration file 93 | experiment_selections (List[str], optional): List of specific experiments to run. If None runs all. Defaults to None. 94 | 95 | Returns: 96 | Tuple[dict, dict] -- SLURM configuration, list of expanded experiment configurations 97 | """ 98 | 99 | slurm_config, default_config, experiment_configs = conf_io.get_configs( 100 | config_path, experiment_selections 101 | ) 102 | 103 | # if desired, prefix experiments with timestamp 104 | if self.prefix_with_timestamp: 105 | experiment_start = datetime.now().strftime("%m%d-%H%M%S") 106 | for exp_config in experiment_configs: 107 | exp_config.update(name=f"{experiment_start}_{exp_config['name']}") 108 | 109 | experiment_configs = conf_resolver.resolve_dependencies( 110 | default_config, experiment_configs, self.config_path 111 | ) 112 | experiment_configs = conf_unfolder.unfold_exps( 113 | experiment_configs, debug, debug_all 114 | ) 115 | 116 | return slurm_config, experiment_configs 117 | 118 | def to_yaml(self, dir_path: str = "", relpath: bool = True) -> None: 119 | """write config back into a YAML file. 120 | 121 | Args: 122 | fpath (str, optional): path to write to. Will be written to outputdir unless specified differently. Defaults to "". 123 | relpath (bool, optional): Use relative paths only. Usefull for loading functionality. Defaults to True. 
124 | """ 125 | 126 | if dir_path == "": 127 | dir_path = self.exp_configs[0][KEY.i_BASIC_PATH] 128 | 129 | original_yml_name = os.path.splitext(self.f_name)[0] 130 | 131 | # List so it can be merged easily 132 | slurm_config = [] 133 | if self.slurm_config is not None: 134 | slurm_config.append(dict(self.slurm_config)) 135 | 136 | readable_configs = self._readable_exp_configs(relpath) 137 | 138 | # Save all named experiment configs in subdir 139 | grouped_configs = self._group_configs_by_name(readable_configs) 140 | for exp_name in grouped_configs.keys(): 141 | fpath = os.path.join( 142 | dir_path, 143 | exp_name, 144 | "relative_{}_{}.yml".format(original_yml_name, exp_name), 145 | ) 146 | conf_io.write_yaml(fpath, slurm_config + grouped_configs[exp_name]) 147 | 148 | # Save global configs 149 | fpath = os.path.join(dir_path, "relative_" + self.f_name) 150 | 151 | if self.exp_selections is not None: 152 | fpath = ( 153 | os.path.splitext(fpath)[0] 154 | + "_" 155 | + "_".join(self.exp_selections) 156 | + ".yml" 157 | ) 158 | 159 | # Merge into single list 160 | data = slurm_config + readable_configs 161 | conf_io.write_yaml(fpath, data) 162 | 163 | def _readable_exp_configs(self, relpath: bool = True) -> List[dict]: 164 | """Internal function to get more readable objects when written as yaml 165 | Converts to dict() and optionally use relative paths only 166 | Args: 167 | relpath (bool, optional): True if the new experiment config file should use relative paths only. Defaults to True. 168 | 169 | Returns: 170 | List[dict]: list of transformed experiment configuration dicts 171 | """ 172 | res = [] 173 | for exp in self.exp_configs: 174 | # Convert attrdict to dict for prettier yaml write 175 | c = dict(exp) 176 | if relpath: 177 | c = conf_path.make_rel_paths(c, c[KEY.i_BASIC_PATH]) 178 | res.append(c) 179 | return res 180 | 181 | def _group_configs_by_name(self, configs: List[dict]) -> dict: 182 | grouped_configs = {} 183 | for c in configs: 184 | name = c[KEY.NAME] 185 | if name not in grouped_configs: 186 | grouped_configs[name] = [c] 187 | else: 188 | grouped_configs[name].append(c) 189 | return grouped_configs 190 | -------------------------------------------------------------------------------- /cw2/cw_data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ALRhub/cw2/7a7b8a235731e8576e1616a46a61f442cd616cd3/cw2/cw_data/__init__.py -------------------------------------------------------------------------------- /cw2/cw_data/cw_loading.py: -------------------------------------------------------------------------------- 1 | from typing import Type 2 | 3 | import pandas as pd 4 | 5 | from cw2 import job, scheduler, util 6 | from cw2.cw_data import cw_logging, cw_pd_logger 7 | 8 | 9 | class Loader(scheduler.AbstractScheduler): 10 | def run(self, overwrite: bool = False): 11 | cw_res = CWResult() 12 | 13 | for j in self.joblist: 14 | cw_res._load_job(j) 15 | 16 | cw_res._compile() 17 | return cw_res.data().set_index(["name", "r"]) 18 | 19 | 20 | class CWResult: 21 | def __init__(self, df: pd.DataFrame = None): 22 | self.data_list = [] 23 | self.df = df 24 | 25 | def _compile(self): 26 | self.df = pd.DataFrame(self.data_list) 27 | self.data_list = None 28 | 29 | def _load_job(self, j: job.Job) -> None: 30 | for c in j.tasks: 31 | rep_data = j.load_task(c) 32 | rep_data.update( 33 | { 34 | "name": c["name"], 35 | "r": c["_rep_idx"], 36 | "rep_path": c["_rep_log_path"], 37 | "params": c["params"], 38 | } 39 | ) 40 
| rep_data.update(util.flatten_dict(c["params"])) 41 | self.data_list.append(rep_data) 42 | 43 | def data(self) -> pd.DataFrame: 44 | return self.df 45 | 46 | 47 | @pd.api.extensions.register_dataframe_accessor("cw2") 48 | class Cw2Accessor: 49 | def __init__(self, pandas_obj): 50 | self._obj = pandas_obj 51 | 52 | def filter(self, param_dict: dict): 53 | """filter by parameter dictionary. 54 | Supports nested dictionarys. Has to be the same format as the config file. 55 | 56 | Args: 57 | param_dict (dict): parameter dictionary 58 | 59 | Returns: 60 | pd.DataFrame: filtered result 61 | """ 62 | flattened = util.flatten_dict(param_dict) 63 | 64 | df = self._obj.copy() 65 | for k, v in flattened.items(): 66 | df = df[df[k] == v] 67 | return df 68 | 69 | def repetition(self, r: int): 70 | """only select a specific repetition. 71 | 72 | Args: 73 | r (int): repetition number 74 | 75 | Returns: 76 | pd.DataFrame: filtered result 77 | """ 78 | df = self._obj 79 | return df[df["r"] == r] 80 | 81 | def name(self, name: str): 82 | """only select experiments with a specific name 83 | 84 | Args: 85 | name (str): experiment name 86 | 87 | Returns: 88 | pd.DataFrame: filtered result 89 | """ 90 | df = self._obj 91 | return df[df["name"] == name] 92 | 93 | def logger( 94 | self, 95 | l_name: str = "", 96 | l_obj: cw_logging.AbstractLogger = None, 97 | l_cls: Type[cw_logging.AbstractLogger] = None, 98 | ): 99 | """select the column containg the results from a specific logger 100 | 101 | Args: 102 | l_name (str, optional): the class name of the logger. Defaults to "". 103 | l_obj (cw_logging.AbstractLogger, optional): an instance object of the logger. Defaults to None. 104 | l_cls (Type[cw_logging.AbstractLogger], optional): the class object of the logger. Defaults to None. 
105 | 106 | Returns: 107 | pd.Series: The column with the logger results 108 | """ 109 | if l_obj is not None: 110 | l_cls = l_obj.__class__ 111 | 112 | if l_cls is not None: 113 | l_name = l_cls.__name__ 114 | 115 | df = self._obj 116 | return df[l_name] 117 | 118 | def flatten_pd_log(self): 119 | pd_log_col = cw_pd_logger.PandasLogger.__name__ 120 | if pd_log_col not in self._obj.columns: 121 | return self._obj 122 | 123 | df = self._obj 124 | new_df = pd.DataFrame() 125 | for idx, row in df.iterrows(): 126 | nested_df = row[pd_log_col] 127 | 128 | outer_row = row.drop(pd_log_col) 129 | for c, v in outer_row.iteritems(): 130 | if isinstance(v, dict): 131 | nested_df[c] = str(v) 132 | nested_df[c] = nested_df[c].map(eval) 133 | continue 134 | nested_df[c] = v 135 | nested_df["name"] = idx[0] 136 | nested_df["r"] = idx[1] 137 | new_df = new_df.append(nested_df, ignore_index=True) 138 | return new_df.set_index(["name", "r", "iter"]) 139 | -------------------------------------------------------------------------------- /cw2/cw_data/cw_logging.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import logging 3 | import os 4 | import pprint 5 | import sys 6 | from typing import Dict, Iterable, List, Optional 7 | 8 | 9 | class AbstractLogger(abc.ABC): 10 | """Abstract Base Class for all Loggers""" 11 | 12 | def __init__( 13 | self, 14 | ignore_keys: Optional[Iterable] = None, 15 | allow_keys: Optional[Iterable] = None, 16 | ): 17 | """ 18 | Initialize a logger that records based on (a subset of) the provided keys 19 | :param ignore_keys: A list of keys 20 | :param allow_keys: 21 | """ 22 | assert ( 23 | ignore_keys is None or allow_keys is None 24 | ), "Logging keys can either be whitelisted ('ignore_keys') or blacklisted ('allow_keys'), but not both" 25 | self.ignore_keys = ignore_keys 26 | self.allow_keys = allow_keys 27 | 28 | def filter(self, data: Dict) -> Dict: 29 | """ 30 | Base Function. Either filters out ignored keys or looks for allowed ones 31 | 32 | Args: 33 | data: data payload dict 34 | """ 35 | if self.ignore_keys is not None: # blacklist ignored keys 36 | return { 37 | key: value for key, value in data.items() if key not in self.ignore_keys 38 | } 39 | elif self.allow_keys is not None: # whitelist allowed keys 40 | return {key: value for key, value in data.items() if key in self.allow_keys} 41 | else: # use all keys 42 | return data 43 | 44 | def preprocess(self, *args): 45 | """ 46 | intended to be called during Experiment.initialize() 47 | """ 48 | pass 49 | 50 | @abc.abstractmethod 51 | def initialize(self, config: dict, rep: int, rep_log_path: str) -> None: 52 | """needs to be implemented by subclass. 53 | Called once at the start of each repetition. 54 | Used to configure / reset the Logger for each repetition. 55 | 56 | Arguments: 57 | config {attrdict.Attrdict} -- configuration 58 | rep {int} -- repetition counter 59 | """ 60 | raise NotImplementedError 61 | 62 | @abc.abstractmethod 63 | def process(self, data: dict) -> None: 64 | """needs to be implemented by subclass. 65 | The main method. Defines how the logger handles the result of each iteration. 66 | 67 | Arguments: 68 | data -- data payload to be processed by logger 69 | """ 70 | raise NotImplementedError 71 | 72 | @abc.abstractmethod 73 | def finalize(self) -> None: 74 | """needs to be implemented by subclass. 75 | Called at the end of each repetition. 
76 | Use it to finalize the processing like write to disk or other cleanup 77 | """ 78 | raise NotImplementedError 79 | 80 | @abc.abstractmethod 81 | def load(self): 82 | """needs to be implemented by subclass. 83 | called when the data should be loaded after execution is complete. 84 | """ 85 | raise NotImplementedError 86 | 87 | 88 | class LoggerArray(AbstractLogger): 89 | """Storage for multiple AbstractLogger objects. 90 | Behaves to the outside like a simple AbstractLogger implementation. 91 | Used to apply multiple loggers in a run. 92 | """ 93 | 94 | def __init__(self): 95 | self._logger_array: List[AbstractLogger] = [] 96 | 97 | def add(self, logger: AbstractLogger) -> None: 98 | self._logger_array.append(logger) 99 | 100 | def initialize(self, config: dict, rep: int, rep_log_path: str) -> None: 101 | for logger in self._logger_array: 102 | logger.initialize(config, rep, rep_log_path) 103 | 104 | def preprocess(self, *args): 105 | for logger in self._logger_array: 106 | logger.preprocess(*args) 107 | 108 | def process(self, data: dict) -> None: 109 | for logger in self._logger_array: 110 | logger.process(data) 111 | 112 | def finalize(self) -> None: 113 | for logger in self._logger_array: 114 | logger.finalize() 115 | 116 | def load(self): 117 | data = {} 118 | for logger in self._logger_array: 119 | try: 120 | d = logger.load() 121 | except: 122 | getLogger().exception(logger.__class__.__name__) 123 | d = "Error when loading {}".format(logger.__class__.__name__) 124 | 125 | if d is not None: 126 | if not isinstance(d, dict): 127 | d = {logger.__class__.__name__: d} 128 | data.update(d) 129 | return data 130 | 131 | def __iter__(self): 132 | return iter(self._logger_array) 133 | 134 | def is_empty(self) -> bool: 135 | return len(self._logger_array) == 0 136 | 137 | 138 | class Printer(AbstractLogger): 139 | """Prints the result of each iteration to the console.""" 140 | 141 | def initialize(self, config: dict, rep: int, rep_log_path: str) -> None: 142 | pass 143 | 144 | def process(self, data: dict) -> None: 145 | data_ = self.filter(data) 146 | pprint.pprint(data_) 147 | 148 | def finalize(self) -> None: 149 | pass 150 | 151 | def load(self): 152 | pass 153 | 154 | 155 | class PythonLogger(AbstractLogger): 156 | """ 157 | Logger which writes calls to logging.getLogger('cw2') on to disk 158 | """ 159 | 160 | def __init__(self): 161 | self.logger = getLogger() 162 | 163 | def initialize(self, config: dict, rep: int, rep_log_path: str) -> None: 164 | self.outh = logging.FileHandler( 165 | os.path.join(rep_log_path, "out.log"), delay=True 166 | ) 167 | self.outh.setLevel(logging.INFO) 168 | self.outh.setFormatter(_formatter) 169 | self.logger.addHandler(self.outh) 170 | 171 | self.errh = logging.FileHandler(os.path.join(rep_log_path, "err.log")) 172 | self.errh.setLevel(logging.ERROR) 173 | self.errh.setFormatter(_formatter) 174 | self.logger.addHandler(self.errh) 175 | 176 | def process(self, data: dict) -> None: 177 | pass 178 | 179 | def finalize(self) -> None: 180 | for h in [self.outh, self.errh]: 181 | h.flush() 182 | h.close() 183 | self.logger.removeHandler(h) 184 | 185 | def load(self): 186 | pass 187 | 188 | 189 | ### logging module functionality #### 190 | 191 | 192 | class _CWFormatter(logging.Formatter): 193 | """Taken From CW V1""" 194 | 195 | def __init__(self): 196 | # self.std_formatter = logging.Formatter('[%(asctime)s] [%(name)s] [%(levelname)s] %(message)s') 197 | self.std_formatter = logging.Formatter("[%(name)s] [%(levelname)s] %(message)s") 198 | 
self.red_formatter = logging.Formatter( 199 | "[%(asctime)s]:[%(name)s] [%(levelname)s] %(message)s" 200 | ) 201 | 202 | def format(self, record: logging.LogRecord): 203 | if record.levelno < logging.ERROR: 204 | return self.std_formatter.format(record) 205 | else: 206 | return self.red_formatter.format(record) 207 | 208 | 209 | _formatter = _CWFormatter() 210 | 211 | 212 | def getLogger() -> logging.Logger: 213 | """creates a logging.getLogger('cw2') object with initialization. 214 | Parallelization via joblib needs a more sophisticated getLogger function. 215 | 216 | Returns: 217 | logging.Logger 218 | """ 219 | _logging_logger = logging.getLogger("cw2") 220 | 221 | if _logging_logger.getEffectiveLevel() > logging.INFO: 222 | ch = logging.StreamHandler(sys.stdout) 223 | ch.setLevel(logging.INFO) 224 | ch.setFormatter(_formatter) 225 | 226 | _logging_logger.setLevel(logging.INFO) 227 | _logging_logger.addHandler(ch) 228 | 229 | return _logging_logger 230 | -------------------------------------------------------------------------------- /cw2/cw_data/cw_pd_logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Dict, Iterable, Optional 3 | 4 | import pandas as pd 5 | 6 | from cw2.cw_data import cw_logging 7 | 8 | 9 | class PandasLogger(cw_logging.AbstractLogger): 10 | """Writes the results of each repetition seperately to disk 11 | Each repetition is saved in its own directory. Write occurs after every iteration. 12 | """ 13 | 14 | def __init__( 15 | self, 16 | ignore_keys: Optional[Iterable] = None, 17 | allow_keys: Optional[Iterable] = None, 18 | ): 19 | super().__init__(ignore_keys=ignore_keys, allow_keys=allow_keys) 20 | self.log_path = "" 21 | self.csv_name = "rep.csv" 22 | self.pkl_name = "rep.pkl" 23 | self.df = pd.DataFrame() 24 | 25 | def initialize(self, config: Dict, rep: int, rep_log_path: str): 26 | self.log_path = rep_log_path 27 | self.csv_name = os.path.join(self.log_path, "rep_{}.csv".format(rep)) 28 | self.pkl_name = os.path.join(self.log_path, "rep_{}.pkl".format(rep)) 29 | self.df = pd.DataFrame() 30 | 31 | def process(self, log_data: dict) -> None: 32 | data = self.filter(log_data) 33 | 34 | self.df = self.df.append(data, ignore_index=True) 35 | 36 | try: 37 | self.df.to_csv(self.csv_name, index_label="index") 38 | except: 39 | cw_logging.getLogger().warning("Could not save {}".format(self.csv_name)) 40 | 41 | try: 42 | self.df.to_pickle(self.pkl_name) 43 | except: 44 | cw_logging.getLogger().warning("Could not save {}".format(self.pkl_name)) 45 | 46 | def finalize(self) -> None: 47 | pass 48 | 49 | def load(self): 50 | payload = {} 51 | df: pd.DataFrame = None 52 | 53 | # Check if file exists 54 | try: 55 | df = pd.read_pickle(self.pkl_name) 56 | except FileNotFoundError as _: 57 | warn = "{} does not exist".format(self.pkl_name) 58 | cw_logging.getLogger().warning(warn) 59 | return warn 60 | 61 | # Enrich Payload with descriptive statistics for loading DF structure 62 | """ 63 | for c in df.columns: 64 | if pd.api.types.is_numeric_dtype(df[c]): 65 | payload['{}_min'.format(c)] = df[c].min() 66 | payload['{}_max'.format(c)] = df[c].max() 67 | payload['{}_mean'.format(c)] = df[c].mean() 68 | payload['{}_std'.format(c)] = df[c].std() 69 | 70 | payload['{}_last'.format(c)] = df[c].iloc[-1] 71 | """ 72 | payload[self.__class__.__name__] = df 73 | return payload 74 | -------------------------------------------------------------------------------- /cw2/cw_data/cw_wandb_logger.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import warnings 3 | from random import random 4 | from time import sleep 5 | 6 | # To prevent conflicts between wandb and the joblib scheduler 7 | # see https://github.com/wandb/client/issues/1525 for reference 8 | os.environ["WANDB_START_METHOD"] = "thread" 9 | 10 | from itertools import groupby 11 | from typing import Dict, Iterable, List, Optional 12 | 13 | import pandas as pd 14 | import wandb 15 | 16 | from cw2.cw_data import cw_logging 17 | from cw2.util import get_file_names_in_directory 18 | 19 | 20 | def reset_wandb_env(): 21 | exclude = { 22 | "WANDB_PROJECT", 23 | "WANDB_ENTITY", 24 | "WANDB_API_KEY", 25 | "WANDB_START_METHOD", 26 | } 27 | for k, v in os.environ.items(): 28 | if k.startswith("WANDB_") and k not in exclude: 29 | del os.environ[k] 30 | 31 | 32 | def group_parameters(list_of_strings: List[str]): 33 | """groups different strings that start with a common substring (using "." as delimiter) 34 | and outputs a single, more concise string. 35 | Example: 36 | outstring = group_parameters['local', 'mod.enc.tidentity', 'mod.hea.nhl5', 'mod.hea.ioFalse', 'mod.enc.hd64'] 37 | % outstring will be 'local,mod_[enc_[hd64,tidentity],hea_[ioFalse,nhl5]]' 38 | """ 39 | groups = [] 40 | uniquekeys = [] 41 | num_subgroups = 0 42 | substring = "" 43 | 44 | for k, g in groupby(sorted(list_of_strings), lambda string: string.split(".")[0]): 45 | groups.append(list(g)) 46 | uniquekeys.append(k) 47 | 48 | if len(groups[-1]) == 1: 49 | substring += groups[-1][0] + "," 50 | num_subgroups += 1 51 | else: 52 | remainder = [s.replace(k, "", 1) for s in groups[-1]] 53 | remainder = [s.replace(".", "", 1) for s in remainder] 54 | if len(remainder) > 0: 55 | subgroups, num_subs = group_parameters(remainder) 56 | if num_subs > 1: 57 | substring += k + "_[" + subgroups + "]," 58 | else: 59 | substring += k + "_" + subgroups + "," 60 | num_subgroups += num_subs 61 | return substring[:-1], len(groups) 62 | 63 | 64 | class WandBLogger(cw_logging.AbstractLogger): 65 | def __init__( 66 | self, 67 | ignore_keys: Optional[Iterable] = None, 68 | allow_keys: Optional[Iterable] = None, 69 | ): 70 | super(WandBLogger, self).__init__( 71 | ignore_keys=ignore_keys, allow_keys=allow_keys 72 | ) 73 | self.log_path = "" 74 | self.run = None 75 | 76 | def initialize(self, config: Dict, rep: int, rep_log_path: str) -> None: 77 | if "wandb" in config.keys(): 78 | self.init_fields(config, rep, rep_log_path) 79 | self.connect_to_wandb() 80 | 81 | else: 82 | warnings.warn("No 'wandb' field in yaml - Ignoring Weights & Biases Logger") 83 | 84 | def init_fields(self, config: Dict, rep: int, rep_log_path: str): 85 | self.log_path = rep_log_path 86 | self.rep = rep 87 | self.config = config["wandb"] 88 | self.cw2_config = config 89 | reset_wandb_env() 90 | self.job_name = config["_experiment_name"].replace("__", "_") 91 | self.use_group_parameters = self.config.get("use_group_parameters", False) 92 | if self.use_group_parameters: 93 | self.job_name = group_parameters(self.job_name.split("_"))[0] 94 | self.runname = self.job_name + "_rep_{:02d}".format(rep) 95 | 96 | # optional: change the job_type to a fixed alias if the option is present 97 | if "job_type" in self.config: 98 | self.job_name = self.config["job_type"] 99 | # have entity and group config entry optional 100 | self.entity = self.config.get("entity", None) 101 | self.group = self.config.get("group", None) 102 | # Get the model logging directory 103 | 
self.wandb_log_model = self.config.get("log_model", False) 104 | if self.wandb_log_model: 105 | self.save_model_dir = os.path.join(self.log_path, "model") 106 | self.cw2_config["save_model_dir"] = self.save_model_dir 107 | self.model_name = self.config.get("model_name", "model") 108 | else: 109 | self.save_model_dir = None 110 | 111 | def connect_to_wandb(self): 112 | last_error = None 113 | for i in range(10): 114 | try: 115 | self.run = wandb.init( 116 | project=self.cw2_config["wandb"]["project"], 117 | entity=self.entity, 118 | group=self.group, 119 | job_type=self.job_name[:63], 120 | name=self.runname[:63], 121 | config=self.cw2_config["params"], 122 | dir=self.log_path, 123 | settings=wandb.Settings( 124 | _disable_stats=self.cw2_config["wandb"].get( 125 | "disable_stats", False 126 | ) 127 | ), 128 | mode="online" 129 | if self.cw2_config["wandb"].get("enabled", True) 130 | else "disabled", 131 | ) 132 | return # if starting the run is successful, exit the loop (and in this case the function) 133 | except Exception as e: 134 | last_error = e 135 | # implement a simple randomized exponential backoff if starting a run fails 136 | waiting_time = ((random() / 50) + 0.01) * (2**i) 137 | # wait between 0.01 and 10.24 seconds depending on the random seed and the iteration of the exponent 138 | 139 | warnings.warn( 140 | "Problem with starting wandb: {}. Trying again in {} seconds".format( 141 | e, waiting_time 142 | ) 143 | ) 144 | sleep(waiting_time) 145 | warnings.warn("wandb init failed several times.") 146 | raise last_error 147 | 148 | def process(self, data: dict) -> None: 149 | if self.run is not None: 150 | # Skip logging if interval is defined but not satisfied 151 | log_interval = self.config.get("log_interval", None) 152 | if log_interval is not None and data["iter"] % log_interval != 0: 153 | return 154 | 155 | if "histogram" in self.config: 156 | for el in self.config["histogram"]: 157 | if el in data: 158 | self.run.log( 159 | {el: wandb.Histogram(np_histogram=data[el])}, 160 | step=data["iter"], 161 | ) 162 | filtered_data = self.filter(data) 163 | step = data.get("iter", None) 164 | self.run.log(filtered_data, step=step) 165 | 166 | def finalize(self) -> None: 167 | if self.run is not None: 168 | self.log_model() 169 | self.run.finish() 170 | 171 | def load(self): 172 | pass 173 | 174 | def log_model(self): 175 | """ 176 | Log model as an Artifact 177 | 178 | Returns: 179 | None 180 | """ 181 | if self.wandb_log_model is False: 182 | return 183 | 184 | # Initialize wandb artifact 185 | model_artifact = wandb.Artifact(name=self.model_name, type="model") 186 | 187 | # Get all file names in log dir 188 | file_names = get_file_names_in_directory(self.save_model_dir) 189 | 190 | if file_names is None: 191 | warnings.warn("save model dir is not available or empty.") 192 | return 193 | 194 | # Add files into artifact 195 | for file in file_names: 196 | model_artifact.add_file(os.path.join(self.save_model_dir, file)) 197 | 198 | aliases = ["latest", f"finished-rep-{self.rep}"] 199 | 200 | # Log and upload 201 | self.run.log_artifact(model_artifact, aliases=aliases) 202 | 203 | def log_plot(self, x, y, column_names=("x", "y"), plot_id="plot", title="Plot"): 204 | data = [list(i) for i in zip(x, y)] 205 | table = wandb.Table(data=data, columns=column_names) 206 | self.run.log( 207 | { 208 | plot_id: wandb.plot.line( 209 | table, column_names[0], column_names[0], title=title 210 | ) 211 | } 212 | ) 213 | 214 | def log_table(self, data, table_id="table"): 215 | assert type(data) is 
pd.DataFrame 216 | table = wandb.Table(dataframe=data) 217 | self.run.log({table_id: table}) 218 | -------------------------------------------------------------------------------- /cw2/cw_error.py: -------------------------------------------------------------------------------- 1 | class ConfigKeyError(Exception): 2 | """raised when a key is missing in the configuration.""" 3 | 4 | pass 5 | 6 | 7 | class MissingConfigError(Exception): 8 | """raise when a config document is missing in the configuration.""" 9 | 10 | pass 11 | 12 | 13 | class ExperimentNotFoundError(Exception): 14 | """raise when experiment selection could not be found in the configuration""" 15 | 16 | pass 17 | 18 | 19 | class ExperimentSurrender(Exception): 20 | def __init__(self, payload: dict = None): 21 | if payload is None: 22 | payload = {} 23 | self.payload = payload 24 | -------------------------------------------------------------------------------- /cw2/cw_slurm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ALRhub/cw2/7a7b8a235731e8576e1616a46a61f442cd616cd3/cw2/cw_slurm/__init__.py -------------------------------------------------------------------------------- /cw2/cw_slurm/cw_slurm.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import os 3 | import shutil 4 | import subprocess 5 | import sys 6 | 7 | import __main__ 8 | 9 | import cw2.cw_config.cw_conf_keys as CKEYS 10 | import cw2.cw_slurm.cw_slurm_keys as SKEYS 11 | from cw2 import cli_parser, cw_error, util 12 | from cw2.cw_config import cw_config 13 | from cw2.cw_data import cw_logging 14 | 15 | 16 | class SlurmConfig: 17 | def __init__(self, conf: cw_config.Config) -> None: 18 | self.conf = conf 19 | self.slurm_conf = conf.slurm_config 20 | 21 | if self.slurm_conf is None: 22 | raise cw_error.MissingConfigError( 23 | "No SLURM configuration found in {}".format(self.conf.config_path) 24 | ) 25 | 26 | self._check_template() 27 | 28 | def _check_template(self): 29 | """check if an sbatch.sh template is present. 30 | If no costum template has been specified, the default will be used. 31 | """ 32 | 33 | if SKEYS.TEMPLATE_PATH not in self.slurm_conf: 34 | self.slurm_conf[SKEYS.TEMPLATE_PATH] = os.path.join( 35 | os.path.dirname(__file__), "../default_sbatch.sh" 36 | ) 37 | 38 | if not os.path.exists(self.slurm_conf[SKEYS.TEMPLATE_PATH]): 39 | raise cw_error.ConfigKeyError( 40 | "Could not find default sbatch template. Please specify your own 'path_to_template'." 
41 | ) 42 | 43 | def _complete_optionals(self): 44 | """Fill in any optional values.""" 45 | 46 | sc: dict = self.slurm_conf 47 | 48 | exp_output_path = self.conf.exp_configs[0][CKEYS.i_BASIC_PATH] 49 | 50 | # CREATE OPTIONAL COLLECTIONS 51 | # Must be done first: 52 | sc.setdefault(SKEYS.SBATCH_ARGS, {}) 53 | 54 | # SET DEFAULT VALUES 55 | sc.setdefault(SKEYS.SLURM_LOG, os.path.join(exp_output_path, "slurmlog")) 56 | sc.setdefault(SKEYS.SLURM_OUT, os.path.join(exp_output_path, "sbatch.sh")) 57 | sc.setdefault(SKEYS.ACCOUNT, "") 58 | 59 | # COMPLEX CONVERSIONS 60 | if isinstance(sc[SKEYS.TIME], int): 61 | sc[SKEYS.TIME] = "{:d}:{:d}:00".format( 62 | sc[SKEYS.TIME] // 60, sc[SKEYS.TIME] % 60 63 | ) 64 | 65 | if SKEYS.CPU_MEM in sc: 66 | sc[SKEYS.SBATCH_ARGS][SKEYS.CPU_MEM] = sc.get(SKEYS.CPU_MEM) 67 | 68 | # DEFAULT OR COMPLEX CONVERSION 69 | if SKEYS.VENV in sc: 70 | sc[SKEYS.VENV] = "source activate {}".format(sc[SKEYS.VENV]) 71 | else: 72 | sc[SKEYS.VENV] = "" 73 | 74 | if SKEYS.SH_LINES in sc: 75 | sc[SKEYS.SH_LINES] = "\n".join(sc[SKEYS.SH_LINES]) 76 | else: 77 | sc[SKEYS.SH_LINES] = "" 78 | 79 | def _complete_cli_args(self): 80 | """identify and process the relevant CLI flags from the original call.""" 81 | sc = self.slurm_conf 82 | cw_options = cli_parser.Arguments().get() 83 | 84 | sc[SKEYS.CW_ARGS] = "" 85 | if cw_options["overwrite"]: 86 | sc[SKEYS.CW_ARGS] += " -o" 87 | if cw_options["experiments"] is not None: 88 | sc[SKEYS.CW_ARGS] += " -e " + " ".join(cw_options["experiments"]) 89 | 90 | def _complete_sbatch_args(self): 91 | """if optional SBATCH arguments are present, build a corresponding string.""" 92 | sc = self.slurm_conf 93 | 94 | if SKEYS.SBATCH_ARGS not in sc: # Check if empty 95 | sc[SKEYS.SBATCH_ARGS] = "" 96 | return 97 | else: # Else build String 98 | sbatch_args = sc.get(SKEYS.SBATCH_ARGS) 99 | 100 | args_list = ["#SBATCH --{} {}".format(k, v) for k, v in sbatch_args.items()] 101 | sc[SKEYS.SBATCH_ARGS] = "\n".join(args_list) 102 | 103 | def finalize(self, num_jobs: int): 104 | """enrich slurm configuration with dynamically computed values 105 | 106 | Args: 107 | num_jobs (int): total number of defined jobs 108 | """ 109 | 110 | # counting starts at 0 111 | self.slurm_conf[SKEYS.LAST_IDX] = num_jobs - 1 112 | 113 | # Order is important! 114 | self._complete_optionals() 115 | self._complete_cli_args() 116 | self._complete_sbatch_args() 117 | 118 | 119 | class SlurmDirectoryManager: 120 | MODE_COPY = "COPY" 121 | MODE_MULTI = "MULTI" 122 | MODE_NOCOPY = "NOCOPY" 123 | MODE_ZIP = "ZIP" 124 | 125 | def __init__(self, sc: SlurmConfig, conf: cw_config.Config) -> None: 126 | self.slurm_config = sc 127 | self.conf = conf 128 | self.m = self.set_mode() 129 | os.makedirs(sc.slurm_conf[SKEYS.SLURM_LOG], exist_ok=True) 130 | 131 | def set_mode(self): 132 | """find which code-copy mode is configured 133 | 134 | Raises: 135 | cw_error.ConfigKeyError: if incomplete definition 136 | 137 | Returns: 138 | code-copy mode 139 | """ 140 | sc = self.slurm_config.slurm_conf 141 | 142 | # COUNT MISSING ARGS 143 | cp_error_count = 0 144 | missing_arg = "" 145 | if SKEYS.EXP_CP_AUTO not in sc and SKEYS.EXP_CP_DST not in sc: 146 | cp_error_count += 1 147 | missing_arg = SKEYS.EXP_CP_DST 148 | 149 | if SKEYS.EXP_CP_SRC not in sc: 150 | cp_error_count += 1 151 | missing_arg = SKEYS.EXP_CP_SRC 152 | 153 | # MODE SWITCH 154 | if cp_error_count == 1: 155 | raise cw_error.ConfigKeyError( 156 | "Incomplete SLURM experiment copy config. 
Missing key: {}".format( 157 | missing_arg 158 | ) 159 | ) 160 | 161 | cw_options = cli_parser.Arguments().get() 162 | if cw_options.get("zip"): 163 | return self.MODE_ZIP 164 | 165 | if cw_options.get("multicopy"): 166 | if cp_error_count == 0: 167 | return self.MODE_MULTI 168 | else: 169 | raise cw_error.ConfigKeyError( 170 | "Incomplete SLURM experiment copy config. Please define SRC and DST for --multicopy" 171 | ) 172 | 173 | if cp_error_count == 0: 174 | return self.MODE_COPY 175 | return self.MODE_NOCOPY 176 | 177 | def dir_size_validation(self, src): 178 | """validates that the SRC for code copy is below 200MB in size 179 | 180 | Args: 181 | src: src path 182 | 183 | Raises: 184 | cw_error.ConfigKeyError: if directory is greater than 200MB 185 | """ 186 | cw_options = cli_parser.Arguments().get() 187 | if cw_options.get("skipsizecheck"): 188 | return 189 | 190 | dirsize = util.get_size(src) 191 | if dirsize > 200.0: 192 | cw_logging.getLogger().warning( 193 | "SourceDir {} is greater than 200MByte".format(src) 194 | ) 195 | msg = ( 196 | "Directory {} is greater than 200MByte." 197 | " If you are sure you want to copy/zip this dir, use --skipsizecheck." 198 | "\nElse check experiment_copy__ configuration keys".format(src) 199 | ) 200 | raise cw_error.ConfigKeyError(msg) 201 | 202 | def get_exp_src(self) -> str: 203 | """retrieves the code-copy src. 204 | Uses CWD as default unless specified 205 | 206 | Returns: 207 | src path 208 | """ 209 | sc = self.slurm_config.slurm_conf 210 | return sc.get(SKEYS.EXP_CP_SRC, os.getcwd()) 211 | 212 | def get_exp_dst(self): 213 | """retrieves the code-copy dst. 214 | Uses CWD as default unless specified 215 | 216 | Returns: 217 | src path 218 | """ 219 | sc = self.slurm_config.slurm_conf 220 | if SKEYS.EXP_CP_AUTO in sc and SKEYS.EXP_CP_DST not in sc: 221 | sc[SKEYS.EXP_CP_DST] = os.path.join( 222 | sc.get(SKEYS.EXP_CP_AUTO), 223 | datetime.datetime.now().strftime("%Y%m%d%G%M%S"), 224 | ) 225 | if SKEYS.EXP_CP_DST in sc: 226 | return sc[SKEYS.EXP_CP_DST] 227 | else: 228 | exp_output_path = self.conf.exp_configs[0][CKEYS.i_BASIC_PATH] 229 | return os.path.join(exp_output_path, "code") 230 | 231 | def zip_exp(self): 232 | """procedure for creating a zip backup""" 233 | src = self.get_exp_src() 234 | dst = self.get_exp_dst() 235 | self.dir_size_validation(src) 236 | 237 | shutil.make_archive(dst, "zip", src) 238 | 239 | def create_single_copy(self): 240 | """creates a copy of the exp for slurm execution""" 241 | src = self.get_exp_src() 242 | dst = self.get_exp_dst() 243 | self._copy_files(src, dst) 244 | 245 | def create_multi_copy(self, num_jobs: int): 246 | """creates multiple copies of the exp, one for each slurm job 247 | 248 | Args: 249 | num_jobs (int): number of total jobs 250 | """ 251 | src = self.get_exp_src() 252 | dst_base = self.get_exp_dst() 253 | 254 | for i in range(num_jobs): 255 | dst = os.path.join(dst_base, str(i)) 256 | self._copy_files(src, dst) 257 | 258 | # Add MultiCopy ChangeDir to Slurmconf 259 | self.slurm_config.slurm_conf[SKEYS.SH_LINES] += "\ncd {} \n".format( 260 | os.path.join(self.get_exp_dst(), "$SLURM_ARRAY_TASK_ID") 261 | ) 262 | 263 | def _copy_files(self, src, dst): 264 | """copies files from src to dst 265 | 266 | Args: 267 | src: source directory 268 | dst: destination directory 269 | 270 | Raises: 271 | cw_error.ConfigKeyError: if the dst is inside the source. Recursive copying! 272 | cw_error.ConfigKeyError: if the dst already exists and overwrite is not forced. 
273 | """ 274 | self.dir_size_validation(src) 275 | 276 | # Check Filesystem 277 | if util.check_subdir(src, dst): 278 | raise cw_error.ConfigKeyError( 279 | "experiment_copy_dst is a subdirectory of experiment_copy_src. Recursive Copying is bad." 280 | ) 281 | try: 282 | os.makedirs(dst, exist_ok=cli_parser.Arguments().get()["overwrite"]) 283 | except FileExistsError: 284 | raise cw_error.ConfigKeyError( 285 | "{} already exists. Please define a different 'experiment_copy_dst', use '-o' to overwrite or '--nocodecopy' to skip." 286 | ) 287 | 288 | # Copy files 289 | ign = shutil.ignore_patterns("*.pyc", "tmp*", ".git*") 290 | for item in os.listdir(src): 291 | s = os.path.join(src, item) 292 | d = os.path.join(dst, item) 293 | if os.path.isdir(s): 294 | shutil.copytree(s, d, ignore=ign) 295 | else: 296 | shutil.copy2(s, d) 297 | 298 | def move_files(self, num_jobs: int): 299 | """moves exp files according to detected copy mode 300 | Args: 301 | num_jobs: number of slurm jobs for multi-copy 302 | """ 303 | # Check Skip Flag 304 | cw_options = cli_parser.Arguments().get() 305 | if cw_options.get("nocodecopy"): 306 | print("Skipping Code Copy") 307 | return 308 | 309 | if self.m == self.MODE_COPY: 310 | self.create_single_copy() 311 | 312 | if self.m == self.MODE_MULTI: 313 | self.create_multi_copy(num_jobs) 314 | 315 | if self.m == self.MODE_ZIP: 316 | self.zip_exp() 317 | 318 | def get_exp_exec_dir(self) -> str: 319 | """retrieves the experiment execution dir. 320 | This dir depends on the exp_copy_dst 321 | 322 | Returns: 323 | str: experiment execution directory 324 | """ 325 | if self.m == self.MODE_COPY or self.m == self.MODE_MULTI: 326 | return self.get_exp_dst() 327 | 328 | return self.get_exp_src() 329 | 330 | def get_py_path(self) -> str: 331 | """computes a modified python path, depending on the experiment_copy procedure 332 | 333 | Returns: 334 | str: python path setting 335 | """ 336 | if self.m in [self.MODE_NOCOPY, self.MODE_ZIP]: 337 | return "" 338 | 339 | pypath = sys.path.copy() 340 | 341 | src = self.get_exp_src() 342 | dst = self.get_exp_dst() 343 | 344 | if self.m == self.MODE_MULTI: 345 | dst = os.path.join(dst, "$SLURM_ARRAY_TASK_ID") 346 | 347 | new_path = [ 348 | x.replace(os.path.abspath(src), os.path.abspath(dst)) for x in pypath 349 | ] 350 | # return "export PYTHONPATH=" + ":".join(new_path) 351 | # Maybe this is better? 
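# Note: appending to the existing $PYTHONPATH (the variant used below) preserves entries
# set by the activated virtual environment or the sbatch template, whereas the commented-out
# variant above would replace the inherited PYTHONPATH entirely.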
352 | return "export PYTHONPATH=$PYTHONPATH:" + ":".join(new_path) 353 | 354 | 355 | def run_slurm(conf: cw_config.Config, num_jobs: int) -> None: 356 | """starts slurm execution 357 | 358 | Args: 359 | conf (cw_config.Config): config object 360 | num_jobs (int): total number of jobs 361 | """ 362 | # Finalize Configs 363 | sc = SlurmConfig(conf) 364 | sc.finalize(num_jobs) 365 | 366 | # Create Code Copies 367 | dir_mgr = SlurmDirectoryManager(sc, conf) 368 | dir_mgr.move_files(num_jobs) 369 | 370 | # Write and call slurm script 371 | slurm_script = write_slurm_script(sc, dir_mgr) 372 | cmd = "sbatch " + slurm_script 373 | print(cmd) 374 | subprocess.check_output(cmd, shell=True) 375 | 376 | 377 | def write_slurm_script(slurm_conf: SlurmConfig, dir_mgr: SlurmDirectoryManager) -> str: 378 | """write the sbatch.sh script for slurm to disk 379 | 380 | Args: 381 | slurm_conf (SlurmConfig): Slurm configuration object 382 | 383 | Returns: 384 | str: path to the written script 385 | """ 386 | sc = slurm_conf.slurm_conf 387 | conf = slurm_conf.conf 388 | 389 | template_path = sc[SKEYS.TEMPLATE_PATH] 390 | output_path = sc[SKEYS.SLURM_OUT] 391 | 392 | exp_main_file = os.path.relpath(__main__.__file__, os.getcwd()) 393 | 394 | fid_in = open(template_path, "r") 395 | fid_out = open(output_path, "w") 396 | 397 | tline = fid_in.readline() 398 | 399 | while tline: 400 | tline = tline.replace("%%partition%%", sc["partition"]) 401 | tline = tline.replace("%%account%%", sc[SKEYS.ACCOUNT]) 402 | tline = tline.replace("%%job-name%%", sc["job-name"]) 403 | 404 | tline = tline.replace("%%last_job_idx%%", "{:d}".format(sc[SKEYS.LAST_IDX])) 405 | tline = tline.replace( 406 | "%%num_parallel_jobs%%", "{:d}".format(sc["num_parallel_jobs"]) 407 | ) 408 | 409 | tline = tline.replace( 410 | "%%experiment_execution_dir%%", dir_mgr.get_exp_exec_dir() 411 | ) 412 | 413 | tline = tline.replace("%%slurm_log%%", sc[SKEYS.SLURM_LOG]) 414 | 415 | tline = tline.replace("%%ntasks%%", "{:d}".format(sc["ntasks"])) 416 | tline = tline.replace("%%cpus-per-task%%", "{:d}".format(sc["cpus-per-task"])) 417 | tline = tline.replace("%%time%%", sc[SKEYS.TIME]) 418 | 419 | tline = tline.replace("%%sh_lines%%", sc[SKEYS.SH_LINES]) 420 | 421 | tline = tline.replace("%%venv%%", sc[SKEYS.VENV]) 422 | tline = tline.replace("%%pythonpath%%", dir_mgr.get_py_path()) 423 | 424 | tline = tline.replace("%%python_script%%", exp_main_file) 425 | tline = tline.replace("%%path_to_yaml_config%%", conf.config_path) 426 | 427 | tline = tline.replace("%%cw_args%%", sc[SKEYS.CW_ARGS]) 428 | tline = tline.replace("%%sbatch_args%%", sc[SKEYS.SBATCH_ARGS]) 429 | 430 | fid_out.write(tline) 431 | 432 | tline = fid_in.readline() 433 | fid_in.close() 434 | fid_out.close() 435 | return output_path 436 | -------------------------------------------------------------------------------- /cw2/cw_slurm/cw_slurm_keys.py: -------------------------------------------------------------------------------- 1 | TEMPLATE_PATH = "path_to_template" 2 | 3 | ACCOUNT = "account" 4 | TIME = "time" 5 | 6 | CPU_MEM = "mem-per-cpu" 7 | VENV = "venv" 8 | 9 | SBATCH_ARGS = "sbatch_args" 10 | SH_LINES = "sh_lines" 11 | CW_ARGS = "cw_args" 12 | 13 | SLURM_LOG = "slurm_log" 14 | SLURM_OUT = "slurm_output" 15 | 16 | EXP_CP_AUTO = "experiment_copy_auto_dst" 17 | EXP_CP_DST = "experiment_copy_dst" 18 | EXP_CP_SRC = "experiment_copy_src" 19 | 20 | 21 | LAST_IDX = "last_job_idx" 22 | -------------------------------------------------------------------------------- /cw2/default_sbatch.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -p %%partition%% 3 | # #SBATCH -A %%account%% 4 | #SBATCH -J %%job-name%% 5 | #SBATCH --array 0-%%last_job_idx%%%%%num_parallel_jobs%% 6 | 7 | # Please use the complete path details : 8 | #SBATCH -D %%experiment_execution_dir%% 9 | #SBATCH -o %%slurm_log%%/out_%A_%a.log 10 | #SBATCH -e %%slurm_log%%/err_%A_%a.log 11 | 12 | # Cluster Settings 13 | #SBATCH -n %%ntasks%% # Number of tasks 14 | #SBATCH -c %%cpus-per-task%% # Number of cores per task 15 | #SBATCH -t %%time%% # 1:00:00 Hours, minutes and seconds, or '#SBATCH -t 10' - only minutes 16 | 17 | %%sbatch_args%% 18 | # ------------------------------- 19 | 20 | # Activate the virtualenv / conda environment 21 | %%venv%% 22 | 23 | 24 | # Export Pythonpath 25 | %%pythonpath%% 26 | 27 | # Additional Instructions from CONFIG.yml 28 | %%sh_lines%% 29 | 30 | python3 %%python_script%% %%path_to_yaml_config%% -j $SLURM_ARRAY_TASK_ID %%cw_args%% 31 | 32 | # THIS WAS BUILT FROM THE DEFAULLT SBATCH TEMPLATE -------------------------------------------------------------------------------- /cw2/experiment.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import datetime as dt 3 | 4 | from cw2.cw_data import cw_logging 5 | from cw2.cw_error import ExperimentSurrender 6 | 7 | 8 | class AbstractExperiment(abc.ABC): 9 | @abc.abstractmethod 10 | def initialize( 11 | self, cw_config: dict, rep: int, logger: cw_logging.LoggerArray 12 | ) -> None: 13 | """needs to be implemented by subclass. 14 | Called once at the start of each repition for initialization purposes. 15 | 16 | Arguments: 17 | cw_config {dict} -- clusterwork experiment configuration 18 | rep {int} -- repition counter 19 | logger {cw_logging.LoggerArray} -- initialized loggers for preprocessing 20 | """ 21 | raise NotImplementedError 22 | 23 | @abc.abstractmethod 24 | def run(self, cw_config: dict, rep: int, logger: cw_logging.LoggerArray) -> None: 25 | """needs to be implemented by subclass. 26 | Called after initialize(). Should be the main procedure of the experiment. 27 | 28 | Args: 29 | config (dict): clusterwork experiment configuration 30 | rep (int): [description] 31 | logger (cw_logging.LoggerArray): [description] 32 | 33 | Raises: 34 | NotImplementedError: [description] 35 | """ 36 | raise NotImplementedError 37 | 38 | @abc.abstractmethod 39 | def finalize(self, surrender: ExperimentSurrender = None, crash: bool = False): 40 | """needs to be implemented by subclass. 41 | Is guaranteed to be called after the experiment has run, even in case of exceptions during execution. 42 | 43 | Args: 44 | surrender (ExperimentSurrender, optional): when the experiment raises an ExperimentSurrender, this object can be accessed here. Defaults to None. 45 | crash (bool, optional): indicating if the experiment raised a 'serious' Exception. Defaults to False. 46 | """ 47 | raise NotImplementedError 48 | 49 | 50 | class AbstractIterativeExperiment(AbstractExperiment): 51 | @abc.abstractmethod 52 | def iterate(self, cw_config: dict, rep: int, n: int) -> dict: 53 | """needs to be implemented by subclass. 54 | The iteration procedure. 
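Called once per iteration n of a repetition. The returned result dict is enriched by run() with timestamp, repetition and iteration counters and then passed to all configured loggers.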
55 | 56 | Arguments: 57 | cw_config {dict} -- clusterwork experiment configuration 58 | rep {int} -- repitition counter 59 | n {int} -- iteration counter 60 | 61 | Returns: 62 | dict -- result map 63 | """ 64 | raise NotImplementedError 65 | 66 | @abc.abstractmethod 67 | def save_state(self, cw_config: dict, rep: int, n: int) -> None: 68 | """needs to be implemented by subclass. 69 | Intended to save an intermediate state after each iteration. 70 | Arguments: 71 | cw_config {dict} -- clusterwork experiment configuration 72 | rep {int} -- repitition counter 73 | n {int} -- [description] 74 | """ 75 | raise NotImplementedError 76 | 77 | def run(self, cw_config: dict, rep: int, logger: cw_logging.LoggerArray) -> None: 78 | for n in range(cw_config["iterations"]): 79 | surrender = False 80 | try: 81 | res = self.iterate(cw_config, rep, n) 82 | except ExperimentSurrender as e: 83 | res = e.payload 84 | surrender = True 85 | 86 | res["ts"] = dt.datetime.now() 87 | res["rep"] = rep 88 | res["iter"] = n 89 | logger.process(res) 90 | 91 | self.save_state(cw_config, rep, n) 92 | 93 | if surrender: 94 | raise ExperimentSurrender() 95 | -------------------------------------------------------------------------------- /cw2/job.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Dict, List, Type 3 | 4 | from cw2 import cw_error, experiment 5 | from cw2.cw_config import cw_conf_keys as KEYS 6 | from cw2.cw_data import cw_logging 7 | 8 | 9 | class Job: 10 | """Class defining a computation job. 11 | Can contain 1..n tasks. Each job should encapsulate all information necessary for execution. 12 | A task is an experiment configuration with unique repetition idx. 13 | """ 14 | 15 | def __init__( 16 | self, 17 | tasks: List[Dict], 18 | exp_cls: experiment.AbstractExperiment.__class__, 19 | logger: cw_logging.AbstractLogger, 20 | delete_old_files: bool = False, 21 | root_dir: str = "", 22 | read_only: bool = False, 23 | ): 24 | self.tasks = tasks 25 | 26 | if exp_cls is not None: 27 | self.exp = exp_cls() 28 | self.logger = logger 29 | 30 | self.n_parallel = 1 31 | if KEYS.REPS_PARALL in tasks[0]: 32 | self.n_parallel = tasks[0][KEYS.REPS_PARALL] 33 | 34 | self._root_dir = root_dir 35 | 36 | if not read_only: 37 | self.__create_experiment_directory(tasks, delete_old_files, root_dir) 38 | 39 | def __create_experiment_directory( 40 | self, tasks: List[Dict], delete_old_files=False, root_dir="" 41 | ): 42 | """internal function creating the directories in which the job will write its data. 43 | 44 | Args: 45 | task (List[attrdict.Attrdict]): a list of experiment tasks 46 | delete_old_files (bool, optional): Should the directory be emptied beforehand?. Defaults to False. 47 | root_dir (str, optional): [description]. Defaults to "". 48 | """ 49 | for conf in tasks: 50 | # create experiment path and subdir 51 | os.makedirs(os.path.join(root_dir, conf[KEYS.PATH]), exist_ok=True) 52 | 53 | # create a directory for the log path 54 | os.makedirs(os.path.join(root_dir, conf[KEYS.LOG_PATH]), exist_ok=True) 55 | 56 | # create log path for each repetition 57 | rep_path = os.path.join(root_dir, conf[KEYS.i_REP_LOG_PATH]) 58 | 59 | # XXX: Disable Delete for now 60 | """ 61 | if delete_old_files: 62 | pass 63 | """ 64 | os.makedirs(rep_path, exist_ok=True) 65 | 66 | def run_task(self, c: Dict, overwrite: bool): 67 | """Execute a single task of the job. 
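Runs the experiment's initialize(), run() and finalize() for one repetition. If results already exist and overwrite is not set, the task is skipped; ExperimentSurrender and other exceptions are caught and logged so the remaining tasks can continue.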
68 | 69 | Args: 70 | c (attrdict.AttrDict): task configuration 71 | """ 72 | rep_path = c[KEYS.i_REP_LOG_PATH] 73 | r = c[KEYS.i_REP_IDX] 74 | print(rep_path) 75 | 76 | if not overwrite and self._check_task_exists(c, r): 77 | cw_logging.getLogger().warning( 78 | "Skipping run, as {} is not empty. Use -o to overwrite.".format( 79 | rep_path 80 | ) 81 | ) 82 | return 83 | 84 | surrender = None 85 | crash = False 86 | 87 | self.logger.initialize(c, r, rep_path) 88 | try: 89 | self.exp.initialize(c, r, self.logger) 90 | self.exp.run(c, r, self.logger) 91 | except cw_error.ExperimentSurrender as s: 92 | cw_logging.getLogger().warning("SURRENDER: {}".format(rep_path)) 93 | surrender = s 94 | except: 95 | crash = True 96 | cw_logging.getLogger().exception("EXCEPTION: {}".format(rep_path)) 97 | 98 | self.exp.finalize(surrender, crash) 99 | self.logger.finalize() 100 | 101 | def load_task(self, c: Dict) -> Dict: 102 | """Load the results of a single task. 103 | 104 | Args: 105 | c (attrdict.AttrDict): task configuration 106 | 107 | Returns: 108 | dict: the loaded data 109 | """ 110 | rep_path = os.path.join(self._root_dir, c[KEYS.i_REP_LOG_PATH]) 111 | r = c[KEYS.i_REP_IDX] 112 | self.logger.initialize(c, r, rep_path) 113 | return self.logger.load() 114 | 115 | def _check_task_exists(self, c: Dict, r: int) -> bool: 116 | """internal function. checks if the task has already been run in the past. 117 | 118 | Args: 119 | c (attrdict.AttrDict): task configuration 120 | 121 | Returns: 122 | bool: True if the repetition was already run 123 | """ 124 | rep_path = c[KEYS.i_REP_LOG_PATH] 125 | return len(os.listdir(rep_path)) != 0 126 | 127 | 128 | class JobFactory: 129 | """Facotry class to create single jobs from experiment configuration. 130 | Specifially used to map experiment repetitions to Jobs. 131 | """ 132 | 133 | def __init__( 134 | self, 135 | exp_cls: Type[experiment.AbstractExperiment], 136 | logger: cw_logging.AbstractLogger, 137 | delete_old_files: bool = False, 138 | root_dir: str = "", 139 | read_only: bool = False, 140 | ): 141 | self.exp_cls = exp_cls 142 | self.logger = logger 143 | self.delete_old_files = delete_old_files 144 | self.root_dir = root_dir 145 | self.read_only = read_only 146 | 147 | def _group_exp_tasks(self, task_confs: List[Dict]) -> Dict: 148 | """group tasks by experiment to access common attributes like reps_per_job 149 | 150 | Args: 151 | task_confs (List[attrdict.AttrDict]): list of all task configurations 152 | 153 | Returns: 154 | dict: dictionary of task configurations grouped by name. 155 | """ 156 | grouped_exps = {} 157 | for t in task_confs: 158 | name = t[KEYS.NAME] 159 | if name not in grouped_exps: 160 | grouped_exps[name] = [] 161 | grouped_exps[name].append(t) 162 | return grouped_exps 163 | 164 | def _divide_tasks(self, task_confs: List[Dict]) -> List[List[Dict]]: 165 | """internal function to divide experiment repetitions into sets of repetitions. 166 | Dependent on configured reps_per_job attribute. Each set of repetitions will be one job. 
167 | 168 | Args: 169 | task_confs (List[attrdict.AttrDict]): List of task configurations 170 | 171 | Returns: 172 | List[List[attrdict.AttrDict]]: a list containing all subpackages of tasks as lists 173 | """ 174 | grouped_exps = self._group_exp_tasks(task_confs) 175 | tasks = [] 176 | 177 | for exp_name in grouped_exps: 178 | exp_group = grouped_exps[exp_name] 179 | 180 | max_rep = len(exp_group) 181 | 182 | # Use 1 Repetition per job if not defined otherwise 183 | rep_portion = 1 184 | if KEYS.REPS_P_JOB in exp_group[0]: 185 | rep_portion = exp_group[0][KEYS.REPS_P_JOB] 186 | 187 | for start_rep in range(0, max_rep, rep_portion): 188 | tasks.append(exp_group[start_rep : start_rep + rep_portion]) 189 | return tasks 190 | 191 | def create_jobs(self, exp_configs: List[Dict]) -> List[Job]: 192 | """creates a list of all jobs. 193 | 194 | Args: 195 | exp_configs (List[attrdict.AttrDict]): list of all defined experiment configurations. 196 | 197 | Returns: 198 | List[Job]: list of configured jobs. 199 | """ 200 | task_list = self._divide_tasks(exp_configs) 201 | joblist = [] 202 | for task in task_list: 203 | j = Job( 204 | task, 205 | self.exp_cls, 206 | self.logger, 207 | self.delete_old_files, 208 | self.root_dir, 209 | self.read_only, 210 | ) 211 | joblist.append(j) 212 | return joblist 213 | -------------------------------------------------------------------------------- /cw2/scheduler.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import concurrent.futures 3 | import multiprocessing 4 | import os 5 | import socket 6 | import warnings 7 | from typing import List 8 | 9 | from joblib import Parallel, delayed 10 | 11 | from cw2 import cw_error, job 12 | from cw2.cw_config import cw_conf_keys as KEYS 13 | from cw2.cw_config import cw_config 14 | from cw2.cw_slurm import cw_slurm 15 | 16 | 17 | class AbstractScheduler(abc.ABC): 18 | def __init__(self, conf: cw_config.Config = None): 19 | self.joblist = None 20 | self.config = conf 21 | 22 | def assign(self, joblist: List[job.Job]) -> None: 23 | """assigns the scheduler a list of jobs to execute 24 | 25 | Arguments: 26 | joblist {List[job.AbstractJob]} -- list of configured and implemented jobs 27 | """ 28 | self.joblist = joblist 29 | 30 | @abc.abstractmethod 31 | def run(self, overwrite=False): 32 | """the scheduler begins to execute all assigned jobs 33 | 34 | Args: 35 | overwrite (bool, optional): overwrite flag. can be passed to the job. Defaults to False. 36 | """ 37 | raise NotImplementedError 38 | 39 | 40 | class GPUDistributingLocalScheduler(AbstractScheduler): 41 | def __init__(self, conf: cw_config.Config = None): 42 | super(GPUDistributingLocalScheduler, self).__init__(conf=conf) 43 | self._total_num_gpus = int( 44 | conf.slurm_config["sbatch_args"]["gres"].rsplit(":", 1)[1] 45 | ) 46 | self._gpus_per_rep = conf.slurm_config["gpus_per_rep"] 47 | self._queue_elements = int(self._total_num_gpus / self._gpus_per_rep) 48 | 49 | print( 50 | "GPUDistributingLocalScheduler: {} GPUs available, {} GPUs per rep, {} queue elements".format( 51 | self._total_num_gpus, self._gpus_per_rep, self._queue_elements 52 | ) 53 | ) 54 | 55 | if self._gpus_per_rep >= 1.0: 56 | assert self._gpus_per_rep == int( 57 | self._gpus_per_rep 58 | ), "gpus_per_rep must be integer" 59 | 60 | @staticmethod 61 | def use_distributed_gpu_scheduling(conf: cw_config.Config) -> bool: 62 | if conf.slurm_config is None: 63 | return False 64 | # Use if 65 | # 1.) GPUs Requested 66 | # 2.) 
Number of GPUs per rep specified 67 | # 3.) Number of GPUs per rep != total number of gpus requested 68 | gpus_requested = "gres" in conf.slurm_config.get("sbatch_args", "DUMMY_DEFAULT") 69 | gpus_per_rep_specified = "gpus_per_rep" in conf.slurm_config 70 | 71 | if gpus_requested: 72 | num_gpus_requested = int( 73 | conf.slurm_config["sbatch_args"]["gres"].rsplit(":", 1)[1] 74 | ) 75 | # e.g. gres=gpu:4 or gres=gpu:full:4 76 | else: 77 | num_gpus_requested = 0 78 | 79 | use_distributed_gpu_scheduling = ( 80 | gpus_requested 81 | and gpus_per_rep_specified 82 | and num_gpus_requested != conf.slurm_config["gpus_per_rep"] 83 | ) 84 | 85 | if not use_distributed_gpu_scheduling: 86 | on_horeka_gpu = ( 87 | "hkn" in socket.gethostname() 88 | and conf.slurm_config["partition"] == "accelerated" 89 | ) 90 | if on_horeka_gpu: 91 | assert ( 92 | num_gpus_requested == 4 93 | ), "On HoreKA, you must request 4 GPUs (gres=gpu:4)" 94 | assert ( 95 | not on_horeka_gpu 96 | ), "You are on HoreKA and not using the GPU scheduler, don't! " 97 | 98 | return use_distributed_gpu_scheduling 99 | 100 | @staticmethod 101 | def get_gpu_str(queue_idx: int, gpus_per_rep: float) -> str: 102 | if gpus_per_rep >= 1: 103 | assert ( 104 | int(gpus_per_rep) == gpus_per_rep 105 | ), "gpus_per_rep must be integer if >= 1" 106 | gpus_per_rep = int(gpus_per_rep) 107 | return ("{}," * gpus_per_rep).format( 108 | *[queue_idx * gpus_per_rep + i for i in range(gpus_per_rep)] 109 | )[:-1] 110 | else: 111 | return str(int(queue_idx * gpus_per_rep) + 0.01) 112 | 113 | 114 | class MPGPUDistributingLocalScheduler(GPUDistributingLocalScheduler): 115 | def run(self, overwrite: bool = False): 116 | num_parallel = self.joblist[0].n_parallel 117 | for j in self.joblist: 118 | assert ( 119 | j.n_parallel == num_parallel 120 | ), "All jobs in list must have same n_parallel" 121 | assert j.n_parallel == self._queue_elements, ( 122 | "Mismatch between GPUs Queue Elements and Jobs executed in" 123 | "parallel. Fix for optimal resource usage!!" 
124 | ) 125 | 126 | with multiprocessing.Pool(processes=num_parallel) as pool: 127 | # setup gpu resource queue 128 | m = multiprocessing.Manager() 129 | gpu_queue = m.Queue(maxsize=self._queue_elements) 130 | for i in range(self._queue_elements): 131 | gpu_queue.put(i) 132 | 133 | for j in self.joblist: 134 | for c in j.tasks: 135 | pool.apply_async( 136 | MPGPUDistributingLocalScheduler._execute_task, 137 | (j, c, gpu_queue, self._gpus_per_rep, overwrite), 138 | ) 139 | pool.close() 140 | pool.join() 141 | 142 | @staticmethod 143 | def _execute_task( 144 | j: job.Job, 145 | c: dict, 146 | q: multiprocessing.Queue, 147 | gpus_per_rep: int, 148 | overwrite: bool = False, 149 | ): 150 | queue_idx = q.get() 151 | gpu_str = MPGPUDistributingLocalScheduler.get_gpu_str(queue_idx, gpus_per_rep) 152 | try: 153 | os.environ["CUDA_VISIBLE_DEVICES"] = gpu_str 154 | j.run_task(c, overwrite) 155 | except cw_error.ExperimentSurrender as _: 156 | return 157 | finally: 158 | q.put(queue_idx) 159 | 160 | 161 | class HOREKAAffinityGPUDistributingLocalScheduler(GPUDistributingLocalScheduler): 162 | def __init__(self, conf: cw_config.Config = None): 163 | super(HOREKAAffinityGPUDistributingLocalScheduler, self).__init__(conf=conf) 164 | 165 | total_cpus = conf.slurm_config["cpus-per-task"] * conf.slurm_config["ntasks"] 166 | self._cpus_per_rep = total_cpus // self._queue_elements 167 | 168 | assert ( 169 | self._cpus_per_rep > 0 170 | ), "Not enough CPUs for the number of GPUs requested" 171 | 172 | def run(self, overwrite: bool = False): 173 | print("Seeing CPUs:", os.sched_getaffinity(0)) 174 | num_parallel = self.joblist[0].n_parallel 175 | for j in self.joblist: 176 | assert ( 177 | j.n_parallel == num_parallel 178 | ), "All jobs in list must have same n_parallel" 179 | assert j.n_parallel == self._queue_elements, ( 180 | "Mismatch between GPUs Queue Elements and Jobs executed in" 181 | "parallel. Fix for optimal resource usage!!" 
182 | ) 183 | 184 | with concurrent.futures.ProcessPoolExecutor( 185 | max_workers=num_parallel, 186 | ) as pool: 187 | # setup gpu resource queue 188 | m = multiprocessing.Manager() 189 | gpu_queue = m.Queue(maxsize=self._queue_elements) 190 | for i in range(self._queue_elements): 191 | gpu_queue.put(i) 192 | 193 | for j in self.joblist: 194 | for c in j.tasks: 195 | pool.submit( 196 | HOREKAAffinityGPUDistributingLocalScheduler._execute_task, 197 | j, 198 | c, 199 | gpu_queue, 200 | self._gpus_per_rep, 201 | self._cpus_per_rep, 202 | overwrite, 203 | ) 204 | 205 | @staticmethod 206 | def _execute_task( 207 | j: job.Job, 208 | c: dict, 209 | q: multiprocessing.Queue, 210 | gpus_per_rep: int, 211 | cpus_per_rep: int, 212 | overwrite: bool = False, 213 | ): 214 | print("Seeing CPUs:", os.sched_getaffinity(0)) 215 | queue_idx = q.get() 216 | gpu_str = HOREKAAffinityGPUDistributingLocalScheduler.get_gpu_str( 217 | queue_idx, gpus_per_rep 218 | ) 219 | cpus = set(range(queue_idx * cpus_per_rep, (queue_idx + 1) * cpus_per_rep)) 220 | print("Job {}: Using GPUs: {} and CPUs: {}".format(queue_idx, gpu_str, cpus)) 221 | try: 222 | os.sched_setaffinity(0, cpus) 223 | c[KEYS.i_CPU_CORES] = cpus 224 | os.environ["CUDA_VISIBLE_DEVICES"] = gpu_str 225 | j.run_task(c, overwrite) 226 | except cw_error.ExperimentSurrender as _: 227 | return 228 | finally: 229 | q.put(queue_idx) 230 | 231 | 232 | class KlusterThreadLimitingScheduler(GPUDistributingLocalScheduler): 233 | def __init__(self, conf: cw_config.Config = None): 234 | super(KlusterThreadLimitingScheduler, self).__init__(conf=conf) 235 | total_cpus = conf.slurm_config["cpus-per-task"] * conf.slurm_config["ntasks"] 236 | self._num_threads = total_cpus // self._queue_elements 237 | print("Using {} threads per Rep".format(self._num_threads)) 238 | 239 | def run(self, overwrite: bool = False): 240 | num_parallel = self.joblist[0].n_parallel 241 | for j in self.joblist: 242 | assert ( 243 | j.n_parallel == num_parallel 244 | ), "All jobs in list must have same n_parallel" 245 | assert j.n_parallel == self._queue_elements, ( 246 | "Mismatch between GPUs Queue Elements and Jobs executed in" 247 | "parallel. Fix for optimal resource usage!!" 
248 | ) 249 | 250 | with multiprocessing.Pool(processes=num_parallel) as pool: 251 | # setup gpu resource queue 252 | m = multiprocessing.Manager() 253 | gpu_queue = m.Queue(maxsize=self._queue_elements) 254 | for i in range(self._queue_elements): 255 | gpu_queue.put(i) 256 | 257 | for j in self.joblist: 258 | for c in j.tasks: 259 | args = ( 260 | j, 261 | c, 262 | gpu_queue, 263 | self._gpus_per_rep, 264 | self._num_threads, 265 | overwrite, 266 | ) 267 | pool.apply_async(KlusterThreadLimitingScheduler._execute_task, args) 268 | pool.close() 269 | pool.join() 270 | 271 | @staticmethod 272 | def _execute_task( 273 | j: job.Job, 274 | c: dict, 275 | q: multiprocessing.Queue, 276 | gpus_per_rep: int, 277 | num_threads: int, 278 | overwrite: bool = False, 279 | ): 280 | queue_idx = q.get() 281 | gpu_str = KlusterThreadLimitingScheduler.get_gpu_str(queue_idx, gpus_per_rep) 282 | try: 283 | os.environ["MKL_NUM_THREADS"] = str(num_threads) 284 | os.environ["NUMEXPR_NUM_THREADS"] = str(num_threads) 285 | os.environ["OMP_NUM_THREADS"] = str(num_threads) 286 | # Ok, that's not so nice, but I did not find better way yet 287 | try: 288 | import torch 289 | 290 | torch.set_num_threads(num_threads) 291 | except ImportError: 292 | pass 293 | 294 | os.environ["CUDA_VISIBLE_DEVICES"] = gpu_str 295 | j.run_task(c, overwrite) 296 | except cw_error.ExperimentSurrender as _: 297 | return 298 | finally: 299 | q.put(queue_idx) 300 | 301 | 302 | def get_gpu_scheduler_cls(scheduler: str): 303 | if scheduler == "mp": 304 | return MPGPUDistributingLocalScheduler 305 | elif scheduler == "horeka": 306 | return HOREKAAffinityGPUDistributingLocalScheduler 307 | elif scheduler == "kluster": 308 | return KlusterThreadLimitingScheduler 309 | else: 310 | raise NotImplementedError 311 | 312 | 313 | class CpuDistributingLocalScheduler(AbstractScheduler): 314 | def __init__(self, conf: cw_config.Config = None): 315 | super(CpuDistributingLocalScheduler, self).__init__(conf=conf) 316 | self._total_num_cpus = ( 317 | conf.slurm_config["cpus-per-task"] * conf.slurm_config["ntasks"] 318 | ) 319 | self._cpus_per_rep = conf.slurm_config["cpus_per_rep"] 320 | assert self._cpus_per_rep == int( 321 | self._cpus_per_rep 322 | ), "cpus_per_rep must be integer" 323 | self._queue_elements = int(self._total_num_cpus / self._cpus_per_rep) 324 | print( 325 | "CPUDistributingLocalScheduler: {} CPUs available, {} CPUs per rep, {} queue elements".format( 326 | self._total_num_cpus, self._cpus_per_rep, self._queue_elements 327 | ) 328 | ) 329 | 330 | def run(self, overwrite: bool = False): 331 | print("Seeing CPUs:", os.sched_getaffinity(0)) 332 | num_parallel = self.joblist[0].n_parallel 333 | for j in self.joblist: 334 | assert ( 335 | j.n_parallel == num_parallel 336 | ), "All jobs in list must have same n_parallel" 337 | assert j.n_parallel == self._queue_elements, ( 338 | "Mismatch between CPUs Queue Elements and Jobs executed in" 339 | "parallel. Fix for optimal resource usage!!" 
340 | ) 341 | 342 | with concurrent.futures.ProcessPoolExecutor( 343 | max_workers=num_parallel, 344 | ) as pool: 345 | # setup gpu resource queue 346 | m = multiprocessing.Manager() 347 | cpu_queue = m.Queue(maxsize=self._queue_elements) 348 | for i in range(self._queue_elements): 349 | cpu_queue.put(i) 350 | 351 | for j in self.joblist: 352 | for c in j.tasks: 353 | pool.submit( 354 | CpuDistributingLocalScheduler._execute_task, 355 | j, 356 | c, 357 | cpu_queue, 358 | self._cpus_per_rep, 359 | overwrite, 360 | ) 361 | 362 | @staticmethod 363 | def _execute_task( 364 | j: job.Job, 365 | c: dict, 366 | q: multiprocessing.Queue, 367 | cpus_per_rep: int, 368 | overwrite: bool = False, 369 | ): 370 | print("Seeing CPUs:", os.sched_getaffinity(0)) 371 | queue_idx = q.get() 372 | cpus = set(range(queue_idx * cpus_per_rep, (queue_idx + 1) * cpus_per_rep)) 373 | print("Job {}: Using CPUs: {}".format(queue_idx, cpus)) 374 | try: 375 | os.sched_setaffinity(0, cpus) 376 | c[KEYS.i_CPU_CORES] = cpus 377 | j.run_task(c, overwrite) 378 | except cw_error.ExperimentSurrender as _: 379 | return 380 | finally: 381 | q.put(queue_idx) 382 | 383 | @staticmethod 384 | def use_distributed_cpu_scheduling(conf: cw_config.Config) -> bool: 385 | if conf.slurm_config is None: 386 | return False 387 | else: 388 | scheduler = conf.slurm_config.get("scheduler", None) 389 | return scheduler == "cpu_distribute" 390 | 391 | 392 | class LocalScheduler(AbstractScheduler): 393 | def run(self, overwrite: bool = False): 394 | for j in self.joblist: 395 | Parallel(n_jobs=j.n_parallel)( 396 | delayed(self.execute_task)(j, c, overwrite) for c in j.tasks 397 | ) 398 | 399 | def execute_task(self, j: job.Job, c: dict, overwrite: bool = False): 400 | try: 401 | j.run_task(c, overwrite) 402 | except cw_error.ExperimentSurrender as _: 403 | return 404 | 405 | 406 | class SlurmScheduler(AbstractScheduler): 407 | def run(self, overwrite: bool = False): 408 | cw_slurm.run_slurm(self.config, len(self.joblist)) 409 | -------------------------------------------------------------------------------- /cw2/util.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import os 3 | import re 4 | 5 | try: 6 | from collections.abc import Mapping, MutableMapping, MutableSequence # noqa 7 | except ImportError: 8 | from collections import Mapping, MutableMapping, MutableSequence # noqa 9 | 10 | 11 | def deep_update(base_dict: dict, update_dict: dict) -> dict: 12 | """Updates the base dictionary with corresponding values from the update dictionary, including nested collections. 13 | Not updated values are kept as is. 
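For example (illustrative values): deep_update({"a": {"x": 1, "y": 2}}, {"a": {"y": 3}}) returns {"a": {"x": 1, "y": 3}}.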
14 | 15 | Arguments: 16 | base_dict {dict} -- dictionary to be updated 17 | update_dict {dict} -- dictianry holding update values 18 | 19 | Returns: 20 | dict -- dictanry with updated values 21 | """ 22 | for key, value in update_dict.items(): 23 | # Update Recursively 24 | if isinstance(value, Mapping): 25 | branch = deep_update(base_dict.get(key, {}), value) 26 | base_dict[key] = branch 27 | else: 28 | base_dict[key] = update_dict[key] 29 | return base_dict 30 | 31 | 32 | def flatten_dict(d, parent_key="", sep="_"): 33 | items = [] 34 | for k, v in d.items(): 35 | new_key = parent_key + sep + k if parent_key else k 36 | if isinstance(v, MutableMapping): 37 | items.extend(flatten_dict(v, new_key, sep=sep).items()) 38 | elif isinstance(v, MutableSequence): 39 | keys = map(lambda i: new_key + "_" + str(i), range(len(v))) 40 | items.extend(zip(keys, v)) 41 | else: 42 | items.append((new_key, v)) 43 | return dict(items) 44 | 45 | 46 | def flatten_dict_to_tuple_keys(d: MutableMapping): 47 | flat_dict = {} 48 | for k, v in d.items(): 49 | if isinstance(v, MutableMapping): 50 | sub_dict = flatten_dict_to_tuple_keys(v) 51 | flat_dict.update({(k, *sk): sv for sk, sv in sub_dict.items()}) 52 | 53 | elif isinstance(v, MutableSequence): 54 | flat_dict[(k,)] = v 55 | 56 | return flat_dict 57 | 58 | 59 | def insert_deep_dictionary(d: MutableMapping, t: tuple, value): 60 | if type(t) is tuple: 61 | if len(t) == 1: # tuple contains only one key 62 | d[t[0]] = value 63 | else: # tuple contains more than one key 64 | if t[0] not in d: 65 | d[t[0]] = dict() 66 | insert_deep_dictionary(d[t[0]], t[1:], value) 67 | else: 68 | d[t] = value 69 | 70 | 71 | def append_deep_dictionary(d: MutableMapping, t: tuple, value): 72 | if type(t) is tuple: 73 | if len(t) == 1: # tuple contains only one key 74 | if t[0] not in d: 75 | d[t[0]] = [] 76 | d[t[0]].append(value) 77 | else: # tuple contains more than one key 78 | if t[0] not in d: 79 | d[t[0]] = dict() 80 | append_deep_dictionary(d[t[0]], t[1:], value) 81 | else: 82 | d[t] = value 83 | 84 | 85 | def format_time(time_in_secs: float) -> str: 86 | return str(datetime.timedelta(seconds=time_in_secs)) 87 | 88 | 89 | def shorten_param(_param_name): 90 | name_parts = _param_name.split(".") 91 | shortened_parts = ".".join(map(lambda s: s[:3], name_parts[:-1])) 92 | # also handle cases where the leaf name contains '__' then splitting at '_' yields an empty '' string element 93 | shortened_leaf = "".join(map(lambda s: '' if len(s) <= 0 else s[0], name_parts[-1].split("_"))) 94 | if shortened_parts: 95 | return shortened_parts + "." + shortened_leaf 96 | else: 97 | return shortened_leaf 98 | 99 | 100 | def get_size(start_path: str): 101 | """recursively compute size of a directory 102 | 103 | Args: 104 | start_path (str): directory path 105 | 106 | Returns: 107 | size in MByte 108 | """ 109 | total_size = 0 110 | for dirpath, _, filenames in os.walk(start_path): 111 | for f in filenames: 112 | fp = os.path.join(dirpath, f) 113 | total_size += os.path.getsize(fp) 114 | return total_size / 1000000.0 115 | 116 | 117 | def check_subdir(parent: str, child: str) -> bool: 118 | """Check if the child is a subdirectory of the parent. 
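For example (illustrative paths): check_subdir("/tmp/exp", "/tmp/exp/code") returns True, while check_subdir("/tmp/exp", "/tmp/other") returns False.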
119 | 120 | Args: 121 | parent (str): Path of the suspected parent dir 122 | child (str): path of the suspected child dir 123 | 124 | Returns: 125 | bool: True if child is subdir of parent 126 | """ 127 | parent_path = os.path.abspath(parent) 128 | child_path = os.path.abspath(child) 129 | 130 | return os.path.commonpath([parent_path]) == os.path.commonpath( 131 | [parent_path, child_path] 132 | ) 133 | 134 | 135 | def convert_param_names(_param_names: list, values: list) -> str: 136 | """create new shorthand name derived from parameter and value association 137 | Arguments: 138 | _param_names (list): parameter names for the experiment 139 | values (list): concrete values for each parameter 140 | 141 | Returns: 142 | str: shorthand name 143 | """ 144 | 145 | _converted_name = "_".join( 146 | "{}{}".format(shorten_param(k), v) for k, v in zip(_param_names, values) 147 | ) 148 | # _converted_name = re.sub("[' \[\],()]", '', _converted_name) 149 | _converted_name = re.sub("[' ]", "", _converted_name) 150 | _converted_name = re.sub('["]', "", _converted_name) 151 | _converted_name = re.sub("[(\[]", "_", _converted_name) 152 | _converted_name = re.sub("[)\]]", "", _converted_name) 153 | _converted_name = re.sub("[,]", "_", _converted_name) 154 | return _converted_name 155 | 156 | 157 | def get_file_names_in_directory(directory: str) -> [str]: 158 | """ 159 | Get file names in given directory 160 | Args: 161 | directory: directory where you want to explore 162 | 163 | Returns: 164 | file names in a list 165 | 166 | """ 167 | file_names = None 168 | try: 169 | (_, _, file_names) = next(os.walk(directory)) 170 | if len(file_names) == 0: 171 | file_names = None 172 | except StopIteration as e: 173 | print("Cannot read files from directory: ", directory) 174 | return file_names 175 | -------------------------------------------------------------------------------- /doc/01_quickstart.md: -------------------------------------------------------------------------------- 1 | # 1. Quickstart Guide 2 | To deploy an existing project using **cw2**, the following highlevel steps are required: 3 | 4 | - [1. Quickstart Guide](#1-quickstart-guide) 5 | - [1.1. Experiment Implementation](#11-experiment-implementation) 6 | - [1.2. Main() Function](#12-main-function) 7 | - [1.3. Config YAML](#13-config-yaml) 8 | - [1.4. Program Execution](#14-program-execution) 9 | 10 | 11 | This quickstart guide is intended to help you quickly deploy your existing project. To develop a more robust understanding of the mechanisms behind **cw2**, please refer to the corresponding sections of the [User Guide](./). 12 | 13 | You can find barebones templates in the [template folder](../cw2/../templates/). 14 | 15 | ## 1.1. Experiment Implementation 16 | **cw2** requires that your program logic implements the [`cw2.experiment.AbstractExperiment`](../cw2/experiment.py) interface. 17 | 18 | Lets assume you already have a working python project `existing_project.py` 19 | ```python 20 | # existing_project.py 21 | def project_main(): 22 | # perform my program 23 | # ... 24 | 25 | if __name__ == "__main__": 26 | project_main() 27 | ``` 28 | 29 | Create a new file to implement the `AbstractExperiment` interface, e.g. 
`MY_CW_MAIN.py`, and call your existing project's main (`project_main`) inside the experiments `run()` function: 30 | 31 | ```python 32 | # MY_CW_MAIN.py 33 | from cw2 import experiment, cw_error 34 | from cw2.cw_data import cw_logging 35 | 36 | import existing_project 37 | 38 | class MyExperiment(experiment.AbstractExperiment): 39 | # ... 40 | 41 | def initialize(self, config: dict, rep: int, logger: cw_logging.LoggerArray) -> None: 42 | # Skip for Quickguide 43 | pass 44 | 45 | def run(self, config: dict, rep: int, logger: cw_logging.LoggerArray) -> None: 46 | # Perform your existing task 47 | existing_project.project_main() 48 | 49 | def finalize(self, surrender: cw_error.ExperimentSurrender = None, crash: bool = False): 50 | # Skip for Quickguide 51 | pass 52 | ``` 53 | For more information on the experiment interface: [Experiment Class](02_experiment.md) 54 | ## 1.2. Main() Function 55 | 56 | As with any Python program, you need to define a `__main__` function. 57 | 58 | It creates a `ClusterWork` instance with your experiment. If you want to use any compatible [loggers](07_logging.md), you can also add them here. Finally it will start experiment: 59 | 60 | ```Python 61 | from cw2 import cluster_work 62 | 63 | if __name__ == "__main__": 64 | # Give the MyExperiment Class, not MyExperiment() Object!! 65 | cw = cluster_work.ClusterWork(MyExperiment) 66 | 67 | # Optional: Add loggers 68 | cw.add_logger(...) 69 | 70 | # RUN! 71 | cw.run() 72 | ``` 73 | The easiest location for this main function is in the same file as your experiment implementation, e.g. `MY_CW_MAIN.py` 74 | 75 | For more information on Logging: [Logging Results](07_logging.md) 76 | 77 | ## 1.3. Config YAML 78 | To qucikly deploy your first **cw2** experiment, create a simple YAML configuration file: 79 | 80 | ```yaml 81 | --- 82 | # Experiment 1 83 | name: "experiment_name" 84 | 85 | # Required: Can also be set in DEFAULT 86 | path: "path/to/output_dir/" # location to save results in 87 | repetitions: 1 # number of times one set of parameters is run 88 | 89 | # Experiment Parameters: 90 | params: 91 | key: 'value' 92 | ``` 93 | 94 | We strongly recommend you read the [Config Guide](03_config.md) to better understand what the different options mean, and how you can use this file to efficiently define hyperparameter grids. 95 | 96 | 97 | ## 1.4. Program Execution 98 | To start an experiment locally, e.g. for testing: 99 | ```bash 100 | python3 MY_CW_MAIN.py YOUR_CONFIG.yml 101 | ``` 102 | 103 | To start an experiment on a slurm cluster: 104 | ```bash 105 | python3 MY_CW_MAIN.py YOUR_CONFIG.yml -s 106 | ``` 107 | 108 | For more information on slurm: [Slurm Guide](04_slurm.md) 109 | 110 | For more information on available CLI Arguments: [CLI at a Glance](11_cli_args.md) 111 | 112 | [Back to Overview](./) 113 | -------------------------------------------------------------------------------- /doc/02_experiment.md: -------------------------------------------------------------------------------- 1 | # 2. Experiment Class 2 | 3 | - [2. Experiment Class](#2-experiment-class) 4 | - [2.1. Initialize](#21-initialize) 5 | - [2.1.1 Can I use `__init__` for a global counter ? 
__**NO**__!!!](#211-can-i-use-__init__-for-a-global-counter--no) 6 | - [2.2 Run](#22-run) 7 | - [2.2.1 cw_config: dict](#221-cw_config-dict) 8 | - [2.2.2 rep: int](#222-rep-int) 9 | - [2.2.3 logger: LoggerArray](#223-logger-loggerarray) 10 | - [2.3 Finalize](#23-finalize) 11 | - [2.4 Iterative Experiment](#24-iterative-experiment) 12 | - [2.4.1 Iterate](#241-iterate) 13 | - [2.4.2 Save State](#242-save-state) 14 | 15 | To run your project with **cw2** you must implement the [`AbstractExperiment`](../cw2/experiment.py) interface. 16 | This ensures that you can run multiple repetitions (e.g. for numerically unstable experiments) in the same process during local execution, or deploy them massively parallelized on a computing cluster using slurm. 17 | 18 | This interface provides three functions 19 | 20 | - `initialize()` 21 | - `run()` 22 | - `finalize()` 23 | 24 | corresponding to three phases during program execution. In the abstract, a **cw2** run, whether running locally in a single thread or distributed using slurm, takes the form of: 25 | 26 | ```Python 27 | exp = AbstractExperiment() # Object is created once! __init__ is only called once!! 28 | 29 | 30 | for r in repetitions: # Can be parallelized or sequential! 31 | exp.initialize(...) # Initialize / Reset the experiment for each repetition / thread 32 | exp.run(...) # Execute experiment logic 33 | exp.finalize() # Finalize / Clean the experiment after each repetition / thread. Close all writers, etc. 34 | ``` 35 | 36 | A repetition is the repeated execution of an experiment with the exact same configuration of parameters. 37 | 38 | 39 | ## 2.1. Initialize 40 | The `initialize()` method should be used like the `__init__` constructor typically found on Python objects. It is called before each experiment execution, whereas the constructor is only called once at the very start. Because the experiment object is not instantiated anew for each execution, unwanted carry-over effects between executions might occur. Take the following example: 41 | 42 | ```python 43 | class FaultyExperiment(AbstractExperiment): 44 | def __init__(self): 45 | # Is set only once during lifetime 46 | self.speed_of_light = 300 # 1000 km / s 47 | 48 | def initialize(self, ...): 49 | self.distance_traveled = 0 50 | 51 | def run(self, ...): 52 | self.distance_traveled += self.speed_of_light 53 | # Activate Warp Speed: 54 | self.speed_of_light *= 2 55 | 56 | def finalize(self, ...): 57 | print("Repetition " + str(rep)) 58 | print(self.distance_traveled) 59 | ``` 60 | 61 | If you run this `FaultyExperiment` with three repetitions, you will get an output like: 62 | ``` 63 | Repetition 0: 64 | 300 65 | 66 | Repetition 1: 67 | 600 68 | 69 | Repetition 2: 70 | 1200 71 | ``` 72 | The `distance_traveled` sum gets reset to 0 at the start of each repetition. But `speed_of_light` is modified during the `run()` function, and this change persists across repetitions. 73 | 74 | ### 2.1.1 Can I use `__init__` for a global counter ? __**NO**__!!! 75 | When deploying on a computing cluster using slurm, every repetition is most likely executed in its own independent process. This results in a dual set of requirements for your experiment implementation: 76 | 77 | 1. Each experiment repetition should be independently deployable. Do not assume that you can access any results from an earlier repetition through `self.*` fields. The only kind of persistence you can rely on is writing results to disk. 78 | 2.
Do not rely on the Experiment instance being destroyed between repetitions. Always assume that `self.*` fields might carry leftover information unless explicitly (re)set in the `initialize()` method. 79 | 80 | ## 2.2 Run 81 | The `run()` method should implement the main logic / process of your project. There are no restrictions on what you can do here. As this function is probably the most important in your implementation, we want to discuss its parameters in more detail. 82 | 83 | ```python 84 | def run(self, cw_config: dict, rep: int, logger: cw_logging.LoggerArray) -> None: 85 | ... 86 | ``` 87 | ### 2.2.1 cw_config: dict 88 | `cw_config` is a dictionary containing an unrolled experiment configuration. Unrolled means that `grid` and `list` keywords have been resolved and the `DEFAULT` documents have been merged. 89 | Important keys of this `dict` for your implementation might be: 90 | - `params`: containing the unrolled `params` section of your configuration file. See [Configuration YAML File](03_config.md) for more information. 91 | - `_rep_log_path`: a path unique to this repetition. You can write your results / logs to this directory. It is guaranteed to exist and to be threadsafe. No other experimental run of your deployment will access this path. See [CW2 File System](05_files.md) for more information. 92 | 93 | ### 2.2.2 rep: int 94 | `rep` is an integer indicating the repetition number of this run. As the repetitions are mostly intended to repeat the same parameter combination for numerically unstable experiments, the most likely scenario to use this parameter is to seed a random number generator, e.g. 95 | 96 | ```python 97 | np.random.seed(rep) 98 | ``` 99 | 100 | The repetition number is not globally unique, meaning you cannot use the `rep` argument alone to save your results in a global database. 101 | Assume you have the following YAML configuration: 102 | 103 | ```yaml 104 | --- 105 | name: "exp_1" 106 | repetitions: 2 107 | 108 | grid: 109 | x: [1, 2] 110 | y: [3, 4] 111 | ``` 112 | 113 | The `grid` keyword will generate 2x2 = 4 parameter combinations with 2 repetitions each, resulting in a total of 8 runs. 114 | Assume an Experiment implementation with the following `run()` function: 115 | 116 | ```python 117 | def run(self, cw_config: dict, rep: int, ...): 118 | print(cw_config['params']) 119 | print(rep) 120 | ``` 121 | Output: 122 | ``` 123 | x: 1, y: 3 124 | rep: 0 125 | 126 | x: 1, y: 3 127 | rep: 1 128 | 129 | x: 2, y: 3 130 | rep: 0 131 | 132 | x: 2, y: 3 133 | rep: 1 134 | 135 | ... 136 | ``` 137 | Only the combination of `params` and `rep` is unique to each run, as is the `_rep_log_path`. 138 | 139 | 140 | ### 2.2.3 logger: LoggerArray 141 | `logger` is a [`LoggerArray`](../cw2/cw_data/cw_logging.py) object. If you have added any Logger objects, you can pass them your results / messages with 142 | ```python 143 | msg = {} 144 | logger.process(msg) 145 | ``` 146 | See [Logging Results](07_logging.md) for more information. 147 | 148 | ## 2.3 Finalize 149 | The finalize function is called after `run()` has finished, at the end of each repetition. The intention for this function is to close any opened writers / database connections, and maybe summarize the results from an (iterative) experiment execution. The function signature of `finalize()` differs from the other `AbstractExperiment` functions. 150 | 151 | ```python 152 | def finalize(self, surrender: ExperimentSurrender = None, crash: bool = False): 153 | ...
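    # Illustrative sketch (not from the cw2 sources) of what a finalize() body could
    # do with these two arguments. The `payload` attribute name is an assumption --
    # check cw_error.ExperimentSurrender for the actual field name.
    #
    #     if surrender is not None:
    #         summary = surrender.payload  # dict handed over when run() aborted early
    #         cw_logging.getLogger().info(summary)
    #     if crash:
    #         cw_logging.getLogger().warning("run() raised an uncaught exception")
    #     self.writer.close()  # hypothetical writer opened in initialize()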
154 | ``` 155 | 156 | If the `run()` function wants to abort early for whatever reason, e.g. converged loss function or any other kind of reason, the `run()` function can raise an [`ExperimentSurrender`](../cw2/cw_error.py) error. This custom error can take a `dict` as payload, which can then be accessed by the finalize. If you have different scenarios in which you want to abort an experimental run, this payload can be accessed through this `surrender` object by the `finalize()` function to react accordingly. See [Advanced Features & Parallelization](09_advanced.md) for more information. 157 | 158 | `crash` is a boolean indication if `initialize()` or `run()` encountered any error, which you did not catch in your implementation. **cw2** ensures that even if a critical error occurs in those two functions, `finalize()` still gets called to perform its shutdown procedure. Following repetitions / runs in the same process should therefore not be impacted by earlier errors. 159 | 160 | 161 | ## 2.4 Iterative Experiment 162 | If you have an experiment with an iterative process, e.g. a for-loop as main component in your `run()` method, you might want to implement the [`AbstractIterativeExperiment`](../cw2/experiment.py) interface. 163 | 164 | This interface comes with additional functionality. For example, you can define the number of iterations in your YAML config file with the `iterations` keyword, and **cw2** handles the for-loop for you. It also provides a [`PandasLogger`](../cw2/cw_data/cw_pd_logger.py) to write your results after each iteration into an excel like structure. 165 | 166 | ### 2.4.1 Iterate 167 | Instead of implementing the `run()` method, you have to implement `iterate()`: 168 | 169 | ```python 170 | def iterate(self, cw_config: dict, rep: int, n: int) -> dict: 171 | return {"Result": "Current Iteration is {}".format(n)} 172 | ``` 173 | In addition to the `cw_config` configuration object and `rep` repetition indicator, it also receives the current iteration `n`. This function should perform one single iteration of your process and return a dict with your results / messages / metrics you want to log. 174 | 175 | The following keys are already reserved: 176 | - `"ts"` timestamp of the iteration results 177 | - `"rep"` repetition counter 178 | - `"iter"` iteration counter 179 | 180 | You can again raise an [`ExperimentSurrender`](../cw2/cw_error.py) error to abort early. In this case, the payload of the error is used as the result for logging. 181 | 182 | ### 2.4.2 Save State 183 | After each `iterate()` call, the `save_state()` function is executed. 184 | It has the same parameters as the `iterate()` function, but does not return a result. 185 | 186 | You could use this function to save a snapshot / model of your experiment after each iteration. 187 | 188 | ```python 189 | def save_state(self, cw_config: dict, rep: int, n: int) -> None: 190 | # Save model every 50 iterations. 191 | if n % 50 == 0: 192 | self.model.to_disk(cw_config['_rep_log_path']) 193 | ``` 194 | 195 | 196 | [Back to Overview](./) 197 | -------------------------------------------------------------------------------- /doc/03_config.md: -------------------------------------------------------------------------------- 1 | # 3. Configuration YAML File 2 | - [3. Configuration YAML File](#3-configuration-yaml-file) 3 | - [3.1. Experiment Configuration](#31-experiment-configuration) 4 | - [3.1.1. Experiment Header](#311-experiment-header) 5 | - [3.1.2. 
Experiment Parameters](#312-experiment-parameters) 6 | - [3.1.2.1 Ablative Parameter Search](#3121-ablative-parameter-search) 7 | - [3.1.3. Recommended Practices: Experiment Configuration](#313-recommended-practices-experiment-configuration) 8 | - [3.1.3.1. Params is your safe space](#3131-params-is-your-safe-space) 9 | - [3.1.3.2. You dont want multiple DEFAULTS...](#3132-you-dont-want-multiple-defaults) 10 | - [3.2. SLURM Configuration](#32-slurm-configuration) 11 | - [3.3. Example Templates](#33-example-templates) 12 | - [3.4. Important Keys](#34-important-keys) 13 | 14 | To configure the execution of the experiment, you need to write a YAML-file. A YAML file consists several documents which begin with `---`: 15 | ```yaml 16 | --- 17 | # First Document 18 | 19 | 20 | --- 21 | # Second Document 22 | 23 | 24 | ``` 25 | 26 | For **cw2** we expect each yaml document to contain a key `name`: 27 | 28 | ```yaml 29 | --- 30 | # First Document 31 | name: "name_1" 32 | 33 | 34 | --- 35 | # Second Document 36 | name: "name_2" 37 | ``` 38 | 39 | The name is used to identify an experiment configuration and can be chosen freely, **EXCEPT** for these names: 40 | 1. `DEFAULT` defines a default configuration. It may only exist *once* in your YAML file. If some parameter settings are shared between your experiments, you can define them inside the `DEFAULT` document. Unless they are specified differently in a named experiment, the settings from the `DEFAULT` will be used. The `DEFAULT` document follows the same structure as a generic experiment configuration document. 41 | 42 | 1. `SLURM` defines a slurm configuration. It may only exist *once* in your YAML file. This document defines the relevant settings for the execution on a computing cluster, and are specific to each cluster. It follows its own special structure. 43 | 44 | 45 | ## 3.1. Experiment Configuration 46 | An experiment configuration (generic or default) has the following structure: 47 | 48 | ```yaml 49 | name: "experiment_name" 50 | 51 | # Experiment Header 52 | # ... 53 | 54 | # Experiment Parameters 55 | # ... 56 | ``` 57 | 58 | ### 3.1.1. Experiment Header 59 | 60 | ```yaml 61 | --- 62 | name: "experiment_name" 63 | 64 | # Required: Can also be set in DEFAULT 65 | path: "path/to/output_dir/" # path for saving the results 66 | repetitions: 5 # number of repeated runs for each parameter combination 67 | 68 | # Required for AbstractIterativeExperiments only. Can also be set in DEFAULT 69 | iterations: 1000 # number of iterations per repetition. 70 | 71 | # Optional: Can also be set in DEFAULT 72 | # Only change these values if you are sure you know what you are doing. 73 | reps_per_job: 1 # number of repetitions in each job. useful for paralellization. defaults to 1. 74 | reps_in_parallel: 1 # number of repetitions in each job that are executed in parallel. defaults to 1. 75 | 76 | 77 | # Experiment Parameters 78 | # ... 79 | # ... 80 | ``` 81 | **All fields can be defined in the `DEFAULT` document and do not need to be set in each experiment specifically.** 82 | 83 | If you want to understand the `reps_per_job` and `reps_in_parallel` settings, please read TODO: BACKGROUND KNOWLEDGE 84 | 85 | ### 3.1.2. Experiment Parameters 86 | The experiment parameter section is highly specific to your code and use case. You can freely define parameter names within the `params:` key, e.g.: 87 | ```yaml 88 | --- 89 | name: "DEFAULT": 90 | # ... 
all required fields 91 | 92 | 93 | --- 94 | name: "ComputerVision" 95 | # required fields are filled by DEFAULT 96 | 97 | # Experiment Parameters 98 | params: 99 | batchsize: 5 100 | pretrained: "imagenet" 101 | 102 | ``` 103 | 104 | You can freely define parameter names and the structure, such as nested parameters, or list values. 105 | 106 | You can use **cw2** to also quickly define a hyperparameter space using the `grid` or `list` keyword. This YAML file using `list` 107 | ```yaml 108 | --- 109 | name: "DEFAULT": 110 | # ... all required fields 111 | 112 | 113 | --- 114 | name: "CV-List" 115 | # required fields are filled by DEFAULT 116 | 117 | # Experiment Parameters 118 | list: 119 | batchsize: [3, 7] 120 | learning_rate: [0.4, 0.8] 121 | ``` 122 | 123 | 124 | is the same as if you had defined: 125 | ```yaml 126 | --- 127 | name: "DEFAULT": 128 | # ... all required fields 129 | 130 | 131 | --- 132 | name: "CV-list-3-04" 133 | # required fields are filled by DEFAULT 134 | 135 | # Experiment Parameters 136 | params: 137 | batchsize: 3 138 | learning_rate: 0.4 139 | 140 | --- 141 | name: "CV-list-7-08" 142 | # required fields are filled by DEFAULT 143 | 144 | # Experiment Parameters 145 | params: 146 | batchsize: 7 147 | learning_rate: 0.8 148 | ``` 149 | 150 | The `list` keyword requires all parameter sets to be of equal length and will combine every n-th value. The `grid` keyword will generate all possible combinations, i.e. in the above example 2x2 = 4 combinations: 151 | 152 | `(3, 0.4) (3, 0.8) (7, 0.4) (7, 0.8)`) 153 | 154 | You can also combine `grid` and `list` in the same experiment. For every `list` combination, the `grid` will be solved, resulting in a total number of `product('grid') * min(length('list'))` runs. 155 | 156 | 157 | The final experiment configurations combining all techniques could look like: 158 | ```yaml 159 | --- 160 | # DEFAULT parameters (Optional) 161 | name: "DEFAULT" # MUST BE 'DEFAULT' 162 | path: "/default/dir/" # location to save results in 163 | repetitions: 5 # number of times one set of parameters is run 164 | 165 | # Implementation default parameters 166 | # Can be overwritten by named experiments. 167 | params: 168 | net_architecture: "vgg16" 169 | 170 | 171 | --- 172 | # Experiment 1 173 | name: "VGG" 174 | 175 | # Required: 176 | # Repetitions are defined in DEFAULT 177 | path: "/vgg/results/" # overwrite DEFAULT setting 178 | 179 | # Experiment Parameters: 180 | # params.net_architecture from DEFAULT 181 | 182 | # Creates all combinations 183 | grid: 184 | learning_rate: [0.5] 185 | batchsize: [5, 10] 186 | 187 | 188 | --- 189 | # Experiment 2 190 | name: "AlexNet" 191 | 192 | # Required settings defined in DEFAULT 193 | 194 | # Experiment Parameters: 195 | params: 196 | net_architecture: "alex_net" # overwrite DEFAULT 197 | learning_rate: 0.9 # no combination tryout 198 | batch_size: 2 # no combination tryout 199 | ``` 200 | 201 | #### 3.1.2.1 Ablative Parameter Search 202 | A new, advanced option is the use of the `ablative` keyword. This mechanic is helpful if you want to estimate the impact of specific hyperparameters. 203 | **cw2** will only subsitute one parameter from the `ablative` section at a time. You can think of it as a shortcut to defining multiple default `params` sections quickly. 
204 | 205 | For example, the following experiment configuration 206 | 207 | ```yaml 208 | --- 209 | name: XYZ 210 | # Required settings defined in DEFAULT 211 | 212 | params: 213 | pretrained: 'imagenet' 214 | initialization: 'kmeans' 215 | 216 | grid: 217 | learning_rate: [0.3, 0.6] 218 | gamma: [1, 2, 3] 219 | 220 | ablative: 221 | pretrained: [False] 222 | initialization: ['random', 'softmax'] 223 | ``` 224 | will result in a total of 24 runs: 6 `grid` kombinations with default `params` settings, 6 with `pretrained: False`, 6 with `initialization: random` and an additional 6 with `initialization: softmax` 225 | 226 | As you can see, the keys under `ablative` are changed one at a time, but never multiple at once. 227 | 228 | **Attention!!** 229 | 230 | `ablative` keys are changed one at a time. You are responsible to supply "default" `params` for when the other parameters under the `ablative` keyword are exchanged. 231 | 232 | 233 | ### 3.1.3. Recommended Practices: Experiment Configuration 234 | 1. `params` is your safe space! 235 | 2. If you feel like you need multiple `DEFAULT` sections, you probably want multiple YAML files 236 | 237 | #### 3.1.3.1. Params is your safe space 238 | A common use case for **cw2** is the hyperparameter search for ML models. Often users only put the hyperparameters they search for into the `params` sections and keep their "constants", like training data location, outside. For example: 239 | 240 | ```yaml 241 | --- 242 | name: "THIS IS NOT RECOMMENDED" 243 | # Required settings 244 | # ... 245 | 246 | params: 247 | learning_rate: 0.3 248 | batch_size: 4 249 | 250 | training_data: "/my/dataset" 251 | speed_of_light: "c" 252 | ``` 253 | 254 | While this will probably not cause an error, I recommend you still define your constants inside the `params` sections. During runtime **cw2** will modify the internal configuration object. While it is highly unlikely, you might overwrite such an internal keyword, leading to unforeseen issues, especially as the software evolves. For now, internal keywords generally begin with an underscore (`_internal_keyword`) and should be avoided. 255 | 256 | To stay on the safe side, put all your custom parameters / arguments / constants inside the `params` section. **cw2** guarantees that all the values inside this section will not be altered without explicit user permission by using a combination keyword like `grid` or `list`. For example: 257 | 258 | ```yaml 259 | --- 260 | name: "THIS IS THE WAY" 261 | # Required settings 262 | # ... 263 | 264 | params: 265 | learning_rate: 0.3 266 | batch_size: 4 267 | training_data: "/my/dataset" 268 | speed_of_light: "c" 269 | ``` 270 | 271 | #### 3.1.3.2. You dont want multiple DEFAULTS... 272 | When running the same experiments for a long time, you may try out different parameters. Especially in the beginning, it is easier to extend the YAML file by adding a new document to the bottom of the file. After a while, you might find you have two "clusters" of configurations, maybe two algorithms / models, that you compare to each other. These models might require very different parameters, and it might not even be possible to share a common `DEFAULT` setting between those two classes. 273 | 274 | In this case, I recommend you split the YAML file into two files, one for each approach. As you are most likely deploying such big experiments on a computing cluster using slurm, you do not have to wait for the results of the first set of tasks before starting the second. 
275 | 276 | ```console 277 | # Naive Approach 278 | u@cluster:~$ python experiment.py BIG_OLD_LEGACY.yml -s 279 | 280 | # Split Approach 281 | u@cluster:~$ python experiment.py model_1.yml -s 282 | u@cluster:~$ python experiment.py model_2.yml -s 283 | ``` 284 | 285 | A new feature to help alleviate this problem, is the linking / import of external yaml files, see [Linking External YAML Files](09_advanced.md). 286 | 287 | 288 | ## 3.2. SLURM Configuration 289 | If you want to run a **cw2** experiment on a SLURM cluster, you __must__ include a document in your YAML configuration file with the `name` key set to `"SLURM"`. During local execution this document is ignored. 290 | 291 | ```yaml 292 | --- 293 | # Slurm config 294 | name: "SLURM" # MUST BE "SLURM" 295 | ``` 296 | 297 | The following fields are __required__ to ensure correct execution of your job on the slurm cluster. Please refer to the [sbatch docu](https://slurm.schedmd.com/sbatch.html) for further explanations. 298 | ```yaml 299 | # ... continued 300 | # Required 301 | job-name: "yourjob" # this will be the experiment's name in slurm 302 | ``` 303 | 304 | The following fields are __required__ to configure your hardware requirements. These are _highly_ cluster specific. Please refer to the [sbatch docu](https://slurm.schedmd.com/sbatch.html) for further explanations. 305 | ```yaml 306 | # ... continued 307 | # Required - Cluster Specific 308 | partition: "dev" 309 | num_parallel_jobs: 120 310 | ntasks: 1 311 | cpus-per-task: 1 312 | time: 30 313 | ``` 314 | 315 | All the following sections are optional arguments. 316 | If they are not present in this slurm configuration, a default behaviour is used. 317 | ```yaml 318 | # ... continued 319 | # Optional 320 | account: "" # Account name to which Cluster Time will be booked. Cluster specific. 321 | mem-per-cpu: 1000 # Optional - Cluster specific 322 | 323 | experiment_copy_dst: "/path/to/code_copy/dst" # optional. dir TO which the current code will be copied. Useful to prevent unintentional changes while the job is in queue. If not set, no copy will be made. 324 | experiment_copy_auto_dst: /path/to/code_copy/dst" # optional. will autoincrement and create a dir TO which the current code will be copied. Useful to prevent unintentional changes while the job is in queue. Overrules experiment_copy_dst. If not set, no copy will be made. 325 | experiment_copy_src: "/path/to/code_copy/src" # optional. dir FROM which the current code will be copied. Useful to prevent unintentional changes while the job is in queue. Defaults to directory of __MAIN__ file. 326 | slurm_log: "/path/to/slurmlog/outputdir" # optional. dir in which slurm output and error logs will be saved. Defaults to EXPERIMENTCONFIG.path 327 | venv: "/path/to/virtual_environment" # optional. path to your virtual environment activate-file 328 | ``` 329 | 330 | If you have further need to configure slurm, you can use all the options offered by the [sbatch docu](https://slurm.schedmd.com/sbatch.html). Please use the following style of defining _keyword_ -> _value_ pairs: 331 | 332 | ```yaml 333 | # ... continued 334 | # Optional SBATCH Arguments 335 | sbatch_args: # Dictionary of SBATCH keywords and arguments 336 | kw_1: "arg1" # Will construct the line: #SBATCH --kw_1 arg1 337 | kw_2: "arg2" # Will construct the line: #SBATCH --kw_2 arg2 338 | ``` 339 | 340 | Sometimes it is necessary to do execute some additional instructions in the linux shell before starting the python process using slurm. 
You can define arbitrarily many additional shell instructions using the following format: 341 | ```yaml 342 | # ... continued 343 | # Optional shell instructions 344 | sh_lines: # List of strings 345 | - "line 1" 346 | - "line 2" 347 | ``` 348 | ## 3.3. Example Templates 349 | This documentation gets updated less frequently than potential feature introductions. 350 | When in doubt, refer to the provided templates: 351 | - [AbstractExperiment Configuration](../templates/abstract_config.yml) 352 | - [AbstractIterativeExperiment Configuration](../templates/iterative_config.yml) 353 | 354 | ## 3.4. Important Keys 355 | These are important configuration keys you have access to in the various methods of your `AbstractExperiment` Implementation. 356 | - `cw_config['params']` is a dictionary containing everything under the `params` keyword, including the merged values from `DEFAULT` and `list`/`grid` keywords. 357 | - `cw_config['_rep_log_path']` is a `str` entry pointing to the _threadsafe_ directory of this repetition. Here all **cw2** logging artifactsof this repitition will be written. If you have any results / model checkpoints you can save them here under the guarantee that no other **cw2** run will interfere. 358 | 359 | [Back to Overview](./) 360 | -------------------------------------------------------------------------------- /doc/04_slurm.md: -------------------------------------------------------------------------------- 1 | # 4. SLURM Introduction 2 | under construction 3 | 4 | 5 | [Back to Overview](./) -------------------------------------------------------------------------------- /doc/05_files.md: -------------------------------------------------------------------------------- 1 | # 5. The CW2 File System 2 | under construction 3 | 4 | 5 | [Back to Overview](./) -------------------------------------------------------------------------------- /doc/06_code_copy.md: -------------------------------------------------------------------------------- 1 | # 6. Code Copy Feature 2 | 3 | - [6. Code Copy Feature](#6-code-copy-feature) 4 | - [6.1. Enabling Code Copy](#61-enabling-code-copy) 5 | - [6.2. Disabling Code Copy](#62-disabling-code-copy) 6 | - [6.3 CLI Options](#63-cli-options) 7 | - [6.4 Known Challenges](#64-known-challenges) 8 | 9 | 10 | When submitting a job to a SLURM cluster, it is likely to wait in queue until requested compute resources become available. During this queuing time, the code can still be changed, as no Python process has been started yet. 11 | 12 | Any changes the user makes to their code in this queueing time, will be in effect once the job starts. For example: 13 | 14 | - User starts with default codebase A. They submit their first slurm job, waiting for results. 15 | - While waiting, the user implements a new feature, resulting in a new codebase A*. 16 | - Wanting to compare A* to the future results of A, the user submits a second job. 17 | - After a while, the results of both jobs are ready. The results of the first job and second job are exactly identical. The user is confused. 18 | 19 | In the above example, both jobs ran with codebase A*, leading to identical results. 20 | 21 | To avoid this problem, we offer the **Code Copy Feature**. 22 | 23 | ## 6.1. Enabling Code Copy 24 | To enable code copy, add the `src` and **one (1)** `dst` argument to your `SLURM` config section: 25 | 26 | ```yaml 27 | # Required for Code-Copy-Feature 28 | experiment_copy_src: "/path/to/code_copy/src" # Code Copy Source directory. 
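# Note: doc/03_config.md lists experiment_copy_src as optional and states that it
# defaults to the directory of the __main__ file when it is not set.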
29 | 30 | # Choose one for Code-Copy-Feature 31 | experiment_copy_dst: "/path/to/code_copy/dst" # Code Copy Destination directory. Will be overwritten if called multiple times. 32 | experiment_copy_auto_dst: "/path/to/code_copy/dst" # Code Copy Destination directory autoincrement. Will create a new subdirectory each time. 33 | ``` 34 | 35 | If you only want to "document" the code, so that you might reproduce it later, you can use the `--zip` CLI option. This will create a Zip Archive of your code in the code-copy `dst`. 36 | 37 | ## 6.2. Disabling Code Copy 38 | To permanently disable code copy, remove the `src` and `dst` arguments from your `SLURM` config section. 39 | To temporarily disable code copy, add `--nocodecopy` to your `python main.py config.yaml` call. 40 | 41 | ## 6.3 CLI Options 42 | For a full and updated list, please refer to the [CLI Args Docu](11_cli_args.md). 43 | | Flag | Name | Effect | 44 | | ---- | --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | 45 | | | --zip | Creates a ZIP archive for documentation purposes of $CWD or, if set, "experiment_copy_src". | 46 | | | --skipsizecheck | Disables a safety size check when Zipping or Code-Copying. The safety prevents unecessarily copying / archiving big files such as training data. | 47 | | | --multicopy | Creates a Code-Copy for each Job. If you are modifying a hardcoded file in your codestructure during runtime, this feature might help ensure multiple runs do not interfere with each other. | 48 | | | --nocodecopy | Do not use the Code-Copy feature, even if the config arguments are specified. | 49 | 50 | ## 6.4 Known Challenges 51 | 1. Code Copy can quickly lead to a storage problems. To avoid this, we have a safety check disabling code-copy if more than 200MB are targeted. This can be disabled via `--skipsizecheck`. 52 | **Attention!!** 53 | If your `src` contains training data, it will also be copied each time. 54 | If your `dst` is inside of `src`, future copies will contain the old ones. This can quickly lead to a file size explosion. 55 | 56 | 2. To ensure that the copied code is executed, `cw2` will modify the `$PYTHONPATH` to point at the `dst` directory. While in my experience this should be stable, it could lead to issues if you are also modifying the `$PYTHONPATH` somewhere. 57 | 58 | As with all more advanced features, please double check upon first execution, if your code is still executed as expected. 59 | 60 | 61 | [Back to Overview](./) -------------------------------------------------------------------------------- /doc/07_logging.md: -------------------------------------------------------------------------------- 1 | # 7. Logging Results 2 | 3 | - [7. Logging Results](#7-logging-results) 4 | - [7.1. Console Logger](#71-console-logger) 5 | - [7.2. Logger Interface](#72-logger-interface) 6 | - [7.3. Advanced Loggers](#73-advanced-loggers) 7 | - [7.3.1. Pandas](#731-pandas) 8 | - [7.3.2. WandB](#732-wandb) 9 | 10 | **cw2** comes with a a variety of logging capabilities. This document will explain how to use the basic "Console" logging to document `print()`-like statements. 11 | 12 | ## 7.1. Console Logger 13 | When you create a `cw2.ClusterWork` instance in your _main_, a custom [python logging](https://docs.python.org/3/howto/logging.html) object is created. 
You can use this object to "print" statements to the console and they will be automatically saved into a logfile on disk in your output folder (TODO: FILESYTEM). Two files will be written: 14 | 15 | - `out.log` contains every message you passed to the logger 16 | - `err.log` contains only error messages 17 | 18 | You can access it from anywhere within a **cw2** program by: 19 | 20 | ```python 21 | from cw2.cw_data import cw_logging 22 | 23 | # retrieve logger 24 | l = cw_logging.get_logger() 25 | 26 | # Print Generic Message() 27 | l.info("This will be written to out.log") 28 | 29 | # Print Error Message 30 | l.error("This will be written to err.log AND out.log") 31 | ``` 32 | 33 | You do not need to initialize or close the logger object. It is handled automatically by **cw2**. 34 | 35 | ## 7.2. Logger Interface 36 | If you want to implement your own custom logger, you have to implement the corresponding interface [`AbstractLogger`](../cw2/cw_data/cw_logging.py) 37 | 38 | ```Python 39 | from cw2.cw_data import cw_logging 40 | 41 | class MyLogger(cw_logging.AbstractLogger): 42 | # ... 43 | 44 | def initialize(self, config: attrdict.AttrDict, rep: int, rep_log_path: str): 45 | # Initialize / Reset the logger for a new repetition 46 | self.log_path = rep_log_path + 'my_file.txt' 47 | self.data_list = [] 48 | 49 | def process(self, data) -> None: 50 | # Processes incoming data. 51 | # Need to do your own check if data is in the format you expect. 52 | print(data) 53 | self.data_list.append(data) 54 | 55 | def finalize(self) -> None: 56 | # Finalize the processing, e.g. write the internal data to disk and close all writers 57 | write_to_disk(self.data, self.log_path) 58 | 59 | def load(self): 60 | # Implement this function to load potential results 61 | self.data = read_from_disk(self.log_path) 62 | return self.data 63 | ``` 64 | 65 | The execution order is very similar to the order of an [`AbstractIterativeExperiment`](../cw2/experiment.py): 66 | 67 | ```Python 68 | log = AbstractLogger() # Initialize only GLOBAL values & CONSTANTS 69 | for r in repetitions: 70 | log.initialize(...) # Initialize / Reset the logger for each repetition. 71 | 72 | for i in iterations: 73 | result = experiment.iterate(...) # Obtain some data from an experiment 74 | log.process(result) # Log the result 75 | 76 | log.finalize() # Finalize / Clean the logger after each repetition 77 | ``` 78 | Each logger is responsible themselves to check results and how handle them. 79 | 80 | 81 | ## 7.3. Advanced Loggers 82 | **cw2** provides advanced logging functionality in form of a [Pandas Dataframe](https://pandas.pydata.org/) Logger for Excel-like table structures, and a [Weights & Biases (WandB)](https://wandb.ai/site) Logger for advanced metrics. 83 | ### 7.3.1. Pandas 84 | ### 7.3.2. WandB 85 | This description is intended as a first primer, and is not tested by me. 86 | 87 | To instantiate the WandB logger, you need to add it to the LoggerArray. 88 | 89 | ```Python 90 | if __name__ == "__main__": 91 | cw = ClusterWork(YourExp) 92 | 93 | cw.add_logger(WandBLogger()) 94 | cw.run() 95 | ``` 96 | 97 | Your `config.yml` find needs to be configured for wandb: 98 | Please refer to the official WandB documentation and the WandBLogger code to learn, what options you have and their effect. 99 | 100 | ```yaml 101 | --- 102 | name: some_exp 103 | repetitions: 5 104 | params: 105 | ... 
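# The `...` above stands for your usual experiment parameters; the `wandb`
# block below is the part read by the WandBLogger.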
106 | 107 | wandb: 108 | project: project_name 109 | group: group_name 110 | ``` 111 | 112 | Logging data with the WandBLogger is the same as every other logger: 113 | 114 | For `AbstractIterativeExperiment` implementations, the complete result dictionary returned by your `iterate()` function will be logged, unless you used the `ignore_keys` parameters during Logger creation: 115 | 116 | ```Python 117 | # logs everything 118 | wandb_l = WandBLogger() 119 | 120 | # logs everything except for the key secret 121 | wandb_l = WandBLogger(ignore_keys=['secret']) 122 | ``` 123 | 124 | When using an `AbstractExperiment` implementation, you have to log results manually: 125 | 126 | ```Python 127 | def run(self, config, repetition, logger): 128 | do_something() 129 | results = { 130 | # fill dictionary 131 | } 132 | logger.process(results) 133 | ``` 134 | 135 | Optional config parameters of the wandb logger: 136 | ```yaml 137 | wandb: 138 | optional_config: value_of_this_config 139 | ``` 140 | - **log_model**: bool, indicates whether the model shall be logged by the wandb or not. 141 | When it is false or not given, nothing happens. 142 | When it is true, the wandb logger will assume you have saved some meaning model files (such as NN weights) under `rep_xx/log/model`. 143 | In the end of each repetition, the logger will upload all the files saved there as an Artifact. 144 | The wandb logger does not care about the content and types of the files in such directory, or how did you save model in such directory. 145 | If such directory does not exist, or it contains no file, then wandb logger will log a warning but will not raise any error to break your experiment. 146 | In your own experiment class, you can get this directory in the initialize function and save model: 147 | ```python 148 | class MyCoolExp(experiment.AbstractIterativeExperiment): 149 | def initialize(self, cw_config: dict, 150 | rep: int, logger: cw_logging.LoggerArray) -> None: 151 | self.net = CoolNet() 152 | 153 | # Get the determined directory to save the model 154 | self.save_model_dir = cw_config.save_model_dir 155 | 156 | # You need to make a new dir of this given save model dir too! 157 | # os.mkdir(...) 158 | 159 | # You may save your model for every M epochs 160 | self.save_model_interval = 100 161 | 162 | def save_state(self, cw_config: dict, rep: int, n: int) -> None: 163 | if self.save_model_dir and ((n + 1) % self.save_model_interval == 0 164 | or (n + 1) == cw_config.iterations): 165 | self.net.save_weights(log_dir=self.save_model_dir, epoch=n + 1) 166 | ``` 167 | 168 | - **model_name**: string, name of the saved model. 169 | It is only useful when **log_model** is set. 170 | If the **model_name** is not set, the saved model will use "model" as its default name. 171 | 172 | 173 | - **log_interval**: int value. If it is given, it indicates that you want to log result in a given interval. 174 | This helps in the experiment which contains too many iterations (epochs), so that you do not want to log stuff for every iteration. 175 | 176 | [Back to Overview](./) -------------------------------------------------------------------------------- /doc/08_loading.md: -------------------------------------------------------------------------------- 1 | # 8. Loading Results 2 | We provide a simple function to access the results from your runs. 
An example can be found in `polynom_tutorial\polynom_load.py`: 3 | 4 | ```Python 5 | from cw2 import cluster_work, cw_logging 6 | 7 | cw = cluster_work.ClusterWork(None) 8 | 9 | # Add all the loggers whose results you want to load. 10 | cw.add_logger(cw_logging.PandasRepSaver()) 11 | # ... 12 | 13 | 14 | # res is a pandas.DataFrame 15 | res = cw.load() 16 | ``` 17 | 18 | The resulting object is a `pandas.DataFrame` with each repetition as a row, and each configuration parameter and logger result as a column. 19 | You can use all the available `pandas` methods to filter and do your own analysis of the results. 20 | 21 | Additionally we offer our own processing functions with an extension of the `pandas` API: `df.cw2` 22 | For example, to select a single repetition in the result dataframe `res` from the example above, use `df.cw2.repetition()`: 23 | 24 | ```Python 25 | # ... 26 | res = cw.load() 27 | repetition_0 = res.cw2.repetition(0) 28 | ``` 29 | 30 | To select all runs with a specific hyper-parameter setting, use `df.cw2.filter()`: 31 | ```Python 32 | # ... 33 | res = cw.load() 34 | 35 | # parameter dict - same structure as CONFIG.params 36 | interesting_params = { 37 | 'param1': 1 38 | } 39 | 40 | interesting_results = res.cw2.filter( 41 | interesting_params 42 | ) 43 | ``` 44 | 45 | 46 | 47 | [Back to Overview](./) -------------------------------------------------------------------------------- /doc/09_advanced.md: -------------------------------------------------------------------------------- 1 | # 9. Advanced Features & Parallelization 2 | - [9. Advanced Features & Parallelization](#9-advanced-features--parallelization) 3 | - [9.1. Error Handling](#91-error-handling) 4 | - [9.2. Parallelization](#92-parallelization) 5 | - [9.2.1 Parallelization Pitfalls](#921-parallelization-pitfalls) 6 | - [9.3. Custom Scheduler](#93-custom-scheduler) 7 | - [9.4. Linking External YAML Files](#94-linking-external-yaml-files) 8 | 9 | ## 9.1. Error Handling 10 | Should any kind of exception be raised during an Experiment execution (`initialize()` or `run()`), **cw2** will abort this experiment run, log the error including stacktrace to a log file in the repetition directory and continue with the next task. 11 | 12 | If you want to end an (iterative) experiment early, you can raise the `cw_error.ExperimentSurrender` exception to gracefully abort the experiment execution. 13 | 14 | The `finalize()` function of you experiment has access to a raised `cw_error.ExperimentSurrender` exception and can access its payload. You can use this to "transmit" data to your finalziation procedure and react accordingly. 15 | 16 | ## 9.2. Parallelization 17 | First, an attempt to establish a terminology: 18 | - Experiment: A collection of hyperparameter runs, defined in the `config.yml` via the `name` key. 19 | - Hyperparameter run: A combination of hyperparameters, as defined by `params` and combination keywords such as `grid`. Can be repeated multiple times 20 | - Repetition: A singular repetition of a hyperparameter run. 21 | - Job (cw2): A computing job, resulting in its own, independend (computing) process. Per default a 1:1 mapping with repetitions. SLURM calls this "unit" of computation task (`cpu-per-task` keyword.) 22 | 23 | The following config results in `2*2 (grid) * 5 (repetitions)` jobs. 24 | ```yaml 25 | --- 26 | name: exp1 27 | repetitions: 5 28 | grid: 29 | a: [1, 2] 30 | b: [3, 4] 31 | ``` 32 | 33 | Often, a cluster has restrictions on how many SLURM tasks / cw2 jobs can be submitted by a user at once. 
For this purpose, the 1:1 mapping of assign each repetition its own job can be changed with the `reps_per_job` config keyword. Multiple repetitions are bundled into one process, which are computed sequentially. 34 | 35 | This can then be futher parallelized by using the `reps_in_parallel` config keyword. This starts a multi-threading parallelization within a job process. 36 | 37 | ### 9.2.1 Parallelization Pitfalls 38 | Currently, we use joblib per default for the multi-threading parallelization. This can cause issues with GPU intensive tasks like Deep Learning or special third party libraries, e.g. Mujoco. 39 | 40 | 41 | ## 9.3. Custom Scheduler 42 | In **cw2** a scheduler is an object responsible for executing a list of jobs (see [Slurm Introduction](04_slurm.md)). In some cases it might be necessary to built your own, custom scheduler. E.g., when the use of parallelization inside of a job is required, and your experiment is not compatible with the default joblib multiprocessing approach (for example through the use of GPU acceleration). 43 | 44 | **cw2** does not offer such advanced schedulers on its own, as they might be highly dependend on your use case and applied libraries. 45 | 46 | To build your custom scheduler, you need to at least implement the [`AbstractScheduler`](../cw2/scheduler.py) interface. 47 | 48 | You might want to use [`LocalScheduler`](../cw2/scheduler.py) as a reference implementation. 49 | 50 | Remember: The Scheduler sees the `Job` objects, which itself might bundle multiple cw2 tasks / repetitions (NOT SLURM tasks). 51 | 52 | This is a very abstract, non-working example how this might look like: 53 | 54 | ```python 55 | import some_gpu_acc 56 | from some_gpu_acc import some_multiproc_pool 57 | 58 | from cw2.scheduler import LocalScheduler 59 | 60 | class CustomScheduler(AbstractScheduler): 61 | def run(self, overwrite: bool = False): 62 | for job in self.joblist: 63 | for t in job.tasks: 64 | some_multiproc_pool(N_CORES).parallelize( 65 | job.run_task(t, overwrite) 66 | ) 67 | 68 | ``` 69 | 70 | To use your new custom scheduler, you have to give it to the [`ClusterWorks`](../cw2/cluster_work.py) instance in your `__main__` function: 71 | 72 | ```python 73 | from cw2 import cluster_work 74 | 75 | if __name__ == "__main__": 76 | # Give the MyExperiment Class, not MyExperiment() Object!! 77 | cw = cluster_work.ClusterWork(MyExperiment) 78 | 79 | # RUN WITH CUSTOM SCHEDULER!!! 80 | cw.run(s = CustomScheduler()) 81 | ``` 82 | 83 | ## 9.4. Linking External YAML Files 84 | It might be helpful to you, to organize your experiment configs into different yaml files which refer to each other. 85 | Similiar to the merging behaviour with a `DEFAULT` configuration, you can now define a "parent" configuration with two new keywords: 86 | 87 | ```yaml 88 | --- 89 | name: "child" 90 | import_path: "some_path" # optional. can be an absolute path, or relative to this yaml file. 91 | # if only import_exp is present, defaults to THIS file. 92 | import_exp: "parent_exp" # optional. basically -e option which external experiment should be the basis. 93 | # The external experiment will be merged with its own default before importing. 94 | # Case Sensitive. Defaults to "DEFAULT". 95 | ``` 96 | 97 | Imported yaml files can be children with imports themselves. A child will always overwrite its parent. Relative paths will always be relative to the file they are written in, NOT to the root or main.py 98 | 99 | Cyclic Linking should be detected and result in an error message. 
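A minimal sketch of two linked files (file names and all values here are illustrative, not taken from the repository):

```yaml
# base.yml -- the "parent" file
---
name: "parent_exp"
path: "/tmp/results"
repetitions: 3
params:
  learning_rate: 0.1
  batch_size: 32
```

```yaml
# child.yml -- imports the parent; on conflicting keys the child wins
---
name: "child"
import_path: "base.yml"   # relative to child.yml, not to main.py
import_exp: "parent_exp"  # case sensitive; defaults to "DEFAULT" if omitted
params:
  learning_rate: 0.01     # overrides the parent's 0.1; batch_size: 32 is inherited
```

Running `python main.py child.yml` would then execute the `child` experiment with the parent's settings merged underneath.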
100 | 101 | The resolution order is: 102 | 1. A named experiment `child` gets merged with its internal `DEFAULT` configuration. Shared keys are "overwritten" by the more specific `child`. 103 | 2. Should after the merge an `import_` key be present in the configuration, the specified `parent_exp` gets loaded. 104 | 3. The `parent_exp` is merged with its internal "Parent"-`DEFAULT`. 105 | 4. Repeat Steps 2-4 for each parent. 106 | 107 | 108 | 109 | [Back to Overview](./) -------------------------------------------------------------------------------- /doc/10_advanced_gpu.md: -------------------------------------------------------------------------------- 1 | # 10. Advanced GPU Scheduling 2 | 3 | Here we discuss advanced GPU Scheduling, i.e., advanced methods to distribute repetitions across GPUs. 4 | There are two main use cases for this: 5 | 6 | 1.) **Putting Multiple Repetition on GPU**: Often, a single repetition is not enough to fully saturate the GPU (especially for the larger 7 | Teslar Models used in HPC clusters). Therefore, it can be beneficial to run multiple repetitions in parallel on a single GPU. 8 | 9 | 2.) **Requesting Single GPUs not possible**: Some HPC Clusters are configured in a way that requesting single GPUs via SLURM is not possible. 10 | In this case, you'll always get multiple GPUs at once, and it's your responsibility to distribute the load across them. 11 | 12 | **Caveat**: Please always have an eye on your jobs and make sure they behave as expected with regard to GPU utilization and runtime, do not fully rely on this! 13 | The underlying multiprocessing is tricky business, behaviour is not always consistent across different machines and python versions. 14 | There can be weird side effects. 15 | 16 | 17 | ## 10.1. The ''gpus_per_rep'' Config Keyword 18 | 19 | The main new functionality to control GPU usage is the `gpus_per_rep` config keyword. Although it's not an actual SLURM key-word, it needs to be specified in the SLURM block of your config. 20 | It can be a float smaller than 1 or an integer lager or equal to 1. It does what the name suggests, it specifies how many GPUs are requested per repetition. 21 | For it to properly work, you need to set the `reps_per_job` and `reps_in_parallel` keys accordingly. 22 | 23 | **Caveat**: I have no idea what happens if different values for `reps_per_job` and `reps_in_parallel` are used throught your YAML. Just don't do it (or test it). 24 | 25 | ### 10.1.1. Example 1: Using only half a GPU per repetition 26 | 27 | Assume your Jobs are small and you want to run 2 on each single GPU. 28 | First, set `gpus_per_rep` to 0.5: 29 | 30 | ```yaml 31 | --- 32 | # Slurm config 33 | name: "SLURM" 34 | partition: "gpu" 35 | job-name: "half_gpu_job" 36 | time: 20 37 | ntasks: 1 38 | cpus-per-task: 8 # 4 CPUs per rep! 39 | gpus_per_rep: 0.5 40 | sbatch_args: 41 | gres: "gpu:1 42 | ``` 43 | 44 | To have both jobs run on the same GPU in parallel, set `reps_per_job` to 2 and `reps_in_parallel` to 2 (you can also 45 | set 'reps_per_job' to a multiple of 2): 46 | 47 | ```yml 48 | --- 49 | # Default 50 | name: DEFAULT 51 | reps_per_job: 2 52 | reps_in_parallel: 2 53 | ``` 54 | Specify your experiment as usual, the total number of repetitions should be a multiple of 2. 55 | 56 | **Caveat**: There is nothing in CW2 to ensure GPU memory and compute is distributed evenly and not exceeded. 57 | It is your responsibility to take care of that! Check your code if it actually profits from this! 
(Don't expect a speed-up of 2x, 58 | more something like > 1.5x) 59 | 60 | ### 10.1.2. Example 2: Using single GPUs when you can only request multiple GPUs 61 | 62 | Assume you are on a HPC-System where the minimum number of GPUs you can request is 4 (e.g. HoreKa). 63 | 64 | First, set `gpus_per_rep` to 1: 65 | 66 | ```yaml 67 | --- 68 | # Slurm config 69 | name: "SLURM" 70 | partition: "accelerated" 71 | job-name: "single_gpu_job" 72 | time: 20 73 | ntasks: 1 74 | cpus-per-task: 16 # 4 CPUs per rep! 75 | gpus_per_rep: 1 76 | sbatch_args: 77 | gres: "gpu:4 # Note how we request 4 GPUs here! 78 | ``` 79 | 80 | To have both jobs run on the same GPU in parallel, set `reps_per_job` to 4 and `reps_in_parallel` to 4 (you can also 81 | set 'reps_per_job' to a multiple of 4): 82 | 83 | ```yml 84 | --- 85 | # Default 86 | name: DEFAULT 87 | reps_per_job: 4 88 | reps_in_parallel: 4 89 | ``` 90 | Specify your experiment as usual, the total number of repetitions should be a multiple of 4. 91 | 92 | ## 10.2 Cluster Specific Schedulers 93 | I (Philipp B.) had issues with using this naively on both the Kluster and on HoreKa, but I am unsure if it's a general problem or just a problem of my code 94 | (Todo: Somebody check with their stuff and tell me). 95 | On both systems the jobs would run super slow, as the processes where stealing each others CPU resources. 96 | I had to use different fixes for both systems, and write specific schedulers for them. 97 | You can use them via the `scheduler` key in the `slurm` block of your config, possible values are currently: 98 | 99 | - "kluster": Explicitly limits the number of threads used (if you use something else than PyTorch, you probably need to have another look at that) 100 | - "horeka": Explicitly handles the cpu affinity of individual repetitions. 101 | 102 | ## 10.3 Use full CPU's computation power in a GPU node. 103 | I (Bruce) had some low CPU computation speed issues when do online RL in Horeka GPU node, where I have to use both CPU (for mujoco) and GPU (for agent update). The reason is that for each experiment's generated gym environment, it can use all the cpus of this node and thus often blocks the access of the other environments or other repititions (when multple repititions are running in parallel). To solve it, I added the assigned CPU cores into the cw_config and you can manually assign theses cores to the environments yourself, e.g. one environment has one distinct core. Something like: 104 | ```python 105 | env_pids = [envs.processes[i].pid for i in range(num_env)] 106 | cores_per_env = len(cw_config["cpu_cores"]) // num_env 107 | cpu_cores_list = list(cw_config["cpu_cores"]) 108 | for i, pid in enumerate(env_pids): 109 | cores_env = cpu_cores_list[i * cores_per_env: (i + 1) * cores_per_env] 110 | util.assign_process_to_cpu(pid, set(cores_env)) 111 | ``` 112 | -------------------------------------------------------------------------------- /doc/11_cli_args.md: -------------------------------------------------------------------------------- 1 | # 11. CLI args 2 | The following args are currently supported by CW2: 3 | | Flag | Name | Effect | 4 | | -------------- | --------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | 5 | | -s | --slurm | Run using SLURM Workload Manager. | 6 | | -o | --overwrite | Overwrite existing results. | 7 | | -e name1 [...] 
| --experiments | Allows to specify which experiments should be run. Corresponds to the `name` field of the configuration YAML. | 8 | | | --zip | Creates a ZIP archive for documentation purposes of $CWD or, if set, "experiment_copy_src". | 9 | | | --skipsizecheck | Disables a safety size check when Zipping or Code-Copying. The safety prevents unecessarily copying / archiving big files such as training data. | 10 | | | --multicopy | Creates a Code-Copy for each Job. If you are modifying a hardcoded file in your codestructure during runtime, this feature might help ensure multiple runs do not interfere with each other. | 11 | | | --nocodecopy | Do not use the Code-Copy feature, even if the config arguments are specified. | 12 | | | --noconsolelog | Disables writing logs with the internal PythonLogger module. Slurm will still create its slurm_logs, so no information is lost. Helps if too many repetitions try to open too many open files and causing errors. | 13 | 14 | 15 | [Back to Overview](./) -------------------------------------------------------------------------------- /doc/README.md: -------------------------------------------------------------------------------- 1 | # CW2 User Documentation 2 | - [1. Quick Start Guide](01_quickstart.md) 3 | --- 4 | ## Basic Features 5 | - [2. Experiment Class](02_experiment.md) 6 | - [3. Configuration File](03_config.md) 7 | - [4. Introduction Slurm](04_slurm.md) 8 | - [5. File System](05_files.md) 9 | --- 10 | ## Advanced Features 11 | - [6. Code Copy](06_code_copy.md) 12 | - [7. Logging Results](07_logging.md) 13 | - [8. Loading Results](08_loading.md) 14 | - [9. Advanced Features & Parallelization](09_advanced.md) 15 | - [10. Advanced GPU Scheduling](10_advanced_gpu.md) 16 | --- 17 | - [10. CLI options at a Glance](11_cli_args.md) 18 | --- 19 | Some sections are still under construction. -------------------------------------------------------------------------------- /polynom_tutorial/external_conf.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: DEFAULT 3 | repetitions: 3000 4 | external_key: "ahahahaha" 5 | 6 | 7 | --- 8 | name: ext_exp 9 | import_path: "/home/max/code/cw2/polynom_tutorial/polynom_config.yml" 10 | import_exp: "polynomial" 11 | grid: 12 | a: [1, 3] 13 | b: [4, 6] 14 | x_1: [7] -------------------------------------------------------------------------------- /polynom_tutorial/polynom_config.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # Slurm config (optional) 3 | name: "SLURM_ret" # MUST BE "SLURM" 4 | 5 | # Required 6 | partition: "dev" 7 | job-name: "polynom" # this will be the experiment's name in slurm 8 | #path_to_template: "/home/max_li/code/cw2/templates/sbatch_template.sh" # Path to YOUR prepared sbatch script 9 | 10 | # Required - Cluster Specific 11 | num_parallel_jobs: 120 12 | ntasks: 1 13 | cpus-per-task: 1 14 | mem-per-cpu: 1000 15 | time: 30 16 | 17 | #experiment_copy_auto_dst: "/home/max/autodst" 18 | #experiment_copy_src: "." 19 | 20 | # Optional 21 | #venv: "/home/max_li/venv/bin/activate" # optional. path to your virtual environment activate-file 22 | 23 | # Optional Code Copy: Both Args are required. 24 | #experiment_copy_dst: "/home/max_li/polylog/code" # optional. dir TO which the current code will be copied. Useful to prevent unintentional changes while the job is in queue. Defaults to EXPERIMENTCONFIG.path/code 25 | #experiment_copy_src: "/home/max_li/code/cw2/polynom_tutorial" # optional. 
dir FROM which the current code will be copied. Useful to prevent unintentional changes while the job is in queue. Defaults to CWD. 26 | sh_lines: 27 | - "# haha" 28 | - "# hihi" 29 | --- 30 | 31 | # DEFAULT parameters (Optional) 32 | name: "DEFAULT" # MUST BE 'DEFAULT' 33 | reps_per_job: 4 34 | reps_in_parallel: 1 35 | 36 | # Required: Can also be set in DEFAULT 37 | path: "/tmp/polylog" # location to save results in 38 | repetitions: 2 # number of times one set of parameters is run 39 | iterations: 1000 # number of iterations per repetition 40 | 41 | # Implementation default parameters 42 | params: 43 | noise: 5 44 | stepsize: 0.05 45 | 46 | --- 47 | # Experiment 1 48 | name: "polynomial" 49 | aah: "aaah" 50 | 51 | params: 52 | x_0: 1 53 | x_1: 2 54 | x_2: 3 55 | x_3: 4 56 | 57 | --- 58 | # Experiment 2 59 | name: "grid_polynom" 60 | repetitions: 1 61 | iterations: 100 62 | 63 | #import_path: "./external_conf.yml" 64 | #import_exp: "ext_exp" 65 | 66 | params: 67 | x_0: 0 68 | x_3: 0 69 | 70 | ablative: 71 | x_0: [2] 72 | x_3: [6, 12] 73 | 74 | # A total of 12 Runs will be created 75 | grid: 76 | x_1: [3, 4] 77 | x_2: [3, 4, 5] 78 | 79 | list: 80 | x_4: [2, 3] 81 | x_5: [2, 3] 82 | -------------------------------------------------------------------------------- /polynom_tutorial/polynom_load.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | 3 | from cw2 import cluster_work 4 | from cw2.cw_data import cw_logging, cw_pd_logger 5 | 6 | if __name__ == "__main__": 7 | cw = cluster_work.ClusterWork(None) 8 | cw.add_logger(cw_pd_logger.PandasLogger()) 9 | 10 | # load() -> pd.DataFrame 11 | df = cw.load() 12 | 13 | rep0 = df.cw2.filter({"x_1": 0}) 14 | 15 | print(df.head()) 16 | 17 | print(df.cw2.flatten_pd_log().shape) 18 | 19 | for i, job in df.iterrows(): 20 | single_df = job["PandasLogger"] 21 | single_df[["sample_y", "true_y"]].plot.line() 22 | plt.savefig(job["rep_path"] + "plot.png") 23 | -------------------------------------------------------------------------------- /polynom_tutorial/polynom_main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | 4 | from cw2 import cluster_work, cw_error, experiment 5 | from cw2.cw_data import cw_logging, cw_pd_logger 6 | 7 | 8 | class Polynomial(experiment.AbstractIterativeExperiment): 9 | # ... 10 | 11 | def initialize( 12 | self, config: dict, rep: int, logger: cw_logging.AbstractLogger 13 | ) -> None: 14 | random.seed(rep) 15 | 16 | def iterate(self, config: dict, rep: int, n: int) -> dict: 17 | if rep > 0: 18 | # You can raise an Experiment Surrender Exception to gracefully end a task prematurely 19 | raise cw_error.ExperimentSurrender() 20 | 21 | if n > 10: 22 | # Should a task raise an Exception, it will be logged and the next job execution starts. 23 | y = 3 / 0 24 | 25 | params = config["params"] 26 | print(params) 27 | x_0 = params["x_0"] 28 | x_1 = params["x_1"] 29 | x_2 = params["x_2"] 30 | x_3 = params["x_3"] 31 | 32 | x = params["stepsize"] * n 33 | y = x_3 * (x**3) + x_2 * (x**2) + x_1 * x + x_0 34 | 35 | y_noise = y + (random.randint(-10, 10) / 10.0) * params["noise"] 36 | 37 | return {"true_y": y, "sample_y": y_noise} 38 | 39 | def save_state(self, config: dict, rep: int, n: int) -> None: 40 | pass 41 | 42 | def finalize(self, surrender=None, crash: bool = False): 43 | # Use cw_logging.getLogger() for logging functionality 44 | cw_logging.getLogger().info("Finished. 
Closing Down.") 45 | 46 | 47 | if __name__ == "__main__": 48 | import sys 49 | 50 | sys.argv.append("polynom_config.yml") 51 | sys.argv.append("-o") 52 | sys.argv.append("-s") 53 | sys.argv.append("--debug") 54 | 55 | cw = cluster_work.ClusterWork(Polynomial) 56 | cw.add_logger(cw_pd_logger.PandasLogger()) 57 | cw.run() 58 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # To use a consistent encoding 2 | from codecs import open 3 | from os import path 4 | 5 | from setuptools import find_packages, setup 6 | 7 | here = path.abspath(path.dirname(__file__)) 8 | 9 | # Get the long description from the README file 10 | with open(path.join(here, "README.md"), encoding="utf-8") as f: 11 | long_description = f.read() 12 | 13 | setup( 14 | name="cw2", 15 | # Versions should comply with PEP440. For a discussion on single-sourcing 16 | # the version across setup.py and the project code, see 17 | # https://packaging.python.org/en/latest/single_source_version.html 18 | version="2.5.1", 19 | description="A reengineered framework to run experiments on a computing cluster.", 20 | long_description=long_description, 21 | long_description_content_type="text/markdown", 22 | # The project's main homepage. 23 | url="https://github.com/ALRhub/cw2", 24 | # Author details 25 | author="Maximilian Li", 26 | author_email="maximilian.xiling.li@gmail.com", 27 | license="MIT", 28 | classifiers=[ 29 | "Development Status :: 5 - Production/Stable", 30 | "Intended Audience :: Science/Research", 31 | "Intended Audience :: Education", 32 | "Topic :: System :: Distributed Computing", 33 | "Topic :: Scientific/Engineering", 34 | "Topic :: Scientific/Engineering :: Information Analysis", 35 | "Topic :: Education", 36 | "Programming Language :: Python :: 3", 37 | "Programming Language :: Python :: 3.3", 38 | "Programming Language :: Python :: 3.4", 39 | "Programming Language :: Python :: 3.5", 40 | "Programming Language :: Python :: 3.6", 41 | "Programming Language :: Python :: 3.7", 42 | "Programming Language :: Python :: 3.8", 43 | "Programming Language :: Python :: 3.9", 44 | "Programming Language :: Python :: 3.10", 45 | "Environment :: Console", 46 | ], 47 | python_requires=">=3", 48 | # What does your project relate to? 49 | keywords=["scientific", "experiments", "distributed computing", "mpi", "research"], 50 | packages=find_packages(), 51 | package_data={"cw2": ["default_sbatch.sh"]}, 52 | install_requires=["PyYAML", "numpy", "pandas", "joblib"], 53 | ) 54 | -------------------------------------------------------------------------------- /templates/abstract_config.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # Slurm config (optional) 3 | name: "SLURM" # MUST BE "SLURM" 4 | 5 | # Required 6 | partition: "dev" 7 | job-name: "experiment" # this will be the experiment's name in slurm 8 | 9 | # Required - Cluster Specific 10 | num_parallel_jobs: 120 11 | ntasks: 1 12 | cpus-per-task: 1 13 | time: 30 # Runtime in Wallclock Time. 
Can be int or str in form "HH:MM:SS" 14 | 15 | mem-per-cpu: 1000 # Optional - Cluster specific 16 | 17 | # Optional 18 | path_to_template: "/path/to/sbatch_template.sh" # Path to YOUR prepared sbatch script. Uses supplied default template if not specified 19 | account: "" # Account name to which Cluster Time will be booked. Cluster dependent. 20 | slurm_log: "/path/to/slurmlog/outputdir" # optional. dir in which slurm output and error logs will be saved. Defaults to EXPERIMENTCONFIG.path/slurmlog 21 | venv: "/path/to/virtual_environment/bin/activate" # optional. path to your virtual environment activate-file 22 | 23 | # Optional Code Copy: Both Args are required. 24 | experiment_copy_dst: "/path/to/code_copy/dst" # optional. dir TO which the current code will be copied. Useful to prevent unintentional changes while the job is in queue. 25 | experiment_copy_auto_dst: "/path/to/code_copy/dst" # will autoincrement and create a dir TO which the current code will be copied. Useful to prevent unintentional changes while the job is in queue. 26 | experiment_copy_src: "/path/to/code_copy/src" # optional. dir FROM which the current code will be copied. Useful to prevent unintentional changes while the job is in queue. 27 | 28 | # Optional SBATCH Arguments 29 | sbatch_args: # Dictionary of SBATCH keywords and arguments 30 | kw_1: "arg1" # Will construct the line: #SBATCH --kw_1 arg1 31 | kw_2: "arg2" # Will construct the line: #SBATCH --kw_2 arg2 32 | 33 | # Optional shell instructions 34 | sh_lines: # List of strings 35 | - "line 1" 36 | - "line 2" 37 | 38 | --- 39 | # DEFAULT parameters (Optional) 40 | name: "DEFAULT" # MUST BE 'DEFAULT' 41 | 42 | # Implementation default parameters 43 | # Will be overwritten by named experiments. 44 | params: 45 | param_1: "default_value" 46 | 47 | --- 48 | # Experiment 1 49 | name: "experiment_name" 50 | 51 | # Required: Can also be set in DEFAULT 52 | path: "path/to/output_dir/" # location to save results in 53 | repetitions: 5 # number of times one set of parameters is run 54 | 55 | # Optional: Can also be set in DEFAULT 56 | # Only use these values if you are sure you know what you are doing. 57 | # Refer to Chapter 9 of the Docs for more info 58 | reps_per_job: 1 # number of repetitions in each job. useful for paralellization. defaults to 1. 59 | reps_in_parallel: 1 # number of repetitions in each job that are executed in parallel. defaults to 1. 60 | 61 | # Experiment Parameters: Can also be set in DEFAULT. Can be a nested dictionary. 62 | params: 63 | param_1: "exp_value_1" # overwrites Default 64 | param_2: "exp_value_2" # new experiment specific parameter 65 | 66 | # Dynamically assigned parameters. Can be EITHER 'list' or 'grid'. Can NOT be set in DEFAULT. Can be a nested dictionary. 67 | list: # alternative - 'grid:' 68 | param_3: [1, 2] 69 | param_4: [3, 4] -------------------------------------------------------------------------------- /templates/abstract_main.py: -------------------------------------------------------------------------------- 1 | from cw2 import cluster_work, cw_error, experiment 2 | from cw2.cw_data import cw_logging 3 | 4 | 5 | class MyExperiment(experiment.AbstractExperiment): 6 | # ... 7 | 8 | def initialize( 9 | self, config: dict, rep: int, logger: cw_logging.LoggerArray 10 | ) -> None: 11 | cw_logging.getLogger().info( 12 | "Ready to start repetition {}. 
Resetting everything.".format(rep) 13 | ) 14 | 15 | def run(self, config: dict, rep: int, logger: cw_logging.LoggerArray) -> None: 16 | # Do Something non-iteratively and logging the result. 17 | cw_logging.getLogger().info("Doing Something.") 18 | logger.process("Some Result") 19 | cw_logging.getLogger().warning("Something went wrong") 20 | 21 | def finalize( 22 | self, surrender: cw_error.ExperimentSurrender = None, crash: bool = False 23 | ): 24 | if surrender is not None: 25 | cw_logging.getLogger().info("Run was surrendered early.") 26 | 27 | if crash: 28 | cw_logging.getLogger().warning("Run crashed with an exception.") 29 | cw_logging.getLogger().info("Finished. Closing Down.") 30 | 31 | 32 | if __name__ == "__main__": 33 | cw = cluster_work.ClusterWork(MyExperiment) 34 | 35 | # If loggers are wanted, must be instantiated manually 36 | logger1 = cw_logging.AbstractLogger() 37 | logger2 = cw_logging.AbstractLogger() 38 | cw.add_logger(logger1) 39 | cw.add_logger(logger2) 40 | 41 | cw.run() 42 | -------------------------------------------------------------------------------- /templates/iterative_config.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # Slurm config (optional) 3 | name: "SLURM" # MUST BE "SLURM" 4 | 5 | # Required 6 | partition: "dev" 7 | job-name: "experiment" # this will be the experiment's name in slurm 8 | 9 | # Required - Cluster Specific 10 | num_parallel_jobs: 120 11 | ntasks: 1 12 | cpus-per-task: 1 13 | time: 30 # Runtime in Wallclock Time. Can be int or str in form "HH:MM:SS" 14 | 15 | mem-per-cpu: 1000 # Optional - Cluster specific 16 | 17 | # Optional 18 | path_to_template: "/path/to/sbatch_template.sh" # Path to YOUR prepared sbatch script. Uses supplied default template if not specified 19 | account: "" # Account name to which Cluster Time will be booked. Cluster dependent. 20 | slurm_log: "/path/to/slurmlog/outputdir" # optional. dir in which slurm output and error logs will be saved. Defaults to EXPERIMENTCONFIG.path/slurmlog 21 | venv: "/path/to/virtual_environment/bin/activate" # optional. path to your virtual environment activate-file 22 | 23 | # Optional Code Copy: Both Args are required. 24 | experiment_copy_dst: "/path/to/code_copy/dst" # optional. dir TO which the current code will be copied. Useful to prevent unintentional changes while the job is in queue. 25 | experiment_copy_auto_dst: "/path/to/code_copy/dst" # will autoincrement and create a dir TO which the current code will be copied. Useful to prevent unintentional changes while the job is in queue. 26 | experiment_copy_src: "/path/to/code_copy/src" # optional. dir FROM which the current code will be copied. Useful to prevent unintentional changes while the job is in queue. 27 | 28 | # Optional SBATCH Arguments 29 | sbatch_args: # Dictionary of SBATCH keywords and arguments 30 | kw_1: "arg1" # Will construct the line: #SBATCH --kw_1 arg1 31 | kw_2: "arg2" # Will construct the line: #SBATCH --kw_2 arg2 32 | 33 | # Optional shell instructions 34 | sh_lines: # List of strings 35 | - "line 1" 36 | - "line 2" 37 | 38 | --- 39 | # DEFAULT parameters (Optional) 40 | name: "DEFAULT" # MUST BE 'DEFAULT' 41 | 42 | # Implementation default parameters 43 | # Will be overwritten by named experiments. 
44 | params: 45 | param_1: "default_value" 46 | 47 | 48 | --- 49 | # Experiment 1 50 | name: "experiment_name" 51 | 52 | # Required: Can also be set in DEFAULT 53 | path: "path/to/output_dir/" # location to save results in 54 | repetitions: 5 # number of times one set of parameters is run 55 | iterations: 1000 # number of iterations per repetition 56 | 57 | # Optional: Can also be set in DEFAULT 58 | # Only use these values if you are sure you know what you are doing. 59 | # Refer to Chapter 9 of the Docs for more info 60 | reps_per_job: 1 # number of repetitions in each job. useful for parallelization. defaults to 1. 61 | reps_in_parallel: 1 # number of repetitions in each job that are executed in parallel. defaults to 1. 62 | 63 | # Experiment Parameters: Can also be set in DEFAULT. 64 | params: 65 | param_1: "exp_value_1" # overwrites Default 66 | param_2: "exp_value_2" # new experiment specific parameter 67 | 68 | # Dynamically assigned parameters. Can be EITHER 'list' or 'grid'. Can NOT be set in DEFAULT. Can be a nested dictionary. 69 | list: # alternative - 'grid:' 70 | param_3: [1, 2] 71 | param_4: [3, 4] 72 | -------------------------------------------------------------------------------- /templates/iterative_main.py: -------------------------------------------------------------------------------- 1 | from cw2 import cluster_work, cw_error, experiment 2 | from cw2.cw_data import cw_logging 3 | 4 | 5 | class MyIterativeExperiment(experiment.AbstractIterativeExperiment): 6 | # ... 7 | 8 | def initialize( 9 | self, config: dict, rep: int, logger: cw_logging.LoggerArray 10 | ) -> None: 11 | cw_logging.getLogger().info( 12 | "Ready to start repetition {}. Resetting everything.".format(rep) 13 | ) 14 | 15 | def iterate(self, config: dict, rep: int, n: int) -> dict: 16 | if n > 50: 17 | raise cw_error.ExperimentSurrender({"Result": "End execution early."}) 18 | 19 | return {"Result": "Current Iteration is {}".format(n)} 20 | 21 | def save_state(self, config: dict, rep: int, n: int) -> None: 22 | if n % 50 == 0: 23 | cw_logging.getLogger().info("I am stateless. Nothing to write to disk.") 24 | 25 | def finalize( 26 | self, surrender: cw_error.ExperimentSurrender = None, crash: bool = False 27 | ): 28 | if surrender is not None: 29 | cw_logging.getLogger().info("Run was surrendered early.") 30 | 31 | if crash: 32 | cw_logging.getLogger().warning("Run crashed with an exception.") 33 | cw_logging.getLogger().info("Finished.
Closing Down.") 34 | 35 | 36 | if __name__ == "__main__": 37 | cw = cluster_work.ClusterWork(MyIterativeExperiment) 38 | cw.add_logger(cw_logging.AbstractLogger()) 39 | cw.run() 40 | -------------------------------------------------------------------------------- /templates/sbatch_template.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -p %%partition%% 3 | # #SBATCH -A %%account%% 4 | #SBATCH -J %%job-name%% 5 | #SBATCH --array 0-%%last_job_idx%%%%%num_parallel_jobs%% 6 | 7 | # Please use the complete path details : 8 | #SBATCH -D %%experiment_execution_dir%% 9 | #SBATCH -o %%slurm_log%%/out_%A_%a.log 10 | #SBATCH -e %%slurm_log%%/err_%A_%a.log 11 | 12 | # Cluster Settings 13 | #SBATCH -n %%ntasks%% # Number of tasks 14 | #SBATCH -c %%cpus-per-task%% # Number of cores per task 15 | #SBATCH --mem-per-cpu=%%mem-per-cpu%% # Main memory in MByte per MPI task 16 | #SBATCH -t %%time%% # 1:00:00 Hours, minutes and seconds, or '#SBATCH -t 10' - only minutes 17 | 18 | %%sbatch_args%% 19 | # ------------------------------- 20 | 21 | # Activate the virtualenv / conda environment 22 | %%venv%% 23 | 24 | 25 | # Export Pythonpath 26 | %%pythonpath%% 27 | 28 | # Additional Instructions from CONFIG.yml 29 | %%sh_lines%% 30 | 31 | python3 %%python_script%% %%path_to_yaml_config%% -j $SLURM_ARRAY_TASK_ID %%cw_args%% 32 | 33 | # THIS WAS BUILT FROM THE DEFAULLT SBATCH TEMPLATE -------------------------------------------------------------------------------- /test/horeka_scheduler_test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ALRhub/cw2/7a7b8a235731e8576e1616a46a61f442cd616cd3/test/horeka_scheduler_test/__init__.py -------------------------------------------------------------------------------- /test/horeka_scheduler_test/horeka_config.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # Slurm config 3 | name: "SLURM" 4 | partition: "accelerated" 5 | job-name: "horeka_test_job" 6 | num_parallel_jobs: 120 7 | time: 2 8 | ntasks: 1 9 | cpus-per-task: 10 10 | gpus_per_rep: 1 11 | sbatch_args: 12 | gres: "gpu:4" 13 | 14 | --- 15 | # DEFAULT 16 | name: "test" 17 | repetitions: 20 18 | path: "./hs_test_log" 19 | reps_per_job: 4 20 | reps_in_parallel : 4 21 | params: 22 | dummy: 5 23 | 24 | 25 | -------------------------------------------------------------------------------- /test/horeka_scheduler_test/test_experiment.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import sys 4 | import time 5 | 6 | import numpy as np 7 | import torch 8 | 9 | from cw2.cw_data import cw_logging 10 | from cw2.experiment import AbstractExperiment, ExperimentSurrender 11 | 12 | # os.environ["CUDA_VISIBLE_DEVICES"] = "0" 13 | 14 | 15 | class TestExperiment(AbstractExperiment): 16 | def initialize( 17 | self, cw_config: dict, rep: int, logger: cw_logging.LoggerArray 18 | ) -> None: 19 | np.random.seed(rep * 13) 20 | print( 21 | "Hello, repetition ", 22 | rep, 23 | "here. 
I see ", 24 | torch.cuda.device_count(), 25 | " GPU(s)", 26 | ) 27 | if torch.cuda.is_available(): 28 | device = torch.device("cuda") 29 | print(torch.cuda.get_device_name(device)) 30 | print(torch.cuda.get_device_properties(device)) 31 | 32 | def run(self, cw_config: dict, rep: int, logger: cw_logging.LoggerArray) -> None: 33 | sleep_time = np.random.rand() * 10 34 | print("Going to sleep for {:.5f} sec".format(sleep_time)) 35 | time.sleep(sleep_time) 36 | exit_gracefully = np.random.rand() < 0.5 37 | if exit_gracefully: 38 | print("Done (Rep", rep, ")") 39 | return 40 | else: 41 | raise Exception("AAHHH I AM DYING! (Rep ", rep, ")") 42 | 43 | def finalize(self, surrender: ExperimentSurrender = None, crash: bool = False): 44 | pass 45 | 46 | 47 | if __name__ == "__main__": 48 | from cw2.cluster_work import ClusterWork 49 | 50 | sys.argv.append("horeka_config.yml") 51 | sys.argv.append("-o") 52 | # sys.argv.append("-s") 53 | 54 | cw = ClusterWork(TestExperiment) 55 | cw.run() 56 | -------------------------------------------------------------------------------- /test/test_cw_config.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from typing import Dict 3 | from unittest import main 4 | 5 | from cw2.cw_config import conf_unfolder, cw_config 6 | 7 | 8 | class TestParamsExpansion(unittest.TestCase): 9 | def setUp(self) -> None: 10 | self.conf_obj = cw_config.Config() 11 | 12 | def expand_dict(self, _d: dict) -> list: 13 | d = _d.copy() 14 | expands = conf_unfolder.expand_experiments([d], False, False) 15 | return [self.remove_non_param_keys(e) for e in expands] 16 | 17 | def create_minimal_dict(self) -> dict: 18 | return {"name": "exp", "path": "test", "_debug": False} 19 | 20 | def remove_non_param_keys(self, _d: dict) -> dict: 21 | d = _d.copy() 22 | d["path"] = d["_basic_path"] 23 | del d["_basic_path"] 24 | del d["_experiment_name"] 25 | del d["_nested_dir"] 26 | del d["log_path"] 27 | return d 28 | 29 | def test_no_expansion(self): 30 | no_params = self.create_minimal_dict() 31 | 32 | res = self.expand_dict(no_params) 33 | self.assertEqual(1, len(res)) 34 | self.assertDictEqual(no_params, res[0]) 35 | 36 | params_dict = self.create_minimal_dict() 37 | params_dict["params"] = {"a": 1, "b": [2, 3], "c": {"c_1": "a", "c_2": "b"}} 38 | 39 | res = self.expand_dict(params_dict) 40 | self.assertEqual(1, len(res)) 41 | self.assertDictEqual(params_dict, res[0]) 42 | 43 | def test_grid_exp(self): 44 | g = self.create_minimal_dict() 45 | g["grid"] = { 46 | "a": [1], 47 | "b": [2], 48 | } 49 | 50 | res = self.expand_dict(g) 51 | self.assertEqual(1, len(res)) 52 | 53 | g["grid"]["a"] = [3, 4] 54 | res = self.expand_dict(g) 55 | self.assertEqual(2, len(res)) 56 | 57 | g["grid"]["b"] = [11, 12, 13] 58 | res = self.expand_dict(g) 59 | self.assertEqual(6, len(res)) 60 | 61 | g["grid"]["c"] = {"c1": ["c1"], "c2": ["c2a", "c2b"]} 62 | res = self.expand_dict(g) 63 | self.assertEqual(12, len(res)) 64 | 65 | def test_list_exp(self): 66 | g = self.create_minimal_dict() 67 | g["list"] = { 68 | "a": [1], 69 | "b": [2], 70 | } 71 | 72 | res = self.expand_dict(g) 73 | self.assertEqual(1, len(res)) 74 | 75 | g["list"]["a"] = [3, 4] 76 | res = self.expand_dict(g) 77 | self.assertEqual(1, len(res)) 78 | 79 | g["list"]["b"] = [11, 12, 13] 80 | res = self.expand_dict(g) 81 | self.assertEqual(2, len(res)) 82 | 83 | g["list"]["c"] = {"c1": ["c1"], "c2": ["c2a, c2b"]} 84 | res = self.expand_dict(g) 85 | self.assertEqual(1, len(res)) 86 | 87 | def 
test_grid_and_list(self): 88 | g = self.create_minimal_dict() 89 | g["list"] = { 90 | "a": [1], 91 | "b": [2], 92 | } 93 | g["grid"] = { 94 | "c": [1], 95 | "d": [2], 96 | } 97 | res = self.expand_dict(g) 98 | self.assertEqual(1, len(res)) 99 | 100 | g["list"]["a"] = [3, 4] 101 | g["list"]["b"] = [11, 12, 13] 102 | res = self.expand_dict(g) 103 | self.assertEqual(2, len(res)) 104 | 105 | g["grid"]["c"] = [3, 4] 106 | res = self.expand_dict(g) 107 | self.assertEqual(4, len(res)) 108 | 109 | g["grid"]["cd"] = {"c1": ["c1"], "c2": ["c2a", "c2b"]} 110 | res = self.expand_dict(g) 111 | self.assertEqual(8, len(res)) 112 | 113 | g["list"]["cl"] = {"c1": ["c1"], "c2": ["c2a, c2b"]} 114 | res = self.expand_dict(g) 115 | self.assertEqual(4, len(res)) 116 | 117 | def test_multi_listt(self): 118 | g = self.create_minimal_dict() 119 | g["list1"] = { 120 | "a": [1], 121 | "b": [2], 122 | } 123 | g["list--2"] = { 124 | "c": [1], 125 | "d": [2], 126 | } 127 | res = self.expand_dict(g) 128 | self.assertEqual(1, len(res)) 129 | 130 | g["list1"]["a"] = [3, 4] 131 | g["list1"]["b"] = [11, 12, 13] 132 | res = self.expand_dict(g) 133 | self.assertEqual(2, len(res)) 134 | 135 | g["list--2"]["c"] = [3, 4] 136 | g["list--2"]["d"] = [3, 4] 137 | res = self.expand_dict(g) 138 | self.assertEqual(4, len(res)) 139 | 140 | g["list1"]["a"] = [11, 12, 13] 141 | g["list1"]["b"] = [11, 12, 13] 142 | g["list--2"]["c"] = [11, 12, 13] 143 | g["list--2"]["d"] = [11, 12, 13] 144 | res = self.expand_dict(g) 145 | self.assertEqual(9, len(res)) 146 | 147 | def test_ablation(self): 148 | g = self.create_minimal_dict() 149 | g["list1"] = { 150 | "a": [1], 151 | "b": [2], 152 | } 153 | g["ablative"] = { 154 | "c": [3], 155 | } 156 | res = self.expand_dict(g) 157 | self.assertEqual(1, len(res)) 158 | 159 | g["ablative"] = { 160 | "c": [3, 4], 161 | } 162 | res = self.expand_dict(g) 163 | self.assertEqual(2, len(res)) 164 | 165 | g["ablative"] = {"c": [3], "d": [4]} 166 | res = self.expand_dict(g) 167 | self.assertEqual(2, len(res)) 168 | 169 | g["ablative"] = {"c": [3], "d": [4, 5]} 170 | g["list1"] = { 171 | "a": [1, 2], 172 | "b": [2, 3], 173 | } 174 | res = self.expand_dict(g) 175 | self.assertEqual(6, len(res)) 176 | 177 | 178 | if __name__ == "__main__": 179 | unittest.main() 180 | --------------------------------------------------------------------------------