├── .github
│   └── workflows
│       └── python-publish.yml
├── .gitignore
├── LICENSE
├── README.md
├── cw2
│   ├── __init__.py
│   ├── alternative_schedulers.py
│   ├── cli_parser.py
│   ├── cluster_work.py
│   ├── cw_config
│   │   ├── __init__.py
│   │   ├── conf_io.py
│   │   ├── conf_path.py
│   │   ├── conf_resolver.py
│   │   ├── conf_unfolder.py
│   │   ├── cw_conf_keys.py
│   │   └── cw_config.py
│   ├── cw_data
│   │   ├── __init__.py
│   │   ├── cw_loading.py
│   │   ├── cw_logging.py
│   │   ├── cw_pd_logger.py
│   │   └── cw_wandb_logger.py
│   ├── cw_error.py
│   ├── cw_slurm
│   │   ├── __init__.py
│   │   ├── cw_slurm.py
│   │   └── cw_slurm_keys.py
│   ├── default_sbatch.sh
│   ├── experiment.py
│   ├── job.py
│   ├── scheduler.py
│   └── util.py
├── doc
│   ├── 01_quickstart.md
│   ├── 02_experiment.md
│   ├── 03_config.md
│   ├── 04_slurm.md
│   ├── 05_files.md
│   ├── 06_code_copy.md
│   ├── 07_logging.md
│   ├── 08_loading.md
│   ├── 09_advanced.md
│   ├── 10_advanced_gpu.md
│   ├── 11_cli_args.md
│   └── README.md
├── polynom_tutorial
│   ├── external_conf.yml
│   ├── polynom_config.yml
│   ├── polynom_load.py
│   └── polynom_main.py
├── pyproject.toml
├── setup.py
├── templates
│   ├── abstract_config.yml
│   ├── abstract_main.py
│   ├── iterative_config.yml
│   ├── iterative_main.py
│   └── sbatch_template.sh
└── test
    ├── horeka_scheduler_test
    │   ├── __init__.py
    │   ├── horeka_config.yml
    │   └── test_experiment.py
    └── test_cw_config.py

/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
1 | # This workflow will upload a Python Package using Twine when a release is created
2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
3 |
4 | # This workflow uses actions that are not certified by GitHub.
5 | # They are provided by a third-party and are governed by
6 | # separate terms of service, privacy policy, and support
7 | # documentation.
8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | deploy: 20 | 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - uses: actions/checkout@v3 25 | - name: Set up Python 26 | uses: actions/setup-python@v3 27 | with: 28 | python-version: '3.x' 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install build 33 | - name: Build package 34 | run: python -m build 35 | - name: Publish package 36 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 37 | with: 38 | user: __token__ 39 | password: ${{ secrets.PYPI_API_TOKEN }} 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # File created using '.gitignore Generator' for Visual Studio Code: https://bit.ly/vscode-gig 2 | 3 | # Created by https://www.gitignore.io/api/windows,visualstudiocode,linux,python 4 | # Edit at https://www.gitignore.io/?templates=windows,visualstudiocode,linux,python 5 | 6 | ### Linux ### 7 | *~ 8 | 9 | # temporary files which can be created if a process still has a handle open of a deleted file 10 | .fuse_hidden* 11 | 12 | # KDE directory preferences 13 | .directory 14 | 15 | # Linux trash folder which might appear on any partition or disk 16 | .Trash-* 17 | 18 | # .nfs files are created when an open file is removed but is still being accessed 19 | .nfs* 20 | 21 | ### Python ### 22 | # Byte-compiled / optimized / DLL files 23 | __pycache__/ 24 | *.py[cod] 25 | *$py.class 26 | 27 | # C extensions 28 | *.so 29 | 30 | # Distribution / packaging 31 | .Python 32 | build/ 33 | develop-eggs/ 34 | dist/ 35 | downloads/ 36 | eggs/ 37 | .eggs/ 38 | lib/ 39 | lib64/ 40 | parts/ 41 | sdist/ 42 | var/ 43 | wheels/ 44 | pip-wheel-metadata/ 45 | share/python-wheels/ 46 | *.egg-info/ 47 | .installed.cfg 48 | *.egg 49 | MANIFEST 50 | 51 | # PyInstaller 52 | # Usually these files are written by a python script from a template 53 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 54 | *.manifest 55 | *.spec 56 | 57 | # Installer logs 58 | pip-log.txt 59 | pip-delete-this-directory.txt 60 | 61 | # Unit test / coverage reports 62 | htmlcov/ 63 | .tox/ 64 | .nox/ 65 | .coverage 66 | .coverage.* 67 | .cache 68 | nosetests.xml 69 | coverage.xml 70 | *.cover 71 | .hypothesis/ 72 | .pytest_cache/ 73 | 74 | # Translations 75 | *.mo 76 | *.pot 77 | 78 | # Scrapy stuff: 79 | .scrapy 80 | 81 | # Sphinx documentation 82 | docs/_build/ 83 | 84 | # PyBuilder 85 | target/ 86 | 87 | # pyenv 88 | .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 
95 | #Pipfile.lock 96 | 97 | # celery beat schedule file 98 | celerybeat-schedule 99 | 100 | # SageMath parsed files 101 | *.sage.py 102 | 103 | # Spyder project settings 104 | .spyderproject 105 | .spyproject 106 | 107 | # Rope project settings 108 | .ropeproject 109 | 110 | # Mr Developer 111 | .mr.developer.cfg 112 | .project 113 | .pydevproject 114 | 115 | # mkdocs documentation 116 | /site 117 | 118 | # mypy 119 | .mypy_cache/ 120 | .dmypy.json 121 | dmypy.json 122 | 123 | # Pyre type checker 124 | .pyre/ 125 | 126 | ### VisualStudioCode ### 127 | .vscode/* 128 | 129 | ### VisualStudioCode Patch ### 130 | # Ignore all local history of files 131 | .history 132 | 133 | ### Windows ### 134 | # Windows thumbnail cache files 135 | Thumbs.db 136 | Thumbs.db:encryptable 137 | ehthumbs.db 138 | ehthumbs_vista.db 139 | 140 | # Dump file 141 | *.stackdump 142 | 143 | # Folder config file 144 | [Dd]esktop.ini 145 | 146 | # Recycle Bin used on file shares 147 | $RECYCLE.BIN/ 148 | 149 | # Windows Installer files 150 | *.cab 151 | *.msi 152 | *.msix 153 | *.msm 154 | *.msp 155 | 156 | # Windows shortcuts 157 | *.lnk 158 | 159 | # End of https://www.gitignore.io/api/windows,visualstudiocode,linux,python 160 | 161 | # Custom rules (everything added below won't be overriden by 'Generate .gitignore File' if you use 'Update' option) 162 | 163 | exp_output 164 | polynom_tutorial/log 165 | 166 | # ignore ide files 167 | .idea 168 | venv 169 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Karlsruhe Institute of Technology (KIT) - Autonomous Learning Robots Lab (ALR) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # cw2 - ClusterWork 2 2 | 3 | [![Upload Python Package](https://github.com/ALRhub/cw2/actions/workflows/python-publish.yml/badge.svg)](https://github.com/ALRhub/cw2/actions/workflows/python-publish.yml) 4 | 5 | ClusterWork 2 is a python framework to manage experiments using YAML config files. It also enables users to easily deploy multiple experiments using different configurations on computing clusters, which support the [slurm workload manager](https://slurm.schedmd.com/documentation.html). 
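At a glance, a cw2 program couples a small Python entry point with a YAML config file. The sketch below is illustrative only: the file names (`minimal_main.py`, `minimal_config.yml`), the class `MyExperiment`, and all parameter values are hypothetical placeholders, and the authoritative `AbstractExperiment` interface is the one defined in `cw2/experiment.py` and described in [doc/02_experiment.md](doc/02_experiment.md).

```python
# minimal_main.py -- a minimal sketch, not a verbatim copy of the shipped templates
from cw2 import cluster_work, experiment
from cw2.cw_data import cw_logging


class MyExperiment(experiment.AbstractExperiment):
    """Placeholder experiment; replace the method bodies with your own code."""

    def initialize(self, cw_config: dict, rep: int, logger: cw_logging.LoggerArray) -> None:
        pass  # set up model / environment for repetition `rep`

    def run(self, cw_config: dict, rep: int, logger: cw_logging.LoggerArray) -> None:
        params = cw_config["params"]   # expanded grid/list parameters end up here
        logger.process({"loss": 0.0})  # hand results to all attached loggers

    def finalize(self, surrender=None, crash: bool = False):
        pass  # clean up, e.g. close files


if __name__ == "__main__":
    cw = cluster_work.ClusterWork(MyExperiment)
    cw.add_logger(cw_logging.Printer())  # attach any additional loggers here
    cw.run()
```

The matching config is a multi-document YAML file: an optional `DEFAULT` document holds shared settings, and each named experiment may declare a `grid` (cross product) or `list` (zipped) block that cw2 expands into separate runs. The keys below are cw2 config keys, but the values are made up for illustration:

```yaml
# minimal_config.yml -- illustrative values only
---
name: "DEFAULT"
path: "./exp_output"   # root folder for results; cw2 adds per-experiment and per-repetition subfolders
repetitions: 2

---
name: "my_experiment"
params:
  lr: 0.001
grid:
  batch_size: [32, 64]  # one run per value, combined with any other grid keys
```

Launched as shown under Program Execution below, cw2 unfolds the grid and repetitions into individual runs; adding `-s` submits the same jobs to SLURM instead of running them locally.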
6 | 7 | ## Installation 8 | ```bash 9 | pip install cw2 10 | ``` 11 | 12 | ## Quickstart 13 | Please refer to the [Quickstart Guide](doc/01_quickstart.md). 14 | 15 | ## Program Execution 16 | To start an experiment locally, e.g. for testing: 17 | ```bash 18 | python3 YOUR_MAIN.py YOUR_CONFIG.yml 19 | ``` 20 | 21 | To start an experiment on a slurm cluster: 22 | ```bash 23 | python3 YOUR_MAIN.py YOUR_CONFIG.yml -s 24 | ``` 25 | 26 | -------------------------------------------------------------------------------- /cw2/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ALRhub/cw2/7a7b8a235731e8576e1616a46a61f442cd616cd3/cw2/__init__.py -------------------------------------------------------------------------------- /cw2/alternative_schedulers.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import concurrent.futures as con 3 | import multiprocessing 4 | import os 5 | import queue 6 | from typing import List 7 | 8 | from joblib import Parallel, delayed 9 | 10 | from cw2 import cw_error, job 11 | from cw2.cw_config import cw_config 12 | from cw2.cw_slurm import cw_slurm 13 | from cw2.scheduler import GPUDistributingLocalScheduler 14 | 15 | 16 | class StarmapGPUDistributingLocalScheduler(GPUDistributingLocalScheduler): 17 | def run(self, overwrite: bool = False): 18 | print("Using StarmapGPUDistributingLocalScheduler") 19 | num_parallel = self.joblist[0].n_parallel 20 | for j in self.joblist: 21 | assert ( 22 | j.n_parallel == num_parallel 23 | ), "All jobs in list must have same n_parallel" 24 | assert j.n_parallel == self._queue_elements, ( 25 | "Mismatch between GPUs Queue Elements and Jobs executed in" 26 | "parallel. Fix for optimal resource usage!!" 27 | ) 28 | 29 | with multiprocessing.Pool(processes=num_parallel) as pool: 30 | # setup gpu resource queue 31 | m = multiprocessing.Manager() 32 | gpu_queue = m.Queue(maxsize=self._queue_elements) 33 | for i in range(self._queue_elements): 34 | gpu_queue.put(i) 35 | 36 | for j in self.joblist: 37 | args = [ 38 | (j, c, gpu_queue, self._gpus_per_rep, overwrite) for c in j.tasks 39 | ] 40 | pool.starmap_async( 41 | StarmapGPUDistributingLocalScheduler._execute_task, args 42 | ) 43 | pool.close() 44 | pool.join() 45 | 46 | @staticmethod 47 | def _execute_task( 48 | j: job.Job, 49 | c: dict, 50 | q: multiprocessing.Queue, 51 | gpus_per_job: int, 52 | overwrite: bool = False, 53 | ): 54 | gpu_idx = q.get() 55 | s = ("{}," * gpus_per_job).format( 56 | *[gpu_idx * gpus_per_job + i for i in range(gpus_per_job)] 57 | )[:-1] 58 | try: 59 | os.environ["CUDA_VISIBLE_DEVICES"] = s 60 | j.run_task(c, overwrite) 61 | except cw_error.ExperimentSurrender as _: 62 | return 63 | finally: 64 | q.put(gpu_idx) 65 | 66 | 67 | class ConcurrentGPUDistributingLocalScheduler(GPUDistributingLocalScheduler): 68 | def run(self, overwrite: bool = False): 69 | print("Using ConcurrentGPUDistributingLocalScheduler") 70 | num_parallel = self.joblist[0].n_parallel 71 | for j in self.joblist: 72 | assert ( 73 | j.n_parallel == num_parallel 74 | ), "All jobs in list must have same n_parallel" 75 | assert j.n_parallel == self._queue_elements, ( 76 | "Mismatch between GPUs Queue Elements and Jobs executed in" 77 | "parallel. Fix for optimal resource usage!!" 
78 | ) 79 | 80 | with con.ProcessPoolExecutor(max_workers=num_parallel) as pool: 81 | # setup gpu resource queue 82 | # gpu_queue = queue.Queue(maxsize=self._queue_elements) 83 | # for i in range(self._queue_elements): 84 | # gpu_queue.put(i) 85 | 86 | results = [] 87 | for j in self.joblist: 88 | for i, c in enumerate(j.tasks): 89 | results.append( 90 | pool.submit( 91 | ConcurrentGPUDistributingLocalScheduler._execute_task, 92 | j, 93 | c, 94 | i, 95 | self._gpus_per_rep, 96 | overwrite, 97 | ) 98 | ) 99 | for r in results: 100 | r.result() 101 | 102 | @staticmethod 103 | def _execute_task( 104 | j: job.Job, 105 | c: dict, 106 | idx: int, 107 | # q: multiprocessing.Queue, 108 | gpus_per_job: int, 109 | overwrite: bool = False, 110 | ): 111 | # gpu_idx = q.get() 112 | s = ("{}," * gpus_per_job).format( 113 | *[idx * gpus_per_job + i for i in range(gpus_per_job)] 114 | )[:-1] 115 | try: 116 | os.environ["CUDA_VISIBLE_DEVICES"] = s 117 | j.run_task(c, overwrite) 118 | except cw_error.ExperimentSurrender as _: 119 | return 120 | 121 | # finally: 122 | # q.put(gpu_idx) 123 | 124 | 125 | class JoblibGPUDistributingLocalScheduler(GPUDistributingLocalScheduler): 126 | def run(self, overwrite: bool = False): 127 | print("Using JoblibGPUDistributingLocalScheduler") 128 | for j in self.joblist: 129 | Parallel(n_jobs=j.n_parallel)( 130 | delayed(self.execute_task)(j, c, i, self._gpus_per_rep, overwrite) 131 | for i, c in enumerate(j.tasks) 132 | ) 133 | 134 | def execute_task( 135 | self, j: job.Job, c: dict, idx: int, gpus_per_job: int, overwrite: bool = False 136 | ): 137 | s = ("{}," * gpus_per_job).format( 138 | *[idx * gpus_per_job + i for i in range(gpus_per_job)] 139 | )[:-1] 140 | try: 141 | os.environ["CUDA_VISIBLE_DEVICES"] = s 142 | j.run_task(c, overwrite) 143 | except cw_error.ExperimentSurrender as _: 144 | return 145 | 146 | 147 | class RayGPUDistributingLocalScheduler(GPUDistributingLocalScheduler): 148 | def run(self, overwrite: bool = False): 149 | print("Using RayGPUDistributingLocalScheduler") 150 | 151 | import ray 152 | from ray.util.queue import Queue 153 | 154 | @ray.remote 155 | def _execute_task( 156 | j: job.Job, c: dict, q, gpus_per_job: int, overwrite: bool = False 157 | ): 158 | gpu_idx = q.get() 159 | print("I got gpu idx", gpu_idx) 160 | s = ("{}," * gpus_per_job).format( 161 | *[gpu_idx * gpus_per_job + i for i in range(gpus_per_job)] 162 | )[:-1] 163 | try: 164 | os.environ["CUDA_VISIBLE_DEVICES"] = s 165 | j.run_task(c, overwrite) 166 | except cw_error.ExperimentSurrender as _: 167 | return 168 | finally: 169 | print("giving back gpu idx", gpu_idx) 170 | q.put(gpu_idx) 171 | 172 | ray.init() 173 | num_parallel = self.joblist[0].n_parallel 174 | for j in self.joblist: 175 | assert ( 176 | j.n_parallel == num_parallel 177 | ), "All jobs in list must have same n_parallel" 178 | assert j.n_parallel == self._queue_elements, ( 179 | "Mismatch between GPUs Queue Elements and Jobs executed in" 180 | "parallel. Fix for optimal resource usage!!" 
181 | ) 182 | gpu_queue = Queue(maxsize=self._queue_elements) 183 | 184 | for i in range(self._queue_elements): 185 | gpu_queue.put(i) 186 | results = [] 187 | for j in self.joblist: 188 | for i, c in enumerate(j.tasks): 189 | results.append( 190 | _execute_task.remote(j, c, gpu_queue, self._gpus_per_rep, overwrite) 191 | ) 192 | ray.get(results) 193 | -------------------------------------------------------------------------------- /cw2/cli_parser.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | 4 | class Arguments: 5 | def __init__(self): 6 | p = argparse.ArgumentParser() 7 | p.add_argument("config", metavar="CONFIG.yml") 8 | p.add_argument( 9 | "-j", 10 | "--job", 11 | type=int, 12 | default=None, 13 | help="Run only the specified job. CAVEAT: Should only be used with slurm arrays.", 14 | ) 15 | 16 | # XXX: Disable delete for now 17 | # p.add_argument('-d', '--delete', action='store_true', 18 | # help='CAUTION deletes results of previous runs.') 19 | 20 | p.add_argument( 21 | "-e", 22 | "--experiments", 23 | nargs="+", 24 | default=None, 25 | help="Allows to specify which experiments should be run.", 26 | ) 27 | p.add_argument( 28 | "-s", 29 | "--slurm", 30 | action="store_true", 31 | help="Run using SLURM Workload Manager.", 32 | ) 33 | p.add_argument( 34 | "-o", "--overwrite", action="store_true", help="Overwrite existing results." 35 | ) 36 | p.add_argument( 37 | "-t", 38 | "--prefix-with-timestamp", 39 | dest="prefix_with_timestamp", 40 | action="store_true", 41 | default=False, 42 | help="If specified, prefix all started experiment runs with this timestamp. " 43 | "This can help with telling runs apart from one another. but will also modify the log " 44 | "directiories created. CAUTION: Only works with local schedulers (no SLURM etc.)", 45 | ) 46 | p.add_argument("--nocodecopy", action="store_true", help="Skip code copy.") 47 | p.add_argument( 48 | "--zip", action="store_true", help="Make a Zip Copy of the Code." 49 | ) 50 | p.add_argument( 51 | "--skipsizecheck", 52 | action="store_true", 53 | help="Skip check if code copy src < 200MByte", 54 | ) 55 | p.add_argument( 56 | "--multicopy", 57 | action="store_true", 58 | help="Create a code copy for each job seperately", 59 | ) 60 | p.add_argument( 61 | "--noconsolelog", 62 | action="store_true", 63 | help="Disables writing internal console log files", 64 | ) 65 | p.add_argument( 66 | "--debug", action="store_true", default=False, help="Enable debug mode." 67 | ) 68 | p.add_argument( 69 | "--debugall", 70 | action="store_true", 71 | default=False, 72 | help="Enable debug mode for arguments.", 73 | ) 74 | 75 | self.args = p.parse_args(namespace=self) 76 | if self.args.slurm and self.args.prefix_with_timestamp: 77 | raise ValueError( 78 | "Timestep prefixing (-t) only work on local schedulers, " 79 | "so cannot use args --slurm (-s) and --prefix-with-timestamp (-t) at the same time." 
80 | ) 81 | 82 | def get(self) -> dict: 83 | return vars(self.args) 84 | -------------------------------------------------------------------------------- /cw2/cluster_work.py: -------------------------------------------------------------------------------- 1 | from typing import List, Type 2 | 3 | from cw2 import cli_parser, experiment, job, scheduler 4 | from cw2.cw_config import cw_config 5 | from cw2.cw_data import cw_loading, cw_logging 6 | 7 | 8 | class ClusterWork: 9 | def __init__(self, exp_cls: Type[experiment.AbstractExperiment] = None): 10 | self.args = cli_parser.Arguments().get() 11 | self.exp_cls = exp_cls 12 | self.config = cw_config.Config( 13 | self.args["config"], 14 | self.args["experiments"], 15 | self.args["debug"], 16 | self.args["debugall"], 17 | self.args["prefix_with_timestamp"] 18 | ) 19 | 20 | self.logArray = cw_logging.LoggerArray() 21 | 22 | if not self.args["noconsolelog"]: 23 | self.add_logger(cw_logging.PythonLogger()) 24 | self.joblist = None 25 | 26 | def add_logger(self, logger: cw_logging.AbstractLogger) -> None: 27 | """add a logger to the ClusterWork pipeline 28 | 29 | Args: 30 | logger (cw_logging.AbstractLogger): logger object to be called during execution 31 | """ 32 | self.logArray.add(logger) 33 | 34 | def _get_jobs( 35 | self, delete: bool = False, root_dir: str = "", read_only: bool = False 36 | ) -> List[job.Job]: 37 | """private method. creates and returns all configured jobs. 38 | 39 | Args: 40 | delete (bool, optional): delete all old data inside the job directories. Defaults to False. 41 | root_dir (str, optional): [description]. Defaults to "". 42 | 43 | Returns: 44 | List[job.Job]: list of all configured job objects 45 | """ 46 | if self.joblist is None: 47 | factory = job.JobFactory( 48 | self.exp_cls, self.logArray, delete, root_dir, read_only 49 | ) 50 | self.joblist = factory.create_jobs(self.config.exp_configs) 51 | return self.joblist 52 | 53 | def run(self, root_dir: str = "", sch: scheduler.AbstractScheduler = None): 54 | """Run ClusterWork computations. 55 | 56 | Args: 57 | root_dir (str, optional): [description]. Defaults to "". 58 | """ 59 | if self.exp_cls is None: 60 | raise NotImplementedError( 61 | "Cannot run with missing experiment.AbstractExperiment Implementation." 62 | ) 63 | 64 | self.config.to_yaml(relpath=True) 65 | 66 | args = self.args 67 | 68 | # Handle SLURM execution 69 | if args["slurm"]: 70 | s = scheduler.SlurmScheduler(self.config) 71 | else: 72 | # Do Local execution 73 | if sch is None: 74 | if scheduler.GPUDistributingLocalScheduler.use_distributed_gpu_scheduling( 75 | self.config 76 | ): 77 | scheduler_cls = scheduler.get_gpu_scheduler_cls( 78 | self.config.slurm_config.get("scheduler", "mp") 79 | ) 80 | s = scheduler_cls(self.config) 81 | 82 | elif scheduler.CpuDistributingLocalScheduler.use_distributed_cpu_scheduling( 83 | self.config 84 | ): 85 | s = scheduler.CpuDistributingLocalScheduler(self.config) 86 | 87 | else: 88 | s = scheduler.LocalScheduler() 89 | else: 90 | s = sch 91 | 92 | self._run_scheduler(s, root_dir) 93 | 94 | def load(self, root_dir: str = ""): 95 | """Loads all saved information. 96 | 97 | Args: 98 | root_dir (str, optional): [description]. Defaults to "". 99 | 100 | Returns: 101 | pd.DataFrame: saved data in Dataframe form. 
102 | """ 103 | 104 | loader = cw_loading.Loader() 105 | 106 | return self._run_scheduler(loader, root_dir, True) 107 | 108 | def _run_scheduler( 109 | self, 110 | s: scheduler.AbstractScheduler, 111 | root_dir: str = "", 112 | read_only: bool = False, 113 | ): 114 | if self.logArray.is_empty(): 115 | cw_logging.getLogger().warning("No Logger has been added. Are you sure?") 116 | 117 | args = self.args 118 | job_list = self._get_jobs(False, root_dir, read_only) 119 | 120 | if args["job"] is not None: 121 | job_list = [job_list[args["job"]]] 122 | 123 | s.assign(job_list) 124 | return s.run(overwrite=args["overwrite"]) 125 | -------------------------------------------------------------------------------- /cw2/cw_config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ALRhub/cw2/7a7b8a235731e8576e1616a46a61f442cd616cd3/cw2/cw_config/__init__.py -------------------------------------------------------------------------------- /cw2/cw_config/conf_io.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import List, Tuple 3 | 4 | import yaml 5 | 6 | from cw2.cw_config import cw_conf_keys as KEY 7 | from cw2.cw_error import ExperimentNotFoundError, MissingConfigError 8 | 9 | 10 | def get_configs( 11 | config_path: str, experiment_selections: List[str] 12 | ) -> Tuple[dict, dict, List[dict]]: 13 | """reads and seperates the experiment configs from a yaml file 14 | 15 | Args: 16 | config_path (str): path to the yaml file 17 | experiment_selections (List[str]): a list of selected experiment names 18 | 19 | Returns: 20 | Tuple[dict, dict, List[dict]]: SLURM, DEFAULT, Experiment Configurations 21 | """ 22 | all_configs = read_yaml(config_path) 23 | return separate_configs(all_configs, experiment_selections) 24 | 25 | 26 | def read_yaml(config_path: str) -> List[dict]: 27 | """reads a YAML configuration file containing potentially multiple experiments 28 | 29 | Arguments: 30 | config_path {str}: path to the YAML config file 31 | 32 | Returns: 33 | List[dict]: all configs found in the yaml file 34 | """ 35 | if not os.path.exists(config_path): 36 | raise MissingConfigError("Could not find {}".format(config_path)) 37 | 38 | all_configs = [] 39 | 40 | with open(config_path, "r") as f: 41 | for exp_conf in yaml.load_all(f, yaml.FullLoader): 42 | if exp_conf is not None: 43 | all_configs.append(exp_conf) 44 | return all_configs 45 | 46 | 47 | def separate_configs( 48 | all_configs: List[dict], experiment_selections: List[str], suppress: bool = False 49 | ) -> Tuple[List[dict], dict, List[dict]]: 50 | """separates the list of individual configs into the 'special' SLURM, DEFAULT and normal experiment configs 51 | 52 | Arguments: 53 | all_configs (List[dict]): a list of all configurations 54 | experiment_selections (List[str], optional): List of specific experiments to run. If None runs all. Defaults to None. 
55 | 56 | Returns: 57 | Tuple[dict, dict, List[dict]]: SLURM, DEFAULT, Experiment Configurations, in this order 58 | """ 59 | default_config = None 60 | slurm_config = [] 61 | experiment_configs = [] 62 | 63 | for c in all_configs: 64 | name = c[KEY.NAME] 65 | 66 | if KEY.SLURM in name.lower(): 67 | slurm_config.append(c) 68 | elif name.lower() == KEY.DEFAULT: 69 | default_config = c 70 | else: 71 | if experiment_selections is None or name in experiment_selections: 72 | experiment_configs.append(c) 73 | 74 | if not suppress and len(experiment_configs) == 0: 75 | raise ExperimentNotFoundError("No selected experiment found in config file.") 76 | 77 | return slurm_config, default_config, experiment_configs 78 | 79 | 80 | def write_yaml(fpath, data): 81 | """write a yaml file 82 | 83 | Args: 84 | fpath : path 85 | data : payload 86 | """ 87 | os.makedirs(os.path.dirname(fpath), exist_ok=True) 88 | with open(fpath, "w") as f: 89 | yaml.dump_all(data, f, default_flow_style=False) 90 | -------------------------------------------------------------------------------- /cw2/cw_config/conf_path.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Any, Dict, List 3 | 4 | from cw2.cw_config import cw_conf_keys as KEY 5 | 6 | 7 | def normalize_expanded_paths( 8 | expanded_config_list: List[Dict[str, Any]] 9 | ) -> List[Dict[str, Any]]: 10 | """normalizes path key after expansion operation 11 | 12 | Args: 13 | expanded_config_list (List[Dict[str, Any]]): list fo expanded experiment configs 14 | 15 | Returns: 16 | List[Dict[str, Any]]: noramlized expanded experiment configs 17 | """ 18 | # Set Path and LogPath Args depending on the name 19 | for _config in expanded_config_list: 20 | _config[KEY.PATH] = os.path.join( 21 | _config[KEY.i_BASIC_PATH], _config[KEY.i_NEST_DIR], _config[KEY.i_EXP_NAME] 22 | ) 23 | _config[KEY.LOG_PATH] = os.path.join(_config[KEY.PATH], "log") 24 | return expanded_config_list 25 | 26 | 27 | def make_rel_paths(config: Dict[str, Any], base_path: str) -> Dict[str, Any]: 28 | """converts relevant paths of the config into relative paths 29 | 30 | Args: 31 | config (Dict[str, Any]): experiment config 32 | base_path (str): base path 33 | 34 | Returns: 35 | Dict[str, Any]: experiment config with paths relative to base_path 36 | """ 37 | c = config.copy() 38 | _basic_path = base_path 39 | c[KEY.LOG_PATH] = os.path.join(".", os.path.relpath(c[KEY.LOG_PATH], _basic_path)) 40 | c[KEY.i_REP_LOG_PATH] = os.path.join( 41 | ".", os.path.relpath(c[KEY.i_REP_LOG_PATH], _basic_path) 42 | ) 43 | c[KEY.PATH] = os.path.join(".", os.path.relpath(c[KEY.PATH], _basic_path)) 44 | c[KEY.i_BASIC_PATH] = os.path.join( 45 | ".", os.path.relpath(c[KEY.i_BASIC_PATH], _basic_path) 46 | ) 47 | return c 48 | -------------------------------------------------------------------------------- /cw2/cw_config/conf_resolver.py: -------------------------------------------------------------------------------- 1 | import os 2 | from copy import deepcopy 3 | from typing import List 4 | 5 | from cw2 import util 6 | from cw2.cw_config import conf_io 7 | from cw2.cw_config import cw_conf_keys as KEY 8 | from cw2.cw_error import ConfigKeyError, MissingConfigError 9 | 10 | 11 | def resolve_dependencies( 12 | default_config: dict, experiment_configs: List[dict], conf_path: str 13 | ) -> List[dict]: 14 | """resolves all internal (DEFAULT) and external (import) dependencies 15 | 16 | Args: 17 | default_config (dict): DEFAULT exp configuration 18 | experiment_configs 
(List[dict]): list of experiment configurations 19 | conf_path (str): path of the "calling" config file 20 | 21 | Returns: 22 | List[dict]: list of experiment configurations without unresolved dependencies 23 | """ 24 | experiment_configs = merge_default(default_config, experiment_configs) 25 | 26 | abs_path = os.path.abspath(conf_path) 27 | experiment_configs = import_external_yml(experiment_configs, abs_path) 28 | return experiment_configs 29 | 30 | 31 | def merge_default(default_config: dict, experiment_configs: List[dict]) -> List[dict]: 32 | """merges each individual experiment configuration with the default parameters 33 | 34 | Arguments: 35 | default_config {dict} -- default configuration parameters 36 | experiment_configs {List[dict]} -- a list of individual experiment configurations 37 | 38 | Returns: 39 | List[dict] -- a list of all experiment configurations 40 | """ 41 | if default_config is None: 42 | return experiment_configs 43 | 44 | expanded_exp_configs = [] 45 | for c in experiment_configs: 46 | merge_c = deepcopy(default_config) 47 | merge_c = util.deep_update(merge_c, c) 48 | expanded_exp_configs.append(merge_c) 49 | return expanded_exp_configs 50 | 51 | 52 | def import_external_yml( 53 | experiment_configs: List[dict], abs_path: str, traversal_dict: dict = None 54 | ) -> List[dict]: 55 | """recursively imports external yaml files 56 | The external yaml files are first merged with their own DEFAULT configuration, 57 | then their external dependencies get resolved. 58 | 59 | Args: 60 | experiment_configs (List[dict]): list of experiment configurations 61 | abs_path (str): Absolute file path of the YAML file which gets resolved.. 62 | traversal_dict (dict, optional): Dictionary(abs_path, exp_name) Serves as a failsafe to detect cyclic imports. 63 | Defaults to None. 64 | 65 | Raises: 66 | ConfigKeyError: if a cyclic import is attempted 67 | MissingConfigError: if the linked config cannot be found 68 | 69 | Returns: 70 | List[dict]: a list of resolved experiment configurations. 
71 | """ 72 | 73 | if traversal_dict is None: 74 | traversal_dict = {abs_path: []} 75 | 76 | resolved_configs = [] 77 | for config in experiment_configs: 78 | # SKIP 79 | if KEY.IMPORT_PATH not in config and KEY.IMPORT_EXP not in config: 80 | resolved_configs.append(config) 81 | continue 82 | 83 | # Record current step 84 | traversal_dict[abs_path].append(config[KEY.NAME]) 85 | 86 | import_yml = abs_path 87 | if KEY.IMPORT_PATH in config: 88 | import_yml = config[KEY.IMPORT_PATH] 89 | 90 | # Get absolute Path for import 91 | import_yml = os.path.abspath( 92 | os.path.join(os.path.dirname(abs_path), import_yml) 93 | ) 94 | 95 | all_external_configs = conf_io.read_yaml(import_yml) 96 | 97 | ext_exp_name = KEY.DEFAULT 98 | if custom_import_exp(config): 99 | ext_exp_name = config[KEY.IMPORT_EXP] 100 | 101 | # Recursion Anchor: 102 | if import_yml in traversal_dict and ext_exp_name in traversal_dict[import_yml]: 103 | raise ConfigKeyError( 104 | "Cyclic YML import with {} : {}".format(import_yml, ext_exp_name) 105 | ) 106 | 107 | # Default Merge External 108 | _, external, ext_selection = conf_io.separate_configs( 109 | all_external_configs, [ext_exp_name], suppress=True 110 | ) 111 | 112 | if custom_import_exp(config): 113 | if len(ext_selection) == 0: 114 | raise MissingConfigError( 115 | "Could not import {} from {}".format(ext_exp_name, import_yml) 116 | ) 117 | 118 | external = merge_default(external, ext_selection)[0] 119 | 120 | # Register new Anchor 121 | if import_yml not in traversal_dict: 122 | traversal_dict[import_yml] = [] 123 | traversal_dict[import_yml].append(ext_exp_name) 124 | 125 | # Recursion call 126 | ext_resolved_conf = import_external_yml([external], import_yml, traversal_dict)[ 127 | 0 128 | ] 129 | 130 | # Delete Anchor when coming back 131 | del traversal_dict[import_yml] 132 | 133 | resolved_conf = merge_default(ext_resolved_conf, [config])[0] 134 | resolved_conf = archive_import_keys(resolved_conf) 135 | resolved_configs.append(resolved_conf) 136 | return resolved_configs 137 | 138 | 139 | def custom_import_exp(config: dict) -> bool: 140 | """check if the config uses a custom import_exp 141 | 142 | Args: 143 | config (dict): experiment configuration 144 | 145 | Returns: 146 | bool: True if a custom import_exp key is defined 147 | """ 148 | if KEY.IMPORT_EXP not in config: 149 | return False 150 | if config[KEY.IMPORT_EXP].lower() == KEY.DEFAULT: 151 | return False 152 | return True 153 | 154 | 155 | def archive_import_keys(config: dict) -> dict: 156 | """ 157 | Args: 158 | config (dict): experiment configuration 159 | 160 | 161 | Returns: 162 | dict: experiment configuration with archived import keys 163 | """ 164 | removal_keys = [KEY.IMPORT_PATH, KEY.IMPORT_EXP] 165 | replacement_keys = [KEY.i_IMPORT_PATH_ARCHIVE, KEY.i_IMPORT_EXP_ARCHIVE] 166 | 167 | for removal, replacement in zip(removal_keys, replacement_keys): 168 | if removal in config: 169 | config[replacement] = config[removal] 170 | del config[removal] 171 | return config 172 | -------------------------------------------------------------------------------- /cw2/cw_config/conf_unfolder.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import os 3 | from collections import deque 4 | from copy import deepcopy 5 | from typing import List 6 | 7 | from cw2 import util 8 | from cw2.cw_config import conf_path 9 | from cw2.cw_config import cw_conf_keys as KEY 10 | from cw2.cw_data import cw_logging 11 | 12 | 13 | def unfold_exps(exp_configs: List[dict], 
debug: bool, debug_all: bool) -> List[dict]: 14 | """unfolds a list of experiment configurations into the different 15 | hyperparameter runs and repetitions 16 | 17 | Args: 18 | exp_configs (List[dict]): list of experiment configurations 19 | 20 | Returns: 21 | List[dict]: list of unfolded experiment configurations 22 | """ 23 | param_expansion = expand_experiments(exp_configs, debug, debug_all) 24 | unrolled = unroll_exp_reps(param_expansion) 25 | return unrolled 26 | 27 | 28 | def expand_experiments( 29 | _experiment_configs: List[dict], debug: bool, debug_all: bool 30 | ) -> List[dict]: 31 | """Expand the experiment configuration with concrete parameter instantiations 32 | 33 | Arguments: 34 | experiment_configs {List[dict]} -- List with experiment configs 35 | 36 | Returns: 37 | List[dict] -- List of experiment configs, with set parameters 38 | """ 39 | 40 | # get all options that are iteratable and build all combinations (grid) or tuples (list) 41 | experiment_configs = deque(deepcopy(_experiment_configs)) 42 | if debug or debug_all: 43 | for ec in experiment_configs: 44 | ec[KEY.REPS] = ec["iterations"] = ec[KEY.REPS_PARALL] = ec[ 45 | KEY.REPS_P_JOB 46 | ] = 1 47 | 48 | expanded_config_list = [] 49 | 50 | while len(experiment_configs) > 0: 51 | config = experiment_configs.popleft() 52 | 53 | # Set Default Values 54 | # save path argument from YML for grid modification 55 | if KEY.i_BASIC_PATH not in config: 56 | config[KEY.i_BASIC_PATH] = config.get(KEY.PATH) 57 | # save name argument from YML for grid modification 58 | if KEY.i_EXP_NAME not in config: 59 | config[KEY.i_EXP_NAME] = config.get(KEY.NAME) 60 | # add empty string for parent DIR in case of grid 61 | if KEY.i_NEST_DIR not in config: 62 | config[KEY.i_NEST_DIR] = "" 63 | # set debug flag 64 | config[KEY.i_DEBUG_FLAG] = debug or debug_all 65 | 66 | expansion = None 67 | for key in config: 68 | if key.startswith(KEY.GRID): 69 | expansion = params_combine(config, key, itertools.product) 70 | break 71 | if key.startswith(KEY.LIST): 72 | expansion = params_combine(config, key, zip) 73 | break 74 | if key.startswith(KEY.ABLATIVE): 75 | expansion = ablative_expand(config, key) 76 | break 77 | 78 | if expansion is not None: 79 | if debug and not debug_all: 80 | expansion = expansion[:1] 81 | experiment_configs.extend(expansion) 82 | else: 83 | expanded_config_list.append(config) 84 | 85 | return conf_path.normalize_expanded_paths(expanded_config_list) 86 | 87 | 88 | def params_combine(config: dict, key: str, iter_func) -> List[dict]: 89 | """combines experiment parameter with its list/grid combinations 90 | 91 | Args: 92 | config (dict): an single experiment configuration 93 | key (str): the combination key, e.g. 
'list' or 'grid' 94 | iter_func: itertool-like function for creating the combinations 95 | 96 | Returns: 97 | List[dict]: list of parameter-combined experiments 98 | """ 99 | if iter_func is None: 100 | return [config] 101 | 102 | combined_configs = [] 103 | # convert list/grid dictionary into flat dictionary, where the key is a tuple of the keys and the 104 | # value is the list of values 105 | tuple_dict = util.flatten_dict_to_tuple_keys(config[key]) 106 | _param_names = [".".join(t) for t in tuple_dict] 107 | 108 | param_lengths = map(len, tuple_dict.values()) 109 | if key.startswith(KEY.LIST) and len(set(param_lengths)) != 1: 110 | cw_logging.getLogger().warning( 111 | f'experiment "{config[KEY.NAME]}" list params [{key}] are not of equal length.'.format() 112 | ) 113 | 114 | # create a new config for each parameter setting 115 | for values in iter_func(*tuple_dict.values()): 116 | _config = deepcopy(config) 117 | 118 | # Remove Grid/List Argument 119 | del _config[key] 120 | 121 | if KEY.PARAMS not in _config: 122 | _config[KEY.PARAMS] = {} 123 | 124 | # Expand Grid/List Parameters 125 | for i, t in enumerate(tuple_dict.keys()): 126 | util.insert_deep_dictionary(d=_config.get(KEY.PARAMS), t=t, value=values[i]) 127 | 128 | _config = extend_config_name(_config, _param_names, values) 129 | combined_configs.append(_config) 130 | return combined_configs 131 | 132 | 133 | def ablative_expand(config: dict, key: str): 134 | tuple_dict = util.flatten_dict_to_tuple_keys(config[key]) 135 | _param_names = [".".join(t) for t in tuple_dict] 136 | combined_configs = [] 137 | for i, t in enumerate(tuple_dict.keys()): 138 | for val in tuple_dict[t]: 139 | _config = deepcopy(config) 140 | 141 | # Remove Grid/List Argument 142 | del _config[key] 143 | 144 | if KEY.PARAMS not in _config: 145 | _config[KEY.PARAMS] = {} 146 | util.insert_deep_dictionary(d=_config.get(KEY.PARAMS), t=t, value=val) 147 | # TODO: TEST 148 | _config = extend_config_name(_config, [_param_names[i]], [val]) 149 | 150 | combined_configs.append(_config) 151 | return combined_configs 152 | 153 | 154 | def extend_config_name(config: dict, param_names: list, values: list) -> dict: 155 | """extend an experiment name with a shorthand derived from the parameters and their values 156 | 157 | Args: 158 | config (dict): experiment config 159 | param_names (list): list of parameter names 160 | values (list): list of parameter values 161 | 162 | Returns: 163 | dict: experiment config with extended name 164 | """ 165 | # Rename and append 166 | _converted_name = util.convert_param_names(param_names, values) 167 | 168 | # Use __ only once as a seperator 169 | sep = "__" 170 | if KEY.i_EXP_NAME in config and sep in config.get(KEY.i_EXP_NAME): 171 | sep = "_" 172 | 173 | config[KEY.i_EXP_NAME] = config.get(KEY.i_EXP_NAME) + sep + _converted_name 174 | config[KEY.i_NEST_DIR] = config.get(KEY.NAME) 175 | return config 176 | 177 | 178 | def unroll_exp_reps(exp_configs: List[dict]) -> List[dict]: 179 | """unrolls experiment repetitions into their own configuration object 180 | 181 | Args: 182 | exp_configs (List[dict]): List of experiment configurations 183 | 184 | Returns: 185 | List[dict]: List of unrolled experiment configurations 186 | """ 187 | unrolled_exps = [] 188 | 189 | for config in exp_configs: 190 | if KEY.i_REP_IDX in config: 191 | unrolled_exps.append(config) 192 | continue 193 | 194 | for r in range(config[KEY.REPS]): 195 | c = deepcopy(config) 196 | c[KEY.i_REP_IDX] = r 197 | c[KEY.i_REP_LOG_PATH] = os.path.join( 198 | 
c.get(KEY.LOG_PATH), "rep_{:02d}".format(r) 199 | ) 200 | unrolled_exps.append(c) 201 | return unrolled_exps 202 | -------------------------------------------------------------------------------- /cw2/cw_config/cw_conf_keys.py: -------------------------------------------------------------------------------- 1 | # SECTIONS 2 | SLURM = "slurm" 3 | DEFAULT = "default" 4 | 5 | # EXP KEYS 6 | NAME = "name" 7 | PATH = "path" 8 | LOG_PATH = "log_path" 9 | 10 | IMPORT_PATH = "import_path" 11 | IMPORT_EXP = "import_exp" 12 | 13 | # REPS 14 | REPS = "repetitions" 15 | REPS_PARALL = "reps_in_parallel" 16 | REPS_P_JOB = "reps_per_job" 17 | 18 | # EXP PARAMS 19 | PARAMS = "params" 20 | GRID = "grid" 21 | LIST = "list" 22 | ABLATIVE = "ablative" 23 | 24 | # INTERNAL 25 | i_BASIC_PATH = "_basic_path" 26 | i_EXP_NAME = "_experiment_name" 27 | i_NEST_DIR = "_nested_dir" 28 | i_DEBUG_FLAG = "_debug" 29 | # INTERNAL REP 30 | i_REP_IDX = "_rep_idx" 31 | i_REP_LOG_PATH = "_rep_log_path" 32 | 33 | # INTERNAL IMPORT ARCHIVE 34 | i_IMPORT_PATH_ARCHIVE = "_import_path_archive" 35 | i_IMPORT_EXP_ARCHIVE = "_import_exp_archive" 36 | 37 | # CPU CORES ASSIGNMENT 38 | i_CPU_CORES = "cpu_cores" 39 | -------------------------------------------------------------------------------- /cw2/cw_config/cw_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import socket 3 | from typing import List, Tuple 4 | from datetime import datetime 5 | 6 | import cw2.cw_config.cw_conf_keys as KEY 7 | from cw2.cw_config import conf_io, conf_path, conf_resolver, conf_unfolder 8 | 9 | 10 | class Config: 11 | def __init__( 12 | self, 13 | config_path: str = None, 14 | experiment_selections: List[str] = None, 15 | debug: bool = False, 16 | debug_all: bool = False, 17 | prefix_with_timestamp: bool = False 18 | ): 19 | self.slurm_config = None 20 | self.exp_configs = None 21 | 22 | self.f_name = None 23 | self.config_path = config_path 24 | self.exp_selections = experiment_selections 25 | 26 | self.prefix_with_timestamp = prefix_with_timestamp 27 | 28 | if config_path is not None: 29 | self.load_config(config_path, experiment_selections, debug, debug_all) 30 | 31 | def load_config( 32 | self, 33 | config_path: str, 34 | experiment_selections: List[str] = None, 35 | debug: bool = False, 36 | debug_all: bool = False, 37 | ) -> None: 38 | """Loads config from YAML file 39 | The config can include multiple experiments, DEFAULT paramters and a SLURM configuration 40 | 41 | Arguments: 42 | config_path {str} -- path to a YAML configuraton file 43 | experiment_selections (List[str], optional): List of specific experiments to run. If None runs all. Defaults to None. 
44 | """ 45 | 46 | self.config_path = config_path 47 | self.f_name = os.path.basename(config_path) 48 | 49 | self.exp_selections = experiment_selections 50 | 51 | slurm_configs, self.exp_configs = self._parse_configs( 52 | config_path, experiment_selections, debug, debug_all 53 | ) 54 | self.slurm_config = self._filter_slurm_configs(slurm_configs) 55 | 56 | @staticmethod 57 | def _filter_slurm_configs(slurm_configs: List[dict]) -> dict: 58 | """Returns machine/cluster specific slurm conf (identified by hostname) 59 | if available, otherwise returns the default one (if available) 60 | 61 | Arguments: 62 | slurm_configs: (list[dict]) -- all slurm configurations found in the config file 63 | Returns: 64 | dict -- SLURM configuration to use for this machine 65 | """ 66 | default_conf = None 67 | specific_conf = None 68 | hostname = socket.gethostname().lower() 69 | print("Hostname: {}".format(hostname)) 70 | for c in slurm_configs: 71 | print("Found slurm config: {}".format(c[KEY.NAME])) 72 | if c[KEY.NAME].lower() == KEY.SLURM.lower(): 73 | print("Seeting default slurm config") 74 | default_conf = c 75 | elif c[KEY.NAME].split("_")[1].lower() in hostname: 76 | print("Setting specific slurm config: {}".format(c[KEY.NAME])) 77 | specific_conf = c 78 | specific_conf[KEY.NAME] = KEY.SLURM 79 | 80 | return specific_conf if specific_conf is not None else default_conf 81 | 82 | def _parse_configs( 83 | self, 84 | config_path: str, 85 | experiment_selections: List[str] = None, 86 | debug: bool = False, 87 | debug_all: bool = False, 88 | ) -> Tuple[List[dict], List[dict]]: 89 | """parse the config file, including separating the SLURM configuration and expanding grid / list search params 90 | 91 | Arguments: 92 | config_path {str} -- path to the configuration file 93 | experiment_selections (List[str], optional): List of specific experiments to run. If None runs all. Defaults to None. 94 | 95 | Returns: 96 | Tuple[dict, dict] -- SLURM configuration, list of expanded experiment configurations 97 | """ 98 | 99 | slurm_config, default_config, experiment_configs = conf_io.get_configs( 100 | config_path, experiment_selections 101 | ) 102 | 103 | # if desired, prefix experiments with timestamp 104 | if self.prefix_with_timestamp: 105 | experiment_start = datetime.now().strftime("%m%d-%H%M%S") 106 | for exp_config in experiment_configs: 107 | exp_config.update(name=f"{experiment_start}_{exp_config['name']}") 108 | 109 | experiment_configs = conf_resolver.resolve_dependencies( 110 | default_config, experiment_configs, self.config_path 111 | ) 112 | experiment_configs = conf_unfolder.unfold_exps( 113 | experiment_configs, debug, debug_all 114 | ) 115 | 116 | return slurm_config, experiment_configs 117 | 118 | def to_yaml(self, dir_path: str = "", relpath: bool = True) -> None: 119 | """write config back into a YAML file. 120 | 121 | Args: 122 | fpath (str, optional): path to write to. Will be written to outputdir unless specified differently. Defaults to "". 123 | relpath (bool, optional): Use relative paths only. Usefull for loading functionality. Defaults to True. 
124 | """ 125 | 126 | if dir_path == "": 127 | dir_path = self.exp_configs[0][KEY.i_BASIC_PATH] 128 | 129 | original_yml_name = os.path.splitext(self.f_name)[0] 130 | 131 | # List so it can be merged easily 132 | slurm_config = [] 133 | if self.slurm_config is not None: 134 | slurm_config.append(dict(self.slurm_config)) 135 | 136 | readable_configs = self._readable_exp_configs(relpath) 137 | 138 | # Save all named experiment configs in subdir 139 | grouped_configs = self._group_configs_by_name(readable_configs) 140 | for exp_name in grouped_configs.keys(): 141 | fpath = os.path.join( 142 | dir_path, 143 | exp_name, 144 | "relative_{}_{}.yml".format(original_yml_name, exp_name), 145 | ) 146 | conf_io.write_yaml(fpath, slurm_config + grouped_configs[exp_name]) 147 | 148 | # Save global configs 149 | fpath = os.path.join(dir_path, "relative_" + self.f_name) 150 | 151 | if self.exp_selections is not None: 152 | fpath = ( 153 | os.path.splitext(fpath)[0] 154 | + "_" 155 | + "_".join(self.exp_selections) 156 | + ".yml" 157 | ) 158 | 159 | # Merge into single list 160 | data = slurm_config + readable_configs 161 | conf_io.write_yaml(fpath, data) 162 | 163 | def _readable_exp_configs(self, relpath: bool = True) -> List[dict]: 164 | """Internal function to get more readable objects when written as yaml 165 | Converts to dict() and optionally use relative paths only 166 | Args: 167 | relpath (bool, optional): True if the new experiment config file should use relative paths only. Defaults to True. 168 | 169 | Returns: 170 | List[dict]: list of transformed experiment configuration dicts 171 | """ 172 | res = [] 173 | for exp in self.exp_configs: 174 | # Convert attrdict to dict for prettier yaml write 175 | c = dict(exp) 176 | if relpath: 177 | c = conf_path.make_rel_paths(c, c[KEY.i_BASIC_PATH]) 178 | res.append(c) 179 | return res 180 | 181 | def _group_configs_by_name(self, configs: List[dict]) -> dict: 182 | grouped_configs = {} 183 | for c in configs: 184 | name = c[KEY.NAME] 185 | if name not in grouped_configs: 186 | grouped_configs[name] = [c] 187 | else: 188 | grouped_configs[name].append(c) 189 | return grouped_configs 190 | -------------------------------------------------------------------------------- /cw2/cw_data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ALRhub/cw2/7a7b8a235731e8576e1616a46a61f442cd616cd3/cw2/cw_data/__init__.py -------------------------------------------------------------------------------- /cw2/cw_data/cw_loading.py: -------------------------------------------------------------------------------- 1 | from typing import Type 2 | 3 | import pandas as pd 4 | 5 | from cw2 import job, scheduler, util 6 | from cw2.cw_data import cw_logging, cw_pd_logger 7 | 8 | 9 | class Loader(scheduler.AbstractScheduler): 10 | def run(self, overwrite: bool = False): 11 | cw_res = CWResult() 12 | 13 | for j in self.joblist: 14 | cw_res._load_job(j) 15 | 16 | cw_res._compile() 17 | return cw_res.data().set_index(["name", "r"]) 18 | 19 | 20 | class CWResult: 21 | def __init__(self, df: pd.DataFrame = None): 22 | self.data_list = [] 23 | self.df = df 24 | 25 | def _compile(self): 26 | self.df = pd.DataFrame(self.data_list) 27 | self.data_list = None 28 | 29 | def _load_job(self, j: job.Job) -> None: 30 | for c in j.tasks: 31 | rep_data = j.load_task(c) 32 | rep_data.update( 33 | { 34 | "name": c["name"], 35 | "r": c["_rep_idx"], 36 | "rep_path": c["_rep_log_path"], 37 | "params": c["params"], 38 | } 39 | ) 40 
| rep_data.update(util.flatten_dict(c["params"])) 41 | self.data_list.append(rep_data) 42 | 43 | def data(self) -> pd.DataFrame: 44 | return self.df 45 | 46 | 47 | @pd.api.extensions.register_dataframe_accessor("cw2") 48 | class Cw2Accessor: 49 | def __init__(self, pandas_obj): 50 | self._obj = pandas_obj 51 | 52 | def filter(self, param_dict: dict): 53 | """filter by parameter dictionary. 54 | Supports nested dictionarys. Has to be the same format as the config file. 55 | 56 | Args: 57 | param_dict (dict): parameter dictionary 58 | 59 | Returns: 60 | pd.DataFrame: filtered result 61 | """ 62 | flattened = util.flatten_dict(param_dict) 63 | 64 | df = self._obj.copy() 65 | for k, v in flattened.items(): 66 | df = df[df[k] == v] 67 | return df 68 | 69 | def repetition(self, r: int): 70 | """only select a specific repetition. 71 | 72 | Args: 73 | r (int): repetition number 74 | 75 | Returns: 76 | pd.DataFrame: filtered result 77 | """ 78 | df = self._obj 79 | return df[df["r"] == r] 80 | 81 | def name(self, name: str): 82 | """only select experiments with a specific name 83 | 84 | Args: 85 | name (str): experiment name 86 | 87 | Returns: 88 | pd.DataFrame: filtered result 89 | """ 90 | df = self._obj 91 | return df[df["name"] == name] 92 | 93 | def logger( 94 | self, 95 | l_name: str = "", 96 | l_obj: cw_logging.AbstractLogger = None, 97 | l_cls: Type[cw_logging.AbstractLogger] = None, 98 | ): 99 | """select the column containg the results from a specific logger 100 | 101 | Args: 102 | l_name (str, optional): the class name of the logger. Defaults to "". 103 | l_obj (cw_logging.AbstractLogger, optional): an instance object of the logger. Defaults to None. 104 | l_cls (Type[cw_logging.AbstractLogger], optional): the class object of the logger. Defaults to None. 
105 | 106 | Returns: 107 | pd.Series: The column with the logger results 108 | """ 109 | if l_obj is not None: 110 | l_cls = l_obj.__class__ 111 | 112 | if l_cls is not None: 113 | l_name = l_cls.__name__ 114 | 115 | df = self._obj 116 | return df[l_name] 117 | 118 | def flatten_pd_log(self): 119 | pd_log_col = cw_pd_logger.PandasLogger.__name__ 120 | if pd_log_col not in self._obj.columns: 121 | return self._obj 122 | 123 | df = self._obj 124 | new_df = pd.DataFrame() 125 | for idx, row in df.iterrows(): 126 | nested_df = row[pd_log_col] 127 | 128 | outer_row = row.drop(pd_log_col) 129 | for c, v in outer_row.iteritems(): 130 | if isinstance(v, dict): 131 | nested_df[c] = str(v) 132 | nested_df[c] = nested_df[c].map(eval) 133 | continue 134 | nested_df[c] = v 135 | nested_df["name"] = idx[0] 136 | nested_df["r"] = idx[1] 137 | new_df = new_df.append(nested_df, ignore_index=True) 138 | return new_df.set_index(["name", "r", "iter"]) 139 | -------------------------------------------------------------------------------- /cw2/cw_data/cw_logging.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import logging 3 | import os 4 | import pprint 5 | import sys 6 | from typing import Dict, Iterable, List, Optional 7 | 8 | 9 | class AbstractLogger(abc.ABC): 10 | """Abstract Base Class for all Loggers""" 11 | 12 | def __init__( 13 | self, 14 | ignore_keys: Optional[Iterable] = None, 15 | allow_keys: Optional[Iterable] = None, 16 | ): 17 | """ 18 | Initialize a logger that records based on (a subset of) the provided keys 19 | :param ignore_keys: A list of keys 20 | :param allow_keys: 21 | """ 22 | assert ( 23 | ignore_keys is None or allow_keys is None 24 | ), "Logging keys can either be whitelisted ('ignore_keys') or blacklisted ('allow_keys'), but not both" 25 | self.ignore_keys = ignore_keys 26 | self.allow_keys = allow_keys 27 | 28 | def filter(self, data: Dict) -> Dict: 29 | """ 30 | Base Function. Either filters out ignored keys or looks for allowed ones 31 | 32 | Args: 33 | data: data payload dict 34 | """ 35 | if self.ignore_keys is not None: # blacklist ignored keys 36 | return { 37 | key: value for key, value in data.items() if key not in self.ignore_keys 38 | } 39 | elif self.allow_keys is not None: # whitelist allowed keys 40 | return {key: value for key, value in data.items() if key in self.allow_keys} 41 | else: # use all keys 42 | return data 43 | 44 | def preprocess(self, *args): 45 | """ 46 | intended to be called during Experiment.initialize() 47 | """ 48 | pass 49 | 50 | @abc.abstractmethod 51 | def initialize(self, config: dict, rep: int, rep_log_path: str) -> None: 52 | """needs to be implemented by subclass. 53 | Called once at the start of each repetition. 54 | Used to configure / reset the Logger for each repetition. 55 | 56 | Arguments: 57 | config {attrdict.Attrdict} -- configuration 58 | rep {int} -- repetition counter 59 | """ 60 | raise NotImplementedError 61 | 62 | @abc.abstractmethod 63 | def process(self, data: dict) -> None: 64 | """needs to be implemented by subclass. 65 | The main method. Defines how the logger handles the result of each iteration. 66 | 67 | Arguments: 68 | data -- data payload to be processed by logger 69 | """ 70 | raise NotImplementedError 71 | 72 | @abc.abstractmethod 73 | def finalize(self) -> None: 74 | """needs to be implemented by subclass. 75 | Called at the end of each repetition. 
76 | Use it to finalize the processing like write to disk or other cleanup 77 | """ 78 | raise NotImplementedError 79 | 80 | @abc.abstractmethod 81 | def load(self): 82 | """needs to be implemented by subclass. 83 | called when the data should be loaded after execution is complete. 84 | """ 85 | raise NotImplementedError 86 | 87 | 88 | class LoggerArray(AbstractLogger): 89 | """Storage for multiple AbstractLogger objects. 90 | Behaves to the outside like a simple AbstractLogger implementation. 91 | Used to apply multiple loggers in a run. 92 | """ 93 | 94 | def __init__(self): 95 | self._logger_array: List[AbstractLogger] = [] 96 | 97 | def add(self, logger: AbstractLogger) -> None: 98 | self._logger_array.append(logger) 99 | 100 | def initialize(self, config: dict, rep: int, rep_log_path: str) -> None: 101 | for logger in self._logger_array: 102 | logger.initialize(config, rep, rep_log_path) 103 | 104 | def preprocess(self, *args): 105 | for logger in self._logger_array: 106 | logger.preprocess(*args) 107 | 108 | def process(self, data: dict) -> None: 109 | for logger in self._logger_array: 110 | logger.process(data) 111 | 112 | def finalize(self) -> None: 113 | for logger in self._logger_array: 114 | logger.finalize() 115 | 116 | def load(self): 117 | data = {} 118 | for logger in self._logger_array: 119 | try: 120 | d = logger.load() 121 | except: 122 | getLogger().exception(logger.__class__.__name__) 123 | d = "Error when loading {}".format(logger.__class__.__name__) 124 | 125 | if d is not None: 126 | if not isinstance(d, dict): 127 | d = {logger.__class__.__name__: d} 128 | data.update(d) 129 | return data 130 | 131 | def __iter__(self): 132 | return iter(self._logger_array) 133 | 134 | def is_empty(self) -> bool: 135 | return len(self._logger_array) == 0 136 | 137 | 138 | class Printer(AbstractLogger): 139 | """Prints the result of each iteration to the console.""" 140 | 141 | def initialize(self, config: dict, rep: int, rep_log_path: str) -> None: 142 | pass 143 | 144 | def process(self, data: dict) -> None: 145 | data_ = self.filter(data) 146 | pprint.pprint(data_) 147 | 148 | def finalize(self) -> None: 149 | pass 150 | 151 | def load(self): 152 | pass 153 | 154 | 155 | class PythonLogger(AbstractLogger): 156 | """ 157 | Logger which writes calls to logging.getLogger('cw2') on to disk 158 | """ 159 | 160 | def __init__(self): 161 | self.logger = getLogger() 162 | 163 | def initialize(self, config: dict, rep: int, rep_log_path: str) -> None: 164 | self.outh = logging.FileHandler( 165 | os.path.join(rep_log_path, "out.log"), delay=True 166 | ) 167 | self.outh.setLevel(logging.INFO) 168 | self.outh.setFormatter(_formatter) 169 | self.logger.addHandler(self.outh) 170 | 171 | self.errh = logging.FileHandler(os.path.join(rep_log_path, "err.log")) 172 | self.errh.setLevel(logging.ERROR) 173 | self.errh.setFormatter(_formatter) 174 | self.logger.addHandler(self.errh) 175 | 176 | def process(self, data: dict) -> None: 177 | pass 178 | 179 | def finalize(self) -> None: 180 | for h in [self.outh, self.errh]: 181 | h.flush() 182 | h.close() 183 | self.logger.removeHandler(h) 184 | 185 | def load(self): 186 | pass 187 | 188 | 189 | ### logging module functionality #### 190 | 191 | 192 | class _CWFormatter(logging.Formatter): 193 | """Taken From CW V1""" 194 | 195 | def __init__(self): 196 | # self.std_formatter = logging.Formatter('[%(asctime)s] [%(name)s] [%(levelname)s] %(message)s') 197 | self.std_formatter = logging.Formatter("[%(name)s] [%(levelname)s] %(message)s") 198 | 
self.red_formatter = logging.Formatter( 199 | "[%(asctime)s]:[%(name)s] [%(levelname)s] %(message)s" 200 | ) 201 | 202 | def format(self, record: logging.LogRecord): 203 | if record.levelno < logging.ERROR: 204 | return self.std_formatter.format(record) 205 | else: 206 | return self.red_formatter.format(record) 207 | 208 | 209 | _formatter = _CWFormatter() 210 | 211 | 212 | def getLogger() -> logging.Logger: 213 | """creates a logging.getLogger('cw2') object with initialization. 214 | Parallelization via joblib needs a more sophisticated getLogger function. 215 | 216 | Returns: 217 | logging.Logger 218 | """ 219 | _logging_logger = logging.getLogger("cw2") 220 | 221 | if _logging_logger.getEffectiveLevel() > logging.INFO: 222 | ch = logging.StreamHandler(sys.stdout) 223 | ch.setLevel(logging.INFO) 224 | ch.setFormatter(_formatter) 225 | 226 | _logging_logger.setLevel(logging.INFO) 227 | _logging_logger.addHandler(ch) 228 | 229 | return _logging_logger 230 | -------------------------------------------------------------------------------- /cw2/cw_data/cw_pd_logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Dict, Iterable, Optional 3 | 4 | import pandas as pd 5 | 6 | from cw2.cw_data import cw_logging 7 | 8 | 9 | class PandasLogger(cw_logging.AbstractLogger): 10 | """Writes the results of each repetition seperately to disk 11 | Each repetition is saved in its own directory. Write occurs after every iteration. 12 | """ 13 | 14 | def __init__( 15 | self, 16 | ignore_keys: Optional[Iterable] = None, 17 | allow_keys: Optional[Iterable] = None, 18 | ): 19 | super().__init__(ignore_keys=ignore_keys, allow_keys=allow_keys) 20 | self.log_path = "" 21 | self.csv_name = "rep.csv" 22 | self.pkl_name = "rep.pkl" 23 | self.df = pd.DataFrame() 24 | 25 | def initialize(self, config: Dict, rep: int, rep_log_path: str): 26 | self.log_path = rep_log_path 27 | self.csv_name = os.path.join(self.log_path, "rep_{}.csv".format(rep)) 28 | self.pkl_name = os.path.join(self.log_path, "rep_{}.pkl".format(rep)) 29 | self.df = pd.DataFrame() 30 | 31 | def process(self, log_data: dict) -> None: 32 | data = self.filter(log_data) 33 | 34 | self.df = self.df.append(data, ignore_index=True) 35 | 36 | try: 37 | self.df.to_csv(self.csv_name, index_label="index") 38 | except: 39 | cw_logging.getLogger().warning("Could not save {}".format(self.csv_name)) 40 | 41 | try: 42 | self.df.to_pickle(self.pkl_name) 43 | except: 44 | cw_logging.getLogger().warning("Could not save {}".format(self.pkl_name)) 45 | 46 | def finalize(self) -> None: 47 | pass 48 | 49 | def load(self): 50 | payload = {} 51 | df: pd.DataFrame = None 52 | 53 | # Check if file exists 54 | try: 55 | df = pd.read_pickle(self.pkl_name) 56 | except FileNotFoundError as _: 57 | warn = "{} does not exist".format(self.pkl_name) 58 | cw_logging.getLogger().warning(warn) 59 | return warn 60 | 61 | # Enrich Payload with descriptive statistics for loading DF structure 62 | """ 63 | for c in df.columns: 64 | if pd.api.types.is_numeric_dtype(df[c]): 65 | payload['{}_min'.format(c)] = df[c].min() 66 | payload['{}_max'.format(c)] = df[c].max() 67 | payload['{}_mean'.format(c)] = df[c].mean() 68 | payload['{}_std'.format(c)] = df[c].std() 69 | 70 | payload['{}_last'.format(c)] = df[c].iloc[-1] 71 | """ 72 | payload[self.__class__.__name__] = df 73 | return payload 74 | -------------------------------------------------------------------------------- /cw2/cw_data/cw_wandb_logger.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import warnings 3 | from random import random 4 | from time import sleep 5 | 6 | # To prevent conflicts between wandb and the joblib scheduler 7 | # see https://github.com/wandb/client/issues/1525 for reference 8 | os.environ["WANDB_START_METHOD"] = "thread" 9 | 10 | from itertools import groupby 11 | from typing import Dict, Iterable, List, Optional 12 | 13 | import pandas as pd 14 | import wandb 15 | 16 | from cw2.cw_data import cw_logging 17 | from cw2.util import get_file_names_in_directory 18 | 19 | 20 | def reset_wandb_env(): 21 | exclude = { 22 | "WANDB_PROJECT", 23 | "WANDB_ENTITY", 24 | "WANDB_API_KEY", 25 | "WANDB_START_METHOD", 26 | } 27 | for k, v in os.environ.items(): 28 | if k.startswith("WANDB_") and k not in exclude: 29 | del os.environ[k] 30 | 31 | 32 | def group_parameters(list_of_strings: List[str]): 33 | """groups different strings that start with a common substring (using "." as delimiter) 34 | and outputs a single, more concise string. 35 | Example: 36 | outstring = group_parameters['local', 'mod.enc.tidentity', 'mod.hea.nhl5', 'mod.hea.ioFalse', 'mod.enc.hd64'] 37 | % outstring will be 'local,mod_[enc_[hd64,tidentity],hea_[ioFalse,nhl5]]' 38 | """ 39 | groups = [] 40 | uniquekeys = [] 41 | num_subgroups = 0 42 | substring = "" 43 | 44 | for k, g in groupby(sorted(list_of_strings), lambda string: string.split(".")[0]): 45 | groups.append(list(g)) 46 | uniquekeys.append(k) 47 | 48 | if len(groups[-1]) == 1: 49 | substring += groups[-1][0] + "," 50 | num_subgroups += 1 51 | else: 52 | remainder = [s.replace(k, "", 1) for s in groups[-1]] 53 | remainder = [s.replace(".", "", 1) for s in remainder] 54 | if len(remainder) > 0: 55 | subgroups, num_subs = group_parameters(remainder) 56 | if num_subs > 1: 57 | substring += k + "_[" + subgroups + "]," 58 | else: 59 | substring += k + "_" + subgroups + "," 60 | num_subgroups += num_subs 61 | return substring[:-1], len(groups) 62 | 63 | 64 | class WandBLogger(cw_logging.AbstractLogger): 65 | def __init__( 66 | self, 67 | ignore_keys: Optional[Iterable] = None, 68 | allow_keys: Optional[Iterable] = None, 69 | ): 70 | super(WandBLogger, self).__init__( 71 | ignore_keys=ignore_keys, allow_keys=allow_keys 72 | ) 73 | self.log_path = "" 74 | self.run = None 75 | 76 | def initialize(self, config: Dict, rep: int, rep_log_path: str) -> None: 77 | if "wandb" in config.keys(): 78 | self.init_fields(config, rep, rep_log_path) 79 | self.connect_to_wandb() 80 | 81 | else: 82 | warnings.warn("No 'wandb' field in yaml - Ignoring Weights & Biases Logger") 83 | 84 | def init_fields(self, config: Dict, rep: int, rep_log_path: str): 85 | self.log_path = rep_log_path 86 | self.rep = rep 87 | self.config = config["wandb"] 88 | self.cw2_config = config 89 | reset_wandb_env() 90 | self.job_name = config["_experiment_name"].replace("__", "_") 91 | self.use_group_parameters = self.config.get("use_group_parameters", False) 92 | if self.use_group_parameters: 93 | self.job_name = group_parameters(self.job_name.split("_"))[0] 94 | self.runname = self.job_name + "_rep_{:02d}".format(rep) 95 | 96 | # optional: change the job_type to a fixed alias if the option is present 97 | if "job_type" in self.config: 98 | self.job_name = self.config["job_type"] 99 | # have entity and group config entry optional 100 | self.entity = self.config.get("entity", None) 101 | self.group = self.config.get("group", None) 102 | # Get the model logging directory 103 | 
self.wandb_log_model = self.config.get("log_model", False) 104 | if self.wandb_log_model: 105 | self.save_model_dir = os.path.join(self.log_path, "model") 106 | self.cw2_config["save_model_dir"] = self.save_model_dir 107 | self.model_name = self.config.get("model_name", "model") 108 | else: 109 | self.save_model_dir = None 110 | 111 | def connect_to_wandb(self): 112 | last_error = None 113 | for i in range(10): 114 | try: 115 | self.run = wandb.init( 116 | project=self.cw2_config["wandb"]["project"], 117 | entity=self.entity, 118 | group=self.group, 119 | job_type=self.job_name[:63], 120 | name=self.runname[:63], 121 | config=self.cw2_config["params"], 122 | dir=self.log_path, 123 | settings=wandb.Settings( 124 | _disable_stats=self.cw2_config["wandb"].get( 125 | "disable_stats", False 126 | ) 127 | ), 128 | mode="online" 129 | if self.cw2_config["wandb"].get("enabled", True) 130 | else "disabled", 131 | ) 132 | return # if starting the run is successful, exit the loop (and in this case the function) 133 | except Exception as e: 134 | last_error = e 135 | # implement a simple randomized exponential backoff if starting a run fails 136 | waiting_time = ((random() / 50) + 0.01) * (2**i) 137 | # wait between 0.01 and 10.24 seconds depending on the random seed and the iteration of the exponent 138 | 139 | warnings.warn( 140 | "Problem with starting wandb: {}. Trying again in {} seconds".format( 141 | e, waiting_time 142 | ) 143 | ) 144 | sleep(waiting_time) 145 | warnings.warn("wandb init failed several times.") 146 | raise last_error 147 | 148 | def process(self, data: dict) -> None: 149 | if self.run is not None: 150 | # Skip logging if interval is defined but not satisfied 151 | log_interval = self.config.get("log_interval", None) 152 | if log_interval is not None and data["iter"] % log_interval != 0: 153 | return 154 | 155 | if "histogram" in self.config: 156 | for el in self.config["histogram"]: 157 | if el in data: 158 | self.run.log( 159 | {el: wandb.Histogram(np_histogram=data[el])}, 160 | step=data["iter"], 161 | ) 162 | filtered_data = self.filter(data) 163 | step = data.get("iter", None) 164 | self.run.log(filtered_data, step=step) 165 | 166 | def finalize(self) -> None: 167 | if self.run is not None: 168 | self.log_model() 169 | self.run.finish() 170 | 171 | def load(self): 172 | pass 173 | 174 | def log_model(self): 175 | """ 176 | Log model as an Artifact 177 | 178 | Returns: 179 | None 180 | """ 181 | if self.wandb_log_model is False: 182 | return 183 | 184 | # Initialize wandb artifact 185 | model_artifact = wandb.Artifact(name=self.model_name, type="model") 186 | 187 | # Get all file names in log dir 188 | file_names = get_file_names_in_directory(self.save_model_dir) 189 | 190 | if file_names is None: 191 | warnings.warn("save model dir is not available or empty.") 192 | return 193 | 194 | # Add files into artifact 195 | for file in file_names: 196 | model_artifact.add_file(os.path.join(self.save_model_dir, file)) 197 | 198 | aliases = ["latest", f"finished-rep-{self.rep}"] 199 | 200 | # Log and upload 201 | self.run.log_artifact(model_artifact, aliases=aliases) 202 | 203 | def log_plot(self, x, y, column_names=("x", "y"), plot_id="plot", title="Plot"): 204 | data = [list(i) for i in zip(x, y)] 205 | table = wandb.Table(data=data, columns=column_names) 206 | self.run.log( 207 | { 208 | plot_id: wandb.plot.line( 209 | table, column_names[0], column_names[0], title=title 210 | ) 211 | } 212 | ) 213 | 214 | def log_table(self, data, table_id="table"): 215 | assert type(data) is 
pd.DataFrame 216 | table = wandb.Table(dataframe=data) 217 | self.run.log({table_id: table}) 218 | -------------------------------------------------------------------------------- /cw2/cw_error.py: -------------------------------------------------------------------------------- 1 | class ConfigKeyError(Exception): 2 | """raised when a key is missing in the configuration.""" 3 | 4 | pass 5 | 6 | 7 | class MissingConfigError(Exception): 8 | """raise when a config document is missing in the configuration.""" 9 | 10 | pass 11 | 12 | 13 | class ExperimentNotFoundError(Exception): 14 | """raise when experiment selection could not be found in the configuration""" 15 | 16 | pass 17 | 18 | 19 | class ExperimentSurrender(Exception): 20 | def __init__(self, payload: dict = None): 21 | if payload is None: 22 | payload = {} 23 | self.payload = payload 24 | -------------------------------------------------------------------------------- /cw2/cw_slurm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ALRhub/cw2/7a7b8a235731e8576e1616a46a61f442cd616cd3/cw2/cw_slurm/__init__.py -------------------------------------------------------------------------------- /cw2/cw_slurm/cw_slurm.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import os 3 | import shutil 4 | import subprocess 5 | import sys 6 | 7 | import __main__ 8 | 9 | import cw2.cw_config.cw_conf_keys as CKEYS 10 | import cw2.cw_slurm.cw_slurm_keys as SKEYS 11 | from cw2 import cli_parser, cw_error, util 12 | from cw2.cw_config import cw_config 13 | from cw2.cw_data import cw_logging 14 | 15 | 16 | class SlurmConfig: 17 | def __init__(self, conf: cw_config.Config) -> None: 18 | self.conf = conf 19 | self.slurm_conf = conf.slurm_config 20 | 21 | if self.slurm_conf is None: 22 | raise cw_error.MissingConfigError( 23 | "No SLURM configuration found in {}".format(self.conf.config_path) 24 | ) 25 | 26 | self._check_template() 27 | 28 | def _check_template(self): 29 | """check if an sbatch.sh template is present. 30 | If no costum template has been specified, the default will be used. 31 | """ 32 | 33 | if SKEYS.TEMPLATE_PATH not in self.slurm_conf: 34 | self.slurm_conf[SKEYS.TEMPLATE_PATH] = os.path.join( 35 | os.path.dirname(__file__), "../default_sbatch.sh" 36 | ) 37 | 38 | if not os.path.exists(self.slurm_conf[SKEYS.TEMPLATE_PATH]): 39 | raise cw_error.ConfigKeyError( 40 | "Could not find default sbatch template. Please specify your own 'path_to_template'." 
41 | ) 42 | 43 | def _complete_optionals(self): 44 | """Fill in any optional values.""" 45 | 46 | sc: dict = self.slurm_conf 47 | 48 | exp_output_path = self.conf.exp_configs[0][CKEYS.i_BASIC_PATH] 49 | 50 | # CREATE OPTIONAL COLLECTIONS 51 | # Must be done first: 52 | sc.setdefault(SKEYS.SBATCH_ARGS, {}) 53 | 54 | # SET DEFAULT VALUES 55 | sc.setdefault(SKEYS.SLURM_LOG, os.path.join(exp_output_path, "slurmlog")) 56 | sc.setdefault(SKEYS.SLURM_OUT, os.path.join(exp_output_path, "sbatch.sh")) 57 | sc.setdefault(SKEYS.ACCOUNT, "") 58 | 59 | # COMPLEX CONVERSIONS 60 | if isinstance(sc[SKEYS.TIME], int): 61 | sc[SKEYS.TIME] = "{:d}:{:d}:00".format( 62 | sc[SKEYS.TIME] // 60, sc[SKEYS.TIME] % 60 63 | ) 64 | 65 | if SKEYS.CPU_MEM in sc: 66 | sc[SKEYS.SBATCH_ARGS][SKEYS.CPU_MEM] = sc.get(SKEYS.CPU_MEM) 67 | 68 | # DEFAULT OR COMPLEX CONVERSION 69 | if SKEYS.VENV in sc: 70 | sc[SKEYS.VENV] = "source activate {}".format(sc[SKEYS.VENV]) 71 | else: 72 | sc[SKEYS.VENV] = "" 73 | 74 | if SKEYS.SH_LINES in sc: 75 | sc[SKEYS.SH_LINES] = "\n".join(sc[SKEYS.SH_LINES]) 76 | else: 77 | sc[SKEYS.SH_LINES] = "" 78 | 79 | def _complete_cli_args(self): 80 | """identify and process the relevant CLI flags from the original call.""" 81 | sc = self.slurm_conf 82 | cw_options = cli_parser.Arguments().get() 83 | 84 | sc[SKEYS.CW_ARGS] = "" 85 | if cw_options["overwrite"]: 86 | sc[SKEYS.CW_ARGS] += " -o" 87 | if cw_options["experiments"] is not None: 88 | sc[SKEYS.CW_ARGS] += " -e " + " ".join(cw_options["experiments"]) 89 | 90 | def _complete_sbatch_args(self): 91 | """if optional SBATCH arguments are present, build a corresponding string.""" 92 | sc = self.slurm_conf 93 | 94 | if SKEYS.SBATCH_ARGS not in sc: # Check if empty 95 | sc[SKEYS.SBATCH_ARGS] = "" 96 | return 97 | else: # Else build String 98 | sbatch_args = sc.get(SKEYS.SBATCH_ARGS) 99 | 100 | args_list = ["#SBATCH --{} {}".format(k, v) for k, v in sbatch_args.items()] 101 | sc[SKEYS.SBATCH_ARGS] = "\n".join(args_list) 102 | 103 | def finalize(self, num_jobs: int): 104 | """enrich slurm configuration with dynamically computed values 105 | 106 | Args: 107 | num_jobs (int): total number of defined jobs 108 | """ 109 | 110 | # counting starts at 0 111 | self.slurm_conf[SKEYS.LAST_IDX] = num_jobs - 1 112 | 113 | # Order is important! 114 | self._complete_optionals() 115 | self._complete_cli_args() 116 | self._complete_sbatch_args() 117 | 118 | 119 | class SlurmDirectoryManager: 120 | MODE_COPY = "COPY" 121 | MODE_MULTI = "MULTI" 122 | MODE_NOCOPY = "NOCOPY" 123 | MODE_ZIP = "ZIP" 124 | 125 | def __init__(self, sc: SlurmConfig, conf: cw_config.Config) -> None: 126 | self.slurm_config = sc 127 | self.conf = conf 128 | self.m = self.set_mode() 129 | os.makedirs(sc.slurm_conf[SKEYS.SLURM_LOG], exist_ok=True) 130 | 131 | def set_mode(self): 132 | """find which code-copy mode is configured 133 | 134 | Raises: 135 | cw_error.ConfigKeyError: if incomplete definition 136 | 137 | Returns: 138 | code-copy mode 139 | """ 140 | sc = self.slurm_config.slurm_conf 141 | 142 | # COUNT MISSING ARGS 143 | cp_error_count = 0 144 | missing_arg = "" 145 | if SKEYS.EXP_CP_AUTO not in sc and SKEYS.EXP_CP_DST not in sc: 146 | cp_error_count += 1 147 | missing_arg = SKEYS.EXP_CP_DST 148 | 149 | if SKEYS.EXP_CP_SRC not in sc: 150 | cp_error_count += 1 151 | missing_arg = SKEYS.EXP_CP_SRC 152 | 153 | # MODE SWITCH 154 | if cp_error_count == 1: 155 | raise cw_error.ConfigKeyError( 156 | "Incomplete SLURM experiment copy config. 
Missing key: {}".format( 157 | missing_arg 158 | ) 159 | ) 160 | 161 | cw_options = cli_parser.Arguments().get() 162 | if cw_options.get("zip"): 163 | return self.MODE_ZIP 164 | 165 | if cw_options.get("multicopy"): 166 | if cp_error_count == 0: 167 | return self.MODE_MULTI 168 | else: 169 | raise cw_error.ConfigKeyError( 170 | "Incomplete SLURM experiment copy config. Please define SRC and DST for --multicopy" 171 | ) 172 | 173 | if cp_error_count == 0: 174 | return self.MODE_COPY 175 | return self.MODE_NOCOPY 176 | 177 | def dir_size_validation(self, src): 178 | """validates that the SRC for code copy is below 200MB in size 179 | 180 | Args: 181 | src: src path 182 | 183 | Raises: 184 | cw_error.ConfigKeyError: if directory is greater than 200MB 185 | """ 186 | cw_options = cli_parser.Arguments().get() 187 | if cw_options.get("skipsizecheck"): 188 | return 189 | 190 | dirsize = util.get_size(src) 191 | if dirsize > 200.0: 192 | cw_logging.getLogger().warning( 193 | "SourceDir {} is greater than 200MByte".format(src) 194 | ) 195 | msg = ( 196 | "Directory {} is greater than 200MByte." 197 | " If you are sure you want to copy/zip this dir, use --skipsizecheck." 198 | "\nElse check experiment_copy__ configuration keys".format(src) 199 | ) 200 | raise cw_error.ConfigKeyError(msg) 201 | 202 | def get_exp_src(self) -> str: 203 | """retrieves the code-copy src. 204 | Uses CWD as default unless specified 205 | 206 | Returns: 207 | src path 208 | """ 209 | sc = self.slurm_config.slurm_conf 210 | return sc.get(SKEYS.EXP_CP_SRC, os.getcwd()) 211 | 212 | def get_exp_dst(self): 213 | """retrieves the code-copy dst. 214 | Uses CWD as default unless specified 215 | 216 | Returns: 217 | src path 218 | """ 219 | sc = self.slurm_config.slurm_conf 220 | if SKEYS.EXP_CP_AUTO in sc and SKEYS.EXP_CP_DST not in sc: 221 | sc[SKEYS.EXP_CP_DST] = os.path.join( 222 | sc.get(SKEYS.EXP_CP_AUTO), 223 | datetime.datetime.now().strftime("%Y%m%d%G%M%S"), 224 | ) 225 | if SKEYS.EXP_CP_DST in sc: 226 | return sc[SKEYS.EXP_CP_DST] 227 | else: 228 | exp_output_path = self.conf.exp_configs[0][CKEYS.i_BASIC_PATH] 229 | return os.path.join(exp_output_path, "code") 230 | 231 | def zip_exp(self): 232 | """procedure for creating a zip backup""" 233 | src = self.get_exp_src() 234 | dst = self.get_exp_dst() 235 | self.dir_size_validation(src) 236 | 237 | shutil.make_archive(dst, "zip", src) 238 | 239 | def create_single_copy(self): 240 | """creates a copy of the exp for slurm execution""" 241 | src = self.get_exp_src() 242 | dst = self.get_exp_dst() 243 | self._copy_files(src, dst) 244 | 245 | def create_multi_copy(self, num_jobs: int): 246 | """creates multiple copies of the exp, one for each slurm job 247 | 248 | Args: 249 | num_jobs (int): number of total jobs 250 | """ 251 | src = self.get_exp_src() 252 | dst_base = self.get_exp_dst() 253 | 254 | for i in range(num_jobs): 255 | dst = os.path.join(dst_base, str(i)) 256 | self._copy_files(src, dst) 257 | 258 | # Add MultiCopy ChangeDir to Slurmconf 259 | self.slurm_config.slurm_conf[SKEYS.SH_LINES] += "\ncd {} \n".format( 260 | os.path.join(self.get_exp_dst(), "$SLURM_ARRAY_TASK_ID") 261 | ) 262 | 263 | def _copy_files(self, src, dst): 264 | """copies files from src to dst 265 | 266 | Args: 267 | src: source directory 268 | dst: destination directory 269 | 270 | Raises: 271 | cw_error.ConfigKeyError: if the dst is inside the source. Recursive copying! 272 | cw_error.ConfigKeyError: if the dst already exists and overwrite is not forced. 
273 | """ 274 | self.dir_size_validation(src) 275 | 276 | # Check Filesystem 277 | if util.check_subdir(src, dst): 278 | raise cw_error.ConfigKeyError( 279 | "experiment_copy_dst is a subdirectory of experiment_copy_src. Recursive Copying is bad." 280 | ) 281 | try: 282 | os.makedirs(dst, exist_ok=cli_parser.Arguments().get()["overwrite"]) 283 | except FileExistsError: 284 | raise cw_error.ConfigKeyError( 285 | "{} already exists. Please define a different 'experiment_copy_dst', use '-o' to overwrite or '--nocodecopy' to skip." 286 | ) 287 | 288 | # Copy files 289 | ign = shutil.ignore_patterns("*.pyc", "tmp*", ".git*") 290 | for item in os.listdir(src): 291 | s = os.path.join(src, item) 292 | d = os.path.join(dst, item) 293 | if os.path.isdir(s): 294 | shutil.copytree(s, d, ignore=ign) 295 | else: 296 | shutil.copy2(s, d) 297 | 298 | def move_files(self, num_jobs: int): 299 | """moves exp files according to detected copy mode 300 | Args: 301 | num_jobs: number of slurm jobs for multi-copy 302 | """ 303 | # Check Skip Flag 304 | cw_options = cli_parser.Arguments().get() 305 | if cw_options.get("nocodecopy"): 306 | print("Skipping Code Copy") 307 | return 308 | 309 | if self.m == self.MODE_COPY: 310 | self.create_single_copy() 311 | 312 | if self.m == self.MODE_MULTI: 313 | self.create_multi_copy(num_jobs) 314 | 315 | if self.m == self.MODE_ZIP: 316 | self.zip_exp() 317 | 318 | def get_exp_exec_dir(self) -> str: 319 | """retrieves the experiment execution dir. 320 | This dir depends on the exp_copy_dst 321 | 322 | Returns: 323 | str: experiment execution directory 324 | """ 325 | if self.m == self.MODE_COPY or self.m == self.MODE_MULTI: 326 | return self.get_exp_dst() 327 | 328 | return self.get_exp_src() 329 | 330 | def get_py_path(self) -> str: 331 | """computes a modified python path, depending on the experiment_copy procedure 332 | 333 | Returns: 334 | str: python path setting 335 | """ 336 | if self.m in [self.MODE_NOCOPY, self.MODE_ZIP]: 337 | return "" 338 | 339 | pypath = sys.path.copy() 340 | 341 | src = self.get_exp_src() 342 | dst = self.get_exp_dst() 343 | 344 | if self.m == self.MODE_MULTI: 345 | dst = os.path.join(dst, "$SLURM_ARRAY_TASK_ID") 346 | 347 | new_path = [ 348 | x.replace(os.path.abspath(src), os.path.abspath(dst)) for x in pypath 349 | ] 350 | # return "export PYTHONPATH=" + ":".join(new_path) 351 | # Maybe this is better? 
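# Note: appending to the existing $PYTHONPATH (the variant used below) preserves entries
# set by the activated virtual environment or the sbatch template, whereas the commented-out
# variant above would replace the inherited PYTHONPATH entirely.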
352 | return "export PYTHONPATH=$PYTHONPATH:" + ":".join(new_path) 353 | 354 | 355 | def run_slurm(conf: cw_config.Config, num_jobs: int) -> None: 356 | """starts slurm execution 357 | 358 | Args: 359 | conf (cw_config.Config): config object 360 | num_jobs (int): total number of jobs 361 | """ 362 | # Finalize Configs 363 | sc = SlurmConfig(conf) 364 | sc.finalize(num_jobs) 365 | 366 | # Create Code Copies 367 | dir_mgr = SlurmDirectoryManager(sc, conf) 368 | dir_mgr.move_files(num_jobs) 369 | 370 | # Write and call slurm script 371 | slurm_script = write_slurm_script(sc, dir_mgr) 372 | cmd = "sbatch " + slurm_script 373 | print(cmd) 374 | subprocess.check_output(cmd, shell=True) 375 | 376 | 377 | def write_slurm_script(slurm_conf: SlurmConfig, dir_mgr: SlurmDirectoryManager) -> str: 378 | """write the sbatch.sh script for slurm to disk 379 | 380 | Args: 381 | slurm_conf (SlurmConfig): Slurm configuration object 382 | 383 | Returns: 384 | str: path to the written script 385 | """ 386 | sc = slurm_conf.slurm_conf 387 | conf = slurm_conf.conf 388 | 389 | template_path = sc[SKEYS.TEMPLATE_PATH] 390 | output_path = sc[SKEYS.SLURM_OUT] 391 | 392 | exp_main_file = os.path.relpath(__main__.__file__, os.getcwd()) 393 | 394 | fid_in = open(template_path, "r") 395 | fid_out = open(output_path, "w") 396 | 397 | tline = fid_in.readline() 398 | 399 | while tline: 400 | tline = tline.replace("%%partition%%", sc["partition"]) 401 | tline = tline.replace("%%account%%", sc[SKEYS.ACCOUNT]) 402 | tline = tline.replace("%%job-name%%", sc["job-name"]) 403 | 404 | tline = tline.replace("%%last_job_idx%%", "{:d}".format(sc[SKEYS.LAST_IDX])) 405 | tline = tline.replace( 406 | "%%num_parallel_jobs%%", "{:d}".format(sc["num_parallel_jobs"]) 407 | ) 408 | 409 | tline = tline.replace( 410 | "%%experiment_execution_dir%%", dir_mgr.get_exp_exec_dir() 411 | ) 412 | 413 | tline = tline.replace("%%slurm_log%%", sc[SKEYS.SLURM_LOG]) 414 | 415 | tline = tline.replace("%%ntasks%%", "{:d}".format(sc["ntasks"])) 416 | tline = tline.replace("%%cpus-per-task%%", "{:d}".format(sc["cpus-per-task"])) 417 | tline = tline.replace("%%time%%", sc[SKEYS.TIME]) 418 | 419 | tline = tline.replace("%%sh_lines%%", sc[SKEYS.SH_LINES]) 420 | 421 | tline = tline.replace("%%venv%%", sc[SKEYS.VENV]) 422 | tline = tline.replace("%%pythonpath%%", dir_mgr.get_py_path()) 423 | 424 | tline = tline.replace("%%python_script%%", exp_main_file) 425 | tline = tline.replace("%%path_to_yaml_config%%", conf.config_path) 426 | 427 | tline = tline.replace("%%cw_args%%", sc[SKEYS.CW_ARGS]) 428 | tline = tline.replace("%%sbatch_args%%", sc[SKEYS.SBATCH_ARGS]) 429 | 430 | fid_out.write(tline) 431 | 432 | tline = fid_in.readline() 433 | fid_in.close() 434 | fid_out.close() 435 | return output_path 436 | -------------------------------------------------------------------------------- /cw2/cw_slurm/cw_slurm_keys.py: -------------------------------------------------------------------------------- 1 | TEMPLATE_PATH = "path_to_template" 2 | 3 | ACCOUNT = "account" 4 | TIME = "time" 5 | 6 | CPU_MEM = "mem-per-cpu" 7 | VENV = "venv" 8 | 9 | SBATCH_ARGS = "sbatch_args" 10 | SH_LINES = "sh_lines" 11 | CW_ARGS = "cw_args" 12 | 13 | SLURM_LOG = "slurm_log" 14 | SLURM_OUT = "slurm_output" 15 | 16 | EXP_CP_AUTO = "experiment_copy_auto_dst" 17 | EXP_CP_DST = "experiment_copy_dst" 18 | EXP_CP_SRC = "experiment_copy_src" 19 | 20 | 21 | LAST_IDX = "last_job_idx" 22 | -------------------------------------------------------------------------------- /cw2/default_sbatch.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -p %%partition%% 3 | # #SBATCH -A %%account%% 4 | #SBATCH -J %%job-name%% 5 | #SBATCH --array 0-%%last_job_idx%%%%%num_parallel_jobs%% 6 | 7 | # Please use the complete path details : 8 | #SBATCH -D %%experiment_execution_dir%% 9 | #SBATCH -o %%slurm_log%%/out_%A_%a.log 10 | #SBATCH -e %%slurm_log%%/err_%A_%a.log 11 | 12 | # Cluster Settings 13 | #SBATCH -n %%ntasks%% # Number of tasks 14 | #SBATCH -c %%cpus-per-task%% # Number of cores per task 15 | #SBATCH -t %%time%% # 1:00:00 Hours, minutes and seconds, or '#SBATCH -t 10' - only minutes 16 | 17 | %%sbatch_args%% 18 | # ------------------------------- 19 | 20 | # Activate the virtualenv / conda environment 21 | %%venv%% 22 | 23 | 24 | # Export Pythonpath 25 | %%pythonpath%% 26 | 27 | # Additional Instructions from CONFIG.yml 28 | %%sh_lines%% 29 | 30 | python3 %%python_script%% %%path_to_yaml_config%% -j $SLURM_ARRAY_TASK_ID %%cw_args%% 31 | 32 | # THIS WAS BUILT FROM THE DEFAULLT SBATCH TEMPLATE -------------------------------------------------------------------------------- /cw2/experiment.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import datetime as dt 3 | 4 | from cw2.cw_data import cw_logging 5 | from cw2.cw_error import ExperimentSurrender 6 | 7 | 8 | class AbstractExperiment(abc.ABC): 9 | @abc.abstractmethod 10 | def initialize( 11 | self, cw_config: dict, rep: int, logger: cw_logging.LoggerArray 12 | ) -> None: 13 | """needs to be implemented by subclass. 14 | Called once at the start of each repition for initialization purposes. 15 | 16 | Arguments: 17 | cw_config {dict} -- clusterwork experiment configuration 18 | rep {int} -- repition counter 19 | logger {cw_logging.LoggerArray} -- initialized loggers for preprocessing 20 | """ 21 | raise NotImplementedError 22 | 23 | @abc.abstractmethod 24 | def run(self, cw_config: dict, rep: int, logger: cw_logging.LoggerArray) -> None: 25 | """needs to be implemented by subclass. 26 | Called after initialize(). Should be the main procedure of the experiment. 27 | 28 | Args: 29 | config (dict): clusterwork experiment configuration 30 | rep (int): [description] 31 | logger (cw_logging.LoggerArray): [description] 32 | 33 | Raises: 34 | NotImplementedError: [description] 35 | """ 36 | raise NotImplementedError 37 | 38 | @abc.abstractmethod 39 | def finalize(self, surrender: ExperimentSurrender = None, crash: bool = False): 40 | """needs to be implemented by subclass. 41 | Is guaranteed to be called after the experiment has run, even in case of exceptions during execution. 42 | 43 | Args: 44 | surrender (ExperimentSurrender, optional): when the experiment raises an ExperimentSurrender, this object can be accessed here. Defaults to None. 45 | crash (bool, optional): indicating if the experiment raised a 'serious' Exception. Defaults to False. 46 | """ 47 | raise NotImplementedError 48 | 49 | 50 | class AbstractIterativeExperiment(AbstractExperiment): 51 | @abc.abstractmethod 52 | def iterate(self, cw_config: dict, rep: int, n: int) -> dict: 53 | """needs to be implemented by subclass. 54 | The iteration procedure. 
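Called once per iteration n of a repetition. The returned result dict is enriched by run() with timestamp, repetition and iteration counters and then passed to all configured loggers.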
55 | 56 | Arguments: 57 | cw_config {dict} -- clusterwork experiment configuration 58 | rep {int} -- repitition counter 59 | n {int} -- iteration counter 60 | 61 | Returns: 62 | dict -- result map 63 | """ 64 | raise NotImplementedError 65 | 66 | @abc.abstractmethod 67 | def save_state(self, cw_config: dict, rep: int, n: int) -> None: 68 | """needs to be implemented by subclass. 69 | Intended to save an intermediate state after each iteration. 70 | Arguments: 71 | cw_config {dict} -- clusterwork experiment configuration 72 | rep {int} -- repitition counter 73 | n {int} -- [description] 74 | """ 75 | raise NotImplementedError 76 | 77 | def run(self, cw_config: dict, rep: int, logger: cw_logging.LoggerArray) -> None: 78 | for n in range(cw_config["iterations"]): 79 | surrender = False 80 | try: 81 | res = self.iterate(cw_config, rep, n) 82 | except ExperimentSurrender as e: 83 | res = e.payload 84 | surrender = True 85 | 86 | res["ts"] = dt.datetime.now() 87 | res["rep"] = rep 88 | res["iter"] = n 89 | logger.process(res) 90 | 91 | self.save_state(cw_config, rep, n) 92 | 93 | if surrender: 94 | raise ExperimentSurrender() 95 | -------------------------------------------------------------------------------- /cw2/job.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Dict, List, Type 3 | 4 | from cw2 import cw_error, experiment 5 | from cw2.cw_config import cw_conf_keys as KEYS 6 | from cw2.cw_data import cw_logging 7 | 8 | 9 | class Job: 10 | """Class defining a computation job. 11 | Can contain 1..n tasks. Each job should encapsulate all information necessary for execution. 12 | A task is an experiment configuration with unique repetition idx. 13 | """ 14 | 15 | def __init__( 16 | self, 17 | tasks: List[Dict], 18 | exp_cls: experiment.AbstractExperiment.__class__, 19 | logger: cw_logging.AbstractLogger, 20 | delete_old_files: bool = False, 21 | root_dir: str = "", 22 | read_only: bool = False, 23 | ): 24 | self.tasks = tasks 25 | 26 | if exp_cls is not None: 27 | self.exp = exp_cls() 28 | self.logger = logger 29 | 30 | self.n_parallel = 1 31 | if KEYS.REPS_PARALL in tasks[0]: 32 | self.n_parallel = tasks[0][KEYS.REPS_PARALL] 33 | 34 | self._root_dir = root_dir 35 | 36 | if not read_only: 37 | self.__create_experiment_directory(tasks, delete_old_files, root_dir) 38 | 39 | def __create_experiment_directory( 40 | self, tasks: List[Dict], delete_old_files=False, root_dir="" 41 | ): 42 | """internal function creating the directories in which the job will write its data. 43 | 44 | Args: 45 | task (List[attrdict.Attrdict]): a list of experiment tasks 46 | delete_old_files (bool, optional): Should the directory be emptied beforehand?. Defaults to False. 47 | root_dir (str, optional): [description]. Defaults to "". 48 | """ 49 | for conf in tasks: 50 | # create experiment path and subdir 51 | os.makedirs(os.path.join(root_dir, conf[KEYS.PATH]), exist_ok=True) 52 | 53 | # create a directory for the log path 54 | os.makedirs(os.path.join(root_dir, conf[KEYS.LOG_PATH]), exist_ok=True) 55 | 56 | # create log path for each repetition 57 | rep_path = os.path.join(root_dir, conf[KEYS.i_REP_LOG_PATH]) 58 | 59 | # XXX: Disable Delete for now 60 | """ 61 | if delete_old_files: 62 | pass 63 | """ 64 | os.makedirs(rep_path, exist_ok=True) 65 | 66 | def run_task(self, c: Dict, overwrite: bool): 67 | """Execute a single task of the job. 
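Runs the experiment's initialize(), run() and finalize() for one repetition. If results already exist and overwrite is not set, the task is skipped; ExperimentSurrender and other exceptions are caught and logged so the remaining tasks can continue.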
68 | 69 | Args: 70 | c (attrdict.AttrDict): task configuration 71 | """ 72 | rep_path = c[KEYS.i_REP_LOG_PATH] 73 | r = c[KEYS.i_REP_IDX] 74 | print(rep_path) 75 | 76 | if not overwrite and self._check_task_exists(c, r): 77 | cw_logging.getLogger().warning( 78 | "Skipping run, as {} is not empty. Use -o to overwrite.".format( 79 | rep_path 80 | ) 81 | ) 82 | return 83 | 84 | surrender = None 85 | crash = False 86 | 87 | self.logger.initialize(c, r, rep_path) 88 | try: 89 | self.exp.initialize(c, r, self.logger) 90 | self.exp.run(c, r, self.logger) 91 | except cw_error.ExperimentSurrender as s: 92 | cw_logging.getLogger().warning("SURRENDER: {}".format(rep_path)) 93 | surrender = s 94 | except: 95 | crash = True 96 | cw_logging.getLogger().exception("EXCEPTION: {}".format(rep_path)) 97 | 98 | self.exp.finalize(surrender, crash) 99 | self.logger.finalize() 100 | 101 | def load_task(self, c: Dict) -> Dict: 102 | """Load the results of a single task. 103 | 104 | Args: 105 | c (attrdict.AttrDict): task configuration 106 | 107 | Returns: 108 | dict: the loaded data 109 | """ 110 | rep_path = os.path.join(self._root_dir, c[KEYS.i_REP_LOG_PATH]) 111 | r = c[KEYS.i_REP_IDX] 112 | self.logger.initialize(c, r, rep_path) 113 | return self.logger.load() 114 | 115 | def _check_task_exists(self, c: Dict, r: int) -> bool: 116 | """internal function. checks if the task has already been run in the past. 117 | 118 | Args: 119 | c (attrdict.AttrDict): task configuration 120 | 121 | Returns: 122 | bool: True if the repetition was already run 123 | """ 124 | rep_path = c[KEYS.i_REP_LOG_PATH] 125 | return len(os.listdir(rep_path)) != 0 126 | 127 | 128 | class JobFactory: 129 | """Facotry class to create single jobs from experiment configuration. 130 | Specifially used to map experiment repetitions to Jobs. 131 | """ 132 | 133 | def __init__( 134 | self, 135 | exp_cls: Type[experiment.AbstractExperiment], 136 | logger: cw_logging.AbstractLogger, 137 | delete_old_files: bool = False, 138 | root_dir: str = "", 139 | read_only: bool = False, 140 | ): 141 | self.exp_cls = exp_cls 142 | self.logger = logger 143 | self.delete_old_files = delete_old_files 144 | self.root_dir = root_dir 145 | self.read_only = read_only 146 | 147 | def _group_exp_tasks(self, task_confs: List[Dict]) -> Dict: 148 | """group tasks by experiment to access common attributes like reps_per_job 149 | 150 | Args: 151 | task_confs (List[attrdict.AttrDict]): list of all task configurations 152 | 153 | Returns: 154 | dict: dictionary of task configurations grouped by name. 155 | """ 156 | grouped_exps = {} 157 | for t in task_confs: 158 | name = t[KEYS.NAME] 159 | if name not in grouped_exps: 160 | grouped_exps[name] = [] 161 | grouped_exps[name].append(t) 162 | return grouped_exps 163 | 164 | def _divide_tasks(self, task_confs: List[Dict]) -> List[List[Dict]]: 165 | """internal function to divide experiment repetitions into sets of repetitions. 166 | Dependent on configured reps_per_job attribute. Each set of repetitions will be one job. 
167 | 168 | Args: 169 | task_confs (List[attrdict.AttrDict]): List of task configurations 170 | 171 | Returns: 172 | List[List[attrdict.AttrDict]]: a list containing all subpackages of tasks as lists 173 | """ 174 | grouped_exps = self._group_exp_tasks(task_confs) 175 | tasks = [] 176 | 177 | for exp_name in grouped_exps: 178 | exp_group = grouped_exps[exp_name] 179 | 180 | max_rep = len(exp_group) 181 | 182 | # Use 1 Repetition per job if not defined otherwise 183 | rep_portion = 1 184 | if KEYS.REPS_P_JOB in exp_group[0]: 185 | rep_portion = exp_group[0][KEYS.REPS_P_JOB] 186 | 187 | for start_rep in range(0, max_rep, rep_portion): 188 | tasks.append(exp_group[start_rep : start_rep + rep_portion]) 189 | return tasks 190 | 191 | def create_jobs(self, exp_configs: List[Dict]) -> List[Job]: 192 | """creates a list of all jobs. 193 | 194 | Args: 195 | exp_configs (List[attrdict.AttrDict]): list of all defined experiment configurations. 196 | 197 | Returns: 198 | List[Job]: list of configured jobs. 199 | """ 200 | task_list = self._divide_tasks(exp_configs) 201 | joblist = [] 202 | for task in task_list: 203 | j = Job( 204 | task, 205 | self.exp_cls, 206 | self.logger, 207 | self.delete_old_files, 208 | self.root_dir, 209 | self.read_only, 210 | ) 211 | joblist.append(j) 212 | return joblist 213 | -------------------------------------------------------------------------------- /cw2/scheduler.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import concurrent.futures 3 | import multiprocessing 4 | import os 5 | import socket 6 | import warnings 7 | from typing import List 8 | 9 | from joblib import Parallel, delayed 10 | 11 | from cw2 import cw_error, job 12 | from cw2.cw_config import cw_conf_keys as KEYS 13 | from cw2.cw_config import cw_config 14 | from cw2.cw_slurm import cw_slurm 15 | 16 | 17 | class AbstractScheduler(abc.ABC): 18 | def __init__(self, conf: cw_config.Config = None): 19 | self.joblist = None 20 | self.config = conf 21 | 22 | def assign(self, joblist: List[job.Job]) -> None: 23 | """assigns the scheduler a list of jobs to execute 24 | 25 | Arguments: 26 | joblist {List[job.AbstractJob]} -- list of configured and implemented jobs 27 | """ 28 | self.joblist = joblist 29 | 30 | @abc.abstractmethod 31 | def run(self, overwrite=False): 32 | """the scheduler begins to execute all assigned jobs 33 | 34 | Args: 35 | overwrite (bool, optional): overwrite flag. can be passed to the job. Defaults to False. 36 | """ 37 | raise NotImplementedError 38 | 39 | 40 | class GPUDistributingLocalScheduler(AbstractScheduler): 41 | def __init__(self, conf: cw_config.Config = None): 42 | super(GPUDistributingLocalScheduler, self).__init__(conf=conf) 43 | self._total_num_gpus = int( 44 | conf.slurm_config["sbatch_args"]["gres"].rsplit(":", 1)[1] 45 | ) 46 | self._gpus_per_rep = conf.slurm_config["gpus_per_rep"] 47 | self._queue_elements = int(self._total_num_gpus / self._gpus_per_rep) 48 | 49 | print( 50 | "GPUDistributingLocalScheduler: {} GPUs available, {} GPUs per rep, {} queue elements".format( 51 | self._total_num_gpus, self._gpus_per_rep, self._queue_elements 52 | ) 53 | ) 54 | 55 | if self._gpus_per_rep >= 1.0: 56 | assert self._gpus_per_rep == int( 57 | self._gpus_per_rep 58 | ), "gpus_per_rep must be integer" 59 | 60 | @staticmethod 61 | def use_distributed_gpu_scheduling(conf: cw_config.Config) -> bool: 62 | if conf.slurm_config is None: 63 | return False 64 | # Use if 65 | # 1.) GPUs Requested 66 | # 2.) 
Number of GPUs per rep specified 67 | # 3.) Number of GPUs per rep != total number of gpus requested 68 | gpus_requested = "gres" in conf.slurm_config.get("sbatch_args", "DUMMY_DEFAULT") 69 | gpus_per_rep_specified = "gpus_per_rep" in conf.slurm_config 70 | 71 | if gpus_requested: 72 | num_gpus_requested = int( 73 | conf.slurm_config["sbatch_args"]["gres"].rsplit(":", 1)[1] 74 | ) 75 | # e.g. gres=gpu:4 or gres=gpu:full:4 76 | else: 77 | num_gpus_requested = 0 78 | 79 | use_distributed_gpu_scheduling = ( 80 | gpus_requested 81 | and gpus_per_rep_specified 82 | and num_gpus_requested != conf.slurm_config["gpus_per_rep"] 83 | ) 84 | 85 | if not use_distributed_gpu_scheduling: 86 | on_horeka_gpu = ( 87 | "hkn" in socket.gethostname() 88 | and conf.slurm_config["partition"] == "accelerated" 89 | ) 90 | if on_horeka_gpu: 91 | assert ( 92 | num_gpus_requested == 4 93 | ), "On HoreKA, you must request 4 GPUs (gres=gpu:4)" 94 | assert ( 95 | not on_horeka_gpu 96 | ), "You are on HoreKA and not using the GPU scheduler, don't! " 97 | 98 | return use_distributed_gpu_scheduling 99 | 100 | @staticmethod 101 | def get_gpu_str(queue_idx: int, gpus_per_rep: float) -> str: 102 | if gpus_per_rep >= 1: 103 | assert ( 104 | int(gpus_per_rep) == gpus_per_rep 105 | ), "gpus_per_rep must be integer if >= 1" 106 | gpus_per_rep = int(gpus_per_rep) 107 | return ("{}," * gpus_per_rep).format( 108 | *[queue_idx * gpus_per_rep + i for i in range(gpus_per_rep)] 109 | )[:-1] 110 | else: 111 | return str(int(queue_idx * gpus_per_rep) + 0.01) 112 | 113 | 114 | class MPGPUDistributingLocalScheduler(GPUDistributingLocalScheduler): 115 | def run(self, overwrite: bool = False): 116 | num_parallel = self.joblist[0].n_parallel 117 | for j in self.joblist: 118 | assert ( 119 | j.n_parallel == num_parallel 120 | ), "All jobs in list must have same n_parallel" 121 | assert j.n_parallel == self._queue_elements, ( 122 | "Mismatch between GPUs Queue Elements and Jobs executed in" 123 | "parallel. Fix for optimal resource usage!!" 
124 | ) 125 | 126 | with multiprocessing.Pool(processes=num_parallel) as pool: 127 | # setup gpu resource queue 128 | m = multiprocessing.Manager() 129 | gpu_queue = m.Queue(maxsize=self._queue_elements) 130 | for i in range(self._queue_elements): 131 | gpu_queue.put(i) 132 | 133 | for j in self.joblist: 134 | for c in j.tasks: 135 | pool.apply_async( 136 | MPGPUDistributingLocalScheduler._execute_task, 137 | (j, c, gpu_queue, self._gpus_per_rep, overwrite), 138 | ) 139 | pool.close() 140 | pool.join() 141 | 142 | @staticmethod 143 | def _execute_task( 144 | j: job.Job, 145 | c: dict, 146 | q: multiprocessing.Queue, 147 | gpus_per_rep: int, 148 | overwrite: bool = False, 149 | ): 150 | queue_idx = q.get() 151 | gpu_str = MPGPUDistributingLocalScheduler.get_gpu_str(queue_idx, gpus_per_rep) 152 | try: 153 | os.environ["CUDA_VISIBLE_DEVICES"] = gpu_str 154 | j.run_task(c, overwrite) 155 | except cw_error.ExperimentSurrender as _: 156 | return 157 | finally: 158 | q.put(queue_idx) 159 | 160 | 161 | class HOREKAAffinityGPUDistributingLocalScheduler(GPUDistributingLocalScheduler): 162 | def __init__(self, conf: cw_config.Config = None): 163 | super(HOREKAAffinityGPUDistributingLocalScheduler, self).__init__(conf=conf) 164 | 165 | total_cpus = conf.slurm_config["cpus-per-task"] * conf.slurm_config["ntasks"] 166 | self._cpus_per_rep = total_cpus // self._queue_elements 167 | 168 | assert ( 169 | self._cpus_per_rep > 0 170 | ), "Not enough CPUs for the number of GPUs requested" 171 | 172 | def run(self, overwrite: bool = False): 173 | print("Seeing CPUs:", os.sched_getaffinity(0)) 174 | num_parallel = self.joblist[0].n_parallel 175 | for j in self.joblist: 176 | assert ( 177 | j.n_parallel == num_parallel 178 | ), "All jobs in list must have same n_parallel" 179 | assert j.n_parallel == self._queue_elements, ( 180 | "Mismatch between GPUs Queue Elements and Jobs executed in" 181 | "parallel. Fix for optimal resource usage!!" 
182 | ) 183 | 184 | with concurrent.futures.ProcessPoolExecutor( 185 | max_workers=num_parallel, 186 | ) as pool: 187 | # setup gpu resource queue 188 | m = multiprocessing.Manager() 189 | gpu_queue = m.Queue(maxsize=self._queue_elements) 190 | for i in range(self._queue_elements): 191 | gpu_queue.put(i) 192 | 193 | for j in self.joblist: 194 | for c in j.tasks: 195 | pool.submit( 196 | HOREKAAffinityGPUDistributingLocalScheduler._execute_task, 197 | j, 198 | c, 199 | gpu_queue, 200 | self._gpus_per_rep, 201 | self._cpus_per_rep, 202 | overwrite, 203 | ) 204 | 205 | @staticmethod 206 | def _execute_task( 207 | j: job.Job, 208 | c: dict, 209 | q: multiprocessing.Queue, 210 | gpus_per_rep: int, 211 | cpus_per_rep: int, 212 | overwrite: bool = False, 213 | ): 214 | print("Seeing CPUs:", os.sched_getaffinity(0)) 215 | queue_idx = q.get() 216 | gpu_str = HOREKAAffinityGPUDistributingLocalScheduler.get_gpu_str( 217 | queue_idx, gpus_per_rep 218 | ) 219 | cpus = set(range(queue_idx * cpus_per_rep, (queue_idx + 1) * cpus_per_rep)) 220 | print("Job {}: Using GPUs: {} and CPUs: {}".format(queue_idx, gpu_str, cpus)) 221 | try: 222 | os.sched_setaffinity(0, cpus) 223 | c[KEYS.i_CPU_CORES] = cpus 224 | os.environ["CUDA_VISIBLE_DEVICES"] = gpu_str 225 | j.run_task(c, overwrite) 226 | except cw_error.ExperimentSurrender as _: 227 | return 228 | finally: 229 | q.put(queue_idx) 230 | 231 | 232 | class KlusterThreadLimitingScheduler(GPUDistributingLocalScheduler): 233 | def __init__(self, conf: cw_config.Config = None): 234 | super(KlusterThreadLimitingScheduler, self).__init__(conf=conf) 235 | total_cpus = conf.slurm_config["cpus-per-task"] * conf.slurm_config["ntasks"] 236 | self._num_threads = total_cpus // self._queue_elements 237 | print("Using {} threads per Rep".format(self._num_threads)) 238 | 239 | def run(self, overwrite: bool = False): 240 | num_parallel = self.joblist[0].n_parallel 241 | for j in self.joblist: 242 | assert ( 243 | j.n_parallel == num_parallel 244 | ), "All jobs in list must have same n_parallel" 245 | assert j.n_parallel == self._queue_elements, ( 246 | "Mismatch between GPUs Queue Elements and Jobs executed in" 247 | "parallel. Fix for optimal resource usage!!" 
248 | ) 249 | 250 | with multiprocessing.Pool(processes=num_parallel) as pool: 251 | # setup gpu resource queue 252 | m = multiprocessing.Manager() 253 | gpu_queue = m.Queue(maxsize=self._queue_elements) 254 | for i in range(self._queue_elements): 255 | gpu_queue.put(i) 256 | 257 | for j in self.joblist: 258 | for c in j.tasks: 259 | args = ( 260 | j, 261 | c, 262 | gpu_queue, 263 | self._gpus_per_rep, 264 | self._num_threads, 265 | overwrite, 266 | ) 267 | pool.apply_async(KlusterThreadLimitingScheduler._execute_task, args) 268 | pool.close() 269 | pool.join() 270 | 271 | @staticmethod 272 | def _execute_task( 273 | j: job.Job, 274 | c: dict, 275 | q: multiprocessing.Queue, 276 | gpus_per_rep: int, 277 | num_threads: int, 278 | overwrite: bool = False, 279 | ): 280 | queue_idx = q.get() 281 | gpu_str = KlusterThreadLimitingScheduler.get_gpu_str(queue_idx, gpus_per_rep) 282 | try: 283 | os.environ["MKL_NUM_THREADS"] = str(num_threads) 284 | os.environ["NUMEXPR_NUM_THREADS"] = str(num_threads) 285 | os.environ["OMP_NUM_THREADS"] = str(num_threads) 286 | # Ok, that's not so nice, but I did not find better way yet 287 | try: 288 | import torch 289 | 290 | torch.set_num_threads(num_threads) 291 | except ImportError: 292 | pass 293 | 294 | os.environ["CUDA_VISIBLE_DEVICES"] = gpu_str 295 | j.run_task(c, overwrite) 296 | except cw_error.ExperimentSurrender as _: 297 | return 298 | finally: 299 | q.put(queue_idx) 300 | 301 | 302 | def get_gpu_scheduler_cls(scheduler: str): 303 | if scheduler == "mp": 304 | return MPGPUDistributingLocalScheduler 305 | elif scheduler == "horeka": 306 | return HOREKAAffinityGPUDistributingLocalScheduler 307 | elif scheduler == "kluster": 308 | return KlusterThreadLimitingScheduler 309 | else: 310 | raise NotImplementedError 311 | 312 | 313 | class CpuDistributingLocalScheduler(AbstractScheduler): 314 | def __init__(self, conf: cw_config.Config = None): 315 | super(CpuDistributingLocalScheduler, self).__init__(conf=conf) 316 | self._total_num_cpus = ( 317 | conf.slurm_config["cpus-per-task"] * conf.slurm_config["ntasks"] 318 | ) 319 | self._cpus_per_rep = conf.slurm_config["cpus_per_rep"] 320 | assert self._cpus_per_rep == int( 321 | self._cpus_per_rep 322 | ), "cpus_per_rep must be integer" 323 | self._queue_elements = int(self._total_num_cpus / self._cpus_per_rep) 324 | print( 325 | "CPUDistributingLocalScheduler: {} CPUs available, {} CPUs per rep, {} queue elements".format( 326 | self._total_num_cpus, self._cpus_per_rep, self._queue_elements 327 | ) 328 | ) 329 | 330 | def run(self, overwrite: bool = False): 331 | print("Seeing CPUs:", os.sched_getaffinity(0)) 332 | num_parallel = self.joblist[0].n_parallel 333 | for j in self.joblist: 334 | assert ( 335 | j.n_parallel == num_parallel 336 | ), "All jobs in list must have same n_parallel" 337 | assert j.n_parallel == self._queue_elements, ( 338 | "Mismatch between CPUs Queue Elements and Jobs executed in" 339 | "parallel. Fix for optimal resource usage!!" 
340 | ) 341 | 342 | with concurrent.futures.ProcessPoolExecutor( 343 | max_workers=num_parallel, 344 | ) as pool: 345 | # setup gpu resource queue 346 | m = multiprocessing.Manager() 347 | cpu_queue = m.Queue(maxsize=self._queue_elements) 348 | for i in range(self._queue_elements): 349 | cpu_queue.put(i) 350 | 351 | for j in self.joblist: 352 | for c in j.tasks: 353 | pool.submit( 354 | CpuDistributingLocalScheduler._execute_task, 355 | j, 356 | c, 357 | cpu_queue, 358 | self._cpus_per_rep, 359 | overwrite, 360 | ) 361 | 362 | @staticmethod 363 | def _execute_task( 364 | j: job.Job, 365 | c: dict, 366 | q: multiprocessing.Queue, 367 | cpus_per_rep: int, 368 | overwrite: bool = False, 369 | ): 370 | print("Seeing CPUs:", os.sched_getaffinity(0)) 371 | queue_idx = q.get() 372 | cpus = set(range(queue_idx * cpus_per_rep, (queue_idx + 1) * cpus_per_rep)) 373 | print("Job {}: Using CPUs: {}".format(queue_idx, cpus)) 374 | try: 375 | os.sched_setaffinity(0, cpus) 376 | c[KEYS.i_CPU_CORES] = cpus 377 | j.run_task(c, overwrite) 378 | except cw_error.ExperimentSurrender as _: 379 | return 380 | finally: 381 | q.put(queue_idx) 382 | 383 | @staticmethod 384 | def use_distributed_cpu_scheduling(conf: cw_config.Config) -> bool: 385 | if conf.slurm_config is None: 386 | return False 387 | else: 388 | scheduler = conf.slurm_config.get("scheduler", None) 389 | return scheduler == "cpu_distribute" 390 | 391 | 392 | class LocalScheduler(AbstractScheduler): 393 | def run(self, overwrite: bool = False): 394 | for j in self.joblist: 395 | Parallel(n_jobs=j.n_parallel)( 396 | delayed(self.execute_task)(j, c, overwrite) for c in j.tasks 397 | ) 398 | 399 | def execute_task(self, j: job.Job, c: dict, overwrite: bool = False): 400 | try: 401 | j.run_task(c, overwrite) 402 | except cw_error.ExperimentSurrender as _: 403 | return 404 | 405 | 406 | class SlurmScheduler(AbstractScheduler): 407 | def run(self, overwrite: bool = False): 408 | cw_slurm.run_slurm(self.config, len(self.joblist)) 409 | -------------------------------------------------------------------------------- /cw2/util.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import os 3 | import re 4 | 5 | try: 6 | from collections.abc import Mapping, MutableMapping, MutableSequence # noqa 7 | except ImportError: 8 | from collections import Mapping, MutableMapping, MutableSequence # noqa 9 | 10 | 11 | def deep_update(base_dict: dict, update_dict: dict) -> dict: 12 | """Updates the base dictionary with corresponding values from the update dictionary, including nested collections. 13 | Not updated values are kept as is. 
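For example (illustrative values): deep_update({"a": {"x": 1, "y": 2}}, {"a": {"y": 3}}) returns {"a": {"x": 1, "y": 3}}.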
14 | 15 | Arguments: 16 | base_dict {dict} -- dictionary to be updated 17 | update_dict {dict} -- dictianry holding update values 18 | 19 | Returns: 20 | dict -- dictanry with updated values 21 | """ 22 | for key, value in update_dict.items(): 23 | # Update Recursively 24 | if isinstance(value, Mapping): 25 | branch = deep_update(base_dict.get(key, {}), value) 26 | base_dict[key] = branch 27 | else: 28 | base_dict[key] = update_dict[key] 29 | return base_dict 30 | 31 | 32 | def flatten_dict(d, parent_key="", sep="_"): 33 | items = [] 34 | for k, v in d.items(): 35 | new_key = parent_key + sep + k if parent_key else k 36 | if isinstance(v, MutableMapping): 37 | items.extend(flatten_dict(v, new_key, sep=sep).items()) 38 | elif isinstance(v, MutableSequence): 39 | keys = map(lambda i: new_key + "_" + str(i), range(len(v))) 40 | items.extend(zip(keys, v)) 41 | else: 42 | items.append((new_key, v)) 43 | return dict(items) 44 | 45 | 46 | def flatten_dict_to_tuple_keys(d: MutableMapping): 47 | flat_dict = {} 48 | for k, v in d.items(): 49 | if isinstance(v, MutableMapping): 50 | sub_dict = flatten_dict_to_tuple_keys(v) 51 | flat_dict.update({(k, *sk): sv for sk, sv in sub_dict.items()}) 52 | 53 | elif isinstance(v, MutableSequence): 54 | flat_dict[(k,)] = v 55 | 56 | return flat_dict 57 | 58 | 59 | def insert_deep_dictionary(d: MutableMapping, t: tuple, value): 60 | if type(t) is tuple: 61 | if len(t) == 1: # tuple contains only one key 62 | d[t[0]] = value 63 | else: # tuple contains more than one key 64 | if t[0] not in d: 65 | d[t[0]] = dict() 66 | insert_deep_dictionary(d[t[0]], t[1:], value) 67 | else: 68 | d[t] = value 69 | 70 | 71 | def append_deep_dictionary(d: MutableMapping, t: tuple, value): 72 | if type(t) is tuple: 73 | if len(t) == 1: # tuple contains only one key 74 | if t[0] not in d: 75 | d[t[0]] = [] 76 | d[t[0]].append(value) 77 | else: # tuple contains more than one key 78 | if t[0] not in d: 79 | d[t[0]] = dict() 80 | append_deep_dictionary(d[t[0]], t[1:], value) 81 | else: 82 | d[t] = value 83 | 84 | 85 | def format_time(time_in_secs: float) -> str: 86 | return str(datetime.timedelta(seconds=time_in_secs)) 87 | 88 | 89 | def shorten_param(_param_name): 90 | name_parts = _param_name.split(".") 91 | shortened_parts = ".".join(map(lambda s: s[:3], name_parts[:-1])) 92 | # also handle cases where the leaf name contains '__' then splitting at '_' yields an empty '' string element 93 | shortened_leaf = "".join(map(lambda s: '' if len(s) <= 0 else s[0], name_parts[-1].split("_"))) 94 | if shortened_parts: 95 | return shortened_parts + "." + shortened_leaf 96 | else: 97 | return shortened_leaf 98 | 99 | 100 | def get_size(start_path: str): 101 | """recursively compute size of a directory 102 | 103 | Args: 104 | start_path (str): directory path 105 | 106 | Returns: 107 | size in MByte 108 | """ 109 | total_size = 0 110 | for dirpath, _, filenames in os.walk(start_path): 111 | for f in filenames: 112 | fp = os.path.join(dirpath, f) 113 | total_size += os.path.getsize(fp) 114 | return total_size / 1000000.0 115 | 116 | 117 | def check_subdir(parent: str, child: str) -> bool: 118 | """Check if the child is a subdirectory of the parent. 
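For example (illustrative paths): check_subdir("/tmp/exp", "/tmp/exp/code") returns True, while check_subdir("/tmp/exp", "/tmp/other") returns False.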
119 | 120 | Args: 121 | parent (str): Path of the suspected parent dir 122 | child (str): path of the suspected child dir 123 | 124 | Returns: 125 | bool: True if child is subdir of parent 126 | """ 127 | parent_path = os.path.abspath(parent) 128 | child_path = os.path.abspath(child) 129 | 130 | return os.path.commonpath([parent_path]) == os.path.commonpath( 131 | [parent_path, child_path] 132 | ) 133 | 134 | 135 | def convert_param_names(_param_names: list, values: list) -> str: 136 | """create new shorthand name derived from parameter and value association 137 | Arguments: 138 | _param_names (list): parameter names for the experiment 139 | values (list): concrete values for each parameter 140 | 141 | Returns: 142 | str: shorthand name 143 | """ 144 | 145 | _converted_name = "_".join( 146 | "{}{}".format(shorten_param(k), v) for k, v in zip(_param_names, values) 147 | ) 148 | # _converted_name = re.sub("[' \[\],()]", '', _converted_name) 149 | _converted_name = re.sub("[' ]", "", _converted_name) 150 | _converted_name = re.sub('["]', "", _converted_name) 151 | _converted_name = re.sub("[(\[]", "_", _converted_name) 152 | _converted_name = re.sub("[)\]]", "", _converted_name) 153 | _converted_name = re.sub("[,]", "_", _converted_name) 154 | return _converted_name 155 | 156 | 157 | def get_file_names_in_directory(directory: str) -> [str]: 158 | """ 159 | Get file names in given directory 160 | Args: 161 | directory: directory where you want to explore 162 | 163 | Returns: 164 | file names in a list 165 | 166 | """ 167 | file_names = None 168 | try: 169 | (_, _, file_names) = next(os.walk(directory)) 170 | if len(file_names) == 0: 171 | file_names = None 172 | except StopIteration as e: 173 | print("Cannot read files from directory: ", directory) 174 | return file_names 175 | -------------------------------------------------------------------------------- /doc/01_quickstart.md: -------------------------------------------------------------------------------- 1 | # 1. Quickstart Guide 2 | To deploy an existing project using **cw2**, the following highlevel steps are required: 3 | 4 | - [1. Quickstart Guide](#1-quickstart-guide) 5 | - [1.1. Experiment Implementation](#11-experiment-implementation) 6 | - [1.2. Main() Function](#12-main-function) 7 | - [1.3. Config YAML](#13-config-yaml) 8 | - [1.4. Program Execution](#14-program-execution) 9 | 10 | 11 | This quickstart guide is intended to help you quickly deploy your existing project. To develop a more robust understanding of the mechanisms behind **cw2**, please refer to the corresponding sections of the [User Guide](./). 12 | 13 | You can find barebones templates in the [template folder](../cw2/../templates/). 14 | 15 | ## 1.1. Experiment Implementation 16 | **cw2** requires that your program logic implements the [`cw2.experiment.AbstractExperiment`](../cw2/experiment.py) interface. 17 | 18 | Lets assume you already have a working python project `existing_project.py` 19 | ```python 20 | # existing_project.py 21 | def project_main(): 22 | # perform my program 23 | # ... 24 | 25 | if __name__ == "__main__": 26 | project_main() 27 | ``` 28 | 29 | Create a new file to implement the `AbstractExperiment` interface, e.g. 
`MY_CW_MAIN.py`, and call your existing project's main (`project_main`) inside the experiments `run()` function: 30 | 31 | ```python 32 | # MY_CW_MAIN.py 33 | from cw2 import experiment, cw_error 34 | from cw2.cw_data import cw_logging 35 | 36 | import existing_project 37 | 38 | class MyExperiment(experiment.AbstractExperiment): 39 | # ... 40 | 41 | def initialize(self, config: dict, rep: int, logger: cw_logging.LoggerArray) -> None: 42 | # Skip for Quickguide 43 | pass 44 | 45 | def run(self, config: dict, rep: int, logger: cw_logging.LoggerArray) -> None: 46 | # Perform your existing task 47 | existing_project.project_main() 48 | 49 | def finalize(self, surrender: cw_error.ExperimentSurrender = None, crash: bool = False): 50 | # Skip for Quickguide 51 | pass 52 | ``` 53 | For more information on the experiment interface: [Experiment Class](02_experiment.md) 54 | ## 1.2. Main() Function 55 | 56 | As with any Python program, you need to define a `__main__` function. 57 | 58 | It creates a `ClusterWork` instance with your experiment. If you want to use any compatible [loggers](07_logging.md), you can also add them here. Finally it will start experiment: 59 | 60 | ```Python 61 | from cw2 import cluster_work 62 | 63 | if __name__ == "__main__": 64 | # Give the MyExperiment Class, not MyExperiment() Object!! 65 | cw = cluster_work.ClusterWork(MyExperiment) 66 | 67 | # Optional: Add loggers 68 | cw.add_logger(...) 69 | 70 | # RUN! 71 | cw.run() 72 | ``` 73 | The easiest location for this main function is in the same file as your experiment implementation, e.g. `MY_CW_MAIN.py` 74 | 75 | For more information on Logging: [Logging Results](07_logging.md) 76 | 77 | ## 1.3. Config YAML 78 | To qucikly deploy your first **cw2** experiment, create a simple YAML configuration file: 79 | 80 | ```yaml 81 | --- 82 | # Experiment 1 83 | name: "experiment_name" 84 | 85 | # Required: Can also be set in DEFAULT 86 | path: "path/to/output_dir/" # location to save results in 87 | repetitions: 1 # number of times one set of parameters is run 88 | 89 | # Experiment Parameters: 90 | params: 91 | key: 'value' 92 | ``` 93 | 94 | We strongly recommend you read the [Config Guide](03_config.md) to better understand what the different options mean, and how you can use this file to efficiently define hyperparameter grids. 95 | 96 | 97 | ## 1.4. Program Execution 98 | To start an experiment locally, e.g. for testing: 99 | ```bash 100 | python3 MY_CW_MAIN.py YOUR_CONFIG.yml 101 | ``` 102 | 103 | To start an experiment on a slurm cluster: 104 | ```bash 105 | python3 MY_CW_MAIN.py YOUR_CONFIG.yml -s 106 | ``` 107 | 108 | For more information on slurm: [Slurm Guide](04_slurm.md) 109 | 110 | For more information on available CLI Arguments: [CLI at a Glance](11_cli_args.md) 111 | 112 | [Back to Overview](./) 113 | -------------------------------------------------------------------------------- /doc/02_experiment.md: -------------------------------------------------------------------------------- 1 | # 2. Experiment Class 2 | 3 | - [2. Experiment Class](#2-experiment-class) 4 | - [2.1. Initialize](#21-initialize) 5 | - [2.1.1 Can I use `__init__` for a global counter ? 
__**NO**__!!!](#211-can-i-use-__init__-for-a-global-counter--no) 6 | - [2.2 Run](#22-run) 7 | - [2.2.1 cw_config: dict](#221-cw_config-dict) 8 | - [2.2.2 rep: int](#222-rep-int) 9 | - [2.2.3 logger: LoggerArray](#223-logger-loggerarray) 10 | - [2.3 Finalize](#23-finalize) 11 | - [2.4 Iterative Experiment](#24-iterative-experiment) 12 | - [2.4.1 Iterate](#241-iterate) 13 | - [2.4.2 Save State](#242-save-state) 14 | 15 | To run your project with **cw2** you must implement the [`AbstractExperiment`](../cw2/experiment.py) interface. 16 | This ensures that you can run multiple repetitions (e.g. for numerically unstable experiments) in the same process during local execution, or deploy them massively parallelized on a computing cluster using slurm. 17 | 18 | This interface provides three functions 19 | 20 | - `initialize()` 21 | - `run()` 22 | - `finalize()` 23 | 24 | corresponding to three phases during program execution. In the abstract, a **cw2** run, whether running locally in a single thread or distributed using slurm, takes the form of: 25 | 26 | ```Python 27 | exp = AbstractExperiment() # Object is created once! __init__ is only called once!! 28 | 29 | 30 | for r in repetitions: # Can be parallelized or sequential! 31 | exp.initialize(...) # Initialize / Reset the experiment for each repetition / thread 32 | exp.run(...) # Execute experiment logic 33 | exp.finalize() # Finalize / Clean the experiment after each repetition / thread. Close all writers, etc. 34 | ``` 35 | 36 | A repetition is the repeated execution of an experiment with the exact same configuration of parameters. 37 | 38 | 39 | ## 2.1. Initialize 40 | The `initialize()` method should be used like the `__init__` constructor typically found on Python objects. It is called before each experiment execution, whereas the constructor is only called once at the very start. Because the experiment object is not instantiated anew for each execution, unwanted carry-over effects between executions might occur. Take the following example: 41 | 42 | ```python 43 | class FaultyExperiment(AbstractExperiment): 44 | def __init__(self): 45 | # Is set only once during lifetime 46 | self.speed_of_light = 300 # 1000 km / s 47 | 48 | def initialize(self, ...): 49 | self.distance_traveled = 0 50 | 51 | def run(self, ...): 52 | self.distance_traveled += self.speed_of_light 53 | # Activate Warp Speed: 54 | self.speed_of_light *= 2 55 | 56 | def finalize(self, ...): 57 | print("Repetition " + str(rep)) 58 | print(self.distance_traveled) 59 | ``` 60 | 61 | If you run this `FaultyExperiment` with three repetitions, you will get an output like: 62 | ``` 63 | Repetition 0: 64 | 300 65 | 66 | Repetition 1: 67 | 600 68 | 69 | Repetition 2: 70 | 1200 71 | ``` 72 | The `distance_traveled` sum gets reset to 0 at the start of each repetition. But `speed_of_light` is modified during the `run()` function, and this change persists across repetitions. 73 | 74 | ### 2.1.1 Can I use `__init__` for a global counter ? __**NO**__!!! 75 | When deploying on a computing cluster using slurm, every repetition is most likely executed in its own independent process. This results in a dual set of requirements for your experiment implementation: 76 | 77 | 1. Each experiment repetition should be independently deployable. Do not assume that you can access any results from an earlier repetition through `self.*` fields. The only kind of persistence you can rely on is writing results to disk. 78 | 2.
Do not rely on the Experiment instance being destroyed between repetitions. Always assume that `self.*` fields might carry leftover information unless explicitly (re)set in the `initialize()` method. 79 | 80 | ## 2.2 Run 81 | The `run()` method should implement the main logic / process of your project. There are no restrictions on what you can do here. As this function is probably the most important in your implementation, we want to discuss its parameters in more detail. 82 | 83 | ```python 84 | def run(self, cw_config: dict, rep: int, logger: cw_logging.LoggerArray) -> None: 85 | ... 86 | ``` 87 | ### 2.2.1 cw_config: dict 88 | `cw_config` is a dictionary containing an unrolled experiment configuration. Unrolled means that `grid` and `list` keywords have been resolved and the `DEFAULT` documents have been merged. 89 | Important keys of this `dict` for your implementation might be: 90 | - `params`: containing the unrolled `params` section of your configuration file. See [Configuration YAML File](03_config.md) for more information. 91 | - `_rep_log_path`: a path unique to this repetition. You can write your results / logs to this directory. It is guaranteed to exist and to be threadsafe. No other experimental run of your deployment will access this path. See [CW2 File System](05_files.md) for more information. 92 | 93 | ### 2.2.2 rep: int 94 | `rep` is an integer indicating the repetition number of this run. As the repetitions are mostly intended to repeat the same parameter combination for numerically unstable experiments, the most likely scenario to use this parameter is to seed a random number generator, e.g. 95 | 96 | ```python 97 | np.random.seed(rep) 98 | ``` 99 | 100 | The repetition number is not globally unique, meaning you cannot use the `rep` argument alone to save your results in a global database. 101 | Assume you have the following YAML configuration: 102 | 103 | ```yaml 104 | --- 105 | name: "exp_1" 106 | repetitions: 2 107 | 108 | grid: 109 | x: [1, 2] 110 | y: [3, 4] 111 | ``` 112 | 113 | The `grid` keyword will generate 2x2 = 4 parameter combinations with 2 repetitions each, resulting in a total of 8 runs. 114 | Assume an Experiment implementation with the following `run()` function: 115 | 116 | ```python 117 | def run(self, cw_config: dict, rep: int, ...): 118 | print(cw_config['params']) 119 | print(rep) 120 | ``` 121 | Output: 122 | ``` 123 | x: 1, y: 3 124 | rep: 0 125 | 126 | x: 1, y: 3 127 | rep: 1 128 | 129 | x: 2, y: 3 130 | rep: 0 131 | 132 | x: 2, y: 3 133 | rep: 1 134 | 135 | ... 136 | ``` 137 | Only the combination of `params` and `rep` is unique to each run, as is the `_rep_log_path`. 138 | 139 | 140 | ### 2.2.3 logger: LoggerArray 141 | `logger` is a [`LoggerArray`](../cw2/cw_data/cw_logging.py) object. If you have added any Logger objects, you can pass them your results / messages with 142 | ```python 143 | msg = {} 144 | logger.process(msg) 145 | ``` 146 | See [Logging Results](07_logging.md) for more information. 147 | 148 | ## 2.3 Finalize 149 | The finalize function is called after `run()` has finished, at the end of each repetition. The intention for this function is to close any opened writers / database connections, and maybe summarize the results from an (iterative) experiment execution. The function signature of `finalize()` differs from the other `AbstractExperiment` functions. 150 | 151 | ```python 152 | def finalize(self, surrender: ExperimentSurrender = None, crash: bool = False): 153 | ...
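    # Illustrative sketch (not from the cw2 sources) of what a finalize() body could
    # do with these two arguments. The `payload` attribute name is an assumption --
    # check cw_error.ExperimentSurrender for the actual field name.
    #
    #     if surrender is not None:
    #         summary = surrender.payload  # dict handed over when run() aborted early
    #         cw_logging.getLogger().info(summary)
    #     if crash:
    #         cw_logging.getLogger().warning("run() raised an uncaught exception")
    #     self.writer.close()  # hypothetical writer opened in initialize()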
154 | ``` 155 | 156 | If the `run()` function wants to abort early for whatever reason, e.g. converged loss function or any other kind of reason, the `run()` function can raise an [`ExperimentSurrender`](../cw2/cw_error.py) error. This custom error can take a `dict` as payload, which can then be accessed by the finalize. If you have different scenarios in which you want to abort an experimental run, this payload can be accessed through this `surrender` object by the `finalize()` function to react accordingly. See [Advanced Features & Parallelization](09_advanced.md) for more information. 157 | 158 | `crash` is a boolean indication if `initialize()` or `run()` encountered any error, which you did not catch in your implementation. **cw2** ensures that even if a critical error occurs in those two functions, `finalize()` still gets called to perform its shutdown procedure. Following repetitions / runs in the same process should therefore not be impacted by earlier errors. 159 | 160 | 161 | ## 2.4 Iterative Experiment 162 | If you have an experiment with an iterative process, e.g. a for-loop as main component in your `run()` method, you might want to implement the [`AbstractIterativeExperiment`](../cw2/experiment.py) interface. 163 | 164 | This interface comes with additional functionality. For example, you can define the number of iterations in your YAML config file with the `iterations` keyword, and **cw2** handles the for-loop for you. It also provides a [`PandasLogger`](../cw2/cw_data/cw_pd_logger.py) to write your results after each iteration into an excel like structure. 165 | 166 | ### 2.4.1 Iterate 167 | Instead of implementing the `run()` method, you have to implement `iterate()`: 168 | 169 | ```python 170 | def iterate(self, cw_config: dict, rep: int, n: int) -> dict: 171 | return {"Result": "Current Iteration is {}".format(n)} 172 | ``` 173 | In addition to the `cw_config` configuration object and `rep` repetition indicator, it also receives the current iteration `n`. This function should perform one single iteration of your process and return a dict with your results / messages / metrics you want to log. 174 | 175 | The following keys are already reserved: 176 | - `"ts"` timestamp of the iteration results 177 | - `"rep"` repetition counter 178 | - `"iter"` iteration counter 179 | 180 | You can again raise an [`ExperimentSurrender`](../cw2/cw_error.py) error to abort early. In this case, the payload of the error is used as the result for logging. 181 | 182 | ### 2.4.2 Save State 183 | After each `iterate()` call, the `save_state()` function is executed. 184 | It has the same parameters as the `iterate()` function, but does not return a result. 185 | 186 | You could use this function to save a snapshot / model of your experiment after each iteration. 187 | 188 | ```python 189 | def save_state(self, cw_config: dict, rep: int, n: int) -> None: 190 | # Save model every 50 iterations. 191 | if n % 50 == 0: 192 | self.model.to_disk(cw_config['_rep_log_path']) 193 | ``` 194 | 195 | 196 | [Back to Overview](./) 197 | -------------------------------------------------------------------------------- /doc/03_config.md: -------------------------------------------------------------------------------- 1 | # 3. Configuration YAML File 2 | - [3. Configuration YAML File](#3-configuration-yaml-file) 3 | - [3.1. Experiment Configuration](#31-experiment-configuration) 4 | - [3.1.1. Experiment Header](#311-experiment-header) 5 | - [3.1.2. 
Experiment Parameters](#312-experiment-parameters) 6 | - [3.1.2.1 Ablative Parameter Search](#3121-ablative-parameter-search) 7 | - [3.1.3. Recommended Practices: Experiment Configuration](#313-recommended-practices-experiment-configuration) 8 | - [3.1.3.1. Params is your safe space](#3131-params-is-your-safe-space) 9 | - [3.1.3.2. You dont want multiple DEFAULTS...](#3132-you-dont-want-multiple-defaults) 10 | - [3.2. SLURM Configuration](#32-slurm-configuration) 11 | - [3.3. Example Templates](#33-example-templates) 12 | - [3.4. Important Keys](#34-important-keys) 13 | 14 | To configure the execution of the experiment, you need to write a YAML-file. A YAML file consists several documents which begin with `---`: 15 | ```yaml 16 | --- 17 | # First Document 18 | 19 | 20 | --- 21 | # Second Document 22 | 23 | 24 | ``` 25 | 26 | For **cw2** we expect each yaml document to contain a key `name`: 27 | 28 | ```yaml 29 | --- 30 | # First Document 31 | name: "name_1" 32 | 33 | 34 | --- 35 | # Second Document 36 | name: "name_2" 37 | ``` 38 | 39 | The name is used to identify an experiment configuration and can be chosen freely, **EXCEPT** for these names: 40 | 1. `DEFAULT` defines a default configuration. It may only exist *once* in your YAML file. If some parameter settings are shared between your experiments, you can define them inside the `DEFAULT` document. Unless they are specified differently in a named experiment, the settings from the `DEFAULT` will be used. The `DEFAULT` document follows the same structure as a generic experiment configuration document. 41 | 42 | 1. `SLURM` defines a slurm configuration. It may only exist *once* in your YAML file. This document defines the relevant settings for the execution on a computing cluster, and are specific to each cluster. It follows its own special structure. 43 | 44 | 45 | ## 3.1. Experiment Configuration 46 | An experiment configuration (generic or default) has the following structure: 47 | 48 | ```yaml 49 | name: "experiment_name" 50 | 51 | # Experiment Header 52 | # ... 53 | 54 | # Experiment Parameters 55 | # ... 56 | ``` 57 | 58 | ### 3.1.1. Experiment Header 59 | 60 | ```yaml 61 | --- 62 | name: "experiment_name" 63 | 64 | # Required: Can also be set in DEFAULT 65 | path: "path/to/output_dir/" # path for saving the results 66 | repetitions: 5 # number of repeated runs for each parameter combination 67 | 68 | # Required for AbstractIterativeExperiments only. Can also be set in DEFAULT 69 | iterations: 1000 # number of iterations per repetition. 70 | 71 | # Optional: Can also be set in DEFAULT 72 | # Only change these values if you are sure you know what you are doing. 73 | reps_per_job: 1 # number of repetitions in each job. useful for paralellization. defaults to 1. 74 | reps_in_parallel: 1 # number of repetitions in each job that are executed in parallel. defaults to 1. 75 | 76 | 77 | # Experiment Parameters 78 | # ... 79 | # ... 80 | ``` 81 | **All fields can be defined in the `DEFAULT` document and do not need to be set in each experiment specifically.** 82 | 83 | If you want to understand the `reps_per_job` and `reps_in_parallel` settings, please read TODO: BACKGROUND KNOWLEDGE 84 | 85 | ### 3.1.2. Experiment Parameters 86 | The experiment parameter section is highly specific to your code and use case. You can freely define parameter names within the `params:` key, e.g.: 87 | ```yaml 88 | --- 89 | name: "DEFAULT": 90 | # ... 
all required fields 91 | 92 | 93 | --- 94 | name: "ComputerVision" 95 | # required fields are filled by DEFAULT 96 | 97 | # Experiment Parameters 98 | params: 99 | batchsize: 5 100 | pretrained: "imagenet" 101 | 102 | ``` 103 | 104 | You can freely define parameter names and the structure, such as nested parameters, or list values. 105 | 106 | You can use **cw2** to also quickly define a hyperparameter space using the `grid` or `list` keyword. This YAML file using `list` 107 | ```yaml 108 | --- 109 | name: "DEFAULT": 110 | # ... all required fields 111 | 112 | 113 | --- 114 | name: "CV-List" 115 | # required fields are filled by DEFAULT 116 | 117 | # Experiment Parameters 118 | list: 119 | batchsize: [3, 7] 120 | learning_rate: [0.4, 0.8] 121 | ``` 122 | 123 | 124 | is the same as if you had defined: 125 | ```yaml 126 | --- 127 | name: "DEFAULT": 128 | # ... all required fields 129 | 130 | 131 | --- 132 | name: "CV-list-3-04" 133 | # required fields are filled by DEFAULT 134 | 135 | # Experiment Parameters 136 | params: 137 | batchsize: 3 138 | learning_rate: 0.4 139 | 140 | --- 141 | name: "CV-list-7-08" 142 | # required fields are filled by DEFAULT 143 | 144 | # Experiment Parameters 145 | params: 146 | batchsize: 7 147 | learning_rate: 0.8 148 | ``` 149 | 150 | The `list` keyword requires all parameter sets to be of equal length and will combine every n-th value. The `grid` keyword will generate all possible combinations, i.e. in the above example 2x2 = 4 combinations: 151 | 152 | `(3, 0.4) (3, 0.8) (7, 0.4) (7, 0.8)`) 153 | 154 | You can also combine `grid` and `list` in the same experiment. For every `list` combination, the `grid` will be solved, resulting in a total number of `product('grid') * min(length('list'))` runs. 155 | 156 | 157 | The final experiment configurations combining all techniques could look like: 158 | ```yaml 159 | --- 160 | # DEFAULT parameters (Optional) 161 | name: "DEFAULT" # MUST BE 'DEFAULT' 162 | path: "/default/dir/" # location to save results in 163 | repetitions: 5 # number of times one set of parameters is run 164 | 165 | # Implementation default parameters 166 | # Can be overwritten by named experiments. 167 | params: 168 | net_architecture: "vgg16" 169 | 170 | 171 | --- 172 | # Experiment 1 173 | name: "VGG" 174 | 175 | # Required: 176 | # Repetitions are defined in DEFAULT 177 | path: "/vgg/results/" # overwrite DEFAULT setting 178 | 179 | # Experiment Parameters: 180 | # params.net_architecture from DEFAULT 181 | 182 | # Creates all combinations 183 | grid: 184 | learning_rate: [0.5] 185 | batchsize: [5, 10] 186 | 187 | 188 | --- 189 | # Experiment 2 190 | name: "AlexNet" 191 | 192 | # Required settings defined in DEFAULT 193 | 194 | # Experiment Parameters: 195 | params: 196 | net_architecture: "alex_net" # overwrite DEFAULT 197 | learning_rate: 0.9 # no combination tryout 198 | batch_size: 2 # no combination tryout 199 | ``` 200 | 201 | #### 3.1.2.1 Ablative Parameter Search 202 | A new, advanced option is the use of the `ablative` keyword. This mechanic is helpful if you want to estimate the impact of specific hyperparameters. 203 | **cw2** will only subsitute one parameter from the `ablative` section at a time. You can think of it as a shortcut to defining multiple default `params` sections quickly. 
204 | 205 | For example, the following experiment configuration 206 | 207 | ```yaml 208 | --- 209 | name: XYZ 210 | # Required settings defined in DEFAULT 211 | 212 | params: 213 | pretrained: 'imagenet' 214 | initialization: 'kmeans' 215 | 216 | grid: 217 | learning_rate: [0.3, 0.6] 218 | gamma: [1, 2, 3] 219 | 220 | ablative: 221 | pretrained: [False] 222 | initialization: ['random', 'softmax'] 223 | ``` 224 | will result in a total of 24 runs: 6 `grid` kombinations with default `params` settings, 6 with `pretrained: False`, 6 with `initialization: random` and an additional 6 with `initialization: softmax` 225 | 226 | As you can see, the keys under `ablative` are changed one at a time, but never multiple at once. 227 | 228 | **Attention!!** 229 | 230 | `ablative` keys are changed one at a time. You are responsible to supply "default" `params` for when the other parameters under the `ablative` keyword are exchanged. 231 | 232 | 233 | ### 3.1.3. Recommended Practices: Experiment Configuration 234 | 1. `params` is your safe space! 235 | 2. If you feel like you need multiple `DEFAULT` sections, you probably want multiple YAML files 236 | 237 | #### 3.1.3.1. Params is your safe space 238 | A common use case for **cw2** is the hyperparameter search for ML models. Often users only put the hyperparameters they search for into the `params` sections and keep their "constants", like training data location, outside. For example: 239 | 240 | ```yaml 241 | --- 242 | name: "THIS IS NOT RECOMMENDED" 243 | # Required settings 244 | # ... 245 | 246 | params: 247 | learning_rate: 0.3 248 | batch_size: 4 249 | 250 | training_data: "/my/dataset" 251 | speed_of_light: "c" 252 | ``` 253 | 254 | While this will probably not cause an error, I recommend you still define your constants inside the `params` sections. During runtime **cw2** will modify the internal configuration object. While it is highly unlikely, you might overwrite such an internal keyword, leading to unforeseen issues, especially as the software evolves. For now, internal keywords generally begin with an underscore (`_internal_keyword`) and should be avoided. 255 | 256 | To stay on the safe side, put all your custom parameters / arguments / constants inside the `params` section. **cw2** guarantees that all the values inside this section will not be altered without explicit user permission by using a combination keyword like `grid` or `list`. For example: 257 | 258 | ```yaml 259 | --- 260 | name: "THIS IS THE WAY" 261 | # Required settings 262 | # ... 263 | 264 | params: 265 | learning_rate: 0.3 266 | batch_size: 4 267 | training_data: "/my/dataset" 268 | speed_of_light: "c" 269 | ``` 270 | 271 | #### 3.1.3.2. You dont want multiple DEFAULTS... 272 | When running the same experiments for a long time, you may try out different parameters. Especially in the beginning, it is easier to extend the YAML file by adding a new document to the bottom of the file. After a while, you might find you have two "clusters" of configurations, maybe two algorithms / models, that you compare to each other. These models might require very different parameters, and it might not even be possible to share a common `DEFAULT` setting between those two classes. 273 | 274 | In this case, I recommend you split the YAML file into two files, one for each approach. As you are most likely deploying such big experiments on a computing cluster using slurm, you do not have to wait for the results of the first set of tasks before starting the second. 
275 | 276 | ```console 277 | # Naive Approach 278 | u@cluster:~$ python experiment.py BIG_OLD_LEGACY.yml -s 279 | 280 | # Split Approach 281 | u@cluster:~$ python experiment.py model_1.yml -s 282 | u@cluster:~$ python experiment.py model_2.yml -s 283 | ``` 284 | 285 | A new feature to help alleviate this problem, is the linking / import of external yaml files, see [Linking External YAML Files](09_advanced.md). 286 | 287 | 288 | ## 3.2. SLURM Configuration 289 | If you want to run a **cw2** experiment on a SLURM cluster, you __must__ include a document in your YAML configuration file with the `name` key set to `"SLURM"`. During local execution this document is ignored. 290 | 291 | ```yaml 292 | --- 293 | # Slurm config 294 | name: "SLURM" # MUST BE "SLURM" 295 | ``` 296 | 297 | The following fields are __required__ to ensure correct execution of your job on the slurm cluster. Please refer to the [sbatch docu](https://slurm.schedmd.com/sbatch.html) for further explanations. 298 | ```yaml 299 | # ... continued 300 | # Required 301 | job-name: "yourjob" # this will be the experiment's name in slurm 302 | ``` 303 | 304 | The following fields are __required__ to configure your hardware requirements. These are _highly_ cluster specific. Please refer to the [sbatch docu](https://slurm.schedmd.com/sbatch.html) for further explanations. 305 | ```yaml 306 | # ... continued 307 | # Required - Cluster Specific 308 | partition: "dev" 309 | num_parallel_jobs: 120 310 | ntasks: 1 311 | cpus-per-task: 1 312 | time: 30 313 | ``` 314 | 315 | All the following sections are optional arguments. 316 | If they are not present in this slurm configuration, a default behaviour is used. 317 | ```yaml 318 | # ... continued 319 | # Optional 320 | account: "" # Account name to which Cluster Time will be booked. Cluster specific. 321 | mem-per-cpu: 1000 # Optional - Cluster specific 322 | 323 | experiment_copy_dst: "/path/to/code_copy/dst" # optional. dir TO which the current code will be copied. Useful to prevent unintentional changes while the job is in queue. If not set, no copy will be made. 324 | experiment_copy_auto_dst: /path/to/code_copy/dst" # optional. will autoincrement and create a dir TO which the current code will be copied. Useful to prevent unintentional changes while the job is in queue. Overrules experiment_copy_dst. If not set, no copy will be made. 325 | experiment_copy_src: "/path/to/code_copy/src" # optional. dir FROM which the current code will be copied. Useful to prevent unintentional changes while the job is in queue. Defaults to directory of __MAIN__ file. 326 | slurm_log: "/path/to/slurmlog/outputdir" # optional. dir in which slurm output and error logs will be saved. Defaults to EXPERIMENTCONFIG.path 327 | venv: "/path/to/virtual_environment" # optional. path to your virtual environment activate-file 328 | ``` 329 | 330 | If you have further need to configure slurm, you can use all the options offered by the [sbatch docu](https://slurm.schedmd.com/sbatch.html). Please use the following style of defining _keyword_ -> _value_ pairs: 331 | 332 | ```yaml 333 | # ... continued 334 | # Optional SBATCH Arguments 335 | sbatch_args: # Dictionary of SBATCH keywords and arguments 336 | kw_1: "arg1" # Will construct the line: #SBATCH --kw_1 arg1 337 | kw_2: "arg2" # Will construct the line: #SBATCH --kw_2 arg2 338 | ``` 339 | 340 | Sometimes it is necessary to do execute some additional instructions in the linux shell before starting the python process using slurm. 
You can define arbitrarily many additional shell instructions using the following format: 341 | ```yaml 342 | # ... continued 343 | # Optional shell instructions 344 | sh_lines: # List of strings 345 | - "line 1" 346 | - "line 2" 347 | ``` 348 | ## 3.3. Example Templates 349 | This documentation gets updated less frequently than potential feature introductions. 350 | When in doubt, refer to the provided templates: 351 | - [AbstractExperiment Configuration](../templates/abstract_config.yml) 352 | - [AbstractIterativeExperiment Configuration](../templates/iterative_config.yml) 353 | 354 | ## 3.4. Important Keys 355 | These are important configuration keys you have access to in the various methods of your `AbstractExperiment` Implementation. 356 | - `cw_config['params']` is a dictionary containing everything under the `params` keyword, including the merged values from `DEFAULT` and `list`/`grid` keywords. 357 | - `cw_config['_rep_log_path']` is a `str` entry pointing to the _threadsafe_ directory of this repetition. Here all **cw2** logging artifactsof this repitition will be written. If you have any results / model checkpoints you can save them here under the guarantee that no other **cw2** run will interfere. 358 | 359 | [Back to Overview](./) 360 | -------------------------------------------------------------------------------- /doc/04_slurm.md: -------------------------------------------------------------------------------- 1 | # 4. SLURM Introduction 2 | under construction 3 | 4 | 5 | [Back to Overview](./) -------------------------------------------------------------------------------- /doc/05_files.md: -------------------------------------------------------------------------------- 1 | # 5. The CW2 File System 2 | under construction 3 | 4 | 5 | [Back to Overview](./) -------------------------------------------------------------------------------- /doc/06_code_copy.md: -------------------------------------------------------------------------------- 1 | # 6. Code Copy Feature 2 | 3 | - [6. Code Copy Feature](#6-code-copy-feature) 4 | - [6.1. Enabling Code Copy](#61-enabling-code-copy) 5 | - [6.2. Disabling Code Copy](#62-disabling-code-copy) 6 | - [6.3 CLI Options](#63-cli-options) 7 | - [6.4 Known Challenges](#64-known-challenges) 8 | 9 | 10 | When submitting a job to a SLURM cluster, it is likely to wait in queue until requested compute resources become available. During this queuing time, the code can still be changed, as no Python process has been started yet. 11 | 12 | Any changes the user makes to their code in this queueing time, will be in effect once the job starts. For example: 13 | 14 | - User starts with default codebase A. They submit their first slurm job, waiting for results. 15 | - While waiting, the user implements a new feature, resulting in a new codebase A*. 16 | - Wanting to compare A* to the future results of A, the user submits a second job. 17 | - After a while, the results of both jobs are ready. The results of the first job and second job are exactly identical. The user is confused. 18 | 19 | In the above example, both jobs ran with codebase A*, leading to identical results. 20 | 21 | To avoid this problem, we offer the **Code Copy Feature**. 22 | 23 | ## 6.1. Enabling Code Copy 24 | To enable code copy, add the `src` and **one (1)** `dst` argument to your `SLURM` config section: 25 | 26 | ```yaml 27 | # Required for Code-Copy-Feature 28 | experiment_copy_src: "/path/to/code_copy/src" # Code Copy Source directory. 
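# Note: doc/03_config.md lists experiment_copy_src as optional and states that it
# defaults to the directory of the __main__ file when it is not set.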
29 | 30 | # Choose one for Code-Copy-Feature 31 | experiment_copy_dst: "/path/to/code_copy/dst" # Code Copy Destination directory. Will be overwritten if called multiple times. 32 | experiment_copy_auto_dst: "/path/to/code_copy/dst" # Code Copy Destination directory autoincrement. Will create a new subdirectory each time. 33 | ``` 34 | 35 | If you only want to "document" the code, so that you might reproduce it later, you can use the `--zip` CLI option. This will create a Zip Archive of your code in the code-copy `dst`. 36 | 37 | ## 6.2. Disabling Code Copy 38 | To permanently disable code copy, remove the `src` and `dst` arguments from your `SLURM` config section. 39 | To temporarily disable code copy, add `--nocodecopy` to your `python main.py config.yaml` call. 40 | 41 | ## 6.3 CLI Options 42 | For a full and updated list, please refer to the [CLI Args Docu](11_cli_args.md). 43 | | Flag | Name | Effect | 44 | | ---- | --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | 45 | | | --zip | Creates a ZIP archive for documentation purposes of $CWD or, if set, "experiment_copy_src". | 46 | | | --skipsizecheck | Disables a safety size check when Zipping or Code-Copying. The safety prevents unecessarily copying / archiving big files such as training data. | 47 | | | --multicopy | Creates a Code-Copy for each Job. If you are modifying a hardcoded file in your codestructure during runtime, this feature might help ensure multiple runs do not interfere with each other. | 48 | | | --nocodecopy | Do not use the Code-Copy feature, even if the config arguments are specified. | 49 | 50 | ## 6.4 Known Challenges 51 | 1. Code Copy can quickly lead to a storage problems. To avoid this, we have a safety check disabling code-copy if more than 200MB are targeted. This can be disabled via `--skipsizecheck`. 52 | **Attention!!** 53 | If your `src` contains training data, it will also be copied each time. 54 | If your `dst` is inside of `src`, future copies will contain the old ones. This can quickly lead to a file size explosion. 55 | 56 | 2. To ensure that the copied code is executed, `cw2` will modify the `$PYTHONPATH` to point at the `dst` directory. While in my experience this should be stable, it could lead to issues if you are also modifying the `$PYTHONPATH` somewhere. 57 | 58 | As with all more advanced features, please double check upon first execution, if your code is still executed as expected. 59 | 60 | 61 | [Back to Overview](./) -------------------------------------------------------------------------------- /doc/07_logging.md: -------------------------------------------------------------------------------- 1 | # 7. Logging Results 2 | 3 | - [7. Logging Results](#7-logging-results) 4 | - [7.1. Console Logger](#71-console-logger) 5 | - [7.2. Logger Interface](#72-logger-interface) 6 | - [7.3. Advanced Loggers](#73-advanced-loggers) 7 | - [7.3.1. Pandas](#731-pandas) 8 | - [7.3.2. WandB](#732-wandb) 9 | 10 | **cw2** comes with a a variety of logging capabilities. This document will explain how to use the basic "Console" logging to document `print()`-like statements. 11 | 12 | ## 7.1. Console Logger 13 | When you create a `cw2.ClusterWork` instance in your _main_, a custom [python logging](https://docs.python.org/3/howto/logging.html) object is created. 
You can use this object to "print" statements to the console and they will be automatically saved into a logfile on disk in your output folder (TODO: FILESYTEM). Two files will be written: 14 | 15 | - `out.log` contains every message you passed to the logger 16 | - `err.log` contains only error messages 17 | 18 | You can access it from anywhere within a **cw2** program by: 19 | 20 | ```python 21 | from cw2.cw_data import cw_logging 22 | 23 | # retrieve logger 24 | l = cw_logging.get_logger() 25 | 26 | # Print Generic Message() 27 | l.info("This will be written to out.log") 28 | 29 | # Print Error Message 30 | l.error("This will be written to err.log AND out.log") 31 | ``` 32 | 33 | You do not need to initialize or close the logger object. It is handled automatically by **cw2**. 34 | 35 | ## 7.2. Logger Interface 36 | If you want to implement your own custom logger, you have to implement the corresponding interface [`AbstractLogger`](../cw2/cw_data/cw_logging.py) 37 | 38 | ```Python 39 | from cw2.cw_data import cw_logging 40 | 41 | class MyLogger(cw_logging.AbstractLogger): 42 | # ... 43 | 44 | def initialize(self, config: attrdict.AttrDict, rep: int, rep_log_path: str): 45 | # Initialize / Reset the logger for a new repetition 46 | self.log_path = rep_log_path + 'my_file.txt' 47 | self.data_list = [] 48 | 49 | def process(self, data) -> None: 50 | # Processes incoming data. 51 | # Need to do your own check if data is in the format you expect. 52 | print(data) 53 | self.data_list.append(data) 54 | 55 | def finalize(self) -> None: 56 | # Finalize the processing, e.g. write the internal data to disk and close all writers 57 | write_to_disk(self.data, self.log_path) 58 | 59 | def load(self): 60 | # Implement this function to load potential results 61 | self.data = read_from_disk(self.log_path) 62 | return self.data 63 | ``` 64 | 65 | The execution order is very similar to the order of an [`AbstractIterativeExperiment`](../cw2/experiment.py): 66 | 67 | ```Python 68 | log = AbstractLogger() # Initialize only GLOBAL values & CONSTANTS 69 | for r in repetitions: 70 | log.initialize(...) # Initialize / Reset the logger for each repetition. 71 | 72 | for i in iterations: 73 | result = experiment.iterate(...) # Obtain some data from an experiment 74 | log.process(result) # Log the result 75 | 76 | log.finalize() # Finalize / Clean the logger after each repetition 77 | ``` 78 | Each logger is responsible themselves to check results and how handle them. 79 | 80 | 81 | ## 7.3. Advanced Loggers 82 | **cw2** provides advanced logging functionality in form of a [Pandas Dataframe](https://pandas.pydata.org/) Logger for Excel-like table structures, and a [Weights & Biases (WandB)](https://wandb.ai/site) Logger for advanced metrics. 83 | ### 7.3.1. Pandas 84 | ### 7.3.2. WandB 85 | This description is intended as a first primer, and is not tested by me. 86 | 87 | To instantiate the WandB logger, you need to add it to the LoggerArray. 88 | 89 | ```Python 90 | if __name__ == "__main__": 91 | cw = ClusterWork(YourExp) 92 | 93 | cw.add_logger(WandBLogger()) 94 | cw.run() 95 | ``` 96 | 97 | Your `config.yml` find needs to be configured for wandb: 98 | Please refer to the official WandB documentation and the WandBLogger code to learn, what options you have and their effect. 99 | 100 | ```yaml 101 | --- 102 | name: some_exp 103 | repetitions: 5 104 | params: 105 | ... 
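# The `...` above stands for your usual experiment parameters; the `wandb`
# block below is the part read by the WandBLogger.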
106 | 107 | wandb: 108 | project: project_name 109 | group: group_name 110 | ``` 111 | 112 | Logging data with the WandBLogger is the same as every other logger: 113 | 114 | For `AbstractIterativeExperiment` implementations, the complete result dictionary returned by your `iterate()` function will be logged, unless you used the `ignore_keys` parameters during Logger creation: 115 | 116 | ```Python 117 | # logs everything 118 | wandb_l = WandBLogger() 119 | 120 | # logs everything except for the key secret 121 | wandb_l = WandBLogger(ignore_keys=['secret']) 122 | ``` 123 | 124 | When using an `AbstractExperiment` implementation, you have to log results manually: 125 | 126 | ```Python 127 | def run(self, config, repetition, logger): 128 | do_something() 129 | results = { 130 | # fill dictionary 131 | } 132 | logger.process(results) 133 | ``` 134 | 135 | Optional config parameters of the wandb logger: 136 | ```yaml 137 | wandb: 138 | optional_config: value_of_this_config 139 | ``` 140 | - **log_model**: bool, indicates whether the model shall be logged by the wandb or not. 141 | When it is false or not given, nothing happens. 142 | When it is true, the wandb logger will assume you have saved some meaning model files (such as NN weights) under `rep_xx/log/model`. 143 | In the end of each repetition, the logger will upload all the files saved there as an Artifact. 144 | The wandb logger does not care about the content and types of the files in such directory, or how did you save model in such directory. 145 | If such directory does not exist, or it contains no file, then wandb logger will log a warning but will not raise any error to break your experiment. 146 | In your own experiment class, you can get this directory in the initialize function and save model: 147 | ```python 148 | class MyCoolExp(experiment.AbstractIterativeExperiment): 149 | def initialize(self, cw_config: dict, 150 | rep: int, logger: cw_logging.LoggerArray) -> None: 151 | self.net = CoolNet() 152 | 153 | # Get the determined directory to save the model 154 | self.save_model_dir = cw_config.save_model_dir 155 | 156 | # You need to make a new dir of this given save model dir too! 157 | # os.mkdir(...) 158 | 159 | # You may save your model for every M epochs 160 | self.save_model_interval = 100 161 | 162 | def save_state(self, cw_config: dict, rep: int, n: int) -> None: 163 | if self.save_model_dir and ((n + 1) % self.save_model_interval == 0 164 | or (n + 1) == cw_config.iterations): 165 | self.net.save_weights(log_dir=self.save_model_dir, epoch=n + 1) 166 | ``` 167 | 168 | - **model_name**: string, name of the saved model. 169 | It is only useful when **log_model** is set. 170 | If the **model_name** is not set, the saved model will use "model" as its default name. 171 | 172 | 173 | - **log_interval**: int value. If it is given, it indicates that you want to log result in a given interval. 174 | This helps in the experiment which contains too many iterations (epochs), so that you do not want to log stuff for every iteration. 175 | 176 | [Back to Overview](./) -------------------------------------------------------------------------------- /doc/08_loading.md: -------------------------------------------------------------------------------- 1 | # 8. Loading Results 2 | We provide a simple function to access the results from your runs. 
An example can be found in `polynom_tutorial\polynom_load.py`: 3 | 4 | ```Python 5 | from cw2 import cluster_work, cw_logging 6 | 7 | cw = cluster_work.ClusterWork(None) 8 | 9 | # Add all the loggers whose results you want to load. 10 | cw.add_logger(cw_logging.PandasRepSaver()) 11 | # ... 12 | 13 | 14 | # res is a pandas.DataFrame 15 | res = cw.load() 16 | ``` 17 | 18 | The resulting object is a `pandas.DataFrame` with each repetition as a row, and each configuration parameter and logger result as a column. 19 | You can use all the available `pandas` methods to filter and do your own analysis of the results. 20 | 21 | Additionally we offer our own processing functions with an extension of the `pandas` API: `df.cw2` 22 | For example, to select a single repetition in the result dataframe `res` from the example above, use `df.cw2.repetition()`: 23 | 24 | ```Python 25 | # ... 26 | res = cw.load() 27 | repetition_0 = res.cw2.repetition(0) 28 | ``` 29 | 30 | To select all runs with a specific hyper-parameter setting, use `df.cw2.filter()`: 31 | ```Python 32 | # ... 33 | res = cw.load() 34 | 35 | # parameter dict - same structure as CONFIG.params 36 | interesting_params = { 37 | 'param1': 1 38 | } 39 | 40 | interesting_results = res.cw2.filter( 41 | interesting_params 42 | ) 43 | ``` 44 | 45 | 46 | 47 | [Back to Overview](./) -------------------------------------------------------------------------------- /doc/09_advanced.md: -------------------------------------------------------------------------------- 1 | # 9. Advanced Features & Parallelization 2 | - [9. Advanced Features & Parallelization](#9-advanced-features--parallelization) 3 | - [9.1. Error Handling](#91-error-handling) 4 | - [9.2. Parallelization](#92-parallelization) 5 | - [9.2.1 Parallelization Pitfalls](#921-parallelization-pitfalls) 6 | - [9.3. Custom Scheduler](#93-custom-scheduler) 7 | - [9.4. Linking External YAML Files](#94-linking-external-yaml-files) 8 | 9 | ## 9.1. Error Handling 10 | Should any kind of exception be raised during an Experiment execution (`initialize()` or `run()`), **cw2** will abort this experiment run, log the error including stacktrace to a log file in the repetition directory and continue with the next task. 11 | 12 | If you want to end an (iterative) experiment early, you can raise the `cw_error.ExperimentSurrender` exception to gracefully abort the experiment execution. 13 | 14 | The `finalize()` function of you experiment has access to a raised `cw_error.ExperimentSurrender` exception and can access its payload. You can use this to "transmit" data to your finalziation procedure and react accordingly. 15 | 16 | ## 9.2. Parallelization 17 | First, an attempt to establish a terminology: 18 | - Experiment: A collection of hyperparameter runs, defined in the `config.yml` via the `name` key. 19 | - Hyperparameter run: A combination of hyperparameters, as defined by `params` and combination keywords such as `grid`. Can be repeated multiple times 20 | - Repetition: A singular repetition of a hyperparameter run. 21 | - Job (cw2): A computing job, resulting in its own, independend (computing) process. Per default a 1:1 mapping with repetitions. SLURM calls this "unit" of computation task (`cpu-per-task` keyword.) 22 | 23 | The following config results in `2*2 (grid) * 5 (repetitions)` jobs. 24 | ```yaml 25 | --- 26 | name: exp1 27 | repetitions: 5 28 | grid: 29 | a: [1, 2] 30 | b: [3, 4] 31 | ``` 32 | 33 | Often, a cluster has restrictions on how many SLURM tasks / cw2 jobs can be submitted by a user at once. 
For this purpose, the 1:1 mapping of assign each repetition its own job can be changed with the `reps_per_job` config keyword. Multiple repetitions are bundled into one process, which are computed sequentially. 34 | 35 | This can then be futher parallelized by using the `reps_in_parallel` config keyword. This starts a multi-threading parallelization within a job process. 36 | 37 | ### 9.2.1 Parallelization Pitfalls 38 | Currently, we use joblib per default for the multi-threading parallelization. This can cause issues with GPU intensive tasks like Deep Learning or special third party libraries, e.g. Mujoco. 39 | 40 | 41 | ## 9.3. Custom Scheduler 42 | In **cw2** a scheduler is an object responsible for executing a list of jobs (see [Slurm Introduction](04_slurm.md)). In some cases it might be necessary to built your own, custom scheduler. E.g., when the use of parallelization inside of a job is required, and your experiment is not compatible with the default joblib multiprocessing approach (for example through the use of GPU acceleration). 43 | 44 | **cw2** does not offer such advanced schedulers on its own, as they might be highly dependend on your use case and applied libraries. 45 | 46 | To build your custom scheduler, you need to at least implement the [`AbstractScheduler`](../cw2/scheduler.py) interface. 47 | 48 | You might want to use [`LocalScheduler`](../cw2/scheduler.py) as a reference implementation. 49 | 50 | Remember: The Scheduler sees the `Job` objects, which itself might bundle multiple cw2 tasks / repetitions (NOT SLURM tasks). 51 | 52 | This is a very abstract, non-working example how this might look like: 53 | 54 | ```python 55 | import some_gpu_acc 56 | from some_gpu_acc import some_multiproc_pool 57 | 58 | from cw2.scheduler import LocalScheduler 59 | 60 | class CustomScheduler(AbstractScheduler): 61 | def run(self, overwrite: bool = False): 62 | for job in self.joblist: 63 | for t in job.tasks: 64 | some_multiproc_pool(N_CORES).parallelize( 65 | job.run_task(t, overwrite) 66 | ) 67 | 68 | ``` 69 | 70 | To use your new custom scheduler, you have to give it to the [`ClusterWorks`](../cw2/cluster_work.py) instance in your `__main__` function: 71 | 72 | ```python 73 | from cw2 import cluster_work 74 | 75 | if __name__ == "__main__": 76 | # Give the MyExperiment Class, not MyExperiment() Object!! 77 | cw = cluster_work.ClusterWork(MyExperiment) 78 | 79 | # RUN WITH CUSTOM SCHEDULER!!! 80 | cw.run(s = CustomScheduler()) 81 | ``` 82 | 83 | ## 9.4. Linking External YAML Files 84 | It might be helpful to you, to organize your experiment configs into different yaml files which refer to each other. 85 | Similiar to the merging behaviour with a `DEFAULT` configuration, you can now define a "parent" configuration with two new keywords: 86 | 87 | ```yaml 88 | --- 89 | name: "child" 90 | import_path: "some_path" # optional. can be an absolute path, or relative to this yaml file. 91 | # if only import_exp is present, defaults to THIS file. 92 | import_exp: "parent_exp" # optional. basically -e option which external experiment should be the basis. 93 | # The external experiment will be merged with its own default before importing. 94 | # Case Sensitive. Defaults to "DEFAULT". 95 | ``` 96 | 97 | Imported yaml files can be children with imports themselves. A child will always overwrite its parent. Relative paths will always be relative to the file they are written in, NOT to the root or main.py 98 | 99 | Cyclic Linking should be detected and result in an error message. 
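A minimal sketch of two linked files (file names and all values here are illustrative, not taken from the repository):

```yaml
# base.yml -- the "parent" file
---
name: "parent_exp"
path: "/tmp/results"
repetitions: 3
params:
  learning_rate: 0.1
  batch_size: 32
```

```yaml
# child.yml -- imports the parent; on conflicting keys the child wins
---
name: "child"
import_path: "base.yml"   # relative to child.yml, not to main.py
import_exp: "parent_exp"  # case sensitive; defaults to "DEFAULT" if omitted
params:
  learning_rate: 0.01     # overrides the parent's 0.1; batch_size: 32 is inherited
```

Running `python main.py child.yml` would then execute the `child` experiment with the parent's settings merged underneath.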
100 | 101 | The resolution order is: 102 | 1. A named experiment `child` gets merged with its internal `DEFAULT` configuration. Shared keys are "overwritten" by the more specific `child`. 103 | 2. Should after the merge an `import_` key be present in the configuration, the specified `parent_exp` gets loaded. 104 | 3. The `parent_exp` is merged with its internal "Parent"-`DEFAULT`. 105 | 4. Repeat Steps 2-4 for each parent. 106 | 107 | 108 | 109 | [Back to Overview](./) -------------------------------------------------------------------------------- /doc/10_advanced_gpu.md: -------------------------------------------------------------------------------- 1 | # 10. Advanced GPU Scheduling 2 | 3 | Here we discuss advanced GPU Scheduling, i.e., advanced methods to distribute repetitions across GPUs. 4 | There are two main use cases for this: 5 | 6 | 1.) **Putting Multiple Repetition on GPU**: Often, a single repetition is not enough to fully saturate the GPU (especially for the larger 7 | Teslar Models used in HPC clusters). Therefore, it can be beneficial to run multiple repetitions in parallel on a single GPU. 8 | 9 | 2.) **Requesting Single GPUs not possible**: Some HPC Clusters are configured in a way that requesting single GPUs via SLURM is not possible. 10 | In this case, you'll always get multiple GPUs at once, and it's your responsibility to distribute the load across them. 11 | 12 | **Caveat**: Please always have an eye on your jobs and make sure they behave as expected with regard to GPU utilization and runtime, do not fully rely on this! 13 | The underlying multiprocessing is tricky business, behaviour is not always consistent across different machines and python versions. 14 | There can be weird side effects. 15 | 16 | 17 | ## 10.1. The ''gpus_per_rep'' Config Keyword 18 | 19 | The main new functionality to control GPU usage is the `gpus_per_rep` config keyword. Although it's not an actual SLURM key-word, it needs to be specified in the SLURM block of your config. 20 | It can be a float smaller than 1 or an integer lager or equal to 1. It does what the name suggests, it specifies how many GPUs are requested per repetition. 21 | For it to properly work, you need to set the `reps_per_job` and `reps_in_parallel` keys accordingly. 22 | 23 | **Caveat**: I have no idea what happens if different values for `reps_per_job` and `reps_in_parallel` are used throught your YAML. Just don't do it (or test it). 24 | 25 | ### 10.1.1. Example 1: Using only half a GPU per repetition 26 | 27 | Assume your Jobs are small and you want to run 2 on each single GPU. 28 | First, set `gpus_per_rep` to 0.5: 29 | 30 | ```yaml 31 | --- 32 | # Slurm config 33 | name: "SLURM" 34 | partition: "gpu" 35 | job-name: "half_gpu_job" 36 | time: 20 37 | ntasks: 1 38 | cpus-per-task: 8 # 4 CPUs per rep! 39 | gpus_per_rep: 0.5 40 | sbatch_args: 41 | gres: "gpu:1 42 | ``` 43 | 44 | To have both jobs run on the same GPU in parallel, set `reps_per_job` to 2 and `reps_in_parallel` to 2 (you can also 45 | set 'reps_per_job' to a multiple of 2): 46 | 47 | ```yml 48 | --- 49 | # Default 50 | name: DEFAULT 51 | reps_per_job: 2 52 | reps_in_parallel: 2 53 | ``` 54 | Specify your experiment as usual, the total number of repetitions should be a multiple of 2. 55 | 56 | **Caveat**: There is nothing in CW2 to ensure GPU memory and compute is distributed evenly and not exceeded. 57 | It is your responsibility to take care of that! Check your code if it actually profits from this! 
(Don't expect a speed-up of 2x, 58 | more something like > 1.5x) 59 | 60 | ### 10.1.2. Example 2: Using single GPUs when you can only request multiple GPUs 61 | 62 | Assume you are on a HPC-System where the minimum number of GPUs you can request is 4 (e.g. HoreKa). 63 | 64 | First, set `gpus_per_rep` to 1: 65 | 66 | ```yaml 67 | --- 68 | # Slurm config 69 | name: "SLURM" 70 | partition: "accelerated" 71 | job-name: "single_gpu_job" 72 | time: 20 73 | ntasks: 1 74 | cpus-per-task: 16 # 4 CPUs per rep! 75 | gpus_per_rep: 1 76 | sbatch_args: 77 | gres: "gpu:4 # Note how we request 4 GPUs here! 78 | ``` 79 | 80 | To have both jobs run on the same GPU in parallel, set `reps_per_job` to 4 and `reps_in_parallel` to 4 (you can also 81 | set 'reps_per_job' to a multiple of 4): 82 | 83 | ```yml 84 | --- 85 | # Default 86 | name: DEFAULT 87 | reps_per_job: 4 88 | reps_in_parallel: 4 89 | ``` 90 | Specify your experiment as usual, the total number of repetitions should be a multiple of 4. 91 | 92 | ## 10.2 Cluster Specific Schedulers 93 | I (Philipp B.) had issues with using this naively on both the Kluster and on HoreKa, but I am unsure if it's a general problem or just a problem of my code 94 | (Todo: Somebody check with their stuff and tell me). 95 | On both systems the jobs would run super slow, as the processes where stealing each others CPU resources. 96 | I had to use different fixes for both systems, and write specific schedulers for them. 97 | You can use them via the `scheduler` key in the `slurm` block of your config, possible values are currently: 98 | 99 | - "kluster": Explicitly limits the number of threads used (if you use something else than PyTorch, you probably need to have another look at that) 100 | - "horeka": Explicitly handles the cpu affinity of individual repetitions. 101 | 102 | ## 10.3 Use full CPU's computation power in a GPU node. 103 | I (Bruce) had some low CPU computation speed issues when do online RL in Horeka GPU node, where I have to use both CPU (for mujoco) and GPU (for agent update). The reason is that for each experiment's generated gym environment, it can use all the cpus of this node and thus often blocks the access of the other environments or other repititions (when multple repititions are running in parallel). To solve it, I added the assigned CPU cores into the cw_config and you can manually assign theses cores to the environments yourself, e.g. one environment has one distinct core. Something like: 104 | ```python 105 | env_pids = [envs.processes[i].pid for i in range(num_env)] 106 | cores_per_env = len(cw_config["cpu_cores"]) // num_env 107 | cpu_cores_list = list(cw_config["cpu_cores"]) 108 | for i, pid in enumerate(env_pids): 109 | cores_env = cpu_cores_list[i * cores_per_env: (i + 1) * cores_per_env] 110 | util.assign_process_to_cpu(pid, set(cores_env)) 111 | ``` 112 | -------------------------------------------------------------------------------- /doc/11_cli_args.md: -------------------------------------------------------------------------------- 1 | # 11. CLI args 2 | The following args are currently supported by CW2: 3 | | Flag | Name | Effect | 4 | | -------------- | --------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | 5 | | -s | --slurm | Run using SLURM Workload Manager. | 6 | | -o | --overwrite | Overwrite existing results. | 7 | | -e name1 [...] 
| --experiments | Allows to specify which experiments should be run. Corresponds to the `name` field of the configuration YAML. | 8 | | | --zip | Creates a ZIP archive for documentation purposes of $CWD or, if set, "experiment_copy_src". | 9 | | | --skipsizecheck | Disables a safety size check when Zipping or Code-Copying. The safety prevents unecessarily copying / archiving big files such as training data. | 10 | | | --multicopy | Creates a Code-Copy for each Job. If you are modifying a hardcoded file in your codestructure during runtime, this feature might help ensure multiple runs do not interfere with each other. | 11 | | | --nocodecopy | Do not use the Code-Copy feature, even if the config arguments are specified. | 12 | | | --noconsolelog | Disables writing logs with the internal PythonLogger module. Slurm will still create its slurm_logs, so no information is lost. Helps if too many repetitions try to open too many open files and causing errors. | 13 | 14 | 15 | [Back to Overview](./) -------------------------------------------------------------------------------- /doc/README.md: -------------------------------------------------------------------------------- 1 | # CW2 User Documentation 2 | - [1. Quick Start Guide](01_quickstart.md) 3 | --- 4 | ## Basic Features 5 | - [2. Experiment Class](02_experiment.md) 6 | - [3. Configuration File](03_config.md) 7 | - [4. Introduction Slurm](04_slurm.md) 8 | - [5. File System](05_files.md) 9 | --- 10 | ## Advanced Features 11 | - [6. Code Copy](06_code_copy.md) 12 | - [7. Logging Results](07_logging.md) 13 | - [8. Loading Results](08_loading.md) 14 | - [9. Advanced Features & Parallelization](09_advanced.md) 15 | - [10. Advanced GPU Scheduling](10_advanced_gpu.md) 16 | --- 17 | - [10. CLI options at a Glance](11_cli_args.md) 18 | --- 19 | Some sections are still under construction. -------------------------------------------------------------------------------- /polynom_tutorial/external_conf.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: DEFAULT 3 | repetitions: 3000 4 | external_key: "ahahahaha" 5 | 6 | 7 | --- 8 | name: ext_exp 9 | import_path: "/home/max/code/cw2/polynom_tutorial/polynom_config.yml" 10 | import_exp: "polynomial" 11 | grid: 12 | a: [1, 3] 13 | b: [4, 6] 14 | x_1: [7] -------------------------------------------------------------------------------- /polynom_tutorial/polynom_config.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # Slurm config (optional) 3 | name: "SLURM_ret" # MUST BE "SLURM" 4 | 5 | # Required 6 | partition: "dev" 7 | job-name: "polynom" # this will be the experiment's name in slurm 8 | #path_to_template: "/home/max_li/code/cw2/templates/sbatch_template.sh" # Path to YOUR prepared sbatch script 9 | 10 | # Required - Cluster Specific 11 | num_parallel_jobs: 120 12 | ntasks: 1 13 | cpus-per-task: 1 14 | mem-per-cpu: 1000 15 | time: 30 16 | 17 | #experiment_copy_auto_dst: "/home/max/autodst" 18 | #experiment_copy_src: "." 19 | 20 | # Optional 21 | #venv: "/home/max_li/venv/bin/activate" # optional. path to your virtual environment activate-file 22 | 23 | # Optional Code Copy: Both Args are required. 24 | #experiment_copy_dst: "/home/max_li/polylog/code" # optional. dir TO which the current code will be copied. Useful to prevent unintentional changes while the job is in queue. Defaults to EXPERIMENTCONFIG.path/code 25 | #experiment_copy_src: "/home/max_li/code/cw2/polynom_tutorial" # optional. 
dir FROM which the current code will be copied. Useful to prevent unintentional changes while the job is in queue. Defaults to CWD. 26 | sh_lines: 27 | - "# haha" 28 | - "# hihi" 29 | --- 30 | 31 | # DEFAULT parameters (Optional) 32 | name: "DEFAULT" # MUST BE 'DEFAULT' 33 | reps_per_job: 4 34 | reps_in_parallel: 1 35 | 36 | # Required: Can also be set in DEFAULT 37 | path: "/tmp/polylog" # location to save results in 38 | repetitions: 2 # number of times one set of parameters is run 39 | iterations: 1000 # number of iterations per repetition 40 | 41 | # Implementation default parameters 42 | params: 43 | noise: 5 44 | stepsize: 0.05 45 | 46 | --- 47 | # Experiment 1 48 | name: "polynomial" 49 | aah: "aaah" 50 | 51 | params: 52 | x_0: 1 53 | x_1: 2 54 | x_2: 3 55 | x_3: 4 56 | 57 | --- 58 | # Experiment 2 59 | name: "grid_polynom" 60 | repetitions: 1 61 | iterations: 100 62 | 63 | #import_path: "./external_conf.yml" 64 | #import_exp: "ext_exp" 65 | 66 | params: 67 | x_0: 0 68 | x_3: 0 69 | 70 | ablative: 71 | x_0: [2] 72 | x_3: [6, 12] 73 | 74 | # A total of 12 Runs will be created 75 | grid: 76 | x_1: [3, 4] 77 | x_2: [3, 4, 5] 78 | 79 | list: 80 | x_4: [2, 3] 81 | x_5: [2, 3] 82 | -------------------------------------------------------------------------------- /polynom_tutorial/polynom_load.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | 3 | from cw2 import cluster_work 4 | from cw2.cw_data import cw_logging, cw_pd_logger 5 | 6 | if __name__ == "__main__": 7 | cw = cluster_work.ClusterWork(None) 8 | cw.add_logger(cw_pd_logger.PandasLogger()) 9 | 10 | # load() -> pd.DataFrame 11 | df = cw.load() 12 | 13 | rep0 = df.cw2.filter({"x_1": 0}) 14 | 15 | print(df.head()) 16 | 17 | print(df.cw2.flatten_pd_log().shape) 18 | 19 | for i, job in df.iterrows(): 20 | single_df = job["PandasLogger"] 21 | single_df[["sample_y", "true_y"]].plot.line() 22 | plt.savefig(job["rep_path"] + "plot.png") 23 | -------------------------------------------------------------------------------- /polynom_tutorial/polynom_main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | 4 | from cw2 import cluster_work, cw_error, experiment 5 | from cw2.cw_data import cw_logging, cw_pd_logger 6 | 7 | 8 | class Polynomial(experiment.AbstractIterativeExperiment): 9 | # ... 10 | 11 | def initialize( 12 | self, config: dict, rep: int, logger: cw_logging.AbstractLogger 13 | ) -> None: 14 | random.seed(rep) 15 | 16 | def iterate(self, config: dict, rep: int, n: int) -> dict: 17 | if rep > 0: 18 | # You can raise an Experiment Surrender Exception to gracefully end a task prematurely 19 | raise cw_error.ExperimentSurrender() 20 | 21 | if n > 10: 22 | # Should a task raise an Exception, it will be logged and the next job execution starts. 23 | y = 3 / 0 24 | 25 | params = config["params"] 26 | print(params) 27 | x_0 = params["x_0"] 28 | x_1 = params["x_1"] 29 | x_2 = params["x_2"] 30 | x_3 = params["x_3"] 31 | 32 | x = params["stepsize"] * n 33 | y = x_3 * (x**3) + x_2 * (x**2) + x_1 * x + x_0 34 | 35 | y_noise = y + (random.randint(-10, 10) / 10.0) * params["noise"] 36 | 37 | return {"true_y": y, "sample_y": y_noise} 38 | 39 | def save_state(self, config: dict, rep: int, n: int) -> None: 40 | pass 41 | 42 | def finalize(self, surrender=None, crash: bool = False): 43 | # Use cw_logging.getLogger() for logging functionality 44 | cw_logging.getLogger().info("Finished. 
Closing Down.") 45 | 46 | 47 | if __name__ == "__main__": 48 | import sys 49 | 50 | sys.argv.append("polynom_config.yml") 51 | sys.argv.append("-o") 52 | sys.argv.append("-s") 53 | sys.argv.append("--debug") 54 | 55 | cw = cluster_work.ClusterWork(Polynomial) 56 | cw.add_logger(cw_pd_logger.PandasLogger()) 57 | cw.run() 58 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # To use a consistent encoding 2 | from codecs import open 3 | from os import path 4 | 5 | from setuptools import find_packages, setup 6 | 7 | here = path.abspath(path.dirname(__file__)) 8 | 9 | # Get the long description from the README file 10 | with open(path.join(here, "README.md"), encoding="utf-8") as f: 11 | long_description = f.read() 12 | 13 | setup( 14 | name="cw2", 15 | # Versions should comply with PEP440. For a discussion on single-sourcing 16 | # the version across setup.py and the project code, see 17 | # https://packaging.python.org/en/latest/single_source_version.html 18 | version="2.5.1", 19 | description="A reengineered framework to run experiments on a computing cluster.", 20 | long_description=long_description, 21 | long_description_content_type="text/markdown", 22 | # The project's main homepage. 23 | url="https://github.com/ALRhub/cw2", 24 | # Author details 25 | author="Maximilian Li", 26 | author_email="maximilian.xiling.li@gmail.com", 27 | license="MIT", 28 | classifiers=[ 29 | "Development Status :: 5 - Production/Stable", 30 | "Intended Audience :: Science/Research", 31 | "Intended Audience :: Education", 32 | "Topic :: System :: Distributed Computing", 33 | "Topic :: Scientific/Engineering", 34 | "Topic :: Scientific/Engineering :: Information Analysis", 35 | "Topic :: Education", 36 | "Programming Language :: Python :: 3", 37 | "Programming Language :: Python :: 3.3", 38 | "Programming Language :: Python :: 3.4", 39 | "Programming Language :: Python :: 3.5", 40 | "Programming Language :: Python :: 3.6", 41 | "Programming Language :: Python :: 3.7", 42 | "Programming Language :: Python :: 3.8", 43 | "Programming Language :: Python :: 3.9", 44 | "Programming Language :: Python :: 3.10", 45 | "Environment :: Console", 46 | ], 47 | python_requires=">=3", 48 | # What does your project relate to? 49 | keywords=["scientific", "experiments", "distributed computing", "mpi", "research"], 50 | packages=find_packages(), 51 | package_data={"cw2": ["default_sbatch.sh"]}, 52 | install_requires=["PyYAML", "numpy", "pandas", "joblib"], 53 | ) 54 | -------------------------------------------------------------------------------- /templates/abstract_config.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # Slurm config (optional) 3 | name: "SLURM" # MUST BE "SLURM" 4 | 5 | # Required 6 | partition: "dev" 7 | job-name: "experiment" # this will be the experiment's name in slurm 8 | 9 | # Required - Cluster Specific 10 | num_parallel_jobs: 120 11 | ntasks: 1 12 | cpus-per-task: 1 13 | time: 30 # Runtime in Wallclock Time. 
Can be int or str in form "HH:MM:SS" 14 | 15 | mem-per-cpu: 1000 # Optional - Cluster specific 16 | 17 | # Optional 18 | path_to_template: "/path/to/sbatch_template.sh" # Path to YOUR prepared sbatch script. Uses supplied default template if not specified 19 | account: "" # Account name to which Cluster Time will be booked. Cluster dependent. 20 | slurm_log: "/path/to/slurmlog/outputdir" # optional. dir in which slurm output and error logs will be saved. Defaults to EXPERIMENTCONFIG.path/slurmlog 21 | venv: "/path/to/virtual_environment/bin/activate" # optional. path to your virtual environment activate-file 22 | 23 | # Optional Code Copy: Both Args are required. 24 | experiment_copy_dst: "/path/to/code_copy/dst" # optional. dir TO which the current code will be copied. Useful to prevent unintentional changes while the job is in queue. 25 | experiment_copy_auto_dst: "/path/to/code_copy/dst" # will autoincrement and create a dir TO which the current code will be copied. Useful to prevent unintentional changes while the job is in queue. 26 | experiment_copy_src: "/path/to/code_copy/src" # optional. dir FROM which the current code will be copied. Useful to prevent unintentional changes while the job is in queue. 27 | 28 | # Optional SBATCH Arguments 29 | sbatch_args: # Dictionary of SBATCH keywords and arguments 30 | kw_1: "arg1" # Will construct the line: #SBATCH --kw_1 arg1 31 | kw_2: "arg2" # Will construct the line: #SBATCH --kw_2 arg2 32 | 33 | # Optional shell instructions 34 | sh_lines: # List of strings 35 | - "line 1" 36 | - "line 2" 37 | 38 | --- 39 | # DEFAULT parameters (Optional) 40 | name: "DEFAULT" # MUST BE 'DEFAULT' 41 | 42 | # Implementation default parameters 43 | # Will be overwritten by named experiments. 44 | params: 45 | param_1: "default_value" 46 | 47 | --- 48 | # Experiment 1 49 | name: "experiment_name" 50 | 51 | # Required: Can also be set in DEFAULT 52 | path: "path/to/output_dir/" # location to save results in 53 | repetitions: 5 # number of times one set of parameters is run 54 | 55 | # Optional: Can also be set in DEFAULT 56 | # Only use these values if you are sure you know what you are doing. 57 | # Refer to Chapter 9 of the Docs for more info 58 | reps_per_job: 1 # number of repetitions in each job. useful for paralellization. defaults to 1. 59 | reps_in_parallel: 1 # number of repetitions in each job that are executed in parallel. defaults to 1. 60 | 61 | # Experiment Parameters: Can also be set in DEFAULT. Can be a nested dictionary. 62 | params: 63 | param_1: "exp_value_1" # overwrites Default 64 | param_2: "exp_value_2" # new experiment specific parameter 65 | 66 | # Dynamically assigned parameters. Can be EITHER 'list' or 'grid'. Can NOT be set in DEFAULT. Can be a nested dictionary. 67 | list: # alternative - 'grid:' 68 | param_3: [1, 2] 69 | param_4: [3, 4] -------------------------------------------------------------------------------- /templates/abstract_main.py: -------------------------------------------------------------------------------- 1 | from cw2 import cluster_work, cw_error, experiment 2 | from cw2.cw_data import cw_logging 3 | 4 | 5 | class MyExperiment(experiment.AbstractExperiment): 6 | # ... 7 | 8 | def initialize( 9 | self, config: dict, rep: int, logger: cw_logging.LoggerArray 10 | ) -> None: 11 | cw_logging.getLogger().info( 12 | "Ready to start repetition {}. 
Resetting everything.".format(rep) 13 | ) 14 | 15 | def run(self, config: dict, rep: int, logger: cw_logging.LoggerArray) -> None: 16 | # Do Something non-iteratively and logging the result. 17 | cw_logging.getLogger().info("Doing Something.") 18 | logger.process("Some Result") 19 | cw_logging.getLogger().warning("Something went wrong") 20 | 21 | def finalize( 22 | self, surrender: cw_error.ExperimentSurrender = None, crash: bool = False 23 | ): 24 | if surrender is not None: 25 | cw_logging.getLogger().info("Run was surrendered early.") 26 | 27 | if crash: 28 | cw_logging.getLogger().warning("Run crashed with an exception.") 29 | cw_logging.getLogger().info("Finished. Closing Down.") 30 | 31 | 32 | if __name__ == "__main__": 33 | cw = cluster_work.ClusterWork(MyExperiment) 34 | 35 | # If loggers are wanted, must be instantiated manually 36 | logger1 = cw_logging.AbstractLogger() 37 | logger2 = cw_logging.AbstractLogger() 38 | cw.add_logger(logger1) 39 | cw.add_logger(logger2) 40 | 41 | cw.run() 42 | -------------------------------------------------------------------------------- /templates/iterative_config.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # Slurm config (optional) 3 | name: "SLURM" # MUST BE "SLURM" 4 | 5 | # Required 6 | partition: "dev" 7 | job-name: "experiment" # this will be the experiment's name in slurm 8 | 9 | # Required - Cluster Specific 10 | num_parallel_jobs: 120 11 | ntasks: 1 12 | cpus-per-task: 1 13 | time: 30 # Runtime in Wallclock Time. Can be int or str in form "HH:MM:SS" 14 | 15 | mem-per-cpu: 1000 # Optional - Cluster specific 16 | 17 | # Optional 18 | path_to_template: "/path/to/sbatch_template.sh" # Path to YOUR prepared sbatch script. Uses supplied default template if not specified 19 | account: "" # Account name to which Cluster Time will be booked. Cluster dependent. 20 | slurm_log: "/path/to/slurmlog/outputdir" # optional. dir in which slurm output and error logs will be saved. Defaults to EXPERIMENTCONFIG.path/slurmlog 21 | venv: "/path/to/virtual_environment/bin/activate" # optional. path to your virtual environment activate-file 22 | 23 | # Optional Code Copy: Both Args are required. 24 | experiment_copy_dst: "/path/to/code_copy/dst" # optional. dir TO which the current code will be copied. Useful to prevent unintentional changes while the job is in queue. 25 | experiment_copy_auto_dst: "/path/to/code_copy/dst" # will autoincrement and create a dir TO which the current code will be copied. Useful to prevent unintentional changes while the job is in queue. 26 | experiment_copy_src: "/path/to/code_copy/src" # optional. dir FROM which the current code will be copied. Useful to prevent unintentional changes while the job is in queue. 27 | 28 | # Optional SBATCH Arguments 29 | sbatch_args: # Dictionary of SBATCH keywords and arguments 30 | kw_1: "arg1" # Will construct the line: #SBATCH --kw_1 arg1 31 | kw_2: "arg2" # Will construct the line: #SBATCH --kw_2 arg2 32 | 33 | # Optional shell instructions 34 | sh_lines: # List of strings 35 | - "line 1" 36 | - "line 2" 37 | 38 | --- 39 | # DEFAULT parameters (Optional) 40 | name: "DEFAULT" # MUST BE 'DEFAULT' 41 | 42 | # Implementation default parameters 43 | # Will be overwritten by named experiments. 
44 | params: 45 | param_1: "default_value" 46 | 47 | 48 | --- 49 | # Experiment 1 50 | name: "experiment_name" 51 | 52 | # Required: Can also be set in DEFAULT 53 | path: "path/to/output_dir/" # location to save results in 54 | repetitions: 5 # number of times one set of parameters is run 55 | iterations: 1000 # number of iterations per repetition 56 | 57 | # Optional: Can also be set in DEFAULT 58 | # Only use these values if you are sure you know what you are doing. 59 | # Refer to Chapter 9 of the Docs for more info 60 | reps_per_job: 1 # number of repetitions in each job. useful for parallelization. defaults to 1. 61 | reps_in_parallel: 1 # number of repetitions in each job that are executed in parallel. defaults to 1. 62 | 63 | # Experiment Parameters: Can also be set in DEFAULT. 64 | params: 65 | param_1: "exp_value_1" # overwrites Default 66 | param_2: "exp_value_2" # new experiment specific parameter 67 | 68 | # Dynamically assigned parameters. Can be EITHER 'list' or 'grid'. Can NOT be set in DEFAULT. Can be a nested dictionary. 69 | list: # alternative - 'grid:' 70 | param_3: [1, 2] 71 | param_4: [3, 4] 72 | -------------------------------------------------------------------------------- /templates/iterative_main.py: -------------------------------------------------------------------------------- 1 | from cw2 import cluster_work, cw_error, experiment 2 | from cw2.cw_data import cw_logging 3 | 4 | 5 | class MyIterativeExperiment(experiment.AbstractIterativeExperiment): 6 | # ... 7 | 8 | def initialize( 9 | self, config: dict, rep: int, logger: cw_logging.LoggerArray 10 | ) -> None: 11 | cw_logging.getLogger().info( 12 | "Ready to start repetition {}. Resetting everything.".format(rep) 13 | ) 14 | 15 | def iterate(self, config: dict, rep: int, n: int) -> dict: 16 | if n > 50: 17 | raise cw_error.ExperimentSurrender({"Result": "End execution early."}) 18 | 19 | return {"Result": "Current Iteration is {}".format(n)} 20 | 21 | def save_state(self, config: dict, rep: int, n: int) -> None: 22 | if n % 50 == 0: 23 | cw_logging.getLogger().info("I am stateless. Nothing to write to disk.") 24 | 25 | def finalize( 26 | self, surrender: cw_error.ExperimentSurrender = None, crash: bool = False 27 | ): 28 | if surrender is not None: 29 | cw_logging.getLogger().info("Run was surrendered early.") 30 | 31 | if crash: 32 | cw_logging.getLogger().warning("Run crashed with an exception.") 33 | cw_logging.getLogger().info("Finished.
Closing Down.") 34 | 35 | 36 | if __name__ == "__main__": 37 | cw = cluster_work.ClusterWork(MyIterativeExperiment) 38 | cw.add_logger(cw_logging.AbstractLogger()) 39 | cw.run() 40 | -------------------------------------------------------------------------------- /templates/sbatch_template.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -p %%partition%% 3 | # #SBATCH -A %%account%% 4 | #SBATCH -J %%job-name%% 5 | #SBATCH --array 0-%%last_job_idx%%%%%num_parallel_jobs%% 6 | 7 | # Please use the complete path details : 8 | #SBATCH -D %%experiment_execution_dir%% 9 | #SBATCH -o %%slurm_log%%/out_%A_%a.log 10 | #SBATCH -e %%slurm_log%%/err_%A_%a.log 11 | 12 | # Cluster Settings 13 | #SBATCH -n %%ntasks%% # Number of tasks 14 | #SBATCH -c %%cpus-per-task%% # Number of cores per task 15 | #SBATCH --mem-per-cpu=%%mem-per-cpu%% # Main memory in MByte per MPI task 16 | #SBATCH -t %%time%% # 1:00:00 Hours, minutes and seconds, or '#SBATCH -t 10' - only minutes 17 | 18 | %%sbatch_args%% 19 | # ------------------------------- 20 | 21 | # Activate the virtualenv / conda environment 22 | %%venv%% 23 | 24 | 25 | # Export Pythonpath 26 | %%pythonpath%% 27 | 28 | # Additional Instructions from CONFIG.yml 29 | %%sh_lines%% 30 | 31 | python3 %%python_script%% %%path_to_yaml_config%% -j $SLURM_ARRAY_TASK_ID %%cw_args%% 32 | 33 | # THIS WAS BUILT FROM THE DEFAULLT SBATCH TEMPLATE -------------------------------------------------------------------------------- /test/horeka_scheduler_test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ALRhub/cw2/7a7b8a235731e8576e1616a46a61f442cd616cd3/test/horeka_scheduler_test/__init__.py -------------------------------------------------------------------------------- /test/horeka_scheduler_test/horeka_config.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # Slurm config 3 | name: "SLURM" 4 | partition: "accelerated" 5 | job-name: "horeka_test_job" 6 | num_parallel_jobs: 120 7 | time: 2 8 | ntasks: 1 9 | cpus-per-task: 10 10 | gpus_per_rep: 1 11 | sbatch_args: 12 | gres: "gpu:4" 13 | 14 | --- 15 | # DEFAULT 16 | name: "test" 17 | repetitions: 20 18 | path: "./hs_test_log" 19 | reps_per_job: 4 20 | reps_in_parallel : 4 21 | params: 22 | dummy: 5 23 | 24 | 25 | -------------------------------------------------------------------------------- /test/horeka_scheduler_test/test_experiment.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import sys 4 | import time 5 | 6 | import numpy as np 7 | import torch 8 | 9 | from cw2.cw_data import cw_logging 10 | from cw2.experiment import AbstractExperiment, ExperimentSurrender 11 | 12 | # os.environ["CUDA_VISIBLE_DEVICES"] = "0" 13 | 14 | 15 | class TestExperiment(AbstractExperiment): 16 | def initialize( 17 | self, cw_config: dict, rep: int, logger: cw_logging.LoggerArray 18 | ) -> None: 19 | np.random.seed(rep * 13) 20 | print( 21 | "Hello, repetition ", 22 | rep, 23 | "here. 
I see ", 24 | torch.cuda.device_count(), 25 | " GPU(s)", 26 | ) 27 | if torch.cuda.is_available(): 28 | device = torch.device("cuda") 29 | print(torch.cuda.get_device_name(device)) 30 | print(torch.cuda.get_device_properties(device)) 31 | 32 | def run(self, cw_config: dict, rep: int, logger: cw_logging.LoggerArray) -> None: 33 | sleep_time = np.random.rand() * 10 34 | print("Going to sleep for {:.5f} sec".format(sleep_time)) 35 | time.sleep(sleep_time) 36 | exit_gracefully = np.random.rand() < 0.5 37 | if exit_gracefully: 38 | print("Done (Rep", rep, ")") 39 | return 40 | else: 41 | raise Exception("AAHHH I AM DYING! (Rep ", rep, ")") 42 | 43 | def finalize(self, surrender: ExperimentSurrender = None, crash: bool = False): 44 | pass 45 | 46 | 47 | if __name__ == "__main__": 48 | from cw2.cluster_work import ClusterWork 49 | 50 | sys.argv.append("horeka_config.yml") 51 | sys.argv.append("-o") 52 | # sys.argv.append("-s") 53 | 54 | cw = ClusterWork(TestExperiment) 55 | cw.run() 56 | -------------------------------------------------------------------------------- /test/test_cw_config.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from typing import Dict 3 | from unittest import main 4 | 5 | from cw2.cw_config import conf_unfolder, cw_config 6 | 7 | 8 | class TestParamsExpansion(unittest.TestCase): 9 | def setUp(self) -> None: 10 | self.conf_obj = cw_config.Config() 11 | 12 | def expand_dict(self, _d: dict) -> list: 13 | d = _d.copy() 14 | expands = conf_unfolder.expand_experiments([d], False, False) 15 | return [self.remove_non_param_keys(e) for e in expands] 16 | 17 | def create_minimal_dict(self) -> dict: 18 | return {"name": "exp", "path": "test", "_debug": False} 19 | 20 | def remove_non_param_keys(self, _d: dict) -> dict: 21 | d = _d.copy() 22 | d["path"] = d["_basic_path"] 23 | del d["_basic_path"] 24 | del d["_experiment_name"] 25 | del d["_nested_dir"] 26 | del d["log_path"] 27 | return d 28 | 29 | def test_no_expansion(self): 30 | no_params = self.create_minimal_dict() 31 | 32 | res = self.expand_dict(no_params) 33 | self.assertEqual(1, len(res)) 34 | self.assertDictEqual(no_params, res[0]) 35 | 36 | params_dict = self.create_minimal_dict() 37 | params_dict["params"] = {"a": 1, "b": [2, 3], "c": {"c_1": "a", "c_2": "b"}} 38 | 39 | res = self.expand_dict(params_dict) 40 | self.assertEqual(1, len(res)) 41 | self.assertDictEqual(params_dict, res[0]) 42 | 43 | def test_grid_exp(self): 44 | g = self.create_minimal_dict() 45 | g["grid"] = { 46 | "a": [1], 47 | "b": [2], 48 | } 49 | 50 | res = self.expand_dict(g) 51 | self.assertEqual(1, len(res)) 52 | 53 | g["grid"]["a"] = [3, 4] 54 | res = self.expand_dict(g) 55 | self.assertEqual(2, len(res)) 56 | 57 | g["grid"]["b"] = [11, 12, 13] 58 | res = self.expand_dict(g) 59 | self.assertEqual(6, len(res)) 60 | 61 | g["grid"]["c"] = {"c1": ["c1"], "c2": ["c2a", "c2b"]} 62 | res = self.expand_dict(g) 63 | self.assertEqual(12, len(res)) 64 | 65 | def test_list_exp(self): 66 | g = self.create_minimal_dict() 67 | g["list"] = { 68 | "a": [1], 69 | "b": [2], 70 | } 71 | 72 | res = self.expand_dict(g) 73 | self.assertEqual(1, len(res)) 74 | 75 | g["list"]["a"] = [3, 4] 76 | res = self.expand_dict(g) 77 | self.assertEqual(1, len(res)) 78 | 79 | g["list"]["b"] = [11, 12, 13] 80 | res = self.expand_dict(g) 81 | self.assertEqual(2, len(res)) 82 | 83 | g["list"]["c"] = {"c1": ["c1"], "c2": ["c2a, c2b"]} 84 | res = self.expand_dict(g) 85 | self.assertEqual(1, len(res)) 86 | 87 | def 
test_grid_and_list(self): 88 | g = self.create_minimal_dict() 89 | g["list"] = { 90 | "a": [1], 91 | "b": [2], 92 | } 93 | g["grid"] = { 94 | "c": [1], 95 | "d": [2], 96 | } 97 | res = self.expand_dict(g) 98 | self.assertEqual(1, len(res)) 99 | 100 | g["list"]["a"] = [3, 4] 101 | g["list"]["b"] = [11, 12, 13] 102 | res = self.expand_dict(g) 103 | self.assertEqual(2, len(res)) 104 | 105 | g["grid"]["c"] = [3, 4] 106 | res = self.expand_dict(g) 107 | self.assertEqual(4, len(res)) 108 | 109 | g["grid"]["cd"] = {"c1": ["c1"], "c2": ["c2a", "c2b"]} 110 | res = self.expand_dict(g) 111 | self.assertEqual(8, len(res)) 112 | 113 | g["list"]["cl"] = {"c1": ["c1"], "c2": ["c2a, c2b"]} 114 | res = self.expand_dict(g) 115 | self.assertEqual(4, len(res)) 116 | 117 | def test_multi_listt(self): 118 | g = self.create_minimal_dict() 119 | g["list1"] = { 120 | "a": [1], 121 | "b": [2], 122 | } 123 | g["list--2"] = { 124 | "c": [1], 125 | "d": [2], 126 | } 127 | res = self.expand_dict(g) 128 | self.assertEqual(1, len(res)) 129 | 130 | g["list1"]["a"] = [3, 4] 131 | g["list1"]["b"] = [11, 12, 13] 132 | res = self.expand_dict(g) 133 | self.assertEqual(2, len(res)) 134 | 135 | g["list--2"]["c"] = [3, 4] 136 | g["list--2"]["d"] = [3, 4] 137 | res = self.expand_dict(g) 138 | self.assertEqual(4, len(res)) 139 | 140 | g["list1"]["a"] = [11, 12, 13] 141 | g["list1"]["b"] = [11, 12, 13] 142 | g["list--2"]["c"] = [11, 12, 13] 143 | g["list--2"]["d"] = [11, 12, 13] 144 | res = self.expand_dict(g) 145 | self.assertEqual(9, len(res)) 146 | 147 | def test_ablation(self): 148 | g = self.create_minimal_dict() 149 | g["list1"] = { 150 | "a": [1], 151 | "b": [2], 152 | } 153 | g["ablative"] = { 154 | "c": [3], 155 | } 156 | res = self.expand_dict(g) 157 | self.assertEqual(1, len(res)) 158 | 159 | g["ablative"] = { 160 | "c": [3, 4], 161 | } 162 | res = self.expand_dict(g) 163 | self.assertEqual(2, len(res)) 164 | 165 | g["ablative"] = {"c": [3], "d": [4]} 166 | res = self.expand_dict(g) 167 | self.assertEqual(2, len(res)) 168 | 169 | g["ablative"] = {"c": [3], "d": [4, 5]} 170 | g["list1"] = { 171 | "a": [1, 2], 172 | "b": [2, 3], 173 | } 174 | res = self.expand_dict(g) 175 | self.assertEqual(6, len(res)) 176 | 177 | 178 | if __name__ == "__main__": 179 | unittest.main() 180 | --------------------------------------------------------------------------------