├── .gitignore ├── docs ├── requirements.txt ├── img │ ├── gym.gif │ ├── job-resource.drawio │ ├── job-resource.svg │ └── cluster-resourcepool.svg ├── modules.rst ├── tutorials │ └── index.rst ├── index.rst ├── Makefile ├── design.rst ├── make.bat ├── schedgym.rst └── conf.py ├── mypy.ini ├── setup.cfg ├── pyproject.toml ├── .coveragerc ├── schedgym ├── __init__.py ├── envs │ ├── __init__.py │ ├── render.py │ ├── deeprm_env.py │ ├── simulator.py │ ├── compact_env.py │ ├── base.py │ └── workload.py ├── scheduler │ ├── random_scheduler.py │ ├── backfilling_scheduler.py │ ├── __init__.py │ ├── fifo_scheduler.py │ ├── sjf_scheduler.py │ ├── tetris_scheduler.py │ ├── packer_scheduler.py │ ├── easy_scheduler.py │ ├── null_scheduler.py │ └── scheduler.py ├── workload │ ├── __init__.py │ ├── base.py │ ├── swf_parser.py │ ├── distribution.py │ └── trace.py ├── resource.py ├── simulator.py ├── heap.py ├── pool.py ├── event.py ├── cluster.py └── job.py ├── requirements.txt ├── LICENSE ├── sjf-agent.py ├── .github └── workflows │ ├── pythonpackage.yml │ └── codeql-analysis.yml ├── README.rst ├── setup.py └── deeprm-agent.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | venv 3 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | Sphinx 2 | docutils 3 | nbsphinx 4 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | 3 | ignore_missing_imports = True 4 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | version = attr: schedgym.__version__ 3 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "wheel", "Cython"] -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = schedgym/envs/render.py 3 | relative_files = True 4 | -------------------------------------------------------------------------------- /docs/img/gym.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/renatolfc/sched-rl-gym/HEAD/docs/img/gym.gif -------------------------------------------------------------------------------- /schedgym/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | __version__ = '0.1.0' 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.16.3 2 | intervaltree==3.0.2 3 | pytest==4.6.3 4 | coverage==5.3 5 | pytest-cov 6 | parallelworkloads 7 | cython 8 | -------------------------------------------------------------------------------- /docs/modules.rst: -------------------------------------------------------------------------------- 1 | Source code modules documentation 2 | ================================= 3 | 4 | .. 
toctree:: 5 | :maxdepth: 4 6 | 7 | schedgym 8 | -------------------------------------------------------------------------------- /docs/tutorials/index.rst: -------------------------------------------------------------------------------- 1 | Tutorials 2 | ========= 3 | 4 | Currently, we have a single tutorial that shows how to implement a PPO agent, 5 | but in the future we will add more alternatives. 6 | 7 | .. toctree:: 8 | :maxdepth: 1 9 | :caption: Tutorials: 10 | 11 | ppo.ipynb 12 | -------------------------------------------------------------------------------- /schedgym/envs/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import logging 5 | from gym.envs.registration import register 6 | 7 | from .deeprm_env import DeepRmEnv 8 | from .compact_env import CompactRmEnv 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | register( 13 | id='DeepRM-v0', 14 | nondeterministic=False, 15 | entry_point=f'schedgym.envs.deeprm_env:{DeepRmEnv.__name__}', 16 | ) 17 | 18 | register( 19 | id='CompactRM-v0', 20 | nondeterministic=False, 21 | entry_point=f'schedgym.envs.compact_env:{CompactRmEnv.__name__}', 22 | ) 23 | -------------------------------------------------------------------------------- /schedgym/scheduler/random_scheduler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """random - a random scheduler""" 5 | 6 | import random 7 | 8 | from schedgym.scheduler import PackerScheduler 9 | 10 | 11 | class RandomScheduler(PackerScheduler): 12 | """A random scheduling policy. 13 | 14 | This reuses functionality from the :class:`PackerScheduler`. Therefore, it 15 | only needs to define a random priority function. 16 | """ 17 | 18 | def get_priority(self, _) -> int: 19 | """Random priority function for random scheduler.""" 20 | return random.randint(0, len(self.queue_admission) - 1) 21 | -------------------------------------------------------------------------------- /schedgym/workload/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """workload - Package for generators of load for a cluster. 5 | 6 | Supports generative workloads, based on probability distributions, and 7 | trace-based workloads in the Standard Workload Format. 8 | """ 9 | 10 | from .base import WorkloadGenerator 11 | from .distribution import BinomialWorkloadGenerator 12 | from .distribution import DistributionalWorkloadGenerator 13 | from .trace import TraceGenerator, SwfGenerator 14 | 15 | __all__ = [ 16 | 'WorkloadGenerator', 17 | 'DistributionalWorkloadGenerator', 18 | 'BinomialWorkloadGenerator', 19 | 'TraceGenerator', 20 | 'SwfGenerator', 21 | ] 22 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to sched-rl-gym's documentation! 2 | ======================================== 3 | 4 | In this documentation site you will find information about the general design 5 | of the environment, a set of tutorials on how to instantiate and use it, along 6 | with a reference of the source code. 7 | 8 | Please use the following list of contents to select what you're interested in. 9 | 10 | .. 
toctree::
11 |    :maxdepth: 2
12 |    :caption: Contents:
13 | 
14 |    design
15 |    tutorials/index
16 |    modules
17 | 
18 | .. include:: ../README.rst
19 |    :start-after: inclusion-marker-do-not-remove
20 | 
21 | 
22 | Indices and tables
23 | ==================
24 | 
25 | * :ref:`genindex`
26 | * :ref:`modindex`
27 | * :ref:`search`
--------------------------------------------------------------------------------
/schedgym/scheduler/backfilling_scheduler.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | """backfilling_scheduler - Module for a conservative backfilling scheduler"""
5 | 
6 | from schedgym.scheduler import Scheduler
7 | 
8 | 
9 | class BackfillingScheduler(Scheduler):
10 |     """Implements a conservative backfilling scheduler."""
11 | 
12 |     def schedule(self) -> None:
13 |         "Schedules queued jobs according to the conservative backfilling strategy."
14 |         for job in self.queue_admission:
15 |             time, resources = self.find_first_time_for(job)
16 |             if not resources:
17 |                 raise AssertionError('Something is terribly wrong')
18 |             self.assign_schedule(job, resources, time)
19 |         self.queue_admission.clear()
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 | 
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS    ?=
7 | SPHINXBUILD   ?= sphinx-build
8 | SOURCEDIR     = .
9 | BUILDDIR      = _build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | docs:
18 | 	mkdir -p docs/img
19 | 	cp img/gym.gif docs/img/
20 | 
21 | # Catch-all target: route all unknown targets to Sphinx using the new
22 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
23 | %: Makefile docs
24 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--------------------------------------------------------------------------------
/docs/design.rst:
--------------------------------------------------------------------------------
1 | Design of sched-rl-gym
2 | ======================
3 | 
4 | This page documents the overall design of the environment, and may be useful in
5 | understanding its components.
6 | 
7 | `sched-rl-gym` is designed as a stack of layers, each one isolating a distinct
8 | set of responsibilities. Conceptually, we have three layers:
9 | 
10 | 1. Simulator primitives
11 | 2. Simulator
12 | 3. OpenAI Gym <-> Simulator Glue
13 | 
14 | User code lives in a fourth layer atop layer 3. A benefit of this design is
15 | that one can also access each layer directly, which is useful for:
16 | 
17 | 1. Unit testing (the code is tested with `coverage on coveralls.io
18 |    <https://coveralls.io/github/renatolfc/sched-rl-gym>`_)
19 | 2. Using the simulator directly (to replicate results, for example)
--------------------------------------------------------------------------------
/schedgym/scheduler/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | """scheduler - basic scheduling algorithms for the *simulation* layer."""
5 | 
6 | from .scheduler import Scheduler
7 | from .sjf_scheduler import SjfScheduler
8 | from .fifo_scheduler import FifoScheduler
9 | from .backfilling_scheduler import BackfillingScheduler
10 | from .null_scheduler import NullScheduler
11 | from .packer_scheduler import PackerScheduler
12 | from .random_scheduler import RandomScheduler
13 | from .tetris_scheduler import TetrisScheduler
14 | from .easy_scheduler import EasyScheduler
15 | 
16 | __all__ = [
17 |     'Scheduler',
18 |     'SjfScheduler',
19 |     'BackfillingScheduler',
20 |     'NullScheduler',
21 |     'PackerScheduler',
22 |     'RandomScheduler',
23 |     'TetrisScheduler',
24 |     'EasyScheduler',
25 |     'FifoScheduler',
26 | ]
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 | 
3 | pushd %~dp0
4 | 
5 | REM Command file for Sphinx documentation
6 | 
7 | if "%SPHINXBUILD%" == "" (
8 | 	set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 | 
13 | if "%1" == "" goto help
14 | 
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | 	echo.
18 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | 	echo.installed, then set the SPHINXBUILD environment variable to point
20 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | 	echo.may add the Sphinx directory to PATH.
22 | 	echo.
23 | 	echo.If you don't have Sphinx installed, grab it from
24 | 	echo.http://sphinx-doc.org/
25 | 	exit /b 1
26 | )
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 | 
34 | :end
35 | popd
--------------------------------------------------------------------------------
/schedgym/scheduler/fifo_scheduler.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | """fifo_scheduler - First-In First-Out module"""
5 | 
6 | from typing import List
7 | 
8 | from schedgym.job import Job
9 | from schedgym.scheduler import Scheduler
10 | 
11 | 
12 | class FifoScheduler(Scheduler):
13 |     """A FIFO scheduler."""
14 | 
15 |     def schedule(self) -> None:
16 |         """Schedules jobs according to submission time.
17 | 
18 |         This implements a *strict* FIFO strategy, meaning it will always obey
19 |         submission order, even when it creates fragmentation.
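
        For illustration (hypothetical numbers, not drawn from the test
        suite): with 10 free processors and an admission queue requesting
        8, 4, and 1 processors, the 8-processor job starts, the 4-processor
        job does not fit and blocks the queue, and the 1-processor job
        waits too, even though two processors sit idle.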
20 | """ 21 | scheduled_jobs: List[Job] = [] 22 | for job in self.queue_admission: 23 | resources = self.can_schedule_now(job) 24 | if resources: 25 | self.assign_schedule(job, resources, self.current_time) 26 | scheduled_jobs.append(job) 27 | else: 28 | break 29 | for job in scheduled_jobs: 30 | self.queue_admission.remove(job) 31 | -------------------------------------------------------------------------------- /schedgym/workload/base.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """base - base module for all workload generators""" 5 | 6 | from abc import ABC, abstractmethod 7 | from typing import Optional, List 8 | 9 | from schedgym.job import Job 10 | 11 | 12 | class WorkloadGenerator(ABC): 13 | """An abstract workload generator""" 14 | 15 | current_time: int 16 | 17 | @abstractmethod 18 | def step(self, offset: int = 1) -> List[Optional[Job]]: 19 | """Steps the workload generator by :param offset:. 20 | 21 | This may, or may not, return new jobs, depending on the internal 22 | probability distributions of the workload generator. 23 | 24 | Parameters 25 | ---------- 26 | offset : int 27 | The number of time steps to advance the workload generator. 28 | """ 29 | 30 | @abstractmethod 31 | def __len__(self): 32 | """Returns the length of the workload. Zero if unbounded.""" 33 | 34 | @abstractmethod 35 | def peek(self): 36 | """Peeks what would be the next job""" 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019-2020 Renato L. de F. Cunha 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /docs/schedgym.rst: -------------------------------------------------------------------------------- 1 | schedgym package 2 | =============== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | schedgym.envs 11 | schedgym.scheduler 12 | schedgym.workload 13 | 14 | Submodules 15 | ---------- 16 | 17 | schedgym.resource module 18 | ----------------------- 19 | 20 | .. automodule:: schedgym.resource 21 | :members: 22 | :undoc-members: 23 | :show-inheritance: 24 | 25 | schedgym.cluster module 26 | ---------------------- 27 | 28 | .. 
29 |    :members:
30 |    :undoc-members:
31 |    :show-inheritance:
32 | 
33 | schedgym.event module
34 | ---------------------
35 | 
36 | .. automodule:: schedgym.event
37 |    :members:
38 |    :undoc-members:
39 |    :show-inheritance:
40 | 
41 | schedgym.heap module
42 | --------------------
43 | 
44 | .. inheritance-diagram:: schedgym.heap.Heap
45 | 
46 | .. automodule:: schedgym.heap
47 |    :members:
48 |    :undoc-members:
49 |    :show-inheritance:
50 | 
51 | schedgym.job module
52 | -------------------
53 | 
54 | .. automodule:: schedgym.job
55 |    :members:
56 |    :undoc-members:
57 |    :show-inheritance:
58 | 
59 | schedgym.pool module
60 | --------------------
61 | 
62 | .. automodule:: schedgym.pool
63 |    :members:
64 |    :undoc-members:
65 |    :show-inheritance:
66 | 
67 | schedgym.simulator module
68 | -------------------------
69 | 
70 | .. automodule:: schedgym.simulator
71 |    :members:
72 |    :undoc-members:
73 |    :show-inheritance:
74 | 
75 | Module contents
76 | ---------------
77 | 
78 | .. automodule:: schedgym
79 |    :members:
80 |    :undoc-members:
81 |    :show-inheritance:
--------------------------------------------------------------------------------
/schedgym/scheduler/sjf_scheduler.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | """sjf_scheduler - Shortest job first scheduler"""
5 | 
6 | from typing import List
7 | 
8 | from schedgym.job import Job
9 | from schedgym.scheduler import Scheduler
10 | 
11 | 
12 | class SjfScheduler(Scheduler):
13 |     """A shortest job first scheduler."""
14 | 
15 |     def schedule(self) -> None:
16 |         """Schedules jobs according to shortest job first.
17 | 
18 |         This does so by re-sorting the queue by requested time and iterating
19 |         through it, scheduling every job that can start immediately.
20 |         """
21 |         ignored_jobs: List[Job] = []
22 |         # XXX: We always re-sort the queue. If we ever want to learn from
23 |         # demonstration, we'd probably have to do something like:
24 |         # candidates = sorted(
25 |         #     enumerate(self.queue_admission),
26 |         #     key=lambda e:(e[1].requested_time, e[1].submission_time)
27 |         # )
28 |         # and work from there. Hence, the jobs we scheduled would have their
29 |         # indices and we could generate intermediate states as needed.
30 |         # FIXME: With the current implementation, smaller jobs that are
31 |         # longer may be scheduled first.
32 |         for job in sorted(
33 |             self.queue_admission,
34 |             key=lambda j: (j.requested_time, j.submission_time),
35 |         ):
36 |             resources = self.can_schedule_now(job)
37 |             if resources:
38 |                 self.assign_schedule(job, resources, self.current_time)
39 |             else:
40 |                 ignored_jobs.append(job)
41 |         self.queue_admission = ignored_jobs
--------------------------------------------------------------------------------
/schedgym/scheduler/tetris_scheduler.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | """tetris_scheduler - A scheduler that mixes Packer and SJF"""
5 | 
6 | from schedgym.job import Job
7 | from schedgym.scheduler import PackerScheduler
8 | 
9 | 
10 | class TetrisScheduler(PackerScheduler):
11 |     """Implements the Tetris scheduler.
12 | 
13 |     Adds a factor that controls how much Packer behavior and how much SJF
14 |     behavior influences the scheduler.
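
    As a worked example (hypothetical numbers, using the dot-product form of
    the priority): with ``packer_sjf_ratio = 0.5``, 10 free processors, 100
    free memory units, and a job requesting 2 processors, 10 memory units,
    and 4 time steps, the priority is
    ``0.5 * (10 * 2 + 100 * 10) + 0.5 * (1 / 4) = 510.125``.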
15 | 
16 |     Parameters
17 |     ----------
18 |     number_of_processors : int
19 |         Number of processors in the cluster this scheduler manages
20 |     total_memory : int
21 |         Amount of memory in the cluster this scheduler manages
22 |     packer_sjf_ratio : float
23 |         Dial to tune packer to sjf ratio. Valid values in [0, 1], with
24 |         0 being SJF and 1 being Packer behavior.
25 |     """
26 | 
27 |     packer_sjf_ratio: float
28 | 
29 |     def __init__(
30 |         self,
31 |         number_of_processors: int,
32 |         total_memory: int,
33 |         packer_sjf_ratio: float = 0.5,
34 |     ):
35 |         super().__init__(number_of_processors, total_memory)
36 |         self.packer_sjf_ratio = packer_sjf_ratio
37 | 
38 |     def get_priority(self, j: Job) -> float:
39 |         """Gives the packer/sjf mixed priority.
40 | 
41 |         Parameters
42 |         ----------
43 |         j : Job
44 |             The job for which we're computing priority.
45 |         """
46 |         return (
47 |             self.packer_sjf_ratio
48 |             * (
49 |                 self.free_resources[0] * j.requested_processors
50 |                 + self.free_resources[1]
51 |                 * j.requested_memory  # dot product, as in PackerScheduler.get_priority
52 |             )
53 |             + (1 - self.packer_sjf_ratio) * 1.0 / j.requested_time
54 |         )
--------------------------------------------------------------------------------
/docs/img/job-resource.drawio:
--------------------------------------------------------------------------------
1 | 7Vtdb+I6EP01PLbCCfngsUD37q66EtpWur1PKzcxxFonZh1TYH/9HRObkARoSgmw2kioYia2sX3OmfEkaccexst/BJ5F33hIWMfqhsuOPepYlocc+Kscq8zRs1DmmAoaZq4txyP9TbSzq71zGpK00FByziSdFZ0BTxISyIIPC8EXxWYTzoq/OsNTUnE8BphVvf/SUEaZ13e6uf8zodPI/DLq6isxNo21I41wyBdbLvu+Yw8F5zL7Fi+HhKm9M/uS9fu05+pmYoIksk6Hm+cxm/zw5vzml//6YxLcre6fb1AvG+YVs7le8XeS8rkIiJ61XJmtSBc0ZjgBazDhiXzUVxDYmNFpAt8DmAsR4HglQlLYxTt9QfIZeIOIsvABr/hczTiVOPhprEHEBf0Nw2Kmx4TLQmpCWG6hxaPqCe4ueAVJoc3YbAMqub7hZaHhA06ldgScMTxL6ctmGTEWU5oMuJQ83jSKaaAv662CpZHlXhDQBlqQBOExkWIFTXSHnqGHloPh0SLnFnK1L9rile9qSms6Tzcj54jDFw36ewjgVAjQsQZqTYIHJE25gF+8A/uLQhZaPQlSpQZsiFxDJvhPMuSMAwdGCc+4QhkruQxdGJnIvWRJZzigyfRh3WbUyz3f9bYoF4e+E7aWVUTDkCQKaC6xxBmqCsIZp4lc75szgA/s7rB763QcmPgQbJTb8FHNhRzyBNaC6RpUApRZEEWbegw4ILQqLzQPLLceD6zGeODu4UFMYg7jtxxonAOOdWkOeDs4UEKZ0TV6GcomI6KjII4BLEZyTJ8U5KMbVMHdruJu78CY4RfCxjylknI1vsjalrC/GLyeXzPUN4Vuv4LuV/7SZvkGsjzyj8vyxndy7M0ZtBLdhT7s6SS/9+zXBvc31d+/7gRvFtIG9ybgrZu7mwru1cxtpDyGarkN8g0EebvfLwR55F26lvP3BPm2lDtpKPCuO9CjXYG+reTOSIGLF3LVk36b6k8F7qXLOKuC7ZDNU5Wq2yTffCVXO8k3VsrZe6J7W8mdSv/Wdef36hObNrifCtxLl3FWNbofPKm1If7jId61SiHerqlwu9cUC1qJf0jipwngjWm8xlMWkoR36uUFsEKKY56ETxFVCRMufKLM6BEso0aVYSMZ51dgqs/bxn8KHcid2hwZDWbWylhLKp+3vm/1AivvpAzTJ5s9CSsvUpQUCCvMjiQHalf9NBrizJQcxLi7G+NdoArCsKSvxdntglUPN1Y8zeNDzyvGh80jfDOEuc+27pWTozKQa78xULbqykBrlm3W+AHi1SgLr4x41tmYZ/biTeZlFP07mAdUwKutZjqC752wW3q7BfWcw/Mqtff7Jd5nEzipCsxW/EEquEIRuGcTQZm7yOndOsfJwCnpydzAb1oFjv0uFSBUUE1DMqjxOPDKZOAidFAIyhgTQWFrVMFzMXH459MGKj6CsnqlQ2ttZTilgVAzCcIpPzNzvMPz6u+eV7PSqBbh1y4N7wpTxJ6q+xwpwvfVjhynBLfEONtVp9Cz5AnXeleesOxz5Ild95uvWwxn04JfUwr22ZRQfifBQtatV7oRXVcJtv32WEdXrGDm//mQNc//fcS+/x8=
--------------------------------------------------------------------------------
/schedgym/scheduler/packer_scheduler.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | """packer_scheduler - A scheduler based on the Packer heuristic"""
5 | 
6 | from typing import List
7 | 
8 | from schedgym.job import Job
9 | from schedgym.scheduler import Scheduler
10 | 
11 | 
12 | class PackerScheduler(Scheduler):
13 |     """Implements the Packer heuristic.
14 | 
15 |     In the original paper, jobs are prioritized by the dot product of the
16 |     requested resources with the set of available resources. Since the total
17 |     amount of resources does not change, jobs that request more resources are
18 |     preferred. The prioritization changes every time a new job is scheduled,
19 |     since jobs are scheduled one at a time.
20 |     """
21 | 
22 |     def get_priority(self, j: Job) -> float:
23 |         """Computes the priority of a given job.
24 | 
25 |         This computes the priority of a job according to the Packer heuristic,
26 |         which will prefer jobs with higher dot-product between free and
27 |         requested resources.
28 | 
29 |         Parameters
30 |         ----------
31 |         j : Job
32 |             The job whose priority is to be calculated.
33 |         """
34 |         if self.ignore_memory:
35 |             return self.free_resources[0] * j.requested_processors
36 |         return (
37 |             self.free_resources[0] * j.requested_processors
38 |             + self.free_resources[1] * j.requested_memory
39 |         )
40 | 
41 |     def schedule(self) -> None:
42 |         """Schedules jobs according to the Packer heuristic."""
43 |         ignored_jobs: List[Job] = []
44 |         for job in reversed(
45 |             sorted(self.queue_admission, key=lambda j: self.get_priority(j))
46 |         ):
47 |             resources = self.can_schedule_now(job)
48 |             if resources:
49 |                 self.assign_schedule(job, resources, self.current_time)
50 |             else:
51 |                 ignored_jobs.append(job)
52 |         self.queue_admission = ignored_jobs
--------------------------------------------------------------------------------
/sjf-agent.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | import os
5 | import gym
6 | import json
7 | import numpy as np
8 | import schedgym.envs as deeprm
9 | 
10 | EPISODES = 1
11 | MAX_EPISODE_LENGTH = 200
12 | 
13 | 
14 | def sjf_action(observation):
15 |     "Selects the job SJF (Shortest Job First) would select."
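    # A note on the layout this function assumes (inferred from this file and
    # render.py, not from separate documentation): the raw observation is the
    # tuple (current, wait, backlog, time); `current` and `wait` are occupancy
    # images in which a nonzero entry in the first time row marks a resource
    # unit a job needs, and summing a slot along the time axis approximates
    # the job's duration. The fallback action, wait.shape[1], is one past the
    # last job slot, i.e. "schedule nothing".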
16 | 
17 |     current, wait, _, _ = observation
18 |     best = wait.shape[2] + 1  # infinity
19 |     best_idx = wait.shape[1]
20 | 
21 |     free = np.ones(current.shape[0]) * current.shape[-1] - np.sum(current[:, 0, :] != 0, axis=1)  # free units per resource
22 | 
23 |     for slot in range(wait.shape[1]):
24 |         required_resources = (wait[:, slot, 0, :] != 0).sum(axis=1)
25 |         if np.all(required_resources) and np.all(required_resources <= free):
26 |             tmp = np.sum(wait[0, slot, :, 0])
27 |             if tmp < best:
28 |                 best_idx = slot
29 |                 best = tmp
30 |     return best_idx
31 | 
32 | 
33 | def find_position(q, idx):
34 |     for i, j in enumerate(q):
35 |         if j.slot_position == idx:
36 |             return i
37 |     return idx
38 | 
39 | 
40 | def pack_observation(ob, time_horizon):
41 |     current, wait, backlog, time = ob
42 |     wait = wait.reshape(time_horizon, -1)
43 |     current = current.reshape(time_horizon, -1)
44 |     return np.hstack((current, wait, backlog, time))
45 | 
46 | def main():
47 |     kwargs = {'use_raw_state': True}
48 |     if os.path.exists('config/test.json'):
49 |         with open('config/test.json', 'r') as fp:
50 |             kwargs = json.load(fp)
51 |     env: deeprm.DeepRmEnv = gym.make('DeepRM-v0', **kwargs)
52 |     time_horizon = env.reset()[0].shape[1]
53 |     for episode in range(EPISODES):
54 |         ob = env.reset()
55 |         action = sjf_action(ob)
56 |         while True:
57 |             ob, reward, done, _ = env.step(action)
58 |             action = sjf_action(ob)
59 |             ob = pack_observation(ob, time_horizon)
60 |             env.render()
61 |             if done:
62 |                 break
63 |     env.close()
64 | 
65 | if __name__ == '__main__':
66 |     main()
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | # Configuration file for the Sphinx documentation builder.
2 | #
3 | # This file only contains a selection of the most common options. For a full
4 | # list see the documentation:
5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
6 | 
7 | # -- Path setup --------------------------------------------------------------
8 | 
9 | # If extensions (or modules to document with autodoc) are in another directory,
10 | # add these directories to sys.path here. If the directory is relative to the
11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 | #
13 | import os
14 | import sys
15 | sys.path.insert(0, os.path.abspath('..'))
16 | 
17 | import schedgym
18 | 
19 | 
20 | # -- Project information -----------------------------------------------------
21 | 
22 | project = 'sched-rl-gym'
23 | copyright = '2020, Renato L. de F. Cunha'
24 | author = 'Renato L. de F. Cunha'
25 | 
26 | 
27 | # -- General configuration ---------------------------------------------------
28 | 
29 | # Add any Sphinx extension module names here, as strings. They can be
30 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
31 | # ones.
32 | extensions = [
33 |     'nbsphinx',
34 |     'sphinx.ext.autodoc',
35 |     'sphinx.ext.coverage',
36 |     'sphinx.ext.napoleon',
37 |     'sphinx.ext.inheritance_diagram',
38 | ]
39 | 
40 | # Add any paths that contain templates here, relative to this directory.
41 | templates_path = ['_templates']
42 | 
43 | # List of patterns, relative to source directory, that match files and
44 | # directories to ignore when looking for source files.
45 | # This pattern also affects html_static_path and html_extra_path.
46 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 47 | 48 | 49 | # -- Options for HTML output ------------------------------------------------- 50 | 51 | # The theme to use for HTML and HTML Help pages. See the documentation for 52 | # a list of builtin themes. 53 | # 54 | html_theme = 'alabaster' 55 | 56 | # Add any paths that contain custom static files (such as style sheets) here, 57 | # relative to this directory. They are copied after the builtin static files, 58 | # so a file named "default.css" will overwrite the builtin "default.css". 59 | html_static_path = ['_static'] 60 | -------------------------------------------------------------------------------- /.github/workflows/pythonpackage.yml: -------------------------------------------------------------------------------- 1 | name: sched-rl-gym 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | strategy: 12 | max-parallel: 4 13 | matrix: 14 | python-version: [3.7, 3.8, 3.9] 15 | 16 | steps: 17 | - uses: actions/checkout@v1 18 | - name: Set up Python ${{ matrix.python-version }} 19 | uses: actions/setup-python@v1 20 | with: 21 | python-version: ${{ matrix.python-version }} 22 | - name: Install dependencies 23 | run: | 24 | python -m pip install --upgrade pip 25 | pip install -e . 26 | - name: Lint with flake8 27 | run: | 28 | pip install flake8 29 | # stop the build if there are Python syntax errors or undefined names 30 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 31 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 32 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 33 | - name: Test with pytest & coverage 34 | run: | 35 | pip install pytest pytest-cov coveralls 36 | coverage run --source schedgym -m pytest schedgym 37 | coverage report -m 38 | - name: Coveralls Parallel 39 | uses: AndreMiras/coveralls-python-action@develop 40 | with: 41 | github-token: ${{ secrets.github_token }} 42 | parallel: true 43 | finish_build: 44 | name: Finish Coveralls 45 | needs: build 46 | runs-on: ubuntu-latest 47 | steps: 48 | - name: Finished 49 | uses: AndreMiras/coveralls-python-action@develop 50 | with: 51 | github-token: ${{ secrets.github_token }} 52 | parallel-finished: true 53 | build_wheels: 54 | name: Build wheels on ${{ matrix.os }} 55 | runs-on: ${{ matrix.os }} 56 | strategy: 57 | matrix: 58 | os: [ubuntu-20.04, macOS-10.15] 59 | steps: 60 | - uses: actions/checkout@v2 61 | - uses: actions/setup-python@v2 62 | - name: Build wheels 63 | uses: pypa/cibuildwheel@v2.3.1 64 | - uses: actions/upload-artifact@v2 65 | with: 66 | path: ./wheelhouse/*.whl 67 | build_sdist: 68 | name: Build source distribution 69 | runs-on: ubuntu-latest 70 | steps: 71 | - uses: actions/checkout@v2 72 | 73 | - name: Build sdist 74 | run: pip install cython && pipx run --system-site-packages build --sdist 75 | 76 | - uses: actions/upload-artifact@v2 77 | with: 78 | path: dist/*.tar.gz -------------------------------------------------------------------------------- /schedgym/resource.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """resource - basic resource unit 5 | 6 | This module has two classes: 7 | 1. `PrimaryResource`, an enumeration for the different supported types (CPU 8 | and MEMORY) 9 | 2. 
The basic resource group, which comprises *both* CPU and memory
10 | """
11 | 
12 | import copy
13 | import enum
14 | from typing import Tuple
15 | 
16 | from intervaltree import IntervalTree
17 | 
18 | 
19 | class PrimaryResource(enum.IntEnum):
20 |     """Enumeration for identifying the various supported resource types."""
21 | 
22 |     CPU = 0
23 |     MEMORY = 1
24 | 
25 | 
26 | class Resource(object):
27 |     """The basic resource group.
28 | 
29 |     This groups IntervalTrees, one for each resource type supported in
30 |     the system.
31 | 
32 |     This is referenced by a :class:`schedgym.job.Job` to represent *which
33 |     specific resources* are being used by that job.
34 | 
35 |     Parameters
36 |     ----------
37 |     processors : IntervalTree
38 |         An interval tree that defines a set of processors
39 |     memory : IntervalTree
40 |         An interval tree that defines a set of memory resources
41 |     ignore_memory : bool
42 |         Whether memory should be taken into consideration when measuring
43 |         resource usage.
44 |     """
45 | 
46 |     memory: IntervalTree
47 |     """IntervalTree that stores memory used"""
48 |     processors: IntervalTree
49 |     """IntervalTree that stores processors used"""
50 | 
51 |     def __init__(
52 |         self,
53 |         processors: IntervalTree = IntervalTree(),
54 |         memory: IntervalTree = IntervalTree(),
55 |         ignore_memory: bool = False,
56 |     ):
57 |         self.ignore_memory = ignore_memory
58 |         self.processors = copy.copy(processors)
59 |         self.memory = copy.copy(memory)
60 | 
61 |     def measure(self) -> Tuple[int, int]:
62 |         """Returns the total amount of resources in use.
63 | 
64 |         Returns:
65 |             Tuple: A tuple containing the amount of resources used for each
66 |             resource type supported.
67 |         """
68 |         processors = sum([i.end - i.begin for i in self.processors])
69 |         memory = sum([i.end - i.begin for i in self.memory])
70 |         return processors, memory
71 | 
72 |     def __bool__(self) -> bool:
73 |         return bool(self.processors) and (
74 |             self.ignore_memory or bool(self.memory)
75 |         )
76 | 
77 |     def __repr__(self):
78 |         return f'Resource({self.processors}, {self.memory})'
79 | 
80 |     def __str__(self):
81 |         return f'Resource({self.processors}, {self.memory})'
--------------------------------------------------------------------------------
/.github/workflows/codeql-analysis.yml:
--------------------------------------------------------------------------------
1 | # For most projects, this workflow file will not need changing; you simply need
2 | # to commit it to your repository.
3 | #
4 | # You may wish to alter this file to override the set of languages analyzed,
5 | # or to provide custom queries or build logic.
6 | #
7 | # ******** NOTE ********
8 | # We have attempted to detect the languages in your repository. Please check
9 | # the `language` matrix defined below to confirm you have the correct set of
10 | # supported CodeQL languages.
11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ master ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ master ] 20 | schedule: 21 | - cron: '31 17 * * 2' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-latest 27 | permissions: 28 | actions: read 29 | contents: read 30 | security-events: write 31 | 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | language: [ 'python' ] 36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ] 37 | # Learn more: 38 | # https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed 39 | 40 | steps: 41 | - name: Checkout repository 42 | uses: actions/checkout@v2 43 | 44 | # Initializes the CodeQL tools for scanning. 45 | - name: Initialize CodeQL 46 | uses: github/codeql-action/init@v1 47 | with: 48 | languages: ${{ matrix.language }} 49 | # If you wish to specify custom queries, you can do so here or in a config file. 50 | # By default, queries listed here will override any specified in a config file. 51 | # Prefix the list here with "+" to use these queries and those in the config file. 52 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 53 | 54 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 55 | # If this step fails, then you should remove it and run the build manually (see below) 56 | - name: Autobuild 57 | uses: github/codeql-action/autobuild@v1 58 | 59 | # ℹ️ Command-line programs to run using the OS shell. 60 | # 📚 https://git.io/JvXDl 61 | 62 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 63 | # and modify them (or add more) to build your code if your project 64 | # uses a compiled language 65 | 66 | #- run: | 67 | # make bootstrap 68 | # make release 69 | 70 | - name: Perform CodeQL Analysis 71 | uses: github/codeql-action/analyze@v1 72 | -------------------------------------------------------------------------------- /schedgym/simulator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """simulator - Classes for simulating job submission and execution. 5 | 6 | This module comprises an abstract base class for simulation and a time-based 7 | simulator that inherits directly from `Simulator`. 8 | 9 | The time-based simulator is coupled with 10 | a :class:`schedgym.workload.WorkloadGenerator` to generate jobs at a given time 11 | step. 12 | """ 13 | 14 | import enum 15 | 16 | from abc import ABC, abstractmethod 17 | 18 | from . import workload, scheduler as sched 19 | 20 | 21 | class SimulationType(enum.Enum): 22 | """Enumeration to differentiate between simulation types""" 23 | 24 | TIME_BASED = 0 25 | EVENT_BASED = 1 26 | 27 | 28 | class Simulator(ABC): 29 | """Abstract base class for simulation. 30 | 31 | Parameters 32 | ---------- 33 | workload_generator : workload.WorkloadGenerator 34 | An object to generate load when time is stepped. 35 | scheduler : sched.Scheduler 36 | A scheduling algorithm that will schedule jobs according to a given 37 | rule. 
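
    Examples
    --------
    A minimal sketch (``my_generator`` and ``my_scheduler`` stand in for any
    concrete workload generator and scheduler built elsewhere; they are
    placeholders, not names defined by this package)::

        sim = Simulator.make(
            SimulationType.TIME_BASED, my_generator, my_scheduler
        )
        sim.step(submit=True)  # advance one step, submitting any new jobs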
38 | """ 39 | 40 | current_time: int 41 | scheduler: sched.Scheduler 42 | simulation_start_time: int 43 | 44 | def __init__( 45 | self, 46 | workload_generator: workload.WorkloadGenerator, 47 | scheduler: sched.Scheduler, 48 | ): 49 | 50 | self.current_time = 0 51 | self.scheduler = scheduler 52 | self.simulation_start_time = 0 53 | self.workload: workload.WorkloadGenerator = workload_generator 54 | 55 | @staticmethod 56 | def make( 57 | simulation_type: SimulationType, 58 | workload_generator: workload.WorkloadGenerator, 59 | scheduler: sched.Scheduler, 60 | ): 61 | """Factory method for instantiating new simulators.""" 62 | if simulation_type == SimulationType.TIME_BASED: 63 | return TimeBasedSimulator(workload_generator, scheduler) 64 | raise RuntimeError(f'Unsupported simulation type {simulation_type}') 65 | 66 | @abstractmethod 67 | def step(self, submit) -> None: 68 | """Runs a simulation step.""" 69 | 70 | 71 | class TimeBasedSimulator(Simulator): 72 | """A simulator that is based on time.""" 73 | 74 | scheduler: sched.Scheduler 75 | 76 | def __init__( 77 | self, 78 | workload_generator: workload.WorkloadGenerator, 79 | scheduler: sched.Scheduler, 80 | ): 81 | super().__init__(workload_generator, scheduler) 82 | self.current_time = 0 83 | 84 | def step(self, submit=True): 85 | self.current_time += 1 86 | self.scheduler.step() 87 | jobs = self.workload.step() 88 | if submit and jobs: 89 | for job in jobs: 90 | if job is not None: 91 | self.scheduler.submit(job) 92 | -------------------------------------------------------------------------------- /schedgym/scheduler/easy_scheduler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """easy_scheduler - A scheduler that uses easy backfilling. 5 | """ 6 | 7 | from typing import List, Tuple, Optional 8 | 9 | from schedgym.job import Job, JobStatus 10 | from schedgym.scheduler import Scheduler 11 | from schedgym.event import JobEvent 12 | 13 | 14 | class EasyScheduler(Scheduler): 15 | """EASY backfilling scheduler. 16 | 17 | This is a backfilling scheduling that uses the EASY strategy. Upon 18 | encountering a single job that cannot be scheduled, it makes a reservation 19 | for that job on which would be the first time it should start on. 20 | 21 | Smaller jobs than the one currenly with a reservation may start, provided 22 | they do not delay the one with a reservation. 
23 | """ 24 | 25 | reservation: Optional[Tuple[JobEvent, JobEvent]] 26 | 27 | def __init__(self, *args, **kwargs): 28 | super().__init__(*args, **kwargs) 29 | self.reservation = None 30 | 31 | def _handle_reservation(self) -> None: 32 | if not self.reservation: 33 | return 34 | 35 | start, finish = self.reservation 36 | if ( 37 | start.time == self.current_time 38 | or start.job.status != JobStatus.WAITING 39 | ): 40 | # Reservation will be fulfilled 41 | self.reservation = None 42 | return 43 | 44 | resources = self.can_schedule_now(start.job) 45 | if resources: 46 | self.queue_waiting.remove(start.job) 47 | 48 | self.job_events.remove(start) 49 | self.job_events.remove(finish) 50 | 51 | self.assign_schedule(start.job, resources, self.current_time) 52 | self.reservation = None 53 | 54 | def schedule(self) -> None: 55 | ignored_jobs: List[Job] = [] 56 | 57 | self._handle_reservation() 58 | for job in self.queue_admission: 59 | resources = self.can_schedule_now(job) 60 | if resources: 61 | self.assign_schedule(job, resources, self.current_time) 62 | else: 63 | if not self.reservation: 64 | # This is the first job without a reservation. 65 | # We're doing EASY backfilling, so we create a 66 | # reservation for this one job and keep going 67 | time, resources = self.find_first_time_for(job) 68 | if not resources: 69 | raise AssertionError('Something is terribly wrong') 70 | self.reservation = self.assign_schedule( 71 | job, resources, time 72 | ) 73 | else: 74 | # We already have a reservation, so we skip this job 75 | ignored_jobs.append(job) 76 | self.queue_admission = ignored_jobs 77 | -------------------------------------------------------------------------------- /schedgym/heap.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """heap - A Priority Queue based on the `heapq` module.""" 5 | 6 | import heapq 7 | import itertools 8 | from typing import Generic, TypeVar, List, Dict, Generator, Optional, Iterator 9 | from typing import Tuple, cast 10 | 11 | T = TypeVar('T') 12 | ENTRY_T = Tuple[int, int, List[Optional[T]]] 13 | 14 | 15 | class Heap(Generic[T]): 16 | """A Priority Queue that is backed by a heap data structure. 17 | 18 | To reduce the computational cost of key removal, this class wastes a bit 19 | memory by *not* actually deleting items. 20 | """ 21 | 22 | entry_finder: Dict[Optional[T], ENTRY_T] 23 | 'Cache to check in O(1) whether an entry exists in the heap.' 24 | priority_queue: List[ENTRY_T] 25 | 'The actual priority queue, implemented as a list with heap ordering.' 26 | 27 | def __init__(self): 28 | """Initializes the heap. 29 | 30 | """ 31 | self.priority_queue = [] 32 | self.entry_finder = {} 33 | self.counter = itertools.count() 34 | 35 | def add(self, item, priority=0) -> None: 36 | """Add a new item or update the priority of an existing item""" 37 | if item in self.entry_finder: 38 | self.remove(item) 39 | count = next(self.counter) 40 | entry = (priority, count, [item]) 41 | self.entry_finder[item] = entry 42 | heapq.heappush(self.priority_queue, entry) 43 | 44 | def remove(self, item) -> None: 45 | """Mark an existing item as removed. Raise KeyError if not found.""" 46 | entry = self.entry_finder.pop(item) 47 | entry[-1][0] = None 48 | 49 | def pop(self) -> T: 50 | """Remove and return the lowest priority task. 
51 | 52 | Raises KeyError if empty.""" 53 | while self.priority_queue: 54 | _, _, (item,) = heapq.heappop(self.priority_queue) 55 | if item is not None: 56 | del self.entry_finder[item] # type: ignore 57 | return cast(T, item) 58 | raise KeyError('pop from an empty priority queue') 59 | 60 | def __iter__(self) -> Iterator[T]: 61 | return iter(self.heapsort()) 62 | 63 | def __contains__(self, item): 64 | return item in self.entry_finder 65 | 66 | def __len__(self): 67 | return len(self.entry_finder) 68 | 69 | @property 70 | def first(self) -> Optional[T]: 71 | """Returns the "first" item (highest priority item) in the Heap.""" 72 | if len(self.entry_finder) == 0: 73 | return None 74 | for (_, _, (item,)) in self.priority_queue: 75 | if item is not None: 76 | return cast(T, item) 77 | return None 78 | 79 | def heapsort(self) -> Generator[T, None, None]: 80 | """Generator that iterates over all elements in the heap in priority 81 | order.""" 82 | h = [e for e in self.priority_queue] 83 | while h: 84 | entry = heapq.heappop(h)[-1][0] 85 | if entry is not None: 86 | yield cast(T, entry) 87 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. image:: https://github.com/renatolfc/sched-rl-gym/workflows/sched-rl-gym/badge.svg 2 | :alt: sched-rl-gym 3 | .. image:: https://coveralls.io/repos/github/renatolfc/sched-rl-gym/badge.svg?branch=master 4 | :target: https://coveralls.io/github/renatolfc/sched-rl-gym?branch=master 5 | .. image:: https://readthedocs.org/projects/sched-rl-gym/badge/?version=latest 6 | :target: https://sched-rl-gym.readthedocs.io/en/latest/?badge=latest 7 | :alt: Documentation Status 8 | 9 | 10 | sched-rl-gym: Gym environment for HPC job scheduling problems 11 | ============================================================= 12 | 13 | .. inclusion-marker-do-not-remove 14 | 15 | ``sched-rl-gym`` is an `OpenAI Gym `__ 16 | environment for job scheduling problems. Currently, it implements `the 17 | Markov Decision 18 | Process `__ 19 | defined by 20 | `DeepRM `__. 21 | 22 | You can `use it as any other OpenAI Gym 23 | environment `__, provided the module is 24 | registered. Lucky for you, it supports auto registration upon first 25 | import. 26 | 27 | Therefore, you can get started by importing the environment with 28 | ``import schedgym.envs as schedgym``. 29 | 30 | As a parallel with the CartPole example in the Gym documentation, the 31 | following code will implement a random agent: 32 | 33 | .. code:: python 34 | 35 | import gym 36 | import schedgym.envs as schedgym 37 | 38 | env = gym.make('DeepRM-v0', use_raw_state=True) 39 | env.reset() 40 | 41 | for _ in range(200): 42 | env.render() 43 | observation, reward, done, info = env.step(env.action_space.sample()) 44 | env.close() 45 | 46 | With the following rendering: 47 | 48 | .. figure:: ./docs/img/gym.gif 49 | :alt: OpenAI Gym Environment rendering 50 | 51 | OpenAI Gym Environment rendering 52 | 53 | Features 54 | -------- 55 | 56 | - OpenAI Gym environment 57 | - Human rendering 58 | - Configurable environment 59 | 60 | Installation 61 | ------------ 62 | 63 | The easiest/quickest way to install sched-rl-gym is to use ``pip`` with 64 | the command: 65 | 66 | :: 67 | 68 | pip install -e git+https://github.com/renatolfc/sched-rl-gym.git#egg=sched-rl-gym 69 | 70 | We do recommend you use a `virtual 71 | environment `__, to not 72 | pollute your python installation with custom packages. 
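
For example, one way to create and activate such an environment (any of
the usual virtualenv workflows works just as well)::

    python -m venv venv
    source venv/bin/activate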
73 | 74 | If you want to be able to edit the code, then your best bet is to clone 75 | this repository with 76 | 77 | :: 78 | 79 | git clone https://github.com/renatolfc/sched-rl-gym.git 80 | 81 | In this case, you will need to install the dependencies manually. 82 | 83 | Dependencies 84 | ~~~~~~~~~~~~ 85 | 86 | The dependencies are documented in the ``requirements.txt`` file. You 87 | can install them with 88 | 89 | :: 90 | 91 | pip install -r requirements.txt 92 | 93 | Contribute 94 | ---------- 95 | 96 | - Issue tracker: https://github.com/renatolfc/sched-rl-gym/issues 97 | - Source code: https://github.com/renatolfc/sched-rl-gym 98 | 99 | Support 100 | ------- 101 | 102 | If you’re having issues, please let us know. The easiest way is to `open 103 | an issue on 104 | github `__. 105 | 106 | License 107 | ------- 108 | 109 | The project is licensed under the MIT license. 110 | -------------------------------------------------------------------------------- /schedgym/workload/swf_parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | swf_parser - Parser for the Standard Workload Format (SWF) 6 | 7 | A full description of the format, with meanings for each field is available on 8 | the web at http://www.cs.huji.ac.il/labs/parallel/workload/swf.html. 9 | """ 10 | 11 | from enum import IntEnum 12 | 13 | import logging 14 | 15 | from ..job import Job, SwfJobStatus 16 | 17 | logger = logging.getLogger(__name__) # pylint: disable=C 18 | 19 | 20 | class SwfFields(IntEnum): 21 | """Fields of the Standard Workload Format.""" 22 | 23 | JOB_ID = 0 24 | SUBMITTED = 1 25 | WAIT_TIME = 2 26 | EXEC_TIME = 3 27 | ALLOC_PROCS = 4 28 | AVG_CPU_USAGE = 5 29 | USED_MEM = 6 30 | REQ_PROCS = 7 31 | REQ_TIME = 8 32 | REQ_MEM = 9 33 | STATUS = 10 34 | USER_ID = 11 35 | GROUP_ID = 12 36 | EXECUTABLE = 13 37 | QUEUE_NUM = 14 38 | PART_NUM = 15 39 | PRECEDING_JOB = 16 40 | THINK_TIME = 17 41 | 42 | 43 | CONVERTERS = { 44 | key: int if key != SwfFields.AVG_CPU_USAGE else float for key in SwfFields 45 | } 46 | 47 | 48 | def parse(filename, processors, memory, ignore_memory=False): 49 | """Parser for SWF job files. 50 | 51 | The SWF is a simple format with commented lines starting with the ';' 52 | character and other lines separated by spaces. 53 | 54 | Parsing, therefore, involves splitting the lines and associating each 55 | column of the file with a field. 
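
    As a made-up illustration (the values are invented for this example), a
    record such as ``1 0 10 120 4 95.0 2048 4 150 2048 1 7 3 1 1 1 -1 -1``
    maps its first column to ``JOB_ID``, its second to ``SUBMITTED``, its
    third to ``WAIT_TIME``, and so on, following the order of ``SwfFields``.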
56 | """ 57 | 58 | with open(filename, 'r') as fp: # pylint: disable=C 59 | for line in fp: 60 | if ';' in line: 61 | continue 62 | fields = line.strip().split() 63 | fields = [ # Converts all fields according to our rules 64 | CONVERTERS[SwfFields(i)](f) for i, f in enumerate(fields) 65 | ] 66 | 67 | job = Job( 68 | fields[SwfFields.JOB_ID], 69 | fields[SwfFields.SUBMITTED], 70 | fields[SwfFields.EXEC_TIME], 71 | fields[SwfFields.ALLOC_PROCS], 72 | fields[SwfFields.AVG_CPU_USAGE], 73 | fields[SwfFields.USED_MEM], 74 | fields[SwfFields.REQ_PROCS], 75 | fields[SwfFields.REQ_TIME], 76 | fields[SwfFields.REQ_MEM], 77 | SwfJobStatus(fields[SwfFields.STATUS]), 78 | fields[SwfFields.USER_ID], 79 | fields[SwfFields.GROUP_ID], 80 | fields[SwfFields.EXECUTABLE], 81 | fields[SwfFields.QUEUE_NUM], 82 | fields[SwfFields.PART_NUM], 83 | fields[SwfFields.PRECEDING_JOB], 84 | fields[SwfFields.THINK_TIME], 85 | fields[SwfFields.WAIT_TIME], 86 | ) 87 | 88 | if job.requested_memory < 0 < job.memory_use: 89 | job.requested_memory = job.memory_use 90 | 91 | if job.requested_processors < 0 < job.processors_allocated: 92 | job.requested_processors = job.processors_allocated 93 | 94 | if job.requested_memory < 0 and ignore_memory: 95 | job.requested_memory = 0 96 | 97 | if ( 98 | job.requested_processors < 1 99 | or (job.requested_memory < 1 and not ignore_memory) 100 | or job.execution_time < 1 101 | or job.submission_time < 0 102 | ): 103 | logger.warning(f'Ignoring malformed job {job.id}') 104 | continue 105 | 106 | if job.requested_time < job.execution_time: 107 | job.requested_time = job.execution_time 108 | 109 | if job.requested_processors > processors: 110 | job.requested_processors = processors 111 | 112 | if job.requested_memory > memory: 113 | job.requested_memory = memory 114 | 115 | yield job 116 | -------------------------------------------------------------------------------- /schedgym/envs/render.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import matplotlib 5 | import numpy as np 6 | 7 | import pyglet 8 | from matplotlib import pylab as plt 9 | import matplotlib.backends.backend_agg as agg 10 | 11 | DPI = 96 12 | WIDTH = 800 13 | HEIGHT = 600 14 | RESOLUTION = (WIDTH, HEIGHT) 15 | 16 | SUPPORTED_MODES = { 17 | 'human': lambda: DeepRmHumanRenderer, 18 | 'rgb_array': lambda: DeepRmRgbRenderer, 19 | } 20 | 21 | 22 | class DeepRmRgbRenderer(object): 23 | def __init__(self, resolution=RESOLUTION, dpi=DPI): 24 | self.resolution = resolution 25 | self.dpi = DPI 26 | 27 | @staticmethod 28 | def plot_substate(ax, title, state, colorbar=False): 29 | cmap = matplotlib.cm.get_cmap('rainbow') 30 | cmap.set_under('w') 31 | im = ax.imshow(state, cmap=cmap, vmin=0.001, vmax=1) 32 | if colorbar: 33 | ax.figure.colorbar(im, ax=ax) 34 | ax.set_title(title) 35 | ax.set_xlabel('Slots') 36 | ax.set_ylabel('Time horizon (timesteps)') 37 | ax.set_xticks([]) 38 | ax.set_yticks([]) 39 | ax.grid() 40 | 41 | def render(self, state): 42 | width = self.resolution[0] / self.dpi 43 | height = self.resolution[1] / self.dpi 44 | fig = plt.figure(0, figsize=(width, height), dpi=self.dpi) 45 | 46 | current, wait, backlog, time = state 47 | lines = current.shape[0] 48 | 49 | # Axes {{{ 50 | axs_current = [ 51 | plt.subplot2grid((lines, 3), (i, 0)) for i in range(lines) 52 | ] 53 | axs_wait = [plt.subplot2grid((lines, 3), (i, 1)) for i in range(lines)] 54 | ax_backlog = plt.subplot2grid((lines, 3), (0, 2), rowspan=lines) 55 | # End 
56 | 
57 |         for i, (ax_current, ax_wait) in enumerate(zip(axs_current, axs_wait)):
58 |             self.plot_substate(
59 |                 ax_current, f'Current resources {i}', current[i]
60 |             )
61 |             self.plot_substate(
62 |                 ax_wait, f'Waiting jobs stack {i}', np.mean(wait[i], axis=0)
63 |             )
64 |         self.plot_substate(ax_backlog, 'Backlog', backlog, True)
65 | 
66 |         fig.tight_layout()
67 |         canvas = agg.FigureCanvasAgg(fig)
68 |         canvas.draw()
69 |         renderer = canvas.get_renderer()
70 |         raw_data = renderer.tostring_rgb()
71 |         size = canvas.get_width_height()
72 |         plt.close(fig)
73 | 
74 |         return np.frombuffer(raw_data, dtype=np.uint8).reshape(
75 |             (size[1], size[0], 3)  # tostring_rgb is row-major: height first
76 |         )
77 | 
78 | 
79 | class DeepRmHumanRenderer(DeepRmRgbRenderer, pyglet.window.Window):
80 |     def __init__(self, resolution=RESOLUTION, dpi=DPI):
81 |         super().__init__(resolution, dpi)
82 | 
83 |         self.rendering = None
84 |         width, height = resolution
85 |         self.window = pyglet.window.Window(width, height, visible=False)
86 |         self.window.set_caption('Scheduler State')
87 |         self.window.set_visible()
88 |         self.window.on_draw = self.on_draw
89 | 
90 |     def on_draw(self):
91 |         self.window.clear()
92 |         if self.rendering is not None:
93 |             height, width, _ = self.rendering.shape
94 |             img = pyglet.image.ImageData(  # ImageData takes (width, height); a negative pitch flips the rows
95 |                 width,
96 |                 height,
97 |                 'RGB',
98 |                 self.rendering.data.tobytes(),
99 |                 -3 * width,
100 |             )
101 | 
102 |             img.blit(0, 0)
103 | 
104 |     def render(self, state):
105 |         self.rendering = super().render(state)
106 | 
107 |         pyglet.clock.tick()
108 |         self.window.switch_to()
109 |         self.window.dispatch_events()
110 |         self.window.dispatch_event('on_draw')
111 |         self.window.flip()
112 | 
113 |         return self.rendering
114 | 
115 | 
116 | class DeepRmRenderer(object):
117 |     def __init__(self, mode, *args, **kwargs):
118 |         if mode not in SUPPORTED_MODES:
119 |             raise RuntimeError('Requested unsupported mode %s' % mode)
120 |         self.renderer = SUPPORTED_MODES[mode]()(*args, **kwargs)
121 | 
122 |     def render(self, state):
123 |         return self.renderer.render(state)
--------------------------------------------------------------------------------
/schedgym/workload/distribution.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | """distribution - Generative models for workload generation"""
5 | 
6 | import math
7 | import random
8 | import itertools
9 | from abc import ABC, abstractmethod
10 | from typing import List, Optional
11 | 
12 | from schedgym.job import Job, JobParameters
13 | from schedgym.workload.base import WorkloadGenerator
14 | 
15 | 
16 | class DistributionalWorkloadGenerator(WorkloadGenerator, ABC):
17 |     """An abstract class for workload generation based on distributions.
18 | 
19 |     Parameters
20 |     ----------
21 |     length : int
22 |         An optional length of workload generation. When length samples
23 |         are generated, automatic iteration will stop.
24 |     """
25 | 
26 |     length: int
27 |     current_element: int
28 | 
29 |     def __init__(self, length=0):
30 |         self.length = length
31 |         self.current_element = 0
32 | 
33 |     @abstractmethod
34 |     def step(self, offset=1) -> List[Optional[Job]]:
35 |         """Steps the workload generator by :param offset:.
36 | 
37 |         This may, or may not, return new jobs, depending on the internal
38 |         probability distributions of the workload generator.
39 | 
40 |         Parameters
41 |         ----------
42 |         offset : int
43 |             The number of time steps to advance the workload generator.
44 | """ 45 | 46 | 47 | class BinomialWorkloadGenerator(DistributionalWorkloadGenerator): 48 | """A workload generator that is based on a Bernoulli distribution. 49 | 50 | Parameters 51 | ---------- 52 | new_job_rate : float 53 | The probability of generating a new job 54 | small_job_chance : float 55 | The probability a sampled job will be "small" 56 | small_job_parameters : JobParameters 57 | The characteristics of "small" jobs 58 | large_job_parameters : JobParameters 59 | The characteristics of "large" jobs 60 | length : int 61 | The size of the sequence of jobs generated when iterating over this 62 | workload generator 63 | """ 64 | 65 | new_job_rate: float 66 | small_job_chance: float 67 | large_job: JobParameters 68 | small_job: JobParameters 69 | 70 | def __init__( 71 | self, 72 | new_job_rate, 73 | small_job_chance, 74 | small_job_parameters, 75 | large_job_parameters, 76 | length=0, 77 | runtime_estimates=None, 78 | estimate_parameters=None, 79 | ): 80 | super().__init__(length) 81 | 82 | self.current_time = 0 83 | self.counter = itertools.count(1) 84 | self.new_job_rate = new_job_rate 85 | self.small_job_chance = small_job_chance 86 | self.small_job = small_job_parameters 87 | self.large_job = large_job_parameters 88 | 89 | if runtime_estimates is not None and runtime_estimates not in [ 90 | 'gaussian', 91 | 'gaussian-over', 92 | 'gaussian-under', 93 | ]: 94 | raise ValueError(f'Unsupported estimate type {runtime_estimates}') 95 | 96 | self.runtime_estimates = runtime_estimates 97 | self.estimate_parameters = estimate_parameters 98 | 99 | def step(self, offset=1) -> List[Optional[Job]]: 100 | self.current_time += offset 101 | if random.random() > self.new_job_rate: 102 | return [] 103 | if random.random() < self.small_job_chance: 104 | j = self.small_job.sample(self.current_time) 105 | else: 106 | j = self.large_job.sample(self.current_time) 107 | if self.runtime_estimates and self.runtime_estimates.startswith( 108 | 'gaussian' 109 | ): 110 | if self.estimate_parameters is None: 111 | raise RuntimeError( 112 | "Can't sample runtime estimates with undefined parameters" 113 | ) 114 | diff = random.gauss(0, self.estimate_parameters * j.execution_time) 115 | if 'over' in self.runtime_estimates: 116 | diff = abs(diff) 117 | elif 'under' in self.runtime_estimates: 118 | diff = -abs(diff) 119 | j.requested_time = max(math.ceil(j.execution_time + diff), 1) 120 | j.id = next(self.counter) 121 | return [j] 122 | 123 | def __len__(self): 124 | return self.length 125 | 126 | def peek(self): 127 | return self.step(0) 128 | -------------------------------------------------------------------------------- /schedgym/workload/trace.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """trace - A trace-based workload generator 5 | 6 | Inherits from the base WorkloadGenerator and uses the swf_parser to parse SWF 7 | files. 
8 | """ 9 | 10 | from itertools import takewhile 11 | from typing import Iterator, Optional, Sequence, Callable 12 | 13 | from ..job import Job 14 | from .base import WorkloadGenerator 15 | from .swf_parser import parse as parse_swf 16 | 17 | 18 | class TraceGenerator(WorkloadGenerator): 19 | restart: bool 20 | trace: Sequence[Job] 21 | refresh_jobs: Optional[Callable] = None 22 | 23 | def __init__(self, restart=False, trace=None): 24 | self.current_time = 0 25 | self.restart = restart 26 | self.current_element = 0 27 | 28 | if trace is not None: 29 | self.trace = trace 30 | else: 31 | self.trace = [] 32 | 33 | def step(self, offset=1): 34 | """ "Samples" jobs from the trace file. 35 | 36 | Parameters 37 | ---------- 38 | offset : int 39 | The amount to offset the current time step 40 | """ 41 | if offset < 0: 42 | raise ValueError('Submission time must be positive') 43 | if self.current_element >= len(self.trace): 44 | if self.restart: 45 | self.current_element = 0 46 | for job in self.trace: 47 | job.submission_time += self.current_time 48 | if self.refresh_jobs is not None: 49 | self.refresh_jobs() 50 | else: 51 | raise StopIteration('Workload finished') 52 | submission_time = self.current_time + offset 53 | jobs = takewhile( 54 | lambda j: j[1].submission_time <= submission_time, 55 | enumerate( 56 | self.trace[self.current_element:], self.current_element 57 | ), 58 | ) 59 | self.current_time = submission_time 60 | jobs = list(jobs) 61 | if jobs: 62 | self.current_element = jobs[-1][0] + 1 63 | return [j for (i, j) in jobs] 64 | return [] 65 | 66 | @property 67 | def last_event_time(self): 68 | """The submission time of the last generated job""" 69 | offset = ( 70 | self.current_element 71 | if self.current_element < len(self.trace) 72 | else -1 73 | ) 74 | return self.trace[offset].submission_time 75 | 76 | def __len__(self): 77 | return len(self.trace) 78 | 79 | def __next__(self) -> Job: 80 | if self.current_element >= len(self.trace): 81 | if self.restart: 82 | self.current_element = 0 83 | if self.refresh_jobs is not None: 84 | self.refresh_jobs() 85 | else: 86 | raise StopIteration() 87 | job = self.trace[self.current_element] 88 | self.current_element += 1 89 | return job 90 | 91 | def __iter__(self) -> Iterator[Optional[Job]]: 92 | return iter(self.trace) 93 | 94 | def peek(self) -> Optional[Job]: 95 | job = next(self) 96 | if self.current_element > 0: 97 | self.current_element -= 1 98 | return job 99 | 100 | 101 | class SwfGenerator(TraceGenerator): 102 | """A trace-based (workload log) generator. 103 | 104 | Supports starting the parsing after an offset, and also supports reading a 105 | pre-specified number of jobs. 106 | 107 | Parameters 108 | ---------- 109 | tracefile : str 110 | The path to the filed to be parsed and used as input for workload 111 | generation. 
112 |     processors : int
113 |         The number of processors in this trace
114 |     memory : int
115 |         The amount of memory in this trace
116 |     restart : bool
117 |         Whether to restart from the beginning of the file when we reach
118 |         its end (or, in the case we're using an offset and a length, to
119 |         restart from the offset up to the length)
120 |     ignore_memory : bool
121 |         Whether to ignore (or not) memory usage
122 |     """
123 | 
124 |     tracefile: str
125 |     ignore_memory: bool
126 | 
127 |     def __init__(
128 |         self,
129 |         tracefile,
130 |         processors,
131 |         memory,
132 |         offset=0,
133 |         length=None,
134 |         restart=False,
135 |         ignore_memory=False,
136 |     ):
137 | 
138 |         super().__init__(
139 |             restart,
140 |             list(parse_swf(tracefile, processors, memory, ignore_memory)),
141 |         )
142 |         self.tracefile = tracefile
143 | 
144 |         if length is None:
145 |             length = len(self.trace)
146 |         else:
147 |             length = min(length, len(self.trace))
148 | 
149 |         self.trace = self.trace[offset:offset + length]
150 | 
151 |         self.current_element = 0
--------------------------------------------------------------------------------
/schedgym/scheduler/null_scheduler.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | """null_scheduler - a module that doesn't do any scheduling
5 | 
6 | The purpose of this module is to provide a way for clients of the simulator to
7 | implement different scheduling strategies.
8 | 
9 | Most notably, this can be used by learning agents to select which jobs to
10 | schedule in an iterative way.
11 | """
12 | 
13 | from typing import Optional
14 | 
15 | from ..job import Job
16 | from ..scheduler import Scheduler
17 | 
18 | # The main issue here is that we have two kinds of steps:
19 | # 1. OpenAI Gym steps
20 | # 2. Scheduler steps
21 | # For OpenAI Gym steps, we need to pass an action. For Scheduler steps, we need
22 | # to pass an offset.
23 | 
24 | 
25 | class NullScheduler(Scheduler):
26 |     """A scheduler that receives scheduling commands from a client.
27 | 
28 |     This is a null scheduler in the sense that scheduling decisions aren't made
29 |     by this class, but by another class, which forwards its decisions to this
30 |     one so that they can be propagated into the simulator. As such, this
31 |     implements the interface between RL environments (such as OpenAI gym)
32 |     and the scheduler simulator.
33 | 
34 |     Parameters
35 |     ----------
36 |     number_of_processors : int
37 |         The number of processors managed by this scheduler
38 |     total_memory : int
39 |         The total amount of memory in the cluster managed by this scheduler
40 |     """
41 | 
42 |     current_slot: Optional[int]
43 | 
44 |     def __init__(
45 |         self, number_of_processors, total_memory, ignore_memory=False
46 |     ):
47 |         self.current_slot: Optional[int] = None
48 |         super().__init__(
49 |             number_of_processors, total_memory, ignore_memory=ignore_memory
50 |         )
51 | 
52 |     def step(self, offset: Optional[int] = None) -> bool:
53 |         """Steps the scheduler by setting which job to choose.
54 | 
55 |         Uses the offset to select a position in the admission queue. If the
56 |         agent selects a job that doesn't fit the cluster, such a selection is
57 |         ignored by the scheduler.
58 | 
59 |         Differently from its base class, this method **does not** forward time.
60 |         For this, please see :func:`forward_time`.
61 | 
62 |         Parameters
63 |         ----------
64 |         offset : int
65 |             The offset in the admission queue of the job to select. Any
66 |             negative number represents a no-op.
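
        Example
        -------
        A minimal sketch of the agent loop this scheduler enables; the
        cluster sizes are made up and ``agent`` stands for a hypothetical
        decision maker::

            scheduler = NullScheduler(10, 10)
            for _ in range(100):
                action = agent.act()  # hypothetical policy
                if not scheduler.step(action):
                    scheduler.forward_time()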
67 | """ 68 | if self.current_slot is not None: 69 | raise AssertionError('current_slot invariant not true') 70 | 71 | self.current_slot = offset if offset is not None else -1 72 | return self.schedule() 73 | 74 | def forward_time(self): 75 | """Forwards time by one time step. 76 | 77 | For details, see :func:`step`. 78 | """ 79 | 80 | present = self.job_events.step(1) 81 | self.cluster = self.play_events( 82 | present, self.cluster, update_queues=True 83 | ) 84 | self.current_time += 1 85 | self.schedule() 86 | 87 | @property 88 | def action_space(self): 89 | """Helper that gives the number of actions available for the agent.""" 90 | # We always support the null action 91 | return len(self.queue_admission) + 1 92 | 93 | def schedule(self) -> bool: 94 | """Tries to schedule the job selected with :func:`step`. 95 | 96 | When :func:`step` is called, it stores the job currently selected by 97 | the client. This function will check in the queue which job the 98 | selection corresponds to and will check if the job fits in the cluster 99 | *right now*. If it does, the job is scheduled, otherwise, it is 100 | ignored. 101 | In either case, the current selection is cleared. 102 | 103 | Returns: 104 | bool: True if the selected job was scheduled. False otherwise. 105 | """ 106 | try: 107 | if ( 108 | self.current_slot is not None 109 | and len(self.queue_admission) > 0 110 | and 0 <= self.current_slot < len(self.queue_admission) 111 | ): 112 | job: Job = self.queue_admission[self.current_slot] 113 | if not self.cluster.fits(job): 114 | return False 115 | resources = self.can_schedule_now(job) 116 | if resources: 117 | self.assign_schedule(job, resources, self.current_time) 118 | self.queue_admission.pop(self.current_slot) 119 | return True 120 | return False 121 | return False 122 | finally: 123 | self.current_slot = None 124 | 125 | def sjf_lt( 126 | self, a: Job, b: Optional[Job] 127 | ): # pylint: disable=C, no-self-use 128 | """Comparison function that gives the same ordering SJF would give. 129 | 130 | Parameters 131 | ---------- 132 | a: Job 133 | A first job 134 | b: Job 135 | A second job 136 | 137 | Returns: 138 | bool: True if `a` is shorter than `b`. False otherwise. 139 | """ 140 | return b is None or (a.requested_time < b.requested_time) 141 | 142 | def sjf_action(self, limit: int) -> int: 143 | """Returns the index of the job SJF would pick. 144 | 145 | Parameters 146 | ---------- 147 | limit : int 148 | How far in the admission queue to look when searching for the 149 | shortest job. 150 | """ 151 | 152 | best = None 153 | bestidx = limit 154 | limits = slice(0, limit if limit >= 0 else None) 155 | for i, job in enumerate(self.queue_admission[limits]): 156 | if self.sjf_lt(job, best): 157 | if self.cluster.fits(job): 158 | best = job 159 | bestidx = i 160 | return bestidx 161 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """A setuptools based setup module. 
5 | See: 6 | https://packaging.python.org/guides/distributing-packages-using-setuptools/ 7 | https://github.com/pypa/sampleproject 8 | """ 9 | 10 | # Always prefer setuptools over distutils 11 | from setuptools import setup, find_packages, Extension 12 | import pathlib 13 | import schedgym # noqa 14 | try: 15 | from Cython.Build import cythonize 16 | except (NameError, ModuleNotFoundError): 17 | def cythonize(*args, **kwargs): 18 | pass 19 | 20 | here = pathlib.Path(__file__).parent.resolve() 21 | 22 | # Get the long description from the README file 23 | long_description = (here / 'README.rst').read_text(encoding='utf-8') 24 | 25 | extras = { 26 | 'render': [ 27 | 'matplotlib', 28 | 'pyglet', 29 | ], 30 | 'test': [ 31 | 'pytest', 32 | 'coverage', 33 | ], 34 | 'docs': [ 35 | 'Sphinx', 36 | 'docutils', 37 | 'nbsphinx', 38 | ] 39 | } 40 | 41 | extras['all'] = [item for group in extras.values() for item in group] 42 | 43 | # Arguments marked as "Required" below must be included for upload to PyPI. 44 | # Fields marked as "Optional" may be commented out. 45 | 46 | setup( 47 | name='sched-rl-gym', 48 | description='OpenAI Gym environment for HPC job scheduling', 49 | long_description=long_description, 50 | long_description_content_type='text/x-rst', 51 | url='https://github.com/renatolfc/sched-rl-gym', 52 | author='Renato L. de F. Cunha', 53 | author_email='renatocunha@acm.org', 54 | 55 | classifiers=[ 56 | 'Development Status :: 3 - Alpha', 57 | 58 | # Indicate who your project is intended for 59 | 'Intended Audience :: Developers', 60 | 'Intended Audience :: Information Technology', 61 | 62 | 'Topic :: Scientific/Engineering', 63 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 64 | 65 | # Pick your license as you wish 66 | 'License :: OSI Approved :: MIT License', 67 | 68 | # Specify the Python versions you support here. In particular, ensure 69 | # that you indicate you support Python 3. These classifiers are *not* 70 | # checked by 'pip install'. See instead 'python_requires' below. 71 | 'Programming Language :: Python :: 3', 72 | 'Programming Language :: Python :: 3.6', 73 | 'Programming Language :: Python :: 3.7', 74 | 'Programming Language :: Python :: 3.8', 75 | 'Programming Language :: Python :: 3.9', 76 | 'Programming Language :: Python :: 3 :: Only', 77 | ], 78 | 79 | keywords='gym, reinforcement learning, artificial intelligence', 80 | 81 | package_dir={'schedgym': 'schedgym'}, 82 | packages=find_packages(), 83 | python_requires='>=3.6, <4', 84 | 85 | # This field lists other packages that your project depends on to run. 86 | # Any package you put here will be installed by pip when your project is 87 | # installed, so they must be valid existing projects. 
88 | # 89 | # For an analysis of "install_requires" vs pip's requirements files see: 90 | # https://packaging.python.org/en/latest/requirements.html 91 | install_requires=[ 92 | 'gym', 93 | 'numpy', 94 | 'cython', 95 | 'intervaltree>=3.0', 96 | 'parallelworkloads', 97 | ], 98 | 99 | extras_require=extras, 100 | 101 | ext_modules=cythonize([ 102 | Extension('schedgym.job', ['schedgym/job.py']), 103 | Extension('schedgym.pool', ['schedgym/pool.py']), 104 | Extension('schedgym.simulator', ['schedgym/simulator.py']), 105 | Extension('schedgym.resource', ['schedgym/resource.py']), 106 | Extension('schedgym.cluster', ['schedgym/cluster.py']), 107 | Extension('schedgym.envs.workload', ['schedgym/envs/workload.py']), 108 | Extension('schedgym.envs.simulator', ['schedgym/envs/simulator.py']), 109 | Extension('schedgym.envs.compact_env', ['schedgym/envs/compact_env.py']), 110 | Extension('schedgym.envs.base', ['schedgym/envs/base.py']), 111 | Extension('schedgym.envs.deeprm_env', ['schedgym/envs/deeprm_env.py']), 112 | Extension('schedgym.scheduler.backfilling_scheduler', ['schedgym/scheduler/backfilling_scheduler.py']), 113 | Extension('schedgym.scheduler.null_scheduler', ['schedgym/scheduler/null_scheduler.py']), 114 | Extension('schedgym.scheduler.easy_scheduler', ['schedgym/scheduler/easy_scheduler.py']), 115 | Extension('schedgym.scheduler.fifo_scheduler', ['schedgym/scheduler/fifo_scheduler.py']), 116 | Extension('schedgym.scheduler.packer_scheduler', ['schedgym/scheduler/packer_scheduler.py']), 117 | Extension('schedgym.scheduler.random_scheduler', ['schedgym/scheduler/random_scheduler.py']), 118 | Extension('schedgym.scheduler.sjf_scheduler', ['schedgym/scheduler/sjf_scheduler.py']), 119 | Extension('schedgym.scheduler.tetris_scheduler', ['schedgym/scheduler/tetris_scheduler.py']), 120 | Extension('schedgym.workload.base', ['schedgym/workload/base.py']), 121 | Extension('schedgym.workload.trace', ['schedgym/workload/trace.py']), 122 | Extension('schedgym.workload.distribution', ['schedgym/workload/distribution.py']), 123 | Extension('schedgym.workload.swf_parser', ['schedgym/workload/swf_parser.py']), 124 | ], language_level=3), 125 | 126 | # List additional URLs that are relevant to your project as a dict. 127 | # 128 | # This field corresponds to the "Project-URL" metadata fields: 129 | # https://packaging.python.org/specifications/core-metadata/#project-url-multiple-use 130 | # 131 | # Examples listed include a pattern for specifying where the package tracks 132 | # issues, where the source is hosted, where to say thanks to the package 133 | # maintainers, and where to support the project financially. The key is 134 | # what's used to render the link text on PyPI. 
135 |     project_urls={  # Optional
136 |         'Bug Reports': 'https://github.com/renatolfc/sched-rl-gym/issues',
137 |         'Say Thanks!': 'https://saythanks.io/to/renatolfc',
138 |         'Source': 'https://github.com/renatolfc/sched-rl-gym',
139 |     },
140 | )
--------------------------------------------------------------------------------
/schedgym/envs/deeprm_env.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | from typing import Union
5 | 
6 | import numpy as np
7 | 
8 | import gym.spaces.box
9 | import gym.spaces.discrete
10 | import gym.spaces.tuple
11 | 
12 | from ..job import Job
13 | from .base import BaseRmEnv
14 | from .simulator import DeepRmSimulator
15 | from .workload import DeepRmWorkloadGenerator
16 | 
17 | import logging
18 | 
19 | logger = logging.getLogger(__name__)
20 | 
21 | MAXIMUM_QUEUE_SIZE = 16
22 | 
23 | RESOURCE_SLOTS = 10
24 | 
25 | NUMBER_OF_RESOURCES = 2
26 | MAX_TIME_TRACKING_SINCE_LAST_JOB = 10
27 | 
28 | 
29 | class DeepRmEnv(BaseRmEnv):
30 |     n_work: int
31 |     n_resources: int
32 |     use_raw_state: bool
33 |     simulator: DeepRmSimulator
34 |     workload: DeepRmWorkloadGenerator
35 |     observation_space: Union[gym.spaces.tuple.Tuple, gym.spaces.box.Box]
36 |     action_space: gym.spaces.discrete.Discrete
37 | 
38 |     metadata = {'render.modes': ['human', 'rgb_array']}
39 | 
40 |     def __init__(self, **kwargs):
41 |         super().__init__(**kwargs)
42 | 
43 |         self.use_raw_state = kwargs.get('use_raw_state', False)
44 | 
45 |         self.n_resources = kwargs.get(
46 |             'n_resources', NUMBER_OF_RESOURCES
47 |         )  # resources in the system
48 |         self.n_work = kwargs.get(
49 |             'n_work', MAXIMUM_QUEUE_SIZE
50 |         )  # max amount of work in the queue
51 |         if self.backlog_size % self.time_horizon:
52 |             raise AssertionError('Backlog must be a multiple of time horizon')
53 | 
54 |         self.backlog_width = self.backlog_size // self.time_horizon
55 | 
56 |         self.setup_spaces()
57 | 
58 |     def setup_spaces(self):
59 |         self.action_space = gym.spaces.discrete.Discrete(self.job_slots + 1)
60 |         if self.use_raw_state:
61 |             self.setup_raw_spaces()
62 |         else:
63 |             self.setup_image_spaces()
64 | 
65 |     def setup_image_spaces(self):
66 |         self.observation_space = gym.spaces.box.Box(
67 |             low=0.0,
68 |             high=1.0,
69 |             shape=(
70 |                 self.time_horizon,
71 |                 (
72 |                     (0 if self.ignore_memory else (self.job_slots + 1))
73 |                     * self.scheduler.total_memory
74 |                 )
75 |                 + (self.job_slots + 1) * self.scheduler.number_of_processors
76 |                 + self.backlog_width
77 |                 + 1,
78 |             ),
79 |         )
80 | 
81 |     def setup_raw_spaces(self):
82 |         self.memory_space = gym.spaces.box.Box(
83 |             low=0.0,
84 |             high=1.0,
85 |             shape=(self.time_horizon, self.scheduler.total_memory),
86 |         )
87 |         self.processor_space = gym.spaces.box.Box(
88 |             low=0.0,
89 |             high=1.0,
90 |             shape=(self.time_horizon, self.scheduler.number_of_processors),
91 |         )
92 |         self.backlog_space = gym.spaces.box.Box(
93 |             low=0.0, high=1.0, shape=(self.time_horizon, self.backlog_width)
94 |         )
95 |         self.memory_slots_space = gym.spaces.box.Box(
96 |             low=0.0,
97 |             high=1.0,
98 |             shape=(
99 |                 self.job_slots,
100 |                 self.time_horizon,
101 |                 self.scheduler.total_memory,
102 |             ),
103 |         )
104 |         self.processor_slots_space = gym.spaces.box.Box(
105 |             low=0.0,
106 |             high=1.0,
107 |             shape=(
108 |                 self.job_slots,
109 |                 self.time_horizon,
110 |                 self.scheduler.number_of_processors,
111 |             ),
112 |         )
113 |         self.time_since_space = gym.spaces.discrete.Discrete(self.time_horizon)
114 | 
115 |         self.observation_space = gym.spaces.tuple.Tuple(
116 |             (
117 | 
self.processor_space, 118 | self.memory_space, 119 | self.processor_slots_space, 120 | self.memory_slots_space, 121 | self.backlog_space, 122 | self.time_since_space, 123 | ) 124 | ) 125 | self.observation_space.n = np.sum( # type: ignore 126 | [ 127 | np.prod(e.shape) if isinstance(e, gym.spaces.box.Box) else e.n 128 | for e in self.observation_space 129 | ] 130 | ) 131 | 132 | @property 133 | def state(self): 134 | state, jobs, backlog = self.scheduler.state( 135 | self.time_horizon, self.job_slots 136 | ) 137 | s = self._convert_state( 138 | state, 139 | jobs, 140 | backlog, 141 | ( 142 | (self.simulator.current_time - self.simulator.last_job_time) 143 | / MAX_TIME_TRACKING_SINCE_LAST_JOB 144 | ), 145 | ) 146 | if self.use_raw_state: 147 | return s 148 | return self.pack_observation(s) 149 | 150 | def pack_observation(self, ob): 151 | current, wait, backlog, time = ob 152 | wait = wait.reshape(self.time_horizon, -1) 153 | current = current.reshape(self.time_horizon, -1) 154 | return np.hstack((current, wait, backlog, time)) 155 | 156 | def find_slot_position(self, action): 157 | if action < len(self.scheduler.queue_admission): 158 | return action 159 | return self.action_space.n - 1 160 | 161 | def step(self, action: int): 162 | done = False 163 | found = False 164 | if 0 <= action < self.action_space.n - 1: 165 | action = self.find_slot_position(action) 166 | found = True 167 | try: 168 | intermediate = self.simulator.rl_step( 169 | action if found else None, self.reward_mapper[self.reward_jobs] 170 | ) 171 | except StopIteration: 172 | intermediate = [[Job()]] 173 | done = True 174 | 175 | reward = self.reward if any(intermediate) else 0 176 | done = bool(self.time_limit) and ( 177 | self.scheduler.current_time > self.time_limit or done 178 | ) 179 | 180 | if not done and self.smdp and any(intermediate): 181 | rewards = [self.compute_reward(js) for js in intermediate] 182 | rewards[0] = 0 183 | reward = ( 184 | self.gamma ** np.arange(len(intermediate)) 185 | ).dot(rewards) 186 | 187 | return ( 188 | self.state, 189 | reward, 190 | done, 191 | self.stats if done else {} 192 | ) 193 | -------------------------------------------------------------------------------- /schedgym/pool.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """pool - Resource Pool management (see :class:`schedgym.cluster.Cluster`).""" 5 | 6 | import copy 7 | import enum 8 | from typing import Iterable, List, Optional 9 | 10 | from intervaltree import IntervalTree, Interval 11 | 12 | 13 | class ResourceType(enum.IntEnum): 14 | """Enumeration to determine which kind of resource we're managing.""" 15 | 16 | CPU = 1 17 | MEMORY = 0 18 | 19 | 20 | class ResourcePool: 21 | """A pool of resources. 22 | 23 | This is the basic structure managed by a :class:`schedgym.cluster.Cluster`. 24 | 25 | Parameters 26 | ---------- 27 | resource_type : ResourceType 28 | The type of resource in this pool 29 | size : int 30 | The amount of resources available in this pool 31 | used_pool : IntervalTree 32 | The set of resources currently in use in this resource pool. 
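
    Example
    -------
    A small sketch of the allocation cycle (the pool size, request size, and
    job id below are made up)::

        cpus = ResourcePool(ResourceType.CPU, 16)
        intervals = cpus.find(4, data=1)  # locate 4 units for job id 1
        if intervals:  # an empty tree means the request does not fit
            cpus.allocate(intervals)
            ...
            cpus.free(intervals)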
33 | """ 34 | 35 | used_pool: IntervalTree 36 | 37 | def __init__( 38 | self, 39 | resource_type: ResourceType, 40 | size: int, 41 | used_pool: IntervalTree = None, 42 | ): 43 | self.size = size 44 | self.used_resources = 0 45 | self.type = resource_type 46 | if used_pool is None: 47 | self.used_pool = IntervalTree() 48 | else: 49 | self.used_pool = used_pool 50 | self.used_resources = sum( 51 | [ResourcePool.measure(i) for i in used_pool] 52 | ) 53 | 54 | def clone(self): 55 | """Duplicates this ResourcePool in memory.""" 56 | return copy.deepcopy(self) 57 | 58 | @property 59 | def free_resources(self) -> int: 60 | """Returns the amount of free resources in this resource pool""" 61 | return self.size - self.used_resources 62 | 63 | def fits(self, size) -> bool: 64 | """Checks whether a given amount of resources can be allocated. 65 | 66 | Parameters 67 | ---------- 68 | size : int 69 | The amount of resources to allocate in this pool 70 | 71 | Returns: 72 | bool: True when the size fits the pool, and False otherwise. 73 | """ 74 | if size <= 0: 75 | raise AssertionError("Can't allocate zero resources") 76 | return size <= self.free_resources 77 | 78 | @staticmethod 79 | def measure(interval: Interval): 80 | """Measures the size of an interval. 81 | 82 | Parameters 83 | ---------- 84 | interval : Interval 85 | The interval to be measured. 86 | """ 87 | return interval.end - interval.begin 88 | 89 | def find(self, size: int, data: Optional[int] = None) -> IntervalTree: 90 | """Finds an interval tree of a given size in this resource pool. 91 | 92 | This is essentially an operation to find *which* resources to allocate 93 | considering that we manage individual resource units and guarantee 94 | exclusive usage by a resource unit. 95 | 96 | Parameters 97 | ---------- 98 | size : int 99 | The size (amount) of resources to allocate 100 | data : Optional[int] 101 | The identifier of the "owner" of the found resources. This 102 | allows us to keep track which job "owns" which resources during 103 | execution. 104 | 105 | Returns: 106 | IntervalTree: An interval tree with the size requested if such 107 | a tree can be found. Otherwise, an empty tree is returned. 108 | """ 109 | used = IntervalTree() 110 | if not self.fits(size): 111 | return used 112 | free = IntervalTree([Interval(0, self.size, data)]) 113 | used_size: int = 0 114 | for interval in self.used_pool: 115 | free.chop(interval.begin, interval.end) 116 | for interval in free: 117 | temp_size = ResourcePool.measure(interval) + used_size 118 | if temp_size == size: 119 | used.add(interval) 120 | break 121 | if temp_size < size: 122 | used.add(interval) 123 | used_size = temp_size 124 | else: 125 | used.add( 126 | Interval( 127 | interval.begin, interval.begin + size - used_size, data 128 | ) 129 | ) 130 | break 131 | return used 132 | 133 | def allocate(self, intervals: Iterable[Interval]) -> None: 134 | """Adds a set of intervals to the current used pool of resources. 135 | 136 | This is the opposite of :func:`schedgym.cluster.Cluster.free`. 137 | 138 | Parameters 139 | ---------- 140 | intervals : Iterable[Interval] 141 | The set of intervals that should be allocated (most likely, 142 | this will be the resource of calling 143 | :func:`schedgym.cluster.Cluster.find`). 
144 | 145 | Returns: 146 | None 147 | """ 148 | for i in intervals: 149 | if self.used_resources + self.measure(i) > self.size: 150 | raise AssertionError( 151 | 'Tried to allocate past size of resource pool' 152 | ) 153 | self.used_pool.add(i) 154 | self.used_resources += self.measure(i) 155 | 156 | def free(self, intervals: Iterable[Interval]) -> None: 157 | """Frees a set of used resources. 158 | 159 | This is the opposite of :func:`schedgym.cluster.Cluster.allocate`. 160 | 161 | Parameters 162 | ---------- 163 | intervals : Iterable[Interval] 164 | The set of intervals to be freed (most likely, these will have 165 | been allocated with the output of 166 | :func:`schedgym.cluster.Cluster.find`). 167 | """ 168 | for i in intervals: 169 | if i not in self.used_pool: 170 | raise AssertionError('Tried to free unused resource set') 171 | self.used_pool.remove(i) 172 | self.used_resources -= self.measure(i) 173 | 174 | @property 175 | def intervals(self) -> List[Interval]: 176 | """The set of intervals currently used in this resource pool.""" 177 | # pylint: disable=unnecessary-comprehension 178 | return [i for i in self.used_pool] 179 | 180 | def __repr__(self): 181 | return ( 182 | f'ResourcePool(resource_type={self.type}, ' 183 | f'size={self.size}, used_pool={self.used_pool})' 184 | ) 185 | -------------------------------------------------------------------------------- /schedgym/event.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """event - Event Handling classes 5 | 6 | We have a basic Event type, which is specialized by 7 | 1. A ResourceEvent, related to events that occur to resources and 8 | 2. A JobEvent, related to events that occur to jobs 9 | """ 10 | 11 | import copy 12 | import enum 13 | import warnings 14 | from typing import List, Optional, Iterable, TypeVar, Generic, Iterator 15 | 16 | from intervaltree import Interval 17 | 18 | from .job import Job 19 | from .heap import Heap 20 | from .pool import ResourceType 21 | 22 | T = TypeVar('T', bound='Event') # pylint: disable=C 23 | 'Generic type for type annotations' 24 | 25 | 26 | class EventType(enum.IntEnum): 27 | """Enumeration for the different types of events that can occur.""" 28 | 29 | RESOURCE_ALLOCATE = 0 30 | RESOURCE_FREE = 1 31 | JOB_FINISH = 2 32 | JOB_START = 3 33 | 34 | 35 | class Event: 36 | """A base event class. 37 | 38 | Parameters 39 | ---------- 40 | time : int 41 | The time at which this event occurs 42 | type : EventType 43 | What is the type of this event 44 | """ 45 | 46 | time: int 47 | type: EventType 48 | 49 | def __init__(self, time: int, type: EventType): 50 | # pylint: disable=redefined-builtin 51 | self.time = time 52 | self.type = type 53 | 54 | def clone(self): 55 | """Clones this event. 56 | 57 | Returns: 58 | A new event identical to this one, but with no memory sharing. 59 | """ 60 | return copy.copy(self) 61 | 62 | 63 | class ResourceEvent(Event): 64 | """An event related to resource allocation or to the freeing of resources. 
65 | 
66 |     Parameters
67 |     ----------
68 |     time : int
69 |         The time at which this event occurs
70 |     type : EventType
71 |         What is the type of this event
72 |     resources : Iterable[Interval]
73 |         The resources that are being allocated/free'd by this event
74 |     """
75 | 
76 |     resources: Iterable[Interval]
77 |     resource_type: ResourceType
78 | 
79 |     def __init__(
80 |         self,
81 |         time: int,
82 |         type: EventType,
83 |         resource_type: ResourceType,
84 |         resources: Iterable[Interval],
85 |     ):
86 |         # pylint: disable=redefined-builtin
87 |         super().__init__(time, type)
88 |         self.resources = resources
89 |         self.resource_type = resource_type
90 | 
91 | 
92 | class JobEvent(Event):
93 |     """An event related to the start or finish of jobs.
94 | 
95 |     Parameters
96 |     ----------
97 |     time : int
98 |         The time at which this event occurs
99 |     type : EventType
100 |         What is the type of this event
101 |     job : Job
102 |         The job to which this event applies
103 |     """
104 | 
105 |     job: Job
106 | 
107 |     def __init__(self, time: int, type: EventType, job: Job):
108 |         # pylint: disable=redefined-builtin
109 |         super().__init__(time, type)
110 |         self.job = job
111 | 
112 |     @property
113 |     def processors(self) -> Iterable[Interval]:
114 |         """The processors touched by the job that caused this event"""
115 |         return self.job.resources.processors
116 | 
117 |     @property
118 |     def memory(self) -> Iterable[Interval]:
119 |         """The memory touched by the job that caused this event"""
120 |         return self.job.resources.memory
121 | 
122 |     def __str__(self):
123 |         return f'JobEvent<{self.time}, {self.type.name}, {self.job}>'
124 | 
125 |     def __repr__(self):
126 |         return str(self)
127 | 
128 | 
129 | class EventQueue(Generic[T]):
130 |     """A priority queue of events sorted by time.
131 | 
132 |     Parameters
133 |     ----------
134 |     time : int
135 |         The moment in time this event queue begins.
136 |     """
137 | 
138 |     time: int
139 |     past: List[T]
140 |     future: Heap[T]
141 | 
142 |     def __init__(self, time: int = 0):
143 |         self.past = []
144 |         self.time = time
145 |         self.future = Heap()
146 | 
147 |     def add(self, event: T) -> None:
148 |         """Adds a new event to the priority queue.
149 | 
150 |         Parameters
151 |         ----------
152 |         event
153 |             The event to be added
154 |         """
155 |         if event.time >= self.time:
156 |             self.future.add(event, (event.time, event.type))
157 |         else:
158 |             self.past.append(event)
159 |             self.past.sort(key=lambda e: e.time)
160 |             warnings.warn(
161 |                 'Adding events to the past might change the '
162 |                 'ordering of events that happened at the same '
163 |                 'time.'
164 |             )
165 | 
166 |     def step(self, time: int = 1) -> Iterable[T]:
167 |         """Steps time in the event queue.
168 | 
169 |         Parameters
170 |         ----------
171 |         time : int
172 |             The amount of time steps to perform
173 | 
174 |         Returns:
175 |             A list with all events that happened between the previous time and
176 |             the current time.
177 |         """
178 |         if time < 0:
179 |             raise AssertionError('Tried to move into the past.')
180 |         self.time += time
181 |         present: List[T] = []
182 |         first = self.future.first
183 |         while first and first.time <= self.time:
184 |             current = self.future.pop()
185 |             present.append(current)
186 |             self.past.append(current)
187 |             first = self.future.first
188 |         return present
189 | 
190 |     def remove(self, event: Event) -> None:
191 |         """Removes an event from the queue.
192 | 
193 |         The event is required to not have happened yet, as removal of past
194 |         events is not supported.
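
        Example
        -------
        A small sketch (the event time below is made up)::

            queue = EventQueue()
            event = Event(5, EventType.JOB_START)
            queue.add(event)
            queue.remove(event)  # fine: the event has not happened yet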
195 | """ 196 | if event not in self.future: 197 | raise ValueError('Tried to remove non-existant value') 198 | self.future.remove(event) 199 | 200 | @property 201 | def first(self) -> Optional[T]: # XXX: This is probably not needed 202 | """The first event in the future to happen in this queue.""" 203 | return self.future.first 204 | 205 | @property 206 | def next(self) -> Optional[T]: 207 | """The next event to happen in this queue.""" 208 | if len(self.future) == 0: 209 | return None 210 | return self.future.first 211 | 212 | @property 213 | def last(self) -> Optional[T]: 214 | """The last event to have happened in this queue.""" 215 | return self.past[-1] if self.past else None 216 | 217 | def __iter__(self) -> Iterator[T]: 218 | return self.future.heapsort() 219 | 220 | def __str__(self) -> str: 221 | return f'{[e for e in self.future.heapsort()]}' 222 | 223 | def __repr__(self): 224 | return str(self) 225 | -------------------------------------------------------------------------------- /schedgym/envs/simulator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from enum import IntEnum 5 | from typing import Callable, List, Optional, Union, cast 6 | 7 | from schedgym.job import Job 8 | from schedgym.scheduler import NullScheduler 9 | from schedgym.envs.workload import ( 10 | DeepRmWorkloadGenerator, 11 | SyntheticWorkloadGenerator, 12 | ) 13 | 14 | WorkloadGeneratorType = Union[ 15 | DeepRmWorkloadGenerator, SyntheticWorkloadGenerator 16 | ] 17 | 18 | 19 | class SimulationType(IntEnum): 20 | EVENT_BASED = (0,) 21 | TIME_BASED = 1 22 | 23 | @staticmethod 24 | def from_str(simulation_type: str): 25 | simulation_type = simulation_type.upper().replace('-', '_') 26 | if simulation_type in SimulationType.__members__: 27 | return SimulationType[simulation_type] 28 | else: 29 | raise ValueError( 30 | f'{simulation_type} is not a valid SimulationType.' 
31 | ) 32 | 33 | 34 | class DeepRmSimulator: 35 | scheduler: NullScheduler 36 | workload: Union[DeepRmWorkloadGenerator, SyntheticWorkloadGenerator] 37 | 38 | def __init__( 39 | self, 40 | workload_generator: WorkloadGeneratorType, 41 | scheduler: NullScheduler, 42 | simulation_type: SimulationType = SimulationType.TIME_BASED, 43 | job_slots: Optional[int] = None, 44 | ): 45 | 46 | self.scheduler = scheduler 47 | self.workload = workload_generator 48 | self.simulation_type = simulation_type 49 | self.job_slots = slice(0, job_slots) 50 | self.simulator = self.build() 51 | self.reset(self.workload, scheduler) 52 | 53 | def rl_step( 54 | self, 55 | action: Optional[int], 56 | listjobs: Optional[Callable[[], List[Job]]], 57 | ) -> List[List[Job]]: 58 | return self.simulator.rl_step( 59 | action if action is not None else -1, 60 | listjobs if listjobs else lambda: self.scheduler.jobs_in_system, 61 | ) 62 | 63 | def build(self): 64 | if self.simulation_type == SimulationType.EVENT_BASED: 65 | return EventBasedDeepRmSimulator( 66 | self.workload, 67 | self.scheduler, 68 | self.job_slots, 69 | ) 70 | elif self.simulation_type == SimulationType.TIME_BASED: 71 | return TimeBasedDeepRmSimulator( 72 | self.workload, 73 | self.scheduler, 74 | self.job_slots, 75 | ) 76 | else: 77 | raise NotImplementedError( 78 | f'Unsupported simulation type {self.simulation_type}' 79 | ) 80 | 81 | @property 82 | def current_time(self): 83 | return self.simulator.current_time 84 | 85 | @property 86 | def last_job_time(self): 87 | return self.simulator.last_job_time 88 | 89 | def reset(self, workload, scheduler): 90 | self.scheduler = scheduler 91 | self.workload = workload 92 | self.simulator = self.build() 93 | 94 | 95 | class EventBasedDeepRmSimulator: 96 | last_job_time: int 97 | scheduler: NullScheduler 98 | job_slots: slice 99 | 100 | def __init__( 101 | self, 102 | workload_generator: WorkloadGeneratorType, 103 | scheduler: NullScheduler, 104 | job_slots: slice, 105 | ): 106 | if ( 107 | not isinstance(workload_generator, DeepRmWorkloadGenerator) 108 | and not isinstance(workload_generator, SyntheticWorkloadGenerator) 109 | ) or not isinstance(scheduler, NullScheduler): 110 | raise AssertionError('Invalid arguments received.') 111 | 112 | self.current_time = 0 113 | self.scheduler = scheduler 114 | self.simulation_start_time = 0 115 | self.workload = workload_generator 116 | self.job_slots = job_slots 117 | 118 | self.current_time = self.last_job_time = 0 119 | if isinstance(workload_generator, SyntheticWorkloadGenerator): 120 | first_job_time = cast( 121 | Job, workload_generator.peek() 122 | ).submission_time - 1 123 | workload_generator.current_time = first_job_time 124 | scheduler.job_events.time = first_job_time 125 | scheduler.current_time = first_job_time 126 | self.current_time = first_job_time 127 | 128 | def rl_step( 129 | self, action: int, listjobs: Callable[[], List[Job]] 130 | ) -> List[List[Job]]: 131 | "Returns a list of jobs for each successful intermediate time step." 
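        # Control flow: first try to apply the selected action; when the
        # scheduling decision succeeds, the simulation clock does not move.
        # Otherwise, submit newly-generated jobs and forward time until at
        # least one job in the visible slots fits the cluster, collecting
        # the jobs listed by ``listjobs`` at each intermediate time step.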
132 | 133 | if self.scheduler.step(action): 134 | return [[]] 135 | 136 | jobs: List[List[Job]] = [] 137 | self.current_time += 1 138 | while True: 139 | j = self.workload.step() 140 | if j: 141 | self.scheduler.submit(j) 142 | self.last_job_time = self.current_time 143 | self.scheduler.forward_time() 144 | jobs.append(listjobs()) 145 | if self.scheduler.some_job_fits(self.job_slots): 146 | break 147 | return jobs 148 | 149 | 150 | class TimeBasedDeepRmSimulator: 151 | last_job_time: int 152 | scheduler: NullScheduler 153 | job_slots: slice 154 | 155 | def __init__( 156 | self, 157 | workload_generator: WorkloadGeneratorType, 158 | scheduler: NullScheduler, 159 | job_slots: slice, 160 | ): 161 | if ( 162 | not isinstance(workload_generator, DeepRmWorkloadGenerator) 163 | and not isinstance(workload_generator, SyntheticWorkloadGenerator) 164 | ) or not isinstance(scheduler, NullScheduler): 165 | raise AssertionError('Invalid arguments received.') 166 | 167 | self.scheduler = scheduler 168 | self.simulation_start_time = 0 169 | self.workload = workload_generator 170 | self.current_time = self.last_job_time = 0 171 | self.job_slots = job_slots 172 | 173 | if isinstance(workload_generator, SyntheticWorkloadGenerator): 174 | first_job_time = cast( 175 | Job, workload_generator.peek() 176 | ).submission_time - 1 177 | workload_generator.current_time = first_job_time 178 | scheduler.job_events.time = first_job_time 179 | scheduler.current_time = first_job_time 180 | 181 | def step(self, _=True): 182 | """Not implemented in DeepRmSimulator""" 183 | raise NotImplementedError('This simulator cannot follow the base API') 184 | 185 | def rl_step( 186 | self, action: int, listjobs: Callable[[], List[Job]] 187 | ) -> List[List[Job]]: 188 | "Returns a list of jobs for each successful intermediate time step." 
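        # Control flow: a successful scheduling decision is free (the clock
        # does not move); any other action advances the simulation by exactly
        # one time step, submitting whatever jobs the workload generated.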
189 | 190 | if self.scheduler.step(action): 191 | return [[]] 192 | else: 193 | self.current_time += 1 194 | j = self.workload.step() 195 | if j: 196 | self.scheduler.submit(j) 197 | self.last_job_time = self.current_time 198 | self.scheduler.forward_time() 199 | return [listjobs()] 200 | -------------------------------------------------------------------------------- /schedgym/envs/compact_env.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import numpy as np 5 | import gym.spaces.box 6 | import gym.spaces.discrete 7 | 8 | from ..job import Job 9 | from .base import BaseRmEnv 10 | 11 | import logging 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | MAXIMUM_JOB_LENGTH = 15 17 | RESOURCE_SLOTS = 10 18 | MAXIMUM_JOB_SIZE = 10 19 | 20 | AMOUNT_OF_MEMORY = 10 21 | NUMBER_OF_RESOURCES = 2 22 | NUMBER_OF_PROCESSORS = 10 23 | MAXIMUM_NUMBER_OF_ACTIVE_JOBS = 40 # Number of colors in image 24 | MAX_TIME_TRACKING_SINCE_LAST_JOB = 10 25 | 26 | NEW_JOB_RATE = 0.7 27 | SMALL_JOB_CHANCE = 0.8 28 | 29 | DEFAULT_WORKLOAD = { 30 | 'type': 'deeprm', 31 | 'new_job_rate': NEW_JOB_RATE, 32 | 'max_job_size': MAXIMUM_JOB_SIZE, 33 | 'max_job_len': MAXIMUM_JOB_LENGTH, 34 | 'small_job_chance': SMALL_JOB_CHANCE, 35 | } 36 | 37 | 38 | class CompactRmEnv(BaseRmEnv): 39 | metadata = {'render.modes': ['human', 'rgb_array']} 40 | 41 | def __init__(self, **kwargs): 42 | super().__init__(**kwargs) 43 | 44 | self.memory = kwargs.get('memory', AMOUNT_OF_MEMORY) 45 | self.processors = kwargs.get('processors', NUMBER_OF_PROCESSORS) 46 | 47 | self.renderer = kwargs.get('renderer', None) 48 | 49 | self.maximum_work = self.processors 50 | self.maximum_work_mem = self.memory 51 | 52 | self._setup_spaces() 53 | 54 | def _setup_spaces(self): 55 | self.action_space = gym.spaces.discrete.Discrete(self.job_slots + 1) 56 | 57 | self.observation_space = gym.spaces.box.Box( 58 | low=0.0, high=1.0, shape=((len(self.state),)), dtype=np.float32 59 | ) 60 | 61 | def reset(self) -> np.ndarray: 62 | super().reset() 63 | self.maximum_work = self.time_limit * self.processors 64 | self.maximum_work_mem = self.time_limit * self.memory 65 | return super().reset() 66 | 67 | def step(self, action: int): 68 | done = False 69 | found = True 70 | if not (0 <= action < self.action_space.n - 1): 71 | found = False 72 | 73 | try: 74 | intermediate = self.simulator.rl_step( 75 | action if found else None, self.reward_mapper[self.reward_jobs] 76 | ) 77 | # XXX: This is technically incorrect. The correct thing to do here 78 | # is: when we have a trace-based workload generator, we need to 79 | # maintain a check on whether we want to sample from it or not, and 80 | # use the time limit to actually decide whether we're done or not. 81 | # In the current setting, we might potentially "lose" the last jobs 82 | # of the workload. 
83 | except StopIteration: 84 | intermediate = [[Job()]] 85 | done = True 86 | 87 | reward = self.reward if any(intermediate) else 0 88 | done = bool(self.time_limit) and ( 89 | self.scheduler.current_time > self.time_limit or done 90 | ) 91 | 92 | if not done and self.smdp and any(intermediate): 93 | rewards = [self.compute_reward(js) for js in intermediate] 94 | rewards[0] = 0 95 | reward = ( 96 | self.gamma ** np.arange(len(intermediate)) 97 | ).dot(rewards) 98 | 99 | return ( 100 | self.state, 101 | reward, 102 | done, 103 | self.stats if done else {} 104 | ) 105 | 106 | @property 107 | def state(self): 108 | state, jobs, backlog = self.scheduler.state( 109 | self.time_horizon, self.job_slots 110 | ) 111 | newstate = np.zeros( 112 | (len(state[0]) * (1 if self.ignore_memory else 2) * 2) 113 | ) 114 | newstate[: len(state[0]) * 2] = ( 115 | np.array( 116 | [(e[0], e[1]) for e in state[0]], 117 | dtype=np.float32 118 | ).reshape((-1,),) / self.processors 119 | ) 120 | if not self.ignore_memory: 121 | newstate[len(state[0]) * 2:] = ( 122 | np.array( 123 | [(e[0], e[1]) for e in state[1]], 124 | dtype=np.float32 125 | ).reshape((-1,)) / self.memory 126 | ) 127 | jobs = self._normalize_jobs(jobs).reshape((-1,)) 128 | backlog = backlog * np.ones(1) / self.backlog_size 129 | 130 | running = [ 131 | j 132 | for j in self.scheduler.queue_running 133 | if j.submission_time + j.requested_time 134 | > self.scheduler.current_time 135 | ] 136 | 137 | remaining_work = ( 138 | sum( 139 | [ 140 | ( 141 | j.submission_time 142 | + j.requested_time 143 | - self.scheduler.current_time 144 | ) 145 | * j.requested_processors 146 | for j in running 147 | ] 148 | ) 149 | / self.maximum_work 150 | ) 151 | remaining_work_mem = ( 152 | sum( 153 | [ 154 | ( 155 | j.submission_time 156 | + j.requested_time 157 | - self.scheduler.current_time 158 | ) 159 | * j.requested_memory 160 | for j in running 161 | ] 162 | ) 163 | / self.maximum_work_mem 164 | ) 165 | 166 | # XXX: this normalization only works while we're sampling at most one 167 | # job per time step. 
Once this is not true, we risk having the
168 |         # queue_size feature > 1.0 (which is incorrect)
169 |         queue_size = len(self.scheduler.queue_admission) / self.time_limit
170 |         time_left = 1 - self.scheduler.current_time / self.time_limit
171 | 
172 |         try:
173 |             next_free = min(
174 |                 running, key=lambda x: x.start_time + x.requested_time
175 |             )
176 |             next_free = np.array(
177 |                 (
178 |                     (
179 |                         next_free.start_time
180 |                         + next_free.requested_time
181 |                         - self.scheduler.current_time
182 |                     )
183 |                     / self.time_limit,
184 |                     next_free.requested_processors / self.processors,
185 |                     (state[0][0][0] + next_free.requested_processors)
186 |                     / self.processors,
187 |                 )
188 |             )
189 |         except ValueError:
190 |             next_free = np.array((0, 0, 1.0))
191 | 
192 |         return np.hstack(
193 |             (
194 |                 newstate,
195 |                 jobs,
196 |                 backlog,
197 |                 next_free,
198 |                 np.array(
199 |                     (remaining_work, remaining_work_mem, queue_size, time_left)
200 |                 ),
201 |             ),
202 |         )
203 | 
204 |     def _normalize_jobs(self, jobs):
205 |         def _sumdiv(arr, idx, orig, limit):
206 |             arr[idx] = (orig + 1) / (limit + 1)
207 | 
208 |         ret = np.zeros((len(jobs), len(jobs[0])), dtype=np.float32)
209 |         for i, job in enumerate(jobs):
210 |             _sumdiv(ret[i], 0, job.submission_time, self.time_limit)
211 |             _sumdiv(ret[i], 1, job.requested_time, self.time_limit)
212 |             _sumdiv(ret[i], 2, job.requested_memory, self.memory)
213 |             _sumdiv(ret[i], 3, job.requested_processors, self.processors)
214 |             _sumdiv(ret[i], 4, job.queue_size, self.time_limit)
215 |             _sumdiv(
216 |                 ret[i],
217 |                 5,
218 |                 job.queued_work,
219 |                 self.time_limit * self.time_limit * self.processors,
220 |             )
221 |             _sumdiv(ret[i], 6, job.free_processors, self.processors)
222 |         return ret
--------------------------------------------------------------------------------
/docs/img/job-resource.svg:
--------------------------------------------------------------------------------
1 | [SVG class diagram: a Job holds a Resource (resources: Resource); a Resource holds processors: IntervalTree and memory: IntervalTree]
--------------------------------------------------------------------------------
/docs/img/cluster-resourcepool.svg:
--------------------------------------------------------------------------------
1 | [SVG class diagram: a Cluster is backed by ResourcePools; each ResourcePool manages an IntervalTree of used resources]
--------------------------------------------------------------------------------
/schedgym/cluster.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | """cluster - Classes for cluster management
5 | 
6 | The workhorse of this module is the :class:`schedgym.cluster.Cluster` class,
7 | which manages resources in a cluster.
8 | """
9 | 
10 | import copy
11 | from typing import Tuple, Iterable, Optional
12 | 
13 | from . import pool
14 | 
15 | from .job import Job, Resource
16 | from .event import JobEvent, EventType
17 | 
18 | # pylint: disable=C
19 | RESOURCE_TYPE = Tuple[Iterable[pool.Interval], Iterable[pool.Interval]]
20 | 
21 | 
22 | class Cluster:
23 |     """A cluster as a set of resources.
24 | 
25 |     Currently, this doesn't make a distinction between machines. So it only
26 |     manages groups of resources.
27 | 
28 |     Note that although we don't differentiate between machines, we **do** honor
29 |     resources. Therefore, if a given processor is allocated to a job *j*, we make
30 |     sure not to allocate that processor to any other job until *j* finishes.
31 | 
32 |     Due to the above constraint, some checks are more complex (and,
33 |     consequently, slower) than if we disregarded *which* processors and memory
34 |     units were used and only counted the *amount* of resources used.
35 | 
36 |     This makes our design slightly closer to reality, though.
37 | 
38 |     The figure below shows the relationship between clusters, ResourcePools,
39 |     and the basic data structure for resource management (`IntervalTree`).
40 | 
41 |     .. image:: /img/cluster-resourcepool.svg
42 | 
43 |     Parameters
44 |     ----------
45 |     processors : int
46 |         The number of processors in this cluster
47 |     memory : int
48 |         The amount of memory in this cluster
49 |     ignore_memory : bool
50 |         Whether memory should be considered for decisions or not
51 |     used_processors : Optional[Resource]
52 |         Processors already in use in this cluster
53 |     used_memory : Optional[Resource]
54 |         Amount of memory already used in this cluster
55 |     """
56 | 
57 |     ignore_memory: bool
58 |     memory: pool.ResourcePool
59 |     processors: pool.ResourcePool
60 | 
61 |     def __init__(
62 |         self,
63 |         processors: int,
64 |         memory: int,
65 |         ignore_memory: bool = False,
66 |         used_processors: Optional[Resource] = None,
67 |         used_memory: Optional[Resource] = None,
68 |     ):
69 |         self.ignore_memory = ignore_memory
70 |         self.memory = pool.ResourcePool(
71 |             pool.ResourceType.MEMORY, memory, used_memory
72 |         )
73 |         self.processors = pool.ResourcePool(
74 |             pool.ResourceType.CPU, processors, used_processors
75 |         )
76 | 
77 |     @property
78 |     def free_resources(self) -> Tuple[int, int]:
79 |         """The set of resources *not* in use in this cluster."""
80 |         return self.processors.free_resources, self.memory.free_resources
81 | 
82 |     def fits(self, job: Job) -> bool:
83 |         """Checks whether a job fits in this cluster.
84 | 
85 |         Parameters
86 |         ----------
87 |         job : Job
88 |             The job to check for a fit in this cluster
89 | 
90 |         Returns:
91 |             True if the job fits the cluster (can be added to the cluster), and
92 |             False otherwise
93 |         """
94 |         return self.processors.fits(job.requested_processors) and (
95 |             self.ignore_memory or self.memory.fits(job.requested_memory)
96 |         )
97 | 
98 |     def allocate(self, job: Job) -> None:
99 |         """Checks whether a job fits the system and allocates resources for it.
100 | 
101 |         Parameters
102 |         ----------
103 |         job : Job
104 |             The job to allocate resources to.
105 |         """
106 |         if not self.fits(job):
107 |             raise AssertionError(
108 |                 f'Unable to allocate resources for {job} in {self}'
109 |             )
110 |         self.processors.allocate(job.resources.processors)
111 |         self.memory.allocate(job.resources.memory)
112 | 
113 |     def clone(self):
114 |         """Clones this Cluster (duplicating it in memory)."""
115 |         return copy.deepcopy(self)
116 | 
117 |     def find(self, job: Job) -> Resource:
118 |         """Finds resources for a job.
119 | 
120 |         If the job fits in the system, this will return a set of resources that
121 |         can be used by a job. If it doesn't, it will return an empty set of
122 |         resources (which evaluates to False in boolean expressions).
123 | 
124 |         Parameters
125 |         ----------
126 |         job : Job
127 |             The job to find resources for.
128 |         """
129 |         p = self.processors.find(job.requested_processors, job.id)
130 |         if not p:
131 |             return Resource()
132 |         if self.ignore_memory:
133 |             return Resource(p, ignore_memory=True)
134 |         m = self.memory.find(job.requested_memory, job.id)
135 |         return Resource(p, m)
136 | 
137 |     def free(self, job: Job) -> None:
138 |         """Frees the resources used by a job.
139 | 
140 |         Parameters
141 |         ----------
142 |         job : Job
143 |             The job to free resources from.
144 |         """
145 |         self.processors.free(job.resources.processors)
146 |         if not self.ignore_memory:
147 |             self.memory.free(job.resources.memory)
148 | 
149 |     def find_resources_at_time(
150 |         self, time: int, job: Job, events: Iterable[JobEvent]
151 |     ) -> Resource:
152 |         """Finds resources for a job at a given time step.
153 | 
154 |         This is probably the most complex (and most important) function in this
155 |         class. To find an allocation for a job, we have to iterate through the
156 |         queue of events and evaluate the state of the system given that set
157 |         of events to check whether a given job would fit the system.
158 | 
159 |         Since this method can be called with time stamps in the far future, we
160 |         are required to play events to find the exact configuration in the
161 |         future.
162 | 
163 |         Parameters
164 |         ----------
165 |         time : int
166 |             The time at which to check whether the job fits the system
167 |         job : Job
168 |             The job to check
169 |         events : Iterable[JobEvent]
170 |             A set of events that will play out in the future
171 | 
172 |         Returns:
173 |             A set of resources if the job fits the cluster at time `time`, or
174 |             an empty set of resources otherwise. (See
175 |             :func:`schedgym.cluster.Cluster.find`.)
176 |         """
177 |         def valid(e, time):
178 |             return time + 1 <= e.time < job.requested_time + time
179 | 
180 |         used = Resource(self.processors.used_pool, self.memory.used_pool)
181 |         for event in (
182 |             e
183 |             for e in events
184 |             if (valid(e, time) and e.type == EventType.JOB_START)
185 |         ):
186 |             for i in event.processors:
187 |                 used.processors.add(i)
188 |             for i in event.memory:
189 |                 used.memory.add(i)
190 |         used.processors.merge_overlaps()
191 |         used.memory.merge_overlaps()
192 |         return Cluster(
193 |             self.processors.size,
194 |             self.memory.size,
195 |             self.ignore_memory,
196 |             used.processors,
197 |             used.memory,
198 |         ).find(job)
199 | 
200 |     @property
201 |     def state(self) -> Tuple[Tuple[int, int, dict], ...]:
202 |         """Gets the current state of the cluster.
203 | 
204 |         Returns:
205 |             Tuple: a pair containing the number of processors used and the
206 |             memory used and the jobs that are using such resources.
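
        Example
        -------
        An illustrative sketch (the cluster sizes below are made up)::

            cluster = Cluster(processors=10, memory=10)
            (free_procs, used_procs, owners), _ = cluster.state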
207 | """ 208 | processors = ( 209 | self.processors.free_resources, 210 | self.processors.used_resources, 211 | {(i.begin, i.end): i.data for i in self.processors.used_pool}, 212 | ) 213 | memory = ( 214 | self.memory.free_resources, 215 | self.memory.used_resources, 216 | {(i.begin, i.end): i.data for i in self.memory.used_pool}, 217 | ) 218 | if self.ignore_memory: 219 | return (processors,) 220 | else: 221 | return processors, memory 222 | 223 | def __bool__(self): 224 | return ( 225 | self.processors.free_resources != 0 226 | and self.memory.free_resources != 0 227 | ) 228 | 229 | def __repr__(self): 230 | return ( 231 | f'Cluster({self.processors}, {self.memory}, {self.ignore_memory})' 232 | ) 233 | 234 | def __str__(self): 235 | return ( 236 | f'Cluster({self.processors}, {self.memory}, {self.ignore_memory})' 237 | ) 238 | -------------------------------------------------------------------------------- /schedgym/envs/base.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import random 5 | from enum import IntEnum 6 | from typing import List, Dict 7 | from abc import ABC, abstractmethod 8 | 9 | import gym 10 | 11 | import numpy as np 12 | 13 | from .simulator import SimulationType, DeepRmSimulator 14 | from ..scheduler.null_scheduler import NullScheduler 15 | from .workload import build as build_workload 16 | 17 | BACKLOG_SIZE = 60 18 | MAXIMUM_NUMBER_OF_ACTIVE_JOBS = 40 # Number of colors in image 19 | MAX_TIME_TRACKING_SINCE_LAST_JOB = 10 20 | 21 | TIME_HORIZON = 20 22 | JOB_SLOTS = 5 23 | AMOUNT_OF_MEMORY = 10 24 | NUMBER_OF_PROCESSORS = 10 25 | MAXIMUM_JOB_LENGTH = 15 26 | MAXIMUM_JOB_SIZE = 10 27 | NEW_JOB_RATE = 0.7 28 | SMALL_JOB_CHANCE = 0.8 29 | DEFAULT_WORKLOAD = { 30 | 'type': 'deeprm', 31 | 'new_job_rate': NEW_JOB_RATE, 32 | 'max_job_size': MAXIMUM_JOB_SIZE, 33 | 'max_job_len': MAXIMUM_JOB_LENGTH, 34 | 'small_job_chance': SMALL_JOB_CHANCE, 35 | } 36 | 37 | 38 | class RewardJobs(IntEnum): 39 | ALL = (0,) 40 | JOB_SLOTS = (1,) 41 | WAITING = (2,) 42 | RUNNING_JOB_SLOTS = (3,) 43 | 44 | @staticmethod 45 | def from_str(reward_range: str): 46 | reward_range = reward_range.upper().replace('-', '_') 47 | if reward_range in RewardJobs.__members__: 48 | return RewardJobs[reward_range] 49 | else: 50 | raise ValueError( 51 | f'{reward_range} is not a valid RewardJobs range. ' 52 | f'Valid options are: {list(RewardJobs.__members__.keys())}.' 
53 | ) 54 | 55 | 56 | class BaseRmEnv(ABC, gym.Env): 57 | metadata = {'render.modes': ['human', 'rgb_array']} 58 | 59 | job_slots: int 60 | time_limit: int 61 | job_num_cap: int 62 | time_horizon: int 63 | ignore_memory: bool 64 | color_index: List[int] 65 | color_cache: Dict[int, int] 66 | simulator: DeepRmSimulator 67 | 68 | @abstractmethod 69 | def __init__(self, **kwargs): 70 | self.color_cache = {} 71 | self.renderer = kwargs.get('renderer', None) 72 | self.shuffle_colors = kwargs.get('shuffle_colors', False) 73 | self.job_num_cap = kwargs.get( 74 | 'job_num_cap', MAXIMUM_NUMBER_OF_ACTIVE_JOBS 75 | ) 76 | self.simulation_type = SimulationType.from_str( 77 | kwargs.get('simulation_type', 'time_based') 78 | ) 79 | 80 | self.reward_jobs = RewardJobs.from_str( 81 | kwargs.get('reward_jobs', 'all') 82 | ) 83 | 84 | self.smdp = self.simulation_type == SimulationType.EVENT_BASED 85 | self.gamma = kwargs.get('gamma', 1.0) 86 | 87 | self.time_horizon = kwargs.get( 88 | 'time_horizon', TIME_HORIZON 89 | ) # number of time steps in the graph 90 | 91 | time_limit = kwargs.get('time_limit', 200) 92 | if time_limit is None: 93 | self.time_limit = 1 94 | self.update_time_limit = True 95 | else: 96 | self.time_limit = time_limit 97 | self.update_time_limit = False 98 | 99 | step = 1.0 / self.job_num_cap 100 | # zero is already present and set to "no job there" 101 | self.colormap = np.arange(start=step, stop=1, step=step) 102 | if self.shuffle_colors: 103 | np.random.shuffle(self.colormap) 104 | self.color_index = list(range(len(self.colormap))) 105 | 106 | # Number of jobs to show 107 | self.job_slots = kwargs.get('job_slots', JOB_SLOTS) 108 | 109 | self.reward_mapper = { 110 | RewardJobs.ALL: lambda: self.scheduler.jobs_in_system, 111 | RewardJobs.WAITING: lambda: self.scheduler.queue_admission, 112 | RewardJobs.JOB_SLOTS: lambda: self.scheduler.queue_admission[ 113 | : self.job_slots 114 | ], 115 | RewardJobs.RUNNING_JOB_SLOTS: lambda: self.scheduler.queue_running 116 | + self.scheduler.queue_admission[: self.job_slots], 117 | } 118 | 119 | self.backlog_size = kwargs.get('backlog_size', BACKLOG_SIZE) 120 | self.memory = kwargs.get('memory', AMOUNT_OF_MEMORY) 121 | self.processors = kwargs.get('processors', NUMBER_OF_PROCESSORS) 122 | self.ignore_memory = kwargs.get('ignore_memory', False) 123 | 124 | self.workload_config = kwargs.get('workload', DEFAULT_WORKLOAD) 125 | wl = build_workload(self.workload_config) 126 | 127 | scheduler = NullScheduler( 128 | self.processors, self.memory, ignore_memory=self.ignore_memory 129 | ) 130 | self.simulator = DeepRmSimulator( 131 | wl, 132 | scheduler, 133 | simulation_type=self.simulation_type, 134 | job_slots=self.job_slots, 135 | ) 136 | 137 | def reset(self) -> np.ndarray: 138 | scheduler = NullScheduler( 139 | self.processors, self.memory, ignore_memory=self.ignore_memory 140 | ) 141 | wl = build_workload(self.workload_config) 142 | if self.update_time_limit and hasattr(wl, 'trace'): 143 | self.time_limit = ( 144 | wl.trace[-1].submission_time + # type: ignore 145 | wl.trace[-1].execution_time # type: ignore 146 | ) 147 | self.simulator.reset(wl, scheduler) 148 | return self.state 149 | 150 | def _render_state(self): 151 | state, jobs, backlog = self.scheduler.state( 152 | self.time_horizon, self.job_slots 153 | ) 154 | s = self._convert_state( 155 | state, 156 | jobs, 157 | backlog, 158 | ( 159 | (self.simulator.current_time - self.simulator.last_job_time) 160 | / MAX_TIME_TRACKING_SINCE_LAST_JOB 161 | ), 162 | ) 163 | return s 164 | 165 | def 
build_current_state(self, current): 166 | ret = [np.zeros((self.time_horizon, sum(e[0][:-1]))) for e in current] 167 | for i, _ in enumerate(current): 168 | for t in range(self.time_horizon): 169 | for k, v in current[i][t][-1].items(): 170 | ret[i][t][slice(*k)] = v 171 | return ret 172 | 173 | def build_job_slots(self, wait): 174 | memory = np.zeros( 175 | (self.job_slots, self.time_horizon, self.scheduler.total_memory) 176 | ) 177 | processors = np.zeros( 178 | ( 179 | self.job_slots, 180 | self.time_horizon, 181 | self.scheduler.number_of_processors, 182 | ) 183 | ) 184 | for i, j in enumerate(wait): 185 | if j.requested_processors == -1: 186 | break 187 | time_slice = slice( 188 | 0, 189 | self.time_horizon 190 | if j.requested_time > self.time_horizon 191 | else j.requested_time, 192 | ) 193 | processors[i, time_slice, : j.requested_processors] = 1.0 194 | if j.requested_memory != -1: 195 | memory[i, time_slice, : j.requested_memory] = 1.0 196 | return (processors,) if self.ignore_memory else (processors, memory) 197 | 198 | def _convert_state(self, current, wait, backlog, time): 199 | current = self.build_current_state(current) 200 | wait = self.build_job_slots(wait) 201 | backlog_width = self.backlog_size // self.time_horizon 202 | backlog = np.ones(self.time_horizon * backlog_width) * backlog 203 | unique = set(np.unique(current[0])) - {0.0} 204 | if len(unique) > self.job_num_cap: 205 | raise AssertionError('Number of jobs > number of colors') 206 | available_colors = list( 207 | set(self.color_index) 208 | - set( 209 | [self.color_cache[j] for j in unique if j in self.color_cache] 210 | ) 211 | ) 212 | need_color = unique - set(self.color_cache.keys()) 213 | for i, j in enumerate(need_color): 214 | self.color_cache[j] = available_colors[i] 215 | for j in unique: # noqa 216 | for resource in current: 217 | resource[resource == j] = self.colormap[self.color_cache[j]] 218 | 219 | return ( 220 | np.array(current), 221 | np.array(wait), 222 | backlog.reshape((self.time_horizon, -1)), 223 | np.ones((self.time_horizon, 1)) * min(1.0, time), 224 | ) 225 | 226 | def render(self, mode='human'): 227 | if self.renderer is None: 228 | from .render import DeepRmRenderer 229 | 230 | self.renderer = DeepRmRenderer(mode) 231 | rgb = self.renderer.render(self._render_state()) 232 | return rgb 233 | 234 | def seed(self, seed=None): 235 | if seed is None: 236 | seed = random.randint(0, 99999999) 237 | np.random.seed(seed) 238 | random.seed(seed) 239 | return [seed] 240 | 241 | def compute_reward(self, joblist): 242 | return -np.sum([1 / j.execution_time for j in joblist]) 243 | 244 | @property 245 | def reward(self): 246 | return self.compute_reward(self.reward_mapper[self.reward_jobs]()) 247 | 248 | @property 249 | def stats(self): 250 | return self.scheduler.stats 251 | 252 | @property 253 | @abstractmethod 254 | def state(self): 255 | raise NotImplementedError 256 | 257 | @property 258 | def scheduler(self) -> NullScheduler: 259 | return self.simulator.scheduler 260 | -------------------------------------------------------------------------------- /schedgym/envs/workload.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # flake8: noqa E501 4 | 5 | import math 6 | import random 7 | import warnings 8 | import itertools 9 | from math import log2 10 | from typing import Optional, List 11 | from collections import namedtuple 12 | from parallelworkloads.lublin99 import Lublin99 13 | from 
parallelworkloads.tsafrir05 import Tsafrir05 14 | 15 | from schedgym import workload as wl, job 16 | 17 | JobParameters = namedtuple('JobParameters', ['small', 'large']) 18 | 19 | 20 | class DeepRmWorkloadGenerator(wl.DistributionalWorkloadGenerator): 21 | def __init__(self, *args: wl.BinomialWorkloadGenerator): 22 | super().__init__(max([w.length for w in args])) 23 | 24 | self.generators = args 25 | self.counter = itertools.count(1) 26 | 27 | for generator in self.generators: 28 | generator.counter = self.counter 29 | 30 | def step(self, offset=1) -> List[Optional[job.Job]]: 31 | return self.generators[ 32 | random.randint(0, len(self.generators) - 1) 33 | ].step() 34 | 35 | def __len__(self): 36 | return self.generators[0].length 37 | 38 | def peek(self): 39 | return self.step() 40 | 41 | @staticmethod 42 | def build( 43 | new_job_rate, 44 | small_job_chance, 45 | max_job_len, 46 | max_job_size, 47 | ignore_memory=False, 48 | min_large_job_len=None, 49 | max_small_job_len=None, 50 | min_small_job_len=None, 51 | min_dominant_job_size=None, 52 | min_other_job_size=None, 53 | max_other_job_size=None, 54 | runtime_estimates=None, 55 | estimate_parameters=None, 56 | ) -> 'DeepRmWorkloadGenerator': 57 | # Time-related job parameters {{{ 58 | small_job_time_lower = ( 59 | 1 if min_small_job_len is None else min_small_job_len 60 | ) 61 | small_job_time_upper = ( 62 | max(max_job_len // 5, 1) 63 | if max_small_job_len is None 64 | else max_small_job_len 65 | ) 66 | large_job_time_lower = ( 67 | int(max_job_len * (2 / 3)) 68 | if min_large_job_len is None 69 | else min_large_job_len 70 | ) 71 | large_job_time_upper = max_job_len 72 | # }}} 73 | 74 | # Resource-related job parameters {{{ 75 | dominant_resource_lower = ( 76 | max_job_size // 2 77 | if min_dominant_job_size is None 78 | else min_dominant_job_size 79 | ) 80 | dominant_resource_upper = max_job_size 81 | other_resource_lower = ( 82 | 1 if min_other_job_size is None else min_other_job_size 83 | ) 84 | other_resource_upper = ( 85 | max_job_size // 5 86 | if max_other_job_size is None 87 | else max_other_job_size 88 | ) 89 | # }}} 90 | 91 | cpu_dominant_parameters = JobParameters( # {{{ 92 | job.JobParameters( 93 | small_job_time_lower, 94 | small_job_time_upper, 95 | dominant_resource_lower, 96 | dominant_resource_upper, 97 | other_resource_lower, 98 | other_resource_upper, 99 | ), 100 | job.JobParameters( 101 | large_job_time_lower, 102 | large_job_time_upper, 103 | dominant_resource_lower, 104 | dominant_resource_upper, 105 | other_resource_lower, 106 | other_resource_upper, 107 | ), 108 | ) # }}} 109 | 110 | mem_dominant_parameters = JobParameters( # {{{ 111 | job.JobParameters( 112 | small_job_time_lower, 113 | small_job_time_upper, 114 | other_resource_lower, 115 | other_resource_upper, 116 | dominant_resource_lower, 117 | dominant_resource_upper, 118 | ), 119 | job.JobParameters( 120 | large_job_time_lower, 121 | large_job_time_upper, 122 | other_resource_lower, 123 | other_resource_upper, 124 | dominant_resource_lower, 125 | dominant_resource_upper, 126 | ), 127 | ) # }}} 128 | 129 | generators = ( 130 | wl.BinomialWorkloadGenerator( 131 | new_job_rate, 132 | small_job_chance, 133 | cpu_dominant_parameters.small, 134 | cpu_dominant_parameters.large, 135 | runtime_estimates=runtime_estimates, 136 | estimate_parameters=estimate_parameters, 137 | ), 138 | wl.BinomialWorkloadGenerator( 139 | new_job_rate, 140 | small_job_chance, 141 | mem_dominant_parameters.small, 142 | mem_dominant_parameters.large, 143 | 
runtime_estimates=runtime_estimates, 144 | estimate_parameters=estimate_parameters, 145 | ), 146 | ) 147 | 148 | return DeepRmWorkloadGenerator( 149 | *generators[: (1 if ignore_memory else None)] 150 | ) 151 | 152 | 153 | class SyntheticWorkloadGenerator(wl.TraceGenerator): 154 | """A synthetic workload generator based on realistic models.""" 155 | 156 | def __init__( 157 | self, 158 | length, 159 | nodes, 160 | start_time=8, 161 | random_seed=0, 162 | restart=False, 163 | uniform_proportion=0.95, 164 | cdf_break=0.5, 165 | runtime_estimates=None, 166 | estimate_parameters=None, 167 | ): 168 | """Synthetic workload generator based on Lublin's work. 169 | 170 | Parameters 171 | ---------- 172 | length : int 173 | number of jobs to generate 174 | nodes : int 175 | number of compute nodes in the system 176 | start_time : int 177 | hour of day in which to start simulation 178 | random_seed : int 179 | random seed to use to generate jobs 180 | restart : bool 181 | whether to restart after a sample finishes 182 | uniform_proportion : float 183 | tunes the proportion between the first and second uniform 184 | distributions in the two-stage uniform process 185 | cdf_break : float 186 | where to place the break between the lower and upper limits of 187 | the CDF. A value closer to 0 will (tend to) produce bigger jobs, 188 | while a value closer to 1 will (tend to) produce smaller jobs 189 | runtime_estimates : {'gaussian', 'tsafrir', None} 190 | whether to include runtime estimates and, if so, the method 191 | used to compute them: 192 | * None generates perfect estimates (estimates equal run time) 193 | * 'gaussian' generates estimates with zero-mean Gaussian noise 194 | added to them 195 | * 'tsafrir' uses Dan Tsafrir's model of user runtime estimates 196 | to generate estimates 197 | estimate_parameters : Union[float, List[Tuple[float, float]]] 198 | the parameters used for generating user estimates. 199 | Depends on ``runtime_estimates``. 200 | When `runtime_estimates` is 'gaussian', this is a single 201 | floating-point number that sets the standard deviation of the 202 | noise. 203 | When `runtime_estimates` is 'tsafrir', this is a list of 204 | floating-point pairs specifying a histogram (time, number of 205 | jobs) of job runtime popularity.
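A minimal construction, for illustration only (argument values here
are arbitrary):

>>> generator = SyntheticWorkloadGenerator(length=100, nodes=64)
>>> len(generator.trace)  # doctest: +SKIP
100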
206 | """ 207 | random.seed(random_seed) 208 | 209 | self.lublin = Lublin99(False, random_seed, length) 210 | self.lublin.start = start_time 211 | self.random_seed = random_seed 212 | self.nodes = nodes 213 | 214 | uniform_low_prob = 0.8 215 | log2_size = log2(nodes) 216 | min_umed = log2_size - 3.5 217 | max_umed = log2_size - 1.5 218 | breaking_point = cdf_break * min_umed + (1 - cdf_break) * max_umed 219 | 220 | self.lublin.setParallelJobProbabilities( 221 | False, 222 | uniform_low_prob, 223 | breaking_point, 224 | log2_size, 225 | uniform_proportion, 226 | ) 227 | 228 | self.runtime_estimates = runtime_estimates 229 | self.estimate_parameters = estimate_parameters 230 | 231 | trace = self.refresh_jobs() 232 | super().__init__(restart, trace) 233 | 234 | def refresh_jobs(self): 235 | """Refreshes the underlying job list.""" 236 | jobs = self.lublin.generate() 237 | if self.runtime_estimates: 238 | if self.runtime_estimates == 'tsafrir': 239 | if self.estimate_parameters is not None: 240 | warnings.warn( 241 | 'Setting tsafrir parameters is currently unsupported' 242 | ) 243 | tsafrir = Tsafrir05(jobs) 244 | jobs = tsafrir.generate(jobs) 245 | elif self.runtime_estimates == 'gaussian': 246 | for j in jobs: 247 | j.reqTime = math.ceil( 248 | random.gauss( 249 | j.runTime, self.estimate_parameters * j.runTime 250 | ) 251 | ) 252 | if j.reqTime < 1: 253 | j.reqTime = 1 254 | else: 255 | raise ValueError( 256 | f'Unsupported estimate type {self.runtime_estimates}' 257 | ) 258 | 259 | self.trace = [job.Job.from_swf_job(j) for j in jobs] 260 | return self.trace 261 | 262 | 263 | def build(workload_config: dict): 264 | type = workload_config['type'] 265 | kwargs = {k: v for k, v in workload_config.items() if k != 'type'} 266 | if type == 'deeprm': 267 | return DeepRmWorkloadGenerator.build(**kwargs) 268 | elif type == 'lublin': 269 | return SyntheticWorkloadGenerator(**kwargs) 270 | else: 271 | raise RuntimeError(f'Unsupported workload model type {type} requested') 272 | -------------------------------------------------------------------------------- /schedgym/job.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """job - Classes for jobs in the simulator. 5 | """ 6 | 7 | import enum 8 | 9 | import random 10 | import warnings 11 | 12 | from collections import namedtuple 13 | 14 | from .resource import Resource, PrimaryResource 15 | 16 | JobState = namedtuple( 17 | 'JobState', 18 | [ 19 | 'submission_time', 20 | 'requested_time', 21 | 'requested_memory', 22 | 'requested_processors', 23 | 'queue_size', 24 | 'queued_work', 25 | 'free_processors', 26 | ], 27 | ) 28 | 29 | 30 | class JobStatus(enum.IntEnum): 31 | """An enumeration for different states of a job within our simulator.""" 32 | 33 | SUBMITTED = 0 34 | RUNNING = 1 35 | WAITING = 2 36 | COMPLETED = 3 37 | SCHEDULED = 4 38 | 39 | 40 | class SwfJobStatus(enum.IntEnum): 41 | """An enumeration for different states of a job in the SWF_. 42 | 43 | .. _SWF: https://www.cs.huji.ac.il/labs/parallel/workload/swf.html 44 | """ 45 | 46 | FAILED = 0 47 | COMPLETED = 1 48 | PARTIAL_TO_BE_CONTINUED = 2 49 | PARTIAL_LAST_COMPLETED = 3 50 | PARTIAL_LAST_FAILED = 4 51 | CANCELLED = 5 52 | MEANINGLESS = -1 53 | 54 | 55 | class Job: 56 | """A job in the system. 57 | 58 | This follows the fields of the `Standard Workload Format 59 | `_ with a couple 60 | of helper methods to compute slowdown and bounded slowdown of a job. 
The 61 | initializer arguments follow the same ordering and have the same meaning 62 | as those in the SWF description. 63 | 64 | This makes use of the :class:`schedgym.resource.Resource` class to keep 65 | track of the resources assigned to the job. Resource assignment itself is 66 | performed by 67 | :func:`schedgym.scheduler.scheduler.Scheduler.assign_schedule`. 68 | 69 | The figure below shows the relationship between jobs, resources, and the 70 | basic data structure for resource management (`IntervalTree`). 71 | 72 | .. image:: /img/job-resource.svg 73 | """ 74 | 75 | resources: Resource 76 | 77 | SWF_JOB_MAP = { 78 | 'jobId': 'id', 79 | 'submissionTime': 'submission_time', 80 | 'waitTime': 'wait_time', 81 | 'runTime': 'execution_time', 82 | 'allocProcs': 'processors_allocated', 83 | 'avgCpuUsage': 'average_cpu_use', 84 | 'usedMem': 'memory_use', 85 | 'reqProcs': 'requested_processors', 86 | 'reqTime': 'requested_time', 87 | 'reqMem': 'requested_memory', 88 | 'status': 'status', 89 | 'userId': 'user_id', 90 | 'groupId': 'group_id', 91 | 'executable': 'executable', 92 | 'queueNum': 'queue_number', 93 | 'partNum': 'partition_number', 94 | 'precedingJob': 'preceding_job_id', 95 | 'thinkTime': 'think_time', 96 | } 97 | 98 | def __init__( 99 | self, 100 | job_id=-1, 101 | submission_time=-1, 102 | execution_time=-1, 103 | processors_allocated=-1, 104 | average_cpu_use=-1, 105 | memory_use=-1, 106 | requested_processors=-1, 107 | requested_time=-1, 108 | requested_memory=-1, 109 | status=-1, 110 | user_id=-1, 111 | group_id=-1, 112 | executable=-1, 113 | queue_number=-1, 114 | partition_number=-1, 115 | preceding_job_id=-1, 116 | think_time=-1, 117 | wait_time=-1, 118 | ignore_memory=True, 119 | ): 120 | self.id: int = job_id 121 | self.submission_time: int = submission_time 122 | self.execution_time: int = execution_time 123 | self.requested_time: int = requested_time 124 | self.requested_processors: int = requested_processors 125 | self.processors_allocated: int = processors_allocated 126 | self.average_cpu_use: int = average_cpu_use 127 | self.memory_use: int = memory_use 128 | self.requested_memory: int = requested_memory 129 | self.status: JobStatus = status 130 | self.user_id: int = user_id 131 | self.group_id: int = group_id 132 | self.executable: int = executable 133 | self.queue_number: int = queue_number 134 | self.partition_number: int = partition_number 135 | self.preceding_job_id: int = preceding_job_id 136 | self.think_time = think_time 137 | self.wait_time = wait_time 138 | 139 | self.resources = Resource() 140 | self.first_scheduling_promise: int = -1 141 | self.start_time: int = -1 142 | self.finish_time: int = -1 143 | self.ignore_memory = ignore_memory 144 | self.slot_position: int = -1 145 | self.free_processors = -1 146 | self.queued_work = -1 147 | self.queue_size = -1 148 | 149 | def __str__(self): 150 | return ( 151 | f'Job<{self.id}, {self.status.name}, start={self.start_time}, ' 152 | f'processors={self.requested_processors}, ' 153 | f'memory={self.requested_memory}, ' 154 | f'duration={self.execution_time}>' 155 | ) 156 | 157 | __repr__ = __str__ 158 | 159 | @property 160 | def proper(self): 161 | """Checks whether this job is a proper job with assigned resources. 162 | 163 | Returns: 164 | bool: True if the job is proper, and False otherwise.
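For example (illustrative values): a job with
``requested_processors == 4`` becomes proper only once intervals
covering exactly four processors have been assigned to its
``resources``.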
165 | """ 166 | processors, memory = self.resources.measure() 167 | return processors == self.requested_processors and ( 168 | self.ignore_memory or memory == self.requested_memory 169 | ) 170 | 171 | @property 172 | def slowdown(self): 173 | """Computes the slowdown of the current job.""" 174 | if self.finish_time < 0: 175 | warnings.warn( 176 | f'Failed to obtain slowdown for job {self}. ' 177 | 'It may not have finished yet.' 178 | ) 179 | return -1 180 | return ( 181 | self.finish_time - self.submission_time 182 | ) / self.execution_time 183 | 184 | @property 185 | def bounded_slowdown(self): 186 | """Gives the bounded slowdown of a job""" 187 | if self.finish_time < 0: 188 | warnings.warn( 189 | f'Failed to obtain avg bounded slowdown for job {self}.' 190 | 'It may not have finished yet.' 191 | ) 192 | return -1 193 | return max( 194 | 1, 195 | (self.finish_time - self.submission_time) 196 | / max(10, self.execution_time), 197 | ) 198 | 199 | @property 200 | def swf(self): 201 | """Returns an SWF representation of this job""" 202 | return ( 203 | f'{self.id} {self.submission_time} {self.wait_time} ' 204 | f'{self.execution_time} {self.processors_allocated} ' 205 | f'{self.average_cpu_use} ' 206 | f'{self.memory_use} {self.requested_processors} ' 207 | f'{self.requested_time} {self.requested_memory} ' 208 | f'{self.swfstatus} {self.user_id} {self.group_id} ' 209 | f'{self.executable} {self.queue_number} ' 210 | f'{self.partition_number} {self.preceding_job_id} ' 211 | f'{self.think_time}' 212 | ) 213 | 214 | @property 215 | def swfstatus(self): 216 | """Returns the job status in the format expected by the SWF.""" 217 | if self.status == JobStatus.COMPLETED: 218 | return SwfJobStatus.COMPLETED 219 | return SwfJobStatus.MEANINGLESS 220 | 221 | @staticmethod 222 | def from_swf_job(swf_job): 223 | """Converts an SWF job to our internal job format.""" 224 | new_job = Job() 225 | for key, value in Job.SWF_JOB_MAP.items(): 226 | tmp = getattr(swf_job, key) 227 | setattr(new_job, value, int(tmp) if 'time' in value else tmp) 228 | 229 | new_job.status = JobStatus.SUBMITTED 230 | new_job.requested_processors = new_job.processors_allocated 231 | if new_job.requested_time == -1: 232 | new_job.requested_time = new_job.execution_time 233 | 234 | return new_job 235 | 236 | @property 237 | def state(self): 238 | return JobState( 239 | self.submission_time, 240 | self.requested_time, 241 | self.requested_memory, 242 | self.requested_processors, 243 | self.queue_size, 244 | self.queued_work, 245 | self.free_processors, 246 | ) 247 | 248 | 249 | class JobParameters: 250 | """Class for using with generative models for job creation. 251 | 252 | Assumes two types of jobs: 253 | 1. "Small" jobs and 254 | 2. "Large" jobs 255 | 256 | A job has probability s of being small and (1-s) of being large. 257 | 258 | Moreover, jobs have a dominant resource to distinguish between CPU-bound 259 | and I/O bound jobs, with probability of being either CPU-bound and I/O 260 | bound 261 | 0.5. 262 | 263 | A user of this class must specify all bounds. 
264 | 265 | Parameters 266 | ---------- 267 | lower_time_bound : int 268 | The minimum time a job will run for 269 | upper_time_bound : int 270 | The maximum time a job will run for 271 | lower_cpu_bound : int 272 | The minimum number of processors a job will consume 273 | upper_cpu_bound : int 274 | The maximum number of processors a job will consume 275 | lower_mem_bound : int 276 | The minimum amount of memory a job will consume 277 | upper_mem_bound : int 278 | The maximum amount of memory a job will consume 279 | 280 | Used by :class:`schedgym.workload.distribution.BinomialWorkloadGenerator`. 281 | """ 282 | 283 | lower_time_bound: int 284 | upper_time_bound: int 285 | lower_cpu_bound: int 286 | upper_cpu_bound: int 287 | 288 | @staticmethod 289 | def _validate_parameters(*args): 290 | for param in args: 291 | if param <= 0: 292 | raise AssertionError( 293 | 'Unable to work with non-positive bounds.' 294 | ) 295 | 296 | def __init__( 297 | self, 298 | lower_time_bound: int, 299 | upper_time_bound: int, 300 | lower_cpu_bound: int, 301 | upper_cpu_bound: int, 302 | lower_mem_bound: int, 303 | upper_mem_bound: int, 304 | ): 305 | self._validate_parameters( 306 | lower_time_bound, 307 | upper_time_bound, 308 | lower_cpu_bound, 309 | upper_cpu_bound, 310 | lower_mem_bound, 311 | upper_mem_bound, 312 | ) 313 | 314 | self.lower_time_bound = lower_time_bound 315 | self.upper_time_bound = upper_time_bound 316 | self.lower_cpu_bound = lower_cpu_bound 317 | self.upper_cpu_bound = upper_cpu_bound 318 | self.lower_mem_bound = lower_mem_bound 319 | self.upper_mem_bound = upper_mem_bound 320 | 321 | self.resource_samplers = { 322 | PrimaryResource.CPU: lambda: random.randint( 323 | self.lower_cpu_bound, self.upper_cpu_bound 324 | ), 325 | PrimaryResource.MEMORY: lambda: random.randint( 326 | self.lower_mem_bound, self.upper_mem_bound 327 | ), 328 | } 329 | 330 | self.job_id = 1 331 | self.time_step = 0 332 | 333 | def add_time(self, steps: int = 1) -> None: 334 | """Increments time in the internal counter.""" 335 | if steps < 0: 336 | raise AssertionError("Time can't be negative.") 337 | self.time_step += steps 338 | 339 | def sample(self, submission_time: int = 0) -> Job: 340 | """Samples a new job. 341 | 342 | Parameters 343 | ---------- 344 | submission_time : int 345 | The time at which the new sampled job would have been 346 | submitted. If omitted, the current time step is used.
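A minimal usage sketch (the bounds below are arbitrary, chosen only
for illustration):

>>> params = JobParameters(1, 10, 1, 4, 1, 4)
>>> j = params.sample(submission_time=5)  # doctest: +SKIP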
347 | """ 348 | time_duration = random.randint( 349 | self.lower_time_bound, self.upper_time_bound 350 | ) 351 | 352 | cpu = self.resource_samplers[PrimaryResource.CPU]() 353 | mem = self.resource_samplers[PrimaryResource.MEMORY]() 354 | 355 | job = Job( 356 | self.job_id, 357 | submission_time if submission_time else self.time_step, 358 | time_duration, 359 | cpu, 360 | 0, 361 | mem, 362 | cpu, 363 | time_duration, 364 | mem, 365 | JobStatus.WAITING, 366 | 1, 367 | 1, 368 | 1, 369 | 1, 370 | 1, 371 | -1, 372 | -1, 373 | -1, 374 | ) 375 | self.job_id += 1 376 | 377 | return job 378 | -------------------------------------------------------------------------------- /deeprm-agent.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from collections import namedtuple, defaultdict 5 | 6 | import argparse 7 | 8 | import os 9 | import gym 10 | import json 11 | import pickle 12 | import numpy as np 13 | from typing import List 14 | from pathlib import Path 15 | from collections import OrderedDict 16 | 17 | import schedgym.envs as deeprm 18 | 19 | from numpy.lib.stride_tricks import as_strided 20 | 21 | import torch 22 | import torch.nn as nn 23 | import torch.optim as optim 24 | import torch.nn.functional as F 25 | import torch.utils.data as data 26 | import torch.multiprocessing as mp 27 | from torch.distributions import Categorical 28 | 29 | from torch.utils.tensorboard.writer import SummaryWriter 30 | 31 | SLOTS: int = 10 32 | BACKLOG: int = 60 33 | TIME_LIMIT: int = 50 34 | TIME_HORIZON: int = 20 35 | PARALLEL_WORKERS: int = 20 36 | TRAINING_ITERATIONS: int = 6 37 | OPTIMIZERS = { 38 | 'adam': lambda model, args: optim.Adam(model.parameters(), lr=args.lr), 39 | 'rmsprop': lambda model, args: optim.RMSprop(model.parameters(), lr=args.lr, momentum=args.momentum), 40 | } 41 | 42 | TMPDIR = Path(f'/run/user/{os.getuid()}') 43 | Experience = namedtuple( 44 | 'Experience', 45 | field_names='state action reward'.split() 46 | ) 47 | 48 | 49 | class PGNet(nn.Module): 50 | def __init__(self, env): 51 | super().__init__() 52 | 53 | self.input_height = env.observation_space.shape[0] 54 | self.input_width = env.observation_space.shape[1] 55 | self.output_size = env.action_space.n 56 | 57 | self.nn = nn.Sequential(OrderedDict([ 58 | ('fc1', nn.Linear(self.input_height * self.input_width, 512)), 59 | ('relu1', nn.ReLU()), 60 | ('fc2', nn.Linear(512, 256)), 61 | ('relu2', nn.ReLU()), 62 | ])) 63 | self.out = nn.Linear(256, self.output_size) 64 | 65 | def forward(self, x): 66 | x = x.view(-1, self.input_height * self.input_width) 67 | x = self.nn(x) 68 | scores = self.out(x) 69 | return F.softmax(scores, dim=1) 70 | 71 | def select_action(self, state, device='cpu'): 72 | state = torch.from_numpy(state).float().unsqueeze(0).to(device) 73 | probs = self(state) 74 | mass = Categorical(probs) 75 | action = mass.sample() 76 | return action.item() 77 | 78 | def log_prob(self, state, action, device='cpu'): 79 | state = state.float() 80 | action = action.float() 81 | probs = self(state).view((action.shape[0], action.shape[1], -1)) 82 | mass = Categorical(probs) 83 | return mass.log_prob(action), mass.entropy() 84 | 85 | class Callback(object): 86 | def __call__(self, score) -> None: 87 | raise NotImplementedError 88 | 89 | 90 | class ReduceLROnPlateau(Callback): 91 | def __init__(self, patience, rate, args, minimum=None, negate_score=True): 92 | self.patience = patience 93 | self.args = args 94 | self.rate = rate 95 | 
self.counter = 0 96 | self.best_score = None 97 | self.minimum = minimum 98 | self.negate_score = negate_score 99 | 100 | def __call__(self, score): 101 | if self.negate_score: 102 | score = -score 103 | if self.best_score is None: 104 | self.best_score = score 105 | elif score <= self.best_score: 106 | self.counter += 1 107 | if self.counter >= self.patience: 108 | self.counter = 0 109 | print( 110 | f'Reducing learning rate from {self.args.lr} ' 111 | f'to {self.args.lr * self.rate} ' 112 | f'(best score was {self.best_score})' 113 | ) 114 | tmp = self.args.lr * self.rate 115 | if self.minimum and tmp < self.minimum: 116 | tmp = self.minimum 117 | self.args.lr = tmp 118 | else: 119 | self.best_score = score 120 | self.counter = 0 121 | 122 | 123 | def make_discount_array(gamma, timesteps): 124 | vals = np.zeros(2 * timesteps - 1) 125 | vals[timesteps - 1:] = gamma ** np.arange(timesteps) 126 | return as_strided( 127 | vals[timesteps - 1:], 128 | shape=(timesteps, timesteps), 129 | strides=(-vals.strides[0], vals.strides[0]), 130 | writeable=False 131 | ) 132 | 133 | 134 | def setup_environment(envname, wlkwargs) -> deeprm.DeepRmEnv: 135 | env: deeprm.DeepRmEnv = gym.make(envname, **wlkwargs) 136 | env.reset() 137 | 138 | return env 139 | 140 | 141 | def run_episode(env, model, max_episode_length, device='cpu'): 142 | trajectory = [] 143 | total_reward = 0 144 | state = env.reset() 145 | for _ in range(max_episode_length): 146 | action = model.select_action(state, device) 147 | next_state, reward, done, _ = env.step(action) 148 | exp = Experience(state, action, reward) 149 | trajectory.append(exp) 150 | total_reward += reward 151 | if done: 152 | break 153 | state = next_state 154 | return trajectory 155 | 156 | 157 | def compute_baselines(trajectories): 158 | returns = np.zeros((len(trajectories), max((len(traj) for traj in trajectories)))) 159 | for i in range(len(trajectories)): 160 | tmp = np.array([e.reward for e in trajectories[i]]) 161 | returns[i, :len(tmp)] = tmp 162 | return returns, returns.mean(axis=0) 163 | 164 | 165 | def run_episodes(rank, args, model, device, wlkwargs) -> List[List[Experience]]: 166 | np.random.seed(args.seed + rank) 167 | torch.manual_seed(args.seed + rank) 168 | env = setup_environment(args.envname, wlkwargs) 169 | 170 | return [run_episode(env, model, args.max_episode_length, device) 171 | for _ in range(args.trajectories_per_batch)] 172 | 173 | 174 | def run_episodes_pickle(rank, args, model, device, wlkwargs): 175 | trajectories = run_episodes(rank, args, model, device, wlkwargs) 176 | with open(TMPDIR / f'{rank}.pkl', 'wb') as fp: 177 | pickle.dump(trajectories, fp, pickle.HIGHEST_PROTOCOL) 178 | 179 | 180 | def train_one_epoch(rank, args, model, device, loss_queue, wlkwargs) -> None: 181 | """Trains the model for one epoch. 182 | 183 | This uses baselining in the REINFORCE algorithm. There are many ways to 184 | compute baselines. Examples: 185 | 186 | 1. The approach taken by DeepRM in the original paper, in which each 187 | timestep has its own baseline, which is computed as the average 188 | return for a trajectory. 189 | 2. Computing a global baseline for each trajectory in which it is the 190 | average return in that trajectory. 191 | 3. A global baseline computed as the average return over all 192 | trajectories. 193 | 194 | In this function, we follow 1., but nothing prevents us from using 2 or 3. 
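Concretely, for approach 1: in the code below, ``compute_baselines``
stacks the per-trajectory rewards into a matrix ``R`` (zero-padded
past each trajectory's end) and uses the column mean as the
per-timestep baseline, so the advantage of trajectory ``i`` at
timestep ``t`` is the discounted return ``G[i, t]`` minus the
(undiscounted) column mean ``mean_i R[i, t]``, with the baseline
masked to zero wherever ``R`` is zero.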
195 | """ 196 | # You might need to divide the learning rate by the number of workers 197 | 198 | optimizer = OPTIMIZERS[args.optimizer.lower()](model, args) 199 | 200 | optimizer.zero_grad() 201 | trajectories = run_episodes(rank, args, model, device, wlkwargs) 202 | 203 | rewards, baselines = compute_baselines(trajectories) 204 | baselines_mat = np.array([baselines 205 | for _ in range(args.trajectories_per_batch)]) 206 | baselines_mat = baselines_mat * (rewards != 0) 207 | discounts = make_discount_array(args.gamma, rewards.shape[1]) 208 | discounted_returns = (discounts @ rewards.T).T 209 | advantages = discounted_returns - baselines_mat 210 | 211 | policy_loss, entropy = [], [] 212 | for i, t in enumerate(trajectories): 213 | for j, e in enumerate(t): 214 | policy_loss.append(e.log_prob * advantages[i, j]) 215 | entropy.append(e.entropy()) 216 | 217 | policy_loss = torch.cat(policy_loss).sum() + torch.cat(entropy).sum() * args.entropy 218 | (-policy_loss).backward() 219 | optimizer.step() 220 | 221 | lengths = [len(t) for t in trajectories] 222 | loss_queue.put(( 223 | rank, policy_loss.clone().cpu().data.numpy(), 224 | advantages.mean(), advantages.std(), 225 | rewards.mean(), rewards.std(), 226 | discounted_returns.mean(), discounted_returns.std(), 227 | np.mean(lengths), np.std(lengths) 228 | )) 229 | 230 | 231 | def build_argument_parser(): 232 | parser = argparse.ArgumentParser(description='DeepRM training') 233 | parser.add_argument('--epochs', type=int, default=TRAINING_ITERATIONS, 234 | metavar='N', help='number of epochs to train') 235 | parser.add_argument('--workers', type=int, default=PARALLEL_WORKERS, 236 | metavar='N', help='number of workers to train') 237 | parser.add_argument('--seed', type=int, default=42, 238 | metavar='S', help='random seed to use') 239 | parser.add_argument('--lr', type=float, default=1e-2, metavar='LR', 240 | help='Learning rate for gradient ascent') 241 | parser.add_argument('--momentum', type=float, default=0.99, metavar='LR', 242 | help='momentum for gradient ascent') 243 | parser.add_argument('--cuda', action='store_true', default=False, 244 | help='enables training with CUDA') 245 | parser.add_argument('--envname', type=str, default='DeepRM-v0', 246 | help='OpenAI Gym environment to use') 247 | parser.add_argument('--max-episode-length', type=int, default=200, 248 | metavar='N', help='Maximum number of timesteps in episode') 249 | parser.add_argument('--trajectories-per-batch', type=int, default=200, 250 | metavar='N', help='Number of trajectories in a batch') 251 | parser.add_argument('--gamma', type=float, default=0.99, metavar='γ', 252 | help='Discount factor') 253 | parser.add_argument('--debug', action='store_true', default=False) 254 | parser.add_argument('--load', type=str, default=None, metavar='PATH', 255 | help='Loads a previously-trained model') 256 | parser.add_argument('--optimizer', type=str, default='adam', 257 | help='optimizer to use') 258 | parser.add_argument('--workload', type=str, default=None, 259 | help='Path to a workload configuration file') 260 | parser.add_argument('--entropy', type=float, default=0., 261 | help='entropy regularization factor') 262 | return parser 263 | 264 | 265 | def main(): 266 | args = build_argument_parser().parse_args() 267 | 268 | use_cuda = args.cuda and torch.cuda.is_available() 269 | device = torch.device('cuda' if use_cuda else 'cpu') 270 | 271 | torch.manual_seed(args.seed) 272 | mp.set_start_method('spawn') 273 | 274 | if args.workload is None: 275 | wlkwargs = {} 276 | else: 277 | 
with open(args.workload) as fp: 278 | wlkwargs = json.load(fp) 279 | 280 | model = PGNet(setup_environment(args.envname, wlkwargs)).to(device) 281 | if args.load is not None: 282 | model.load_state_dict(torch.load(args.load)) 283 | model.share_memory() 284 | 285 | writer = SummaryWriter() 286 | loss_queue = mp.Queue() 287 | 288 | callbacks = [ReduceLROnPlateau(500, .5, args, 1e-5, negate_score=True)] 289 | train_synchronous_parallel(args, callbacks, device, loss_queue, model, writer, wlkwargs) 290 | 291 | writer.close() 292 | torch.save(model.state_dict(), 'policy.pth') 293 | 294 | 295 | def train_synchronous_parallel(args, callbacks, device, loss_queue, model, writer, wlkwargs): 296 | for epoch in range(args.epochs): 297 | print(f'Current epoch: {epoch}') 298 | losses = [] 299 | if args.debug: 300 | train_one_epoch(0, args, model, device, loss_queue, wlkwargs) 301 | else: 302 | with mp.Pool(processes=args.workers) as pool: 303 | pool.starmap_async( 304 | run_episodes_pickle, 305 | [(i, args, model, device, wlkwargs) for i in range(args.workers)], 306 | 1 307 | ).get() 308 | 309 | fps = [open(TMPDIR / f'{i}.pkl', 'rb') for i in range(args.workers)] 310 | ret = [pickle.load(fp) for fp in fps] 311 | for fp in fps: fp.close() 312 | 313 | optimizer = OPTIMIZERS[args.optimizer.lower()](model, args) 314 | optimizer.zero_grad() 315 | 316 | trajectories = [e for l in ret for e in l] 317 | rewards, baselines = compute_baselines(trajectories) 318 | baselines_mat = np.array([baselines 319 | for _ in range(len(trajectories))]) 320 | baselines_mat = baselines_mat * (rewards != 0) 321 | discounts = make_discount_array(args.gamma, rewards.shape[1]) 322 | discounted_returns = (discounts @ rewards.T).T 323 | advantages = discounted_returns - baselines_mat 324 | 325 | states = [[e.state for e in t] for t in trajectories] 326 | actions = [[e.action for e in t] for t in trajectories] 327 | maxlen = max((len(s) for s in states)) 328 | for s, a in zip(states, actions): 329 | s += [np.zeros_like(s[0])] * (maxlen - len(s)) 330 | a += [np.zeros_like(a[0])] * (maxlen - len(a)) 331 | 332 | def compute_loss(model, states, actions, advantages, device): 333 | states, actions, advantages = [torch.from_numpy(t) for t in (states, actions, advantages)] 334 | dataset = data.TensorDataset( 335 | states, actions, advantages 336 | ) 337 | loader = data.DataLoader( 338 | dataset, batch_size=64, shuffle=False 339 | ) 340 | loss = 0 341 | for state, action, advantage in loader: 342 | l, e = model.log_prob( 343 | state.to(device), action.to(device), device 344 | ) 345 | loss += (l * advantage.to(device)).sum() 346 | loss += (e * args.entropy).sum() 347 | return loss 348 | 349 | policy_loss = compute_loss( 350 | model, 351 | np.array(states), 352 | np.array(actions), 353 | np.array(advantages), 354 | device 355 | ) 356 | (-policy_loss).backward() 357 | optimizer.step() 358 | 359 | lengths = [len(t) for t in trajectories] 360 | loss_queue.put(( 361 | 0, policy_loss.clone().cpu().data.numpy(), 362 | advantages.mean(), advantages.std(), 363 | rewards.mean(), rewards.std(), 364 | discounted_returns.mean(), discounted_returns.std(), 365 | np.mean(lengths), np.std(lengths) 366 | )) 367 | 368 | for name, param in model.named_parameters(): 369 | writer.add_histogram(name, param.clone().cpu().data.numpy(), epoch) 370 | 371 | losses, extras = [], defaultdict(list) 372 | features = 'ardl' 373 | while not loss_queue.empty(): 374 | rank, loss, *extra = loss_queue.get() 375 | print( 376 | f'Loss for worker {rank} on epoch {epoch}: {loss}' 377 | ) 378 |
losses.append(loss) 379 | for i, feature in enumerate(features): 380 | extras[f'{feature}μ'].append(extra[i * 2]) 381 | extras[f'{feature}σ'].append(extra[i * 2 + 1]) 382 | writer.add_scalar(f'{feature}μ/{rank}', extra[i * 2], epoch) 383 | writer.add_scalar(f'{feature}σ/{rank}', extra[i * 2 + 1], epoch) 384 | print( 385 | 'Loss for epoch {}: {}±{}'.format(epoch, np.mean(losses), np.std(losses)) 386 | ) 387 | writer.add_scalar('loss', np.mean(losses), epoch) 388 | for i, feature in enumerate(features): 389 | writer.add_scalar(f'{feature}μ', np.mean(extras[f'{feature}μ']), epoch) 390 | writer.add_scalar(f'{feature}σ', np.mean(extras[f'{feature}σ']), epoch) 391 | writer.add_scalar('α', args.lr, epoch) 392 | for callback in callbacks: 393 | callback(np.mean(losses)) 394 | 395 | writer.flush() 396 | torch.save(model.state_dict(), f'checkpoint/policy-{epoch}.pth') 397 | 398 | 399 | if __name__ == '__main__': 400 | main() 401 | -------------------------------------------------------------------------------- /schedgym/scheduler/scheduler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """scheduler - Module with basic scheduling functionality. 5 | 6 | This is the core of the simulator, since this module contains functionality 7 | that interacts with all other components. 8 | """ 9 | 10 | from abc import ABC, abstractmethod 11 | from collections import defaultdict 12 | from typing import ( 13 | List, 14 | Iterable, 15 | Tuple, 16 | Dict, 17 | Any, 18 | Union, 19 | NamedTuple, 20 | Optional, 21 | ) 22 | 23 | import collections.abc 24 | 25 | import numpy as np 26 | 27 | from schedgym.cluster import Cluster 28 | from schedgym.job import Job, JobStatus, Resource 29 | from schedgym.event import JobEvent, EventType, EventQueue 30 | 31 | 32 | class Stats(NamedTuple): 33 | """A named tuple with scheduling statistics""" 34 | 35 | utilization: float 36 | load: float 37 | slowdown: float 38 | makespan: float 39 | bsld: float 40 | 41 | 42 | class Scheduler(ABC): 43 | # pylint: disable=too-many-instance-attributes 44 | # pylint: disable=too-many-public-methods 45 | """Base class for scheduling. 46 | 47 | This class implements the core scheduling primitives common to all 48 | schedulers, and it also manages the "connection" with the Cluster 49 | objects under its control.
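A typical interaction, as a minimal sketch (``MyScheduler`` stands in
for a hypothetical concrete subclass implementing :func:`schedule`,
and ``job`` for a :class:`schedgym.job.Job`):

>>> sched = MyScheduler(number_of_processors=10, total_memory=10)
>>> sched.submit(job)  # doctest: +SKIP
>>> sched.step()  # doctest: +SKIP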
50 | 51 | Internally, the scheduler manages four general "queues": 52 | * Admission: For jobs that have been submitted, but about which the 53 | scheduler hasn't made a decision yet 54 | * Waiting: For jobs for which the scheduler has already generated a 55 | schedule, but that haven't started yet 56 | * Running: For jobs that have started execution, but haven't 57 | finished yet 58 | * Completed: For jobs that have finished execution 59 | 60 | Parameters 61 | ---------- 62 | number_of_processors : int 63 | The number of processors in the system 64 | total_memory : int 65 | The amount of memory in the system 66 | ignore_memory : bool 67 | Whether memory should be ignored when making decisions, or not 68 | """ 69 | 70 | used_memory: int 71 | current_time: int 72 | total_memory: int 73 | used_processors: int 74 | need_schedule_call: bool 75 | number_of_processors: int 76 | queue_waiting: List[Job] 77 | queue_running: List[Job] 78 | queue_admission: List[Job] 79 | queue_completed: List[Job] 80 | cluster: Cluster 81 | job_events: EventQueue[JobEvent] 82 | stats: Dict[int, Stats] 83 | 84 | def __init__( 85 | self, number_of_processors, total_memory, ignore_memory=False 86 | ): 87 | self.number_of_processors = number_of_processors 88 | self.total_memory = total_memory 89 | 90 | self.queue_waiting = [] 91 | self.queue_running = [] 92 | self.queue_completed = [] 93 | self.queue_admission = [] 94 | 95 | self.stats = {} 96 | self.used_memory = 0 97 | self.current_time = 0 98 | self.used_processors = 0 99 | self.ignore_memory = ignore_memory 100 | self.job_events = EventQueue(self.current_time - 1) 101 | self.cluster = Cluster( 102 | number_of_processors, total_memory, ignore_memory 103 | ) 104 | self.need_schedule_call = False 105 | 'Tracks whether we might need to schedule jobs' 106 | 107 | @property 108 | def all_jobs(self) -> List[Job]: 109 | """Returns a list of all the jobs that ever got into the system""" 110 | return ( 111 | self.queue_completed 112 | + self.queue_running 113 | + self.queue_waiting 114 | + self.queue_admission 115 | ) 116 | 117 | @property 118 | def slowdown(self) -> List[float]: 119 | """Returns the slowdown of all completed jobs""" 120 | return [j.slowdown for j in self.queue_completed] 121 | 122 | @property 123 | def jobs_in_system(self) -> List[Job]: 124 | """Returns a list with all the jobs that haven't completed yet""" 125 | return self.queue_running + self.queue_waiting + self.queue_admission 126 | 127 | @property 128 | def makespan(self) -> int: 129 | """Computes the makespan of all finished jobs""" 130 | return max([0] + [j.finish_time for j in self.queue_completed]) 131 | 132 | @property 133 | def load(self) -> float: 134 | """Computes the current load in the system. 135 | 136 | The load is the ratio between the number of requested processors and 137 | the number of processors in the system. 138 | """ 139 | requested_processors = sum( 140 | [j.requested_processors for j in self.jobs_in_system] 141 | ) 142 | return requested_processors / self.number_of_processors 143 | 144 | @property 145 | def utilization(self) -> float: 146 | """Instant processor utilization.""" 147 | return self.used_processors / self.number_of_processors 148 | 149 | @property 150 | def bounded_slowdown(self) -> List[float]: 151 | """Computes the bounded slowdown for all completed jobs""" 152 | return [j.bounded_slowdown for j in self.queue_completed] 153 | 154 | def _start_running(self, j: Job) -> None: 155 | """Starts running job `j`.
156 | 157 | Parameters 158 | ---------- 159 | j : Job 160 | The job to start running 161 | """ 162 | self.queue_waiting.remove(j) 163 | self.queue_running.append(j) 164 | 165 | j.status = JobStatus.RUNNING 166 | self.used_memory += j.memory_use 167 | self.used_processors += j.processors_allocated 168 | j.wait_time = j.start_time - j.submission_time 169 | 170 | def _complete_job(self, j: Job) -> None: 171 | """Marks a job as completed. 172 | 173 | Parameters 174 | ---------- 175 | j : Job 176 | The job to mark completed 177 | """ 178 | self.queue_running.remove(j) 179 | self.queue_completed.append(j) 180 | 181 | j.status = JobStatus.COMPLETED 182 | j.finish_time = j.start_time + j.execution_time 183 | self.used_memory -= j.memory_use 184 | self.used_processors -= j.processors_allocated 185 | 186 | def _add_job_events( 187 | self, job: Job, time: int 188 | ) -> Tuple[JobEvent, JobEvent]: 189 | """Adds start and finish events for a job to the current events. 190 | 191 | Parameters 192 | ---------- 193 | job : Job 194 | The job whose events are to be added to the system 195 | time : int 196 | The time step to associate the start event with 197 | """ 198 | if not job.resources or not job.proper: 199 | raise AssertionError( 200 | 'Malformed job submitted either with no processors, ' 201 | 'or with insufficient number of ' 202 | 'processors' 203 | ) 204 | start = JobEvent(time, EventType.JOB_START, job) 205 | finish = start.clone() 206 | finish.time += job.execution_time 207 | finish.type = EventType.JOB_FINISH 208 | self.job_events.add(start) 209 | self.job_events.add(finish) 210 | 211 | return start, finish 212 | 213 | @property 214 | def free_resources(self) -> Tuple[int, int]: 215 | """Returns the amount of free resources in the system.""" 216 | return ( 217 | self.number_of_processors - self.used_processors, 218 | self.total_memory - self.used_memory, 219 | ) 220 | 221 | def step(self, offset: int = None) -> bool: 222 | """Steps the simulation 223 | 224 | Parameters 225 | ---------- 226 | offset : int 227 | The number of time steps to take (must be >= 0) 228 | """ 229 | if offset is None: 230 | offset = 1 231 | if offset < 0: 232 | raise AssertionError('Tried to move backwards in time') 233 | 234 | scheduled = False 235 | for _ in range(offset): 236 | if self.need_schedule_call or ( 237 | self.queue_admission 238 | and self.job_events.first 239 | and self.job_events.first.time == self.current_time 240 | ): 241 | self.need_schedule_call = False 242 | scheduled = True 243 | self.schedule() 244 | present = self.job_events.step(1) 245 | self.cluster = self.play_events( 246 | present, self.cluster, update_queues=True 247 | ) 248 | self.current_time += 1 249 | return scheduled 250 | 251 | def play_events( 252 | self, 253 | events: Iterable[JobEvent], 254 | cluster: Cluster, 255 | update_queues: bool = False, 256 | ) -> Cluster: 257 | """Play events from a given event queue, updating state accordingly. 258 | 259 | On top of playing the events, this also updates job statistics, 260 | which can be queried at any given time. 261 | 262 | After execution, the current state of the cluster is returned. 263 | 264 | This method is used by a number of operations: both to find future 265 | schedules for jobs and to check whether a job can be added at a given 266 | time step. For this reason, an optional argument is included to define 267 | whether to update queues or not. 
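For example, replaying a ``JOB_START`` event for a job ``j``
allocates ``j``'s resources on ``cluster`` and, when
``update_queues`` is set, also moves ``j`` from the waiting queue to
the running queue (with ``JOB_FINISH`` doing the reverse).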
268 | 269 | Parameters 270 | ---------- 271 | events : Iterable[JobEvent] 272 | The events to play 273 | cluster : Cluster 274 | The cluster to operate on when playing events 275 | update_queues : bool 276 | Whether to update queues when job start and job finished events 277 | are found. 278 | """ 279 | for event in events: 280 | if event.type == EventType.JOB_START: 281 | cluster.allocate(event.job) 282 | if update_queues: 283 | self._start_running(event.job) 284 | self.update_stats() 285 | elif event.type == EventType.JOB_FINISH: 286 | cluster.free(event.job) 287 | if update_queues: 288 | self._complete_job(event.job) 289 | self.update_stats() 290 | else: 291 | raise RuntimeError('Unexpected event type found') 292 | return cluster 293 | 294 | @staticmethod 295 | def fits( 296 | time: int, job: Job, cluster: Cluster, events: Iterable[JobEvent] 297 | ) -> Resource: 298 | """Checks whether a job fits a given cluster at a given time. 299 | 300 | Once again, this requires an iterable of events and a cluster to 301 | operate on to check whether the job fits the cluster. 302 | 303 | Parameters 304 | ---------- 305 | time : int 306 | The time at which to check whether the job fits the cluster 307 | job : Job 308 | The job to check 309 | cluster : Cluster 310 | The cluster to operate on 311 | events : Iterable[JobEvent] 312 | The job events this scheduler will operate on 313 | Returns: 314 | Resource: The set of resources (when found) or an empty set of 315 | resources (when the job won't fit the cluster). 316 | """ 317 | return cluster.find_resources_at_time(time, job, events) 318 | 319 | def some_job_fits(self, job_slots: slice = slice(0, None)): 320 | """Checks whether any job in the admission queue fits *right now*.""" 321 | 322 | return any( 323 | [self.cluster.fits(j) for j in self.queue_admission[job_slots]] 324 | ) 325 | 326 | def can_schedule_now(self, job: Job) -> Resource: 327 | """Checks whether a job can be scheduled in the current cluster now. 328 | 329 | This is a special case of :func:`fits` in which we're operating right 330 | now with the current cluster. 331 | 332 | Parameters 333 | ---------- 334 | job : Job 335 | The job to check. 336 | """ 337 | cluster = self.cluster.clone() 338 | events = filter(lambda e: e.time <= self.current_time, self.job_events) 339 | for event in events: 340 | if event.type == EventType.JOB_START: 341 | cluster.allocate(event.job) 342 | elif event.type == EventType.JOB_FINISH: 343 | cluster.free(event.job) 344 | return cluster.find_resources_at_time( 345 | self.current_time, job, self.job_events 346 | ) 347 | 348 | def find_first_time_for(self, job: Job) -> Tuple[int, Resource]: 349 | """Finds the first time stamp on which we can start a job.
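It first tries the current time and, failing that, replays pending
job events in chronological order, returning the first time step at
which the cluster can accommodate the job.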
350 | 351 | Parameters 352 | ---------- 353 | job : Job 354 | The job to find a time for 355 | """ 356 | 357 | if (not self.job_events.next) or ( 358 | self.job_events.next.time > self.current_time 359 | ): 360 | resources = self.cluster.find_resources_at_time( 361 | self.current_time, job, self.job_events 362 | ) 363 | if resources: 364 | return self.current_time, resources 365 | 366 | near_future: Dict[int, List[JobEvent]] = defaultdict(list) 367 | for e in self.job_events: 368 | near_future[e.time].append(e) 369 | 370 | cluster = self.cluster.clone() 371 | for time in sorted(near_future): 372 | cluster = self.play_events(near_future[time], cluster) 373 | resources = cluster.find_resources_at_time( 374 | time, job, self.job_events 375 | ) 376 | if resources: 377 | return time, resources 378 | 379 | raise AssertionError( 380 | 'Failed to find time for job, even in the far future.' 381 | ) 382 | 383 | def submit(self, job: Union[Job, Iterable[Optional[Job]]]) -> None: 384 | """Submits a new job to the system. 385 | 386 | Parameters 387 | ---------- 388 | job : Union[Job, Iterable[Optional[Job]]] 389 | Can either be a single job, or an iterable of jobs. If 390 | an iterable, all jobs in it are submitted at the same 391 | time. 392 | """ 393 | if isinstance(job, collections.abc.Iterable): 394 | for j in job: 395 | self._submit(j) 396 | else: 397 | self._submit(job) 398 | self.need_schedule_call = True 399 | 400 | def _submit(self, job: Optional[Job]) -> None: 401 | """Internal implementation of job submission. 402 | 403 | Adds the new job to the `queue_admission` list and sets job status 404 | to `JobStatus.SUBMITTED`. 405 | """ 406 | if job is None: 407 | return 408 | 409 | if job.requested_processors > self.number_of_processors: 410 | raise RuntimeError( 411 | 'Impossible to allocate resources for job bigger than cluster.' 412 | ) 413 | job.submission_time = self.current_time 414 | job.status = JobStatus.SUBMITTED 415 | 416 | # Compute statistics to be used in state representation {{{ 417 | job.queue_size = len(self.queue_admission) 418 | job.queued_work = sum( 419 | [ 420 | j.requested_time * j.requested_processors 421 | for j in self.queue_admission 422 | ] 423 | ) 424 | job.free_processors = self.cluster.state[0][0] 425 | # }}} 426 | 427 | self.queue_admission.append(job) 428 | 429 | def state(self, timesteps: int, job_slots: int): 430 | """Returns the current state of the cluster as viewed by the scheduler.
431 | 432 | The state representation used here is deeply inspired by the DeepRM 433 | state representation, meaning it will return three blocks of 434 | information: 435 | * The current status of processors and memory used in the system 436 | * A select number of jobs in the admission queue 437 | * A "backlog" representing the presence or absence of jobs in the queue 438 | (for jobs that didn't make it into the previous representation) 439 | 440 | Parameters 441 | ---------- 442 | timesteps : int 443 | The number of time steps to look into the future 444 | job_slots : int 445 | The number of job slots to use (the amount of jobs in the 446 | admission queue to represent) 447 | """ 448 | # Gets all events between now and `timesteps` {{{ 449 | near_future: Dict[int, List[JobEvent]] = defaultdict(list) 450 | for e in filter( 451 | lambda e: e.time < self.current_time + timesteps + 1, 452 | self.job_events, 453 | ): 454 | near_future[e.time - self.current_time].append(e) 455 | # }}} 456 | 457 | # Gets the state representation of currently in use resources {{{ 458 | tmp = [] 459 | cluster = self.cluster.clone() 460 | for t in range(timesteps): 461 | if t in near_future: 462 | cluster = self.play_events(near_future[t], cluster) 463 | tmp.append(cluster.state) 464 | state = list(zip(*tmp)) 465 | if self.ignore_memory: 466 | state = state[:1] 467 | # }}} 468 | 469 | # Gets the representation of jobs in `job_slots` {{{ 470 | jobs = [ 471 | j.state 472 | for i, j in enumerate(self.queue_admission) 473 | if i < job_slots 474 | ] 475 | for i, job in enumerate(self.queue_admission): 476 | if i >= job_slots: 477 | break 478 | job.slot_position = i 479 | jobs += [Job().state for _ in range(job_slots - len(jobs))] 480 | # }}} 481 | 482 | # Gets the backlog {{{ 483 | backlog = max(len(self.queue_admission) - len(jobs), 0) 484 | # }}} 485 | 486 | return state, jobs, backlog 487 | 488 | def assign_schedule( 489 | self, job, resources, time 490 | ) -> Tuple[JobEvent, JobEvent]: 491 | """Assigns a schedule to a job. 492 | 493 | What this means is that the job is removed from the admission queue 494 | and is put into the "waiting" queue, which contains jobs that *will* 495 | run and already have a schedule. Also changes job status and assigns 496 | resources to a job, along with the time it will start running. 497 | 498 | Parameters 499 | ---------- 500 | job : Job 501 | The job to be assigned a schedule 502 | resources : Resource 503 | The set of resources the job will use 504 | time : int 505 | The start time of the job 506 | """ 507 | job.status = JobStatus.WAITING 508 | job.resources.memory = resources.memory 509 | job.resources.processors = resources.processors 510 | job.resources.ignore_memory = resources.ignore_memory 511 | job.start_time = time 512 | self.queue_waiting.append(job) 513 | return self._add_job_events(job, time) 514 | 515 | @abstractmethod 516 | def schedule(self) -> Any: 517 | """Schedules tasks.""" 518 | 519 | def update_stats(self) -> None: 520 | """Updates the usage statistics of the system. 521 | 522 | Statistics are only computed when job events happen in the cluster. 523 | """ 524 | self.stats[self.current_time] = Stats( 525 | self.utilization, 526 | self.load, 527 | np.mean(self.slowdown) if self.queue_completed else 0.0, 528 | self.makespan, 529 | np.mean(self.bounded_slowdown) if self.queue_completed else 0.0, 530 | ) 531 | --------------------------------------------------------------------------------