├── .gitignore ├── docs ├── requirements.txt ├── img │ ├── gym.gif │ ├── job-resource.drawio │ ├── job-resource.svg │ └── cluster-resourcepool.svg ├── modules.rst ├── tutorials │ └── index.rst ├── index.rst ├── Makefile ├── design.rst ├── make.bat ├── schedgym.rst └── conf.py ├── mypy.ini ├── setup.cfg ├── pyproject.toml ├── .coveragerc ├── schedgym ├── __init__.py ├── envs │ ├── __init__.py │ ├── render.py │ ├── deeprm_env.py │ ├── simulator.py │ ├── compact_env.py │ ├── base.py │ └── workload.py ├── scheduler │ ├── random_scheduler.py │ ├── backfilling_scheduler.py │ ├── __init__.py │ ├── fifo_scheduler.py │ ├── sjf_scheduler.py │ ├── tetris_scheduler.py │ ├── packer_scheduler.py │ ├── easy_scheduler.py │ ├── null_scheduler.py │ └── scheduler.py ├── workload │ ├── __init__.py │ ├── base.py │ ├── swf_parser.py │ ├── distribution.py │ └── trace.py ├── resource.py ├── simulator.py ├── heap.py ├── pool.py ├── event.py ├── cluster.py └── job.py ├── requirements.txt ├── LICENSE ├── sjf-agent.py ├── .github └── workflows │ ├── pythonpackage.yml │ └── codeql-analysis.yml ├── README.rst ├── setup.py └── deeprm-agent.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | venv 3 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | Sphinx 2 | docutils 3 | nbsphinx 4 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | 3 | ignore_missing_imports = True 4 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | version = attr: schedgym.__version__ 3 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "wheel", "Cython"] -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = schedgym/envs/render.py 3 | relative_files = True 4 | -------------------------------------------------------------------------------- /docs/img/gym.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/renatolfc/sched-rl-gym/HEAD/docs/img/gym.gif -------------------------------------------------------------------------------- /schedgym/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | __version__ = '0.1.0' 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.16.3 2 | intervaltree==3.0.2 3 | pytest==4.6.3 4 | coverage==5.3 5 | pytest-cov 6 | parallelworkloads 7 | cython 8 | -------------------------------------------------------------------------------- /docs/modules.rst: -------------------------------------------------------------------------------- 1 | Source code modules documentation 2 | ================================= 3 | 4 | .. 
toctree:: 5 | :maxdepth: 4 6 | 7 | schedgym 8 | -------------------------------------------------------------------------------- /docs/tutorials/index.rst: -------------------------------------------------------------------------------- 1 | Tutorials 2 | ========= 3 | 4 | Currently, we have a single tutorial that shows how to implement a PPO agent, 5 | but in the future we will add more alternatives. 6 | 7 | .. toctree:: 8 | :maxdepth: 1 9 | :caption: Tutorials: 10 | 11 | ppo.ipynb 12 | -------------------------------------------------------------------------------- /schedgym/envs/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import logging 5 | from gym.envs.registration import register 6 | 7 | from .deeprm_env import DeepRmEnv 8 | from .compact_env import CompactRmEnv 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | register( 13 | id='DeepRM-v0', 14 | nondeterministic=False, 15 | entry_point=f'schedgym.envs.deeprm_env:{DeepRmEnv.__name__}', 16 | ) 17 | 18 | register( 19 | id='CompactRM-v0', 20 | nondeterministic=False, 21 | entry_point=f'schedgym.envs.compact_env:{CompactRmEnv.__name__}', 22 | ) 23 | -------------------------------------------------------------------------------- /schedgym/scheduler/random_scheduler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """random - a random scheduler""" 5 | 6 | import random 7 | 8 | from schedgym.scheduler import PackerScheduler 9 | 10 | 11 | class RandomScheduler(PackerScheduler): 12 | """A random scheduling policy. 13 | 14 | This reuses functionality from the :class:`PackerScheduler`. Therefore, it 15 | only needs to define a random priority function. 16 | """ 17 | 18 | def get_priority(self, _) -> int: 19 | """Random priority function for random scheduler.""" 20 | return random.randint(0, len(self.queue_admission) - 1) 21 | -------------------------------------------------------------------------------- /schedgym/workload/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """workload - Package for generators of load for a cluster. 5 | 6 | Supports generative workloads, based on probability distributions, and 7 | trace-based workloads in the Standard Workload Format. 8 | """ 9 | 10 | from .base import WorkloadGenerator 11 | from .distribution import BinomialWorkloadGenerator 12 | from .distribution import DistributionalWorkloadGenerator 13 | from .trace import TraceGenerator, SwfGenerator 14 | 15 | __all__ = [ 16 | 'WorkloadGenerator', 17 | 'DistributionalWorkloadGenerator', 18 | 'BinomialWorkloadGenerator', 19 | 'TraceGenerator', 20 | 'SwfGenerator', 21 | ] 22 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to sched-rl-gym's documentation! 2 | ======================================== 3 | 4 | In this documentation site you will find information about the general design 5 | of the environment, a set of tutorials on how to instantiate and use it, along 6 | with a reference of the source code. 7 | 8 | Please use the following list of contents to select what you're interested in. 9 | 10 | .. 
toctree::
11 |    :maxdepth: 2
12 |    :caption: Contents:
13 | 
14 |    design
15 |    tutorials/index
16 |    modules
17 | 
18 | .. include:: ../README.rst
19 |    :start-after: inclusion-marker-do-not-remove
20 | 
21 | 
22 | Indices and tables
23 | ==================
24 | 
25 | * :ref:`genindex`
26 | * :ref:`modindex`
27 | * :ref:`search`
--------------------------------------------------------------------------------
/schedgym/scheduler/backfilling_scheduler.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | """backfilling_scheduler - Module for a conservative backfilling scheduler"""
5 | 
6 | from schedgym.scheduler import Scheduler
7 | 
8 | 
9 | class BackfillingScheduler(Scheduler):
10 |     """Implements a conservative backfilling scheduler."""
11 | 
12 |     def schedule(self) -> None:
13 |         "Schedules queued jobs according to the conservative backfilling strategy."
14 |         for job in self.queue_admission:
15 |             time, resources = self.find_first_time_for(job)
16 |             if not resources:
17 |                 raise AssertionError('Something is terribly wrong')
18 |             self.assign_schedule(job, resources, time)
19 |         self.queue_admission.clear()
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 | 
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS    ?=
7 | SPHINXBUILD   ?= sphinx-build
8 | SOURCEDIR     = .
9 | BUILDDIR      = _build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | docs:
18 | 	mkdir -p docs/img
19 | 	cp img/gym.gif docs/img/
20 | 
21 | # Catch-all target: route all unknown targets to Sphinx using the new
22 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
23 | %: Makefile docs
24 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--------------------------------------------------------------------------------
/docs/design.rst:
--------------------------------------------------------------------------------
1 | Design of sched-rl-gym
2 | ======================
3 | 
4 | This page documents the overall design of the environment, and may be useful in
5 | understanding its components.
6 | 
7 | `sched-rl-gym` is designed as a stack of layers, each one isolating a distinct
8 | set of responsibilities. Conceptually, we have three layers:
9 | 
10 | 1. Simulator primitives
11 | 2. Simulator
12 | 3. OpenAI Gym <-> Simulator Glue
13 | 
14 | User code lives in a fourth layer atop layer 3. A benefit of this design is
15 | that one can also access each layer directly, which is useful for:
16 | 
17 | 1. Unit testing (the code is tested with `coverage on coveralls.io
18 |    <https://coveralls.io/github/renatolfc/sched-rl-gym>`_)
19 | 2. Using the simulator directly (to replicate results, for example)
--------------------------------------------------------------------------------
/schedgym/scheduler/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | """scheduler - basic scheduling algorithms for the *simulation* layer."""
5 | 
6 | from .scheduler import Scheduler
7 | from .sjf_scheduler import SjfScheduler
8 | from .fifo_scheduler import FifoScheduler
9 | from .backfilling_scheduler import BackfillingScheduler
10 | from .null_scheduler import NullScheduler
11 | from .packer_scheduler import PackerScheduler
12 | from .random_scheduler import RandomScheduler
13 | from .tetris_scheduler import TetrisScheduler
14 | from .easy_scheduler import EasyScheduler
15 | 
16 | __all__ = [
17 |     'Scheduler',
18 |     'SjfScheduler',
19 |     'BackfillingScheduler',
20 |     'NullScheduler',
21 |     'PackerScheduler',
22 |     'RandomScheduler',
23 |     'TetrisScheduler',
24 |     'EasyScheduler',
25 |     'FifoScheduler',
26 | ]
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 | 
3 | pushd %~dp0
4 | 
5 | REM Command file for Sphinx documentation
6 | 
7 | if "%SPHINXBUILD%" == "" (
8 | 	set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 | 
13 | if "%1" == "" goto help
14 | 
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | 	echo.
18 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | 	echo.installed, then set the SPHINXBUILD environment variable to point
20 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | 	echo.may add the Sphinx directory to PATH.
22 | 	echo.
23 | 	echo.If you don't have Sphinx installed, grab it from
24 | 	echo.http://sphinx-doc.org/
25 | 	exit /b 1
26 | )
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 | 
34 | :end
35 | popd
--------------------------------------------------------------------------------
/schedgym/scheduler/fifo_scheduler.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | """fifo_scheduler - First-In First-Out module"""
5 | 
6 | from typing import List
7 | 
8 | from schedgym.job import Job
9 | from schedgym.scheduler import Scheduler
10 | 
11 | 
12 | class FifoScheduler(Scheduler):
13 |     """A FIFO scheduler."""
14 | 
15 |     def schedule(self) -> None:
16 |         """Schedules jobs according to submission time.
17 | 
18 |         This implements a *strict* FIFO strategy, meaning it will always obey
19 |         submission order, even when it creates fragmentation.
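
        For illustration (hypothetical numbers, not drawn from the test
        suite): with 10 free processors and an admission queue requesting
        8, 4, and 1 processors, the 8-processor job starts, the 4-processor
        job does not fit and blocks the queue, and the 1-processor job
        waits too, even though two processors sit idle.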
20 | """ 21 | scheduled_jobs: List[Job] = [] 22 | for job in self.queue_admission: 23 | resources = self.can_schedule_now(job) 24 | if resources: 25 | self.assign_schedule(job, resources, self.current_time) 26 | scheduled_jobs.append(job) 27 | else: 28 | break 29 | for job in scheduled_jobs: 30 | self.queue_admission.remove(job) 31 | -------------------------------------------------------------------------------- /schedgym/workload/base.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """base - base module for all workload generators""" 5 | 6 | from abc import ABC, abstractmethod 7 | from typing import Optional, List 8 | 9 | from schedgym.job import Job 10 | 11 | 12 | class WorkloadGenerator(ABC): 13 | """An abstract workload generator""" 14 | 15 | current_time: int 16 | 17 | @abstractmethod 18 | def step(self, offset: int = 1) -> List[Optional[Job]]: 19 | """Steps the workload generator by :param offset:. 20 | 21 | This may, or may not, return new jobs, depending on the internal 22 | probability distributions of the workload generator. 23 | 24 | Parameters 25 | ---------- 26 | offset : int 27 | The number of time steps to advance the workload generator. 28 | """ 29 | 30 | @abstractmethod 31 | def __len__(self): 32 | """Returns the length of the workload. Zero if unbounded.""" 33 | 34 | @abstractmethod 35 | def peek(self): 36 | """Peeks what would be the next job""" 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019-2020 Renato L. de F. Cunha 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /docs/schedgym.rst: -------------------------------------------------------------------------------- 1 | schedgym package 2 | =============== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | schedgym.envs 11 | schedgym.scheduler 12 | schedgym.workload 13 | 14 | Submodules 15 | ---------- 16 | 17 | schedgym.resource module 18 | ----------------------- 19 | 20 | .. automodule:: schedgym.resource 21 | :members: 22 | :undoc-members: 23 | :show-inheritance: 24 | 25 | schedgym.cluster module 26 | ---------------------- 27 | 28 | .. 
29 |    :members:
30 |    :undoc-members:
31 |    :show-inheritance:
32 | 
33 | schedgym.event module
34 | ---------------------
35 | 
36 | .. automodule:: schedgym.event
37 |    :members:
38 |    :undoc-members:
39 |    :show-inheritance:
40 | 
41 | schedgym.heap module
42 | --------------------
43 | 
44 | .. inheritance-diagram:: schedgym.heap.Heap
45 | 
46 | .. automodule:: schedgym.heap
47 |    :members:
48 |    :undoc-members:
49 |    :show-inheritance:
50 | 
51 | schedgym.job module
52 | -------------------
53 | 
54 | .. automodule:: schedgym.job
55 |    :members:
56 |    :undoc-members:
57 |    :show-inheritance:
58 | 
59 | schedgym.pool module
60 | --------------------
61 | 
62 | .. automodule:: schedgym.pool
63 |    :members:
64 |    :undoc-members:
65 |    :show-inheritance:
66 | 
67 | schedgym.simulator module
68 | -------------------------
69 | 
70 | .. automodule:: schedgym.simulator
71 |    :members:
72 |    :undoc-members:
73 |    :show-inheritance:
74 | 
75 | Module contents
76 | ---------------
77 | 
78 | .. automodule:: schedgym
79 |    :members:
80 |    :undoc-members:
81 |    :show-inheritance:
--------------------------------------------------------------------------------
/schedgym/scheduler/sjf_scheduler.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | """sjf_scheduler - Shortest job first scheduler"""
5 | 
6 | from typing import List
7 | 
8 | from schedgym.job import Job
9 | from schedgym.scheduler import Scheduler
10 | 
11 | 
12 | class SjfScheduler(Scheduler):
13 |     """A shortest job first scheduler."""
14 | 
15 |     def schedule(self) -> None:
16 |         """Schedules jobs according to shortest job first.
17 | 
18 |         This does so by re-sorting the queue by requested time and iterating
19 |         through it, scheduling every job that can start immediately.
20 |         """
21 |         ignored_jobs: List[Job] = []
22 |         # XXX: We always re-sort the queue. If we ever want to learn from
23 |         # demonstration, we'd probably have to do something like:
24 |         # candidates = sorted(
25 |         #     enumerate(self.queue_admission),
26 |         #     key=lambda e:(e[1].requested_time, e[1].submission_time)
27 |         # )
28 |         # and work from there. Hence, the jobs we scheduled would have their
29 |         # indices and we could generate intermediate states as needed.
30 |         # FIXME: With the current implementation, smaller jobs that are
31 |         # longer may be scheduled first.
32 |         for job in sorted(
33 |             self.queue_admission,
34 |             key=lambda j: (j.requested_time, j.submission_time),
35 |         ):
36 |             resources = self.can_schedule_now(job)
37 |             if resources:
38 |                 self.assign_schedule(job, resources, self.current_time)
39 |             else:
40 |                 ignored_jobs.append(job)
41 |         self.queue_admission = ignored_jobs
--------------------------------------------------------------------------------
/schedgym/scheduler/tetris_scheduler.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | """tetris_scheduler - A scheduler that mixes Packer and SJF"""
5 | 
6 | from schedgym.job import Job
7 | from schedgym.scheduler import PackerScheduler
8 | 
9 | 
10 | class TetrisScheduler(PackerScheduler):
11 |     """Implements the Tetris scheduler.
12 | 
13 |     Adds a factor that controls how much Packer behavior and how much SJF
14 |     behavior influences the scheduler.
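
    As a worked example (hypothetical numbers, using the dot-product form of
    the priority): with ``packer_sjf_ratio = 0.5``, 10 free processors, 100
    free memory units, and a job requesting 2 processors, 10 memory units,
    and 4 time steps, the priority is
    ``0.5 * (10 * 2 + 100 * 10) + 0.5 * (1 / 4) = 510.125``.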
15 | 
16 |     Parameters
17 |     ----------
18 |     number_of_processors : int
19 |         Number of processors in the cluster this scheduler manages
20 |     total_memory : int
21 |         Amount of memory in the cluster this scheduler manages
22 |     packer_sjf_ratio : float
23 |         Dial to tune packer to sjf ratio. Valid values in [0, 1], with
24 |         0 being SJF and 1 being Packer behavior.
25 |     """
26 | 
27 |     packer_sjf_ratio: float
28 | 
29 |     def __init__(
30 |         self,
31 |         number_of_processors: int,
32 |         total_memory: int,
33 |         packer_sjf_ratio: float = 0.5,
34 |     ):
35 |         super().__init__(number_of_processors, total_memory)
36 |         self.packer_sjf_ratio = packer_sjf_ratio
37 | 
38 |     def get_priority(self, j: Job) -> float:
39 |         """Gives the packer/sjf mixed priority.
40 | 
41 |         Parameters
42 |         ----------
43 |         j : Job
44 |             The job for which we're computing priority.
45 |         """
46 |         return (
47 |             self.packer_sjf_ratio
48 |             * (
49 |                 self.free_resources[0] * j.requested_processors
50 |                 + self.free_resources[1]
51 |                 * j.requested_memory  # dot product, as in PackerScheduler.get_priority
52 |             )
53 |             + (1 - self.packer_sjf_ratio) * 1.0 / j.requested_time
54 |         )
--------------------------------------------------------------------------------
/docs/img/job-resource.drawio:
--------------------------------------------------------------------------------
1 | 7Vtdb+I6EP01PLbCCfngsUD37q66EtpWur1PKzcxxFonZh1TYH/9HRObkARoSgmw2kioYia2sX3OmfEkaccexst/BJ5F33hIWMfqhsuOPepYlocc+Kscq8zRs1DmmAoaZq4txyP9TbSzq71zGpK00FByziSdFZ0BTxISyIIPC8EXxWYTzoq/OsNTUnE8BphVvf/SUEaZ13e6uf8zodPI/DLq6isxNo21I41wyBdbLvu+Yw8F5zL7Fi+HhKm9M/uS9fu05+pmYoIksk6Hm+cxm/zw5vzml//6YxLcre6fb1AvG+YVs7le8XeS8rkIiJ61XJmtSBc0ZjgBazDhiXzUVxDYmNFpAt8DmAsR4HglQlLYxTt9QfIZeIOIsvABr/hczTiVOPhprEHEBf0Nw2Kmx4TLQmpCWG6hxaPqCe4ueAVJoc3YbAMqub7hZaHhA06ldgScMTxL6ctmGTEWU5oMuJQ83jSKaaAv662CpZHlXhDQBlqQBOExkWIFTXSHnqGHloPh0SLnFnK1L9rile9qSms6Tzcj54jDFw36ewjgVAjQsQZqTYIHJE25gF+8A/uLQhZaPQlSpQZsiFxDJvhPMuSMAwdGCc+4QhkruQxdGJnIvWRJZzigyfRh3WbUyz3f9bYoF4e+E7aWVUTDkCQKaC6xxBmqCsIZp4lc75szgA/s7rB763QcmPgQbJTb8FHNhRzyBNaC6RpUApRZEEWbegw4ILQqLzQPLLceD6zGeODu4UFMYg7jtxxonAOOdWkOeDs4UEKZ0TV6GcomI6KjII4BLEZyTJ8U5KMbVMHdruJu78CY4RfCxjylknI1vsjalrC/GLyeXzPUN4Vuv4LuV/7SZvkGsjzyj8vyxndy7M0ZtBLdhT7s6SS/9+zXBvc31d+/7gRvFtIG9ybgrZu7mwru1cxtpDyGarkN8g0EebvfLwR55F26lvP3BPm2lDtpKPCuO9CjXYG+reTOSIGLF3LVk36b6k8F7qXLOKuC7ZDNU5Wq2yTffCVXO8k3VsrZe6J7W8mdSv/Wdef36hObNrifCtxLl3FWNbofPKm1If7jId61SiHerqlwu9cUC1qJf0jipwngjWm8xlMWkoR36uUFsEKKY56ETxFVCRMufKLM6BEso0aVYSMZ51dgqs/bxn8KHcid2hwZDWbWylhLKp+3vm/1AivvpAzTJ5s9CSsvUpQUCCvMjiQHalf9NBrizJQcxLi7G+NdoArCsKSvxdntglUPN1Y8zeNDzyvGh80jfDOEuc+27pWTozKQa78xULbqykBrlm3W+AHi1SgLr4x41tmYZ/biTeZlFP07mAdUwKutZjqC752wW3q7BfWcw/Mqtff7Jd5nEzipCsxW/EEquEIRuGcTQZm7yOndOsfJwCnpydzAb1oFjv0uFSBUUE1DMqjxOPDKZOAidFAIyhgTQWFrVMFzMXH459MGKj6CsnqlQ2ttZTilgVAzCcIpPzNzvMPz6u+eV7PSqBbh1y4N7wpTxJ6q+xwpwvfVjhynBLfEONtVp9Cz5AnXeleesOxz5Ild95uvWwxn04JfUwr22ZRQfifBQtatV7oRXVcJtv32WEdXrGDm//mQNc//fcS+/x8=
--------------------------------------------------------------------------------
/schedgym/scheduler/packer_scheduler.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | """packer_scheduler - A scheduler based on the Packer heuristic"""
5 | 
6 | from typing import List
7 | 
8 | from schedgym.job import Job
9 | from schedgym.scheduler import Scheduler
10 | 
11 | 
12 | class PackerScheduler(Scheduler):
13 |     """Implements the Packer heuristic.
14 | 
15 |     In the original paper, jobs are prioritized by the dot product of the
16 |     requested resources with the set of available resources. Since the total
17 |     amount of resources does not change, jobs that request more resources are
18 |     preferred. The prioritization changes every time a new job is scheduled,
19 |     since jobs are scheduled one at a time.
20 |     """
21 | 
22 |     def get_priority(self, j: Job) -> float:
23 |         """Computes the priority of a given job.
24 | 
25 |         This computes the priority of a job according to the Packer heuristic,
26 |         which will prefer jobs with higher dot-product between free and
27 |         requested resources.
28 | 
29 |         Parameters
30 |         ----------
31 |         j : Job
32 |             The job whose priority is to be calculated.
33 |         """
34 |         if self.ignore_memory:
35 |             return self.free_resources[0] * j.requested_processors
36 |         return (
37 |             self.free_resources[0] * j.requested_processors
38 |             + self.free_resources[1] * j.requested_memory
39 |         )
40 | 
41 |     def schedule(self) -> None:
42 |         """Schedules jobs according to the Packer heuristic."""
43 |         ignored_jobs: List[Job] = []
44 |         for job in reversed(
45 |             sorted(self.queue_admission, key=lambda j: self.get_priority(j))
46 |         ):
47 |             resources = self.can_schedule_now(job)
48 |             if resources:
49 |                 self.assign_schedule(job, resources, self.current_time)
50 |             else:
51 |                 ignored_jobs.append(job)
52 |         self.queue_admission = ignored_jobs
--------------------------------------------------------------------------------
/sjf-agent.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | import os
5 | import gym
6 | import json
7 | import numpy as np
8 | import schedgym.envs as deeprm
9 | 
10 | EPISODES = 1
11 | MAX_EPISODE_LENGTH = 200
12 | 
13 | 
14 | def sjf_action(observation):
15 |     "Selects the job SJF (Shortest Job First) would select."
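    # A note on the layout this function assumes (inferred from this file and
    # render.py, not from separate documentation): the raw observation is the
    # tuple (current, wait, backlog, time); `current` and `wait` are occupancy
    # images in which a nonzero entry in the first time row marks a resource
    # unit a job needs, and summing a slot along the time axis approximates
    # the job's duration. The fallback action, wait.shape[1], is one past the
    # last job slot, i.e. "schedule nothing".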
16 | 
17 |     current, wait, _, _ = observation
18 |     best = wait.shape[2] + 1  # infinity
19 |     best_idx = wait.shape[1]
20 | 
21 |     free = np.ones(current.shape[0]) * current.shape[-1] - np.sum(current[:, 0, :] != 0, axis=1)  # free units per resource
22 | 
23 |     for slot in range(wait.shape[1]):
24 |         required_resources = (wait[:, slot, 0, :] != 0).sum(axis=1)
25 |         if np.all(required_resources) and np.all(required_resources <= free):
26 |             tmp = np.sum(wait[0, slot, :, 0])
27 |             if tmp < best:
28 |                 best_idx = slot
29 |                 best = tmp
30 |     return best_idx
31 | 
32 | 
33 | def find_position(q, idx):
34 |     for i, j in enumerate(q):
35 |         if j.slot_position == idx:
36 |             return i
37 |     return idx
38 | 
39 | 
40 | def pack_observation(ob, time_horizon):
41 |     current, wait, backlog, time = ob
42 |     wait = wait.reshape(time_horizon, -1)
43 |     current = current.reshape(time_horizon, -1)
44 |     return np.hstack((current, wait, backlog, time))
45 | 
46 | def main():
47 |     kwargs = {'use_raw_state': True}
48 |     if os.path.exists('config/test.json'):
49 |         with open('config/test.json', 'r') as fp:
50 |             kwargs = json.load(fp)
51 |     env: deeprm.DeepRmEnv = gym.make('DeepRM-v0', **kwargs)
52 |     time_horizon = env.reset()[0].shape[1]
53 |     for episode in range(EPISODES):
54 |         ob = env.reset()
55 |         action = sjf_action(ob)
56 |         while True:
57 |             ob, reward, done, _ = env.step(action)
58 |             action = sjf_action(ob)
59 |             ob = pack_observation(ob, time_horizon)
60 |             env.render()
61 |             if done:
62 |                 break
63 |     env.close()
64 | 
65 | if __name__ == '__main__':
66 |     main()
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | # Configuration file for the Sphinx documentation builder.
2 | #
3 | # This file only contains a selection of the most common options. For a full
4 | # list see the documentation:
5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
6 | 
7 | # -- Path setup --------------------------------------------------------------
8 | 
9 | # If extensions (or modules to document with autodoc) are in another directory,
10 | # add these directories to sys.path here. If the directory is relative to the
11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 | #
13 | import os
14 | import sys
15 | sys.path.insert(0, os.path.abspath('..'))
16 | 
17 | import schedgym
18 | 
19 | 
20 | # -- Project information -----------------------------------------------------
21 | 
22 | project = 'sched-rl-gym'
23 | copyright = '2020, Renato L. de F. Cunha'
24 | author = 'Renato L. de F. Cunha'
25 | 
26 | 
27 | # -- General configuration ---------------------------------------------------
28 | 
29 | # Add any Sphinx extension module names here, as strings. They can be
30 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
31 | # ones.
32 | extensions = [
33 |     'nbsphinx',
34 |     'sphinx.ext.autodoc',
35 |     'sphinx.ext.coverage',
36 |     'sphinx.ext.napoleon',
37 |     'sphinx.ext.inheritance_diagram',
38 | ]
39 | 
40 | # Add any paths that contain templates here, relative to this directory.
41 | templates_path = ['_templates']
42 | 
43 | # List of patterns, relative to source directory, that match files and
44 | # directories to ignore when looking for source files.
45 | # This pattern also affects html_static_path and html_extra_path.
46 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 47 | 48 | 49 | # -- Options for HTML output ------------------------------------------------- 50 | 51 | # The theme to use for HTML and HTML Help pages. See the documentation for 52 | # a list of builtin themes. 53 | # 54 | html_theme = 'alabaster' 55 | 56 | # Add any paths that contain custom static files (such as style sheets) here, 57 | # relative to this directory. They are copied after the builtin static files, 58 | # so a file named "default.css" will overwrite the builtin "default.css". 59 | html_static_path = ['_static'] 60 | -------------------------------------------------------------------------------- /.github/workflows/pythonpackage.yml: -------------------------------------------------------------------------------- 1 | name: sched-rl-gym 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | strategy: 12 | max-parallel: 4 13 | matrix: 14 | python-version: [3.7, 3.8, 3.9] 15 | 16 | steps: 17 | - uses: actions/checkout@v1 18 | - name: Set up Python ${{ matrix.python-version }} 19 | uses: actions/setup-python@v1 20 | with: 21 | python-version: ${{ matrix.python-version }} 22 | - name: Install dependencies 23 | run: | 24 | python -m pip install --upgrade pip 25 | pip install -e . 26 | - name: Lint with flake8 27 | run: | 28 | pip install flake8 29 | # stop the build if there are Python syntax errors or undefined names 30 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 31 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 32 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 33 | - name: Test with pytest & coverage 34 | run: | 35 | pip install pytest pytest-cov coveralls 36 | coverage run --source schedgym -m pytest schedgym 37 | coverage report -m 38 | - name: Coveralls Parallel 39 | uses: AndreMiras/coveralls-python-action@develop 40 | with: 41 | github-token: ${{ secrets.github_token }} 42 | parallel: true 43 | finish_build: 44 | name: Finish Coveralls 45 | needs: build 46 | runs-on: ubuntu-latest 47 | steps: 48 | - name: Finished 49 | uses: AndreMiras/coveralls-python-action@develop 50 | with: 51 | github-token: ${{ secrets.github_token }} 52 | parallel-finished: true 53 | build_wheels: 54 | name: Build wheels on ${{ matrix.os }} 55 | runs-on: ${{ matrix.os }} 56 | strategy: 57 | matrix: 58 | os: [ubuntu-20.04, macOS-10.15] 59 | steps: 60 | - uses: actions/checkout@v2 61 | - uses: actions/setup-python@v2 62 | - name: Build wheels 63 | uses: pypa/cibuildwheel@v2.3.1 64 | - uses: actions/upload-artifact@v2 65 | with: 66 | path: ./wheelhouse/*.whl 67 | build_sdist: 68 | name: Build source distribution 69 | runs-on: ubuntu-latest 70 | steps: 71 | - uses: actions/checkout@v2 72 | 73 | - name: Build sdist 74 | run: pip install cython && pipx run --system-site-packages build --sdist 75 | 76 | - uses: actions/upload-artifact@v2 77 | with: 78 | path: dist/*.tar.gz -------------------------------------------------------------------------------- /schedgym/resource.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """resource - basic resource unit 5 | 6 | This module has two classes: 7 | 1. `PrimaryResource`, an enumeration for the different supported types (CPU 8 | and MEMORY) 9 | 2. 
The basic resource group, which comprises *both* CPU and memory
10 | """
11 | 
12 | import copy
13 | import enum
14 | from typing import Tuple
15 | 
16 | from intervaltree import IntervalTree
17 | 
18 | 
19 | class PrimaryResource(enum.IntEnum):
20 |     """Enumeration for identifying the various supported resource types."""
21 | 
22 |     CPU = 0
23 |     MEMORY = 1
24 | 
25 | 
26 | class Resource(object):
27 |     """The basic resource group.
28 | 
29 |     This groups IntervalTrees, one for each resource type supported in
30 |     the system.
31 | 
32 |     This is referenced by a :class:`schedgym.job.Job` to represent *which
33 |     specific resources* are being used by that job.
34 | 
35 |     Parameters
36 |     ----------
37 |     processors : IntervalTree
38 |         An interval tree that defines a set of processors
39 |     memory : IntervalTree
40 |         An interval tree that defines a set of memory resources
41 |     ignore_memory : bool
42 |         Whether memory should be taken into consideration when measuring
43 |         resource usage.
44 |     """
45 | 
46 |     memory: IntervalTree
47 |     """IntervalTree that stores memory used"""
48 |     processors: IntervalTree
49 |     """IntervalTree that stores processors used"""
50 | 
51 |     def __init__(
52 |         self,
53 |         processors: IntervalTree = IntervalTree(),
54 |         memory: IntervalTree = IntervalTree(),
55 |         ignore_memory: bool = False,
56 |     ):
57 |         self.ignore_memory = ignore_memory
58 |         self.processors = copy.copy(processors)
59 |         self.memory = copy.copy(memory)
60 | 
61 |     def measure(self) -> Tuple[int, int]:
62 |         """Returns the total amount of resources in use.
63 | 
64 |         Returns:
65 |             Tuple: A tuple containing the amount of resources used for each
66 |             resource type supported.
67 |         """
68 |         processors = sum([i.end - i.begin for i in self.processors])
69 |         memory = sum([i.end - i.begin for i in self.memory])
70 |         return processors, memory
71 | 
72 |     def __bool__(self) -> bool:
73 |         return bool(self.processors) and (
74 |             self.ignore_memory or bool(self.memory)
75 |         )
76 | 
77 |     def __repr__(self):
78 |         return f'Resource({self.processors}, {self.memory})'
79 | 
80 |     def __str__(self):
81 |         return f'Resource({self.processors}, {self.memory})'
--------------------------------------------------------------------------------
/.github/workflows/codeql-analysis.yml:
--------------------------------------------------------------------------------
1 | # For most projects, this workflow file will not need changing; you simply need
2 | # to commit it to your repository.
3 | #
4 | # You may wish to alter this file to override the set of languages analyzed,
5 | # or to provide custom queries or build logic.
6 | #
7 | # ******** NOTE ********
8 | # We have attempted to detect the languages in your repository. Please check
9 | # the `language` matrix defined below to confirm you have the correct set of
10 | # supported CodeQL languages.
11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ master ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ master ] 20 | schedule: 21 | - cron: '31 17 * * 2' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-latest 27 | permissions: 28 | actions: read 29 | contents: read 30 | security-events: write 31 | 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | language: [ 'python' ] 36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ] 37 | # Learn more: 38 | # https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed 39 | 40 | steps: 41 | - name: Checkout repository 42 | uses: actions/checkout@v2 43 | 44 | # Initializes the CodeQL tools for scanning. 45 | - name: Initialize CodeQL 46 | uses: github/codeql-action/init@v1 47 | with: 48 | languages: ${{ matrix.language }} 49 | # If you wish to specify custom queries, you can do so here or in a config file. 50 | # By default, queries listed here will override any specified in a config file. 51 | # Prefix the list here with "+" to use these queries and those in the config file. 52 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 53 | 54 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 55 | # If this step fails, then you should remove it and run the build manually (see below) 56 | - name: Autobuild 57 | uses: github/codeql-action/autobuild@v1 58 | 59 | # ℹ️ Command-line programs to run using the OS shell. 60 | # 📚 https://git.io/JvXDl 61 | 62 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 63 | # and modify them (or add more) to build your code if your project 64 | # uses a compiled language 65 | 66 | #- run: | 67 | # make bootstrap 68 | # make release 69 | 70 | - name: Perform CodeQL Analysis 71 | uses: github/codeql-action/analyze@v1 72 | -------------------------------------------------------------------------------- /schedgym/simulator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """simulator - Classes for simulating job submission and execution. 5 | 6 | This module comprises an abstract base class for simulation and a time-based 7 | simulator that inherits directly from `Simulator`. 8 | 9 | The time-based simulator is coupled with 10 | a :class:`schedgym.workload.WorkloadGenerator` to generate jobs at a given time 11 | step. 12 | """ 13 | 14 | import enum 15 | 16 | from abc import ABC, abstractmethod 17 | 18 | from . import workload, scheduler as sched 19 | 20 | 21 | class SimulationType(enum.Enum): 22 | """Enumeration to differentiate between simulation types""" 23 | 24 | TIME_BASED = 0 25 | EVENT_BASED = 1 26 | 27 | 28 | class Simulator(ABC): 29 | """Abstract base class for simulation. 30 | 31 | Parameters 32 | ---------- 33 | workload_generator : workload.WorkloadGenerator 34 | An object to generate load when time is stepped. 35 | scheduler : sched.Scheduler 36 | A scheduling algorithm that will schedule jobs according to a given 37 | rule. 
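
    Examples
    --------
    A minimal sketch (``my_generator`` and ``my_scheduler`` stand in for any
    concrete workload generator and scheduler built elsewhere; they are
    placeholders, not names defined by this package)::

        sim = Simulator.make(
            SimulationType.TIME_BASED, my_generator, my_scheduler
        )
        sim.step(submit=True)  # advance one step, submitting any new jobs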
38 | """ 39 | 40 | current_time: int 41 | scheduler: sched.Scheduler 42 | simulation_start_time: int 43 | 44 | def __init__( 45 | self, 46 | workload_generator: workload.WorkloadGenerator, 47 | scheduler: sched.Scheduler, 48 | ): 49 | 50 | self.current_time = 0 51 | self.scheduler = scheduler 52 | self.simulation_start_time = 0 53 | self.workload: workload.WorkloadGenerator = workload_generator 54 | 55 | @staticmethod 56 | def make( 57 | simulation_type: SimulationType, 58 | workload_generator: workload.WorkloadGenerator, 59 | scheduler: sched.Scheduler, 60 | ): 61 | """Factory method for instantiating new simulators.""" 62 | if simulation_type == SimulationType.TIME_BASED: 63 | return TimeBasedSimulator(workload_generator, scheduler) 64 | raise RuntimeError(f'Unsupported simulation type {simulation_type}') 65 | 66 | @abstractmethod 67 | def step(self, submit) -> None: 68 | """Runs a simulation step.""" 69 | 70 | 71 | class TimeBasedSimulator(Simulator): 72 | """A simulator that is based on time.""" 73 | 74 | scheduler: sched.Scheduler 75 | 76 | def __init__( 77 | self, 78 | workload_generator: workload.WorkloadGenerator, 79 | scheduler: sched.Scheduler, 80 | ): 81 | super().__init__(workload_generator, scheduler) 82 | self.current_time = 0 83 | 84 | def step(self, submit=True): 85 | self.current_time += 1 86 | self.scheduler.step() 87 | jobs = self.workload.step() 88 | if submit and jobs: 89 | for job in jobs: 90 | if job is not None: 91 | self.scheduler.submit(job) 92 | -------------------------------------------------------------------------------- /schedgym/scheduler/easy_scheduler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """easy_scheduler - A scheduler that uses easy backfilling. 5 | """ 6 | 7 | from typing import List, Tuple, Optional 8 | 9 | from schedgym.job import Job, JobStatus 10 | from schedgym.scheduler import Scheduler 11 | from schedgym.event import JobEvent 12 | 13 | 14 | class EasyScheduler(Scheduler): 15 | """EASY backfilling scheduler. 16 | 17 | This is a backfilling scheduling that uses the EASY strategy. Upon 18 | encountering a single job that cannot be scheduled, it makes a reservation 19 | for that job on which would be the first time it should start on. 20 | 21 | Smaller jobs than the one currenly with a reservation may start, provided 22 | they do not delay the one with a reservation. 
23 | """ 24 | 25 | reservation: Optional[Tuple[JobEvent, JobEvent]] 26 | 27 | def __init__(self, *args, **kwargs): 28 | super().__init__(*args, **kwargs) 29 | self.reservation = None 30 | 31 | def _handle_reservation(self) -> None: 32 | if not self.reservation: 33 | return 34 | 35 | start, finish = self.reservation 36 | if ( 37 | start.time == self.current_time 38 | or start.job.status != JobStatus.WAITING 39 | ): 40 | # Reservation will be fulfilled 41 | self.reservation = None 42 | return 43 | 44 | resources = self.can_schedule_now(start.job) 45 | if resources: 46 | self.queue_waiting.remove(start.job) 47 | 48 | self.job_events.remove(start) 49 | self.job_events.remove(finish) 50 | 51 | self.assign_schedule(start.job, resources, self.current_time) 52 | self.reservation = None 53 | 54 | def schedule(self) -> None: 55 | ignored_jobs: List[Job] = [] 56 | 57 | self._handle_reservation() 58 | for job in self.queue_admission: 59 | resources = self.can_schedule_now(job) 60 | if resources: 61 | self.assign_schedule(job, resources, self.current_time) 62 | else: 63 | if not self.reservation: 64 | # This is the first job without a reservation. 65 | # We're doing EASY backfilling, so we create a 66 | # reservation for this one job and keep going 67 | time, resources = self.find_first_time_for(job) 68 | if not resources: 69 | raise AssertionError('Something is terribly wrong') 70 | self.reservation = self.assign_schedule( 71 | job, resources, time 72 | ) 73 | else: 74 | # We already have a reservation, so we skip this job 75 | ignored_jobs.append(job) 76 | self.queue_admission = ignored_jobs 77 | -------------------------------------------------------------------------------- /schedgym/heap.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """heap - A Priority Queue based on the `heapq` module.""" 5 | 6 | import heapq 7 | import itertools 8 | from typing import Generic, TypeVar, List, Dict, Generator, Optional, Iterator 9 | from typing import Tuple, cast 10 | 11 | T = TypeVar('T') 12 | ENTRY_T = Tuple[int, int, List[Optional[T]]] 13 | 14 | 15 | class Heap(Generic[T]): 16 | """A Priority Queue that is backed by a heap data structure. 17 | 18 | To reduce the computational cost of key removal, this class wastes a bit 19 | memory by *not* actually deleting items. 20 | """ 21 | 22 | entry_finder: Dict[Optional[T], ENTRY_T] 23 | 'Cache to check in O(1) whether an entry exists in the heap.' 24 | priority_queue: List[ENTRY_T] 25 | 'The actual priority queue, implemented as a list with heap ordering.' 26 | 27 | def __init__(self): 28 | """Initializes the heap. 29 | 30 | """ 31 | self.priority_queue = [] 32 | self.entry_finder = {} 33 | self.counter = itertools.count() 34 | 35 | def add(self, item, priority=0) -> None: 36 | """Add a new item or update the priority of an existing item""" 37 | if item in self.entry_finder: 38 | self.remove(item) 39 | count = next(self.counter) 40 | entry = (priority, count, [item]) 41 | self.entry_finder[item] = entry 42 | heapq.heappush(self.priority_queue, entry) 43 | 44 | def remove(self, item) -> None: 45 | """Mark an existing item as removed. Raise KeyError if not found.""" 46 | entry = self.entry_finder.pop(item) 47 | entry[-1][0] = None 48 | 49 | def pop(self) -> T: 50 | """Remove and return the lowest priority task. 
51 | 52 | Raises KeyError if empty.""" 53 | while self.priority_queue: 54 | _, _, (item,) = heapq.heappop(self.priority_queue) 55 | if item is not None: 56 | del self.entry_finder[item] # type: ignore 57 | return cast(T, item) 58 | raise KeyError('pop from an empty priority queue') 59 | 60 | def __iter__(self) -> Iterator[T]: 61 | return iter(self.heapsort()) 62 | 63 | def __contains__(self, item): 64 | return item in self.entry_finder 65 | 66 | def __len__(self): 67 | return len(self.entry_finder) 68 | 69 | @property 70 | def first(self) -> Optional[T]: 71 | """Returns the "first" item (highest priority item) in the Heap.""" 72 | if len(self.entry_finder) == 0: 73 | return None 74 | for (_, _, (item,)) in self.priority_queue: 75 | if item is not None: 76 | return cast(T, item) 77 | return None 78 | 79 | def heapsort(self) -> Generator[T, None, None]: 80 | """Generator that iterates over all elements in the heap in priority 81 | order.""" 82 | h = [e for e in self.priority_queue] 83 | while h: 84 | entry = heapq.heappop(h)[-1][0] 85 | if entry is not None: 86 | yield cast(T, entry) 87 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. image:: https://github.com/renatolfc/sched-rl-gym/workflows/sched-rl-gym/badge.svg 2 | :alt: sched-rl-gym 3 | .. image:: https://coveralls.io/repos/github/renatolfc/sched-rl-gym/badge.svg?branch=master 4 | :target: https://coveralls.io/github/renatolfc/sched-rl-gym?branch=master 5 | .. image:: https://readthedocs.org/projects/sched-rl-gym/badge/?version=latest 6 | :target: https://sched-rl-gym.readthedocs.io/en/latest/?badge=latest 7 | :alt: Documentation Status 8 | 9 | 10 | sched-rl-gym: Gym environment for HPC job scheduling problems 11 | ============================================================= 12 | 13 | .. inclusion-marker-do-not-remove 14 | 15 | ``sched-rl-gym`` is an `OpenAI Gym `__ 16 | environment for job scheduling problems. Currently, it implements `the 17 | Markov Decision 18 | Process `__ 19 | defined by 20 | `DeepRM `__. 21 | 22 | You can `use it as any other OpenAI Gym 23 | environment `__, provided the module is 24 | registered. Lucky for you, it supports auto registration upon first 25 | import. 26 | 27 | Therefore, you can get started by importing the environment with 28 | ``import schedgym.envs as schedgym``. 29 | 30 | As a parallel with the CartPole example in the Gym documentation, the 31 | following code will implement a random agent: 32 | 33 | .. code:: python 34 | 35 | import gym 36 | import schedgym.envs as schedgym 37 | 38 | env = gym.make('DeepRM-v0', use_raw_state=True) 39 | env.reset() 40 | 41 | for _ in range(200): 42 | env.render() 43 | observation, reward, done, info = env.step(env.action_space.sample()) 44 | env.close() 45 | 46 | With the following rendering: 47 | 48 | .. figure:: ./docs/img/gym.gif 49 | :alt: OpenAI Gym Environment rendering 50 | 51 | OpenAI Gym Environment rendering 52 | 53 | Features 54 | -------- 55 | 56 | - OpenAI Gym environment 57 | - Human rendering 58 | - Configurable environment 59 | 60 | Installation 61 | ------------ 62 | 63 | The easiest/quickest way to install sched-rl-gym is to use ``pip`` with 64 | the command: 65 | 66 | :: 67 | 68 | pip install -e git+https://github.com/renatolfc/sched-rl-gym.git#egg=sched-rl-gym 69 | 70 | We do recommend you use a `virtual 71 | environment `__, to not 72 | pollute your python installation with custom packages. 
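
For example, one way to create and activate such an environment (any of
the usual virtualenv workflows works just as well)::

    python -m venv venv
    source venv/bin/activate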
73 | 74 | If you want to be able to edit the code, then your best bet is to clone 75 | this repository with 76 | 77 | :: 78 | 79 | git clone https://github.com/renatolfc/sched-rl-gym.git 80 | 81 | In this case, you will need to install the dependencies manually. 82 | 83 | Dependencies 84 | ~~~~~~~~~~~~ 85 | 86 | The dependencies are documented in the ``requirements.txt`` file. You 87 | can install them with 88 | 89 | :: 90 | 91 | pip install -r requirements.txt 92 | 93 | Contribute 94 | ---------- 95 | 96 | - Issue tracker: https://github.com/renatolfc/sched-rl-gym/issues 97 | - Source code: https://github.com/renatolfc/sched-rl-gym 98 | 99 | Support 100 | ------- 101 | 102 | If you’re having issues, please let us know. The easiest way is to `open 103 | an issue on 104 | github `__. 105 | 106 | License 107 | ------- 108 | 109 | The project is licensed under the MIT license. 110 | -------------------------------------------------------------------------------- /schedgym/workload/swf_parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | swf_parser - Parser for the Standard Workload Format (SWF) 6 | 7 | A full description of the format, with meanings for each field is available on 8 | the web at http://www.cs.huji.ac.il/labs/parallel/workload/swf.html. 9 | """ 10 | 11 | from enum import IntEnum 12 | 13 | import logging 14 | 15 | from ..job import Job, SwfJobStatus 16 | 17 | logger = logging.getLogger(__name__) # pylint: disable=C 18 | 19 | 20 | class SwfFields(IntEnum): 21 | """Fields of the Standard Workload Format.""" 22 | 23 | JOB_ID = 0 24 | SUBMITTED = 1 25 | WAIT_TIME = 2 26 | EXEC_TIME = 3 27 | ALLOC_PROCS = 4 28 | AVG_CPU_USAGE = 5 29 | USED_MEM = 6 30 | REQ_PROCS = 7 31 | REQ_TIME = 8 32 | REQ_MEM = 9 33 | STATUS = 10 34 | USER_ID = 11 35 | GROUP_ID = 12 36 | EXECUTABLE = 13 37 | QUEUE_NUM = 14 38 | PART_NUM = 15 39 | PRECEDING_JOB = 16 40 | THINK_TIME = 17 41 | 42 | 43 | CONVERTERS = { 44 | key: int if key != SwfFields.AVG_CPU_USAGE else float for key in SwfFields 45 | } 46 | 47 | 48 | def parse(filename, processors, memory, ignore_memory=False): 49 | """Parser for SWF job files. 50 | 51 | The SWF is a simple format with commented lines starting with the ';' 52 | character and other lines separated by spaces. 53 | 54 | Parsing, therefore, involves splitting the lines and associating each 55 | column of the file with a field. 
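
    As a made-up illustration (the values are invented for this example), a
    record such as ``1 0 10 120 4 95.0 2048 4 150 2048 1 7 3 1 1 1 -1 -1``
    maps its first column to ``JOB_ID``, its second to ``SUBMITTED``, its
    third to ``WAIT_TIME``, and so on, following the order of ``SwfFields``.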
56 | """ 57 | 58 | with open(filename, 'r') as fp: # pylint: disable=C 59 | for line in fp: 60 | if ';' in line: 61 | continue 62 | fields = line.strip().split() 63 | fields = [ # Converts all fields according to our rules 64 | CONVERTERS[SwfFields(i)](f) for i, f in enumerate(fields) 65 | ] 66 | 67 | job = Job( 68 | fields[SwfFields.JOB_ID], 69 | fields[SwfFields.SUBMITTED], 70 | fields[SwfFields.EXEC_TIME], 71 | fields[SwfFields.ALLOC_PROCS], 72 | fields[SwfFields.AVG_CPU_USAGE], 73 | fields[SwfFields.USED_MEM], 74 | fields[SwfFields.REQ_PROCS], 75 | fields[SwfFields.REQ_TIME], 76 | fields[SwfFields.REQ_MEM], 77 | SwfJobStatus(fields[SwfFields.STATUS]), 78 | fields[SwfFields.USER_ID], 79 | fields[SwfFields.GROUP_ID], 80 | fields[SwfFields.EXECUTABLE], 81 | fields[SwfFields.QUEUE_NUM], 82 | fields[SwfFields.PART_NUM], 83 | fields[SwfFields.PRECEDING_JOB], 84 | fields[SwfFields.THINK_TIME], 85 | fields[SwfFields.WAIT_TIME], 86 | ) 87 | 88 | if job.requested_memory < 0 < job.memory_use: 89 | job.requested_memory = job.memory_use 90 | 91 | if job.requested_processors < 0 < job.processors_allocated: 92 | job.requested_processors = job.processors_allocated 93 | 94 | if job.requested_memory < 0 and ignore_memory: 95 | job.requested_memory = 0 96 | 97 | if ( 98 | job.requested_processors < 1 99 | or (job.requested_memory < 1 and not ignore_memory) 100 | or job.execution_time < 1 101 | or job.submission_time < 0 102 | ): 103 | logger.warning(f'Ignoring malformed job {job.id}') 104 | continue 105 | 106 | if job.requested_time < job.execution_time: 107 | job.requested_time = job.execution_time 108 | 109 | if job.requested_processors > processors: 110 | job.requested_processors = processors 111 | 112 | if job.requested_memory > memory: 113 | job.requested_memory = memory 114 | 115 | yield job 116 | -------------------------------------------------------------------------------- /schedgym/envs/render.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import matplotlib 5 | import numpy as np 6 | 7 | import pyglet 8 | from matplotlib import pylab as plt 9 | import matplotlib.backends.backend_agg as agg 10 | 11 | DPI = 96 12 | WIDTH = 800 13 | HEIGHT = 600 14 | RESOLUTION = (WIDTH, HEIGHT) 15 | 16 | SUPPORTED_MODES = { 17 | 'human': lambda: DeepRmHumanRenderer, 18 | 'rgb_array': lambda: DeepRmRgbRenderer, 19 | } 20 | 21 | 22 | class DeepRmRgbRenderer(object): 23 | def __init__(self, resolution=RESOLUTION, dpi=DPI): 24 | self.resolution = resolution 25 | self.dpi = DPI 26 | 27 | @staticmethod 28 | def plot_substate(ax, title, state, colorbar=False): 29 | cmap = matplotlib.cm.get_cmap('rainbow') 30 | cmap.set_under('w') 31 | im = ax.imshow(state, cmap=cmap, vmin=0.001, vmax=1) 32 | if colorbar: 33 | ax.figure.colorbar(im, ax=ax) 34 | ax.set_title(title) 35 | ax.set_xlabel('Slots') 36 | ax.set_ylabel('Time horizon (timesteps)') 37 | ax.set_xticks([]) 38 | ax.set_yticks([]) 39 | ax.grid() 40 | 41 | def render(self, state): 42 | width = self.resolution[0] / self.dpi 43 | height = self.resolution[1] / self.dpi 44 | fig = plt.figure(0, figsize=(width, height), dpi=self.dpi) 45 | 46 | current, wait, backlog, time = state 47 | lines = current.shape[0] 48 | 49 | # Axes {{{ 50 | axs_current = [ 51 | plt.subplot2grid((lines, 3), (i, 0)) for i in range(lines) 52 | ] 53 | axs_wait = [plt.subplot2grid((lines, 3), (i, 1)) for i in range(lines)] 54 | ax_backlog = plt.subplot2grid((lines, 3), (0, 2), rowspan=lines) 55 | # End 
56 | 
57 |         for i, (ax_current, ax_wait) in enumerate(zip(axs_current, axs_wait)):
58 |             self.plot_substate(
59 |                 ax_current, f'Current resources {i}', current[i]
60 |             )
61 |             self.plot_substate(
62 |                 ax_wait, f'Waiting jobs stack {i}', np.mean(wait[i], axis=0)
63 |             )
64 |         self.plot_substate(ax_backlog, 'Backlog', backlog, True)
65 | 
66 |         fig.tight_layout()
67 |         canvas = agg.FigureCanvasAgg(fig)
68 |         canvas.draw()
69 |         renderer = canvas.get_renderer()
70 |         raw_data = renderer.tostring_rgb()
71 |         size = canvas.get_width_height()
72 |         plt.close(fig)
73 | 
74 |         return np.frombuffer(raw_data, dtype=np.uint8).reshape(
75 |             (size[1], size[0], 3)  # tostring_rgb is row-major: height first
76 |         )
77 | 
78 | 
79 | class DeepRmHumanRenderer(DeepRmRgbRenderer, pyglet.window.Window):
80 |     def __init__(self, resolution=RESOLUTION, dpi=DPI):
81 |         super().__init__(resolution, dpi)
82 | 
83 |         self.rendering = None
84 |         width, height = resolution
85 |         self.window = pyglet.window.Window(width, height, visible=False)
86 |         self.window.set_caption('Scheduler State')
87 |         self.window.set_visible()
88 |         self.window.on_draw = self.on_draw
89 | 
90 |     def on_draw(self):
91 |         self.window.clear()
92 |         if self.rendering is not None:
93 |             height, width, _ = self.rendering.shape
94 |             img = pyglet.image.ImageData(  # ImageData takes (width, height); a negative pitch flips the rows
95 |                 width,
96 |                 height,
97 |                 'RGB',
98 |                 self.rendering.data.tobytes(),
99 |                 -3 * width,
100 |             )
101 | 
102 |             img.blit(0, 0)
103 | 
104 |     def render(self, state):
105 |         self.rendering = super().render(state)
106 | 
107 |         pyglet.clock.tick()
108 |         self.window.switch_to()
109 |         self.window.dispatch_events()
110 |         self.window.dispatch_event('on_draw')
111 |         self.window.flip()
112 | 
113 |         return self.rendering
114 | 
115 | 
116 | class DeepRmRenderer(object):
117 |     def __init__(self, mode, *args, **kwargs):
118 |         if mode not in SUPPORTED_MODES:
119 |             raise RuntimeError('Requested unsupported mode %s' % mode)
120 |         self.renderer = SUPPORTED_MODES[mode]()(*args, **kwargs)
121 | 
122 |     def render(self, state):
123 |         return self.renderer.render(state)
--------------------------------------------------------------------------------
/schedgym/workload/distribution.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | """distribution - Generative models for workload generation"""
5 | 
6 | import math
7 | import random
8 | import itertools
9 | from abc import ABC, abstractmethod
10 | from typing import List, Optional
11 | 
12 | from schedgym.job import Job, JobParameters
13 | from schedgym.workload.base import WorkloadGenerator
14 | 
15 | 
16 | class DistributionalWorkloadGenerator(WorkloadGenerator, ABC):
17 |     """An abstract class for workload generation based on distributions.
18 | 
19 |     Parameters
20 |     ----------
21 |     length : int
22 |         An optional length of workload generation. When length samples
23 |         are generated, automatic iteration will stop.
24 |     """
25 | 
26 |     length: int
27 |     current_element: int
28 | 
29 |     def __init__(self, length=0):
30 |         self.length = length
31 |         self.current_element = 0
32 | 
33 |     @abstractmethod
34 |     def step(self, offset=1) -> List[Optional[Job]]:
35 |         """Steps the workload generator by :param offset:.
36 | 
37 |         This may, or may not, return new jobs, depending on the internal
38 |         probability distributions of the workload generator.
39 | 
40 |         Parameters
41 |         ----------
42 |         offset : int
43 |             The number of time steps to advance the workload generator.
44 | """ 45 | 46 | 47 | class BinomialWorkloadGenerator(DistributionalWorkloadGenerator): 48 | """A workload generator that is based on a Bernoulli distribution. 49 | 50 | Parameters 51 | ---------- 52 | new_job_rate : float 53 | The probability of generating a new job 54 | small_job_chance : float 55 | The probability a sampled job will be "small" 56 | small_job_parameters : JobParameters 57 | The characteristics of "small" jobs 58 | large_job_parameters : JobParameters 59 | The characteristics of "large" jobs 60 | length : int 61 | The size of the sequence of jobs generated when iterating over this 62 | workload generator 63 | """ 64 | 65 | new_job_rate: float 66 | small_job_chance: float 67 | large_job: JobParameters 68 | small_job: JobParameters 69 | 70 | def __init__( 71 | self, 72 | new_job_rate, 73 | small_job_chance, 74 | small_job_parameters, 75 | large_job_parameters, 76 | length=0, 77 | runtime_estimates=None, 78 | estimate_parameters=None, 79 | ): 80 | super().__init__(length) 81 | 82 | self.current_time = 0 83 | self.counter = itertools.count(1) 84 | self.new_job_rate = new_job_rate 85 | self.small_job_chance = small_job_chance 86 | self.small_job = small_job_parameters 87 | self.large_job = large_job_parameters 88 | 89 | if runtime_estimates is not None and runtime_estimates not in [ 90 | 'gaussian', 91 | 'gaussian-over', 92 | 'gaussian-under', 93 | ]: 94 | raise ValueError(f'Unsupported estimate type {runtime_estimates}') 95 | 96 | self.runtime_estimates = runtime_estimates 97 | self.estimate_parameters = estimate_parameters 98 | 99 | def step(self, offset=1) -> List[Optional[Job]]: 100 | self.current_time += offset 101 | if random.random() > self.new_job_rate: 102 | return [] 103 | if random.random() < self.small_job_chance: 104 | j = self.small_job.sample(self.current_time) 105 | else: 106 | j = self.large_job.sample(self.current_time) 107 | if self.runtime_estimates and self.runtime_estimates.startswith( 108 | 'gaussian' 109 | ): 110 | if self.estimate_parameters is None: 111 | raise RuntimeError( 112 | "Can't sample runtime estimates with undefined parameters" 113 | ) 114 | diff = random.gauss(0, self.estimate_parameters * j.execution_time) 115 | if 'over' in self.runtime_estimates: 116 | diff = abs(diff) 117 | elif 'under' in self.runtime_estimates: 118 | diff = -abs(diff) 119 | j.requested_time = max(math.ceil(j.execution_time + diff), 1) 120 | j.id = next(self.counter) 121 | return [j] 122 | 123 | def __len__(self): 124 | return self.length 125 | 126 | def peek(self): 127 | return self.step(0) 128 | -------------------------------------------------------------------------------- /schedgym/workload/trace.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """trace - A trace-based workload generator 5 | 6 | Inherits from the base WorkloadGenerator and uses the swf_parser to parse SWF 7 | files. 
8 | """ 9 | 10 | from itertools import takewhile 11 | from typing import Iterator, Optional, Sequence, Callable 12 | 13 | from ..job import Job 14 | from .base import WorkloadGenerator 15 | from .swf_parser import parse as parse_swf 16 | 17 | 18 | class TraceGenerator(WorkloadGenerator): 19 | restart: bool 20 | trace: Sequence[Job] 21 | refresh_jobs: Optional[Callable] = None 22 | 23 | def __init__(self, restart=False, trace=None): 24 | self.current_time = 0 25 | self.restart = restart 26 | self.current_element = 0 27 | 28 | if trace is not None: 29 | self.trace = trace 30 | else: 31 | self.trace = [] 32 | 33 | def step(self, offset=1): 34 | """ "Samples" jobs from the trace file. 35 | 36 | Parameters 37 | ---------- 38 | offset : int 39 | The amount to offset the current time step 40 | """ 41 | if offset < 0: 42 | raise ValueError('Submission time must be positive') 43 | if self.current_element >= len(self.trace): 44 | if self.restart: 45 | self.current_element = 0 46 | for job in self.trace: 47 | job.submission_time += self.current_time 48 | if self.refresh_jobs is not None: 49 | self.refresh_jobs() 50 | else: 51 | raise StopIteration('Workload finished') 52 | submission_time = self.current_time + offset 53 | jobs = takewhile( 54 | lambda j: j[1].submission_time <= submission_time, 55 | enumerate( 56 | self.trace[self.current_element:], self.current_element 57 | ), 58 | ) 59 | self.current_time = submission_time 60 | jobs = list(jobs) 61 | if jobs: 62 | self.current_element = jobs[-1][0] + 1 63 | return [j for (i, j) in jobs] 64 | return [] 65 | 66 | @property 67 | def last_event_time(self): 68 | """The submission time of the last generated job""" 69 | offset = ( 70 | self.current_element 71 | if self.current_element < len(self.trace) 72 | else -1 73 | ) 74 | return self.trace[offset].submission_time 75 | 76 | def __len__(self): 77 | return len(self.trace) 78 | 79 | def __next__(self) -> Job: 80 | if self.current_element >= len(self.trace): 81 | if self.restart: 82 | self.current_element = 0 83 | if self.refresh_jobs is not None: 84 | self.refresh_jobs() 85 | else: 86 | raise StopIteration() 87 | job = self.trace[self.current_element] 88 | self.current_element += 1 89 | return job 90 | 91 | def __iter__(self) -> Iterator[Optional[Job]]: 92 | return iter(self.trace) 93 | 94 | def peek(self) -> Optional[Job]: 95 | job = next(self) 96 | if self.current_element > 0: 97 | self.current_element -= 1 98 | return job 99 | 100 | 101 | class SwfGenerator(TraceGenerator): 102 | """A trace-based (workload log) generator. 103 | 104 | Supports starting the parsing after an offset, and also supports reading a 105 | pre-specified number of jobs. 106 | 107 | Parameters 108 | ---------- 109 | tracefile : str 110 | The path to the filed to be parsed and used as input for workload 111 | generation. 
112 |     processors : int
113 |         The number of processors in this trace
114 |     memory : int
115 |         The amount of memory in this trace
116 |     restart : bool
117 |         Whether to restart from the beginning of the file when we reach
118 |         its end (or, in the case we're using an offset and a length, to
119 |         restart from the offset up to the length)
120 |     ignore_memory : bool
121 |         Whether to ignore (or not) memory usage
122 |     """
123 | 
124 |     tracefile: str
125 |     ignore_memory: bool
126 | 
127 |     def __init__(
128 |         self,
129 |         tracefile,
130 |         processors,
131 |         memory,
132 |         offset=0,
133 |         length=None,
134 |         restart=False,
135 |         ignore_memory=False,
136 |     ):
137 | 
138 |         super().__init__(
139 |             restart,
140 |             list(parse_swf(tracefile, processors, memory, ignore_memory)),
141 |         )
142 |         self.tracefile = tracefile
143 | 
144 |         if length is None:
145 |             length = len(self.trace)
146 |         else:
147 |             length = min(length, len(self.trace))
148 | 
149 |         self.trace = self.trace[offset:offset + length]
150 | 
151 |         self.current_element = 0
--------------------------------------------------------------------------------
/schedgym/scheduler/null_scheduler.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | """null_scheduler - a module that doesn't do any scheduling
5 | 
6 | The purpose of this module is to provide a way for clients of the simulator to
7 | implement different scheduling strategies.
8 | 
9 | Most notably, this can be used by learning agents to select which jobs to
10 | schedule in an iterative way.
11 | """
12 | 
13 | from typing import Optional
14 | 
15 | from ..job import Job
16 | from ..scheduler import Scheduler
17 | 
18 | # The main issue here is that we have two kinds of steps:
19 | # 1. OpenAI Gym steps
20 | # 2. Scheduler steps
21 | # For OpenAI Gym steps, we need to pass an action. For Scheduler steps, we need
22 | # to pass an offset.
23 | 
24 | 
25 | class NullScheduler(Scheduler):
26 |     """A scheduler that receives scheduling commands from a client.
27 | 
28 |     This is a null scheduler in the sense that scheduling decisions aren't made
29 |     by this class, but by another class, which forwards its decisions to this
30 |     one so that they can be propagated into the simulator. As such, this
31 |     implements the interface between RL environments (such as OpenAI gym)
32 |     and the scheduler simulator.
33 | 
34 |     Parameters
35 |     ----------
36 |     number_of_processors : int
37 |         The number of processors managed by this scheduler
38 |     total_memory : int
39 |         The total amount of memory in the cluster managed by this scheduler
40 |     """
41 | 
42 |     current_slot: Optional[int]
43 | 
44 |     def __init__(
45 |         self, number_of_processors, total_memory, ignore_memory=False
46 |     ):
47 |         self.current_slot: Optional[int] = None
48 |         super().__init__(
49 |             number_of_processors, total_memory, ignore_memory=ignore_memory
50 |         )
51 | 
52 |     def step(self, offset: Optional[int] = None) -> bool:
53 |         """Steps the scheduler by setting which job to choose.
54 | 
55 |         Uses the offset to select a position in the admission queue. If the
56 |         agent selects a job that doesn't fit the cluster, such a selection is
57 |         ignored by the scheduler.
58 | 
59 |         Differently from its base class, this method **does not** forward time.
60 |         For this, please see :func:`forward_time`.
61 | 
62 |         Parameters
63 |         ----------
64 |         offset : int
65 |             The offset in the admission queue of the job to select. Any
66 |             negative number represents a no-op.
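
        Example
        -------
        A minimal sketch of the agent loop this scheduler enables; the
        cluster sizes are made up and ``agent`` stands for a hypothetical
        decision maker::

            scheduler = NullScheduler(10, 10)
            for _ in range(100):
                action = agent.act()  # hypothetical policy
                if not scheduler.step(action):
                    scheduler.forward_time()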
67 | """ 68 | if self.current_slot is not None: 69 | raise AssertionError('current_slot invariant not true') 70 | 71 | self.current_slot = offset if offset is not None else -1 72 | return self.schedule() 73 | 74 | def forward_time(self): 75 | """Forwards time by one time step. 76 | 77 | For details, see :func:`step`. 78 | """ 79 | 80 | present = self.job_events.step(1) 81 | self.cluster = self.play_events( 82 | present, self.cluster, update_queues=True 83 | ) 84 | self.current_time += 1 85 | self.schedule() 86 | 87 | @property 88 | def action_space(self): 89 | """Helper that gives the number of actions available for the agent.""" 90 | # We always support the null action 91 | return len(self.queue_admission) + 1 92 | 93 | def schedule(self) -> bool: 94 | """Tries to schedule the job selected with :func:`step`. 95 | 96 | When :func:`step` is called, it stores the job currently selected by 97 | the client. This function will check in the queue which job the 98 | selection corresponds to and will check if the job fits in the cluster 99 | *right now*. If it does, the job is scheduled, otherwise, it is 100 | ignored. 101 | In either case, the current selection is cleared. 102 | 103 | Returns: 104 | bool: True if the selected job was scheduled. False otherwise. 105 | """ 106 | try: 107 | if ( 108 | self.current_slot is not None 109 | and len(self.queue_admission) > 0 110 | and 0 <= self.current_slot < len(self.queue_admission) 111 | ): 112 | job: Job = self.queue_admission[self.current_slot] 113 | if not self.cluster.fits(job): 114 | return False 115 | resources = self.can_schedule_now(job) 116 | if resources: 117 | self.assign_schedule(job, resources, self.current_time) 118 | self.queue_admission.pop(self.current_slot) 119 | return True 120 | return False 121 | return False 122 | finally: 123 | self.current_slot = None 124 | 125 | def sjf_lt( 126 | self, a: Job, b: Optional[Job] 127 | ): # pylint: disable=C, no-self-use 128 | """Comparison function that gives the same ordering SJF would give. 129 | 130 | Parameters 131 | ---------- 132 | a: Job 133 | A first job 134 | b: Job 135 | A second job 136 | 137 | Returns: 138 | bool: True if `a` is shorter than `b`. False otherwise. 139 | """ 140 | return b is None or (a.requested_time < b.requested_time) 141 | 142 | def sjf_action(self, limit: int) -> int: 143 | """Returns the index of the job SJF would pick. 144 | 145 | Parameters 146 | ---------- 147 | limit : int 148 | How far in the admission queue to look when searching for the 149 | shortest job. 150 | """ 151 | 152 | best = None 153 | bestidx = limit 154 | limits = slice(0, limit if limit >= 0 else None) 155 | for i, job in enumerate(self.queue_admission[limits]): 156 | if self.sjf_lt(job, best): 157 | if self.cluster.fits(job): 158 | best = job 159 | bestidx = i 160 | return bestidx 161 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """A setuptools based setup module. 
5 | See: 6 | https://packaging.python.org/guides/distributing-packages-using-setuptools/ 7 | https://github.com/pypa/sampleproject 8 | """ 9 | 10 | # Always prefer setuptools over distutils 11 | from setuptools import setup, find_packages, Extension 12 | import pathlib 13 | import schedgym # noqa 14 | try: 15 | from Cython.Build import cythonize 16 | except (NameError, ModuleNotFoundError): 17 | def cythonize(*args, **kwargs): 18 | pass 19 | 20 | here = pathlib.Path(__file__).parent.resolve() 21 | 22 | # Get the long description from the README file 23 | long_description = (here / 'README.rst').read_text(encoding='utf-8') 24 | 25 | extras = { 26 | 'render': [ 27 | 'matplotlib', 28 | 'pyglet', 29 | ], 30 | 'test': [ 31 | 'pytest', 32 | 'coverage', 33 | ], 34 | 'docs': [ 35 | 'Sphinx', 36 | 'docutils', 37 | 'nbsphinx', 38 | ] 39 | } 40 | 41 | extras['all'] = [item for group in extras.values() for item in group] 42 | 43 | # Arguments marked as "Required" below must be included for upload to PyPI. 44 | # Fields marked as "Optional" may be commented out. 45 | 46 | setup( 47 | name='sched-rl-gym', 48 | description='OpenAI Gym environment for HPC job scheduling', 49 | long_description=long_description, 50 | long_description_content_type='text/x-rst', 51 | url='https://github.com/renatolfc/sched-rl-gym', 52 | author='Renato L. de F. Cunha', 53 | author_email='renatocunha@acm.org', 54 | 55 | classifiers=[ 56 | 'Development Status :: 3 - Alpha', 57 | 58 | # Indicate who your project is intended for 59 | 'Intended Audience :: Developers', 60 | 'Intended Audience :: Information Technology', 61 | 62 | 'Topic :: Scientific/Engineering', 63 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 64 | 65 | # Pick your license as you wish 66 | 'License :: OSI Approved :: MIT License', 67 | 68 | # Specify the Python versions you support here. In particular, ensure 69 | # that you indicate you support Python 3. These classifiers are *not* 70 | # checked by 'pip install'. See instead 'python_requires' below. 71 | 'Programming Language :: Python :: 3', 72 | 'Programming Language :: Python :: 3.6', 73 | 'Programming Language :: Python :: 3.7', 74 | 'Programming Language :: Python :: 3.8', 75 | 'Programming Language :: Python :: 3.9', 76 | 'Programming Language :: Python :: 3 :: Only', 77 | ], 78 | 79 | keywords='gym, reinforcement learning, artificial intelligence', 80 | 81 | package_dir={'schedgym': 'schedgym'}, 82 | packages=find_packages(), 83 | python_requires='>=3.6, <4', 84 | 85 | # This field lists other packages that your project depends on to run. 86 | # Any package you put here will be installed by pip when your project is 87 | # installed, so they must be valid existing projects. 
88 | # 89 | # For an analysis of "install_requires" vs pip's requirements files see: 90 | # https://packaging.python.org/en/latest/requirements.html 91 | install_requires=[ 92 | 'gym', 93 | 'numpy', 94 | 'cython', 95 | 'intervaltree>=3.0', 96 | 'parallelworkloads', 97 | ], 98 | 99 | extras_require=extras, 100 | 101 | ext_modules=cythonize([ 102 | Extension('schedgym.job', ['schedgym/job.py']), 103 | Extension('schedgym.pool', ['schedgym/pool.py']), 104 | Extension('schedgym.simulator', ['schedgym/simulator.py']), 105 | Extension('schedgym.resource', ['schedgym/resource.py']), 106 | Extension('schedgym.cluster', ['schedgym/cluster.py']), 107 | Extension('schedgym.envs.workload', ['schedgym/envs/workload.py']), 108 | Extension('schedgym.envs.simulator', ['schedgym/envs/simulator.py']), 109 | Extension('schedgym.envs.compact_env', ['schedgym/envs/compact_env.py']), 110 | Extension('schedgym.envs.base', ['schedgym/envs/base.py']), 111 | Extension('schedgym.envs.deeprm_env', ['schedgym/envs/deeprm_env.py']), 112 | Extension('schedgym.scheduler.backfilling_scheduler', ['schedgym/scheduler/backfilling_scheduler.py']), 113 | Extension('schedgym.scheduler.null_scheduler', ['schedgym/scheduler/null_scheduler.py']), 114 | Extension('schedgym.scheduler.easy_scheduler', ['schedgym/scheduler/easy_scheduler.py']), 115 | Extension('schedgym.scheduler.fifo_scheduler', ['schedgym/scheduler/fifo_scheduler.py']), 116 | Extension('schedgym.scheduler.packer_scheduler', ['schedgym/scheduler/packer_scheduler.py']), 117 | Extension('schedgym.scheduler.random_scheduler', ['schedgym/scheduler/random_scheduler.py']), 118 | Extension('schedgym.scheduler.sjf_scheduler', ['schedgym/scheduler/sjf_scheduler.py']), 119 | Extension('schedgym.scheduler.tetris_scheduler', ['schedgym/scheduler/tetris_scheduler.py']), 120 | Extension('schedgym.workload.base', ['schedgym/workload/base.py']), 121 | Extension('schedgym.workload.trace', ['schedgym/workload/trace.py']), 122 | Extension('schedgym.workload.distribution', ['schedgym/workload/distribution.py']), 123 | Extension('schedgym.workload.swf_parser', ['schedgym/workload/swf_parser.py']), 124 | ], language_level=3), 125 | 126 | # List additional URLs that are relevant to your project as a dict. 127 | # 128 | # This field corresponds to the "Project-URL" metadata fields: 129 | # https://packaging.python.org/specifications/core-metadata/#project-url-multiple-use 130 | # 131 | # Examples listed include a pattern for specifying where the package tracks 132 | # issues, where the source is hosted, where to say thanks to the package 133 | # maintainers, and where to support the project financially. The key is 134 | # what's used to render the link text on PyPI. 
135 |     project_urls={  # Optional
136 |         'Bug Reports': 'https://github.com/renatolfc/sched-rl-gym/issues',
137 |         'Say Thanks!': 'https://saythanks.io/to/renatolfc',
138 |         'Source': 'https://github.com/renatolfc/sched-rl-gym',
139 |     },
140 | )
--------------------------------------------------------------------------------
/schedgym/envs/deeprm_env.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | from typing import Union
5 | 
6 | import numpy as np
7 | 
8 | import gym.spaces.box
9 | import gym.spaces.discrete
10 | import gym.spaces.tuple
11 | 
12 | from ..job import Job
13 | from .base import BaseRmEnv
14 | from .simulator import DeepRmSimulator
15 | from .workload import DeepRmWorkloadGenerator
16 | 
17 | import logging
18 | 
19 | logger = logging.getLogger(__name__)
20 | 
21 | MAXIMUM_QUEUE_SIZE = 16
22 | 
23 | RESOURCE_SLOTS = 10
24 | 
25 | NUMBER_OF_RESOURCES = 2
26 | MAX_TIME_TRACKING_SINCE_LAST_JOB = 10
27 | 
28 | 
29 | class DeepRmEnv(BaseRmEnv):
30 |     n_work: int
31 |     n_resources: int
32 |     use_raw_state: bool
33 |     simulator: DeepRmSimulator
34 |     workload: DeepRmWorkloadGenerator
35 |     observation_space: Union[gym.spaces.tuple.Tuple, gym.spaces.box.Box]
36 |     action_space: gym.spaces.discrete.Discrete
37 | 
38 |     metadata = {'render.modes': ['human', 'rgb_array']}
39 | 
40 |     def __init__(self, **kwargs):
41 |         super().__init__(**kwargs)
42 | 
43 |         self.use_raw_state = kwargs.get('use_raw_state', False)
44 | 
45 |         self.n_resources = kwargs.get(
46 |             'n_resources', NUMBER_OF_RESOURCES
47 |         )  # resources in the system
48 |         self.n_work = kwargs.get(
49 |             'n_work', MAXIMUM_QUEUE_SIZE
50 |         )  # max amount of work in the queue
51 |         if self.backlog_size % self.time_horizon:
52 |             raise AssertionError('Backlog must be a multiple of time horizon')
53 | 
54 |         self.backlog_width = self.backlog_size // self.time_horizon
55 | 
56 |         self.setup_spaces()
57 | 
58 |     def setup_spaces(self):
59 |         self.action_space = gym.spaces.discrete.Discrete(self.job_slots + 1)
60 |         if self.use_raw_state:
61 |             self.setup_raw_spaces()
62 |         else:
63 |             self.setup_image_spaces()
64 | 
65 |     def setup_image_spaces(self):
66 |         self.observation_space = gym.spaces.box.Box(
67 |             low=0.0,
68 |             high=1.0,
69 |             shape=(
70 |                 self.time_horizon,
71 |                 (
72 |                     (0 if self.ignore_memory else (self.job_slots + 1))
73 |                     * self.scheduler.total_memory
74 |                 )
75 |                 + (self.job_slots + 1) * self.scheduler.number_of_processors
76 |                 + self.backlog_width
77 |                 + 1,
78 |             ),
79 |         )
80 | 
81 |     def setup_raw_spaces(self):
82 |         self.memory_space = gym.spaces.box.Box(
83 |             low=0.0,
84 |             high=1.0,
85 |             shape=(self.time_horizon, self.scheduler.total_memory),
86 |         )
87 |         self.processor_space = gym.spaces.box.Box(
88 |             low=0.0,
89 |             high=1.0,
90 |             shape=(self.time_horizon, self.scheduler.number_of_processors),
91 |         )
92 |         self.backlog_space = gym.spaces.box.Box(
93 |             low=0.0, high=1.0, shape=(self.time_horizon, self.backlog_width)
94 |         )
95 |         self.memory_slots_space = gym.spaces.box.Box(
96 |             low=0.0,
97 |             high=1.0,
98 |             shape=(
99 |                 self.job_slots,
100 |                 self.time_horizon,
101 |                 self.scheduler.total_memory,
102 |             ),
103 |         )
104 |         self.processor_slots_space = gym.spaces.box.Box(
105 |             low=0.0,
106 |             high=1.0,
107 |             shape=(
108 |                 self.job_slots,
109 |                 self.time_horizon,
110 |                 self.scheduler.number_of_processors,
111 |             ),
112 |         )
113 |         self.time_since_space = gym.spaces.discrete.Discrete(self.time_horizon)
114 | 
115 |         self.observation_space = gym.spaces.tuple.Tuple(
116 |             (
117 | 
self.processor_space, 118 | self.memory_space, 119 | self.processor_slots_space, 120 | self.memory_slots_space, 121 | self.backlog_space, 122 | self.time_since_space, 123 | ) 124 | ) 125 | self.observation_space.n = np.sum( # type: ignore 126 | [ 127 | np.prod(e.shape) if isinstance(e, gym.spaces.box.Box) else e.n 128 | for e in self.observation_space 129 | ] 130 | ) 131 | 132 | @property 133 | def state(self): 134 | state, jobs, backlog = self.scheduler.state( 135 | self.time_horizon, self.job_slots 136 | ) 137 | s = self._convert_state( 138 | state, 139 | jobs, 140 | backlog, 141 | ( 142 | (self.simulator.current_time - self.simulator.last_job_time) 143 | / MAX_TIME_TRACKING_SINCE_LAST_JOB 144 | ), 145 | ) 146 | if self.use_raw_state: 147 | return s 148 | return self.pack_observation(s) 149 | 150 | def pack_observation(self, ob): 151 | current, wait, backlog, time = ob 152 | wait = wait.reshape(self.time_horizon, -1) 153 | current = current.reshape(self.time_horizon, -1) 154 | return np.hstack((current, wait, backlog, time)) 155 | 156 | def find_slot_position(self, action): 157 | if action < len(self.scheduler.queue_admission): 158 | return action 159 | return self.action_space.n - 1 160 | 161 | def step(self, action: int): 162 | done = False 163 | found = False 164 | if 0 <= action < self.action_space.n - 1: 165 | action = self.find_slot_position(action) 166 | found = True 167 | try: 168 | intermediate = self.simulator.rl_step( 169 | action if found else None, self.reward_mapper[self.reward_jobs] 170 | ) 171 | except StopIteration: 172 | intermediate = [[Job()]] 173 | done = True 174 | 175 | reward = self.reward if any(intermediate) else 0 176 | done = bool(self.time_limit) and ( 177 | self.scheduler.current_time > self.time_limit or done 178 | ) 179 | 180 | if not done and self.smdp and any(intermediate): 181 | rewards = [self.compute_reward(js) for js in intermediate] 182 | rewards[0] = 0 183 | reward = ( 184 | self.gamma ** np.arange(len(intermediate)) 185 | ).dot(rewards) 186 | 187 | return ( 188 | self.state, 189 | reward, 190 | done, 191 | self.stats if done else {} 192 | ) 193 | -------------------------------------------------------------------------------- /schedgym/pool.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """pool - Resource Pool management (see :class:`schedgym.cluster.Cluster`).""" 5 | 6 | import copy 7 | import enum 8 | from typing import Iterable, List, Optional 9 | 10 | from intervaltree import IntervalTree, Interval 11 | 12 | 13 | class ResourceType(enum.IntEnum): 14 | """Enumeration to determine which kind of resource we're managing.""" 15 | 16 | CPU = 1 17 | MEMORY = 0 18 | 19 | 20 | class ResourcePool: 21 | """A pool of resources. 22 | 23 | This is the basic structure managed by a :class:`schedgym.cluster.Cluster`. 24 | 25 | Parameters 26 | ---------- 27 | resource_type : ResourceType 28 | The type of resource in this pool 29 | size : int 30 | The amount of resources available in this pool 31 | used_pool : IntervalTree 32 | The set of resources currently in use in this resource pool. 
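
    Example
    -------
    A small sketch of the allocation cycle (the pool size, request size, and
    job id below are made up)::

        cpus = ResourcePool(ResourceType.CPU, 16)
        intervals = cpus.find(4, data=1)  # locate 4 units for job id 1
        if intervals:  # an empty tree means the request does not fit
            cpus.allocate(intervals)
            ...
            cpus.free(intervals)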
33 | """ 34 | 35 | used_pool: IntervalTree 36 | 37 | def __init__( 38 | self, 39 | resource_type: ResourceType, 40 | size: int, 41 | used_pool: IntervalTree = None, 42 | ): 43 | self.size = size 44 | self.used_resources = 0 45 | self.type = resource_type 46 | if used_pool is None: 47 | self.used_pool = IntervalTree() 48 | else: 49 | self.used_pool = used_pool 50 | self.used_resources = sum( 51 | [ResourcePool.measure(i) for i in used_pool] 52 | ) 53 | 54 | def clone(self): 55 | """Duplicates this ResourcePool in memory.""" 56 | return copy.deepcopy(self) 57 | 58 | @property 59 | def free_resources(self) -> int: 60 | """Returns the amount of free resources in this resource pool""" 61 | return self.size - self.used_resources 62 | 63 | def fits(self, size) -> bool: 64 | """Checks whether a given amount of resources can be allocated. 65 | 66 | Parameters 67 | ---------- 68 | size : int 69 | The amount of resources to allocate in this pool 70 | 71 | Returns: 72 | bool: True when the size fits the pool, and False otherwise. 73 | """ 74 | if size <= 0: 75 | raise AssertionError("Can't allocate zero resources") 76 | return size <= self.free_resources 77 | 78 | @staticmethod 79 | def measure(interval: Interval): 80 | """Measures the size of an interval. 81 | 82 | Parameters 83 | ---------- 84 | interval : Interval 85 | The interval to be measured. 86 | """ 87 | return interval.end - interval.begin 88 | 89 | def find(self, size: int, data: Optional[int] = None) -> IntervalTree: 90 | """Finds an interval tree of a given size in this resource pool. 91 | 92 | This is essentially an operation to find *which* resources to allocate 93 | considering that we manage individual resource units and guarantee 94 | exclusive usage by a resource unit. 95 | 96 | Parameters 97 | ---------- 98 | size : int 99 | The size (amount) of resources to allocate 100 | data : Optional[int] 101 | The identifier of the "owner" of the found resources. This 102 | allows us to keep track which job "owns" which resources during 103 | execution. 104 | 105 | Returns: 106 | IntervalTree: An interval tree with the size requested if such 107 | a tree can be found. Otherwise, an empty tree is returned. 108 | """ 109 | used = IntervalTree() 110 | if not self.fits(size): 111 | return used 112 | free = IntervalTree([Interval(0, self.size, data)]) 113 | used_size: int = 0 114 | for interval in self.used_pool: 115 | free.chop(interval.begin, interval.end) 116 | for interval in free: 117 | temp_size = ResourcePool.measure(interval) + used_size 118 | if temp_size == size: 119 | used.add(interval) 120 | break 121 | if temp_size < size: 122 | used.add(interval) 123 | used_size = temp_size 124 | else: 125 | used.add( 126 | Interval( 127 | interval.begin, interval.begin + size - used_size, data 128 | ) 129 | ) 130 | break 131 | return used 132 | 133 | def allocate(self, intervals: Iterable[Interval]) -> None: 134 | """Adds a set of intervals to the current used pool of resources. 135 | 136 | This is the opposite of :func:`schedgym.cluster.Cluster.free`. 137 | 138 | Parameters 139 | ---------- 140 | intervals : Iterable[Interval] 141 | The set of intervals that should be allocated (most likely, 142 | this will be the resource of calling 143 | :func:`schedgym.cluster.Cluster.find`). 
144 | 145 | Returns: 146 | None 147 | """ 148 | for i in intervals: 149 | if self.used_resources + self.measure(i) > self.size: 150 | raise AssertionError( 151 | 'Tried to allocate past size of resource pool' 152 | ) 153 | self.used_pool.add(i) 154 | self.used_resources += self.measure(i) 155 | 156 | def free(self, intervals: Iterable[Interval]) -> None: 157 | """Frees a set of used resources. 158 | 159 | This is the opposite of :func:`schedgym.cluster.Cluster.allocate`. 160 | 161 | Parameters 162 | ---------- 163 | intervals : Iterable[Interval] 164 | The set of intervals to be freed (most likely, these will have 165 | been allocated with the output of 166 | :func:`schedgym.cluster.Cluster.find`). 167 | """ 168 | for i in intervals: 169 | if i not in self.used_pool: 170 | raise AssertionError('Tried to free unused resource set') 171 | self.used_pool.remove(i) 172 | self.used_resources -= self.measure(i) 173 | 174 | @property 175 | def intervals(self) -> List[Interval]: 176 | """The set of intervals currently used in this resource pool.""" 177 | # pylint: disable=unnecessary-comprehension 178 | return [i for i in self.used_pool] 179 | 180 | def __repr__(self): 181 | return ( 182 | f'ResourcePool(resource_type={self.type}, ' 183 | f'size={self.size}, used_pool={self.used_pool})' 184 | ) 185 | -------------------------------------------------------------------------------- /schedgym/event.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """event - Event Handling classes 5 | 6 | We have a basic Event type, which is specialized by 7 | 1. A ResourceEvent, related to events that occur to resources and 8 | 2. A JobEvent, related to events that occur to jobs 9 | """ 10 | 11 | import copy 12 | import enum 13 | import warnings 14 | from typing import List, Optional, Iterable, TypeVar, Generic, Iterator 15 | 16 | from intervaltree import Interval 17 | 18 | from .job import Job 19 | from .heap import Heap 20 | from .pool import ResourceType 21 | 22 | T = TypeVar('T', bound='Event') # pylint: disable=C 23 | 'Generic type for type annotations' 24 | 25 | 26 | class EventType(enum.IntEnum): 27 | """Enumeration for the different types of events that can occur.""" 28 | 29 | RESOURCE_ALLOCATE = 0 30 | RESOURCE_FREE = 1 31 | JOB_FINISH = 2 32 | JOB_START = 3 33 | 34 | 35 | class Event: 36 | """A base event class. 37 | 38 | Parameters 39 | ---------- 40 | time : int 41 | The time at which this event occurs 42 | type : EventType 43 | What is the type of this event 44 | """ 45 | 46 | time: int 47 | type: EventType 48 | 49 | def __init__(self, time: int, type: EventType): 50 | # pylint: disable=redefined-builtin 51 | self.time = time 52 | self.type = type 53 | 54 | def clone(self): 55 | """Clones this event. 56 | 57 | Returns: 58 | A new event identical to this one, but with no memory sharing. 59 | """ 60 | return copy.copy(self) 61 | 62 | 63 | class ResourceEvent(Event): 64 | """An event related to resource allocation or to the freeing of resources. 
65 | 
66 |     Parameters
67 |     ----------
68 |     time : int
69 |         The time at which this event occurs
70 |     type : EventType
71 |         What is the type of this event
72 |     resources : Iterable[Interval]
73 |         The resources that are being allocated/free'd by this event
74 |     """
75 | 
76 |     resources: Iterable[Interval]
77 |     resource_type: ResourceType
78 | 
79 |     def __init__(
80 |         self,
81 |         time: int,
82 |         type: EventType,
83 |         resource_type: ResourceType,
84 |         resources: Iterable[Interval],
85 |     ):
86 |         # pylint: disable=redefined-builtin
87 |         super().__init__(time, type)
88 |         self.resources = resources
89 |         self.resource_type = resource_type
90 | 
91 | 
92 | class JobEvent(Event):
93 |     """An event related to the start or finish of jobs.
94 | 
95 |     Parameters
96 |     ----------
97 |     time : int
98 |         The time at which this event occurs
99 |     type : EventType
100 |         What is the type of this event
101 |     job : Job
102 |         The job to which this event applies
103 |     """
104 | 
105 |     job: Job
106 | 
107 |     def __init__(self, time: int, type: EventType, job: Job):
108 |         # pylint: disable=redefined-builtin
109 |         super().__init__(time, type)
110 |         self.job = job
111 | 
112 |     @property
113 |     def processors(self) -> Iterable[Interval]:
114 |         """The processors touched by the job that caused this event"""
115 |         return self.job.resources.processors
116 | 
117 |     @property
118 |     def memory(self) -> Iterable[Interval]:
119 |         """The memory touched by the job that caused this event"""
120 |         return self.job.resources.memory
121 | 
122 |     def __str__(self):
123 |         return f'JobEvent<{self.time}, {self.type.name}, {self.job}>'
124 | 
125 |     def __repr__(self):
126 |         return str(self)
127 | 
128 | 
129 | class EventQueue(Generic[T]):
130 |     """A priority queue of events sorted by time.
131 | 
132 |     Parameters
133 |     ----------
134 |     time : int
135 |         The moment in time this event queue begins.
136 |     """
137 | 
138 |     time: int
139 |     past: List[T]
140 |     future: Heap[T]
141 | 
142 |     def __init__(self, time: int = 0):
143 |         self.past = []
144 |         self.time = time
145 |         self.future = Heap()
146 | 
147 |     def add(self, event: T) -> None:
148 |         """Adds a new event to the priority queue.
149 | 
150 |         Parameters
151 |         ----------
152 |         event
153 |             The event to be added
154 |         """
155 |         if event.time >= self.time:
156 |             self.future.add(event, (event.time, event.type))
157 |         else:
158 |             self.past.append(event)
159 |             self.past.sort(key=lambda e: e.time)
160 |             warnings.warn(
161 |                 'Adding events to the past might change the '
162 |                 'ordering of events that happened at the same '
163 |                 'time.'
164 |             )
165 | 
166 |     def step(self, time: int = 1) -> Iterable[T]:
167 |         """Steps time in the event queue.
168 | 
169 |         Parameters
170 |         ----------
171 |         time : int
172 |             The amount of time steps to perform
173 | 
174 |         Returns:
175 |             A list with all events that happened between the previous time and
176 |             the current time.
177 |         """
178 |         if time < 0:
179 |             raise AssertionError('Tried to move into the past.')
180 |         self.time += time
181 |         present: List[T] = []
182 |         first = self.future.first
183 |         while first and first.time <= self.time:
184 |             current = self.future.pop()
185 |             present.append(current)
186 |             self.past.append(current)
187 |             first = self.future.first
188 |         return present
189 | 
190 |     def remove(self, event: Event) -> None:
191 |         """Removes an event from the queue.
192 | 
193 |         The event is required to not have happened yet, as removal of past
194 |         events is not supported.
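
        Example
        -------
        A small sketch (the event time below is made up)::

            queue = EventQueue()
            event = Event(5, EventType.JOB_START)
            queue.add(event)
            queue.remove(event)  # fine: the event has not happened yet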
195 | """ 196 | if event not in self.future: 197 | raise ValueError('Tried to remove non-existant value') 198 | self.future.remove(event) 199 | 200 | @property 201 | def first(self) -> Optional[T]: # XXX: This is probably not needed 202 | """The first event in the future to happen in this queue.""" 203 | return self.future.first 204 | 205 | @property 206 | def next(self) -> Optional[T]: 207 | """The next event to happen in this queue.""" 208 | if len(self.future) == 0: 209 | return None 210 | return self.future.first 211 | 212 | @property 213 | def last(self) -> Optional[T]: 214 | """The last event to have happened in this queue.""" 215 | return self.past[-1] if self.past else None 216 | 217 | def __iter__(self) -> Iterator[T]: 218 | return self.future.heapsort() 219 | 220 | def __str__(self) -> str: 221 | return f'{[e for e in self.future.heapsort()]}' 222 | 223 | def __repr__(self): 224 | return str(self) 225 | -------------------------------------------------------------------------------- /schedgym/envs/simulator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from enum import IntEnum 5 | from typing import Callable, List, Optional, Union, cast 6 | 7 | from schedgym.job import Job 8 | from schedgym.scheduler import NullScheduler 9 | from schedgym.envs.workload import ( 10 | DeepRmWorkloadGenerator, 11 | SyntheticWorkloadGenerator, 12 | ) 13 | 14 | WorkloadGeneratorType = Union[ 15 | DeepRmWorkloadGenerator, SyntheticWorkloadGenerator 16 | ] 17 | 18 | 19 | class SimulationType(IntEnum): 20 | EVENT_BASED = (0,) 21 | TIME_BASED = 1 22 | 23 | @staticmethod 24 | def from_str(simulation_type: str): 25 | simulation_type = simulation_type.upper().replace('-', '_') 26 | if simulation_type in SimulationType.__members__: 27 | return SimulationType[simulation_type] 28 | else: 29 | raise ValueError( 30 | f'{simulation_type} is not a valid SimulationType.' 
31 | ) 32 | 33 | 34 | class DeepRmSimulator: 35 | scheduler: NullScheduler 36 | workload: Union[DeepRmWorkloadGenerator, SyntheticWorkloadGenerator] 37 | 38 | def __init__( 39 | self, 40 | workload_generator: WorkloadGeneratorType, 41 | scheduler: NullScheduler, 42 | simulation_type: SimulationType = SimulationType.TIME_BASED, 43 | job_slots: Optional[int] = None, 44 | ): 45 | 46 | self.scheduler = scheduler 47 | self.workload = workload_generator 48 | self.simulation_type = simulation_type 49 | self.job_slots = slice(0, job_slots) 50 | self.simulator = self.build() 51 | self.reset(self.workload, scheduler) 52 | 53 | def rl_step( 54 | self, 55 | action: Optional[int], 56 | listjobs: Optional[Callable[[], List[Job]]], 57 | ) -> List[List[Job]]: 58 | return self.simulator.rl_step( 59 | action if action is not None else -1, 60 | listjobs if listjobs else lambda: self.scheduler.jobs_in_system, 61 | ) 62 | 63 | def build(self): 64 | if self.simulation_type == SimulationType.EVENT_BASED: 65 | return EventBasedDeepRmSimulator( 66 | self.workload, 67 | self.scheduler, 68 | self.job_slots, 69 | ) 70 | elif self.simulation_type == SimulationType.TIME_BASED: 71 | return TimeBasedDeepRmSimulator( 72 | self.workload, 73 | self.scheduler, 74 | self.job_slots, 75 | ) 76 | else: 77 | raise NotImplementedError( 78 | f'Unsupported simulation type {self.simulation_type}' 79 | ) 80 | 81 | @property 82 | def current_time(self): 83 | return self.simulator.current_time 84 | 85 | @property 86 | def last_job_time(self): 87 | return self.simulator.last_job_time 88 | 89 | def reset(self, workload, scheduler): 90 | self.scheduler = scheduler 91 | self.workload = workload 92 | self.simulator = self.build() 93 | 94 | 95 | class EventBasedDeepRmSimulator: 96 | last_job_time: int 97 | scheduler: NullScheduler 98 | job_slots: slice 99 | 100 | def __init__( 101 | self, 102 | workload_generator: WorkloadGeneratorType, 103 | scheduler: NullScheduler, 104 | job_slots: slice, 105 | ): 106 | if ( 107 | not isinstance(workload_generator, DeepRmWorkloadGenerator) 108 | and not isinstance(workload_generator, SyntheticWorkloadGenerator) 109 | ) or not isinstance(scheduler, NullScheduler): 110 | raise AssertionError('Invalid arguments received.') 111 | 112 | self.current_time = 0 113 | self.scheduler = scheduler 114 | self.simulation_start_time = 0 115 | self.workload = workload_generator 116 | self.job_slots = job_slots 117 | 118 | self.current_time = self.last_job_time = 0 119 | if isinstance(workload_generator, SyntheticWorkloadGenerator): 120 | first_job_time = cast( 121 | Job, workload_generator.peek() 122 | ).submission_time - 1 123 | workload_generator.current_time = first_job_time 124 | scheduler.job_events.time = first_job_time 125 | scheduler.current_time = first_job_time 126 | self.current_time = first_job_time 127 | 128 | def rl_step( 129 | self, action: int, listjobs: Callable[[], List[Job]] 130 | ) -> List[List[Job]]: 131 | "Returns a list of jobs for each successful intermediate time step." 
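        # Control flow: first try to apply the selected action; when the
        # scheduling decision succeeds, the simulation clock does not move.
        # Otherwise, submit newly-generated jobs and forward time until at
        # least one job in the visible slots fits the cluster, collecting
        # the jobs listed by ``listjobs`` at each intermediate time step.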
132 | 133 | if self.scheduler.step(action): 134 | return [[]] 135 | 136 | jobs: List[List[Job]] = [] 137 | self.current_time += 1 138 | while True: 139 | j = self.workload.step() 140 | if j: 141 | self.scheduler.submit(j) 142 | self.last_job_time = self.current_time 143 | self.scheduler.forward_time() 144 | jobs.append(listjobs()) 145 | if self.scheduler.some_job_fits(self.job_slots): 146 | break 147 | return jobs 148 | 149 | 150 | class TimeBasedDeepRmSimulator: 151 | last_job_time: int 152 | scheduler: NullScheduler 153 | job_slots: slice 154 | 155 | def __init__( 156 | self, 157 | workload_generator: WorkloadGeneratorType, 158 | scheduler: NullScheduler, 159 | job_slots: slice, 160 | ): 161 | if ( 162 | not isinstance(workload_generator, DeepRmWorkloadGenerator) 163 | and not isinstance(workload_generator, SyntheticWorkloadGenerator) 164 | ) or not isinstance(scheduler, NullScheduler): 165 | raise AssertionError('Invalid arguments received.') 166 | 167 | self.scheduler = scheduler 168 | self.simulation_start_time = 0 169 | self.workload = workload_generator 170 | self.current_time = self.last_job_time = 0 171 | self.job_slots = job_slots 172 | 173 | if isinstance(workload_generator, SyntheticWorkloadGenerator): 174 | first_job_time = cast( 175 | Job, workload_generator.peek() 176 | ).submission_time - 1 177 | workload_generator.current_time = first_job_time 178 | scheduler.job_events.time = first_job_time 179 | scheduler.current_time = first_job_time 180 | 181 | def step(self, _=True): 182 | """Not implemented in DeepRmSimulator""" 183 | raise NotImplementedError('This simulator cannot follow the base API') 184 | 185 | def rl_step( 186 | self, action: int, listjobs: Callable[[], List[Job]] 187 | ) -> List[List[Job]]: 188 | "Returns a list of jobs for each successful intermediate time step." 
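        # Control flow: a successful scheduling decision is free (the clock
        # does not move); any other action advances the simulation by exactly
        # one time step, submitting whatever jobs the workload generated.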
189 | 190 | if self.scheduler.step(action): 191 | return [[]] 192 | else: 193 | self.current_time += 1 194 | j = self.workload.step() 195 | if j: 196 | self.scheduler.submit(j) 197 | self.last_job_time = self.current_time 198 | self.scheduler.forward_time() 199 | return [listjobs()] 200 | -------------------------------------------------------------------------------- /schedgym/envs/compact_env.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import numpy as np 5 | import gym.spaces.box 6 | import gym.spaces.discrete 7 | 8 | from ..job import Job 9 | from .base import BaseRmEnv 10 | 11 | import logging 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | MAXIMUM_JOB_LENGTH = 15 17 | RESOURCE_SLOTS = 10 18 | MAXIMUM_JOB_SIZE = 10 19 | 20 | AMOUNT_OF_MEMORY = 10 21 | NUMBER_OF_RESOURCES = 2 22 | NUMBER_OF_PROCESSORS = 10 23 | MAXIMUM_NUMBER_OF_ACTIVE_JOBS = 40 # Number of colors in image 24 | MAX_TIME_TRACKING_SINCE_LAST_JOB = 10 25 | 26 | NEW_JOB_RATE = 0.7 27 | SMALL_JOB_CHANCE = 0.8 28 | 29 | DEFAULT_WORKLOAD = { 30 | 'type': 'deeprm', 31 | 'new_job_rate': NEW_JOB_RATE, 32 | 'max_job_size': MAXIMUM_JOB_SIZE, 33 | 'max_job_len': MAXIMUM_JOB_LENGTH, 34 | 'small_job_chance': SMALL_JOB_CHANCE, 35 | } 36 | 37 | 38 | class CompactRmEnv(BaseRmEnv): 39 | metadata = {'render.modes': ['human', 'rgb_array']} 40 | 41 | def __init__(self, **kwargs): 42 | super().__init__(**kwargs) 43 | 44 | self.memory = kwargs.get('memory', AMOUNT_OF_MEMORY) 45 | self.processors = kwargs.get('processors', NUMBER_OF_PROCESSORS) 46 | 47 | self.renderer = kwargs.get('renderer', None) 48 | 49 | self.maximum_work = self.processors 50 | self.maximum_work_mem = self.memory 51 | 52 | self._setup_spaces() 53 | 54 | def _setup_spaces(self): 55 | self.action_space = gym.spaces.discrete.Discrete(self.job_slots + 1) 56 | 57 | self.observation_space = gym.spaces.box.Box( 58 | low=0.0, high=1.0, shape=((len(self.state),)), dtype=np.float32 59 | ) 60 | 61 | def reset(self) -> np.ndarray: 62 | super().reset() 63 | self.maximum_work = self.time_limit * self.processors 64 | self.maximum_work_mem = self.time_limit * self.memory 65 | return super().reset() 66 | 67 | def step(self, action: int): 68 | done = False 69 | found = True 70 | if not (0 <= action < self.action_space.n - 1): 71 | found = False 72 | 73 | try: 74 | intermediate = self.simulator.rl_step( 75 | action if found else None, self.reward_mapper[self.reward_jobs] 76 | ) 77 | # XXX: This is technically incorrect. The correct thing to do here 78 | # is: when we have a trace-based workload generator, we need to 79 | # maintain a check on whether we want to sample from it or not, and 80 | # use the time limit to actually decide whether we're done or not. 81 | # In the current setting, we might potentially "lose" the last jobs 82 | # of the workload. 
83 | except StopIteration: 84 | intermediate = [[Job()]] 85 | done = True 86 | 87 | reward = self.reward if any(intermediate) else 0 88 | done = bool(self.time_limit) and ( 89 | self.scheduler.current_time > self.time_limit or done 90 | ) 91 | 92 | if not done and self.smdp and any(intermediate): 93 | rewards = [self.compute_reward(js) for js in intermediate] 94 | rewards[0] = 0 95 | reward = ( 96 | self.gamma ** np.arange(len(intermediate)) 97 | ).dot(rewards) 98 | 99 | return ( 100 | self.state, 101 | reward, 102 | done, 103 | self.stats if done else {} 104 | ) 105 | 106 | @property 107 | def state(self): 108 | state, jobs, backlog = self.scheduler.state( 109 | self.time_horizon, self.job_slots 110 | ) 111 | newstate = np.zeros( 112 | (len(state[0]) * (1 if self.ignore_memory else 2) * 2) 113 | ) 114 | newstate[: len(state[0]) * 2] = ( 115 | np.array( 116 | [(e[0], e[1]) for e in state[0]], 117 | dtype=np.float32 118 | ).reshape((-1,),) / self.processors 119 | ) 120 | if not self.ignore_memory: 121 | newstate[len(state[0]) * 2:] = ( 122 | np.array( 123 | [(e[0], e[1]) for e in state[1]], 124 | dtype=np.float32 125 | ).reshape((-1,)) / self.memory 126 | ) 127 | jobs = self._normalize_jobs(jobs).reshape((-1,)) 128 | backlog = backlog * np.ones(1) / self.backlog_size 129 | 130 | running = [ 131 | j 132 | for j in self.scheduler.queue_running 133 | if j.submission_time + j.requested_time 134 | > self.scheduler.current_time 135 | ] 136 | 137 | remaining_work = ( 138 | sum( 139 | [ 140 | ( 141 | j.submission_time 142 | + j.requested_time 143 | - self.scheduler.current_time 144 | ) 145 | * j.requested_processors 146 | for j in running 147 | ] 148 | ) 149 | / self.maximum_work 150 | ) 151 | remaining_work_mem = ( 152 | sum( 153 | [ 154 | ( 155 | j.submission_time 156 | + j.requested_time 157 | - self.scheduler.current_time 158 | ) 159 | * j.requested_memory 160 | for j in running 161 | ] 162 | ) 163 | / self.maximum_work_mem 164 | ) 165 | 166 | # XXX: this normalization only works while we're sampling at most one 167 | # job per time step. 
Once this is not true, we risk having the
168 |         # queue_size feature > 1.0 (which is incorrect)
169 |         queue_size = len(self.scheduler.queue_admission) / self.time_limit
170 |         time_left = 1 - self.scheduler.current_time / self.time_limit
171 | 
172 |         try:
173 |             next_free = min(
174 |                 running, key=lambda x: x.start_time + x.requested_time
175 |             )
176 |             next_free = np.array(
177 |                 (
178 |                     (
179 |                         next_free.start_time
180 |                         + next_free.requested_time
181 |                         - self.scheduler.current_time
182 |                     )
183 |                     / self.time_limit,
184 |                     next_free.requested_processors / self.processors,
185 |                     (state[0][0][0] + next_free.requested_processors)
186 |                     / self.processors,
187 |                 )
188 |             )
189 |         except ValueError:
190 |             next_free = np.array((0, 0, 1.0))
191 | 
192 |         return np.hstack(
193 |             (
194 |                 newstate,
195 |                 jobs,
196 |                 backlog,
197 |                 next_free,
198 |                 np.array(
199 |                     (remaining_work, remaining_work_mem, queue_size, time_left)
200 |                 ),
201 |             ),
202 |         )
203 | 
204 |     def _normalize_jobs(self, jobs):
205 |         def _sumdiv(arr, idx, orig, limit):
206 |             arr[idx] = (orig + 1) / (limit + 1)
207 | 
208 |         ret = np.zeros((len(jobs), len(jobs[0])), dtype=np.float32)
209 |         for i, job in enumerate(jobs):
210 |             _sumdiv(ret[i], 0, job.submission_time, self.time_limit)
211 |             _sumdiv(ret[i], 1, job.requested_time, self.time_limit)
212 |             _sumdiv(ret[i], 2, job.requested_memory, self.memory)
213 |             _sumdiv(ret[i], 3, job.requested_processors, self.processors)
214 |             _sumdiv(ret[i], 4, job.queue_size, self.time_limit)
215 |             _sumdiv(
216 |                 ret[i],
217 |                 5,
218 |                 job.queued_work,
219 |                 self.time_limit * self.time_limit * self.processors,
220 |             )
221 |             _sumdiv(ret[i], 6, job.free_processors, self.processors)
222 |         return ret
--------------------------------------------------------------------------------
/docs/img/job-resource.svg:
--------------------------------------------------------------------------------
1 | [SVG class diagram: a Job holds a Resource (resources: Resource); a Resource holds processors: IntervalTree and memory: IntervalTree]
--------------------------------------------------------------------------------
/docs/img/cluster-resourcepool.svg:
--------------------------------------------------------------------------------
1 | [SVG class diagram: a Cluster is backed by ResourcePools; each ResourcePool manages an IntervalTree of used resources]
--------------------------------------------------------------------------------
/schedgym/cluster.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | """cluster - Classes for cluster management
5 | 
6 | The workhorse of this module is the :class:`schedgym.cluster.Cluster` class,
7 | which manages resources in a cluster.
8 | """
9 | 
10 | import copy
11 | from typing import Tuple, Iterable, Optional
12 | 
13 | from . import pool
14 | 
15 | from .job import Job, Resource
16 | from .event import JobEvent, EventType
17 | 
18 | # pylint: disable=C
19 | RESOURCE_TYPE = Tuple[Iterable[pool.Interval], Iterable[pool.Interval]]
20 | 
21 | 
22 | class Cluster:
23 |     """A cluster as a set of resources.
24 | 
25 |     Currently, this doesn't make a distinction between machines. So it only
26 |     manages groups of resources.
27 | 
28 |     Note that although we don't differentiate between machines, we **do** honor
29 |     resources. Therefore, if a given processor is allocated to a job *j*, we make
30 |     sure not to allocate that processor to any other job until *j* finishes.
31 | 
32 |     Due to the above constraint, some checks are more complex (and,
33 |     consequently, slower) than if we disregarded *which* processors and memory
34 |     units were used and only counted the *amount* of resources used.
35 | 
36 |     This makes our design slightly closer to reality, though.
37 | 
38 |     The figure below shows the relationship between clusters, ResourcePools,
39 |     and the basic data structure for resource management (`IntervalTree`).
40 | 
41 |     .. image:: /img/cluster-resourcepool.svg
42 | 
43 |     Parameters
44 |     ----------
45 |     processors : int
46 |         The number of processors in this cluster
47 |     memory : int
48 |         The amount of memory in this cluster
49 |     ignore_memory : bool
50 |         Whether memory should be considered for decisions or not
51 |     used_processors : Optional[Resource]
52 |         Processors already in use in this cluster
53 |     used_memory : Optional[Resource]
54 |         Amount of memory already used in this cluster
55 |     """
56 | 
57 |     ignore_memory: bool
58 |     memory: pool.ResourcePool
59 |     processors: pool.ResourcePool
60 | 
61 |     def __init__(
62 |         self,
63 |         processors: int,
64 |         memory: int,
65 |         ignore_memory: bool = False,
66 |         used_processors: Optional[Resource] = None,
67 |         used_memory: Optional[Resource] = None,
68 |     ):
69 |         self.ignore_memory = ignore_memory
70 |         self.memory = pool.ResourcePool(
71 |             pool.ResourceType.MEMORY, memory, used_memory
72 |         )
73 |         self.processors = pool.ResourcePool(
74 |             pool.ResourceType.CPU, processors, used_processors
75 |         )
76 | 
77 |     @property
78 |     def free_resources(self) -> Tuple[int, int]:
79 |         """The set of resources *not* in use in this cluster."""
80 |         return self.processors.free_resources, self.memory.free_resources
81 | 
82 |     def fits(self, job: Job) -> bool:
83 |         """Checks whether a job fits in this cluster.
84 | 
85 |         Parameters
86 |         ----------
87 |         job : Job
88 |             The job to check for a fit in this cluster
89 | 
90 |         Returns:
91 |             True if the job fits the cluster (can be added to the cluster), and
92 |             False otherwise
93 |         """
94 |         return self.processors.fits(job.requested_processors) and (
95 |             self.ignore_memory or self.memory.fits(job.requested_memory)
96 |         )
97 | 
98 |     def allocate(self, job: Job) -> None:
99 |         """Checks whether a job fits the system and allocates resources for it.
100 | 
101 |         Parameters
102 |         ----------
103 |         job : Job
104 |             The job to allocate resources to.
105 |         """
106 |         if not self.fits(job):
107 |             raise AssertionError(
108 |                 f'Unable to allocate resources for {job} in {self}'
109 |             )
110 |         self.processors.allocate(job.resources.processors)
111 |         self.memory.allocate(job.resources.memory)
112 | 
113 |     def clone(self):
114 |         """Clones this Cluster (duplicating it in memory)."""
115 |         return copy.deepcopy(self)
116 | 
117 |     def find(self, job: Job) -> Resource:
118 |         """Finds resources for a job.
119 | 
120 |         If the job fits in the system, this will return a set of resources that
121 |         can be used by a job. If it doesn't, it will return an empty set of
122 |         resources (which evaluates to False in boolean expressions).
123 | 
124 |         Parameters
125 |         ----------
126 |         job : Job
127 |             The job to find resources for.
128 |         """
129 |         p = self.processors.find(job.requested_processors, job.id)
130 |         if not p:
131 |             return Resource()
132 |         if self.ignore_memory:
133 |             return Resource(p, ignore_memory=True)
134 |         m = self.memory.find(job.requested_memory, job.id)
135 |         return Resource(p, m)
136 | 
137 |     def free(self, job: Job) -> None:
138 |         """Frees the resources used by a job.
139 | 
140 |         Parameters
141 |         ----------
142 |         job : Job
143 |             The job to free resources from.
144 |         """
145 |         self.processors.free(job.resources.processors)
146 |         if not self.ignore_memory:
147 |             self.memory.free(job.resources.memory)
148 | 
149 |     def find_resources_at_time(
150 |         self, time: int, job: Job, events: Iterable[JobEvent]
151 |     ) -> Resource:
152 |         """Finds resources for a job at a given time step.
153 | 
154 |         This is probably the most complex (and most important) function in this
155 |         class. To find an allocation for a job, we have to iterate through the
156 |         queue of events and evaluate the state of the system given that set
157 |         of events to check whether a given job would fit the system.
158 | 
159 |         Since this method can be called with time stamps in the far future, we
160 |         are required to play events to find the exact configuration in the
161 |         future.
162 | 
163 |         Parameters
164 |         ----------
165 |         time : int
166 |             The time at which to check whether the job fits the system
167 |         job : Job
168 |             The job to check
169 |         events : Iterable[JobEvent]
170 |             A set of events that will play out in the future
171 | 
172 |         Returns:
173 |             A set of resources if the job fits the cluster at time `time`, or
174 |             an empty set of resources otherwise. (See
175 |             :func:`schedgym.cluster.Cluster.find`.)
176 |         """
177 |         def valid(e, time):
178 |             return time + 1 <= e.time < job.requested_time + time
179 | 
180 |         used = Resource(self.processors.used_pool, self.memory.used_pool)
181 |         for event in (
182 |             e
183 |             for e in events
184 |             if (valid(e, time) and e.type == EventType.JOB_START)
185 |         ):
186 |             for i in event.processors:
187 |                 used.processors.add(i)
188 |             for i in event.memory:
189 |                 used.memory.add(i)
190 |         used.processors.merge_overlaps()
191 |         used.memory.merge_overlaps()
192 |         return Cluster(
193 |             self.processors.size,
194 |             self.memory.size,
195 |             self.ignore_memory,
196 |             used.processors,
197 |             used.memory,
198 |         ).find(job)
199 | 
200 |     @property
201 |     def state(self) -> Tuple[Tuple[int, int, dict], ...]:
202 |         """Gets the current state of the cluster.
203 | 
204 |         Returns:
205 |             Tuple: a pair containing the number of processors used and the
206 |             memory used and the jobs that are using such resources.
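
        Example
        -------
        An illustrative sketch (the cluster sizes below are made up)::

            cluster = Cluster(processors=10, memory=10)
            (free_procs, used_procs, owners), _ = cluster.state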
207 | """ 208 | processors = ( 209 | self.processors.free_resources, 210 | self.processors.used_resources, 211 | {(i.begin, i.end): i.data for i in self.processors.used_pool}, 212 | ) 213 | memory = ( 214 | self.memory.free_resources, 215 | self.memory.used_resources, 216 | {(i.begin, i.end): i.data for i in self.memory.used_pool}, 217 | ) 218 | if self.ignore_memory: 219 | return (processors,) 220 | else: 221 | return processors, memory 222 | 223 | def __bool__(self): 224 | return ( 225 | self.processors.free_resources != 0 226 | and self.memory.free_resources != 0 227 | ) 228 | 229 | def __repr__(self): 230 | return ( 231 | f'Cluster({self.processors}, {self.memory}, {self.ignore_memory})' 232 | ) 233 | 234 | def __str__(self): 235 | return ( 236 | f'Cluster({self.processors}, {self.memory}, {self.ignore_memory})' 237 | ) 238 | -------------------------------------------------------------------------------- /schedgym/envs/base.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import random 5 | from enum import IntEnum 6 | from typing import List, Dict 7 | from abc import ABC, abstractmethod 8 | 9 | import gym 10 | 11 | import numpy as np 12 | 13 | from .simulator import SimulationType, DeepRmSimulator 14 | from ..scheduler.null_scheduler import NullScheduler 15 | from .workload import build as build_workload 16 | 17 | BACKLOG_SIZE = 60 18 | MAXIMUM_NUMBER_OF_ACTIVE_JOBS = 40 # Number of colors in image 19 | MAX_TIME_TRACKING_SINCE_LAST_JOB = 10 20 | 21 | TIME_HORIZON = 20 22 | JOB_SLOTS = 5 23 | AMOUNT_OF_MEMORY = 10 24 | NUMBER_OF_PROCESSORS = 10 25 | MAXIMUM_JOB_LENGTH = 15 26 | MAXIMUM_JOB_SIZE = 10 27 | NEW_JOB_RATE = 0.7 28 | SMALL_JOB_CHANCE = 0.8 29 | DEFAULT_WORKLOAD = { 30 | 'type': 'deeprm', 31 | 'new_job_rate': NEW_JOB_RATE, 32 | 'max_job_size': MAXIMUM_JOB_SIZE, 33 | 'max_job_len': MAXIMUM_JOB_LENGTH, 34 | 'small_job_chance': SMALL_JOB_CHANCE, 35 | } 36 | 37 | 38 | class RewardJobs(IntEnum): 39 | ALL = (0,) 40 | JOB_SLOTS = (1,) 41 | WAITING = (2,) 42 | RUNNING_JOB_SLOTS = (3,) 43 | 44 | @staticmethod 45 | def from_str(reward_range: str): 46 | reward_range = reward_range.upper().replace('-', '_') 47 | if reward_range in RewardJobs.__members__: 48 | return RewardJobs[reward_range] 49 | else: 50 | raise ValueError( 51 | f'{reward_range} is not a valid RewardJobs range. ' 52 | f'Valid options are: {list(RewardJobs.__members__.keys())}.' 
53 | ) 54 | 55 | 56 | class BaseRmEnv(ABC, gym.Env): 57 | metadata = {'render.modes': ['human', 'rgb_array']} 58 | 59 | job_slots: int 60 | time_limit: int 61 | job_num_cap: int 62 | time_horizon: int 63 | ignore_memory: bool 64 | color_index: List[int] 65 | color_cache: Dict[int, int] 66 | simulator: DeepRmSimulator 67 | 68 | @abstractmethod 69 | def __init__(self, **kwargs): 70 | self.color_cache = {} 71 | self.renderer = kwargs.get('renderer', None) 72 | self.shuffle_colors = kwargs.get('shuffle_colors', False) 73 | self.job_num_cap = kwargs.get( 74 | 'job_num_cap', MAXIMUM_NUMBER_OF_ACTIVE_JOBS 75 | ) 76 | self.simulation_type = SimulationType.from_str( 77 | kwargs.get('simulation_type', 'time_based') 78 | ) 79 | 80 | self.reward_jobs = RewardJobs.from_str( 81 | kwargs.get('reward_jobs', 'all') 82 | ) 83 | 84 | self.smdp = self.simulation_type == SimulationType.EVENT_BASED 85 | self.gamma = kwargs.get('gamma', 1.0) 86 | 87 | self.time_horizon = kwargs.get( 88 | 'time_horizon', TIME_HORIZON 89 | ) # number of time steps in the graph 90 | 91 | time_limit = kwargs.get('time_limit', 200) 92 | if time_limit is None: 93 | self.time_limit = 1 94 | self.update_time_limit = True 95 | else: 96 | self.time_limit = time_limit 97 | self.update_time_limit = False 98 | 99 | step = 1.0 / self.job_num_cap 100 | # zero is already present and set to "no job there" 101 | self.colormap = np.arange(start=step, stop=1, step=step) 102 | if self.shuffle_colors: 103 | np.random.shuffle(self.colormap) 104 | self.color_index = list(range(len(self.colormap))) 105 | 106 | # Number of jobs to show 107 | self.job_slots = kwargs.get('job_slots', JOB_SLOTS) 108 | 109 | self.reward_mapper = { 110 | RewardJobs.ALL: lambda: self.scheduler.jobs_in_system, 111 | RewardJobs.WAITING: lambda: self.scheduler.queue_admission, 112 | RewardJobs.JOB_SLOTS: lambda: self.scheduler.queue_admission[ 113 | : self.job_slots 114 | ], 115 | RewardJobs.RUNNING_JOB_SLOTS: lambda: self.scheduler.queue_running 116 | + self.scheduler.queue_admission[: self.job_slots], 117 | } 118 | 119 | self.backlog_size = kwargs.get('backlog_size', BACKLOG_SIZE) 120 | self.memory = kwargs.get('memory', AMOUNT_OF_MEMORY) 121 | self.processors = kwargs.get('processors', NUMBER_OF_PROCESSORS) 122 | self.ignore_memory = kwargs.get('ignore_memory', False) 123 | 124 | self.workload_config = kwargs.get('workload', DEFAULT_WORKLOAD) 125 | wl = build_workload(self.workload_config) 126 | 127 | scheduler = NullScheduler( 128 | self.processors, self.memory, ignore_memory=self.ignore_memory 129 | ) 130 | self.simulator = DeepRmSimulator( 131 | wl, 132 | scheduler, 133 | simulation_type=self.simulation_type, 134 | job_slots=self.job_slots, 135 | ) 136 | 137 | def reset(self) -> np.ndarray: 138 | scheduler = NullScheduler( 139 | self.processors, self.memory, ignore_memory=self.ignore_memory 140 | ) 141 | wl = build_workload(self.workload_config) 142 | if self.update_time_limit and hasattr(wl, 'trace'): 143 | self.time_limit = ( 144 | wl.trace[-1].submission_time + # type: ignore 145 | wl.trace[-1].execution_time # type: ignore 146 | ) 147 | self.simulator.reset(wl, scheduler) 148 | return self.state 149 | 150 | def _render_state(self): 151 | state, jobs, backlog = self.scheduler.state( 152 | self.time_horizon, self.job_slots 153 | ) 154 | s = self._convert_state( 155 | state, 156 | jobs, 157 | backlog, 158 | ( 159 | (self.simulator.current_time - self.simulator.last_job_time) 160 | / MAX_TIME_TRACKING_SINCE_LAST_JOB 161 | ), 162 | ) 163 | return s 164 | 165 | def 
build_current_state(self, current): 166 | ret = [np.zeros((self.time_horizon, sum(e[0][:-1]))) for e in current] 167 | for i, _ in enumerate(current): 168 | for t in range(self.time_horizon): 169 | for k, v in current[i][t][-1].items(): 170 | ret[i][t][slice(*k)] = v 171 | return ret 172 | 173 | def build_job_slots(self, wait): 174 | memory = np.zeros( 175 | (self.job_slots, self.time_horizon, self.scheduler.total_memory) 176 | ) 177 | processors = np.zeros( 178 | ( 179 | self.job_slots, 180 | self.time_horizon, 181 | self.scheduler.number_of_processors, 182 | ) 183 | ) 184 | for i, j in enumerate(wait): 185 | if j.requested_processors == -1: 186 | break 187 | time_slice = slice( 188 | 0, 189 | self.time_horizon 190 | if j.requested_time > self.time_horizon 191 | else j.requested_time, 192 | ) 193 | processors[i, time_slice, : j.requested_processors] = 1.0 194 | if j.requested_memory != -1: 195 | memory[i, time_slice, : j.requested_memory] = 1.0 196 | return (processors,) if self.ignore_memory else (processors, memory) 197 | 198 | def _convert_state(self, current, wait, backlog, time): 199 | current = self.build_current_state(current) 200 | wait = self.build_job_slots(wait) 201 | backlog_width = self.backlog_size // self.time_horizon 202 | backlog = np.ones(self.time_horizon * backlog_width) * backlog 203 | unique = set(np.unique(current[0])) - {0.0} 204 | if len(unique) > self.job_num_cap: 205 | raise AssertionError('Number of jobs > number of colors') 206 | available_colors = list( 207 | set(self.color_index) 208 | - set( 209 | [self.color_cache[j] for j in unique if j in self.color_cache] 210 | ) 211 | ) 212 | need_color = unique - set(self.color_cache.keys()) 213 | for i, j in enumerate(need_color): 214 | self.color_cache[j] = available_colors[i] 215 | for j in unique: # noqa 216 | for resource in current: 217 | resource[resource == j] = self.colormap[self.color_cache[j]] 218 | 219 | return ( 220 | np.array(current), 221 | np.array(wait), 222 | backlog.reshape((self.time_horizon, -1)), 223 | np.ones((self.time_horizon, 1)) * min(1.0, time), 224 | ) 225 | 226 | def render(self, mode='human'): 227 | if self.renderer is None: 228 | from .render import DeepRmRenderer 229 | 230 | self.renderer = DeepRmRenderer(mode) 231 | rgb = self.renderer.render(self._render_state()) 232 | return rgb 233 | 234 | def seed(self, seed=None): 235 | if seed is None: 236 | seed = random.randint(0, 99999999) 237 | np.random.seed(seed) 238 | random.seed(seed) 239 | return [seed] 240 | 241 | def compute_reward(self, joblist): 242 | return -np.sum([1 / j.execution_time for j in joblist]) 243 | 244 | @property 245 | def reward(self): 246 | return self.compute_reward(self.reward_mapper[self.reward_jobs]()) 247 | 248 | @property 249 | def stats(self): 250 | return self.scheduler.stats 251 | 252 | @property 253 | @abstractmethod 254 | def state(self): 255 | raise NotImplementedError 256 | 257 | @property 258 | def scheduler(self) -> NullScheduler: 259 | return self.simulator.scheduler 260 | -------------------------------------------------------------------------------- /schedgym/envs/workload.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # flake8: noqa E501 4 | 5 | import math 6 | import random 7 | import warnings 8 | import itertools 9 | from math import log2 10 | from typing import Optional, List 11 | from collections import namedtuple 12 | from parallelworkloads.lublin99 import Lublin99 13 | from 
parallelworkloads.tsafrir05 import Tsafrir05 14 | 15 | from schedgym import workload as wl, job 16 | 17 | JobParameters = namedtuple('JobParameters', ['small', 'large']) 18 | 19 | 20 | class DeepRmWorkloadGenerator(wl.DistributionalWorkloadGenerator): 21 | def __init__(self, *args: wl.BinomialWorkloadGenerator): 22 | super().__init__(max([w.length for w in args])) 23 | 24 | self.generators = args 25 | self.counter = itertools.count(1) 26 | 27 | for generator in self.generators: 28 | generator.counter = self.counter 29 | 30 | def step(self, offset=1) -> List[Optional[job.Job]]: 31 | return self.generators[ 32 | random.randint(0, len(self.generators) - 1) 33 | ].step() 34 | 35 | def __len__(self): 36 | return self.generators[0].length 37 | 38 | def peek(self): 39 | return self.step() 40 | 41 | @staticmethod 42 | def build( 43 | new_job_rate, 44 | small_job_chance, 45 | max_job_len, 46 | max_job_size, 47 | ignore_memory=False, 48 | min_large_job_len=None, 49 | max_small_job_len=None, 50 | min_small_job_len=None, 51 | min_dominant_job_size=None, 52 | min_other_job_size=None, 53 | max_other_job_size=None, 54 | runtime_estimates=None, 55 | estimate_parameters=None, 56 | ) -> 'DeepRmWorkloadGenerator': 57 | # Time-related job parameters {{{ 58 | small_job_time_lower = ( 59 | 1 if min_small_job_len is None else min_small_job_len 60 | ) 61 | small_job_time_upper = ( 62 | max(max_job_len // 5, 1) 63 | if max_small_job_len is None 64 | else max_small_job_len 65 | ) 66 | large_job_time_lower = ( 67 | int(max_job_len * (2 / 3)) 68 | if min_large_job_len is None 69 | else min_large_job_len 70 | ) 71 | large_job_time_upper = max_job_len 72 | # }}} 73 | 74 | # Resource-related job parameters {{{ 75 | dominant_resource_lower = ( 76 | max_job_size // 2 77 | if min_dominant_job_size is None 78 | else min_dominant_job_size 79 | ) 80 | dominant_resource_upper = max_job_size 81 | other_resource_lower = ( 82 | 1 if min_other_job_size is None else min_other_job_size 83 | ) 84 | other_resource_upper = ( 85 | max_job_size // 5 86 | if max_other_job_size is None 87 | else max_other_job_size 88 | ) 89 | # }}} 90 | 91 | cpu_dominant_parameters = JobParameters( # {{{ 92 | job.JobParameters( 93 | small_job_time_lower, 94 | small_job_time_upper, 95 | dominant_resource_lower, 96 | dominant_resource_upper, 97 | other_resource_lower, 98 | other_resource_upper, 99 | ), 100 | job.JobParameters( 101 | large_job_time_lower, 102 | large_job_time_upper, 103 | dominant_resource_lower, 104 | dominant_resource_upper, 105 | other_resource_lower, 106 | other_resource_upper, 107 | ), 108 | ) # }}} 109 | 110 | mem_dominant_parameters = JobParameters( # {{{ 111 | job.JobParameters( 112 | small_job_time_lower, 113 | small_job_time_upper, 114 | other_resource_lower, 115 | other_resource_upper, 116 | dominant_resource_lower, 117 | dominant_resource_upper, 118 | ), 119 | job.JobParameters( 120 | large_job_time_lower, 121 | large_job_time_upper, 122 | other_resource_lower, 123 | other_resource_upper, 124 | dominant_resource_lower, 125 | dominant_resource_upper, 126 | ), 127 | ) # }}} 128 | 129 | generators = ( 130 | wl.BinomialWorkloadGenerator( 131 | new_job_rate, 132 | small_job_chance, 133 | cpu_dominant_parameters.small, 134 | cpu_dominant_parameters.large, 135 | runtime_estimates=runtime_estimates, 136 | estimate_parameters=estimate_parameters, 137 | ), 138 | wl.BinomialWorkloadGenerator( 139 | new_job_rate, 140 | small_job_chance, 141 | mem_dominant_parameters.small, 142 | mem_dominant_parameters.large, 143 | 
runtime_estimates=runtime_estimates, 144 | estimate_parameters=estimate_parameters, 145 | ), 146 | ) 147 | 148 | return DeepRmWorkloadGenerator( 149 | *generators[: (1 if ignore_memory else None)] 150 | ) 151 | 152 | 153 | class SyntheticWorkloadGenerator(wl.TraceGenerator): 154 | """A synthetic workload generator based on realistic models.""" 155 | 156 | def __init__( 157 | self, 158 | length, 159 | nodes, 160 | start_time=8, 161 | random_seed=0, 162 | restart=False, 163 | uniform_proportion=0.95, 164 | cdf_break=0.5, 165 | runtime_estimates=None, 166 | estimate_parameters=None, 167 | ): 168 | """Synthetic workload generator based on Lublin's work. 169 | 170 | Parameters 171 | ---------- 172 | length : int 173 | number of jobs to generate 174 | nodes : int 175 | number of compute nodes in the system 176 | start_time : int 177 | hour of day in which to start simulation 178 | random_seed : int 179 | random seed to use to generate jobs 180 | restart : bool 181 | whether to restart after a sample finishes 182 | uniform_proportion : float 183 | tunes the proportion between the first and second uniform 184 | distributions in the two-stage uniform process 185 | cdf_break : float 186 | where to place the break between the lower and upper limits of 187 | the CDF. A value closer to 0 will (tend to) produce bigger jobs, 188 | while a value closer to 1 will (tend to) produce smaller jobs 189 | runtime_estimates : {'gaussian', 'tsafrir', None} 190 | whether to include runtime estimates and, if so, the method 191 | used to compute them: 192 | * None generates perfect estimates (estimates equal run time) 193 | * 'gaussian' generates estimates with zero-mean Gaussian noise 194 | added to them 195 | * 'tsafrir' uses Dan Tsafrir's model of user runtime estimates 196 | to generate estimates 197 | estimate_parameters : Union[float, List[Tuple[float, float]]] 198 | the parameters used for generating user estimates. 199 | Depends on ``runtime_estimates``. 200 | When `runtime_estimates` is 'gaussian', this is a single 201 | floating-point number that sets the standard deviation of the 202 | noise. 203 | When `runtime_estimates` is 'tsafrir', this is a list of 204 | floating-point pairs specifying a histogram (time, number of 205 | jobs) of job runtime popularity.
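A minimal construction, for illustration only (argument values here
are arbitrary):

>>> generator = SyntheticWorkloadGenerator(length=100, nodes=64)
>>> len(generator.trace)  # doctest: +SKIP
100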
206 | """ 207 | random.seed(random_seed) 208 | 209 | self.lublin = Lublin99(False, random_seed, length) 210 | self.lublin.start = start_time 211 | self.random_seed = random_seed 212 | self.nodes = nodes 213 | 214 | uniform_low_prob = 0.8 215 | log2_size = log2(nodes) 216 | min_umed = log2_size - 3.5 217 | max_umed = log2_size - 1.5 218 | breaking_point = cdf_break * min_umed + (1 - cdf_break) * max_umed 219 | 220 | self.lublin.setParallelJobProbabilities( 221 | False, 222 | uniform_low_prob, 223 | breaking_point, 224 | log2_size, 225 | uniform_proportion, 226 | ) 227 | 228 | self.runtime_estimates = runtime_estimates 229 | self.estimate_parameters = estimate_parameters 230 | 231 | trace = self.refresh_jobs() 232 | super().__init__(restart, trace) 233 | 234 | def refresh_jobs(self): 235 | """Refreshes the underlying job list.""" 236 | jobs = self.lublin.generate() 237 | if self.runtime_estimates: 238 | if self.runtime_estimates == 'tsafrir': 239 | if self.estimate_parameters is not None: 240 | warnings.warn( 241 | 'Setting tsafrir parameters is currently unsupported' 242 | ) 243 | tsafrir = Tsafrir05(jobs) 244 | jobs = tsafrir.generate(jobs) 245 | elif self.runtime_estimates == 'gaussian': 246 | for j in jobs: 247 | j.reqTime = math.ceil( 248 | random.gauss( 249 | j.runTime, self.estimate_parameters * j.runTime 250 | ) 251 | ) 252 | if j.reqTime < 1: 253 | j.reqTime = 1 254 | else: 255 | raise ValueError( 256 | f'Unsupported estimate type {self.runtime_estimates}' 257 | ) 258 | 259 | self.trace = [job.Job.from_swf_job(j) for j in jobs] 260 | return self.trace 261 | 262 | 263 | def build(workload_config: dict): 264 | type = workload_config['type'] 265 | kwargs = {k: v for k, v in workload_config.items() if k != 'type'} 266 | if type == 'deeprm': 267 | return DeepRmWorkloadGenerator.build(**kwargs) 268 | elif type == 'lublin': 269 | return SyntheticWorkloadGenerator(**kwargs) 270 | else: 271 | raise RuntimeError(f'Unsupported workload model type {type} requested') 272 | -------------------------------------------------------------------------------- /schedgym/job.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """job - Classes for jobs in the simulator. 5 | """ 6 | 7 | import enum 8 | 9 | import random 10 | import warnings 11 | 12 | from collections import namedtuple 13 | 14 | from .resource import Resource, PrimaryResource 15 | 16 | JobState = namedtuple( 17 | 'JobState', 18 | [ 19 | 'submission_time', 20 | 'requested_time', 21 | 'requested_memory', 22 | 'requested_processors', 23 | 'queue_size', 24 | 'queued_work', 25 | 'free_processors', 26 | ], 27 | ) 28 | 29 | 30 | class JobStatus(enum.IntEnum): 31 | """An enumeration for different states of a job within our simulator.""" 32 | 33 | SUBMITTED = 0 34 | RUNNING = 1 35 | WAITING = 2 36 | COMPLETED = 3 37 | SCHEDULED = 4 38 | 39 | 40 | class SwfJobStatus(enum.IntEnum): 41 | """An enumeration for different states of a job in the SWF_. 42 | 43 | .. _SWF: https://www.cs.huji.ac.il/labs/parallel/workload/swf.html 44 | """ 45 | 46 | FAILED = 0 47 | COMPLETED = 1 48 | PARTIAL_TO_BE_CONTINUED = 2 49 | PARTIAL_LAST_COMPLETED = 3 50 | PARTIAL_LAST_FAILED = 4 51 | CANCELLED = 5 52 | MEANINGLESS = -1 53 | 54 | 55 | class Job: 56 | """A job in the system. 57 | 58 | This follows the fields of the `Standard Workload Format 59 | `_ with a couple 60 | of helper methods to compute slowdown and bounded slowdown of a job. 
The 61 | initializer arguments follow the same ordering and have the same meaning 62 | as those in the SWF description. 63 | 64 | This makes use of the :class:`schedgym.resource.Resource` class to keep 65 | track of the resources assigned to the job. Resource assignment itself is 66 | performed by 67 | :func:`schedgym.scheduler.scheduler.Scheduler.assign_schedule`. 68 | 69 | The figure below shows the relationship between jobs, resources, and the 70 | basic data structure for resource management (`IntervalTree`). 71 | 72 | .. image:: /img/job-resource.svg 73 | """ 74 | 75 | resources: Resource 76 | 77 | SWF_JOB_MAP = { 78 | 'jobId': 'id', 79 | 'submissionTime': 'submission_time', 80 | 'waitTime': 'wait_time', 81 | 'runTime': 'execution_time', 82 | 'allocProcs': 'processors_allocated', 83 | 'avgCpuUsage': 'average_cpu_use', 84 | 'usedMem': 'memory_use', 85 | 'reqProcs': 'requested_processors', 86 | 'reqTime': 'requested_time', 87 | 'reqMem': 'requested_memory', 88 | 'status': 'status', 89 | 'userId': 'user_id', 90 | 'groupId': 'group_id', 91 | 'executable': 'executable', 92 | 'queueNum': 'queue_number', 93 | 'partNum': 'partition_number', 94 | 'precedingJob': 'preceding_job_id', 95 | 'thinkTime': 'think_time', 96 | } 97 | 98 | def __init__( 99 | self, 100 | job_id=-1, 101 | submission_time=-1, 102 | execution_time=-1, 103 | processors_allocated=-1, 104 | average_cpu_use=-1, 105 | memory_use=-1, 106 | requested_processors=-1, 107 | requested_time=-1, 108 | requested_memory=-1, 109 | status=-1, 110 | user_id=-1, 111 | group_id=-1, 112 | executable=-1, 113 | queue_number=-1, 114 | partition_number=-1, 115 | preceding_job_id=-1, 116 | think_time=-1, 117 | wait_time=-1, 118 | ignore_memory=True, 119 | ): 120 | self.id: int = job_id 121 | self.submission_time: int = submission_time 122 | self.execution_time: int = execution_time 123 | self.requested_time: int = requested_time 124 | self.requested_processors: int = requested_processors 125 | self.processors_allocated: int = processors_allocated 126 | self.average_cpu_use: int = average_cpu_use 127 | self.memory_use: int = memory_use 128 | self.requested_memory: int = requested_memory 129 | self.status: JobStatus = status 130 | self.user_id: int = user_id 131 | self.group_id: int = group_id 132 | self.executable: int = executable 133 | self.queue_number: int = queue_number 134 | self.partition_number: int = partition_number 135 | self.preceding_job_id: int = preceding_job_id 136 | self.think_time = think_time 137 | self.wait_time = wait_time 138 | 139 | self.resources = Resource() 140 | self.first_scheduling_promise: int = -1 141 | self.start_time: int = -1 142 | self.finish_time: int = -1 143 | self.ignore_memory = ignore_memory 144 | self.slot_position: int = -1 145 | self.free_processors = -1 146 | self.queued_work = -1 147 | self.queue_size = -1 148 | 149 | def __str__(self): 150 | return ( 151 | f'Job<{self.id}, {self.status.name}, start={self.start_time}, ' 152 | f'processors={self.requested_processors}, ' 153 | f'memory={self.requested_memory}, ' 154 | f'duration={self.execution_time}>' 155 | ) 156 | 157 | __repr__ = __str__ 158 | 159 | @property 160 | def proper(self): 161 | """Checks whether this job is a proper job with assigned resources. 162 | 163 | Returns: 164 | bool: True if the job is proper, and False otherwise.
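For example (illustrative values): a job with
``requested_processors == 4`` becomes proper only once intervals
covering exactly four processors have been assigned to its
``resources``.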
165 | """ 166 | processors, memory = self.resources.measure() 167 | return processors == self.requested_processors and ( 168 | self.ignore_memory or memory == self.requested_memory 169 | ) 170 | 171 | @property 172 | def slowdown(self): 173 | """Computes the slowdown of the current job.""" 174 | if self.finish_time < 0: 175 | warnings.warn( 176 | f'Failed to obtain slowdown for job {self}. ' 177 | 'It may not have finished yet.' 178 | ) 179 | return -1 180 | return ( 181 | self.finish_time - self.submission_time 182 | ) / self.execution_time 183 | 184 | @property 185 | def bounded_slowdown(self): 186 | """Gives the bounded slowdown of a job""" 187 | if self.finish_time < 0: 188 | warnings.warn( 189 | f'Failed to obtain avg bounded slowdown for job {self}.' 190 | 'It may not have finished yet.' 191 | ) 192 | return -1 193 | return max( 194 | 1, 195 | (self.finish_time - self.submission_time) 196 | / max(10, self.execution_time), 197 | ) 198 | 199 | @property 200 | def swf(self): 201 | """Returns an SWF representation of this job""" 202 | return ( 203 | f'{self.id} {self.submission_time} {self.wait_time} ' 204 | f'{self.execution_time} {self.processors_allocated} ' 205 | f'{self.average_cpu_use} ' 206 | f'{self.memory_use} {self.requested_processors} ' 207 | f'{self.requested_time} {self.requested_memory} ' 208 | f'{self.swfstatus} {self.user_id} {self.group_id} ' 209 | f'{self.executable} {self.queue_number} ' 210 | f'{self.partition_number} {self.preceding_job_id} ' 211 | f'{self.think_time}' 212 | ) 213 | 214 | @property 215 | def swfstatus(self): 216 | """Returns the job status in the format expected by the SWF.""" 217 | if self.status == JobStatus.COMPLETED: 218 | return SwfJobStatus.COMPLETED 219 | return SwfJobStatus.MEANINGLESS 220 | 221 | @staticmethod 222 | def from_swf_job(swf_job): 223 | """Converts an SWF job to our internal job format.""" 224 | new_job = Job() 225 | for key, value in Job.SWF_JOB_MAP.items(): 226 | tmp = getattr(swf_job, key) 227 | setattr(new_job, value, int(tmp) if 'time' in value else tmp) 228 | 229 | new_job.status = JobStatus.SUBMITTED 230 | new_job.requested_processors = new_job.processors_allocated 231 | if new_job.requested_time == -1: 232 | new_job.requested_time = new_job.execution_time 233 | 234 | return new_job 235 | 236 | @property 237 | def state(self): 238 | return JobState( 239 | self.submission_time, 240 | self.requested_time, 241 | self.requested_memory, 242 | self.requested_processors, 243 | self.queue_size, 244 | self.queued_work, 245 | self.free_processors, 246 | ) 247 | 248 | 249 | class JobParameters: 250 | """Class for using with generative models for job creation. 251 | 252 | Assumes two types of jobs: 253 | 1. "Small" jobs and 254 | 2. "Large" jobs 255 | 256 | A job has probability s of being small and (1-s) of being large. 257 | 258 | Moreover, jobs have a dominant resource to distinguish between CPU-bound 259 | and I/O bound jobs, with probability of being either CPU-bound and I/O 260 | bound 261 | 0.5. 262 | 263 | A user of this class must specify all bounds. 
264 | 265 | Parameters 266 | ---------- 267 | lower_time_bound : int 268 | The minimum time a job will run for 269 | upper_time_bound : int 270 | The maximum time a job will run for 271 | lower_cpu_bound : int 272 | The minimum number of processors a job will consume 273 | upper_cpu_bound : int 274 | The maximum number of processors a job will consume 275 | lower_mem_bound : int 276 | The minimum amount of memory a job will consume 277 | upper_mem_bound : int 278 | The maximum amount of memory a job will consume 279 | 280 | Used by :class:`schedgym.workload.distribution.BinomialWorkloadGenerator`. 281 | """ 282 | 283 | lower_time_bound: int 284 | upper_time_bound: int 285 | lower_cpu_bound: int 286 | upper_cpu_bound: int 287 | 288 | @staticmethod 289 | def _validate_parameters(*args): 290 | for param in args: 291 | if param <= 0: 292 | raise AssertionError( 293 | 'Unable to work with non-positive bounds.' 294 | ) 295 | 296 | def __init__( 297 | self, 298 | lower_time_bound: int, 299 | upper_time_bound: int, 300 | lower_cpu_bound: int, 301 | upper_cpu_bound: int, 302 | lower_mem_bound: int, 303 | upper_mem_bound: int, 304 | ): 305 | self._validate_parameters( 306 | lower_time_bound, 307 | upper_time_bound, 308 | lower_cpu_bound, 309 | upper_cpu_bound, 310 | lower_mem_bound, 311 | upper_mem_bound, 312 | ) 313 | 314 | self.lower_time_bound = lower_time_bound 315 | self.upper_time_bound = upper_time_bound 316 | self.lower_cpu_bound = lower_cpu_bound 317 | self.upper_cpu_bound = upper_cpu_bound 318 | self.lower_mem_bound = lower_mem_bound 319 | self.upper_mem_bound = upper_mem_bound 320 | 321 | self.resource_samplers = { 322 | PrimaryResource.CPU: lambda: random.randint( 323 | self.lower_cpu_bound, self.upper_cpu_bound 324 | ), 325 | PrimaryResource.MEMORY: lambda: random.randint( 326 | self.lower_mem_bound, self.upper_mem_bound 327 | ), 328 | } 329 | 330 | self.job_id = 1 331 | self.time_step = 0 332 | 333 | def add_time(self, steps: int = 1) -> None: 334 | """Increments time in the internal counter.""" 335 | if steps < 0: 336 | raise AssertionError("Time can't be negative.") 337 | self.time_step += steps 338 | 339 | def sample(self, submission_time: int = 0) -> Job: 340 | """Samples a new job. 341 | 342 | Parameters 343 | ---------- 344 | submission_time : int 345 | The time at which the new sampled job would have been 346 | submitted. If omitted, the current time step is used.
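A minimal usage sketch (the bounds below are arbitrary, chosen only
for illustration):

>>> params = JobParameters(1, 10, 1, 4, 1, 4)
>>> j = params.sample(submission_time=5)  # doctest: +SKIP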
347 | """ 348 | time_duration = random.randint( 349 | self.lower_time_bound, self.upper_time_bound 350 | ) 351 | 352 | cpu = self.resource_samplers[PrimaryResource.CPU]() 353 | mem = self.resource_samplers[PrimaryResource.MEMORY]() 354 | 355 | job = Job( 356 | self.job_id, 357 | submission_time if submission_time else self.time_step, 358 | time_duration, 359 | cpu, 360 | 0, 361 | mem, 362 | cpu, 363 | time_duration, 364 | mem, 365 | JobStatus.WAITING, 366 | 1, 367 | 1, 368 | 1, 369 | 1, 370 | 1, 371 | -1, 372 | -1, 373 | -1, 374 | ) 375 | self.job_id += 1 376 | 377 | return job 378 | -------------------------------------------------------------------------------- /deeprm-agent.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from collections import namedtuple, defaultdict 5 | 6 | import argparse 7 | 8 | import os 9 | import gym 10 | import json 11 | import pickle 12 | import numpy as np 13 | from typing import List 14 | from pathlib import Path 15 | from collections import OrderedDict 16 | 17 | import schedgym.envs as deeprm 18 | 19 | from numpy.lib.stride_tricks import as_strided 20 | 21 | import torch 22 | import torch.nn as nn 23 | import torch.optim as optim 24 | import torch.nn.functional as F 25 | import torch.utils.data as data 26 | import torch.multiprocessing as mp 27 | from torch.distributions import Categorical 28 | 29 | from torch.utils.tensorboard.writer import SummaryWriter 30 | 31 | SLOTS: int = 10 32 | BACKLOG: int = 60 33 | TIME_LIMIT: int = 50 34 | TIME_HORIZON: int = 20 35 | PARALLEL_WORKERS: int = 20 36 | TRAINING_ITERATIONS: int = 6 37 | OPTIMIZERS = { 38 | 'adam': lambda model, args: optim.Adam(model.parameters(), lr=args.lr), 39 | 'rmsprop': lambda model, args: optim.RMSprop(model.parameters(), lr=args.lr, momentum=args.momentum), 40 | } 41 | 42 | TMPDIR = Path(f'/run/user/{os.getuid()}') 43 | Experience = namedtuple( 44 | 'Experience', 45 | field_names='state action reward'.split() 46 | ) 47 | 48 | 49 | class PGNet(nn.Module): 50 | def __init__(self, env): 51 | super().__init__() 52 | 53 | self.input_height = env.observation_space.shape[0] 54 | self.input_width = env.observation_space.shape[1] 55 | self.output_size = env.action_space.n 56 | 57 | self.nn = nn.Sequential(OrderedDict([ 58 | ('fc1', nn.Linear(self.input_height * self.input_width, 512)), 59 | ('relu1', nn.ReLU()), 60 | ('fc2', nn.Linear(512, 256)), 61 | ('relu2', nn.ReLU()), 62 | ])) 63 | self.out = nn.Linear(256, self.output_size) 64 | 65 | def forward(self, x): 66 | x = x.view(-1, self.input_height * self.input_width) 67 | x = self.nn(x) 68 | scores = self.out(x) 69 | return F.softmax(scores, dim=1) 70 | 71 | def select_action(self, state, device='cpu'): 72 | state = torch.from_numpy(state).float().unsqueeze(0).to(device) 73 | probs = self(state) 74 | mass = Categorical(probs) 75 | action = mass.sample() 76 | return action.item() 77 | 78 | def log_prob(self, state, action, device='cpu'): 79 | state = state.float() 80 | action = action.float() 81 | probs = self(state).view((action.shape[0], action.shape[1], -1)) 82 | mass = Categorical(probs) 83 | return mass.log_prob(action), mass.entropy() 84 | 85 | class Callback(object): 86 | def __call__(self, score) -> None: 87 | raise NotImplementedError 88 | 89 | 90 | class ReduceLROnPlateau(Callback): 91 | def __init__(self, patience, rate, args, minimum=None, negate_score=True): 92 | self.patience = patience 93 | self.args = args 94 | self.rate = rate 95 | 
self.counter = 0 96 | self.best_score = None 97 | self.minimum = minimum 98 | self.negate_score = negate_score 99 | 100 | def __call__(self, score): 101 | if self.negate_score: 102 | score = -score 103 | if self.best_score is None: 104 | self.best_score = score 105 | elif score <= self.best_score: 106 | self.counter += 1 107 | if self.counter >= self.patience: 108 | self.counter = 0 109 | print( 110 | f'Reducing learning rate from {self.args.lr} ' 111 | f'to {self.args.lr * self.rate} ' 112 | f'(best score was {self.best_score})' 113 | ) 114 | tmp = self.args.lr * self.rate 115 | if self.minimum and tmp < self.minimum: 116 | tmp = self.minimum 117 | self.args.lr = tmp 118 | else: 119 | self.best_score = score 120 | self.counter = 0 121 | 122 | 123 | def make_discount_array(gamma, timesteps): 124 | vals = np.zeros(2 * timesteps - 1) 125 | vals[timesteps - 1:] = gamma ** np.arange(timesteps) 126 | return as_strided( 127 | vals[timesteps - 1:], 128 | shape=(timesteps, timesteps), 129 | strides=(-vals.strides[0], vals.strides[0]), 130 | writeable=False 131 | ) 132 | 133 | 134 | def setup_environment(envname, wlkwargs) -> deeprm.DeepRmEnv: 135 | env: deeprm.DeepRmEnv = gym.make(envname, **wlkwargs) 136 | env.reset() 137 | 138 | return env 139 | 140 | 141 | def run_episode(env, model, max_episode_length, device='cpu'): 142 | trajectory = [] 143 | total_reward = 0 144 | state = env.reset() 145 | for _ in range(max_episode_length): 146 | action = model.select_action(state, device) 147 | next_state, reward, done, _ = env.step(action) 148 | exp = Experience(state, action, reward) 149 | trajectory.append(exp) 150 | total_reward += reward 151 | if done: 152 | break 153 | state = next_state 154 | return trajectory 155 | 156 | 157 | def compute_baselines(trajectories): 158 | returns = np.zeros((len(trajectories), max((len(traj) for traj in trajectories)))) 159 | for i in range(len(trajectories)): 160 | tmp = np.array([e.reward for e in trajectories[i]]) 161 | returns[i, :len(tmp)] = tmp 162 | return returns, returns.mean(axis=0) 163 | 164 | 165 | def run_episodes(rank, args, model, device, wlkwargs) -> List[List[Experience]]: 166 | np.random.seed(args.seed + rank) 167 | torch.manual_seed(args.seed + rank) 168 | env = setup_environment(args.envname, wlkwargs) 169 | 170 | return [run_episode(env, model, args.max_episode_length, device) 171 | for _ in range(args.trajectories_per_batch)] 172 | 173 | 174 | def run_episodes_pickle(rank, args, model, device, wlkwargs): 175 | trajectories = run_episodes(rank, args, model, device, wlkwargs) 176 | with open(TMPDIR / f'{rank}.pkl', 'wb') as fp: 177 | pickle.dump(trajectories, fp, pickle.HIGHEST_PROTOCOL) 178 | 179 | 180 | def train_one_epoch(rank, args, model, device, loss_queue, wlkwargs) -> None: 181 | """Trains the model for one epoch. 182 | 183 | This uses baselining in the REINFORCE algorithm. There are many ways to 184 | compute baselines. Examples: 185 | 186 | 1. The approach taken by DeepRM in the original paper, in which each 187 | timestep has its own baseline, which is computed as the average 188 | return for a trajectory. 189 | 2. Computing a global baseline for each trajectory in which it is the 190 | average return in that trajectory. 191 | 3. A global baseline computed as the average return over all 192 | trajectories. 193 | 194 | In this function, we follow 1., but nothing prevents us from using 2 or 3. 
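Concretely, for approach 1: in the code below, ``compute_baselines``
stacks the per-trajectory rewards into a matrix ``R`` (zero-padded
past each trajectory's end) and uses the column mean as the
per-timestep baseline, so the advantage of trajectory ``i`` at
timestep ``t`` is the discounted return ``G[i, t]`` minus the
(undiscounted) column mean ``mean_i R[i, t]``, with the baseline
masked to zero wherever ``R`` is zero.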
195 | """ 196 | # You might need to divide the learning rate by the number of workers 197 | 198 | optimizer = OPTIMIZERS[args.optimizer.lower()](model, args) 199 | 200 | optimizer.zero_grad() 201 | trajectories = run_episodes(rank, args, model, device, wlkwargs) 202 | 203 | rewards, baselines = compute_baselines(trajectories) 204 | baselines_mat = np.array([baselines 205 | for _ in range(args.trajectories_per_batch)]) 206 | baselines_mat = baselines_mat * (rewards != 0) 207 | discounts = make_discount_array(args.gamma, rewards.shape[1]) 208 | discounted_returns = (discounts @ rewards.T).T 209 | advantages = discounted_returns - baselines_mat 210 | 211 | policy_loss, entropy = [], [] 212 | for i, t in enumerate(trajectories): 213 | for j, e in enumerate(t): 214 | policy_loss.append(e.log_prob * advantages[i, j]) 215 | entropy.append(e.entropy()) 216 | 217 | policy_loss = torch.cat(policy_loss).sum() + torch.cat(entropy).sum() * args.entropy 218 | (-policy_loss).backward() 219 | optimizer.step() 220 | 221 | lengths = [len(t) for t in trajectories] 222 | loss_queue.put(( 223 | rank, policy_loss.clone().cpu().data.numpy(), 224 | advantages.mean(), advantages.std(), 225 | rewards.mean(), rewards.std(), 226 | discounted_returns.mean(), discounted_returns.std(), 227 | np.mean(lengths), np.std(lengths) 228 | )) 229 | 230 | 231 | def build_argument_parser(): 232 | parser = argparse.ArgumentParser(description='DeepRM training') 233 | parser.add_argument('--epochs', type=int, default=TRAINING_ITERATIONS, 234 | metavar='N', help='number of epochs to train') 235 | parser.add_argument('--workers', type=int, default=PARALLEL_WORKERS, 236 | metavar='N', help='number of workers to train') 237 | parser.add_argument('--seed', type=int, default=42, 238 | metavar='S', help='random seed to use') 239 | parser.add_argument('--lr', type=float, default=1e-2, metavar='LR', 240 | help='Learning rate for gradient ascent') 241 | parser.add_argument('--momentum', type=float, default=0.99, metavar='LR', 242 | help='momentum for gradient ascent') 243 | parser.add_argument('--cuda', action='store_true', default=False, 244 | help='enables training with CUDA') 245 | parser.add_argument('--envname', type=str, default='DeepRM-v0', 246 | help='OpenAI Gym environment to use') 247 | parser.add_argument('--max-episode-length', type=int, default=200, 248 | metavar='N', help='Maximum number of timesteps in episode') 249 | parser.add_argument('--trajectories-per-batch', type=int, default=200, 250 | metavar='N', help='Number of trajectories in a batch') 251 | parser.add_argument('--gamma', type=float, default=0.99, metavar='γ', 252 | help='Discount factor') 253 | parser.add_argument('--debug', action='store_true', default=False) 254 | parser.add_argument('--load', type=str, default=None, metavar='PATH', 255 | help='Loads a previously-trained model') 256 | parser.add_argument('--optimizer', type=str, default='adam', 257 | help='optimizer to use') 258 | parser.add_argument('--workload', type=str, default=None, 259 | help='Path to a workload configuration file') 260 | parser.add_argument('--entropy', type=float, default=0., 261 | help='entropy regularization factor') 262 | return parser 263 | 264 | 265 | def main(): 266 | args = build_argument_parser().parse_args() 267 | 268 | use_cuda = args.cuda and torch.cuda.is_available() 269 | device = torch.device('cuda' if use_cuda else 'cpu') 270 | 271 | torch.manual_seed(args.seed) 272 | mp.set_start_method('spawn') 273 | 274 | if args.workload is None: 275 | wlkwargs = {} 276 | else: 277 | 
with open(args.workload) as fp: 278 | wlkwargs = json.load(fp) 279 | 280 | model = PGNet(setup_environment(args.envname, wlkwargs)).to(device) 281 | if args.load is not None: 282 | model.load_state_dict(torch.load(args.load)) 283 | model.share_memory() 284 | 285 | writer = SummaryWriter() 286 | loss_queue = mp.Queue() 287 | 288 | callbacks = [ReduceLROnPlateau(500, .5, args, 1e-5, negate_score=True)] 289 | train_synchronous_parallel(args, callbacks, device, loss_queue, model, writer, wlkwargs) 290 | 291 | writer.close() 292 | torch.save(model.state_dict(), 'policy.pth') 293 | 294 | 295 | def train_synchronous_parallel(args, callbacks, device, loss_queue, model, writer, wlkwargs): 296 | for epoch in range(args.epochs): 297 | print(f'Current epoch: {epoch}') 298 | losses = [] 299 | if args.debug: 300 | train_one_epoch(0, args, model, device, loss_queue, wlkwargs) 301 | else: 302 | with mp.Pool(processes=args.workers) as pool: 303 | pool.starmap_async( 304 | run_episodes_pickle, 305 | [(i, args, model, device, wlkwargs) for i in range(args.workers)], 306 | 1 307 | ).get() 308 | 309 | fps = [open(TMPDIR / f'{i}.pkl', 'rb') for i in range(args.workers)] 310 | ret = [pickle.load(fp) for fp in fps] 311 | for fp in fps: fp.close() 312 | 313 | optimizer = OPTIMIZERS[args.optimizer.lower()](model, args) 314 | optimizer.zero_grad() 315 | 316 | trajectories = [e for l in ret for e in l] 317 | rewards, baselines = compute_baselines(trajectories) 318 | baselines_mat = np.array([baselines 319 | for _ in range(len(trajectories))]) 320 | baselines_mat = baselines_mat * (rewards != 0) 321 | discounts = make_discount_array(args.gamma, rewards.shape[1]) 322 | discounted_returns = (discounts @ rewards.T).T 323 | advantages = discounted_returns - baselines_mat 324 | 325 | states = [[e.state for e in t] for t in trajectories] 326 | actions = [[e.action for e in t] for t in trajectories] 327 | maxlen = max((len(s) for s in states)) 328 | for s, a in zip(states, actions): 329 | s += [np.zeros_like(s[0])] * (maxlen - len(s)) 330 | a += [np.zeros_like(a[0])] * (maxlen - len(a)) 331 | 332 | def compute_loss(model, states, actions, advantages, device): 333 | states, actions, advantages = [torch.from_numpy(t) for t in (states, actions, advantages)] 334 | dataset = data.TensorDataset( 335 | states, actions, advantages 336 | ) 337 | loader = data.DataLoader( 338 | dataset, batch_size=64, shuffle=False 339 | ) 340 | loss = 0 341 | for state, action, advantage in loader: 342 | l, e = model.log_prob( 343 | state.to(device), action.to(device), device 344 | ) 345 | loss += (l * advantage.to(device)).sum() 346 | loss += (e * args.entropy).sum() 347 | return loss 348 | 349 | policy_loss = compute_loss( 350 | model, 351 | np.array(states), 352 | np.array(actions), 353 | np.array(advantages), 354 | device 355 | ) 356 | (-policy_loss).backward() 357 | optimizer.step() 358 | 359 | lengths = [len(t) for t in trajectories] 360 | loss_queue.put(( 361 | 0, policy_loss.clone().cpu().data.numpy(), 362 | advantages.mean(), advantages.std(), 363 | rewards.mean(), rewards.std(), 364 | discounted_returns.mean(), discounted_returns.std(), 365 | np.mean(lengths), np.std(lengths) 366 | )) 367 | 368 | for name, param in model.named_parameters(): 369 | writer.add_histogram(name, param.clone().cpu().data.numpy(), epoch) 370 | 371 | losses, extras = [], defaultdict(list) 372 | features = 'ardl' 373 | while not loss_queue.empty(): 374 | rank, loss, *extra = loss_queue.get() 375 | print( 376 | f'Loss for worker {rank} on epoch {epoch}: {loss}' 377 | ) 378 |
losses.append(loss) 379 | for i, feature in enumerate(features): 380 | extras[f'{feature}μ'].append(extra[i * 2]) 381 | extras[f'{feature}σ'].append(extra[i * 2 + 1]) 382 | writer.add_scalar(f'{feature}μ/{rank}', extra[i * 2], epoch) 383 | writer.add_scalar(f'{feature}σ/{rank}', extra[i * 2 + 1], epoch) 384 | print( 385 | 'Loss for epoch {}: {}±{}'.format(epoch, np.mean(losses), np.std(losses)) 386 | ) 387 | writer.add_scalar('loss', np.mean(losses), epoch) 388 | for i, feature in enumerate(features): 389 | writer.add_scalar(f'{feature}μ', np.mean(extras[f'{feature}μ']), epoch) 390 | writer.add_scalar(f'{feature}σ', np.mean(extras[f'{feature}σ']), epoch) 391 | writer.add_scalar('α', args.lr, epoch) 392 | for callback in callbacks: 393 | callback(np.mean(losses)) 394 | 395 | writer.flush() 396 | torch.save(model.state_dict(), f'checkpoint/policy-{epoch}.pth') 397 | 398 | 399 | if __name__ == '__main__': 400 | main() 401 | -------------------------------------------------------------------------------- /schedgym/scheduler/scheduler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """scheduler - Module with basic scheduling functionality. 5 | 6 | This is the core of the simulator, since this module contains functionality 7 | that interacts with all other components. 8 | """ 9 | 10 | from abc import ABC, abstractmethod 11 | from collections import defaultdict 12 | from typing import ( 13 | List, 14 | Iterable, 15 | Tuple, 16 | Dict, 17 | Any, 18 | Union, 19 | NamedTuple, 20 | Optional, 21 | ) 22 | 23 | import collections.abc 24 | 25 | import numpy as np 26 | 27 | from schedgym.cluster import Cluster 28 | from schedgym.job import Job, JobStatus, Resource 29 | from schedgym.event import JobEvent, EventType, EventQueue 30 | 31 | 32 | class Stats(NamedTuple): 33 | """A named tuple with scheduling statistics""" 34 | 35 | utilization: float 36 | load: float 37 | slowdown: float 38 | makespan: float 39 | bsld: float 40 | 41 | 42 | class Scheduler(ABC): 43 | # pylint: disable=too-many-instance-attributes 44 | # pylint: disable=too-many-public-methods 45 | """Base class for scheduling. 46 | 47 | This class implements the core scheduling primitives common to all 48 | schedulers, and it also manages the "connection" with the Cluster 49 | objects under its control.
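A typical interaction, as a minimal sketch (``MyScheduler`` stands in
for a hypothetical concrete subclass implementing :func:`schedule`,
and ``job`` for a :class:`schedgym.job.Job`):

>>> sched = MyScheduler(number_of_processors=10, total_memory=10)
>>> sched.submit(job)  # doctest: +SKIP
>>> sched.step()  # doctest: +SKIP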
50 | 51 | Internally, the scheduler manages four general "queues": 52 | * Admission: For jobs that have been submitted, but about which the 53 | scheduler hasn't made a decision yet 54 | * Waiting: For jobs for which the scheduler has already generated a 55 | schedule, but that haven't started yet 56 | * Running: For jobs that have started execution, but haven't 57 | finished yet 58 | * Completed: For jobs that have finished execution 59 | 60 | Parameters 61 | ---------- 62 | number_of_processors : int 63 | The number of processors in the system 64 | total_memory : int 65 | The amount of memory in the system 66 | ignore_memory : bool 67 | Whether memory should be ignored when making decisions, or not 68 | """ 69 | 70 | used_memory: int 71 | current_time: int 72 | total_memory: int 73 | used_processors: int 74 | need_schedule_call: bool 75 | number_of_processors: int 76 | queue_waiting: List[Job] 77 | queue_running: List[Job] 78 | queue_admission: List[Job] 79 | queue_completed: List[Job] 80 | cluster: Cluster 81 | job_events: EventQueue[JobEvent] 82 | stats: Dict[int, Stats] 83 | 84 | def __init__( 85 | self, number_of_processors, total_memory, ignore_memory=False 86 | ): 87 | self.number_of_processors = number_of_processors 88 | self.total_memory = total_memory 89 | 90 | self.queue_waiting = [] 91 | self.queue_running = [] 92 | self.queue_completed = [] 93 | self.queue_admission = [] 94 | 95 | self.stats = {} 96 | self.used_memory = 0 97 | self.current_time = 0 98 | self.used_processors = 0 99 | self.ignore_memory = ignore_memory 100 | self.job_events = EventQueue(self.current_time - 1) 101 | self.cluster = Cluster( 102 | number_of_processors, total_memory, ignore_memory 103 | ) 104 | self.need_schedule_call = False 105 | 'Tracks whether we might need to schedule jobs' 106 | 107 | @property 108 | def all_jobs(self) -> List[Job]: 109 | """Returns a list of all the jobs that ever got into the system""" 110 | return ( 111 | self.queue_completed 112 | + self.queue_running 113 | + self.queue_waiting 114 | + self.queue_admission 115 | ) 116 | 117 | @property 118 | def slowdown(self) -> List[float]: 119 | """Returns the slowdown of all completed jobs""" 120 | return [j.slowdown for j in self.queue_completed] 121 | 122 | @property 123 | def jobs_in_system(self) -> List[Job]: 124 | """Returns a list with all the jobs that haven't completed yet""" 125 | return self.queue_running + self.queue_waiting + self.queue_admission 126 | 127 | @property 128 | def makespan(self) -> int: 129 | """Computes the makespan of all finished jobs""" 130 | return max([0] + [j.finish_time for j in self.queue_completed]) 131 | 132 | @property 133 | def load(self) -> float: 134 | """Computes the current load in the system. 135 | 136 | The load is the ratio between the number of requested processors and 137 | the number of processors in the system. 138 | """ 139 | requested_processors = sum( 140 | [j.requested_processors for j in self.jobs_in_system] 141 | ) 142 | return requested_processors / self.number_of_processors 143 | 144 | @property 145 | def utilization(self) -> float: 146 | """Instant processor utilization.""" 147 | return self.used_processors / self.number_of_processors 148 | 149 | @property 150 | def bounded_slowdown(self) -> List[float]: 151 | """Computes the bounded slowdown for all completed jobs""" 152 | return [j.bounded_slowdown for j in self.queue_completed] 153 | 154 | def _start_running(self, j: Job) -> None: 155 | """Starts running job `j`.
156 | 157 | Parameters 158 | ---------- 159 | j : Job 160 | The job to start running 161 | """ 162 | self.queue_waiting.remove(j) 163 | self.queue_running.append(j) 164 | 165 | j.status = JobStatus.RUNNING 166 | self.used_memory += j.memory_use 167 | self.used_processors += j.processors_allocated 168 | j.wait_time = j.start_time - j.submission_time 169 | 170 | def _complete_job(self, j: Job) -> None: 171 | """Marks a job as completed. 172 | 173 | Parameters 174 | ---------- 175 | j : Job 176 | The job to mark completed 177 | """ 178 | self.queue_running.remove(j) 179 | self.queue_completed.append(j) 180 | 181 | j.status = JobStatus.COMPLETED 182 | j.finish_time = j.start_time + j.execution_time 183 | self.used_memory -= j.memory_use 184 | self.used_processors -= j.processors_allocated 185 | 186 | def _add_job_events( 187 | self, job: Job, time: int 188 | ) -> Tuple[JobEvent, JobEvent]: 189 | """Adds start and finish events for a job to the current events. 190 | 191 | Parameters 192 | ---------- 193 | job : Job 194 | The job whose events are to be added to the system 195 | time : int 196 | The time step to associate the start event with 197 | """ 198 | if not job.resources or not job.proper: 199 | raise AssertionError( 200 | 'Malformed job submitted either with no processors, ' 201 | 'or with insufficient number of ' 202 | 'processors' 203 | ) 204 | start = JobEvent(time, EventType.JOB_START, job) 205 | finish = start.clone() 206 | finish.time += job.execution_time 207 | finish.type = EventType.JOB_FINISH 208 | self.job_events.add(start) 209 | self.job_events.add(finish) 210 | 211 | return start, finish 212 | 213 | @property 214 | def free_resources(self) -> Tuple[int, int]: 215 | """Returns the amount of free resources in the system.""" 216 | return ( 217 | self.number_of_processors - self.used_processors, 218 | self.total_memory - self.used_memory, 219 | ) 220 | 221 | def step(self, offset: int = None) -> bool: 222 | """Steps the simulation 223 | 224 | Parameters 225 | ---------- 226 | offset : int 227 | The number of time steps to take (must be >= 0) 228 | """ 229 | if offset is None: 230 | offset = 1 231 | if offset < 0: 232 | raise AssertionError('Tried to move backwards in time') 233 | 234 | scheduled = False 235 | for _ in range(offset): 236 | if self.need_schedule_call or ( 237 | self.queue_admission 238 | and self.job_events.first 239 | and self.job_events.first.time == self.current_time 240 | ): 241 | self.need_schedule_call = False 242 | scheduled = True 243 | self.schedule() 244 | present = self.job_events.step(1) 245 | self.cluster = self.play_events( 246 | present, self.cluster, update_queues=True 247 | ) 248 | self.current_time += 1 249 | return scheduled 250 | 251 | def play_events( 252 | self, 253 | events: Iterable[JobEvent], 254 | cluster: Cluster, 255 | update_queues: bool = False, 256 | ) -> Cluster: 257 | """Play events from a given event queue, updating state accordingly. 258 | 259 | On top of playing the events, this also updates job statistics, 260 | which can be queried at any given time. 261 | 262 | After execution, the current state of the cluster is returned. 263 | 264 | This method is used by a number of operations: both to find future 265 | schedules for jobs and to check whether a job can be added at a given 266 | time step. For this reason, an optional argument is included to define 267 | whether to update queues or not. 
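For example, replaying a ``JOB_START`` event for a job ``j``
allocates ``j``'s resources on ``cluster`` and, when
``update_queues`` is set, also moves ``j`` from the waiting queue to
the running queue (with ``JOB_FINISH`` doing the reverse).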
268 | 269 | Parameters 270 | ---------- 271 | events : Iterable[JobEvent] 272 | The events to play 273 | cluster : Cluster 274 | The cluster to operate on when playing events 275 | update_queues : bool 276 | Whether to update queues when job start and job finished events 277 | are found. 278 | """ 279 | for event in events: 280 | if event.type == EventType.JOB_START: 281 | cluster.allocate(event.job) 282 | if update_queues: 283 | self._start_running(event.job) 284 | self.update_stats() 285 | elif event.type == EventType.JOB_FINISH: 286 | cluster.free(event.job) 287 | if update_queues: 288 | self._complete_job(event.job) 289 | self.update_stats() 290 | else: 291 | raise RuntimeError('Unexpected event type found') 292 | return cluster 293 | 294 | @staticmethod 295 | def fits( 296 | time: int, job: Job, cluster: Cluster, events: Iterable[JobEvent] 297 | ) -> Resource: 298 | """Checks whether a job fits a given cluster at a given time. 299 | 300 | Once again, this requires an iterable of events and a cluster to 301 | operate on to check whether the job fits the cluster. 302 | 303 | Parameters 304 | ---------- 305 | time : int 306 | The time at which to check whether the job fits the cluster 307 | job : Job 308 | The job to check 309 | cluster : Cluster 310 | The cluster to operate on 311 | events : Iterable[JobEvent] 312 | The job events this scheduler will operate on 313 | Returns: 314 | Resource: The set of resources (when found) or an empty set of 315 | resources (when the job won't fit the cluster). 316 | """ 317 | return cluster.find_resources_at_time(time, job, events) 318 | 319 | def some_job_fits(self, job_slots: slice = slice(0, None)): 320 | """Checks whether any job in the admission queue fits *right now*.""" 321 | 322 | return any( 323 | [self.cluster.fits(j) for j in self.queue_admission[job_slots]] 324 | ) 325 | 326 | def can_schedule_now(self, job: Job) -> Resource: 327 | """Checks whether a job can be scheduled in the current cluster now. 328 | 329 | This is a special case of :func:`fits` in which we're operating right 330 | now with the current cluster. 331 | 332 | Parameters 333 | ---------- 334 | job : Job 335 | The job to check. 336 | """ 337 | cluster = self.cluster.clone() 338 | events = filter(lambda e: e.time <= self.current_time, self.job_events) 339 | for event in events: 340 | if event.type == EventType.JOB_START: 341 | cluster.allocate(event.job) 342 | elif event.type == EventType.JOB_FINISH: 343 | cluster.free(event.job) 344 | return cluster.find_resources_at_time( 345 | self.current_time, job, self.job_events 346 | ) 347 | 348 | def find_first_time_for(self, job: Job) -> Tuple[int, Resource]: 349 | """Finds the first time stamp on which we can start a job.
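It first tries the current time and, failing that, replays pending
job events in chronological order, returning the first time step at
which the cluster can accommodate the job.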
350 | 351 | Parameters 352 | ---------- 353 | job : Job 354 | The job to find a time for 355 | """ 356 | 357 | if (not self.job_events.next) or ( 358 | self.job_events.next.time > self.current_time 359 | ): 360 | resources = self.cluster.find_resources_at_time( 361 | self.current_time, job, self.job_events 362 | ) 363 | if resources: 364 | return self.current_time, resources 365 | 366 | near_future: Dict[int, List[JobEvent]] = defaultdict(list) 367 | for e in self.job_events: 368 | near_future[e.time].append(e) 369 | 370 | cluster = self.cluster.clone() 371 | for time in sorted(near_future): 372 | cluster = self.play_events(near_future[time], cluster) 373 | resources = cluster.find_resources_at_time( 374 | time, job, self.job_events 375 | ) 376 | if resources: 377 | return time, resources 378 | 379 | raise AssertionError( 380 | 'Failed to find time for job, even in the far future.' 381 | ) 382 | 383 | def submit(self, job: Union[Job, Iterable[Optional[Job]]]) -> None: 384 | """Submits a new job to the system. 385 | 386 | Parameters 387 | ---------- 388 | job : Union[Job, Iterable[Optional[Job]]] 389 | Can either be a single job, or an iterable of jobs. If 390 | an iterable, all jobs in it are submitted at the same 391 | time. 392 | """ 393 | if isinstance(job, collections.abc.Iterable): 394 | for j in job: 395 | self._submit(j) 396 | else: 397 | self._submit(job) 398 | self.need_schedule_call = True 399 | 400 | def _submit(self, job: Optional[Job]) -> None: 401 | """Internal implementation of job submission. 402 | 403 | Adds the new job to the `queue_admission` list and sets job status 404 | to `JobStatus.SUBMITTED`. 405 | """ 406 | if job is None: 407 | return 408 | 409 | if job.requested_processors > self.number_of_processors: 410 | raise RuntimeError( 411 | 'Impossible to allocate resources for job bigger than cluster.' 412 | ) 413 | job.submission_time = self.current_time 414 | job.status = JobStatus.SUBMITTED 415 | 416 | # Compute statistics to be used in state representation {{{ 417 | job.queue_size = len(self.queue_admission) 418 | job.queued_work = sum( 419 | [ 420 | j.requested_time * j.requested_processors 421 | for j in self.queue_admission 422 | ] 423 | ) 424 | job.free_processors = self.cluster.state[0][0] 425 | # }}} 426 | 427 | self.queue_admission.append(job) 428 | 429 | def state(self, timesteps: int, job_slots: int): 430 | """Returns the current state of the cluster as viewed by the scheduler.
431 | 432 | The state representation used here is deeply inspired by the DeepRM 433 | state representation, meaning it will return three blocks of 434 | information: 435 | * The current status of processors and memory used in the system 436 | * A select number of jobs in the admission queue 437 | * A "backlog" representing the presence or absence of jobs in the queue 438 | (for jobs that didn't make it into the previous representation) 439 | 440 | Parameters 441 | ---------- 442 | timesteps : int 443 | The number of time steps to look into the future 444 | job_slots : int 445 | The number of job slots to use (the amount of jobs in the 446 | admission queue to represent) 447 | """ 448 | # Gets all events between now and `timesteps` {{{ 449 | near_future: Dict[int, List[JobEvent]] = defaultdict(list) 450 | for e in filter( 451 | lambda e: e.time < self.current_time + timesteps + 1, 452 | self.job_events, 453 | ): 454 | near_future[e.time - self.current_time].append(e) 455 | # }}} 456 | 457 | # Gets the state representation of currently in use resources {{{ 458 | tmp = [] 459 | cluster = self.cluster.clone() 460 | for t in range(timesteps): 461 | if t in near_future: 462 | cluster = self.play_events(near_future[t], cluster) 463 | tmp.append(cluster.state) 464 | state = list(zip(*tmp)) 465 | if self.ignore_memory: 466 | state = state[:1] 467 | # }}} 468 | 469 | # Gets the representation of jobs in `job_slots` {{{ 470 | jobs = [ 471 | j.state 472 | for i, j in enumerate(self.queue_admission) 473 | if i < job_slots 474 | ] 475 | for i, job in enumerate(self.queue_admission): 476 | if i >= job_slots: 477 | break 478 | job.slot_position = i 479 | jobs += [Job().state for _ in range(job_slots - len(jobs))] 480 | # }}} 481 | 482 | # Gets the backlog {{{ 483 | backlog = max(len(self.queue_admission) - len(jobs), 0) 484 | # }}} 485 | 486 | return state, jobs, backlog 487 | 488 | def assign_schedule( 489 | self, job, resources, time 490 | ) -> Tuple[JobEvent, JobEvent]: 491 | """Assigns a schedule to a job. 492 | 493 | What this means is that the job is removed from the admission queue 494 | and is put into the "waiting" queue, which contains jobs that *will* 495 | run and already have a schedule. Also changes job status and assigns 496 | resources to a job, along with the time it will start running. 497 | 498 | Parameters 499 | ---------- 500 | job : Job 501 | The job to be assigned a schedule 502 | resources : Resource 503 | The set of resources the job will use 504 | time : int 505 | The start time of the job 506 | """ 507 | job.status = JobStatus.WAITING 508 | job.resources.memory = resources.memory 509 | job.resources.processors = resources.processors 510 | job.resources.ignore_memory = resources.ignore_memory 511 | job.start_time = time 512 | self.queue_waiting.append(job) 513 | return self._add_job_events(job, time) 514 | 515 | @abstractmethod 516 | def schedule(self) -> Any: 517 | """Schedules tasks.""" 518 | 519 | def update_stats(self) -> None: 520 | """Updates the usage statistics of the system. 521 | 522 | Statistics are only computed when job events happen in the cluster. 523 | """ 524 | self.stats[self.current_time] = Stats( 525 | self.utilization, 526 | self.load, 527 | np.mean(self.slowdown) if self.queue_completed else 0.0, 528 | self.makespan, 529 | np.mean(self.bounded_slowdown) if self.queue_completed else 0.0, 530 | ) 531 | --------------------------------------------------------------------------------