├── or_gym ├── version.py ├── envs │ ├── finance │ │ ├── __init__.py │ │ └── portfolio_opt.py │ ├── registry.py │ ├── classic_or │ │ ├── __init__.py │ │ ├── newsvendor.py │ │ ├── binpacking.py │ │ ├── vmpacking.py │ │ ├── tsp.py │ │ ├── vehicle_routing.py │ │ └── knapsack.py │ ├── supply_chain │ │ ├── __init__.py │ │ ├── inventory_management.py │ │ └── network_management.py │ ├── env_list.py │ └── __init__.py ├── __init__.py └── utils.py ├── .github └── workflows │ └── run_env_test.yml ├── pyproject.toml ├── LICENSE ├── setup.py ├── examples ├── ray_rllib_taxi_demo.py ├── ray_tests.py ├── rllib-validate-env.py ├── ray_rllib_knapsack.py ├── tf_orgym_examples.ipynb ├── ray_rllib_knapsack.ipynb ├── inv-management-quickstart.ipynb └── how-to-use-rl-to-improve-your-supply-chain.ipynb ├── tests ├── env_test.py └── rllib_test.py ├── .gitignore └── README.md /or_gym/version.py: -------------------------------------------------------------------------------- 1 | VERSION='0.5.0' -------------------------------------------------------------------------------- /or_gym/envs/finance/__init__.py: -------------------------------------------------------------------------------- 1 | from or_gym.envs.finance.portfolio_opt import PortfolioOptEnv 2 | -------------------------------------------------------------------------------- /or_gym/envs/registry.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import EnvRegistry 2 | 3 | registry = EnvRegistry() 4 | 5 | def register(id, **kwargs): 6 | return registry.register(id, **kwargs) 7 | 8 | def make(id, **kwargs): 9 | return registry.make(id, **kwargs) 10 | 11 | def spec(id): 12 | return registry.spec(id) -------------------------------------------------------------------------------- /or_gym/envs/classic_or/__init__.py: -------------------------------------------------------------------------------- 1 | from or_gym.envs.classic_or.knapsack import * 2 | from or_gym.envs.classic_or.binpacking import * 3 | from or_gym.envs.classic_or.vmpacking import * 4 | from or_gym.envs.classic_or.tsp import * 5 | from or_gym.envs.classic_or.vehicle_routing import VehicleRoutingEnv 6 | from or_gym.envs.classic_or.newsvendor import NewsvendorEnv -------------------------------------------------------------------------------- /or_gym/envs/supply_chain/__init__.py: -------------------------------------------------------------------------------- 1 | from or_gym.envs.supply_chain.inventory_management import InvManagementBacklogEnv 2 | from or_gym.envs.supply_chain.inventory_management import InvManagementLostSalesEnv 3 | from or_gym.envs.supply_chain.network_management import NetInvMgmtBacklogEnv 4 | from or_gym.envs.supply_chain.network_management import NetInvMgmtLostSalesEnv -------------------------------------------------------------------------------- /or_gym/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import warnings 4 | 5 | from gym import error 6 | from or_gym.version import VERSION as __version__ 7 | from or_gym.utils import * 8 | 9 | from gym.core import Env, Wrapper, ObservationWrapper, ActionWrapper, RewardWrapper 10 | from gym.envs import make, spec, register 11 | from or_gym.envs import classic_or, finance, supply_chain -------------------------------------------------------------------------------- /or_gym/envs/env_list.py: -------------------------------------------------------------------------------- 1 | ENV_LIST = 
['Newsvendor-v0', 2 | 'TSP-v0', 'TSP-v1', 3 | 'Knapsack-v0', 'Knapsack-v1', 'Knapsack-v2', 'Knapsack-v3', 4 | 'BinPacking-v0', 'BinPacking-v1', 'BinPacking-v2', 5 | 'BinPacking-v3', 'BinPacking-v4', 'BinPacking-v5', 6 | 'VMPacking-v0', 7 | 'InvManagement-v0', 'InvManagement-v1', 8 | 'NetworkManagement-v0', 'NetworkManagement-v1', 9 | 'PortfolioOpt-v0', 10 | 'VehicleRouting-v0'] -------------------------------------------------------------------------------- /.github/workflows/run_env_test.yml: -------------------------------------------------------------------------------- 1 | name: run-env-test 2 | on: [push] 3 | jobs: 4 | check-environments: 5 | runs-on: ubuntu-latest 6 | steps: 7 | - name: checkout repo content 8 | uses: actions/checkout@v3 9 | - name: setup python 10 | uses: actions/setup-python@v2 11 | with: 12 | python-version: 3.7 13 | - name: install or_gym 14 | run: pip install -e . 15 | - name: execute py test script 16 | run: | 17 | python tests/env_test.py 18 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "or-gym" 7 | version = "0.5.0" 8 | authors = [ 9 | { name="Christian Hubbs", email="christiandhubbs@gmail.com"}, 10 | { name="Owais Sarwar", email="owais.sarwar@gmail.com"}, 11 | { name="Hector Perez", email="hdperez@cmu.edu"} 12 | ] 13 | description = "OR-Gym: A set of environments for developing reinforcement learning agents for OR problems." 14 | readme = "README.md" 15 | license = { file="LICENSE" } 16 | requires-python = ">=3.7" 17 | dependencies = [ 18 | 'gym<=0.21.0', 19 | 'numpy>=1.16.1', 20 | 'pandas>=1.2', 21 | 'scipy>=1.0', 22 | 'matplotlib>=3.1', 23 | 'networkx>=2.3' 24 | ] 25 | classifiers = [ 26 | "Programming Language :: Python :: 3", 27 | "License :: OSI Approved :: MIT License", 28 | "Operating System :: OS Independent", 29 | ] 30 | 31 | [project.urls] 32 | "Homepage" = "https://github.com/hubbs5/or-gym" 33 | "Bug Tracker" = "https://github.com/hubbs5/or-gym/issues" -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Christian 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!usr/bin/env python 2 | 3 | from setuptools import setup, find_packages 4 | import sys 5 | import os 6 | 7 | sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'or_gym')) 8 | 9 | 10 | with open(os.path.join('or_gym', 'version.py')) as version_file: 11 | version = version_file.read().strip() 12 | 13 | VERSION = version.split("=")[-1].replace("'", "") 14 | 15 | # version = {} 16 | # with open("or_gym/version.py") as fp: 17 | # exec(fp.read(), version) 18 | # later on we use: version['__version__'] 19 | # from or_gym.version import VERSION 20 | 21 | setup(name='or-gym', 22 | version=VERSION, 23 | description='OR-Gym: A set of environments for developing reinforcement learning agents for OR problems.', 24 | author='Christian Hubbs, Hector Perez Parra, Owais Sarwar', 25 | license='MIT', 26 | url='https://github.com/hubbs5/or-gym', 27 | packages=find_packages(), 28 | install_requires=[ 29 | 'gym<=0.21.0', 30 | 'numpy>=1.16.1', 31 | 'pandas>=1.2', 32 | 'scipy>=1.0', 33 | 'matplotlib>=3.1', 34 | 'networkx>=2.3' 35 | ], 36 | zip_safe=False, 37 | python_requires='>=3.7', 38 | classifiers=[ 39 | 'Development Status :: 3 - Alpha', 40 | 'Intended Audience :: Developers', 41 | 'Programming Language :: Python :: 3.7', 42 | 'Programming Language :: Python :: 3.8', 43 | 'Programming Language :: Python :: 3.9', 44 | ] 45 | ) 46 | -------------------------------------------------------------------------------- /examples/ray_rllib_taxi_demo.py: -------------------------------------------------------------------------------- 1 | import ray 2 | from ray.rllib.agents.ppo import PPOTrainer 3 | 4 | ray.shutdown() 5 | ray.init(ignore_reinit_error=True) 6 | 7 | # Configure the algorithm. 8 | config = { 9 | # Environment (RLlib understands openAI gym registered strings). 10 | "env": "Taxi-v3", 11 | # Use 2 environment workers (aka "rollout workers") that parallelly 12 | # collect samples from their own environment clone(s). 13 | "num_workers": 2, 14 | # Change this to "framework: torch", if you are using PyTorch. 15 | # Also, use "framework: tf2" for tf2.x eager execution. 16 | "framework": "tf", 17 | # Tweak the default model provided automatically by RLlib, 18 | # given the environment's observation- and action spaces. 19 | "model": { 20 | "fcnet_hiddens": [64, 64], 21 | "fcnet_activation": "relu", 22 | }, 23 | # Set up a separate evaluation worker set for the 24 | # `trainer.evaluate()` call after training (see below). 25 | "evaluation_num_workers": 1, 26 | # Only for evaluation runs, render the env. 27 | "evaluation_config": { 28 | "render_env": True, 29 | } 30 | } 31 | 32 | # Create our RLlib Trainer. 33 | trainer = PPOTrainer(config=config) 34 | 35 | # Run it for n training iterations. A training iteration includes 36 | # parallel sample collection by the environment workers as well as 37 | # loss calculation on the collected batch and a model update. 38 | for _ in range(3): 39 | print(trainer.train()) 40 | 41 | # Evaluate the trained Trainer (and render each timestep to the shell's 42 | # output). 43 | trainer.evaluate() 44 | -------------------------------------------------------------------------------- /tests/env_test.py: -------------------------------------------------------------------------------- 1 | #!usr/bin/env python 2 | 3 | ''' 4 | Tests to ensure environments load and basic functionality 5 | is satisfied. 
6 | ''' 7 | 8 | import or_gym 9 | from or_gym.envs.env_list import ENV_LIST 10 | import traceback 11 | 12 | def pytest_generate_tests(metafunc): 13 | idlist = [] 14 | argvalues = [] 15 | for scenario in metafunc.cls.scenarios: 16 | idlist.append(scenario[0]) 17 | items = scenario[1].items() 18 | argnames = [x[0] for x in items] 19 | argvalues.append([x[1] for x in items]) 20 | metafunc.parametrize(argnames, argvalues, ids=idlist, scope="class") 21 | 22 | class TestEnv: 23 | scenarios = [(i, {'config': {'env_name': i}}) for i in ENV_LIST] 24 | 25 | def _build_env(self, env_name): 26 | env = or_gym.make(env_name) 27 | return env 28 | 29 | def test_make(self, config): 30 | # Ensures that environments are instantiated 31 | env_name = config['env_name'] 32 | try: 33 | _ = self._build_env(env_name) 34 | success = True 35 | except Exception as e: 36 | tb = e.__traceback__ 37 | success = False 38 | assert success, ''.join(traceback.format_tb(tb)) 39 | 40 | def test_episode(self, config): 41 | # Run 100 episodes and check observation space 42 | env_name = config['env_name'] 43 | EPISODES = 100 44 | env = self._build_env(env_name) 45 | for ep in range(EPISODES): 46 | state = env.reset() 47 | while True: 48 | assert env.observation_space.contains(state), \ 49 | f"State out of range of observation space: {state}" 50 | action = env.action_space.sample() 51 | state, reward, done, info = env.step(action) 52 | if done: 53 | break 54 | 55 | assert done -------------------------------------------------------------------------------- /examples/ray_tests.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Dec 29 21:51:45 2021 4 | 5 | @author: phili 6 | """ 7 | 8 | import ray 9 | import time 10 | 11 | # Start the ray core 12 | ray.init() 13 | 14 | # By adding the `@ray.remote` decorator, a regular Python function 15 | # becomes a Ray remote function. 16 | @ray.remote 17 | def f(x): 18 | return x * x 19 | 20 | # To invoke this remote function, use the `remote` method. 21 | # This will immediately return an object ref (a future) and then create 22 | # a task that will be executed on a worker process. 23 | futures = [f.remote(i) for i in range(4)] 24 | 25 | # The result can be retrieved with ``ray.get``. 26 | print(ray.get(futures)) # [0, 1, 4, 9] 27 | 28 | @ray.remote 29 | class Counter(object): 30 | def __init__(self): 31 | self.n = 0 32 | 33 | def increment(self): 34 | self.n += 1 35 | 36 | def read(self): 37 | return self.n 38 | 39 | counters = [Counter.remote() for i in range(4)] 40 | [c.increment.remote() for c in counters] 41 | futures = [c.read.remote() for c in counters] 42 | print(ray.get(futures)) # [1, 1, 1, 1] 43 | 44 | # Note the following behaviors: 45 | # The second task will not be executed until the first task has finished executing because the second task depends on the output of the first task. 46 | # If the two tasks are scheduled on different machines, the output of the first task (the value corresponding to obj_ref1/objRef1) will be sent over the network to the machine where the second task is scheduled. 
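# A minimal sketch of the dependency behavior described above. Note that `g`
# and the object refs below are illustrative additions, not part of the
# original script:
@ray.remote
def g(y):
    return y + 1

obj_ref1 = f.remote(2)         # first task; returns an object ref immediately
obj_ref2 = g.remote(obj_ref1)  # second task; Ray resolves obj_ref1 before g runs
print(ray.get(obj_ref2))       # 5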
47 | 48 | # MWE for tiny work parallelization 49 | def tiny_work(x): 50 | time.sleep(0.0001) # replace this is with work you need to do 51 | return x 52 | 53 | @ray.remote 54 | def mega_work(start, end): 55 | return [tiny_work(x) for x in range(start, end)] 56 | 57 | start = time.time() 58 | result_ids = [] 59 | [result_ids.append(mega_work.remote(x*1000, (x+1)*1000)) for x in range(100)] 60 | results = ray.get(result_ids) 61 | print("duration =", time.time() - start) 62 | 63 | # Close down at the end of the session 64 | ray.shutdown() -------------------------------------------------------------------------------- /tests/rllib_test.py: -------------------------------------------------------------------------------- 1 | #!usr/bin/env python 2 | 3 | ''' 4 | Tests to ensure environments are compatible with RLLib. 5 | Note RLLib is NOT a required package, but tests are included 6 | because it is very useful for RL work. 7 | ''' 8 | 9 | import or_gym 10 | from or_gym.utils import create_env 11 | from or_gym.envs.env_list import ENV_LIST 12 | import ray 13 | from ray import tune 14 | from ray.rllib.agents.ppo import PPOTrainer 15 | import traceback 16 | 17 | def pytest_generate_tests(metafunc): 18 | idlist = [] 19 | argvalues = [] 20 | for scenario in metafunc.cls.scenarios: 21 | idlist.append(scenario[0]) 22 | items = scenario[1].items() 23 | argnames = [x[0] for x in items] 24 | argvalues.append([x[1] for x in items]) 25 | metafunc.parametrize(argnames, argvalues, ids=idlist, scope="class") 26 | 27 | 28 | def register_env(env_name, env_config={}): 29 | env = create_env(env_name) 30 | tune.register_env(env_name, 31 | lambda env_name: env(env_name, env_config=env_config)) 32 | 33 | class TestEnv: 34 | scenarios = [(i, {"config": {"env_name": i}}) for i in ENV_LIST] 35 | 36 | def _build_env(self, env_name): 37 | env = or_gym.make(env_name) 38 | return env 39 | 40 | def _get_rl_config_dict(self, env_name, env_config={}): 41 | rl_config = dict( 42 | env=env_name, 43 | num_workers=2, 44 | env_config=env_config, 45 | model=dict( 46 | vf_share_layers=False, 47 | fcnet_activation='elu', 48 | fcnet_hiddens=[256, 256] 49 | ), 50 | lr=1e-5 51 | ) 52 | return rl_config 53 | 54 | def test_ray(self, config): 55 | env_name = config["env_name"] 56 | env = self._build_env(env_name) 57 | register_env(env_name) 58 | rl_config = self._get_rl_config_dict(env_name) 59 | ray.init(ignore_reinit_error=True) 60 | agent = PPOTrainer(env=env_name, config=rl_config) 61 | # Train 1 episode for testing 62 | try: 63 | res = agent.train() 64 | passed = True 65 | except: 66 | passed = False 67 | 68 | ray.shutdown() 69 | assert passed 70 | 71 | 72 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.gz 6 | *.csv 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | pip-wheel-metadata/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | .python-version 88 | 89 | # pipenv 90 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 91 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 92 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 93 | # install all needed dependencies. 94 | #Pipfile.lock 95 | 96 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 97 | __pypackages__/ 98 | 99 | # Celery stuff 100 | celerybeat-schedule 101 | celerybeat.pid 102 | 103 | # SageMath parsed files 104 | *.sage.py 105 | 106 | # Environments 107 | .env 108 | .venv 109 | env/ 110 | venv/ 111 | ENV/ 112 | env.bak/ 113 | venv.bak/ 114 | 115 | # Spyder project settings 116 | .spyderproject 117 | .spyproject 118 | 119 | # Rope project settings 120 | .ropeproject 121 | 122 | # mkdocs documentation 123 | /site 124 | 125 | # mypy 126 | .mypy_cache/ 127 | .dmypy.json 128 | dmypy.json 129 | 130 | # Pyre type checker 131 | .pyre/ 132 | 133 | # Output files 134 | nohup.out 135 | 136 | # .bak files 137 | *.bak 138 | 139 | # wandb output 140 | wandb/ 141 | 142 | notebooks/ -------------------------------------------------------------------------------- /or_gym/envs/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | 3 | # Knapsack Environments 4 | register(id='Knapsack-v0', 5 | entry_point='or_gym.envs.classic_or.knapsack:KnapsackEnv' 6 | ) 7 | 8 | register(id='Knapsack-v1', 9 | entry_point='or_gym.envs.classic_or.knapsack:BinaryKnapsackEnv' 10 | ) 11 | 12 | register(id='Knapsack-v2', 13 | entry_point='or_gym.envs.classic_or.knapsack:BoundedKnapsackEnv' 14 | ) 15 | 16 | register(id='Knapsack-v3', 17 | entry_point='or_gym.envs.classic_or.knapsack:OnlineKnapsackEnv' 18 | ) 19 | 20 | # Bin Packing Environments 21 | register(id='BinPacking-v0', 22 | entry_point='or_gym.envs.classic_or.binpacking:BinPackingEnv' 23 | ) 24 | 25 | register(id='BinPacking-v1', 26 | entry_point='or_gym.envs.classic_or.binpacking:BinPackingLW1' 27 | ) 28 | 29 | register(id='BinPacking-v2', 30 | entry_point='or_gym.envs.classic_or.binpacking:BinPackingPP0' 31 | ) 32 | 33 | register(id='BinPacking-v3', 34 | entry_point='or_gym.envs.classic_or.binpacking:BinPackingPP1' 35 | ) 36 | 37 | register(id='BinPacking-v4', 38 | entry_point='or_gym.envs.classic_or.binpacking:BinPackingBW0' 39 | ) 40 | 41 | register(id='BinPacking-v5', 42 | entry_point='or_gym.envs.classic_or.binpacking:BinPackingBW1' 43 | ) 44 | 45 | # Newsvendor Envs 46 | register(id='Newsvendor-v0', 47 | entry_point='or_gym.envs.classic_or.newsvendor:NewsvendorEnv' 48 | ) 49 | 50 | # Virtual Machine Packing Envs 51 
| register(id='VMPacking-v0', 52 | entry_point='or_gym.envs.classic_or.vmpacking:VMPackingEnv' 53 | ) 54 | 55 | register(id='VMPacking-v1', 56 | entry_point='or_gym.envs.classic_or.vmpacking:TempVMPackingEnv' 57 | ) 58 | 59 | # Vehicle Routing Envs 60 | register(id='VehicleRouting-v0', 61 | entry_point='or_gym.envs.classic_or.vehicle_routing:VehicleRoutingEnv' 62 | ) 63 | 64 | # TSP 65 | register(id='TSP-v0', 66 | entry_point='or_gym.envs.classic_or.tsp:TSPEnv' 67 | ) 68 | 69 | register(id='TSP-v1', 70 | entry_point='or_gym.envs.classic_or.tsp:TSPDistCost' 71 | ) 72 | 73 | # Inventory Management Envs 74 | register(id='InvManagement-v0', 75 | entry_point='or_gym.envs.supply_chain.inventory_management:InvManagementBacklogEnv' 76 | ) 77 | 78 | register(id='InvManagement-v1', 79 | entry_point='or_gym.envs.supply_chain.inventory_management:InvManagementLostSalesEnv' 80 | ) 81 | 82 | register(id='NetworkManagement-v0', 83 | entry_point='or_gym.envs.supply_chain.network_management:NetInvMgmtBacklogEnv' 84 | ) 85 | 86 | register(id='NetworkManagement-v1', 87 | entry_point='or_gym.envs.supply_chain.network_management:NetInvMgmtLostSalesEnv' 88 | ) 89 | 90 | # Asset Allocation Envs 91 | register(id='PortfolioOpt-v0', 92 | entry_point='or_gym.envs.finance.portfolio_opt:PortfolioOptEnv' 93 | ) 94 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # or-gym 2 | ## Environments for OR and RL Research 3 | 4 | This library contains environments for operations research problems that adhere to the OpenAI Gym API. The purpose is to bring reinforcement learning to the operations research community via accessible simulation environments featuring classic problems that are solved both with reinforcement learning and with traditional OR techniques. 5 | 6 | ## Installation 7 | 8 | This library requires Python 3.7+ in order to function. 9 | 10 | Installation is possible via `pip`: 11 | 12 | `$ pip install or-gym` 13 | 14 | Or, you can install directly from GitHub with: 15 | 16 | ``` 17 | git clone https://github.com/hubbs5/or-gym.git 18 | cd or-gym 19 | pip install -e . 20 | ``` 21 | ## Quickstart and Benchmarking Example 22 | 23 | See the IPython notebook `inv-management-quickstart.ipynb` in the `examples` folder for a quickstart example of training an agent in an OR-Gym environment and of using the environment to benchmark policies found by other algorithms. For the RL algorithm, Ray 1.0.0 is required. 24 | 25 | ## Citation 26 | ``` 27 | @misc{HubbsOR-Gym, 28 | author={Christian D. Hubbs and Hector D. Perez and Owais Sarwar and Nikolaos V. Sahinidis and Ignacio E. Grossmann and John M. Wassick}, 29 | title={OR-Gym: A Reinforcement Learning Library for Operations Research Problems}, 30 | year={2020}, 31 | Eprint={arXiv:2008.06319} 32 | } 33 | ``` 34 | 35 | ## Environments 36 | 37 | - `Knapsack-v0`: a small version of the classic unbounded knapsack problem with 200 items. 38 | - `Knapsack-v1`: binary (0-1) knapsack problem with 200 items. 39 | - `Knapsack-v2`: bounded knapsack problem with 200 items. 40 | - `Knapsack-v3`: stochastic, online knapsack problem. 41 | - `BinPacking-v0` through `BinPacking-v5`: online bin packing problem taken from [Balaji et al.](https://arxiv.org/abs/1911.10641). 42 | - `Newsvendor-v0`: multi-period newsvendor problem with lead times taken from [Balaji et al.](https://arxiv.org/abs/1911.10641).
43 | - `VMPacking-v0`: permanent, multi-dimensional virtual machine packing problem. 44 | - `VMPacking-v1`: temporary, multi-dimensional virtual machine packing problem. 45 | - `VehicleRouting-v0`: pick-up and delivery problem with delivery windows taken from [Balaji et al.](https://arxiv.org/abs/1911.10641). 46 | - `InvManagement-v0`: multi-echelon supply chain re-order problem with backlogs. 47 | - `InvManagement-v1`: multi-echelon supply chain re-order problem without backlog. 48 | - `NetworkManagement-v0`: multi-echelon supply chain network problem with backlogs from [Perez et al.](https://www.mdpi.com/2227-9717/9/1/102). 49 | - `NetworkManagement-v1`: multi-echelon supply chain network problem without backlogs from [Perez et al.](https://www.mdpi.com/2227-9717/9/1/102). 50 | - `PortfolioOpt-v0`: Multi-period asset allocation problem for managing investment decisions taken from [Dantzig and Infanger](https://apps.dtic.mil/dtic/tr/fulltext/u2/a242510.pdf). 51 | - `TSP-v0`: traveling salesman problem with bi-directional connections and uniform cost. 52 | - `TSP-v1`: traveling salesman problem with bi-directional connections. 53 | 54 | ## Resources 55 | 56 | Information on results and supporting models can be found [here](https://arxiv.org/abs/2008.06319). 57 | 58 | ## Examples 59 | 60 | - [Action Masking with RLlib using the Knapsack Environment](https://www.datahubbs.com/action-masking-with-rllib/) 61 | - [How to Use Deep Reinforcement Learning to Improve your Supply Chain](https://www.datahubbs.com/how-to-use-deep-reinforcement-learning-to-improve-your-supply-chain/) -------------------------------------------------------------------------------- /or_gym/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def assign_env_config(self, kwargs): 4 | for key, value in kwargs.items(): 5 | setattr(self, key, value) 6 | if hasattr(self, 'env_config'): 7 | for key, value in self.env_config.items(): 8 | # Check types based on default settings 9 | if hasattr(self, key): 10 | if type(getattr(self,key)) == np.ndarray: 11 | setattr(self, key, value) 12 | else: 13 | setattr(self, key, 14 | type(getattr(self, key))(value)) 15 | else: 16 | raise AttributeError(f"{self} has no attribute, {key}") 17 | 18 | # Get Ray to work with gym registry 19 | def create_env(config, *args, **kwargs): 20 | if type(config) == dict: 21 | env_name = config['env'] 22 | else: 23 | env_name = config 24 | 25 | if env_name.lower() == 'knapsack-v0': 26 | from or_gym.envs.classic_or.knapsack import KnapsackEnv as env 27 | elif env_name.lower() == 'knapsack-v1': 28 | from or_gym.envs.classic_or.knapsack import BinaryKnapsackEnv as env 29 | elif env_name.lower() == 'knapsack-v2': 30 | from or_gym.envs.classic_or.knapsack import BoundedKnapsackEnv as env 31 | elif env_name.lower() == 'knapsack-v3': 32 | from or_gym.envs.classic_or.knapsack import OnlineKnapsackEnv as env 33 | elif env_name.lower() == 'binpacking-v0': 34 | from or_gym.envs.classic_or.binpacking import BinPackingEnv as env 35 | elif env_name.lower() == 'binpacking-v1': 36 | from or_gym.envs.classic_or.binpacking import BinPackingLW1 as env 37 | elif env_name.lower() == 'binpacking-v2': 38 | from or_gym.envs.classic_or.binpacking import BinPackingPP0 as env 39 | elif env_name.lower() == 'binpacking-v3': 40 | from or_gym.envs.classic_or.binpacking import BinPackingPP1 as env 41 | elif env_name.lower() == 'binpacking-v4': 42 | from or_gym.envs.classic_or.binpacking import BinPackingBW0 as env 43 | 
elif env_name.lower() == 'binpacking-v5': 44 | from or_gym.envs.classic_or.binpacking import BinPackingBW1 as env 45 | elif env_name.lower() == 'vmpacking-v0': 46 | from or_gym.envs.classic_or.vmpacking import VMPackingEnv as env 47 | elif env_name.lower() == 'vmpacking-v1': 48 | from or_gym.envs.classic_or.vmpacking import TempVMPackingEnv as env 49 | elif env_name.lower() == 'portfolioopt-v0': 50 | from or_gym.envs.finance.portfolio_opt import PortfolioOptEnv as env 51 | elif env_name.lower() == 'tsp-v0': 52 | from or_gym.envs.classic_or.tsp import TSPEnv as env 53 | elif env_name.lower() == 'tsp-v1': 54 | from or_gym.envs.classic_or.tsp import TSPDistCost as env 55 | elif env_name.lower() == 'vehiclerouting-v0': 56 | from or_gym.envs.classic_or.vehicle_routing import VehicleRoutingEnv as env 57 | elif env_name.lower() == 'newsvendor-v0': 58 | from or_gym.envs.classic_or.newsvendor import NewsvendorEnv as env 59 | elif env_name.lower() == 'invmanagement-v0': 60 | from or_gym.envs.supply_chain.inventory_management import InvManagementBacklogEnv as env 61 | elif env_name.lower() == 'invmanagement-v1': 62 | from or_gym.envs.supply_chain.inventory_management import InvManagementLostSalesEnv as env 63 | elif env_name.lower() == 'networkmanagement-v0': 64 | from or_gym.envs.supply_chain.network_management import NetInvMgmtBacklogEnv as env 65 | elif env_name.lower() == 'networkmanagement-v1': 66 | from or_gym.envs.supply_chain.network_management import NetInvMgmtLostSalesEnv as env 67 | else: 68 | raise NotImplementedError('Environment {} not recognized.'.format(env_name)) 69 | return env 70 | -------------------------------------------------------------------------------- /or_gym/envs/classic_or/newsvendor.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Example taken from Balaji et al. 3 | Paper: https://arxiv.org/abs/1911.10641 4 | GitHub: https://github.com/awslabs/or-rl-benchmarks 5 | ''' 6 | import gym 7 | from gym import spaces 8 | import itertools 9 | import numpy as np 10 | from collections.abc import Iterable 11 | from or_gym.utils import assign_env_config 12 | 13 | class NewsvendorEnv(gym.Env): 14 | ''' 15 | Multi-Period Newsvendor with Lead Times 16 | 17 | The MPNV requires meeting stochastic demand by having sufficient 18 | inventory on hand to satisfy customers. The inventory orders are not 19 | instantaneous and have multi-period leadtimes. Additionally, there are 20 | costs associated with holding unsold inventory, however unsold inventory 21 | expires at the end of each period. 22 | 23 | Observation: 24 | Type: Box 25 | State Vector: S = (p, c, h, k, mu, x_l, x_l-1) 26 | p = price 27 | c = cost 28 | h = holding cost 29 | k = lost sales penalty 30 | mu = mean of demand distribution 31 | x_l = order quantities in the queue 32 | 33 | Actions: 34 | Type: Box 35 | Amount of product to order. 36 | 37 | Reward: 38 | Sales minus discounted purchase price, minus holding costs for 39 | unsold product or penalties associated with insufficient inventory. 40 | 41 | Initial State: 42 | Parameters p, c, h, k, and mu, with no inventory in the pipeline. 43 | 44 | Episode Termination: 45 | Termination occurs after the maximum number of time steps is reached 46 | (40 by default). 
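    A minimal usage sketch (assuming the default settings above; the
    observation vector then has lead_time + 5 = 10 entries):

        import or_gym
        env = or_gym.make('Newsvendor-v0')
        state = env.reset()
        state, reward, done, info = env.step(env.action_space.sample())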
47 | ''' 48 | def __init__(self, *args, **kwargs): 49 | self.lead_time = 5 50 | self.max_inventory = 4000 51 | self.max_order_quantity = 2000 52 | self.step_limit = 40 53 | self.p_max = 100 # Max sale price 54 | self.h_max = 5 # Max holding cost 55 | self.k_max = 10 # Max lost sales penalty 56 | self.mu_max = 200 # Max mean of the demand distribution 57 | self.gamma = 1 # Discount factor 58 | assign_env_config(self, kwargs) 59 | 60 | self.obs_dim = self.lead_time + 5 61 | self.observation_space = spaces.Box( 62 | low=np.zeros(self.obs_dim, dtype=np.float32), 63 | high=np.array( 64 | [self.p_max, self.p_max, self.h_max, self.k_max, self.mu_max] + 65 | [self.max_order_quantity] * self.lead_time), 66 | dtype=np.float32) 67 | self.action_space = spaces.Box( 68 | low=np.zeros(1), high=np.array([self.max_order_quantity]), 69 | dtype=np.float32) 70 | 71 | self.reset() 72 | 73 | def _STEP(self, action): 74 | done = False 75 | order_qty = max(0, # Ensure order > 0 76 | min(action, self.max_inventory - self.state[5:].sum())) # Cap inventory 77 | demand = np.random.poisson(self.mu) 78 | inventory = self.state[5:] 79 | if self.lead_time == 0: # No lead time -> instant fulfillment 80 | inv_on_hand = order_qty 81 | else: 82 | inv_on_hand = inventory[0] 83 | sales = min(inv_on_hand, demand) * self.price 84 | excess_inventory = max(0, inv_on_hand - demand) 85 | short_inventory = max(0, demand - inv_on_hand) 86 | purchase_cost = excess_inventory * self.cost * order_qty * \ 87 | self.gamma ** self.lead_time 88 | holding_cost = excess_inventory * self.h 89 | lost_sales_penalty = short_inventory * self.k 90 | reward = sales - purchase_cost - holding_cost - lost_sales_penalty 91 | 92 | # Update state, note inventory on hand expires at each time step 93 | new_inventory = np.zeros(self.lead_time) 94 | new_inventory[:-1] += inventory[1:] 95 | new_inventory[-1] += order_qty 96 | self.state = np.hstack([self.state[:5], new_inventory] , dtype=np.float32) 97 | 98 | self.step_count += 1 99 | if self.step_count >= self.step_limit: 100 | done = True 101 | if isinstance(reward, Iterable): 102 | # TODO: Sometimes reward is np.array with one entry 103 | reward = sum(reward) 104 | 105 | return self.state, reward, done, {} 106 | 107 | def _RESET(self): 108 | # Randomize costs 109 | self.price = max(1, np.random.rand() * self.p_max) 110 | self.cost = max(1, np.random.rand() * self.price) 111 | self.h = np.random.rand() * min(self.cost, self.h_max) 112 | self.k = np.random.rand() * self.k_max 113 | self.mu = np.random.rand() * self.mu_max 114 | self.state = np.zeros(self.obs_dim, dtype=np.float32) 115 | self.state[:5] = np.array([self.price, self.cost, self.h, 116 | self.k, self.mu]) 117 | 118 | self.step_count = 0 119 | 120 | return self.state 121 | 122 | def reset(self): 123 | return self._RESET() 124 | 125 | def step(self, action): 126 | return self._STEP(action) -------------------------------------------------------------------------------- /examples/rllib-validate-env.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | 4 | This python script outlines the _validate_env logic for Rllib (v.1.9.1) and 5 | can be used for debugginf issues in environment configuration. 
6 | 7 | Created on Thu Dec 30 16:04:12 2021 8 | 9 | @author: Philipp Willms 10 | """ 11 | import or_gym 12 | import gym 13 | import numpy as np 14 | 15 | 16 | # Configuration for gym environment 17 | env_config = {'N': 200, 18 | 'max_weight': 60, 19 | # 'item_weights': np.array([1, 12, 2, 1, 4]), 20 | # 'item_values': np.array([2, 4, 2, 1, 10]), 21 | 'mask': True} 22 | 23 | env_name = 'Knapsack-v0' 24 | env = or_gym.make('Knapsack-v0', env_config=env_config) 25 | 26 | 27 | if isinstance(env, gym.Env) : 28 | # Make sure the gym.Env has the two space attributes properly set. 29 | assert hasattr(env, "observation_space") and hasattr( 30 | env, "action_space") 31 | # Get a dummy observation by resetting the env. 32 | dummy_obs = env.reset() 33 | # Convert lists to np.ndarrays. 34 | if type(dummy_obs) is list and isinstance(env.observation_space, 35 | gym.spaces.Box): 36 | dummy_obs = np.array(dummy_obs) 37 | print("Dummy obs after np array conversion: ") 38 | print(dummy_obs) 39 | # Ignore float32/float64 diffs. 40 | if isinstance(env.observation_space, gym.spaces.Box) and \ 41 | env.observation_space.dtype != dummy_obs.dtype: 42 | dummy_obs = dummy_obs.astype(env.observation_space.dtype) 43 | print("Dummy obs after ignore float diffs ") 44 | print(dummy_obs) 45 | # Check, if observation is ok (part of the observation space). If not, 46 | # error. 47 | 48 | determined_obs_space = env.observation_space 49 | 50 | # original code from box.py 51 | # def contains(self, x): 52 | # if not isinstance(x, np.ndarray): 53 | # logger.warn("Casting input x to numpy array.") 54 | # x = np.asarray(x, dtype=self.dtype) 55 | 56 | # return ( 57 | # np.can_cast(x.dtype, self.dtype) 58 | # and x.shape == self.shape 59 | # and np.all(x >= self.low) 60 | # and np.all(x <= self.high) 61 | # ) 62 | 63 | # action_mask check 64 | x = determined_obs_space["action_mask"] 65 | y = dummy_obs["action_mask"] 66 | print(x) 67 | print(y) 68 | print(np.can_cast(x.dtype, y.dtype)) 69 | print(x.shape == y.shape) 70 | print(np.all(y >= x.low)) 71 | print(np.all(y <= x.high)) 72 | 73 | # avail actions check 74 | x = determined_obs_space["avail_actions"] 75 | y = dummy_obs["avail_actions"] 76 | print(x) 77 | print(y) 78 | print(np.can_cast(x.dtype, y.dtype)) 79 | print(x.shape == y.shape) 80 | print(np.all(y >= x.low)) 81 | print(np.all(y <= x.high)) 82 | 83 | # state check 84 | x = determined_obs_space["state"] 85 | y = dummy_obs["state"] 86 | print(x) 87 | print(y) 88 | print(np.can_cast(x.dtype, y.dtype)) 89 | print(x.shape == y.shape) 90 | print(np.all(y >= x.low)) 91 | print(np.all(y <= x.high)) 92 | 93 | # original code from dict.py 94 | # def contains(self, x): 95 | # if not isinstance(x, dict) or len(x) != len(self.spaces): 96 | # return False 97 | # for k, space in self.spaces.items(): 98 | # if k not in x: 99 | # return False 100 | # if not space.contains(x[k]): 101 | # return False 102 | # return True 103 | 104 | x = determined_obs_space 105 | y = dummy_obs 106 | print("Dict check") 107 | print(isinstance(y, dict)) 108 | print("Length observation space: " + str(len(x.spaces))) 109 | print("Length dummy observation: " + str(len(y))) 110 | print(len(y) == len(x.spaces)) 111 | for k, space in x.spaces.items(): 112 | print(k) 113 | print(space) 114 | if k not in y: 115 | #return False 116 | print("Element not found in dummy observation") 117 | print(k) 118 | if not space.contains(y[k]): 119 | print("Contains check failed") 120 | print(y[k]) 121 | # return False 122 | 123 | # If there is a hard nut to crack with specific 
observation state, use the following 124 | x = determined_obs_space["state"] 125 | mal_state = y[k] 126 | print(np.can_cast(x.dtype, mal_state.dtype)) 127 | print(x.shape == mal_state.shape) 128 | print(np.all(mal_state >= x.low)) 129 | print(np.all(mal_state <= x.high)) 130 | print(isinstance(y[k], np.ndarray)) 131 | print(x.contains(mal_state)) 132 | 133 | # Copied from rollout_worker.py 134 | if not env.observation_space.contains(dummy_obs): 135 | print( 136 | f"Env's `observation_space` {env.observation_space} does not " 137 | f"contain returned observation after a reset ({dummy_obs})!") 138 | else: 139 | print("All checks passed") -------------------------------------------------------------------------------- /examples/ray_rllib_knapsack.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Spyder Editor 4 | 5 | This is a temporary script file. 6 | """ 7 | import ray 8 | from ray.rllib.agents.ppo import PPOTrainer 9 | from ray import tune 10 | from ray.rllib.models import ModelCatalog 11 | from ray.rllib.models.tf.tf_modelv2 import TFModelV2 12 | from ray.rllib.models.tf.fcnet import FullyConnectedNetwork 13 | from ray.rllib.utils import try_import_tf 14 | from gym import spaces 15 | import or_gym 16 | from or_gym.utils import create_env 17 | import numpy as np 18 | 19 | tf_api, tf_original, tf_version = try_import_tf(error = True) 20 | 21 | 22 | class KP0ActionMaskModel(TFModelV2): 23 | 24 | def __init__(self, obs_space, action_space, num_outputs, 25 | model_config, name, true_obs_shape=(11,), 26 | action_embed_size=5, *args, **kwargs): 27 | 28 | # true_obs_shape is going to match the size of the state. 29 | # If we stick with our reduced KP, that will be a vector with 11 entries. 30 | # The other value we need to provide is the action_embed_size, which is going to be the size of our action space (5) 31 | 32 | super(KP0ActionMaskModel, self).__init__(obs_space, 33 | action_space, num_outputs, model_config, name, 34 | *args, **kwargs) 35 | 36 | self.action_embed_model = FullyConnectedNetwork( 37 | spaces.Box(0, 1, shape=true_obs_shape), 38 | action_space, action_embed_size, 39 | model_config, name + "_action_embedding") 40 | self.register_variables(self.action_embed_model.variables()) 41 | 42 | def forward(self, input_dict, state, seq_lens): 43 | 44 | # The actual masking takes place in the forward method where we unpack the mask, actions, and state from 45 | # the observation dictionary provided by our environment. The state yields our action embeddings which gets 46 | # combined with our mask to provide logits with the smallest value we can provide. 47 | # This will get passed to a softmax output which will reduce the probability of selecting these actions to 0, 48 | # effectively blocking the agent from ever taking these illegal actions. 
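        # A small worked illustration of the masking arithmetic described
        # above (the numbers are hypothetical):
        #   action_mask          = [1, 0, 1]
        #   log(action_mask)     = [0, -inf, 0] -> clipped to [0, tf.float32.min, 0]
        #   action_logits + mask = [l0, ~-3.4e38, l2]
        # so the downstream softmax assigns the masked action a probability of
        # effectively zero.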
49 | 50 | avail_actions = input_dict["obs"]["avail_actions"] 51 | action_mask = input_dict["obs"]["action_mask"] 52 | action_embedding, _ = self.action_embed_model({ 53 | "obs": input_dict["obs"]["state"]}) 54 | # intent_vector = tf.expand_dims(action_embedding, 1) 55 | # action_logits = tf.reduce_sum(avail_actions * intent_vector, axis=1) 56 | # inf_mask = tf.maximum(tf.log(action_mask), tf.float32.min) 57 | 58 | intent_vector = tf_api.expand_dims(action_embedding, 1) 59 | action_logits = tf_api.reduce_sum(avail_actions * intent_vector, axis=1) 60 | inf_mask = tf_api.maximum(tf_api.log(action_mask), tf_api.float32.min) 61 | 62 | return action_logits + inf_mask, state 63 | 64 | def value_function(self): 65 | return self.action_embed_model.value_function() 66 | 67 | # Configuration for gym environment 68 | # Not to be used for Online Knapsack 69 | env_config = {'N': 5, 70 | 'max_weight': 15, 71 | 'item_weights': np.array([1, 12, 2, 1, 4]), 72 | 'item_values': np.array([2, 4, 2, 1, 10]), 73 | 'mask': True} 74 | 75 | env_name = 'Knapsack-v0' 76 | env = or_gym.make('Knapsack-v0', env_config=env_config) 77 | 78 | print("Max weight capacity:\t{}kg".format(env.max_weight)) 79 | print("Number of items:\t{}".format(env.N)) 80 | 81 | # Register the model for Rllib usage 82 | ModelCatalog.register_custom_model('kp_mask', KP0ActionMaskModel) 83 | # Register the environment 84 | # ATTENTION: Tune needs the base class, not an instance of the environment like we get from or_gym.make(env_name) to work with. So we need to pass this to register_env using a lambda function as shown below. 85 | env = create_env(env_name) 86 | tune.register_env(env_name, lambda env_name: env(env_name, env_config=env_config)) 87 | 88 | trainer_config = { 89 | "model": { 90 | "custom_model": "kp_mask" # Here we must use the custom model name taken in register process before 91 | }, 92 | "env_config": env_config # env config from (or_)gym 93 | } 94 | 95 | # ray.shutdown() 96 | # Ensure that a ray instance is running, e.g. 
via http://127.0.0.1:8265/#/ 97 | # ray.init(address="auto", ignore_reinit_error = True, local_mode=True) 98 | ray.init() 99 | trainer = PPOTrainer(env='Knapsack-v0', config=trainer_config) 100 | 101 | # The real action masking logic: disable the agent to take action 0 102 | env = trainer.env_creator('Knapsack-v0') 103 | state = env.state 104 | state['action_mask'][0] = 0 105 | 106 | # Train an agent for 1000 states and check if action 0 was not taken ever 107 | actions = np.array([trainer.compute_single_action(state) for i in range(10000)]) 108 | print(any(actions==0)) 109 | 110 | # Use tune for hyperparameter tuning 111 | tune_config = { 112 | 'env': 'Knapsack-v0' 113 | } 114 | stop = { 115 | 'timesteps_total': 10000 116 | } 117 | results = tune.run( 118 | 'PPO', # Specify the algorithm to train 119 | metric="score", 120 | config=tune_config, 121 | stop=stop 122 | ) 123 | 124 | ray.shutdown() 125 | -------------------------------------------------------------------------------- /examples/tf_orgym_examples.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "2244831d", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import gym\n", 11 | "import or_gym\n", 12 | "import numpy as np\n", 13 | "import random\n", 14 | "from tensorflow.keras.models import Sequential\n", 15 | "from tensorflow.keras.layers import Input, Dense, Flatten, Reshape\n", 16 | "from tensorflow.keras.optimizers import Adam\n", 17 | "from rl.agents import DQNAgent\n", 18 | "from rl.policy import BoltzmannQPolicy\n", 19 | "from rl.memory import SequentialMemory" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "id": "cc47ced5", 25 | "metadata": {}, 26 | "source": [ 27 | "# Binary Knapsack" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "id": "118ab57c", 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "env_config = {'N': 5,\n", 38 | " 'max_weight': 15,\n", 39 | " 'item_weights': np.array([1, 12, 2, 1, 4]),\n", 40 | " 'item_values': np.array([2, 4, 2, 1, 10]),\n", 41 | " 'mask': False}\n", 42 | "env = or_gym.make('Knapsack-v0', env_config=env_config) \n", 43 | "initial_state = env.reset()" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "id": "e05f8c49", 49 | "metadata": {}, 50 | "source": [ 51 | "The state variable must be read as the following:\n", 52 | "\n", 53 | " Observation:\n", 54 | " Type: Tuple, Discrete\n", 55 | " 0: list of item weights\n", 56 | " 1: list of item values\n", 57 | " 2: maximum weight of the knapsack\n", 58 | " 3: current weight in knapsack\n", 59 | "\n", 60 | " Actions:\n", 61 | " Type: Discrete\n", 62 | " 0: Place item 0 into knapsack\n", 63 | " 1: Place item 1 into knapsack\n", 64 | " 2: ...\n", 65 | "\n", 66 | " Reward:\n", 67 | " Value of item successfully placed into knapsack or 0 if the item\n", 68 | " doesn't fit, at which point the episode ends.\n", 69 | "\n", 70 | " Starting State:\n", 71 | " Lists of available items and empty knapsack.\n", 72 | "\n", 73 | " Episode Termination:\n", 74 | " Full knapsack or selection that puts the knapsack over the limit." 
75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "id": "70567316", 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "actions = env.action_space.n\n", 85 | "states = env.observation_space\n", 86 | "states.shape" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "id": "25fd9311", 92 | "metadata": {}, 93 | "source": [ 94 | "Simulate random item selection for 10 episodes" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "id": "14c8bf18", 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "env.reset()\n", 105 | "episode = 0\n", 106 | "done = False\n", 107 | "while not done :\n", 108 | " episode += 1\n", 109 | " print(\"Episode: \" + str(episode))\n", 110 | " action = np.random.randint(actions)\n", 111 | " print(\"Take element number: \" + str(action))\n", 112 | " next_state, reward, done, info = env.step(action)\n", 113 | " print(\"Reward: \" + str(reward))\n", 114 | " print(next_state)\n", 115 | " print(env.render())" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "id": "7375ad77", 121 | "metadata": {}, 122 | "source": [ 123 | "As we can see in the detailed print out of the observation space, it is just the last index value which changes from episode to episode. This index is equal to the current total weight of the knapsack. The observation space from the environment gives no indication on the total value collected, which is instead added by the render() function." 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "id": "4260a48b", 129 | "metadata": {}, 130 | "source": [ 131 | "# Keras model for the knapsack decision environment\n" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "id": "91d5ba79", 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "model = Sequential() \n", 142 | "model.add(Dense(24, activation='relu', input_shape=states.shape))\n", 143 | "model.add(Flatten())\n", 144 | "model.add(Dense(actions, activation='linear'))\n", 145 | "model.summary()" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "id": "34e842c5", 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "model.layers[0].get_input_shape_at(0) # get the input shape of desired layer" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "id": "52f787d2", 161 | "metadata": {}, 162 | "source": [ 163 | "# Agent training with Keras RL" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "id": "b3e0907c", 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "policy = BoltzmannQPolicy()\n", 174 | "memory = SequentialMemory(limit=50000, window_length=1)\n", 175 | "dqn = DQNAgent(model=model, memory=memory, policy=policy, \n", 176 | " nb_actions=actions, nb_steps_warmup=10, target_model_update=1e-2)\n", 177 | "dqn.compile(Adam(lr=1e-3), metrics=['mae'])\n", 178 | "dqn.fit(env, nb_steps=50000, visualize=False, verbose=1)" 179 | ] 180 | } 181 | ], 182 | "metadata": { 183 | "kernelspec": { 184 | "display_name": "Python 3 (ipykernel)", 185 | "language": "python", 186 | "name": "python3" 187 | }, 188 | "language_info": { 189 | "codemirror_mode": { 190 | "name": "ipython", 191 | "version": 3 192 | }, 193 | "file_extension": ".py", 194 | "mimetype": "text/x-python", 195 | "name": "python", 196 | "nbconvert_exporter": "python", 197 | "pygments_lexer": "ipython3", 198 | "version": "3.8.12" 199 | } 200 | }, 201 | "nbformat": 4, 202 | 
"nbformat_minor": 5 203 | } 204 | -------------------------------------------------------------------------------- /or_gym/envs/finance/portfolio_opt.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces, logger 4 | from gym.utils import seeding 5 | from or_gym.utils import assign_env_config 6 | from copy import copy 7 | 8 | class PortfolioOptEnv(gym.Env): 9 | ''' 10 | Portfolio Optimization Problem 11 | 12 | Instance: Multi-Period Asset Allocation Problem, Dantzing & Infager, 1993 13 | 14 | The Portfolio Optimization (PO) Problem is a problem that seeks to optimize 15 | the distribution of assets in a financial portfolio to with respect to a desired 16 | financial metric (e.g. maximal return, minimum risk, etc.). 17 | 18 | In this particular instance by Dantzing & Infager, the optimizer begins with a 19 | quantity of cash and has the opportunity to purchase or sell 3 other assets in each 20 | of 10 different investement periods. Each transaction incurs a cost and prices of 21 | the 3 assets are subject to change over time. Cash value is consant (price = 1). 22 | The objective is to maximize the amount of wealth (i.e. the sum total of asset values) 23 | at the end of the total investment horizon. 24 | 25 | The episodes proceed by the optimizer deciding whether to buy or sell each asset 26 | in each time period. The episode ends when either all 10 periods have passed or 27 | if the amount of any given asset held becomes negative. 28 | 29 | Observation: 30 | Type: Box(9) 31 | "asset prices" (idx 0, 1, 2, 3): array of asset prices [cash, asset1, asset2, asset3] 32 | "asset quantities" (idx 4, 5, 6, 7): array of asset quantities [cash, asset1, asset2, asset3] 33 | "total wealth" (idx 8): current total wealth (sum of price*quantity for each asset) 34 | 35 | 36 | Actions: 37 | Type: Box (3) 38 | "asset 1 transaction amount" (idx 0): x in [-2000, 2000]: Buy (positive) or sell (negative) x shares of asset 1; 39 | "asset 2 transaction amount" (idx 1): x in [-2000, 2000]: Buy (positive) or sell (negative) x shares of asset 2; 40 | "asset 3 transaction amount" (idx 2): x in [-2000, 2000]: Buy (positive) or sell (negative) x shares of asset 3; 41 | 42 | Reward: 43 | Change in total wealth from previous period or [-max(asset price of all assets) * maximum transaction size] 44 | if an asset quantity becomes negative, at which 45 | point the episode ends. 46 | 47 | Starting State: 48 | Starting amount of cash and wealth and prices. 49 | 50 | Episode Termination: 51 | Negative asset quantity or traversal of investment horizon. 
52 | ''' 53 | def __init__(self, *args, **kwargs): 54 | self.num_assets = 3 # Number of assets 55 | self.initial_cash = 100 # Starting amount of capital 56 | self.step_limit = 10 # Investment horizon 57 | 58 | self.cash = copy(self.initial_cash) 59 | 60 | #Transaction costs proportional to amount bought 61 | self.buy_cost = np.array([0.045, 0.025, 0.035]) 62 | self.sell_cost = np.array([0.04, 0.02, 0.03]) 63 | # self.step_limit = 10 64 | # assign_env_config(self, kwargs) 65 | # self.asset_price_means = asset_price_means.T 66 | # # self.asset_price_means = (np.random.randint(10, 50, self.num_assets) \ 67 | # # * np.ones((self.step_limit, self.num_assets))).T 68 | # self.asset_price_var = np.ones(self.asset_price_means.shape) 69 | 70 | # Prices of assets have a mean value in every period and vary according to a Gaussian distribution 71 | asset1mean = np.array([1.25, 2, 4, 5, 3, 2, 3, 6, 9, 7]).reshape(1, -1) # Up and down all the way 72 | asset2mean = np.array([5, 3, 2, 2, 1.25, 4, 5, 6, 7, 8]).reshape(1, -1) # Down intially then up 73 | asset3mean = np.array([3, 5, 6, 9, 10, 8, 4, 2, 1.25, 4]).reshape(1, -1) # Up initially then down 74 | self.asset_price_means = np.vstack([asset1mean, asset2mean, asset3mean]) 75 | self.asset_price_var = np.ones((self.asset_price_means.shape)) * 0.45 76 | 77 | # Cash on hand, asset prices, num of shares, portfolio value 78 | self.obs_length = 1 + 2 * self.num_assets 79 | 80 | self.observation_space = spaces.Box(-20000, 20000, shape=(self.obs_length,), dtype=np.float32) 81 | self.action_space = spaces.Box(-2000, 2000, shape=(self.num_assets,), dtype=np.float32) 82 | 83 | self.seed() 84 | self.reset() 85 | 86 | 87 | def _RESET(self): 88 | self.step_count = 0 89 | self.asset_prices = self._generate_asset_prices() 90 | self.holdings = np.zeros(self.num_assets) 91 | self.cash = copy(self.initial_cash) 92 | self.state = np.hstack([ 93 | self.initial_cash, 94 | self.asset_prices[:, self.step_count], 95 | self.holdings], 96 | dtype=np.float32) 97 | return self.state 98 | 99 | def _generate_asset_prices(self): 100 | asset_prices = np.array([self.np_random.normal(mu, sig) for mu, sig in 101 | zip(self.asset_price_means.flatten(), self.asset_price_var.flatten())] 102 | ).reshape(self.asset_price_means.shape) 103 | # Zero out negative asset prices and all following prices - implies 104 | # equity is bankrupt and worthless. 
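        # For example (hypothetical draws), a generated price row of
        #   [5.0, -1.0, 3.0, 2.0]
        # becomes
        #   [5.0, 0.0, 0.0, 0.0]
        # because every price from the first negative draw onward is zeroed out.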
105 | zero_vals = np.vstack(np.where(asset_prices<0)) 106 | cols = np.unique(zero_vals[0]) 107 | for c in cols: 108 | first_zero = zero_vals[1][np.where(zero_vals[0]==c)[0].min()] 109 | asset_prices[c,first_zero:] = 0 110 | return asset_prices 111 | 112 | def _STEP(self, action): 113 | 114 | assert self.action_space.contains(action) 115 | 116 | asset_prices = self.asset_prices[:, self.step_count].copy() 117 | 118 | for idx, a in enumerate(action): 119 | if a == 0: 120 | continue 121 | # Sell a shares of asset 122 | elif a < 0: 123 | a = np.abs(a) 124 | if a > self.holdings[idx]: 125 | a = self.holdings[idx] 126 | self.holdings[idx] -= a 127 | self.cash += asset_prices[idx] * a * (1 - self.sell_cost[idx]) 128 | # Buy a shares of asset 129 | elif a > 0: 130 | purchase_cost = asset_prices[idx] * a * (1 + self.buy_cost[idx]) 131 | if self.cash < purchase_cost: 132 | a = np.floor(self.cash / ( 133 | asset_prices[idx] * (1 + self.buy_cost[idx]))) 134 | purchase_cost = asset_prices[idx] * a * (1 + self.buy_cost[idx]) 135 | self.holdings[idx] += a 136 | self.cash -= purchase_cost 137 | 138 | # Return total portfolio value at the end of the horizon as reward 139 | if self.step_count + 1 == self.step_limit: 140 | reward = np.dot(asset_prices, self.holdings) + self.cash 141 | else: 142 | reward = 0 143 | self.step_count += 1 144 | 145 | # Finish if 10 periods have passed - end of investment horizon 146 | if self.step_count >= self.step_limit: 147 | done = True 148 | else: 149 | self._update_state() 150 | done = False 151 | 152 | return self.state, reward, done, {} 153 | 154 | def _update_state(self): 155 | self.state = np.hstack([ 156 | self.cash, 157 | self.asset_prices[:, self.step_count], 158 | self.holdings 159 | ], dtype=np.float32) 160 | 161 | def step(self, action): 162 | return self._STEP(action) 163 | 164 | def reset(self): 165 | return self._RESET() 166 | 167 | def seed(self, seed=None): 168 | self.np_random, seed = seeding.np_random(seed) 169 | return [seed] 170 | -------------------------------------------------------------------------------- /or_gym/envs/classic_or/binpacking.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Example taken from Balaji et al. 3 | Paper: https://arxiv.org/abs/1911.10641 4 | GitHub: https://github.com/awslabs/or-rl-benchmarks 5 | ''' 6 | import numpy as np 7 | import gym 8 | from gym import spaces, logger 9 | from gym.utils import seeding 10 | from or_gym.utils import assign_env_config 11 | import copy 12 | 13 | BIG_NEG_REWARD = -100 14 | BIG_POS_REWARD = 10 15 | 16 | class BinPackingEnv(gym.Env): 17 | ''' 18 | Small Bin Packing with Bounded Waste 19 | Env Registration: BinPacking-v0 20 | 21 | The Bin Packing Problem (BPP) is a combinatorial optimization problem in which 22 | items of different sizes must be packed into bins of fixed capacity so as to 23 | minimize wasted space (and thereby the number of bins used). This version is 24 | online, meaning each item is randomly presented to the algorithm one at a time, 25 | at which point the algorithm must either place it into an open bin at a given 26 | fill level or open a new bin for it. After a fixed number of items have been 27 | shown, the episode terminates; it ends early if an invalid placement is made, 28 | e.g. one that overflows a bin or targets a non-existent bin.
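    For example (with the default bin capacity of 9): opening a new bin for a
    size-2 item leaves 9 - 2 = 7 units of empty space and earns a reward of -7,
    whereas placing a size-2 item into an existing bin already filled to level 7
    closes that bin exactly and earns a reward of +2.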
29 | 30 | Observation: 31 | If mask == False: 32 | Type: Discrete 33 | 0 - bin_capacity: Count of bins at a given level h 34 | -1: Current item size 35 | if mask == True: 36 | Type: Dict 37 | 'state': vector of bins where 0 to bin capacity is the count of 38 | bins at that load level h and the last entry is the current 39 | item size. 40 | 'action_mask': binary vector where 0 indicates infeasible 41 | actions and 1 feasible actions. 42 | 'avail_actions': vector of values to be combined with mask. 43 | 44 | Actions: 45 | Type: Discrete 46 | 0: Open a new bin and place item into bin 47 | 1+: Attempt to place item into bin at the corresponding level 48 | 49 | Reward: 50 | Negative of the waste, which is the difference between the current 51 | size and excess space of the bin. 52 | 53 | Starting State: 54 | No available bins and random starting item 55 | 56 | Episode Termination: 57 | When invalid action is selected (e.g. attempt to place item in non-existent 58 | bin), bin limits are exceeded, or step limit is reached. 59 | ''' 60 | def __init__(self, *args, **kwargs): 61 | self.bin_capacity = 9 62 | self.item_sizes = [2, 3] 63 | self.item_probs = [0.8, 0.2] 64 | self.step_count = 0 65 | self.step_limit = 100 66 | self.mask = False 67 | assign_env_config(self, kwargs) 68 | self._build_obs_space() 69 | self._check_settings() 70 | self.seed() 71 | self.state = self.reset() 72 | 73 | def _STEP(self, action): 74 | done = False 75 | if action >= self.bin_capacity: 76 | raise ValueError('{} is an invalid action. Must be between {} and {}'.format( 77 | action, 0, self.bin_capacity)) 78 | elif action > (self.bin_capacity - self.item_size): 79 | # Bin overflows 80 | reward = BIG_NEG_REWARD - self.waste 81 | done = True 82 | elif action == 0: 83 | # Create new bin 84 | self.bin_levels[self.item_size] += 1 85 | self.waste = self.bin_capacity - self.item_size 86 | reward = -1 * self.waste 87 | elif self.bin_levels[action] == 0: 88 | # Can't insert item into non-existent bin 89 | reward = BIG_NEG_REWARD - self.waste 90 | done = True 91 | else: 92 | if action + self.item_size == self.bin_capacity: 93 | self.num_full_bins += 1 94 | else: 95 | self.bin_levels[action + self.item_size] += 1 96 | self.waste = -self.item_size 97 | reward = -1 * self.waste 98 | 99 | self.bin_levels[action] -= 1 100 | 101 | self.total_reward += reward 102 | 103 | self.step_count += 1 104 | 105 | if self.step_count >= self.step_limit: 106 | done = True 107 | 108 | self.state = self._update_state() 109 | 110 | return self.state, reward, done, {} 111 | 112 | def _update_state(self): 113 | self.item_size = self.get_item() 114 | state = np.array(self.bin_levels + [self.item_size], dtype=np.uint32) 115 | if self.mask: 116 | state_dict = { 117 | 'state': state, 118 | 'avail_actions': np.ones(self.bin_capacity, dtype=np.uint8)} 119 | # Mask actions for closed bins 120 | mask = np.ones(self.bin_capacity, dtype=np.uint8) * np.array(state[:-1]) 121 | # Mask actions where packing would exceed capacity 122 | overflow = self.bin_capacity - self.item_size 123 | mask[overflow+1:] = 0 124 | # Ensure open new bin is available 125 | mask[0] = 1 126 | state_dict['action_mask'] = mask 127 | return state_dict 128 | else: 129 | return state 130 | 131 | def get_item(self): 132 | return np.random.choice(self.item_sizes, p=self.item_probs) 133 | 134 | def sample_action(self): 135 | return self.action_space.sample() 136 | 137 | def _RESET(self): 138 | self.current_weight = 0 139 | self.step_count = 0 140 | self.num_full_bins = 0 141 | self.total_reward = 0 
142 | self.waste = 0 143 | self.bin_levels = [0] * self.bin_capacity 144 | self.item_size = self.get_item() 145 | self.state = self._update_state() 146 | return self.state 147 | 148 | def _build_obs_space(self): 149 | if self.mask: 150 | self.observation_space = spaces.Dict({ 151 | 'action_mask': spaces.Box(0, 1, 152 | shape=(self.bin_capacity,), 153 | dtype=np.uint8), 154 | 'avail_actions': spaces.Box(0, 1, 155 | shape=(self.bin_capacity,), 156 | dtype=np.uint8), 157 | 'state': spaces.Box( 158 | low=np.array([0] * (1 + self.bin_capacity)), 159 | high=np.array([self.step_limit] * self.bin_capacity + 160 | [max(self.item_sizes)]), 161 | dtype=np.uint32) 162 | }) 163 | else: 164 | self.observation_space = spaces.Box( 165 | low=np.array([0] * (1 + self.bin_capacity)), 166 | high=np.array([self.step_limit] * self.bin_capacity + 167 | [max(self.item_sizes)]), 168 | dtype=np.uint32) 169 | 170 | self.action_space = spaces.Discrete(self.bin_capacity) 171 | 172 | def _check_settings(self): 173 | # Ensure setting sizes and probs are correct at initialization 174 | assert sum(self.item_probs) == 1, 'Item probabilities do not sum to 1.' 175 | assert len(self.item_probs) == len(self.item_sizes), \ 176 | 'Dimension mismatch between item probabilities' + \ 177 | ' ({}) and sizes ({})'.format( 178 | len(self.item_probs), len(self.item_sizes)) 179 | 180 | def reset(self): 181 | return self._RESET() 182 | 183 | def step(self, action): 184 | return self._STEP(action) 185 | 186 | class BinPackingLW1(BinPackingEnv): 187 | ''' 188 | Large Bin Packing Probem with Bounded Waste 189 | Env Registration: BinPacking-v1 190 | ''' 191 | def __init__(self, *args, **kwargs): 192 | super().__init__() 193 | self.bin_capacity = 100 194 | self.item_probs = [0.14, 0.1, 0.06, 0.13, 0.11, 0.13, 0.03, 0.11, 0.19] 195 | self.item_sizes = np.arange(1, 10) 196 | self.step_limit = 1000 197 | assign_env_config(self, kwargs) 198 | self._build_obs_space() 199 | self._check_settings() 200 | self.seed() 201 | self.state = self.reset() 202 | 203 | class BinPackingPP0(BinPackingEnv): 204 | ''' 205 | Small Perfectly Packable Bin Packing with Linear Waste 206 | Env Registration: BinPacking-v2 207 | ''' 208 | def __init__(self, *args, **kwargs): 209 | super().__init__() 210 | self.item_probs = [0.75, 0.25] 211 | assign_env_config(self, kwargs) 212 | self._build_obs_space() 213 | self._check_settings() 214 | self.seed() 215 | self.state = self.reset() 216 | 217 | class BinPackingPP1(BinPackingPP0): 218 | ''' 219 | Large Bin Packing Probem with Bounded Waste 220 | Env Registration: BinPacking-v3 221 | ''' 222 | def __init__(self, *args, **kwargs): 223 | super().__init__() 224 | self.bin_capacity = 100 225 | self.item_probs = [0.06, 0.11, 0.11, 0.22, 0, 0.11, 0.06, 0, 0.33] 226 | self.item_sizes = np.arange(1, 10) 227 | self.step_limit = 1000 228 | assign_env_config(self, kwargs) 229 | self._build_obs_space() 230 | self._check_settings() 231 | self.seed() 232 | self.state = self.reset() 233 | 234 | class BinPackingBW0(BinPackingEnv): 235 | ''' 236 | Small Perfectly Packable Bin Packing Problem with Bounded Waste 237 | Env Registration: BinPacking-v4 238 | ''' 239 | def __init__(self, *args, **kwargs): 240 | super().__init__() 241 | self.item_probs = [0.5, 0.5] 242 | assign_env_config(self, kwargs) 243 | self._build_obs_space() 244 | self._check_settings() 245 | self.seed() 246 | self.state = self.reset() 247 | 248 | class BinPackingBW1(BinPackingBW0): 249 | ''' 250 | Large Perfectly Packable Bin Packing Problem with Bounded Waste 251 | Env 
Registration: BinPacking-v5 252 | ''' 253 | def __init__(self, *args, **kwargs): 254 | super().__init__() 255 | self.bin_capacity = 100 256 | self.item_probs = [0, 0, 0, 1/3, 0, 0, 0, 0, 2/3] 257 | self.item_sizes = np.arange(1, 10) 258 | self.step_limit = 1000 259 | assign_env_config(self, kwargs) 260 | self._build_obs_space() 261 | self._check_settings() 262 | self.seed() 263 | self.state = self.reset() -------------------------------------------------------------------------------- /or_gym/envs/classic_or/vmpacking.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces, logger 4 | from gym.utils import seeding 5 | from or_gym.utils import assign_env_config 6 | import copy 7 | 8 | class VMPackingEnv(gym.Env): 9 | ''' 10 | Online VM Packing Problem 11 | 12 | The VM Packing Problem (VMPP) is a combinatorial optimization problem which 13 | requires the user to select from a series of physical machines (PM's) to 14 | send a virtual machine process to. Each VM process is characterized by 15 | two values, the memory and compute of the process. These are normalized 16 | by the PM capacities to range between 0-1. 17 | 18 | Observation: 19 | Type: Tuple, Discrete 20 | [0][:, 0]: Binary indicator for open PM's 21 | [0][:, 1]: CPU load of PM's 22 | [0][:, 2]: Memory load of PM's 23 | [1][0]: Current CPU demand 24 | [1][1]: Current memory demand 25 | 26 | Actions: 27 | Type: Discrete 28 | Integer of PM number to send VM to that PM 29 | 30 | Reward: 31 | Negative of the waste, which is the difference between the current 32 | size and excess space on the PM. 33 | 34 | Starting State: 35 | No open PM's and random starting item 36 | 37 | Episode Termination: 38 | When invalid action is selected, attempt to overload VM, or step 39 | limit is reached. 
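    Example (an illustrative sketch only, relying on the defaults set in
    __init__ below; because mask=True by default, the observation is a dict
    with "state", "action_mask", and "avail_actions" entries):

        import numpy as np
        from or_gym.envs.classic_or.vmpacking import VMPackingEnv

        env = VMPackingEnv()
        state = env.reset()
        # Choose any PM the mask marks as feasible for the current request
        action = int(np.argmax(state["action_mask"]))
        state, reward, done, info = env.step(action)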
40 | ''' 41 | def __init__(self, *args, **kwargs): 42 | self.cpu_capacity = 1 43 | self.mem_capacity = 1 44 | self.t_interval = 20 45 | self.tol = 1e-5 46 | self.step_limit = int(60 * 24 / self.t_interval) 47 | self.n_pms = 50 48 | self.load_idx = np.array([1, 2]) 49 | self.seed = 0 50 | self.mask = True 51 | assign_env_config(self, kwargs) 52 | self.action_space = spaces.Discrete(self.n_pms) 53 | 54 | if self.mask: 55 | self.observation_space = spaces.Dict({ 56 | "action_mask": spaces.Box(0, 1, shape=(self.n_pms,), dtype=np.uint8), 57 | "avail_actions": spaces.Box(0, 1, shape=(self.n_pms,), dtype=np.uint8), 58 | "state": spaces.Box(0, 1, shape=(self.n_pms+1, 3), dtype=np.float32) 59 | }) 60 | else: 61 | self.observation_space = spaces.Box(0, 1, shape=(self.n_pms+1, 3), dtype=np.float32) 62 | self.reset() 63 | 64 | def _RESET(self): 65 | self.demand = self.generate_demand() 66 | self.current_step = 0 67 | self.state = { 68 | "action_mask": np.ones(self.n_pms, dtype=np.uint8), 69 | "avail_actions": np.ones(self.n_pms, dtype=np.uint8), 70 | "state": np.vstack([ 71 | np.zeros((self.n_pms, 3)), 72 | self.demand[self.current_step]], 73 | dtype=np.float32) 74 | } 75 | self.assignment = {} 76 | return self.state 77 | 78 | def _STEP(self, action): 79 | done = False 80 | pm_state = self.state["state"][:-1] 81 | demand = self.state["state"][-1, 1:] 82 | 83 | if action < 0 or action >= self.n_pms: 84 | raise ValueError("Invalid action: {}".format(action)) 85 | 86 | elif any(pm_state[action, 1:] + demand > 1 + self.tol): 87 | # Demand doesn't fit into PM 88 | reward = -1000 89 | done = True 90 | else: 91 | if pm_state[action, 0] == 0: 92 | # Open PM if closed 93 | pm_state[action, 0] = 1 94 | pm_state[action, self.load_idx] += demand 95 | reward = np.sum(pm_state[:, 0] * (pm_state[:,1:].sum(axis=1) - 2)) 96 | self.assignment[self.current_step] = action 97 | 98 | self.current_step += 1 99 | if self.current_step >= self.step_limit: 100 | done = True 101 | self.update_state(pm_state) 102 | return self.state, reward, done, {} 103 | 104 | def update_state(self, pm_state): 105 | # Make action selection impossible if the PM would exceed capacity 106 | step = self.current_step if self.current_step < self.step_limit else self.step_limit-1 107 | data_center = np.vstack([pm_state, self.demand[step]], dtype=np.float32) 108 | data_center = np.where(data_center>1,1,data_center) # Fix rounding errors 109 | self.state["state"] = data_center 110 | self.state["action_mask"] = np.ones(self.n_pms, dtype=np.uint8) 111 | self.state["avail_actions"] = np.ones(self.n_pms, dtype=np.uint8) 112 | if self.mask: 113 | action_mask = (pm_state[:, 1:] + self.demand[step, 1:]) <= 1 114 | self.state["action_mask"] = (action_mask.sum(axis=1)==2).astype(np.uint8) 115 | 116 | def sample_action(self): 117 | return self.action_space.sample() 118 | 119 | def generate_demand(self): 120 | n = self.step_limit 121 | # From Azure data 122 | mem_probs = np.array([0.12 , 0.165, 0.328, 0.287, 0.064, 0.036]) 123 | mem_bins = np.array([0.02857143, 0.05714286, 0.11428571, 0.45714286, 0.91428571, 124 | 1.]) # Normalized bin sizes 125 | mu_cpu = 16.08 126 | sigma_cpu = 1.26 127 | cpu_demand = np.random.normal(loc=mu_cpu, scale=sigma_cpu, size=n) 128 | cpu_demand = np.where(cpu_demand<=0, mu_cpu, cpu_demand) # Ensure demand isn't negative 129 | mem_demand = np.random.choice(mem_bins, p=mem_probs, size=n) 130 | return np.vstack([np.arange(n)/n, cpu_demand/100, mem_demand]).T 131 | 132 | def step(self, action): 133 | return self._STEP(action) 134 | 135 | 
def reset(self): 136 | return self._RESET() 137 | 138 | class TempVMPackingEnv(VMPackingEnv): 139 | ''' 140 | Online Temporary VM Packing Problem 141 | 142 | The VM Packing Problem (VMPP) is a combinatorial optimization problem which 143 | requires the user to select from a series of physical machines (PM's) to 144 | send a virtual machine process to. Each VM process is characterized by 145 | two values, the memory and compute of the process. These are normalized 146 | by the PM capacities to range between 0-1. 147 | 148 | Observation: 149 | Type: Tuple, Discrete 150 | [0][:, 0]: Binary indicator for open PM's 151 | [0][:, 1]: CPU load of PM's 152 | [0][:, 2]: Memory load of PM's 153 | [1][0]: Current CPU demand 154 | [1][1]: Current memory demand 155 | 156 | Actions: 157 | Type: Discrete 158 | Integer of PM number to send VM to that PM 159 | 160 | Reward: 161 | Negative of the waste, which is the difference between the current 162 | size and excess space on the PM. 163 | 164 | Starting State: 165 | No open PM's and random starting item 166 | 167 | Episode Termination: 168 | When invalid action is selected, attempt to overload VM, or step 169 | limit is reached. 170 | ''' 171 | def __init__(self, *args, **kwargs): 172 | super().__init__() 173 | self.state = self.reset() 174 | 175 | def step(self, action): 176 | done = False 177 | pm_state = self.state["state"][:-1] 178 | demand = self.state["state"][-1, 1:] 179 | 180 | if action < 0 or action >= self.n_pms: 181 | raise ValueError("Invalid action: {}".format(action)) 182 | 183 | elif any(pm_state[action, 1:] + demand > 1 + self.tol): 184 | # Demand doesn't fit into PM 185 | reward = -1000 186 | done = True 187 | else: 188 | if pm_state[action, 0] == 0: 189 | # Open PM if closed 190 | pm_state[action, 0] = 1 191 | pm_state[action, self.load_idx] += demand 192 | reward = np.sum(pm_state[:, 0] * (pm_state[:,1:].sum(axis=1) - 2)) 193 | self.assignment[self.current_step] = action 194 | 195 | # Remove processes 196 | if self.current_step in self.durations.values(): 197 | for process in self.durations.keys(): 198 | # Remove process from PM 199 | if self.durations[process] == self.current_step: 200 | pm = self.assignment[process] # Find PM where process was assigned 201 | pm_state[pm, self.load_idx] -= self.demand[process] 202 | # Shut down PM's if state is 0 203 | if pm_state[pm, self.load_idx].sum() == 0: 204 | pm_state[pm, 0] = 0 205 | 206 | self.current_step += 1 207 | if self.current_step >= self.step_limit: 208 | done = True 209 | self.update_state(pm_state) 210 | return self.state, reward, done, {} 211 | 212 | def update_state(self, pm_state): 213 | # Make action selection impossible if the PM would exceed capacity 214 | step = self.current_step if self.current_step < self.step_limit else self.step_limit-1 215 | data_center = np.vstack([pm_state, self.demand[step]]) 216 | data_center = np.where(data_center>1,1,data_center) # Fix rounding errors 217 | self.state["state"] = data_center 218 | self.state["action_mask"] = np.ones(self.n_pms) 219 | self.state["avail_actions"] = np.ones(self.n_pms) 220 | if self.mask: 221 | action_mask = (pm_state[:, 1:] + self.demand[step, 1:]) <= 1 222 | self.state["action_mask"] = (action_mask.sum(axis=1)==2).astype(int) 223 | 224 | def _RESET(self): 225 | self.current_step = 0 226 | self.assignment = {} 227 | self.demand = self.generate_demand() 228 | self.durations = generate_durations(self.demand) 229 | self.state = (np.zeros((self.n_pms, 3)), self.demand[0]) 230 | return self.state 231 | 232 | def step(self, 
action):
233 |         return self._STEP(action)
234 | 
235 |     def reset(self):
236 |         return self._RESET()
237 | 
238 | def generate_durations(demand):
239 |     # duration_params = np.array([ 6.53563303e-02, 5.16222242e+01, 4.05028032e+06, -4.04960880e+06])
240 |     return {i: np.random.randint(low=i+1, high=len(demand)+1)
241 |         for i, j in enumerate(demand)}
242 | 
243 | def gaussian_model(params, x):
244 |     return params[2] * np.exp(-0.5*((x - params[0]) / params[1]) ** 2) + params[3]
--------------------------------------------------------------------------------
/or_gym/envs/classic_or/tsp.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import gym
3 | from gym import spaces
4 | from or_gym import utils
5 | from copy import copy, deepcopy
6 | import matplotlib.pyplot as plt
7 | 
8 | class TSPEnv(gym.Env):
9 |     '''
10 |     Bi-directional connections and uniform cost
11 | 
12 |     This version of the TSP uses a sparse graph with uniform cost.
13 |     The goal is to minimize the cost to traverse all of the nodes in the
14 |     network. All connections are bi-directional meaning if a connection
15 |     between nodes n and m exists, then the agent can move in either direction.
16 |     The network is randomly generated with N nodes when the environment is
17 |     initialized using or_gym.make().
18 | 
19 |     TSP-v0 allows repeat visits to nodes with no additional penalty beyond
20 |     the nominal movement cost.
21 | 
22 |     Observation:
23 |         Type: Box; element 0 is the current node, elements 1+ are the flattened connection matrix (1: connection to an unvisited node, -1: connection to a visited node, 0: no connection)
24 | 
25 |     Actions:
26 |         Type: Discrete
27 |         0: move to node 0
28 |         1: move to node 1
29 |         2: ...
30 | 
31 |     Action Masking (optional):
32 |         Masks non-existent connections, otherwise a large penalty is imposed
33 |         on the agent.
34 | 
35 |     Reward:
36 |         Cost of moving from node to node or large negative penalty for
37 |         attempting to move to a node via a non-existent connection.
38 | 
39 |     Starting State:
40 |         Random node
41 | 
42 |     Episode Termination:
43 |         All nodes have been visited or the maximum number of steps (2N)
44 |         have been reached.
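    Example (an illustrative sketch only, using the defaults assigned in
    __init__ below; with mask=False the observation is a flat array):

        from or_gym.envs.classic_or.tsp import TSPEnv

        env = TSPEnv()               # N=50 nodes, sparse random graph
        state = env.reset()
        # Valid actions are the neighbours of the current node; any other
        # choice is penalised with invalid_action_cost (-100)
        action = int(env.node_dict[env.current_node][0])
        state, reward, done, info = env.step(action)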
45 | ''' 46 | def __init__(self, *args, **kwargs): 47 | self.N = 50 48 | self.move_cost = -1 49 | self.invalid_action_cost = -100 50 | self.mask = False 51 | utils.assign_env_config(self, kwargs) 52 | 53 | self.nodes = np.arange(self.N) 54 | self.step_limit = 2*self.N 55 | self.obs_dim = 1+self.N**2 56 | obs_space = spaces.Box(-1, self.N, shape=(self.obs_dim,), dtype=np.int32) 57 | if self.mask: 58 | self.observation_space = spaces.Dict({ 59 | "action_mask": spaces.Box(0, 1, shape=(self.N,), dtype=np.int8), 60 | "avail_actions": spaces.Box(0, 1, shape=(self.N,), dtype=np.int8), 61 | "state": obs_space 62 | }) 63 | else: 64 | self.observation_space = obs_space 65 | self.action_space = spaces.Discrete(self.N) 66 | 67 | self.reset() 68 | 69 | def _STEP(self, action): 70 | done = False 71 | connections = self.node_dict[self.current_node] 72 | # Invalid action 73 | if action not in connections: 74 | reward = self.invalid_action_cost 75 | # Move to new node 76 | else: 77 | self.current_node = action 78 | reward = self.move_cost 79 | self.visit_log[self.current_node] += 1 80 | 81 | self.state = self._update_state() 82 | self.step_count += 1 83 | # See if all nodes have been visited 84 | unique_visits = sum([1 if v > 0 else 0 85 | for v in self.visit_log.values()]) 86 | if unique_visits >= self.N: 87 | done = True 88 | reward += 1000 89 | if self.step_count >= self.step_limit: 90 | done = True 91 | 92 | return self.state, reward, done, {} 93 | 94 | def _RESET(self): 95 | self.step_count = 0 96 | self._generate_connections() 97 | self.current_node = np.random.choice(self.nodes) 98 | self.visit_log = {n: 0 for n in self.nodes} 99 | self.visit_log[self.current_node] += 1 100 | 101 | self.state = self._update_state() 102 | return self.state 103 | 104 | def _update_state(self): 105 | node_connections = self.adjacency_matrix.copy() 106 | # Set value to 1 for existing, un-visited nodes 107 | # Set value to -1 for existing, visited nodes 108 | # Set value to 0 if connection doesn't exist 109 | visited = np.array([bool(min(v, 1)) 110 | for v in self.visit_log.values()]) 111 | node_connections[:, visited] = -1 112 | node_connections[np.where(self.adjacency_matrix==0)] = 0 113 | 114 | connections = node_connections.flatten().astype(int) 115 | obs = np.hstack([self.current_node, connections], dtype=np.int32) 116 | if self.mask: 117 | mask = node_connections[self.current_node] 118 | # mask = np.array([1 if c==1 and v==0 else 0 119 | # for c, v in zip(cons_from_node, self.visit_log.values())]) 120 | state = { 121 | "action_mask": mask, 122 | "avail_actions": np.ones(self.N, dtype=np.uint8), 123 | "state": obs, 124 | } 125 | else: 126 | state = obs.copy() 127 | 128 | return state 129 | 130 | def _generate_connections(self): 131 | node_dict = {} 132 | for n in range(self.N): 133 | connections = np.random.randint(2, self.N - 1) 134 | node_dict[n] = np.sort( 135 | np.random.choice(self.nodes[np.where(self.nodes!=n)], 136 | size=connections, replace=False)) 137 | # Get unique, bi-directional connections 138 | for k, v in node_dict.items(): 139 | for k1, v1 in node_dict.items(): 140 | if k == k1: 141 | continue 142 | if k in v1 and k1 not in v: 143 | v = np.append(v, k1) 144 | 145 | node_dict[k] = np.sort(v.copy()) 146 | self.node_dict = deepcopy(node_dict) 147 | self._generate_adjacency_matrix() 148 | 149 | def _generate_adjacency_matrix(self): 150 | self.adjacency_matrix = np.zeros((self.N, self.N)) 151 | for k, v in self.node_dict.items(): 152 | self.adjacency_matrix[k][v] += 1 153 | 
self.adjacency_matrix.astype(int) 154 | 155 | def _generate_coordinates(self): 156 | n = np.linspace(0, 2*np.pi, self.N+1) 157 | x = np.cos(n) 158 | y = np.sin(n) 159 | return np.vstack([x, y]) 160 | 161 | def _get_node_distance(self, N0, N1): 162 | return np.sqrt(np.power(N0[0] - N1[0], 2) + np.power(N0[1] - N1[1], 2)) 163 | 164 | def plot_network(self, offset=(0.02, 0.02)): 165 | coords = self._generate_coordinates() 166 | fig, ax = plt.subplots(figsize=(12,8)) 167 | ax.scatter(coords[0], coords[1], s=40) 168 | for n, c in self.node_dict.items(): 169 | for k in c: 170 | line = np.vstack([coords[:, n], coords[:, k]]) 171 | dis = self._get_node_distance(line[0], line[1]) 172 | # dis = np.sqrt(np.power(line[0, 0] - line[1, 0], 2) + 173 | # np.power(line[0, 1] - line[1, 1], 2)) 174 | ax.plot(line[:,0], line[:,1], c='g', zorder=-1) 175 | # ax.arrow(line[0, 0], line[0, 1], line[1, 0], line[1, 1]) 176 | ax.annotate(r"$N_{:d}$".format(n), xy=(line[0]+offset), zorder=2) 177 | ax.xaxis.set_visible(False) 178 | ax.yaxis.set_visible(False) 179 | plt.show() 180 | 181 | def step(self, action): 182 | return self._STEP(action) 183 | 184 | def reset(self): 185 | return self._RESET() 186 | 187 | class TSPDistCost(TSPEnv): 188 | ''' 189 | Fully connected network with distance-based cost. 190 | 191 | This environment enables travel between all nodes in the network and 192 | incurs cost based on the Euclidean distance between nodes. The goal is to 193 | minimize the cost to traverse all of the nodes in the network exactly 194 | once. The agent incurs a large penalty and ends the episode if it moves to 195 | a node more than once. All connections are bi-directional meaning if a 196 | connection between nodes n and m exist, then the agent can move in either 197 | direction. The network is randomly generated with N nodes when the 198 | environment is initialized using or_gym.make(). 199 | 200 | Observation: 201 | Type: Box 202 | 0: Current Node 203 | 1: 0 or 1 if node 0 has been visited or not 204 | 2: 0 or 1 if node 1 has been visited or not 205 | 3: ... 206 | 207 | Actions: 208 | Type: Discrete 209 | 0: move to node 0 210 | 1: move to node 1 211 | 2: ... 212 | 213 | Action Masking (optional): 214 | Masks visited nodes. 215 | 216 | Reward: 217 | Cost of moving from node to node. 218 | 219 | Starting State: 220 | Random node 221 | 222 | Episode Termination: 223 | All nodes have been visited or a node has been visited again. 
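    Example (an illustrative sketch only, not part of the library: a greedy
    nearest-neighbour baseline built from the distance_matrix and visit_log
    attributes defined below):

        import numpy as np
        from or_gym.envs.classic_or.tsp import TSPDistCost

        env = TSPDistCost()                  # N=50 fully connected nodes
        state = env.reset()
        done, total_cost = False, 0.0
        while not done:
            # Move to the closest node that has not been visited yet
            dists = env.distance_matrix[env.current_node].copy()
            dists[env.visit_log > 0] = np.inf
            action = int(np.argmin(dists))
            state, reward, done, _ = env.step(action)
            total_cost += reward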
224 | ''' 225 | def __init__(self, *args, **kwargs): 226 | self.N = 50 227 | self.invalid_action_cost = -100 228 | self.mask = False 229 | utils.assign_env_config(self, kwargs) 230 | self.nodes = np.arange(self.N) 231 | self.coords = self._generate_coordinates() 232 | self.distance_matrix = self._get_distance_matrix() 233 | 234 | self.obs_dim = 1+self.N 235 | obs_space = spaces.Box(-1, self.N, shape=(self.obs_dim,), dtype=np.int32) 236 | if self.mask: 237 | self.observation_space = spaces.Dict({ 238 | "action_mask": spaces.Box(0, 1, shape=(self.N,), dtype=np.int8), 239 | "avail_actions": spaces.Box(0, 1, shape=(self.N,), dtype=np.int8), 240 | "state": obs_space 241 | }) 242 | else: 243 | self.observation_space = obs_space 244 | 245 | self.action_space = spaces.Discrete(self.N) 246 | 247 | self.reset() 248 | 249 | def _STEP(self, action): 250 | done = False 251 | if self.visit_log[action] > 0: 252 | # Node already visited 253 | reward = self.invalid_action_cost 254 | done = True 255 | else: 256 | reward = self.distance_matrix[self.current_node, action] 257 | self.current_node = action 258 | self.visit_log[self.current_node] = 1 259 | 260 | self.state = self._update_state() 261 | # See if all nodes have been visited 262 | unique_visits = self.visit_log.sum() 263 | if unique_visits == self.N: 264 | done = True 265 | 266 | return self.state, reward, done, {} 267 | 268 | def _RESET(self): 269 | self.step_count = 0 270 | self.current_node = np.random.choice(self.nodes) 271 | self.visit_log = np.zeros(self.N) 272 | self.visit_log[self.current_node] += 1 273 | 274 | self.state = self._update_state() 275 | return self.state 276 | 277 | def _generate_coordinates(self): 278 | return np.vstack([np.random.rand(self.N), np.random.rand(self.N)]) 279 | 280 | def _get_distance_matrix(self): 281 | # Distance matrix 282 | distance_matrix = np.zeros((self.N, self.N)) 283 | for i in range(self.N): 284 | # Take advantage of symmetrical matrix 285 | for j in range(self.N): 286 | if j <= i: 287 | continue 288 | d = self._get_node_distance(self.coords[:, i], self.coords[:, j]) 289 | distance_matrix[i, j] += d 290 | 291 | distance_matrix += distance_matrix.T 292 | return distance_matrix 293 | 294 | def _update_state(self): 295 | mask = np.where(self.visit_log==0, 0 , 1) 296 | obs = np.hstack([self.current_node, mask]) 297 | if self.mask: 298 | state = { 299 | "avail_actions": np.ones(self.N), 300 | "action_mask": mask, 301 | "state": obs 302 | } 303 | else: 304 | state = obs.copy() 305 | return state 306 | 307 | def step(self, action): 308 | return self._STEP(action) 309 | 310 | def reset(self): 311 | return self._RESET() 312 | -------------------------------------------------------------------------------- /examples/ray_rllib_knapsack.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "973f2613", 7 | "metadata": { 8 | "scrolled": true 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "import ray\n", 13 | "from ray.rllib import agents\n", 14 | "from ray import tune\n", 15 | "from ray.rllib.models import ModelCatalog\n", 16 | "from ray.rllib.models.tf.tf_modelv2 import TFModelV2\n", 17 | "from ray.rllib.models.tf.fcnet import FullyConnectedNetwork\n", 18 | "from ray.rllib.utils import try_import_tf\n", 19 | "from gym import spaces\n", 20 | "import or_gym\n", 21 | "from or_gym.utils import create_env\n", 22 | "import numpy as np\n", 23 | "import matplotlib.pyplot as plt\n", 24 | "import 
pandas as pd" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "id": "4201d91c", 30 | "metadata": {}, 31 | "source": [ 32 | "# Prepare Tensforflow and ray" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "id": "2ac8ec32", 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "# Establish Tensorflow API conncetion\n", 43 | "tf_api, tf_original, tf_version = try_import_tf(error = True) \n", 44 | "# Disable callback synch on Windows\n", 45 | "TUNE_DISABLE_AUTO_CALLBACK_SYNCER=1" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "id": "6acd05db", 51 | "metadata": {}, 52 | "source": [ 53 | "# Knapsack environment with action masking" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "id": "15bc4f2b", 59 | "metadata": {}, 60 | "source": [ 61 | "Class definition: customized Tensorflow-2-model for OR-Gym knapsack envrionemnt with action masking" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "id": "a2b629ed", 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "class KP0ActionMaskModel(TFModelV2):\n", 72 | " \n", 73 | " def __init__(self, obs_space, action_space, num_outputs,\n", 74 | " model_config, name, true_obs_shape=(11,),\n", 75 | " action_embed_size=5, *args, **kwargs):\n", 76 | " \n", 77 | " # true_obs_shape is going to match the size of the state. \n", 78 | " # If we stick with our reduced KP, that will be a vector with 11 entries. \n", 79 | " # The other value we need to provide is the action_embed_size, which is going to be the size of our action space (5)\n", 80 | " \n", 81 | " super(KP0ActionMaskModel, self).__init__(obs_space,\n", 82 | " action_space, num_outputs, model_config, name, \n", 83 | " *args, **kwargs)\n", 84 | " \n", 85 | " self.action_embed_model = FullyConnectedNetwork(\n", 86 | " spaces.Box(0, 1, shape=true_obs_shape), \n", 87 | " action_space, action_embed_size,\n", 88 | " model_config, name + \"_action_embedding\")\n", 89 | " self.register_variables(self.action_embed_model.variables())\n", 90 | " \n", 91 | " def forward(self, input_dict, state, seq_lens):\n", 92 | " \n", 93 | " # The actual masking takes place in the forward method where we unpack the mask, actions, and state from \n", 94 | " # the observation dictionary provided by our environment. The state yields our action embeddings which gets \n", 95 | " # combined with our mask to provide logits with the smallest value we can provide. 
\n", 96 | " # This will get passed to a softmax output which will reduce the probability of selecting these actions to 0, \n", 97 | " # effectively blocking the agent from ever taking these illegal actions.\n", 98 | " \n", 99 | " avail_actions = input_dict[\"obs\"][\"avail_actions\"]\n", 100 | " action_mask = input_dict[\"obs\"][\"action_mask\"]\n", 101 | " action_embedding, _ = self.action_embed_model({\n", 102 | " \"obs\": input_dict[\"obs\"][\"state\"]})\n", 103 | " intent_vector = tf_api.expand_dims(action_embedding, 1)\n", 104 | " action_logits = tf_api.reduce_sum(avail_actions * intent_vector, axis=1)\n", 105 | " inf_mask = tf_api.maximum(tf_api.log(action_mask), tf_api.float32.min)\n", 106 | " return action_logits + inf_mask, state\n", 107 | " \n", 108 | " def value_function(self):\n", 109 | " return self.action_embed_model.value_function()" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "id": "4246a847", 115 | "metadata": {}, 116 | "source": [ 117 | "Environment creation" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "id": "9d15617a", 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "# Configuration for gym environment\n", 128 | "env_config = {'N': 5,\n", 129 | " 'max_weight': 15,\n", 130 | " 'item_weights': np.array([1, 12, 2, 1, 4]),\n", 131 | " 'item_values': np.array([2, 4, 2, 1, 10]),\n", 132 | " 'mask': True}\n", 133 | " \n", 134 | "env_name = 'Knapsack-v0'\n", 135 | "env = or_gym.make('Knapsack-v0', env_config=env_config)\n", 136 | " \n", 137 | "print(\"Max weight capacity:\\t{}kg\".format(env.max_weight))\n", 138 | "print(\"Number of items:\\t{}\".format(env.N))" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "id": "f3710cae", 144 | "metadata": {}, 145 | "source": [ 146 | "Create Rllib trainable instance" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "id": "b78cf89e", 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "# Register the model for Rllib usage\n", 157 | "ModelCatalog.register_custom_model('kp_mask', KP0ActionMaskModel)\n", 158 | "# Register the environment, so that we have a Trainable instance later\n", 159 | "# ATTENTION: Tune needs the base class, not an instance of the environment like we get from or_gym.make(env_name) to work with. 
So we need to pass this to register_env using a lambda function as shown below.\n", 160 | "env = create_env(env_name)\n", 161 | "tune.register_env(env_name, lambda env_name: env(env_name, env_config=env_config))\n", 162 | "\n", 163 | "trainer_config = {\n", 164 | " \"model\": {\n", 165 | " \"custom_model\": \"kp_mask\" # Here we must use the custom model name taken in register process before\n", 166 | " },\n", 167 | " \"env_config\": env_config, # env config from (or_)gym\n", 168 | " #\"framework\" : \"tfe\" # tip by rllib to enable TensorFlow eager exection\n", 169 | " }\n", 170 | "\n", 171 | "# ray.shutdown() maybe necessary in case of blocking instance\n", 172 | "ray.init( ignore_reinit_error = True )\n", 173 | "trainer = agents.ppo.PPOTrainer(env='Knapsack-v0', config=trainer_config)" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "id": "65e87dff", 179 | "metadata": {}, 180 | "source": [ 181 | "Train the agent\n", 182 | "\n" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "id": "2da600b2", 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "env = trainer.env_creator('Knapsack-v0')\n", 193 | "state = env.state\n", 194 | "\n", 195 | "# Use the action masking to disable the agent to take specific actions, i.e. to avoid taking element in knapsack by index\n", 196 | "# state['action_mask'][0] = 0\n", 197 | "\n", 198 | "# Train an agent for 1000 states \n", 199 | "actions = np.array([trainer.compute_single_action(state) for i in range(10000)])\n", 200 | "\n", 201 | "# If action masking used, check that this action was never taken\n", 202 | "# print(any(actions==0))" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "id": "8848f472", 208 | "metadata": {}, 209 | "source": [ 210 | "# Tuning hyperparameters" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "id": "b07a0cb3", 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "# Use tune for hyperparameter tuning\n", 221 | "tune_config = {\n", 222 | " 'env': 'Knapsack-v0'\n", 223 | "}\n", 224 | "stop = {\n", 225 | " 'timesteps_total': 10000\n", 226 | "}\n", 227 | "results = tune.run(\n", 228 | " 'PPO', # Specify the algorithm to train\n", 229 | " config=tune_config,\n", 230 | " stop=stop\n", 231 | ") " 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "id": "8e31e7ab", 237 | "metadata": {}, 238 | "source": [ 239 | "Plot the results" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "id": "817660cf", 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "colors = plt.rcParams['axes.prop_cycle'].by_key()['color']\n", 250 | "df = results.dataframe()\n", 251 | "# Get column for total loss, policy loss, and value loss\n", 252 | "tl_col = [i for i, j in enumerate(df.columns)\n", 253 | " if 'total_loss' in j][0]\n", 254 | "pl_col = [i for i, j in enumerate(df.columns)\n", 255 | " if 'policy_loss' in j][0]\n", 256 | "vl_col = [i for i, j in enumerate(df.columns)\n", 257 | " if 'vf_loss' in j][0]\n", 258 | "labels = []\n", 259 | "fig, ax = plt.subplots(2, 2, figsize=(15, 15), sharex=True)\n", 260 | "for i, path in df['logdir'].iteritems():\n", 261 | " data = pd.read_csv(path + '/progress.csv')\n", 262 | " # Get labels for legend\n", 263 | " lr = data['experiment_id'][0]\n", 264 | " layers = data['training_iteration'][0]\n", 265 | " labels.append('LR={}; Shared Layers={}'.format(lr, layers))\n", 266 | " \n", 267 | " ax[0, 0].plot(data['timesteps_total'], 
\n", 268 | " data['episode_reward_mean'], c=colors[i],\n", 269 | " label=labels[-1])\n", 270 | " \n", 271 | " ax[0, 1].plot(data['timesteps_total'], \n", 272 | " data.iloc[:, tl_col], c=colors[i],\n", 273 | " label=labels[-1])\n", 274 | " \n", 275 | " ax[1, 0].plot(data['timesteps_total'], \n", 276 | " data.iloc[:, pl_col], c=colors[i],\n", 277 | " label=labels[-1])\n", 278 | " \n", 279 | " \n", 280 | " ax[1, 1].plot(data['timesteps_total'], \n", 281 | " data.iloc[:, vl_col], c=colors[i],\n", 282 | " label=labels[-1])\n", 283 | " \n", 284 | "ax[0, 0].set_ylabel('Mean Rewards')\n", 285 | "ax[0, 0].set_title('Training Rewards by Time Step')\n", 286 | "ax[0, 0].legend(labels=labels, loc='upper center',\n", 287 | " ncol=3, bbox_to_anchor=[0.75, 1.2])\n", 288 | " \n", 289 | " \n", 290 | "ax[0, 1].set_title('Total Loss by Time Step')\n", 291 | "ax[0, 1].set_ylabel('Total Loss')\n", 292 | "ax[0, 1].set_xlabel('Training Episodes')\n", 293 | " \n", 294 | "ax[1, 0].set_title('Policy Loss by Time Step')\n", 295 | "ax[1, 0].set_ylabel('Policy Loss')\n", 296 | "ax[1, 0].set_xlabel('Time Step')\n", 297 | " \n", 298 | "ax[1, 1].set_title('Value Loss by Time Step')\n", 299 | "ax[1, 1].set_ylabel('Value Loss')\n", 300 | "ax[1, 1].set_xlabel('Time Step')\n", 301 | " \n", 302 | "plt.show()" 303 | ] 304 | } 305 | ], 306 | "metadata": { 307 | "kernelspec": { 308 | "display_name": "or-gym", 309 | "language": "python", 310 | "name": "or-gym" 311 | }, 312 | "language_info": { 313 | "codemirror_mode": { 314 | "name": "ipython", 315 | "version": 3 316 | }, 317 | "file_extension": ".py", 318 | "mimetype": "text/x-python", 319 | "name": "python", 320 | "nbconvert_exporter": "python", 321 | "pygments_lexer": "ipython3", 322 | "version": "3.8.12" 323 | } 324 | }, 325 | "nbformat": 4, 326 | "nbformat_minor": 5 327 | } 328 | -------------------------------------------------------------------------------- /examples/inv-management-quickstart.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "937597e4", 6 | "metadata": {}, 7 | "source": [ 8 | "# How to Use Deep Reinforcement Learning to Improve your Supply Chain\n", 9 | "\n", 10 | "Full write up available [here](https://www.datahubbs.com/how-to-use-deep-reinforcement-learning-to-improve-your-supply-chain/).\n", 11 | "\n", 12 | "Note Ray is not a dependency of OR-Gym. We want OR-Gym to be able to stand independently of other RL libraries as much as possible.\n", 13 | "\n", 14 | "There have been breaking changes that have been introduced in later version of Ray which affect this environment in particular. 
To ensure no conflicts, please run:\n", 15 | "- `pip install ray==1.0.0`\n", 16 | "- `pip install ray[rllib]`\n", 17 | "- `pip install ray[tune]`\n", 18 | "- `pip install tensorflow==2.3.0`\n", 19 | "\n" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 1, 25 | "id": "fefefc51", 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "name": "stdout", 30 | "output_type": "stream", 31 | "text": [ 32 | "WARNING:tensorflow:From /home/christian/anaconda3/envs/or-gym-dev/lib/python3.8/site-packages/tensorflow/python/compat/v2_compat.py:96: disable_resource_variables (from tensorflow.python.ops.variable_scope) is deprecated and will be removed in a future version.\n", 33 | "Instructions for updating:\n", 34 | "non-resource variables are not supported in the long term\n" 35 | ] 36 | } 37 | ], 38 | "source": [ 39 | "import or_gym\n", 40 | "from or_gym.utils import create_env\n", 41 | "import ray\n", 42 | "from ray.rllib.agents.ppo import PPOTrainer\n", 43 | "from ray import tune\n", 44 | "import numpy as np\n", 45 | "import matplotlib.pyplot as plt\n", 46 | "from matplotlib import gridspec" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 2, 52 | "id": "40fa580e", 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "def register_env(env_name, env_config={}):\n", 57 | " env = create_env(env_name)\n", 58 | " tune.register_env(env_name, \n", 59 | " lambda env_name: env(env_name,\n", 60 | " env_config=env_config))\n", 61 | "\n", 62 | "# Environment and RL Configuration Settings\n", 63 | "env_name = 'InvManagement-v1'\n", 64 | "# env_name = \"Knapsack-v0\"\n", 65 | "env_config = {} # Change environment parameters here\n", 66 | "rl_config = dict(\n", 67 | " env=env_name,\n", 68 | " num_workers=2,\n", 69 | " env_config=env_config,\n", 70 | " model=dict(\n", 71 | " vf_share_layers=False,\n", 72 | " fcnet_activation='elu',\n", 73 | " fcnet_hiddens=[256, 256]\n", 74 | " ),\n", 75 | " lr=1e-5\n", 76 | ")\n", 77 | " \n", 78 | "# Register environment\n", 79 | "register_env(env_name, env_config)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 3, 85 | "id": "ea13304f", 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "name": "stderr", 90 | "output_type": "stream", 91 | "text": [ 92 | "2022-09-02 10:53:41,358\tINFO services.py:1164 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265\u001b[39m\u001b[22m\n", 93 | "2022-09-02 10:53:44,394\tINFO trainer.py:591 -- Tip: set framework=tfe or the --eager flag to enable TensorFlow eager execution\n", 94 | "2022-09-02 10:53:44,398\tINFO trainer.py:616 -- Current log_level is WARN. 
For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.\n", 95 | "\u001b[2m\u001b[36m(pid=9662)\u001b[0m WARNING:tensorflow:From /home/christian/anaconda3/envs/or-gym-dev/lib/python3.8/site-packages/tensorflow/python/compat/v2_compat.py:96: disable_resource_variables (from tensorflow.python.ops.variable_scope) is deprecated and will be removed in a future version.\n", 96 | "\u001b[2m\u001b[36m(pid=9662)\u001b[0m Instructions for updating:\n", 97 | "\u001b[2m\u001b[36m(pid=9662)\u001b[0m non-resource variables are not supported in the long term\n", 98 | "\u001b[2m\u001b[36m(pid=9660)\u001b[0m WARNING:tensorflow:From /home/christian/anaconda3/envs/or-gym-dev/lib/python3.8/site-packages/tensorflow/python/compat/v2_compat.py:96: disable_resource_variables (from tensorflow.python.ops.variable_scope) is deprecated and will be removed in a future version.\n", 99 | "\u001b[2m\u001b[36m(pid=9660)\u001b[0m Instructions for updating:\n", 100 | "\u001b[2m\u001b[36m(pid=9660)\u001b[0m non-resource variables are not supported in the long term\n", 101 | "2022-09-02 10:54:04,675\tINFO trainable.py:252 -- Trainable.setup took 20.284 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.\n", 102 | "2022-09-02 10:54:04,677\tWARNING util.py:39 -- Install gputil for GPU system monitoring.\n" 103 | ] 104 | }, 105 | { 106 | "name": "stdout", 107 | "output_type": "stream", 108 | "text": [ 109 | "WARNING:tensorflow:From /home/christian/anaconda3/envs/or-gym-dev/lib/python3.8/site-packages/ray/rllib/policy/tf_policy.py:872: Variable.load (from tensorflow.python.ops.variables) is deprecated and will be removed in a future version.\n", 110 | "Instructions for updating:\n", 111 | "Prefer Variable.assign which has equivalent behavior in 2.X.\n" 112 | ] 113 | }, 114 | { 115 | "name": "stderr", 116 | "output_type": "stream", 117 | "text": [ 118 | "\u001b[2m\u001b[36m(pid=9662)\u001b[0m WARNING:tensorflow:From /home/christian/anaconda3/envs/or-gym-dev/lib/python3.8/site-packages/ray/rllib/policy/tf_policy.py:872: Variable.load (from tensorflow.python.ops.variables) is deprecated and will be removed in a future version.\n", 119 | "\u001b[2m\u001b[36m(pid=9662)\u001b[0m Instructions for updating:\n", 120 | "\u001b[2m\u001b[36m(pid=9662)\u001b[0m Prefer Variable.assign which has equivalent behavior in 2.X.\n", 121 | "\u001b[2m\u001b[36m(pid=9660)\u001b[0m WARNING:tensorflow:From /home/christian/anaconda3/envs/or-gym-dev/lib/python3.8/site-packages/ray/rllib/policy/tf_policy.py:872: Variable.load (from tensorflow.python.ops.variables) is deprecated and will be removed in a future version.\n", 122 | "\u001b[2m\u001b[36m(pid=9660)\u001b[0m Instructions for updating:\n", 123 | "\u001b[2m\u001b[36m(pid=9660)\u001b[0m Prefer Variable.assign which has equivalent behavior in 2.X.\n" 124 | ] 125 | }, 126 | { 127 | "name": "stdout", 128 | "output_type": "stream", 129 | "text": [ 130 | "Iter: 100\tReward: 233.14" 131 | ] 132 | } 133 | ], 134 | "source": [ 135 | "# Initialize Ray and Build Agent\n", 136 | "ray.init(ignore_reinit_error=True)\n", 137 | "agent = PPOTrainer(env=env_name,\n", 138 | " config=rl_config)\n", 139 | " \n", 140 | "results = []\n", 141 | "for i in range(500):\n", 142 | " res = agent.train()\n", 143 | " results.append(res)\n", 144 | " if (i+1) % 5 == 0:\n", 145 | " print('\\rIter: {}\\tReward: {:.2f}'.format(\n", 146 | " i+1, res['episode_reward_mean']), end='')\n", 147 | "ray.shutdown()" 148 | ] 149 | }, 150 | { 151 | 
"cell_type": "code", 152 | "execution_count": null, 153 | "id": "793e41cf", 154 | "metadata": {}, 155 | "outputs": [ 156 | { 157 | "data": { 158 | "text/plain": [ 159 | "Box(3,)" 160 | ] 161 | }, 162 | "execution_count": 5, 163 | "metadata": {}, 164 | "output_type": "execute_result" 165 | } 166 | ], 167 | "source": [ 168 | "# Unpack values from each iteration\n", 169 | "rewards = np.hstack([i['hist_stats']['episode_reward'] \n", 170 | " for i in results])\n", 171 | "pol_loss = [\n", 172 | " i['info']['learner']['default_policy']['policy_loss'] \n", 173 | " for i in results]\n", 174 | "vf_loss = [\n", 175 | " i['info']['learner']['default_policy']['vf_loss'] \n", 176 | " for i in results]\n", 177 | "p = 100\n", 178 | "mean_rewards = np.array([np.mean(rewards[i-p:i+1]) \n", 179 | " if i >= p else np.mean(rewards[:i+1]) \n", 180 | " for i, _ in enumerate(rewards)])\n", 181 | "std_rewards = np.array([np.std(rewards[i-p:i+1])\n", 182 | " if i >= p else np.std(rewards[:i+1])\n", 183 | " for i, _ in enumerate(rewards)])\n", 184 | "fig = plt.figure(constrained_layout=True, figsize=(20, 10))\n", 185 | "gs = fig.add_gridspec(2, 4)\n", 186 | "ax0 = fig.add_subplot(gs[:, :-2])\n", 187 | "ax0.fill_between(np.arange(len(mean_rewards)), \n", 188 | " mean_rewards - std_rewards, \n", 189 | " mean_rewards + std_rewards, \n", 190 | " label='Standard Deviation', alpha=0.3)\n", 191 | "ax0.plot(mean_rewards, label='Mean Rewards')\n", 192 | "ax0.set_ylabel('Rewards')\n", 193 | "ax0.set_xlabel('Episode')\n", 194 | "ax0.set_title('Training Rewards')\n", 195 | "ax0.legend()\n", 196 | "ax1 = fig.add_subplot(gs[0, 2:])\n", 197 | "ax1.plot(pol_loss)\n", 198 | "ax1.set_ylabel('Loss')\n", 199 | "ax1.set_xlabel('Iteration')\n", 200 | "ax1.set_title('Policy Loss')\n", 201 | "ax2 = fig.add_subplot(gs[1, 2:])\n", 202 | "ax2.plot(vf_loss)\n", 203 | "ax2.set_ylabel('Loss')\n", 204 | "ax2.set_xlabel('Iteration')\n", 205 | "ax2.set_title('Value Function Loss')\n", 206 | "plt.show()" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "id": "9e494fbe", 212 | "metadata": {}, 213 | "source": [ 214 | "# Derivative Free Optimization" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "id": "44bb7398", 221 | "metadata": {}, 222 | "outputs": [ 223 | { 224 | "ename": "AssertionError", 225 | "evalue": "", 226 | "output_type": "error", 227 | "traceback": [ 228 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 229 | "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", 230 | "\u001b[1;32m/home/christian/GitHub/or-gym/examples/how-to-use-rl-to-improve-your-supply-chain.ipynb Cell 6\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2\u001b[0m high_ \u001b[39m=\u001b[39m np\u001b[39m.\u001b[39mmax(env\u001b[39m.\u001b[39maction_space\u001b[39m.\u001b[39mhigh)\n\u001b[1;32m 4\u001b[0m \u001b[39massert\u001b[39;00m np\u001b[39m.\u001b[39mall(env\u001b[39m.\u001b[39maction_space\u001b[39m.\u001b[39mlow \u001b[39m==\u001b[39m low_)\n\u001b[0;32m----> 5\u001b[0m \u001b[39massert\u001b[39;00m np\u001b[39m.\u001b[39mall(env\u001b[39m.\u001b[39maction_space\u001b[39m.\u001b[39mhigh \u001b[39m==\u001b[39m high_)\n", 231 | "\u001b[0;31mAssertionError\u001b[0m: " 232 | ] 233 | } 234 | ], 235 | "source": [ 236 | "from scipy.optimize import minimize" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "id": "741bbd80", 243 | "metadata": {}, 244 | "outputs": [ 245 | { 246 | "data": { 247 | 
"text/plain": [ 248 | "(100, array([100, 90, 80], dtype=int16), array([0, 0, 0], dtype=int16))" 249 | ] 250 | }, 251 | "execution_count": 9, 252 | "metadata": {}, 253 | "output_type": "execute_result" 254 | } 255 | ], 256 | "source": [ 257 | "def base_stock_policy(policy, env):\n", 258 | " '''\n", 259 | " Implements a re-order up-to policy. This means that for\n", 260 | " each node in the network, if the inventory at that node \n", 261 | " falls below the level denoted by the policy, we will \n", 262 | " re-order inventory to bring it to the policy level.\n", 263 | " \n", 264 | " For example, policy at a node is 10, current inventory\n", 265 | " is 5: the action is to order 5 units.\n", 266 | " '''\n", 267 | " assert len(policy) == len(env.init_inv), (\n", 268 | " 'Policy should match number of nodes in network' + \n", 269 | " '({}, {}).'.format(len(policy), len(env.init_inv)))\n", 270 | " \n", 271 | " # Get echelon inventory levels\n", 272 | " if env.period == 0:\n", 273 | " inv_ech = np.cumsum(env.I[env.period] +\n", 274 | " env.T[env.period])\n", 275 | " else:\n", 276 | " inv_ech = np.cumsum(env.I[env.period] +\n", 277 | " env.T[env.period] - env.B[env.period-1, :-1])\n", 278 | " \n", 279 | " # Get unconstrained actions\n", 280 | " unc_actions = policy - inv_ech\n", 281 | " unc_actions = np.where(unc_actions>0, unc_actions, 0)\n", 282 | " \n", 283 | " # Ensure that actions can be fulfilled by checking \n", 284 | " # constraints\n", 285 | " inv_const = np.hstack([env.I[env.period, 1:], np.Inf])\n", 286 | " actions = np.minimum(env.c, np.minimum(unc_actions, inv_const))\n", 287 | " return actions\n", 288 | "\n", 289 | "def dfo_func(policy, env, *args):\n", 290 | " '''\n", 291 | " Runs an episode based on current base-stock model \n", 292 | " settings. 
This allows us to use our environment for the \n", 293 | " DFO optimizer.\n", 294 | " '''\n", 295 | " env.reset() # Ensure env is fresh\n", 296 | " rewards = []\n", 297 | " done = False\n", 298 | " while not done:\n", 299 | " action = base_stock_policy(policy, env)\n", 300 | " state, reward, done, _ = env.step(action)\n", 301 | " rewards.append(reward)\n", 302 | " if done:\n", 303 | " break\n", 304 | " \n", 305 | " rewards = np.array(rewards)\n", 306 | " prob = env.demand_dist.pmf(env.D, **env.dist_param)\n", 307 | " \n", 308 | " # Return negative of expected profit\n", 309 | " return -1 / env.num_periods * np.sum(prob * rewards)\n", 310 | " \n", 311 | "def optimize_inventory_policy(env_name, fun,\n", 312 | " init_policy=None, env_config={}, method='Powell'):\n", 313 | " \n", 314 | " env = or_gym.make(env_name, env_config=env_config)\n", 315 | " \n", 316 | " if init_policy is None:\n", 317 | " init_policy = np.ones(env.num_stages-1)\n", 318 | " \n", 319 | " # Optimize policy\n", 320 | " out = minimize(fun=fun, x0=init_policy, args=env, \n", 321 | " method=method)\n", 322 | " policy = out.x.copy()\n", 323 | " \n", 324 | " # Policy must be positive integer\n", 325 | " policy = np.round(np.maximum(policy, 0), 0).astype(int)\n", 326 | " \n", 327 | " return policy, out" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": null, 333 | "id": "3e11da7d", 334 | "metadata": {}, 335 | "outputs": [ 336 | { 337 | "data": { 338 | "text/plain": [ 339 | "Box(3,)" 340 | ] 341 | }, 342 | "execution_count": 8, 343 | "metadata": {}, 344 | "output_type": "execute_result" 345 | } 346 | ], 347 | "source": [ 348 | "policy, out = optimize_inventory_policy('InvManagement-v1',\n", 349 | " dfo_func)\n", 350 | "print(\"Re-order levels: {}\".format(policy))\n", 351 | "print(\"DFO Info:\\n{}\".format(out))\n", 352 | "\n", 353 | "env = or_gym.make(env_name, env_config=env_config)\n", 354 | "eps = 1000\n", 355 | "rewards = []\n", 356 | "for i in range(eps):\n", 357 | " env.reset()\n", 358 | " reward = 0\n", 359 | " while True:\n", 360 | " action = base_stock_policy(policy, env)\n", 361 | " s, r, done, _ = env.step(action)\n", 362 | " reward += r\n", 363 | " if done:\n", 364 | " rewards.append(reward)\n", 365 | " break" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": null, 371 | "id": "def5147b", 372 | "metadata": {}, 373 | "outputs": [], 374 | "source": [] 375 | } 376 | ], 377 | "metadata": { 378 | "kernelspec": { 379 | "display_name": "Python 3.8.0 ('or-gym-dev')", 380 | "language": "python", 381 | "name": "python3" 382 | }, 383 | "language_info": { 384 | "codemirror_mode": { 385 | "name": "ipython", 386 | "version": 3 387 | }, 388 | "file_extension": ".py", 389 | "mimetype": "text/x-python", 390 | "name": "python", 391 | "nbconvert_exporter": "python", 392 | "pygments_lexer": "ipython3", 393 | "version": "3.8.13" 394 | }, 395 | "vscode": { 396 | "interpreter": { 397 | "hash": "bc8a2230aa8b659650bd48bf6a546b4d453aa64d7078ee0770a23a54a48157c8" 398 | } 399 | } 400 | }, 401 | "nbformat": 4, 402 | "nbformat_minor": 5 403 | } 404 | -------------------------------------------------------------------------------- /examples/how-to-use-rl-to-improve-your-supply-chain.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "937597e4", 6 | "metadata": {}, 7 | "source": [ 8 | "# How to Use Deep Reinforcement Learning to Improve your Supply Chain\n", 9 | "\n", 10 | "Full write up 
available [here](https://www.datahubbs.com/how-to-use-deep-reinforcement-learning-to-improve-your-supply-chain/).\n", 11 | "\n", 12 | "Note Ray is not a dependency of OR-Gym. We want OR-Gym to be able to stand independently of other RL libraries as much as possible.\n", 13 | "\n", 14 | "There have been breaking changes that have been introduced in later version of Ray which affect this environment in particular. To ensure no conflicts, please run:\n", 15 | "- `pip install ray==1.0.0`\n", 16 | "- `pip install ray[rllib]`\n", 17 | "- `pip install ray[tune]`\n", 18 | "- `pip install tensorflow==2.3.0`\n", 19 | "\n" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 1, 25 | "id": "fefefc51", 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "name": "stdout", 30 | "output_type": "stream", 31 | "text": [ 32 | "WARNING:tensorflow:From /home/christian/anaconda3/envs/or-gym-dev/lib/python3.8/site-packages/tensorflow/python/compat/v2_compat.py:96: disable_resource_variables (from tensorflow.python.ops.variable_scope) is deprecated and will be removed in a future version.\n", 33 | "Instructions for updating:\n", 34 | "non-resource variables are not supported in the long term\n" 35 | ] 36 | } 37 | ], 38 | "source": [ 39 | "import or_gym\n", 40 | "from or_gym.utils import create_env\n", 41 | "import ray\n", 42 | "from ray.rllib.agents.ppo import PPOTrainer\n", 43 | "from ray import tune\n", 44 | "import numpy as np\n", 45 | "import matplotlib.pyplot as plt\n", 46 | "from matplotlib import gridspec" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 2, 52 | "id": "40fa580e", 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "def register_env(env_name, env_config={}):\n", 57 | " env = create_env(env_name)\n", 58 | " tune.register_env(env_name, \n", 59 | " lambda env_name: env(env_name,\n", 60 | " env_config=env_config))\n", 61 | "\n", 62 | "# Environment and RL Configuration Settings\n", 63 | "env_name = 'InvManagement-v1'\n", 64 | "# env_name = \"Knapsack-v0\"\n", 65 | "env_config = {} # Change environment parameters here\n", 66 | "rl_config = dict(\n", 67 | " env=env_name,\n", 68 | " num_workers=2,\n", 69 | " env_config=env_config,\n", 70 | " model=dict(\n", 71 | " vf_share_layers=False,\n", 72 | " fcnet_activation='elu',\n", 73 | " fcnet_hiddens=[256, 256]\n", 74 | " ),\n", 75 | " lr=1e-5\n", 76 | ")\n", 77 | " \n", 78 | "# Register environment\n", 79 | "register_env(env_name, env_config)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 3, 85 | "id": "ea13304f", 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "name": "stderr", 90 | "output_type": "stream", 91 | "text": [ 92 | "2022-09-02 10:53:41,358\tINFO services.py:1164 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265\u001b[39m\u001b[22m\n", 93 | "2022-09-02 10:53:44,394\tINFO trainer.py:591 -- Tip: set framework=tfe or the --eager flag to enable TensorFlow eager execution\n", 94 | "2022-09-02 10:53:44,398\tINFO trainer.py:616 -- Current log_level is WARN. 
For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.\n", 95 | "\u001b[2m\u001b[36m(pid=9662)\u001b[0m WARNING:tensorflow:From /home/christian/anaconda3/envs/or-gym-dev/lib/python3.8/site-packages/tensorflow/python/compat/v2_compat.py:96: disable_resource_variables (from tensorflow.python.ops.variable_scope) is deprecated and will be removed in a future version.\n", 96 | "\u001b[2m\u001b[36m(pid=9662)\u001b[0m Instructions for updating:\n", 97 | "\u001b[2m\u001b[36m(pid=9662)\u001b[0m non-resource variables are not supported in the long term\n", 98 | "\u001b[2m\u001b[36m(pid=9660)\u001b[0m WARNING:tensorflow:From /home/christian/anaconda3/envs/or-gym-dev/lib/python3.8/site-packages/tensorflow/python/compat/v2_compat.py:96: disable_resource_variables (from tensorflow.python.ops.variable_scope) is deprecated and will be removed in a future version.\n", 99 | "\u001b[2m\u001b[36m(pid=9660)\u001b[0m Instructions for updating:\n", 100 | "\u001b[2m\u001b[36m(pid=9660)\u001b[0m non-resource variables are not supported in the long term\n", 101 | "2022-09-02 10:54:04,675\tINFO trainable.py:252 -- Trainable.setup took 20.284 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.\n", 102 | "2022-09-02 10:54:04,677\tWARNING util.py:39 -- Install gputil for GPU system monitoring.\n" 103 | ] 104 | }, 105 | { 106 | "name": "stdout", 107 | "output_type": "stream", 108 | "text": [ 109 | "WARNING:tensorflow:From /home/christian/anaconda3/envs/or-gym-dev/lib/python3.8/site-packages/ray/rllib/policy/tf_policy.py:872: Variable.load (from tensorflow.python.ops.variables) is deprecated and will be removed in a future version.\n", 110 | "Instructions for updating:\n", 111 | "Prefer Variable.assign which has equivalent behavior in 2.X.\n" 112 | ] 113 | }, 114 | { 115 | "name": "stderr", 116 | "output_type": "stream", 117 | "text": [ 118 | "\u001b[2m\u001b[36m(pid=9662)\u001b[0m WARNING:tensorflow:From /home/christian/anaconda3/envs/or-gym-dev/lib/python3.8/site-packages/ray/rllib/policy/tf_policy.py:872: Variable.load (from tensorflow.python.ops.variables) is deprecated and will be removed in a future version.\n", 119 | "\u001b[2m\u001b[36m(pid=9662)\u001b[0m Instructions for updating:\n", 120 | "\u001b[2m\u001b[36m(pid=9662)\u001b[0m Prefer Variable.assign which has equivalent behavior in 2.X.\n", 121 | "\u001b[2m\u001b[36m(pid=9660)\u001b[0m WARNING:tensorflow:From /home/christian/anaconda3/envs/or-gym-dev/lib/python3.8/site-packages/ray/rllib/policy/tf_policy.py:872: Variable.load (from tensorflow.python.ops.variables) is deprecated and will be removed in a future version.\n", 122 | "\u001b[2m\u001b[36m(pid=9660)\u001b[0m Instructions for updating:\n", 123 | "\u001b[2m\u001b[36m(pid=9660)\u001b[0m Prefer Variable.assign which has equivalent behavior in 2.X.\n" 124 | ] 125 | }, 126 | { 127 | "name": "stdout", 128 | "output_type": "stream", 129 | "text": [ 130 | "Iter: 155\tReward: 339.55" 131 | ] 132 | } 133 | ], 134 | "source": [ 135 | "# Initialize Ray and Build Agent\n", 136 | "ray.init(ignore_reinit_error=True)\n", 137 | "agent = PPOTrainer(env=env_name,\n", 138 | " config=rl_config)\n", 139 | " \n", 140 | "results = []\n", 141 | "for i in range(500):\n", 142 | " res = agent.train()\n", 143 | " results.append(res)\n", 144 | " if (i+1) % 5 == 0:\n", 145 | " print('\\rIter: {}\\tReward: {:.2f}'.format(\n", 146 | " i+1, res['episode_reward_mean']), end='')\n", 147 | "ray.shutdown()" 148 | ] 149 | }, 150 | { 151 | 
"cell_type": "code", 152 | "execution_count": null, 153 | "id": "793e41cf", 154 | "metadata": {}, 155 | "outputs": [ 156 | { 157 | "data": { 158 | "text/plain": [ 159 | "Box(3,)" 160 | ] 161 | }, 162 | "execution_count": 5, 163 | "metadata": {}, 164 | "output_type": "execute_result" 165 | } 166 | ], 167 | "source": [ 168 | "# Unpack values from each iteration\n", 169 | "rewards = np.hstack([i['hist_stats']['episode_reward'] \n", 170 | " for i in results])\n", 171 | "pol_loss = [\n", 172 | " i['info']['learner']['default_policy']['policy_loss'] \n", 173 | " for i in results]\n", 174 | "vf_loss = [\n", 175 | " i['info']['learner']['default_policy']['vf_loss'] \n", 176 | " for i in results]\n", 177 | "p = 100\n", 178 | "mean_rewards = np.array([np.mean(rewards[i-p:i+1]) \n", 179 | " if i >= p else np.mean(rewards[:i+1]) \n", 180 | " for i, _ in enumerate(rewards)])\n", 181 | "std_rewards = np.array([np.std(rewards[i-p:i+1])\n", 182 | " if i >= p else np.std(rewards[:i+1])\n", 183 | " for i, _ in enumerate(rewards)])\n", 184 | "fig = plt.figure(constrained_layout=True, figsize=(20, 10))\n", 185 | "gs = fig.add_gridspec(2, 4)\n", 186 | "ax0 = fig.add_subplot(gs[:, :-2])\n", 187 | "ax0.fill_between(np.arange(len(mean_rewards)), \n", 188 | " mean_rewards - std_rewards, \n", 189 | " mean_rewards + std_rewards, \n", 190 | " label='Standard Deviation', alpha=0.3)\n", 191 | "ax0.plot(mean_rewards, label='Mean Rewards')\n", 192 | "ax0.set_ylabel('Rewards')\n", 193 | "ax0.set_xlabel('Episode')\n", 194 | "ax0.set_title('Training Rewards')\n", 195 | "ax0.legend()\n", 196 | "ax1 = fig.add_subplot(gs[0, 2:])\n", 197 | "ax1.plot(pol_loss)\n", 198 | "ax1.set_ylabel('Loss')\n", 199 | "ax1.set_xlabel('Iteration')\n", 200 | "ax1.set_title('Policy Loss')\n", 201 | "ax2 = fig.add_subplot(gs[1, 2:])\n", 202 | "ax2.plot(vf_loss)\n", 203 | "ax2.set_ylabel('Loss')\n", 204 | "ax2.set_xlabel('Iteration')\n", 205 | "ax2.set_title('Value Function Loss')\n", 206 | "plt.show()" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "id": "9e494fbe", 212 | "metadata": {}, 213 | "source": [ 214 | "# Derivative Free Optimization" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "id": "44bb7398", 221 | "metadata": {}, 222 | "outputs": [ 223 | { 224 | "ename": "AssertionError", 225 | "evalue": "", 226 | "output_type": "error", 227 | "traceback": [ 228 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 229 | "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", 230 | "\u001b[1;32m/home/christian/GitHub/or-gym/examples/how-to-use-rl-to-improve-your-supply-chain.ipynb Cell 6\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2\u001b[0m high_ \u001b[39m=\u001b[39m np\u001b[39m.\u001b[39mmax(env\u001b[39m.\u001b[39maction_space\u001b[39m.\u001b[39mhigh)\n\u001b[1;32m 4\u001b[0m \u001b[39massert\u001b[39;00m np\u001b[39m.\u001b[39mall(env\u001b[39m.\u001b[39maction_space\u001b[39m.\u001b[39mlow \u001b[39m==\u001b[39m low_)\n\u001b[0;32m----> 5\u001b[0m \u001b[39massert\u001b[39;00m np\u001b[39m.\u001b[39mall(env\u001b[39m.\u001b[39maction_space\u001b[39m.\u001b[39mhigh \u001b[39m==\u001b[39m high_)\n", 231 | "\u001b[0;31mAssertionError\u001b[0m: " 232 | ] 233 | } 234 | ], 235 | "source": [ 236 | "from scipy.optimize import minimize" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "id": "741bbd80", 243 | "metadata": {}, 244 | "outputs": [ 245 | { 246 | "data": { 247 | 
"text/plain": [ 248 | "(100, array([100, 90, 80], dtype=int16), array([0, 0, 0], dtype=int16))" 249 | ] 250 | }, 251 | "execution_count": 9, 252 | "metadata": {}, 253 | "output_type": "execute_result" 254 | } 255 | ], 256 | "source": [ 257 | "def base_stock_policy(policy, env):\n", 258 | " '''\n", 259 | " Implements a re-order up-to policy. This means that for\n", 260 | " each node in the network, if the inventory at that node \n", 261 | " falls below the level denoted by the policy, we will \n", 262 | " re-order inventory to bring it to the policy level.\n", 263 | " \n", 264 | " For example, policy at a node is 10, current inventory\n", 265 | " is 5: the action is to order 5 units.\n", 266 | " '''\n", 267 | " assert len(policy) == len(env.init_inv), (\n", 268 | " 'Policy should match number of nodes in network' + \n", 269 | " '({}, {}).'.format(len(policy), len(env.init_inv)))\n", 270 | " \n", 271 | " # Get echelon inventory levels\n", 272 | " if env.period == 0:\n", 273 | " inv_ech = np.cumsum(env.I[env.period] +\n", 274 | " env.T[env.period])\n", 275 | " else:\n", 276 | " inv_ech = np.cumsum(env.I[env.period] +\n", 277 | " env.T[env.period] - env.B[env.period-1, :-1])\n", 278 | " \n", 279 | " # Get unconstrained actions\n", 280 | " unc_actions = policy - inv_ech\n", 281 | " unc_actions = np.where(unc_actions>0, unc_actions, 0)\n", 282 | " \n", 283 | " # Ensure that actions can be fulfilled by checking \n", 284 | " # constraints\n", 285 | " inv_const = np.hstack([env.I[env.period, 1:], np.Inf])\n", 286 | " actions = np.minimum(env.c, np.minimum(unc_actions, inv_const))\n", 287 | " return actions\n", 288 | "\n", 289 | "def dfo_func(policy, env, *args):\n", 290 | " '''\n", 291 | " Runs an episode based on current base-stock model \n", 292 | " settings. 
This allows us to use our environment for the \n", 293 | " DFO optimizer.\n", 294 | " '''\n", 295 | " env.reset() # Ensure env is fresh\n", 296 | " rewards = []\n", 297 | " done = False\n", 298 | " while not done:\n", 299 | " action = base_stock_policy(policy, env)\n", 300 | " state, reward, done, _ = env.step(action)\n", 301 | " rewards.append(reward)\n", 302 | " if done:\n", 303 | " break\n", 304 | " \n", 305 | " rewards = np.array(rewards)\n", 306 | " prob = env.demand_dist.pmf(env.D, **env.dist_param)\n", 307 | " \n", 308 | " # Return negative of expected profit\n", 309 | " return -1 / env.num_periods * np.sum(prob * rewards)\n", 310 | " \n", 311 | "def optimize_inventory_policy(env_name, fun,\n", 312 | " init_policy=None, env_config={}, method='Powell'):\n", 313 | " \n", 314 | " env = or_gym.make(env_name, env_config=env_config)\n", 315 | " \n", 316 | " if init_policy is None:\n", 317 | " init_policy = np.ones(env.num_stages-1)\n", 318 | " \n", 319 | " # Optimize policy\n", 320 | " out = minimize(fun=fun, x0=init_policy, args=env, \n", 321 | " method=method)\n", 322 | " policy = out.x.copy()\n", 323 | " \n", 324 | " # Policy must be positive integer\n", 325 | " policy = np.round(np.maximum(policy, 0), 0).astype(int)\n", 326 | " \n", 327 | " return policy, out" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": null, 333 | "id": "3e11da7d", 334 | "metadata": {}, 335 | "outputs": [ 336 | { 337 | "data": { 338 | "text/plain": [ 339 | "Box(3,)" 340 | ] 341 | }, 342 | "execution_count": 8, 343 | "metadata": {}, 344 | "output_type": "execute_result" 345 | } 346 | ], 347 | "source": [ 348 | "policy, out = optimize_inventory_policy('InvManagement-v1',\n", 349 | " dfo_func)\n", 350 | "print(\"Re-order levels: {}\".format(policy))\n", 351 | "print(\"DFO Info:\\n{}\".format(out))\n", 352 | "\n", 353 | "env = or_gym.make(env_name, env_config=env_config)\n", 354 | "eps = 1000\n", 355 | "rewards = []\n", 356 | "for i in range(eps):\n", 357 | " env.reset()\n", 358 | " reward = 0\n", 359 | " while True:\n", 360 | " action = base_stock_policy(policy, env)\n", 361 | " s, r, done, _ = env.step(action)\n", 362 | " reward += r\n", 363 | " if done:\n", 364 | " rewards.append(reward)\n", 365 | " break" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": null, 371 | "id": "def5147b", 372 | "metadata": {}, 373 | "outputs": [], 374 | "source": [] 375 | } 376 | ], 377 | "metadata": { 378 | "kernelspec": { 379 | "display_name": "Python 3.8.0 ('or-gym-dev')", 380 | "language": "python", 381 | "name": "python3" 382 | }, 383 | "language_info": { 384 | "codemirror_mode": { 385 | "name": "ipython", 386 | "version": 3 387 | }, 388 | "file_extension": ".py", 389 | "mimetype": "text/x-python", 390 | "name": "python", 391 | "nbconvert_exporter": "python", 392 | "pygments_lexer": "ipython3", 393 | "version": "3.8.13" 394 | }, 395 | "vscode": { 396 | "interpreter": { 397 | "hash": "bc8a2230aa8b659650bd48bf6a546b4d453aa64d7078ee0770a23a54a48157c8" 398 | } 399 | } 400 | }, 401 | "nbformat": 4, 402 | "nbformat_minor": 5 403 | } 404 | -------------------------------------------------------------------------------- /or_gym/envs/classic_or/vehicle_routing.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Example taken from Balaji et al. 
3 | Paper: https://arxiv.org/abs/1911.10641
4 | GitHub: https://github.com/awslabs/or-rl-benchmarks
5 | '''
6 | import gym
7 | from gym import spaces
8 | import or_gym
9 | from or_gym.utils import assign_env_config
10 | import random
11 | import numpy as np
12 | from scipy.stats import truncnorm
13 | 
14 | 
15 | class VehicleRoutingEnv(gym.Env):
16 |     '''
17 |     Dynamic Vehicle Routing Problem
18 | 
19 |     This environment simulates a driver working with a food delivery app
20 |     to move through a city, accept orders, pick them up from restaurants,
21 |     and deliver them to waiting customers. Each order has a specific
22 |     delivery value, restaurant, and delivery location, all of which are
23 |     known by the driver before he accepts the order. After accepting, the
24 |     driver must navigate to the restaurant to collect the order and then
25 |     deliver it. If an order isn't accepted, it may be taken by another
26 |     driver. Additionally, the driver has 60 minutes to make a delivery
27 |     from the time an order is created.
28 |     The city is represented as a grid with different zones that have
29 |     different statistics for order creation and value. At each time step,
30 |     new orders are created with a fixed probability unique to each zone.
31 |     The driver's vehicle also has a finite capacity limiting the number of
32 |     orders he can carry at a given time, although there is no limit on the
33 |     number of accepted orders.
34 |     The driver receives a penalty for time and distance spent during travel,
35 |     but receives rewards for accepting and delivering orders.
36 | 
37 |     Observation:
38 |         Type: Box
39 |         State Vector: S = (p, h, c, l, w, e, v)
40 |         p = pickup location
41 |         h = driver's current position
42 |         c = remaining vehicle capacity
43 |         l = order location
44 |         w = order status (open, accepted, picked up, delivered/inactive)
45 |         e = time elapsed since order generation
46 |         v = order value
47 | 
48 |     Action:
49 |         Type: Discrete
50 |         0 = wait
51 |         1:max_orders = accept order
52 |         max_orders:2*max_orders = pickup order
53 |         2*max_orders:3*max_orders = deliver order
54 |         3*max_orders:3*max_orders + n_restaurants = go to restaurant
55 | 
56 |     Action masking is available for this environment. Set mask=True
57 |     in the env_config dictionary.
58 | 
59 |     Reward:
60 |         The agent receives 1/3 of the order value for accepting an order,
61 |         picking it up, and delivering the order. The cost comprises
62 |         three elements: delivery time, delivery distance, and cost of failure
63 |         (if the driver does not deliver the item).
64 | 
65 |     Starting State:
66 |         Restaurant and driver locations are randomized at the start of each
67 |         episode. New orders are generated according to the order probability.
68 | 
69 |     Episode Termination:
70 |         Episode termination occurs when the total time has elapsed.
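    Example:
        An illustrative usage sketch (assumes the environment is registered
        under the id 'VehicleRouting-v0' and that parameters are passed
        through env_config, as done elsewhere in this package):

            import or_gym
            env = or_gym.make('VehicleRouting-v0', env_config={'mask': True})
            obs = env.reset()
            # With mask=True the observation is a dict with 'state',
            # 'action_mask', and 'avail_actions' entries.
            obs, reward, done, info = env.step(env.action_space.sample())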
71 | ''' 72 | 73 | def __init__(self, *args, **kwargs): 74 | self.n_restaurants = 2 75 | self.max_orders = 10 76 | self.order_prob = 0.5 77 | self.vehicle_capacity = 4 78 | self.grid = (5, 5) 79 | self.order_promise = 60 80 | self.order_timeout_prob = 0.15 81 | self.num_zones = 4 82 | self.order_probs_per_zone = [0.1, 0.5, 0.3, 0.1] 83 | self.order_reward_min = [8, 5, 2, 1] 84 | self.order_reward_max = [12, 8, 5, 3] 85 | self.half_norm_scale_reward_per_zone = [0.5, 0.5, 0.5, 0.5] 86 | self.penalty_per_timestep = 0.1 87 | self.penalty_per_move = 0.1 88 | self.order_miss_penalty = 50 89 | self.step_limit = 1000 90 | self.mask = False 91 | self.info = {} 92 | 93 | assign_env_config(self, kwargs) 94 | self._order_nums = np.arange(self.max_orders) 95 | self.loc_permutations = [(x, y) for x in range(self.grid[0]) 96 | for y in range(self.grid[1])] 97 | self.action_dim = 1 + 3 * self.max_orders + self.n_restaurants 98 | self.obs_dim = 2 * self.n_restaurants + 4 + 6 * self.max_orders 99 | box_low = np.zeros(self.obs_dim) 100 | box_high = np.hstack([ 101 | np.repeat( 102 | max(self.grid), 2 * self.n_restaurants + 2), # Locations 0-5 103 | np.repeat(self.vehicle_capacity, 2), # Vehicle capacities 6-7 104 | np.tile(np.hstack([4, self.n_restaurants, self.grid, 105 | self.order_promise, max(self.order_reward_max)]), self.max_orders) 106 | ]) 107 | 108 | if self.mask: 109 | self.observation_space = spaces.Dict({ 110 | 'action_mask': spaces.Box( 111 | low=np.zeros(self.action_dim), 112 | high=np.ones(self.action_dim), 113 | dtype=np.uint8), 114 | 'avail_actions': spaces.Box( 115 | low=np.zeros(self.action_dim), 116 | high=np.ones(self.action_dim), 117 | dtype=np.uint8), 118 | 'state': spaces.Box( 119 | low=box_low, 120 | high=box_high, 121 | dtype=np.float16) 122 | }) 123 | else: 124 | self.observation_space = spaces.Box( 125 | low=box_low, 126 | high=box_high, 127 | dtype=np.float16) 128 | 129 | self.action_space = spaces.Discrete(self.action_dim) 130 | 131 | self.reset() 132 | 133 | def _STEP(self, action): 134 | done = False 135 | self.reward = 0 136 | self.late_penalty = 0 137 | 138 | if action == 0: 139 | self.wait(action) 140 | elif action <= self.max_orders: 141 | self.accept_order(action) 142 | elif action <= 2 * self.max_orders: 143 | self.pickup_order(action) 144 | elif action <= 3 * self.max_orders: 145 | self.deliver_order(action) 146 | elif action <= 3 * self.max_orders + self.n_restaurants: 147 | self.return_to_restaurant(action) 148 | else: 149 | raise Exception( 150 | f"Selected action ({action}) outside of action space.") 151 | 152 | self.state = self._update_state() 153 | 154 | self.step_count += 1 155 | if self.step_count >= self.step_limit: 156 | done = True 157 | 158 | return self.state, self.reward, done, self.info 159 | 160 | def wait(self, action): 161 | # Do nothing 162 | pass 163 | 164 | def accept_order(self, action): 165 | # Accept order denoted by action 166 | order_idx = action - 1 167 | if order_idx not in self.order_dict.keys(): 168 | # Invalid action, do nothing 169 | pass 170 | elif self.order_dict[order_idx]['Status'] == 1: 171 | self.order_dict[order_idx]['Status'] = 2 172 | self.reward += self.order_dict[order_idx]['Value'] / 3 173 | 174 | def pickup_order(self, action): 175 | order_idx = action - self.max_orders - 1 176 | if order_idx not in self.order_dict.keys(): 177 | # Invalid action, do nothing 178 | pass 179 | else: 180 | restaurant = self.order_dict[order_idx]['RestaurantID'] 181 | restaurant_loc = self.restaurant_loc[restaurant] 182 | 
self._go_to_destination(restaurant_loc)
183 |             self.reward -= self.penalty_per_move
184 |             # Movement and pickup can occur during same time step
185 |             if self.order_dict[order_idx]['Status'] == 2 and self.driver_loc[0] == restaurant_loc[0] and self.driver_loc[1] == restaurant_loc[1]:
186 |                 if self.vehicle_load < self.vehicle_capacity:
187 |                     self.order_dict[order_idx]['Status'] = 3
188 |                     self.vehicle_load += 1
189 |                     self.reward += self.order_dict[order_idx]['Value'] / 3
190 | 
191 |     def deliver_order(self, action):
192 |         order_idx = action - 2 * self.max_orders - 1
193 |         if order_idx not in self.order_dict.keys():
194 |             # Invalid action, do nothing
195 |             pass
196 |         else:
197 |             order_loc = self.order_dict[order_idx]['DeliveryLoc']
198 |             self._go_to_destination(order_loc)
199 |             self.reward -= self.penalty_per_move
200 |             # Can deliver multiple orders simultaneously
201 |             for k, v in self.order_dict.items():
202 |                 if v['Status'] == 3 and v['DeliveryLoc'][0] == self.driver_loc[0] and v['DeliveryLoc'][1] == self.driver_loc[1]:
203 |                     if v['Time'] <= self.order_promise:
204 |                         self.reward += v['Value'] / 3  # accrue 1/3 of the value for each delivered order
205 |                         self.vehicle_load -= 1
206 |                         v['Status'] = 4 # Delivered
207 | 
208 |     def return_to_restaurant(self, action):
209 |         restaurant = action - 3 * self.max_orders - 1
210 |         restaurant_loc = self.restaurant_loc[restaurant]
211 |         self._go_to_destination(restaurant_loc)
212 |         self.reward -= self.penalty_per_move
213 | 
214 |     def _update_orders(self):
215 |         self._update_order_times()
216 |         self._remove_orders()
217 |         self._generate_orders()
218 | 
219 |     def _remove_orders(self):
220 |         # Remove orders if they're overdue
221 |         orders_to_delete = []
222 |         for k, v in self.order_dict.items():
223 |             if v['Time'] >= self.order_promise:
224 |                 if v['Status'] >= 2:
225 |                     # Apply penalty and remove associated rewards
226 |                     self.reward -= (self.order_miss_penalty +
227 |                         v['Value'] * (v['Status'] == 2)/3 +
228 |                         v['Value'] * (v['Status'] == 3) * 2/3)
229 |                     self.late_penalty += self.order_miss_penalty
230 |                     if v['Status'] == 3:
231 |                         self.vehicle_load -= 1  # expired order leaves the vehicle
232 |                 orders_to_delete.append(k)
233 | 
234 |             elif v['Status'] == 4:
235 |                 orders_to_delete.append(k)
236 | 
237 |             # Probabilistically remove open orders
238 |             elif v['Status'] == 1 and np.random.random() < self.order_timeout_prob:
239 |                 orders_to_delete.append(k)
240 | 
241 |         for k in orders_to_delete:
242 |             del self.order_dict[k]
243 | 
244 |     def _update_state(self):
245 |         self._update_orders()
246 |         # Placeholder for order data
247 |         order_array = np.zeros((self.max_orders, 6))
248 |         try:
249 |             order_data = np.hstack([v1 for v in self.order_dict.values()
250 |                 for v1 in v.values()]).reshape(-1, 7)
251 |             order_array[order_data[:, 0].astype(int)] += order_data[:, 1:]
252 |         except ValueError:
253 |             # Occurs when order_data is empty
254 |             pass
255 |         state = np.hstack([
256 |             np.hstack(self.restaurant_loc),
257 |             np.hstack(self.driver_loc),
258 |             np.hstack([self.vehicle_load, self.vehicle_capacity]),
259 |             order_array.flatten()
260 |             ], dtype=np.float16)
261 |         if self.mask:
262 |             action_mask = self._update_mask(state)
263 |             state = {
264 |                 'state': state,
265 |                 'action_mask': action_mask,
266 |                 'avail_actions': np.ones(self.action_dim, dtype=np.uint8)
267 |             }
268 |         return state
269 | 
270 |     def _update_mask(self, state):
271 |         action_mask = np.zeros(self.action_dim, dtype=np.uint8)
272 |         # Wait and return to restaurant are always allowed
273 |         action_mask[0] = 1
274 |         action_mask[(3 * self.max_orders + 1):(3 * self.max_orders + self.n_restaurants + 1)] = 1
275 | 
276 | 
for k, v in self.order_dict.items(): 277 | status = v['Status'] 278 | # Allow accepting an open order 279 | if status == 1: 280 | action_mask[k + 1] = 1 281 | # Allow navigating to accepted order for pickup 282 | elif status == 2 and self.vehicle_load < self.vehicle_capacity: 283 | action_mask[k + self.max_orders + 1] = 1 284 | # Allow delivery of picked up order 285 | elif status == 3: 286 | action_mask[k + 2 * self.max_orders + 1] = 1 287 | 288 | return action_mask 289 | 290 | def _RESET(self): 291 | self.step_count = 0 292 | self.vehicle_load = 0 293 | self.randomize_locations() 294 | self.zone_loc = self._get_zones() 295 | self.order_dict = {} 296 | self.state = self._update_state() 297 | return self.state 298 | 299 | def _update_order_times(self): 300 | for k, v in self.order_dict.items(): 301 | if v['Status'] >= 1: 302 | v['Time'] += 1 303 | 304 | def _generate_orders(self): 305 | open_slots = self._order_nums[~np.isin(self._order_nums, 306 | np.array([k for k in self.order_dict.keys()]))] 307 | try: 308 | order_num = open_slots.min() 309 | except ValueError: 310 | pass 311 | for n in open_slots: 312 | # Probabalistically create a new order 313 | if np.random.random() < self.order_prob: 314 | zone = np.random.choice( 315 | self.num_zones, p=self.order_probs_per_zone) 316 | order = self._get_order_from_zone(zone, order_num) 317 | self.order_dict[order_num] = order 318 | order_num += 1 319 | 320 | def _get_order_from_zone(self, zone, n): 321 | delivery_loc = random.choice(self.zone_loc[zone]) 322 | restaurant_idx = np.random.choice(self.n_restaurants) 323 | value = truncnorm.rvs(0, 324 | (self.order_reward_max[zone] - 325 | self.order_reward_min[zone]) 326 | / self.half_norm_scale_reward_per_zone[zone], 327 | self.order_reward_min[zone], 328 | self.half_norm_scale_reward_per_zone[zone]) 329 | return {'Number': n, 330 | 'Status': 1, 331 | 'RestaurantID': restaurant_idx, 332 | 'DeliveryLoc': delivery_loc, 333 | 'Time': 0, 334 | 'Value': value} 335 | 336 | def randomize_locations(self): 337 | self._place_restaurants() 338 | self._place_driver() 339 | 340 | def _place_restaurants(self): 341 | self.restaurant_loc = random.sample( 342 | self.loc_permutations, self.n_restaurants) 343 | 344 | def _place_driver(self): 345 | self.driver_loc = list(random.sample(self.loc_permutations, 1)[0]) 346 | 347 | def _move_driver(self, direction): 348 | if direction is None: 349 | return None 350 | # Receives direction from routing function 351 | if direction == 0: # Up 352 | self.driver_loc[1] += 1 353 | elif direction == 1: # Down 354 | self.driver_loc[1] -= 1 355 | elif direction == 2: # Right 356 | self.driver_loc[0] += 1 357 | elif direction == 3: # Left 358 | self.driver_loc[0] -= 1 359 | # Check boundaries 360 | if self.driver_loc[0] > self.grid[0]: 361 | self.driver_loc[0] = self.grid[0] 362 | if self.driver_loc[0] < 0: 363 | self.driver_loc[0] = 0 364 | if self.driver_loc[1] > self.grid[1]: 365 | self.driver_loc[1] = self.grid[1] 366 | if self.driver_loc[1] < 0: 367 | self.driver_loc[1] = 0 368 | 369 | def _go_to_destination(self, destination): 370 | # Automatically selects direction based on starting location and 371 | # destination. 
372 | # 0 -> Up; 1 -> Down; 2 -> Right; 3 -> Left 373 | x_diff = self.driver_loc[0] - destination[0] 374 | y_diff = self.driver_loc[1] - destination[1] 375 | if abs(x_diff) >= abs(y_diff): 376 | if x_diff > 0: 377 | direction = 3 378 | elif x_diff < 0: 379 | direction = 2 380 | elif abs(x_diff) == abs(y_diff): # 0 == 0 381 | # Do nothing 382 | direction = None 383 | else: 384 | if y_diff > 0: 385 | direction = 1 386 | elif y_diff < 0: 387 | direction = 0 388 | print('direction ',direction) 389 | self._move_driver(direction) 390 | 391 | def _get_num_spaces_per_zone(self): 392 | total_spaces = self.grid[0] * self.grid[1] 393 | spaces_per_zone = np.array([np.floor(total_spaces / self.num_zones) 394 | for i in range(self.num_zones)]) 395 | for i in range(total_spaces % self.num_zones): 396 | spaces_per_zone[i] += 1 397 | return spaces_per_zone.astype(int) 398 | 399 | def _get_zones(self): 400 | # Slices the grid into zones by row 401 | spaces_per_zone = self._get_num_spaces_per_zone() 402 | zones = {} 403 | for i, n in enumerate(spaces_per_zone): 404 | x = sum(spaces_per_zone[:i]) 405 | zones[i] = self.loc_permutations[x:x+n] 406 | 407 | zones = self._remove_restaurants_from_zone_locs(zones) 408 | return zones 409 | 410 | def _remove_restaurants_from_zone_locs(self, zones): 411 | for k, v in zones.items(): 412 | for r in self.restaurant_loc: 413 | try: 414 | loc_to_remove = v.index(r) 415 | del zones[k][loc_to_remove] 416 | except ValueError: 417 | pass 418 | return zones 419 | 420 | def step(self, action): 421 | return self._STEP(action) 422 | 423 | def reset(self): 424 | return self._RESET() 425 | -------------------------------------------------------------------------------- /or_gym/envs/supply_chain/inventory_management.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Multi-period inventory management 3 | Hector Perez, Christian Hubbs, Owais Sarwar 4 | 4/14/2020 5 | ''' 6 | 7 | import gym 8 | import itertools 9 | import numpy as np 10 | from scipy.stats import * 11 | from or_gym.utils import assign_env_config 12 | from collections import deque 13 | 14 | class InvManagementMasterEnv(gym.Env): 15 | ''' 16 | The supply chain environment is structured as follows: 17 | 18 | It is a multi-period multi-echelon production-inventory system for a single non-perishable product that is sold only 19 | in discrete quantities. Each stage in the supply chain consists of an inventory holding area and a production area. 20 | The exception are the first stage (retailer: only inventory area) and the last stage (raw material transformation 21 | plant: only production area, with unlimited raw material availability). The inventory holding area holds the inventory 22 | necessary to produce the material at that stage. One unit of inventory produces one unit of product at each stage. 23 | There are lead times between the transfer of material from one stage to the next. The outgoing material from stage i 24 | is the feed material for production at stage i-1. Stages are numbered in ascending order: Stages = {0, 1, ..., M} 25 | (i.e. m = 0 is the retailer). Production at each stage is bounded by the stage's production capacity and the available 26 | inventory. 27 | 28 | At the beginning of each time period, the following sequence of events occurs: 29 | 30 | 0) Stages 0 through M-1 place replenishment orders to their respective suppliers. Replenishment orders are filled 31 | according to available production capacity and available inventory at the respective suppliers. 
32 |     1) Stages 0 through M-1 receive incoming inventory replenishment shipments that have made it down the product pipeline
33 |         after the stage's respective lead time.
34 |     2) Customer demand occurs at stage 0 (retailer). It is sampled from a specified discrete probability distribution.
35 |     3) Demand is filled according to available inventory at stage 0.
36 |     4) Option: one of the following occurs,
37 |         a) Unfulfilled sales and replenishment orders are backlogged at a penalty.
38 |             Note: Backlogged sales take priority in the following period.
39 |         b) Unfulfilled sales and replenishment orders are lost with a goodwill loss penalty.
40 |     5) Surplus inventory is held at each stage at a holding cost.
41 | 
42 |     '''
43 |     def __init__(self, *args, **kwargs):
44 |         '''
45 |         periods = [positive integer] number of periods in simulation.
46 |         I0 = [non-negative integer; dimension |Stages|-1] initial inventories for each stage.
47 |         p = [positive float] unit price for final product.
48 |         r = [non-negative float; dimension |Stages|] unit cost for replenishment orders at each stage.
49 |         k = [non-negative float; dimension |Stages|] backlog cost or goodwill loss (per unit) for unfulfilled orders (demand or replenishment orders).
50 |         h = [non-negative float; dimension |Stages|-1] unit holding cost for excess on-hand inventory at each stage.
51 |             (Note: does not include pipeline inventory).
52 |         c = [positive integer; dimension |Stages|-1] production capacities for each supplier (stages 1 through |Stages|-1).
53 |         L = [non-negative integer; dimension |Stages|-1] lead times between stages.
54 |         backlog = [boolean] are unfulfilled orders backlogged? True = backlogged, False = lost sales.
55 |         dist = [integer] value between 1 and 5. Specifies distribution for customer demand.
56 |             1: poisson distribution
57 |             2: binomial distribution
58 |             3: uniform random integer
59 |             4: geometric distribution
60 |             5: user supplied demand values
61 |         dist_param = [dictionary] named values for parameters fed to statistical distribution.
62 |             poisson: {'mu': }
63 |             binom: {'n': , 'p': }
64 |             randint: {'low': , 'high': }
65 |             geom: {'p': }
66 |         alpha = [float in range (0,1]] discount factor to account for the time value of money
67 |         seed_int = [integer] seed for random state.
68 | user_D = [list] user specified demand for each time period in simulation 69 | ''' 70 | # set default (arbitrary) values when creating environment (if no args or kwargs are given) 71 | self.periods = 30 72 | self.I0 = [100, 100, 200] 73 | self.p = 2 74 | self.r = [1.5, 1.0, 0.75, 0.5] 75 | self.k = [0.10, 0.075, 0.05, 0.025] 76 | self.h = [0.15, 0.10, 0.05] 77 | self.c = [100, 90, 80] 78 | self.L = [3, 5, 10] 79 | self.backlog = True 80 | self.dist = 1 81 | self.dist_param = {'mu': 20} 82 | self.alpha = 0.97 83 | self.seed_int = 0 84 | self.user_D = np.zeros(self.periods) 85 | self._max_rewards = 2000 86 | 87 | # add environment configuration dictionary and keyword arguments 88 | assign_env_config(self, kwargs) 89 | 90 | # input parameters 91 | try: 92 | self.init_inv = np.array(list(self.I0)) 93 | except: 94 | self.init_inv = np.array([self.I0]) 95 | self.num_periods = self.periods 96 | self.unit_price = np.append(self.p,self.r[:-1]) # cost to stage 1 is price to stage 2 97 | self.unit_cost = np.array(self.r) 98 | self.demand_cost = np.array(self.k) 99 | self.holding_cost = np.append(self.h,0) # holding cost at last stage is 0 100 | try: 101 | self.supply_capacity = np.array(list(self.c)) 102 | except: 103 | self.supply_capacity = np.array([self.c]) 104 | try: 105 | self.lead_time = np.array(list(self.L)) 106 | except: 107 | self.lead_time = np.array([self.L]) 108 | self.discount = self.alpha 109 | self.user_D = np.array(list(self.user_D)) 110 | self.num_stages = len(self.init_inv) + 1 111 | m = self.num_stages 112 | lt_max = self.lead_time.max() 113 | 114 | # parameters 115 | # dictionary with options for demand distributions 116 | distributions = {1:poisson, 117 | 2:binom, 118 | 3:randint, 119 | 4:geom, 120 | 5:self.user_D} 121 | 122 | # check inputs 123 | assert np.all(self.init_inv) >=0, "The initial inventory cannot be negative" 124 | try: 125 | assert self.num_periods > 0, "The number of periods must be positive. Num Periods = {}".format(self.num_periods) 126 | except TypeError: 127 | print('\n{}\n'.format(self.num_periods)) 128 | assert np.all(self.unit_price >= 0), "The sales prices cannot be negative." 129 | assert np.all(self.unit_cost >= 0), "The procurement costs cannot be negative." 130 | assert np.all(self.demand_cost >= 0), "The unfulfilled demand costs cannot be negative." 131 | assert np.all(self.holding_cost >= 0), "The inventory holding costs cannot be negative." 132 | assert np.all(self.supply_capacity > 0), "The supply capacities must be positive." 133 | assert np.all(self.lead_time >= 0), "The lead times cannot be negative." 134 | assert (self.backlog == False) | (self.backlog == True), "The backlog parameter must be a boolean." 135 | assert m >= 2, "The minimum number of stages is 2. Please try again" 136 | assert len(self.unit_cost) == m, "The length of r is not equal to the number of stages." 137 | assert len(self.demand_cost) == m, "The length of k is not equal to the number of stages." 138 | assert len(self.holding_cost) == m, "The length of h is not equal to the number of stages - 1." 139 | assert len(self.supply_capacity) == m-1, "The length of c is not equal to the number of stages - 1." 140 | assert len(self.lead_time) == m-1, "The length of L is not equal to the number of stages - 1." 141 | assert self.dist in [1,2,3,4,5], "dist must be one of 1, 2, 3, 4, 5." 142 | if self.dist < 5: 143 | assert distributions[self.dist].cdf(0,**self.dist_param), "Wrong parameters given for distribution." 
144 | else: 145 | assert len(self.user_D) == self.num_periods, "The length of the user specified distribution is not equal to the number of periods." 146 | assert (self.alpha>0) & (self.alpha<=1), "alpha must be in the range (0,1]." 147 | 148 | # select distribution 149 | self.demand_dist = distributions[self.dist] 150 | 151 | # set random generation seed (unless using user demands) 152 | if self.dist < 5: 153 | self.seed(self.seed_int) 154 | 155 | # intialize 156 | self.reset() 157 | 158 | # action space (reorder quantities for each stage; list) 159 | # An action is defined for every stage (except last one) 160 | # self.action_space = gym.spaces.Tuple(tuple( 161 | # [gym.spaces.Box(0, i, shape=(1,)) for i in self.supply_capacity])) 162 | self.pipeline_length = (m-1)*(lt_max+1) 163 | self.action_space = gym.spaces.Box( 164 | low=np.zeros(m-1), high=self.supply_capacity, dtype=np.int16) 165 | # observation space (Inventory position at each echelon, which is any integer value) 166 | self.observation_space = gym.spaces.Box( 167 | low=-np.ones(self.pipeline_length)*self.supply_capacity.max()*self.num_periods*10, 168 | high=np.ones(self.pipeline_length)*self.supply_capacity.max()*self.num_periods, dtype=np.int32) 169 | 170 | # self.observation_space = gym.spaces.Box( 171 | # low=-np.ones(m-1)*self.supply_capacity.max()*self.num_periods*10, 172 | # high=self.supply_capacity*self.num_periods, dtype=np.int32) 173 | 174 | def seed(self,seed=None): 175 | ''' 176 | Set random number generation seed 177 | ''' 178 | # seed random state 179 | if seed != None: 180 | np.random.seed(seed=int(seed)) 181 | 182 | def _RESET(self): 183 | ''' 184 | Create and initialize all variables and containers. 185 | Nomenclature: 186 | I = On hand inventory at the start of each period at each stage (except last one). 187 | T = Pipeline inventory at the start of each period at each stage (except last one). 188 | R = Replenishment order placed at each period at each stage (except last one). 189 | D = Customer demand at each period (at the retailer) 190 | S = Sales performed at each period at each stage. 191 | B = Backlog at each period at each stage. 192 | LS = Lost sales at each period at each stage. 193 | P = Total profit at each stage. 
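        A brief usage sketch (illustrative only; assumes the default
        parameters set in __init__, e.g. lead times L = [3, 5, 10] and
        a 4-stage chain):

            env = InvManagementBacklogEnv()
            state = env.reset()
            # The state vector has length
            # (num_stages - 1) * (max lead time + 1) = 3 * 11 = 33.
            action = env.sample_action()
            state, reward, done, _ = env.step(action)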
194 | ''' 195 | periods = self.num_periods 196 | m = self.num_stages 197 | I0 = self.init_inv 198 | 199 | # simulation result lists 200 | self.I=np.zeros([periods + 1, m - 1]) # inventory at the beginning of each period (last stage not included since iventory is infinite) 201 | self.T=np.zeros([periods + 1, m - 1]) # pipeline inventory at the beginning of each period (no pipeline inventory for last stage) 202 | self.R=np.zeros([periods, m - 1]) # replenishment order (last stage places no replenishment orders) 203 | self.D=np.zeros(periods) # demand at retailer 204 | self.S=np.zeros([periods, m]) # units sold 205 | self.B=np.zeros([periods, m]) # backlog (includes top most production site in supply chain) 206 | self.LS=np.zeros([periods, m]) # lost sales 207 | self.P=np.zeros(periods) # profit 208 | 209 | # initializetion 210 | self.period = 0 # initialize time 211 | self.I[0,:]=np.array(I0) # initial inventory 212 | self.T[0,:]=np.zeros(m-1) # initial pipeline inventory 213 | self.action_log = np.zeros((periods, m-1), dtype=np.int32) 214 | # set state 215 | self._update_state() 216 | 217 | return self.state 218 | 219 | def _update_state(self): 220 | m = self.num_stages - 1 221 | t = self.period 222 | lt_max = self.lead_time.max() 223 | state = np.zeros(m*(lt_max + 1), dtype=np.int32) 224 | # state = np.zeros(m) 225 | if t == 0: 226 | state[:m] = self.I0 227 | else: 228 | state[:m] = self.I[t] 229 | 230 | if t == 0: 231 | pass 232 | elif t >= lt_max: 233 | state[-m*lt_max:] += self.action_log[t-lt_max:t].flatten() 234 | else: 235 | state[-m*(t):] += self.action_log[:t].flatten() 236 | 237 | self.state = state.copy() 238 | 239 | def _update_base_stock_policy_state(self): 240 | ''' 241 | Get current state of the system: Inventory position at each echelon 242 | Inventory at hand + Pipeline inventory - backlog up to the current stage 243 | (excludes last stage since no inventory there, nor replenishment orders placed there). 244 | ''' 245 | n = self.period 246 | m = self.num_stages 247 | if n>=1: 248 | IP = np.cumsum(self.I[n,:] + self.T[n,:] - self.B[n-1,:-1]) 249 | else: 250 | IP = np.cumsum(self.I[n,:] + self.T[n,:]) 251 | self.state = IP 252 | 253 | def _STEP(self,action): 254 | ''' 255 | Take a step in time in the multiperiod inventory management problem. 
256 | action = [integer; dimension |Stages|-1] number of units to request from suppliers (last stage makes no requests) 257 | ''' 258 | R = np.maximum(action, 0).astype(int) 259 | 260 | # get inventory at hand and pipeline inventory at beginning of the period 261 | n = self.period 262 | L = self.lead_time 263 | I = self.I[n,:].copy() # inventory at start of period n 264 | T = self.T[n,:].copy() # pipeline inventory at start of period n 265 | m = self.num_stages # number of stages 266 | 267 | # get production capacities 268 | c = self.supply_capacity # capacity 269 | self.action_log[n] = R.copy() 270 | # available inventory at the m+1 stage (note: last stage has unlimited supply) 271 | Im1 = np.append(I[1:], np.Inf) 272 | 273 | # place replenishment order 274 | if n>=1: # add backlogged replenishment orders to current request 275 | R = R + self.B[n-1,1:] 276 | Rcopy = R.copy() # copy original replenishment quantity 277 | R[R>=c] = c[R>=c] # enforce capacity constraint 278 | R[R>=Im1] = Im1[R>=Im1] # enforce available inventory constraint 279 | self.R[n,:] = R # store R[n] 280 | 281 | # receive inventory replenishment placed L periods ago 282 | RnL = np.zeros(m-1) # initialize 283 | for i in range(m-1): 284 | if n - L[i] >= 0: 285 | RnL[i] = self.R[n-L[i],i].copy() # replenishment placed at the end of period n-L-1 286 | I[i] = I[i] + RnL[i] 287 | 288 | # demand is realized 289 | if self.dist < 5: 290 | D0 = self.demand_dist.rvs(**self.dist_param) 291 | else: 292 | D0 = self.demand_dist[n] # user specified demand 293 | D = D0 # demand 294 | self.D[n] = D0 # store D[n] 295 | 296 | # add previous backlog to demand 297 | if n >= 1: 298 | D = D0 + self.B[n-1,0].copy() # add backlogs to demand 299 | 300 | # units sold 301 | S0 = min(I[0],D) # at retailer 302 | S = np.append(S0,R) # at each stage 303 | self.S[n,:] = S # store S[n] 304 | 305 | # update inventory on hand and pipeline inventory 306 | I = I - S[:-1] # updated inventory at all stages (exclude last stage) 307 | T = T - RnL + R # updated pipeline inventory at all stages (exclude last one) 308 | self.I[n+1,:] = I # store inventory available at start of period n + 1 (exclude last stage) 309 | self.T[n+1,:] = T # store pipeline inventory at start of period n + 1 310 | 311 | # unfulfilled orders 312 | U = np.append(D, Rcopy) - S # unfulfilled demand and replenishment orders 313 | 314 | # backlog and lost sales 315 | if self.backlog: 316 | B = U 317 | LS = np.zeros(m) 318 | else: 319 | LS = U # lost sales 320 | B = np.zeros(m) 321 | self.B[n,:] = B # store B[n] 322 | self.LS[n,:] = LS # store LS[n] 323 | 324 | # calculate profit 325 | p = self.unit_price 326 | r = self.unit_cost 327 | k = self.demand_cost 328 | h = self.holding_cost 329 | a = self.discount 330 | II = np.append(I,0) # augment inventory so that last has no onsite inventory 331 | RR = np.append(R,S[-1]) # augment replenishment orders to include production cost at last stage 332 | P = a**n*np.sum(p*S - (r*RR + k*U + h*II)) # discounted profit in period n 333 | # P = a**n*np.sum(p*S - (r*RR + k*U + h*I)) 334 | self.P[n] = P # store P 335 | 336 | # update period 337 | self.period += 1 338 | 339 | # update stae 340 | self._update_state() 341 | 342 | # set reward (profit from current timestep) 343 | reward = P 344 | 345 | # determine if simulation should terminate 346 | if self.period >= self.num_periods: 347 | done = True 348 | else: 349 | done = False 350 | 351 | return self.state, reward, done, {} 352 | 353 | def sample_action(self): 354 | ''' 355 | Generate an action by sampling 
from the action_space 356 | ''' 357 | return self.action_space.sample() 358 | 359 | def base_stock_action(self,z): 360 | ''' 361 | Sample action (number of units to request) based on a base-stock policy (order up to z policy) 362 | z = [integer list; dimension |Stages| - 1] base stock level (no inventory at the last stage) 363 | ''' 364 | n = self.period 365 | c = self.supply_capacity 366 | m = self.num_stages 367 | IP = self._update_base_stock_policy_state() # extract inventory position (current state) 368 | 369 | try: 370 | dimz = len(z) 371 | except: 372 | dimz = 1 373 | assert dimz == m-1, "Wrong dimension on base stock level vector. Should be # Stages - 1." 374 | 375 | # calculate total inventory position at the beginning of period n 376 | R = z - IP # replenishmet order to reach zopt 377 | 378 | # check if R can actually be fulfilled (capacity and inventory constraints) 379 | Im1 = np.append(self.I[n,1:], np.Inf) # available inventory at the m+1 stage 380 | # NOTE: last stage has unlimited raw materials 381 | Rpos = np.column_stack((np.zeros(len(R)),R)) # augmented materix to get replenishment only if positive 382 | A = np.column_stack((c, np.max(Rpos,axis=1), Im1)) # augmented matrix with c, R, and I_m+1 as columns 383 | 384 | R = np.min(A, axis = 1) # replenishmet order to reach zopt (capacity constrained) 385 | 386 | return R 387 | 388 | def step(self, action): 389 | return self._STEP(action) 390 | 391 | def reset(self): 392 | return self._RESET() 393 | 394 | class InvManagementBacklogEnv(InvManagementMasterEnv): 395 | def __init__(self, *args, **kwargs): 396 | super().__init__(*args, **kwargs) 397 | 398 | class InvManagementLostSalesEnv(InvManagementMasterEnv): 399 | def __init__(self, *args, **kwargs): 400 | super().__init__(*args, **kwargs) 401 | self.backlog = False 402 | self.observation_space = gym.spaces.Box( 403 | low=np.zeros(self.pipeline_length), # Never goes negative without backlog 404 | high=np.ones(self.pipeline_length)*self.supply_capacity.max()*self.num_periods, dtype=np.int32) -------------------------------------------------------------------------------- /or_gym/envs/classic_or/knapsack.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces, logger 4 | from gym.utils import seeding 5 | from or_gym.utils import assign_env_config 6 | import copy 7 | 8 | class KnapsackEnv(gym.Env): 9 | ''' 10 | Unbounded Knapsack Problem 11 | 12 | The Knapsack Problem (KP) is a combinatorial optimization problem which 13 | requires the user to select from a range of goods of different values and 14 | weights in order to maximize the value of the selected items within a 15 | given weight limit. This version is unbounded meaning that we can select 16 | items without limit. 17 | 18 | The episodes proceed by selecting items and placing them into the 19 | knapsack one at a time until the weight limit is reached or exceeded, at 20 | which point the episode ends. 21 | 22 | Observation: 23 | Type: Tuple, Discrete 24 | 0: list of item weights 25 | 1: list of item values 26 | 2: maximum weight of the knapsack 27 | 3: current weight in knapsack 28 | 29 | Actions: 30 | Type: Discrete 31 | 0: Place item 0 into knapsack 32 | 1: Place item 1 into knapsack 33 | 2: ... 34 | 35 | Reward: 36 | Value of item successfully placed into knapsack or 0 if the item 37 | doesn't fit, at which point the episode ends. 38 | 39 | Starting State: 40 | Lists of available items and empty knapsack. 
41 | 42 | Episode Termination: 43 | Full knapsack or selection that puts the knapsack over the limit. 44 | ''' 45 | 46 | # Internal list of placed items for better rendering 47 | _collected_items = [] 48 | 49 | def __init__(self, *args, **kwargs): 50 | # Generate data with consistent random seed to ensure reproducibility 51 | self.N = 200 52 | self.max_weight = 200 53 | self.current_weight = 0 54 | self._max_reward = 10000 55 | self.mask = True 56 | self.seed = 0 57 | self.item_numbers = np.arange(self.N) 58 | self.item_weights = np.random.randint(1, 100, size=self.N) 59 | self.item_values = np.random.randint(0, 100, size=self.N) 60 | self.over_packed_penalty = 0 61 | self.randomize_params_on_reset = False 62 | self._collected_items.clear() 63 | # Add env_config, if any 64 | assign_env_config(self, kwargs) 65 | self.set_seed() 66 | 67 | obs_space = spaces.Box( 68 | 0, self.max_weight, shape=(2*self.N + 1,), dtype=np.int32) 69 | self.action_space = spaces.Discrete(self.N) 70 | if self.mask: 71 | self.observation_space = spaces.Dict({ 72 | "action_mask": spaces.Box(0, 1, shape=(self.N,), dtype=np.uint8), 73 | "avail_actions": spaces.Box(0, 1, shape=(self.N,), dtype=np.uint8), 74 | "state": obs_space 75 | }) 76 | else: 77 | self.observation_space = spaces.Box( 78 | 0, self.max_weight, shape=(2, self.N + 1), dtype=np.int32) 79 | 80 | self.reset() 81 | 82 | def _STEP(self, item): 83 | # Check that item will fit 84 | if self.item_weights[item] + self.current_weight <= self.max_weight: 85 | self.current_weight += self.item_weights[item] 86 | reward = self.item_values[item] 87 | self._collected_items.append(item) 88 | if self.current_weight == self.max_weight: 89 | done = True 90 | else: 91 | done = False 92 | else: 93 | # End trial if over weight 94 | reward = self.over_packed_penalty 95 | done = True 96 | 97 | self._update_state() 98 | return self.state, reward, done, {} 99 | 100 | def _get_obs(self): 101 | return self.state 102 | 103 | def _update_state(self): 104 | if self.mask: 105 | mask = np.where(self.current_weight + self.item_weights > self.max_weight, 0, 1).astype(np.uint8) 106 | state = np.hstack([ 107 | self.item_weights, 108 | self.item_values, 109 | np.array([self.current_weight]) 110 | ], dtype=np.int32) 111 | self.state = { 112 | "action_mask": mask, 113 | "avail_actions": np.ones(self.N, dtype=np.uint8), 114 | "state": state 115 | } 116 | else: 117 | state = np.vstack([ 118 | self.item_weights, 119 | self.item_values], dtype=np.int32) 120 | self.state = np.hstack([ 121 | state, 122 | np.array([ 123 | [self.max_weight], 124 | [self.current_weight]]) 125 | ]) 126 | 127 | def _RESET(self): 128 | if self.randomize_params_on_reset: 129 | self.item_weights = np.random.randint(1, 100, size=self.N) 130 | self.item_values = np.random.randint(0, 100, size=self.N) 131 | self.current_weight = 0 132 | self._collected_items.clear() 133 | self._update_state() 134 | return self.state 135 | 136 | def sample_action(self): 137 | return np.random.choice(self.item_numbers) 138 | 139 | def set_seed(self, seed=None): 140 | if seed == None: 141 | seed = np.random.randint(0, np.iinfo(np.int32).max) 142 | self.np_random, seed = seeding.np_random(seed) 143 | return [seed] 144 | 145 | def reset(self): 146 | return self._RESET() 147 | 148 | def step(self, action): 149 | return self._STEP(action) 150 | 151 | def render(self): 152 | total_value = 0 153 | total_weight = 0 154 | for i in range(self.N) : 155 | if i in self._collected_items : 156 | total_value += self.item_values[i] 157 | total_weight += 
self.item_weights[i] 158 | print(self._collected_items, total_value, total_weight) 159 | 160 | # RlLib requirement: Make sure you either return a uint8/w x h x 3 (RGB) image or handle rendering in a window and then return `True`. 161 | return True 162 | 163 | class BinaryKnapsackEnv(KnapsackEnv): 164 | ''' 165 | Binary Knapsack Problem 166 | 167 | The Binary or 0-1 KP allows selection of each item only once or not at 168 | all. 169 | 170 | The episodes proceed by selecting items and placing them into the 171 | knapsack one at a time until the weight limit is reached or exceeded, at 172 | which point the episode ends. 173 | 174 | Observation: 175 | Type: Tuple, Discrete 176 | 0: list of item weights 177 | 1: list of item values 178 | 2: list of item limits 179 | 3: maximum weight of the knapsack 180 | 4: current weight in knapsack 181 | 182 | Actions: 183 | Type: Discrete 184 | 0: Place item 0 into knapsack 185 | 1: Place item 1 into knapsack 186 | 2: ... 187 | 188 | Reward: 189 | Value of item successfully placed into knapsack or 0 if the item 190 | doesn't fit, at which point the episode ends. 191 | 192 | Starting State: 193 | Lists of available items and empty knapsack. 194 | 195 | Episode Termination: 196 | Full knapsack or selection that puts the knapsack over the limit. 197 | ''' 198 | def __init__(self, *args, **kwargs): 199 | super().__init__() 200 | self.item_weights = np.random.randint(1, 100, size=self.N) 201 | self.item_values = np.random.randint(0, 100, size=self.N) 202 | assign_env_config(self, kwargs) 203 | 204 | obs_space = spaces.Box( 205 | 0, self.max_weight, shape=(3, self.N + 1), dtype=np.int32) 206 | if self.mask: 207 | self.observation_space = spaces.Dict({ 208 | "action_mask": spaces.Box(0, 1, shape=(len(self.item_limits),), dtype=np.uint8), 209 | "avail_actions": spaces.Box(0, 1, shape=(len(self.item_limits),), dtype=np.uint8), 210 | "state": obs_space 211 | }) 212 | else: 213 | self.observation_space = obs_space 214 | 215 | self.reset() 216 | 217 | def _STEP(self, item): 218 | # Check item limit 219 | if self.item_limits[item] > 0: 220 | # Check that item will fit 221 | if self.item_weights[item] + self.current_weight <= self.max_weight: 222 | self.current_weight += self.item_weights[item] 223 | reward = self.item_values[item] 224 | if self.current_weight == self.max_weight: 225 | done = True 226 | else: 227 | done = False 228 | self._update_state(item) 229 | else: 230 | # End if over weight 231 | reward = 0 232 | done = True 233 | else: 234 | # End if item is unavailable 235 | reward = 0 236 | done = True 237 | 238 | return self.state, reward, done, {} 239 | 240 | def _update_state(self, item=None): 241 | if item is not None: 242 | self.item_limits[item] -= 1 243 | state_items = np.vstack([ 244 | self.item_weights, 245 | self.item_values, 246 | self.item_limits 247 | ], dtype=np.int32) 248 | state = np.hstack([ 249 | state_items, 250 | np.array([[self.max_weight], 251 | [self.current_weight], 252 | [0] # Serves as place holder 253 | ]) 254 | ], dtype=np.int32) 255 | if self.mask: 256 | mask = np.where(self.current_weight + self.item_weights > self.max_weight, 0, 1).astype(np.uint8) 257 | mask = np.where(self.item_limits > 0, mask, 0) 258 | self.state = { 259 | "action_mask": mask, 260 | "avail_actions": np.ones(self.N, dtype=np.uint8), 261 | "state": state 262 | } 263 | else: 264 | self.state = state.copy() 265 | 266 | def sample_action(self): 267 | return np.random.choice( 268 | self.item_numbers[np.where(self.item_limits!=0)]) 269 | 270 | def _RESET(self): 271 | 
if self.randomize_params_on_reset: 272 | self.item_weights = np.random.randint(1, 100, size=self.N) 273 | self.item_values = np.random.randint(0, 100, size=self.N) 274 | self.current_weight = 0 275 | self.item_limits = np.ones(self.N, dtype=np.int32) 276 | self._update_state() 277 | return self.state 278 | 279 | class BoundedKnapsackEnv(KnapsackEnv): 280 | ''' 281 | Bounded Knapsack Problem 282 | 283 | The Knapsack Problem (KP) is a combinatorial optimization problem which 284 | requires the user to select from a range of goods of different values and 285 | weights in order to maximize the value of the selected items within a 286 | given weight limit. This version is bounded meaning each item can be 287 | selected a limited number of times. 288 | 289 | The episodes proceed by selecting items and placing them into the 290 | knapsack one at a time until the weight limit is reached or exceeded, at 291 | which point the episode ends. 292 | 293 | Observation: 294 | Type: Tuple, Discrete 295 | 0: list of item weights 296 | 1: list of item values 297 | 2: list of item limits 298 | 3: maximum weight of the knapsack 299 | 4: current weight in knapsack 300 | 301 | Actions: 302 | Type: Discrete 303 | 0: Place item 0 into knapsack 304 | 1: Place item 1 into knapsack 305 | 2: ... 306 | 307 | Reward: 308 | Value of item successfully placed into knapsack or 0 if the item 309 | doesn't fit, at which point the episode ends. 310 | 311 | Starting State: 312 | Lists of available items and empty knapsack. 313 | 314 | Episode Termination: 315 | Full knapsack or selection that puts the knapsack over the limit. 316 | ''' 317 | def __init__(self, *args, **kwargs): 318 | self.N = 200 319 | self.item_limits_init = np.random.randint(1, 10, size=self.N, dtype=np.int32) 320 | self.item_limits = self.item_limits_init.copy() 321 | super().__init__() 322 | self.item_weights = np.random.randint(1, 100, size=self.N, dtype=np.int32) 323 | self.item_values = np.random.randint(0, 100, size=self.N, dtype=np.int32) 324 | 325 | assign_env_config(self, kwargs) 326 | 327 | obs_space = spaces.Box( 328 | 0, self.max_weight, shape=(3, self.N + 1), dtype=np.int32) 329 | if self.mask: 330 | self.observation_space = spaces.Dict({ 331 | "action_mask": spaces.Box(0, 1, shape=(len(self.item_limits),), dtype=np.uint8), 332 | "avail_actions": spaces.Box(0, 1, shape=(len(self.item_limits),), dtype=np.uint8), 333 | "state": obs_space 334 | }) 335 | else: 336 | self.observation_space = obs_space 337 | 338 | def _STEP(self, item): 339 | # Check item limit 340 | if self.item_limits[item] > 0: 341 | # Check that item will fit 342 | if self.item_weights[item] + self.current_weight <= self.max_weight: 343 | self.current_weight += self.item_weights[item] 344 | reward = self.item_values[item] 345 | if self.current_weight == self.max_weight: 346 | done = True 347 | else: 348 | done = False 349 | self._update_state(item) 350 | else: 351 | # End if over weight 352 | reward = 0 353 | done = True 354 | else: 355 | # End if item is unavailable 356 | reward = 0 357 | done = True 358 | 359 | return self.state, reward, done, {} 360 | 361 | def _update_state(self, item=None): 362 | if item is not None: 363 | self.item_limits[item] -= 1 364 | state_items = np.vstack([ 365 | self.item_weights, 366 | self.item_values, 367 | self.item_limits 368 | ], dtype=np.int32) 369 | state = np.hstack([ 370 | state_items, 371 | np.array([[self.max_weight], 372 | [self.current_weight], 373 | [0] # Serves as place holder 374 | ], dtype=np.int32) 375 | ]) 376 | if self.mask: 377 | 
mask = np.where(self.current_weight + self.item_weights > self.max_weight, 0, 1).astype(np.uint8) 378 | mask = np.where(self.item_limits > 0, mask, 0) 379 | self.state = { 380 | "action_mask": mask, 381 | "avail_actions": np.ones(self.N, dtype=np.uint8), 382 | "state": state 383 | } 384 | else: 385 | self.state = state.copy() 386 | 387 | def sample_action(self): 388 | return np.random.choice( 389 | self.item_numbers[np.where(self.item_limits!=0)]) 390 | 391 | def _RESET(self): 392 | if self.randomize_params_on_reset: 393 | self.item_weights = np.random.randint(1, 100, size=self.N, dtype=np.int32) 394 | self.item_values = np.random.randint(0, 100, size=self.N, dtype=np.int32) 395 | self.item_limits = np.random.randint(1, 10, size=self.N, dtype=np.int32) 396 | else: 397 | self.item_limits = self.item_limits_init.copy() 398 | 399 | self.current_weight = 0 400 | self._update_state() 401 | return self.state 402 | 403 | class OnlineKnapsackEnv(BoundedKnapsackEnv): 404 | ''' 405 | Online Knapsack Problem 406 | 407 | The Knapsack Problem (KP) is a combinatorial optimization problem which 408 | requires the user to select from a range of goods of different values and 409 | weights in order to maximize the value of the selected items within a 410 | given weight limit. This version is online meaning each item is randomly 411 | presented to the algorithm one at a time, at which point the algorithm 412 | can either accept or reject the item. After seeing a fixed number of 413 | items are shown, the episode terminates. If the weight limit is reached 414 | before the episode ends, then it terminates early. 415 | 416 | Observation: 417 | Type: Tuple, Discrete 418 | 0: list of item weights 419 | 1: list of item values 420 | 2: list of item limits 421 | 3: maximum weight of the knapsack 422 | 4: current weight in knapsack 423 | 424 | 425 | Actions: 426 | Type: Discrete 427 | 0: Reject item 428 | 1: Place item into knapsack 429 | 430 | Reward: 431 | Value of item successfully placed into knapsack or 0 if the item 432 | doesn't fit, at which point the episode ends. 433 | 434 | Starting State: 435 | Lists of available items and empty knapsack. 436 | 437 | Episode Termination: 438 | Full knapsack, selection that puts the knapsack over the limit, or 439 | the number of items to be drawn has been reached. 
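    Example:
        An illustrative rollout sketch (assumes the default parameters;
        with the default mask=True the observation is a dict holding
        'state', 'action_mask', and 'avail_actions'):

            env = OnlineKnapsackEnv()
            obs, done = env.reset(), False
            while not done:
                # 1 accepts the currently offered item, 0 rejects it
                action = env.sample_action()
                obs, reward, done, _ = env.step(action)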
440 | ''' 441 | def __init__(self, *args, **kwargs): 442 | BoundedKnapsackEnv.__init__(self) 443 | assign_env_config(self, kwargs) 444 | self.action_space = spaces.Discrete(2) 445 | 446 | obs_space = spaces.Box(0, self.max_weight, shape=(4,), dtype=np.int32) 447 | if self.mask: 448 | self.observation_space = spaces.Dict({ 449 | 'state': obs_space, 450 | 'avail_actions': spaces.Box(0, 1, shape=(2,), dtype=np.uint8), 451 | 'action_mask': spaces.Box(0, 1, shape=(2,), dtype=np.uint8) 452 | }) 453 | else: 454 | self.observation_space = obs_space 455 | 456 | self.step_counter = 0 457 | self.step_limit = 50 458 | 459 | self.state = self.reset() 460 | self._max_reward = 600 461 | 462 | def _STEP(self, action): 463 | if bool(action): 464 | # Check that item will fit 465 | if self.item_weights[self.current_item] + self.current_weight <= self.max_weight: 466 | self.current_weight += self.item_weights[self.current_item] 467 | reward = self.item_values[self.current_item] 468 | if self.current_weight == self.max_weight: 469 | done = True 470 | else: 471 | done = False 472 | else: 473 | # End if over weight 474 | reward = 0 475 | done = True 476 | else: 477 | reward = 0 478 | done = False 479 | 480 | self._update_state() 481 | self.step_counter += 1 482 | if self.step_counter >= self.step_limit: 483 | done = True 484 | 485 | return self.state, reward, done, {} 486 | 487 | def _update_state(self): 488 | self.current_item = np.random.choice(self.item_numbers, p=self.item_probs) 489 | current_item_weight = self.item_weights[self.current_item] 490 | state = np.array([ 491 | self.current_weight, 492 | self.current_item, 493 | current_item_weight, 494 | self.item_values[self.current_item] 495 | ],) 496 | if self.mask: 497 | mask = np.ones(2, dtype=np.uint8) 498 | if current_item_weight + self.current_weight > self.max_weight: 499 | mask[1] = 0 500 | self.state = { 501 | 'state': state, 502 | 'avail_actions': np.ones(2, dtype=np.uint8), 503 | 'action_mask': mask 504 | } 505 | else: 506 | self.state = state 507 | 508 | def sample_action(self): 509 | return np.random.choice([0, 1]) 510 | 511 | def _RESET(self): 512 | if self.randomize_params_on_reset: 513 | self.item_weights = np.random.randint(1, 100, size=self.N, dtype=np.int32) 514 | self.item_values = np.random.randint(0, 100, size=self.N, dtype=np.int32) 515 | self.item_limits = np.random.randint(1, 10, size=self.N, dtype=np.int32) 516 | else: 517 | self.item_limits = self.item_limits_init.copy() 518 | 519 | if not hasattr(self, 'item_probs'): 520 | self.item_probs = self.item_limits_init / self.item_limits_init.sum() 521 | self.current_weight = 0 522 | self.step_counter = 0 523 | self._update_state() 524 | return self.state -------------------------------------------------------------------------------- /or_gym/envs/supply_chain/network_management.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Multi-period inventory management 3 | Hector Perez, Christian Hubbs, Can Li 4 | 9/14/2020 5 | ''' 6 | 7 | import gym 8 | import itertools 9 | import numpy as np 10 | import networkx as nx 11 | import pandas as pd 12 | from scipy.stats import * 13 | from or_gym.utils import assign_env_config 14 | from collections import deque 15 | import matplotlib.pyplot as plt 16 | 17 | class NetInvMgmtMasterEnv(gym.Env): 18 | ''' 19 | The supply network environment is structured as follows: 20 | 21 | It is a multi-period multi-node production-inventory system for 22 | a single non-perishable product that is sold in discrete 
quantities.
23 |     Two main types of nodes exist: 1) production nodes, which have an
24 |     inventory holding area and a manufacturing area, and 2) distribution
25 |     nodes, which only have an inventory holding area. Retail nodes are
26 |     considered distribution nodes. Other node types in the network are
27 |     raw material nodes (source nodes), which have an unlimited supply
28 |     of raw materials, and market nodes (sink nodes), which generate an
29 |     uncertain demand on their respective retailers in each period.
30 | 
31 |     Within production nodes, the inventory holding area holds the inventory
32 |     necessary to produce the respective intermediate material at that node.
33 |     Yield ratios are specified at each production stage, relating the amount
34 |     of material produced from one unit of inventory. Production at each node
35 |     is bounded by the node's production capacity and the available inventory.
36 | 
37 |     Lead times between neighboring nodes exist and are associated with the edges
38 |     connecting them.
39 | 
40 |     At the beginning of each time period, the following sequence of events occurs:
41 | 
42 |     0) Each node places replenishment orders with its immediate suppliers.
43 |     Replenishment orders are filled according to the available production capacity
44 |     and available inventory at the suppliers. There is a cost associated with
45 |     each order request.
46 |     1) Incoming inventory replenishment shipments that have made it down the network
47 |     pipeline (after the associated lead time) are received at each node.
48 |     2) Market demands occur at the retail nodes. Demands are sampled from a
49 |     specified discrete probability distribution. Demands are filled according
50 |     to the available inventory at the retailers.
51 |     3) Option: one of the following occurs,
52 |         a) Unfulfilled sales are backlogged at a penalty.
53 |             Note: Backlogged orders are added to the next period's market demand.
54 |         b) Unfulfilled sales and replenishment orders are lost with a
55 |             goodwill-loss penalty.
56 |     4) Surplus inventory is held at each stage at a holding cost.
57 |     Pipeline inventories also incur a cost for each period spent in the pipeline.
58 | 
59 |     '''
60 |     def __init__(self, *args, **kwargs):
61 |         '''
62 |         num_periods = number of periods in the simulation.
63 |         Node specific parameters:
64 |             - I0 = initial inventory.
65 |             - C = production capacity.
66 |             - v = production yield in the range (0, 1].
67 |             - o = unit operating cost (feed-based).
68 |             - h = unit holding cost for excess on-hand inventory.
69 |         Edge specific parameters:
70 |             - L = lead time between adjacent nodes.
71 |             - p = unit price to send material between adjacent nodes (purchase price/reorder cost).
72 |             - b = unit backlog cost or goodwill-loss penalty for unfulfilled market demand between an adjacent retailer and market.
73 |             - g = unit holding cost for pipeline inventory on a specified edge.
74 |             - prob_dist = probability distribution function on a (retailer, market) edge.
75 |             - demand_dist = demand distribution for a (retailer, market) edge. Two options:
76 |                 - use a scipy.stats distribution object whose rvs method is called each
77 |                   period with the parameters given in dist_param, e.g. poisson with dist_param = {'mu': 20}.
78 |                 - use a list of user specified demands for each period (see user_D).
79 |         backlog = Are unfulfilled orders backlogged? True = backlogged, False = lost sales.
80 |         demand_dist = distribution function for customer demand (e.g. poisson, binomial, uniform, geometric, etc.)
81 |         dist_param = named values for the parameters fed to the statistical distribution, e.g.
82 | poisson: {'mu': } 83 | binom: {'n': , 84 | 'p': } 85 | raindint: {'low' = , 'high': } 86 | geom: {'p': } 87 | alpha = discount factor in the range (0,1] that accounts for the time value of money 88 | seed_int = integer seed for random state. 89 | user_D = dictionary containing user specified demand (list) for each (retail, market) pair at 90 | each time period in the simulation. If all zeros, ignored; otherwise, demands will be taken from this list. 91 | sample_path = dictionary specifying if is user_D (for each (retail, market) pair) is sampled from demand_dist. 92 | ''' 93 | # set default (arbitrary) values when creating environment (if no args or kwargs are given) 94 | self._max_rewards = 2000 95 | self.num_periods = 30 96 | self.backlog = True 97 | self.alpha = 1.00 98 | self.seed_int = 0 99 | self.user_D = {(1,0): np.zeros(self.num_periods)} 100 | self.sample_path = {(1,0): False} 101 | self._max_rewards = 2000 102 | 103 | # create graph 104 | self.graph = nx.DiGraph() 105 | # Market 106 | self.graph.add_nodes_from([0]) 107 | # Retailer 108 | self.graph.add_nodes_from([1], I0 = 100, 109 | h = 0.030) 110 | # Distributors 111 | self.graph.add_nodes_from([2], I0 = 110, 112 | h = 0.020) 113 | self.graph.add_nodes_from([3], I0 = 80, 114 | h = 0.015) 115 | # Manufacturers 116 | self.graph.add_nodes_from([4], I0 = 400, 117 | C = 90, 118 | o = 0.010, 119 | v = 1.000, 120 | h = 0.012) 121 | self.graph.add_nodes_from([5], I0 = 350, 122 | C = 90, 123 | o = 0.015, 124 | v = 1.000, 125 | h = 0.013) 126 | self.graph.add_nodes_from([6], I0 = 380, 127 | C = 80, 128 | o = 0.012, 129 | v = 1.000, 130 | h = 0.011) 131 | # Raw materials 132 | self.graph.add_nodes_from([7, 8]) 133 | # Links 134 | self.graph.add_edges_from([(1,0,{'p': 2.000, 135 | 'b': 0.100, 136 | 'demand_dist': poisson, 137 | 'dist_param': {'mu': 20}}), 138 | (2,1,{'L': 5, 139 | 'p': 1.500, 140 | 'g': 0.010}), 141 | (3,1,{'L': 3, 142 | 'p': 1.600, 143 | 'g': 0.015}), 144 | (4,2,{'L': 8, 145 | 'p': 1.000, 146 | 'g': 0.008}), 147 | (4,3,{'L': 10, 148 | 'p': 0.800, 149 | 'g': 0.006}), 150 | (5,2,{'L': 9, 151 | 'p': 0.700, 152 | 'g': 0.005}), 153 | (6,2,{'L': 11, 154 | 'p': 0.750, 155 | 'g': 0.007}), 156 | (6,3,{'L': 12, 157 | 'p': 0.800, 158 | 'g': 0.004}), 159 | (7,4,{'L': 0, 160 | 'p': 0.150, 161 | 'g': 0.000}), 162 | (7,5,{'L': 1, 163 | 'p': 0.050, 164 | 'g': 0.005}), 165 | (8,5,{'L': 2, 166 | 'p': 0.070, 167 | 'g': 0.002}), 168 | (8,6,{'L': 0, 169 | 'p': 0.200, 170 | 'g': 0.000})]) 171 | 172 | # add environment configuration dictionary and keyword arguments 173 | assign_env_config(self, kwargs) 174 | 175 | # Save user_D and sample_path to graph metadata 176 | for link in self.user_D.keys(): 177 | d = self.user_D[link] 178 | if np.sum(d) != 0: 179 | self.graph.edges[link]['user_D'] = d 180 | if link in self.sample_path.keys(): 181 | self.graph.edges[link]['sample_path'] = self.sample_path[link] 182 | else: 183 | # Placeholder to avoid key errors 184 | self.graph.edges[link]['user_D'] = 0 185 | 186 | self.num_nodes = self.graph.number_of_nodes() 187 | self.adjacency_matrix = np.vstack(self.graph.edges()) 188 | # Set node levels 189 | self.levels = {} 190 | self.levels['retailer'] = np.array([1]) 191 | self.levels['distributor'] = np.unique(np.hstack( 192 | [list(self.graph.predecessors(i)) for i in self.levels['retailer']])) 193 | self.levels['manufacturer'] = np.unique(np.hstack( 194 | [list(self.graph.predecessors(i)) for i in self.levels['distributor']])) 195 | self.levels['raw_materials'] = np.unique(np.hstack( 196 | 
[list(self.graph.predecessors(i)) for i in self.levels['manufacturer']])) 197 | 198 | self.level_col = {'retailer': 0, 199 | 'distributor': 1, 200 | 'manufacturer': 2, 201 | 'raw_materials': 3} 202 | 203 | self.market = [j for j in self.graph.nodes() if len(list(self.graph.successors(j))) == 0] 204 | self.distrib = [j for j in self.graph.nodes() if 'C' not in self.graph.nodes[j] and 'I0' in self.graph.nodes[j]] 205 | self.retail = [j for j in self.graph.nodes() if len(set.intersection(set(self.graph.successors(j)), set(self.market))) > 0] 206 | self.factory = [j for j in self.graph.nodes() if 'C' in self.graph.nodes[j]] 207 | self.rawmat = [j for j in self.graph.nodes() if len(list(self.graph.predecessors(j))) == 0] 208 | self.main_nodes = np.sort(self.distrib + self.factory) 209 | self.reorder_links = [e for e in self.graph.edges() if 'L' in self.graph.edges[e]] #exclude links to markets (these cannot have lead time 'L') 210 | self.retail_links = [e for e in self.graph.edges() if 'L' not in self.graph.edges[e]] #links joining retailers to markets 211 | self.network_links = [e for e in self.graph.edges()] #all links involved in sale in the network 212 | 213 | # check inputs 214 | assert set(self.graph.nodes()) == set.union(set(self.market), 215 | set(self.distrib), 216 | set(self.factory), 217 | set(self.rawmat)), "The union of market, distribution, factory, and raw material nodes is not equal to the system nodes." 218 | for j in self.graph.nodes(): 219 | if 'I0' in self.graph.nodes[j]: 220 | assert self.graph.nodes[j]['I0'] >= 0, "The initial inventory cannot be negative for node {}.".format(j) 221 | if 'h' in self.graph.nodes[j]: 222 | assert self.graph.nodes[j]['h'] >= 0, "The inventory holding costs cannot be negative for node {}.".format(j) 223 | if 'C' in self.graph.nodes[j]: 224 | assert self.graph.nodes[j]['C'] > 0, "The production capacity must be positive for node {}.".format(j) 225 | if 'o' in self.graph.nodes[j]: 226 | assert self.graph.nodes[j]['o'] >= 0, "The operating costs cannot be negative for node {}.".format(j) 227 | if 'v' in self.graph.nodes[j]: 228 | assert self.graph.nodes[j]['v'] > 0 and self.graph.nodes[j]['v'] <= 1, "The production yield must be in the range (0, 1] for node {}.".format(j) 229 | for e in self.graph.edges(): 230 | if 'L' in self.graph.edges[e]: 231 | assert self.graph.edges[e]['L'] >= 0, "The lead time joining nodes {} cannot be negative.".format(e) 232 | if 'p' in self.graph.edges[e]: 233 | assert self.graph.edges[e]['p'] >= 0, "The sales price joining nodes {} cannot be negative.".format(e) 234 | if 'b' in self.graph.edges[e]: 235 | assert self.graph.edges[e]['b'] >= 0, "The unfulfilled demand costs joining nodes {} cannot be negative.".format(e) 236 | if 'g' in self.graph.edges[e]: 237 | assert self.graph.edges[e]['g'] >= 0, "The pipeline inventory holding costs joining nodes {} cannot be negative.".format(e) 238 | if 'sample_path' in self.graph.edges[e]: 239 | assert isinstance(self.graph.edges[e]['sample_path'], bool), "When specifying if a user specified demand joining (retailer, market): {} is sampled from a distribution, sample_path must be a Boolean.".format(e) 240 | if 'demand_dist' in self.graph.edges[e]: 241 | dist = self.graph.edges[e]['demand_dist'] #extract distribution 242 | assert dist.cdf(0,**self.graph.edges[e]['dist_param']), "Wrong parameters passed to the demand distribution joining (retailer, market): {}.".format(e) 243 | assert self.backlog == False or self.backlog == True, "The backlog parameter must be a boolean." 
244 | assert self.graph.number_of_nodes() >= 2, "The minimum number of nodes is 2. Please try again" 245 | assert self.alpha>0 and self.alpha<=1, "alpha must be in the range (0, 1]." 246 | 247 | # set random generation seed (unless using user demands) 248 | self.seed(self.seed_int) 249 | 250 | # action space (reorder quantities for each node for each supplier; list) 251 | # An action is defined for every node 252 | num_reorder_links = len(self.reorder_links) 253 | self.lt_max = np.max([self.graph.edges[e]['L'] for e in self.graph.edges() if 'L' in self.graph.edges[e]]) 254 | self.init_inv_max = np.max([self.graph.nodes[j]['I0'] for j in self.graph.nodes() if 'I0' in self.graph.nodes[j]]) 255 | self.capacity_max = np.max([self.graph.nodes[j]['C'] for j in self.graph.nodes() if 'C' in self.graph.nodes[j]]) 256 | self.pipeline_length = sum([self.graph.edges[e]['L'] 257 | for e in self.graph.edges() if 'L' in self.graph.edges[e]]) 258 | self.lead_times = {e: self.graph.edges[e]['L'] 259 | for e in self.graph.edges() if 'L' in self.graph.edges[e]} 260 | self.obs_dim = self.pipeline_length + len(self.main_nodes) + len(self.retail_links) 261 | # self.pipeline_length = len(self.main_nodes)*(self.lt_max+1) 262 | self.action_space = gym.spaces.Box( 263 | low=np.zeros(num_reorder_links), 264 | high=np.ones(num_reorder_links)*(self.init_inv_max + self.capacity_max*self.num_periods), 265 | dtype=np.float64) 266 | # observation space (total inventory at each node, which is any integer value) 267 | self.observation_space = gym.spaces.Box( 268 | low=np.ones(self.obs_dim)*np.iinfo(np.int32).min, 269 | high=np.ones(self.obs_dim)*np.iinfo(np.int32).max, 270 | dtype=np.float64) 271 | # low=-np.ones(self.pipeline_length)*(self.init_inv_max + self.capacity_max*self.num_periods)*10, 272 | # high=np.ones(self.pipeline_length)*(self.init_inv_max + self.capacity_max*self.num_periods), 273 | # dtype=np.int32) 274 | 275 | # intialize 276 | self.reset() 277 | 278 | def seed(self,seed=None): 279 | ''' 280 | Set random number generation seed 281 | ''' 282 | # seed random state 283 | if seed != None: 284 | np.random.seed(seed=int(seed)) 285 | 286 | def _RESET(self): 287 | ''' 288 | Create and initialize all variables and containers. 289 | Nomenclature: 290 | I = On hand inventory at the start of each period at each stage (except last one). 291 | T = Pipeline inventory at the start of each period at each stage (except last one). 292 | R = Replenishment order placed at each period at each stage (except last one). 293 | D = Customer demand at each period (at the retailer) 294 | S = Sales performed at each period at each stage. 295 | B = Backlog at each period at each stage. 296 | LS = Lost sales at each period at each stage. 297 | P = Total profit at each stage. 
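        Note: the observation returned to the agent is not these containers
        themselves but the concatenation built by _update_state(), i.e.
        [demand on each retail link, on-hand inventory at each main node,
        pipeline inventory over the lead time of each reorder link], a vector
        of length obs_dim = pipeline_length + len(main_nodes) + len(retail_links).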
298 | ''' 299 | T = self.num_periods 300 | J = len(self.main_nodes) 301 | RM = len(self.retail_links) # number of retailer-market pairs 302 | PS = len(self.reorder_links) # number of purchaser-supplier pairs in the network 303 | SL = len(self.network_links) # number of edges in the network (excluding links form raw material nodes) 304 | 305 | # simulation result lists 306 | self.X=pd.DataFrame(data = np.zeros([T + 1, J]), 307 | columns = self.main_nodes) # inventory at the beginning of each period 308 | self.Y=pd.DataFrame(data = np.zeros([T + 1, PS]), 309 | columns = pd.MultiIndex.from_tuples(self.reorder_links, 310 | names = ['Source','Receiver'])) # pipeline inventory at the beginning of each period 311 | self.R=pd.DataFrame(data = np.zeros([T, PS]), 312 | columns = pd.MultiIndex.from_tuples(self.reorder_links, 313 | names = ['Supplier','Requester'])) # replenishment orders 314 | self.S=pd.DataFrame(data = np.zeros([T, SL]), 315 | columns = pd.MultiIndex.from_tuples(self.network_links, 316 | names = ['Seller','Purchaser'])) # units sold 317 | self.D=pd.DataFrame(data = np.zeros([T, RM]), 318 | columns = pd.MultiIndex.from_tuples(self.retail_links, 319 | names = ['Retailer','Market'])) # demand at retailers 320 | self.U=pd.DataFrame(data = np.zeros([T, RM]), 321 | columns = pd.MultiIndex.from_tuples(self.retail_links, 322 | names = ['Retailer','Market'])) # unfulfilled demand for each market - retailer pair 323 | self.P=pd.DataFrame(data = np.zeros([T, J]), 324 | columns = self.main_nodes) # profit at each node 325 | 326 | # initializetion 327 | self.period = 0 # initialize time 328 | for j in self.main_nodes: 329 | self.X.loc[0,j]=self.graph.nodes[j]['I0'] # initial inventory 330 | self.Y.loc[0,:]=np.zeros(PS) # initial pipeline inventory 331 | self.action_log = np.zeros([T, PS]) 332 | 333 | # set state 334 | self._update_state() 335 | 336 | return self.state 337 | 338 | def _update_state(self): 339 | # State is a concatenation of demand, inventory, and pipeline at each time step 340 | demand = np.hstack([self.D[d].iloc[self.period] for d in self.retail_links]) 341 | inventory = np.hstack([self.X[n].iloc[self.period] for n in self.main_nodes]) 342 | 343 | # Pipeline values won't be of proper dimension if current 344 | # current period < lead time. We need to add 0's as padding. 345 | if self.period == 0: 346 | _pipeline = [[self.Y[k].iloc[0]] 347 | for k, v in self.lead_times.items()] 348 | else: 349 | _pipeline = [self.Y[k].iloc[max(self.period-v,0):self.period].values 350 | for k, v in self.lead_times.items()] 351 | pipeline = [] 352 | for p, v in zip(_pipeline, self.lead_times.values()): 353 | if v == 0: 354 | continue 355 | if len(p) <= v: 356 | pipe = np.zeros(v) 357 | pipe[-len(p):] += p 358 | pipeline.append(pipe) 359 | pipeline = np.hstack(pipeline) 360 | self.state = np.hstack([demand, inventory, pipeline]) 361 | 362 | def _STEP(self, action): 363 | ''' 364 | Take a step in time in the multiperiod inventory management problem. 365 | action = number of units to request from each supplier. 
366 | dictionary: keys are (supplier, purchaser) tuples 367 | values are number of units requested from supplier 368 | dimension = len(reorder_links) (number of edges joining all nodes, 369 | except market nodes) 370 | ''' 371 | t = self.period 372 | if type(action) != dict: # convert to dict if a list was given 373 | action = {key: action[i] for i, key in enumerate(self.reorder_links)} 374 | 375 | # Place Orders 376 | for key in action.keys(): 377 | request = round(max(action[key],0)) # force to integer value 378 | supplier = key[0] 379 | purchaser = key[1] 380 | if supplier in self.rawmat: 381 | self.R.loc[t,(supplier, purchaser)] = request # accept request since supply is unlimited 382 | self.S.loc[t,(supplier, purchaser)] = request 383 | elif supplier in self.distrib: 384 | X_supplier = self.X.loc[t,supplier] # request limited by available inventory at beginning of period 385 | self.R.loc[t,(supplier, purchaser)] = min(request, X_supplier) 386 | self.S.loc[t,(supplier, purchaser)] = min(request, X_supplier) 387 | elif supplier in self.factory: 388 | C = self.graph.nodes[supplier]['C'] # supplier capacity 389 | v = self.graph.nodes[supplier]['v'] # supplier yield 390 | X_supplier = self.X.loc[t,supplier] # on-hand inventory at beginning of period 391 | self.R.loc[t,(supplier, purchaser)] = min(request, C, v*X_supplier) 392 | self.S.loc[t,(supplier, purchaser)] = min(request, C, v*X_supplier) 393 | 394 | #Receive deliveries and update inventories 395 | for j in self.main_nodes: 396 | #update pipeline inventories 397 | incoming = [] 398 | for k in self.graph.predecessors(j): 399 | L = self.graph.edges[(k,j)]['L'] #extract lead time 400 | if t - L >= 0: #check if delivery has arrived 401 | delivery = self.R.loc[t-L,(k,j)] 402 | else: 403 | delivery = 0 404 | incoming += [delivery] #update incoming material 405 | self.Y.loc[t+1,(k,j)] = self.Y.loc[t,(k,j)] - delivery + self.R.loc[t,(k,j)] 406 | 407 | #update on-hand inventory 408 | if 'v' in self.graph.nodes[j]: #extract production yield 409 | v = self.graph.nodes[j]['v'] 410 | else: 411 | v = 1 412 | outgoing = 1/v * np.sum([self.S.loc[t,(j,k)] for k in self.graph.successors(j)]) #consumed inventory (for requests placed) 413 | self.X.loc[t+1,j] = self.X.loc[t,j] + np.sum(incoming) - outgoing 414 | 415 | # demand is realized 416 | for j in self.retail: 417 | for k in self.market: 418 | #read user specified demand. if all zeros, use demand_dist instead. 
419 | Demand = self.graph.edges[(j,k)]['user_D'] 420 | if np.sum(Demand) > 0: 421 | self.D.loc[t,(j,k)] = Demand[t] 422 | else: 423 | Demand = self.graph.edges[(j,k)]['demand_dist'] 424 | self.D.loc[t,(j,k)] = Demand.rvs( 425 | **self.graph.edges[(j,k)]['dist_param']) 426 | if self.backlog and t >= 1: 427 | D = self.D.loc[t,(j,k)] + self.U.loc[t-1,(j,k)] 428 | else: 429 | D = self.D.loc[t,(j,k)] 430 | #satisfy demand up to available level 431 | X_retail = self.X.loc[t+1,j] #get inventory at retail before demand was realized 432 | self.S.loc[t,(j,k)] = min(D, X_retail) #perform sale 433 | self.X.loc[t+1,j] -= self.S.loc[t,(j,k)] #update inventory 434 | self.U.loc[t,(j,k)] = D - self.S.loc[t,(j,k)] #update unfulfilled orders 435 | 436 | # calculate profit 437 | for j in self.main_nodes: 438 | a = self.alpha 439 | SR = np.sum([self.graph.edges[(j,k)]['p'] * self.S.loc[t,(j,k)] for k in self.graph.successors(j)]) #sales revenue 440 | PC = np.sum([self.graph.edges[(k,j)]['p'] * self.R.loc[t,(k,j)] for k in self.graph.predecessors(j)]) #purchasing costs 441 | if j not in self.rawmat: 442 | HC = self.graph.nodes[j]['h'] * self.X.loc[t+1,j] + np.sum([self.graph.edges[(k,j)]['g'] * self.Y.loc[t+1,(k,j)] for k in self.graph.predecessors(j)]) #holding costs 443 | else: 444 | HC = 0 445 | if j in self.factory: 446 | OC = self.graph.nodes[j]['o'] / self.graph.nodes[j]['v'] * np.sum([self.S.loc[t,(j,k)] for k in self.graph.successors(j)]) #operating costs 447 | else: 448 | OC = 0 449 | if j in self.retail: 450 | UP = np.sum([self.graph.edges[(j,k)]['b'] * self.U.loc[t,(j,k)] for k in self.graph.successors(j)]) #unfulfilled penalty 451 | else: 452 | UP = 0 453 | self.P.loc[t,j] = a**t * (SR - PC - OC - HC - UP) 454 | 455 | # update period 456 | self.period += 1 457 | 458 | # set reward (profit from current timestep) 459 | reward = self.P.loc[t,:].sum() 460 | 461 | # determine if simulation should terminate 462 | if self.period >= self.num_periods: 463 | done = True 464 | else: 465 | done = False 466 | # update stae 467 | self._update_state() 468 | 469 | return self.state, reward, done, {} 470 | 471 | def sample_action(self): 472 | ''' 473 | Generate an action by sampling from the action_space 474 | ''' 475 | return self.action_space.sample() 476 | 477 | def step(self, action): 478 | return self._STEP(action) 479 | 480 | def reset(self): 481 | return self._RESET() 482 | 483 | def plot_network(self): 484 | colors = plt.rcParams['axes.prop_cycle'].by_key()['color'] 485 | adjacency_matrix = np.vstack(self.graph.edges()) 486 | # Set level colors 487 | level_col = {'retailer': 0, 488 | 'distributor': 1, 489 | 'manufacturer': 2, 490 | 'raw_materials': 3} 491 | 492 | max_density = np.max([len(v) for v in self.levels.values()]) 493 | node_coords = {} 494 | node_num = 1 495 | plt.figure(figsize=(12,8)) 496 | for i, (level, nodes) in enumerate(self.levels.items()): 497 | n = len(nodes) 498 | node_y = max_density / 2 if n == 1 else np.linspace(0, max_density, n) 499 | node_y = np.atleast_1d(node_y) 500 | plt.scatter(np.repeat(i, n), node_y, label=level, s=50) 501 | for y in node_y: 502 | plt.annotate(r'$N_{}$'.format(node_num), xy=(i, y+0.05)) 503 | node_coords[node_num] = (i, y) 504 | node_num += 1 505 | 506 | # Draw edges 507 | for node_num, v in node_coords.items(): 508 | x, y = v 509 | sinks = adjacency_matrix[np.where(adjacency_matrix[:, 0]==node_num)][:, 1] 510 | for s in sinks: 511 | try: 512 | sink_coord = node_coords[s] 513 | except KeyError: 514 | continue 515 | for k, n in self.levels.items(): 516 | if 
node_num in n: 517 | color = colors[level_col[k]] 518 | x_ = np.hstack([x, sink_coord[0]]) 519 | y_ = np.hstack([y, sink_coord[1]]) 520 | plt.plot(x_, y_, color=color) 521 | 522 | plt.ylabel('Node') 523 | plt.yticks([0], ['']) 524 | plt.xlabel('Level') 525 | plt.xticks(np.arange(len(self.levels)), [k for k in self.levels.keys()]) 526 | plt.show() 527 | 528 | class NetInvMgmtBacklogEnv(NetInvMgmtMasterEnv): 529 | def __init__(self, *args, **kwargs): 530 | super().__init__(*args, **kwargs) 531 | 532 | class NetInvMgmtLostSalesEnv(NetInvMgmtMasterEnv): 533 | def __init__(self, *args, **kwargs): 534 | super().__init__(*args, **kwargs) 535 | self.backlog = False --------------------------------------------------------------------------------
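The network environments above follow the standard gym interaction loop. A minimal usage sketch (illustrative only; it instantiates the backlog variant directly with the default network defined in __init__ and relies on the step(), reset(), and sample_action() methods shown above):

from or_gym.envs.supply_chain.network_management import NetInvMgmtBacklogEnv

# Default configuration: 9-node network (market, retailer, distributors,
# manufacturers, raw materials) simulated over 30 periods.
env = NetInvMgmtBacklogEnv()
state = env.reset()
total_profit, done = 0.0, False
while not done:
    # sample_action() draws a random reorder quantity for each reorder link;
    # _STEP rounds these to non-negative integers before placing the orders.
    action = env.sample_action()
    state, reward, done, _ = env.step(action)
    total_profit += reward
print('Total discounted profit over the episode: {:.2f}'.format(total_profit))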