├── or_gym ├── version.py ├── envs │ ├── finance │ │ ├── __init__.py │ │ └── portfolio_opt.py │ ├── registry.py │ ├── classic_or │ │ ├── __init__.py │ │ ├── newsvendor.py │ │ ├── binpacking.py │ │ ├── vmpacking.py │ │ ├── tsp.py │ │ ├── vehicle_routing.py │ │ └── knapsack.py │ ├── supply_chain │ │ ├── __init__.py │ │ ├── inventory_management.py │ │ └── network_management.py │ ├── env_list.py │ └── __init__.py ├── __init__.py └── utils.py ├── .github └── workflows │ └── run_env_test.yml ├── pyproject.toml ├── LICENSE ├── setup.py ├── examples ├── ray_rllib_taxi_demo.py ├── ray_tests.py ├── rllib-validate-env.py ├── ray_rllib_knapsack.py ├── tf_orgym_examples.ipynb ├── ray_rllib_knapsack.ipynb ├── inv-management-quickstart.ipynb └── how-to-use-rl-to-improve-your-supply-chain.ipynb ├── tests ├── env_test.py └── rllib_test.py ├── .gitignore └── README.md /or_gym/version.py: -------------------------------------------------------------------------------- 1 | VERSION='0.5.0' -------------------------------------------------------------------------------- /or_gym/envs/finance/__init__.py: -------------------------------------------------------------------------------- 1 | from or_gym.envs.finance.portfolio_opt import PortfolioOptEnv 2 | -------------------------------------------------------------------------------- /or_gym/envs/registry.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import EnvRegistry 2 | 3 | registry = EnvRegistry() 4 | 5 | def register(id, **kwargs): 6 | return registry.register(id, **kwargs) 7 | 8 | def make(id, **kwargs): 9 | return registry.make(id, **kwargs) 10 | 11 | def spec(id): 12 | return registry.spec(id) -------------------------------------------------------------------------------- /or_gym/envs/classic_or/__init__.py: -------------------------------------------------------------------------------- 1 | from or_gym.envs.classic_or.knapsack import * 2 | from or_gym.envs.classic_or.binpacking import * 3 | from or_gym.envs.classic_or.vmpacking import * 4 | from or_gym.envs.classic_or.tsp import * 5 | from or_gym.envs.classic_or.vehicle_routing import VehicleRoutingEnv 6 | from or_gym.envs.classic_or.newsvendor import NewsvendorEnv -------------------------------------------------------------------------------- /or_gym/envs/supply_chain/__init__.py: -------------------------------------------------------------------------------- 1 | from or_gym.envs.supply_chain.inventory_management import InvManagementBacklogEnv 2 | from or_gym.envs.supply_chain.inventory_management import InvManagementLostSalesEnv 3 | from or_gym.envs.supply_chain.network_management import NetInvMgmtBacklogEnv 4 | from or_gym.envs.supply_chain.network_management import NetInvMgmtLostSalesEnv -------------------------------------------------------------------------------- /or_gym/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import warnings 4 | 5 | from gym import error 6 | from or_gym.version import VERSION as __version__ 7 | from or_gym.utils import * 8 | 9 | from gym.core import Env, Wrapper, ObservationWrapper, ActionWrapper, RewardWrapper 10 | from gym.envs import make, spec, register 11 | from or_gym.envs import classic_or, finance, supply_chain -------------------------------------------------------------------------------- /or_gym/envs/env_list.py: -------------------------------------------------------------------------------- 1 | ENV_LIST = 
['Newsvendor-v0', 2 | 'TSP-v0', 'TSP-v1', 3 | 'Knapsack-v0', 'Knapsack-v1', 'Knapsack-v2', 'Knapsack-v3', 4 | 'BinPacking-v0', 'BinPacking-v1', 'BinPacking-v2', 5 | 'BinPacking-v3', 'BinPacking-v4', 'BinPacking-v5', 6 | 'VMPacking-v0', 7 | 'InvManagement-v0', 'InvManagement-v1', 8 | 'NetworkManagement-v0', 'NetworkManagement-v1', 9 | 'PortfolioOpt-v0', 10 | 'VehicleRouting-v0'] -------------------------------------------------------------------------------- /.github/workflows/run_env_test.yml: -------------------------------------------------------------------------------- 1 | name: run-env-test 2 | on: [push] 3 | jobs: 4 | check-environments: 5 | runs-on: ubuntu-latest 6 | steps: 7 | - name: checkout repo content 8 | uses: actions/checkout@v3 9 | - name: setup python 10 | uses: actions/setup-python@v2 11 | with: 12 | python-version: 3.7 13 | - name: install or_gym 14 | run: pip install -e . 15 | - name: execute py test script 16 | run: | 17 | python tests/env_test.py 18 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "or-gym" 7 | version = "0.5.0" 8 | authors = [ 9 | { name="Christian Hubbs", email="christiandhubbs@gmail.com"}, 10 | { name="Owais Sarwar", email="owais.sarwar@gmail.com"}, 11 | { name="Hector Perez", email="hdperez@cmu.edu"} 12 | ] 13 | description = "OR-Gym: A set of environments for developing reinforcement learning agents for OR problems." 14 | readme = "README.md" 15 | license = { file="LICENSE" } 16 | requires-python = ">=3.7" 17 | dependencies = [ 18 | 'gym<=0.21.0', 19 | 'numpy>=1.16.1', 20 | 'pandas>=1.2', 21 | 'scipy>=1.0', 22 | 'matplotlib>=3.1', 23 | 'networkx>=2.3' 24 | ] 25 | classifiers = [ 26 | "Programming Language :: Python :: 3", 27 | "License :: OSI Approved :: MIT License", 28 | "Operating System :: OS Independent", 29 | ] 30 | 31 | [project.urls] 32 | "Homepage" = "https://github.com/hubbs5/or-gym" 33 | "Bug Tracker" = "https://github.com/hubbs5/or-gym/issues" -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Christian 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!usr/bin/env python 2 | 3 | from setuptools import setup, find_packages 4 | import sys 5 | import os 6 | 7 | sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'or_gym')) 8 | 9 | 10 | with open(os.path.join('or_gym', 'version.py')) as version_file: 11 | version = version_file.read().strip() 12 | 13 | VERSION = version.split("=")[-1].replace("'", "") 14 | 15 | # version = {} 16 | # with open("or_gym/version.py") as fp: 17 | # exec(fp.read(), version) 18 | # later on we use: version['__version__'] 19 | # from or_gym.version import VERSION 20 | 21 | setup(name='or-gym', 22 | version=VERSION, 23 | description='OR-Gym: A set of environments for developing reinforcement learning agents for OR problems.', 24 | author='Christian Hubbs, Hector Perez Parra, Owais Sarwar', 25 | license='MIT', 26 | url='https://github.com/hubbs5/or-gym', 27 | packages=find_packages(), 28 | install_requires=[ 29 | 'gym<=0.21.0', 30 | 'numpy>=1.16.1', 31 | 'pandas>=1.2', 32 | 'scipy>=1.0', 33 | 'matplotlib>=3.1', 34 | 'networkx>=2.3' 35 | ], 36 | zip_safe=False, 37 | python_requires='>=3.7', 38 | classifiers=[ 39 | 'Development Status :: 3 - Alpha', 40 | 'Intended Audience :: Developers', 41 | 'Programming Language :: Python :: 3.7', 42 | 'Programming Language :: Python :: 3.8', 43 | 'Programming Language :: Python :: 3.9', 44 | ] 45 | ) 46 | -------------------------------------------------------------------------------- /examples/ray_rllib_taxi_demo.py: -------------------------------------------------------------------------------- 1 | import ray 2 | from ray.rllib.agents.ppo import PPOTrainer 3 | 4 | ray.shutdown() 5 | ray.init(ignore_reinit_error=True) 6 | 7 | # Configure the algorithm. 8 | config = { 9 | # Environment (RLlib understands openAI gym registered strings). 10 | "env": "Taxi-v3", 11 | # Use 2 environment workers (aka "rollout workers") that parallelly 12 | # collect samples from their own environment clone(s). 13 | "num_workers": 2, 14 | # Change this to "framework: torch", if you are using PyTorch. 15 | # Also, use "framework: tf2" for tf2.x eager execution. 16 | "framework": "tf", 17 | # Tweak the default model provided automatically by RLlib, 18 | # given the environment's observation- and action spaces. 19 | "model": { 20 | "fcnet_hiddens": [64, 64], 21 | "fcnet_activation": "relu", 22 | }, 23 | # Set up a separate evaluation worker set for the 24 | # `trainer.evaluate()` call after training (see below). 25 | "evaluation_num_workers": 1, 26 | # Only for evaluation runs, render the env. 27 | "evaluation_config": { 28 | "render_env": True, 29 | } 30 | } 31 | 32 | # Create our RLlib Trainer. 33 | trainer = PPOTrainer(config=config) 34 | 35 | # Run it for n training iterations. A training iteration includes 36 | # parallel sample collection by the environment workers as well as 37 | # loss calculation on the collected batch and a model update. 38 | for _ in range(3): 39 | print(trainer.train()) 40 | 41 | # Evaluate the trained Trainer (and render each timestep to the shell's 42 | # output). 43 | trainer.evaluate() 44 | -------------------------------------------------------------------------------- /tests/env_test.py: -------------------------------------------------------------------------------- 1 | #!usr/bin/env python 2 | 3 | ''' 4 | Tests to ensure environments load and basic functionality 5 | is satisfied. 
6 | ''' 7 | 8 | import or_gym 9 | from or_gym.envs.env_list import ENV_LIST 10 | import traceback 11 | 12 | def pytest_generate_tests(metafunc): 13 | idlist = [] 14 | argvalues = [] 15 | for scenario in metafunc.cls.scenarios: 16 | idlist.append(scenario[0]) 17 | items = scenario[1].items() 18 | argnames = [x[0] for x in items] 19 | argvalues.append([x[1] for x in items]) 20 | metafunc.parametrize(argnames, argvalues, ids=idlist, scope="class") 21 | 22 | class TestEnv: 23 | scenarios = [(i, {'config': {'env_name': i}}) for i in ENV_LIST] 24 | 25 | def _build_env(self, env_name): 26 | env = or_gym.make(env_name) 27 | return env 28 | 29 | def test_make(self, config): 30 | # Ensures that environments are instantiated 31 | env_name = config['env_name'] 32 | try: 33 | _ = self._build_env(env_name) 34 | success = True 35 | except Exception as e: 36 | tb = e.__traceback__ 37 | success = False 38 | assert success, ''.join(traceback.format_tb(tb)) 39 | 40 | def test_episode(self, config): 41 | # Run 100 episodes and check observation space 42 | env_name = config['env_name'] 43 | EPISODES = 100 44 | env = self._build_env(env_name) 45 | for ep in range(EPISODES): 46 | state = env.reset() 47 | while True: 48 | assert env.observation_space.contains(state), \ 49 | f"State out of range of observation space: {state}" 50 | action = env.action_space.sample() 51 | state, reward, done, info = env.step(action) 52 | if done: 53 | break 54 | 55 | assert done -------------------------------------------------------------------------------- /examples/ray_tests.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Dec 29 21:51:45 2021 4 | 5 | @author: phili 6 | """ 7 | 8 | import ray 9 | import time 10 | 11 | # Start the ray core 12 | ray.init() 13 | 14 | # By adding the `@ray.remote` decorator, a regular Python function 15 | # becomes a Ray remote function. 16 | @ray.remote 17 | def f(x): 18 | return x * x 19 | 20 | # To invoke this remote function, use the `remote` method. 21 | # This will immediately return an object ref (a future) and then create 22 | # a task that will be executed on a worker process. 23 | futures = [f.remote(i) for i in range(4)] 24 | 25 | # The result can be retrieved with ``ray.get``. 26 | print(ray.get(futures)) # [0, 1, 4, 9] 27 | 28 | @ray.remote 29 | class Counter(object): 30 | def __init__(self): 31 | self.n = 0 32 | 33 | def increment(self): 34 | self.n += 1 35 | 36 | def read(self): 37 | return self.n 38 | 39 | counters = [Counter.remote() for i in range(4)] 40 | [c.increment.remote() for c in counters] 41 | futures = [c.read.remote() for c in counters] 42 | print(ray.get(futures)) # [1, 1, 1, 1] 43 | 44 | # Note the following behaviors: 45 | # The second task will not be executed until the first task has finished executing because the second task depends on the output of the first task. 46 | # If the two tasks are scheduled on different machines, the output of the first task (the value corresponding to obj_ref1/objRef1) will be sent over the network to the machine where the second task is scheduled. 
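# A minimal sketch of the dependency behavior described above. Note that `g`
# and the object refs below are illustrative additions, not part of the
# original script:
@ray.remote
def g(y):
    return y + 1

obj_ref1 = f.remote(2)         # first task; returns an object ref immediately
obj_ref2 = g.remote(obj_ref1)  # second task; Ray resolves obj_ref1 before g runs
print(ray.get(obj_ref2))       # 5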
47 | 48 | # MWE for tiny work parallelization 49 | def tiny_work(x): 50 | time.sleep(0.0001) # replace this is with work you need to do 51 | return x 52 | 53 | @ray.remote 54 | def mega_work(start, end): 55 | return [tiny_work(x) for x in range(start, end)] 56 | 57 | start = time.time() 58 | result_ids = [] 59 | [result_ids.append(mega_work.remote(x*1000, (x+1)*1000)) for x in range(100)] 60 | results = ray.get(result_ids) 61 | print("duration =", time.time() - start) 62 | 63 | # Close down at the end of the session 64 | ray.shutdown() -------------------------------------------------------------------------------- /tests/rllib_test.py: -------------------------------------------------------------------------------- 1 | #!usr/bin/env python 2 | 3 | ''' 4 | Tests to ensure environments are compatible with RLLib. 5 | Note RLLib is NOT a required package, but tests are included 6 | because it is very useful for RL work. 7 | ''' 8 | 9 | import or_gym 10 | from or_gym.utils import create_env 11 | from or_gym.envs.env_list import ENV_LIST 12 | import ray 13 | from ray import tune 14 | from ray.rllib.agents.ppo import PPOTrainer 15 | import traceback 16 | 17 | def pytest_generate_tests(metafunc): 18 | idlist = [] 19 | argvalues = [] 20 | for scenario in metafunc.cls.scenarios: 21 | idlist.append(scenario[0]) 22 | items = scenario[1].items() 23 | argnames = [x[0] for x in items] 24 | argvalues.append([x[1] for x in items]) 25 | metafunc.parametrize(argnames, argvalues, ids=idlist, scope="class") 26 | 27 | 28 | def register_env(env_name, env_config={}): 29 | env = create_env(env_name) 30 | tune.register_env(env_name, 31 | lambda env_name: env(env_name, env_config=env_config)) 32 | 33 | class TestEnv: 34 | scenarios = [(i, {"config": {"env_name": i}}) for i in ENV_LIST] 35 | 36 | def _build_env(self, env_name): 37 | env = or_gym.make(env_name) 38 | return env 39 | 40 | def _get_rl_config_dict(self, env_name, env_config={}): 41 | rl_config = dict( 42 | env=env_name, 43 | num_workers=2, 44 | env_config=env_config, 45 | model=dict( 46 | vf_share_layers=False, 47 | fcnet_activation='elu', 48 | fcnet_hiddens=[256, 256] 49 | ), 50 | lr=1e-5 51 | ) 52 | return rl_config 53 | 54 | def test_ray(self, config): 55 | env_name = config["env_name"] 56 | env = self._build_env(env_name) 57 | register_env(env_name) 58 | rl_config = self._get_rl_config_dict(env_name) 59 | ray.init(ignore_reinit_error=True) 60 | agent = PPOTrainer(env=env_name, config=rl_config) 61 | # Train 1 episode for testing 62 | try: 63 | res = agent.train() 64 | passed = True 65 | except: 66 | passed = False 67 | 68 | ray.shutdown() 69 | assert passed 70 | 71 | 72 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.gz 6 | *.csv 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | pip-wheel-metadata/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | .python-version 88 | 89 | # pipenv 90 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 91 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 92 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 93 | # install all needed dependencies. 94 | #Pipfile.lock 95 | 96 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 97 | __pypackages__/ 98 | 99 | # Celery stuff 100 | celerybeat-schedule 101 | celerybeat.pid 102 | 103 | # SageMath parsed files 104 | *.sage.py 105 | 106 | # Environments 107 | .env 108 | .venv 109 | env/ 110 | venv/ 111 | ENV/ 112 | env.bak/ 113 | venv.bak/ 114 | 115 | # Spyder project settings 116 | .spyderproject 117 | .spyproject 118 | 119 | # Rope project settings 120 | .ropeproject 121 | 122 | # mkdocs documentation 123 | /site 124 | 125 | # mypy 126 | .mypy_cache/ 127 | .dmypy.json 128 | dmypy.json 129 | 130 | # Pyre type checker 131 | .pyre/ 132 | 133 | # Output files 134 | nohup.out 135 | 136 | # .bak files 137 | *.bak 138 | 139 | # wandb output 140 | wandb/ 141 | 142 | notebooks/ -------------------------------------------------------------------------------- /or_gym/envs/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | 3 | # Knapsack Environments 4 | register(id='Knapsack-v0', 5 | entry_point='or_gym.envs.classic_or.knapsack:KnapsackEnv' 6 | ) 7 | 8 | register(id='Knapsack-v1', 9 | entry_point='or_gym.envs.classic_or.knapsack:BinaryKnapsackEnv' 10 | ) 11 | 12 | register(id='Knapsack-v2', 13 | entry_point='or_gym.envs.classic_or.knapsack:BoundedKnapsackEnv' 14 | ) 15 | 16 | register(id='Knapsack-v3', 17 | entry_point='or_gym.envs.classic_or.knapsack:OnlineKnapsackEnv' 18 | ) 19 | 20 | # Bin Packing Environments 21 | register(id='BinPacking-v0', 22 | entry_point='or_gym.envs.classic_or.binpacking:BinPackingEnv' 23 | ) 24 | 25 | register(id='BinPacking-v1', 26 | entry_point='or_gym.envs.classic_or.binpacking:BinPackingLW1' 27 | ) 28 | 29 | register(id='BinPacking-v2', 30 | entry_point='or_gym.envs.classic_or.binpacking:BinPackingPP0' 31 | ) 32 | 33 | register(id='BinPacking-v3', 34 | entry_point='or_gym.envs.classic_or.binpacking:BinPackingPP1' 35 | ) 36 | 37 | register(id='BinPacking-v4', 38 | entry_point='or_gym.envs.classic_or.binpacking:BinPackingBW0' 39 | ) 40 | 41 | register(id='BinPacking-v5', 42 | entry_point='or_gym.envs.classic_or.binpacking:BinPackingBW1' 43 | ) 44 | 45 | # Newsvendor Envs 46 | register(id='Newsvendor-v0', 47 | entry_point='or_gym.envs.classic_or.newsvendor:NewsvendorEnv' 48 | ) 49 | 50 | # Virtual Machine Packing Envs 51 
| register(id='VMPacking-v0', 52 | entry_point='or_gym.envs.classic_or.vmpacking:VMPackingEnv' 53 | ) 54 | 55 | register(id='VMPacking-v1', 56 | entry_point='or_gym.envs.classic_or.vmpacking:TempVMPackingEnv' 57 | ) 58 | 59 | # Vehicle Routing Envs 60 | register(id='VehicleRouting-v0', 61 | entry_point='or_gym.envs.classic_or.vehicle_routing:VehicleRoutingEnv' 62 | ) 63 | 64 | # TSP 65 | register(id='TSP-v0', 66 | entry_point='or_gym.envs.classic_or.tsp:TSPEnv' 67 | ) 68 | 69 | register(id='TSP-v1', 70 | entry_point='or_gym.envs.classic_or.tsp:TSPDistCost' 71 | ) 72 | 73 | # Inventory Management Envs 74 | register(id='InvManagement-v0', 75 | entry_point='or_gym.envs.supply_chain.inventory_management:InvManagementBacklogEnv' 76 | ) 77 | 78 | register(id='InvManagement-v1', 79 | entry_point='or_gym.envs.supply_chain.inventory_management:InvManagementLostSalesEnv' 80 | ) 81 | 82 | register(id='NetworkManagement-v0', 83 | entry_point='or_gym.envs.supply_chain.network_management:NetInvMgmtBacklogEnv' 84 | ) 85 | 86 | register(id='NetworkManagement-v1', 87 | entry_point='or_gym.envs.supply_chain.network_management:NetInvMgmtLostSalesEnv' 88 | ) 89 | 90 | # Asset Allocation Envs 91 | register(id='PortfolioOpt-v0', 92 | entry_point='or_gym.envs.finance.portfolio_opt:PortfolioOptEnv' 93 | ) 94 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # or-gym 2 | ## Environments for OR and RL Research 3 | 4 | This library contains environments for operations research problems that adhere to the OpenAI Gym API. The purpose is to bring reinforcement learning to the operations research community via accessible simulation environments featuring classic problems that are solved both with reinforcement learning and with traditional OR techniques. 5 | 6 | ## Installation 7 | 8 | This library requires Python 3.7+ in order to function. 9 | 10 | Installation is possible via `pip`: 11 | 12 | `$ pip install or-gym` 13 | 14 | Or, you can install directly from GitHub with: 15 | 16 | ``` 17 | git clone https://github.com/hubbs5/or-gym.git 18 | cd or-gym 19 | pip install -e . 20 | ``` 21 | ## Quickstart and Benchmarking Example 22 | 23 | See the IPython notebook `inv-management-quickstart.ipynb` in the `examples` folder for a quickstart example of training an agent in an OR-Gym environment and of using the environment to benchmark policies found by other algorithms. For the RL algorithm, Ray 1.0.0 is required. 24 | 25 | ## Citation 26 | ``` 27 | @misc{HubbsOR-Gym, 28 | author={Christian D. Hubbs and Hector D. Perez and Owais Sarwar and Nikolaos V. Sahinidis and Ignacio E. Grossmann and John M. Wassick}, 29 | title={OR-Gym: A Reinforcement Learning Library for Operations Research Problems}, 30 | year={2020}, 31 | Eprint={arXiv:2008.06319} 32 | } 33 | ``` 34 | 35 | ## Environments 36 | 37 | - `Knapsack-v0`: a small version of the classic unbounded knapsack problem with 200 items. 38 | - `Knapsack-v1`: binary (0-1) knapsack problem with 200 items. 39 | - `Knapsack-v2`: bounded knapsack problem with 200 items. 40 | - `Knapsack-v3`: stochastic, online knapsack problem. 41 | - `BinPacking-v0` through `BinPacking-v5`: online bin packing problem taken from [Balaji et al.](https://arxiv.org/abs/1911.10641). 42 | - `Newsvendor-v0`: multi-period newsvendor problem with lead times taken from [Balaji et al.](https://arxiv.org/abs/1911.10641).
43 | - `VMPacking-v0`: permanent, multi-dimensional virtual machine packing problem. 44 | - `VMPacking-v1`: temporary, multi-dimensional virtual machine packing problem. 45 | - `VehicleRouting-v0`: pick-up and delivery problem with delivery windows taken from [Balaji et al.](https://arxiv.org/abs/1911.10641). 46 | - `InvManagement-v0`: multi-echelon supply chain re-order problem with backlogs. 47 | - `InvManagement-v1`: multi-echelon supply chain re-order problem without backlog. 48 | - `NetworkManagement-v0`: multi-echelon supply chain network problem with backlogs from [Perez et al.](https://www.mdpi.com/2227-9717/9/1/102). 49 | - `NetworkManagement-v1`: multi-echelon supply chain network problem without backlogs from [Perez et al.](https://www.mdpi.com/2227-9717/9/1/102). 50 | - `PortfolioOpt-v0`: Multi-period asset allocation problem for managing investment decisions taken from [Dantzig and Infanger](https://apps.dtic.mil/dtic/tr/fulltext/u2/a242510.pdf). 51 | - `TSP-v0`: traveling salesman problem with bi-directional connections and uniform cost. 52 | - `TSP-v1`: traveling salesman problem with bi-directional connections. 53 | 54 | ## Resources 55 | 56 | Information on results and supporting models can be found [here](https://arxiv.org/abs/2008.06319). 57 | 58 | ## Examples 59 | 60 | - [Action Masking with RLlib using the Knapsack Environment](https://www.datahubbs.com/action-masking-with-rllib/) 61 | - [How to Use Deep Reinforcement Learning to Improve your Supply Chain](https://www.datahubbs.com/how-to-use-deep-reinforcement-learning-to-improve-your-supply-chain/) -------------------------------------------------------------------------------- /or_gym/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def assign_env_config(self, kwargs): 4 | for key, value in kwargs.items(): 5 | setattr(self, key, value) 6 | if hasattr(self, 'env_config'): 7 | for key, value in self.env_config.items(): 8 | # Check types based on default settings 9 | if hasattr(self, key): 10 | if type(getattr(self,key)) == np.ndarray: 11 | setattr(self, key, value) 12 | else: 13 | setattr(self, key, 14 | type(getattr(self, key))(value)) 15 | else: 16 | raise AttributeError(f"{self} has no attribute, {key}") 17 | 18 | # Get Ray to work with gym registry 19 | def create_env(config, *args, **kwargs): 20 | if type(config) == dict: 21 | env_name = config['env'] 22 | else: 23 | env_name = config 24 | 25 | if env_name.lower() == 'knapsack-v0': 26 | from or_gym.envs.classic_or.knapsack import KnapsackEnv as env 27 | elif env_name.lower() == 'knapsack-v1': 28 | from or_gym.envs.classic_or.knapsack import BinaryKnapsackEnv as env 29 | elif env_name.lower() == 'knapsack-v2': 30 | from or_gym.envs.classic_or.knapsack import BoundedKnapsackEnv as env 31 | elif env_name.lower() == 'knapsack-v3': 32 | from or_gym.envs.classic_or.knapsack import OnlineKnapsackEnv as env 33 | elif env_name.lower() == 'binpacking-v0': 34 | from or_gym.envs.classic_or.binpacking import BinPackingEnv as env 35 | elif env_name.lower() == 'binpacking-v1': 36 | from or_gym.envs.classic_or.binpacking import BinPackingLW1 as env 37 | elif env_name.lower() == 'binpacking-v2': 38 | from or_gym.envs.classic_or.binpacking import BinPackingPP0 as env 39 | elif env_name.lower() == 'binpacking-v3': 40 | from or_gym.envs.classic_or.binpacking import BinPackingPP1 as env 41 | elif env_name.lower() == 'binpacking-v4': 42 | from or_gym.envs.classic_or.binpacking import BinPackingBW0 as env 43 | 
elif env_name.lower() == 'binpacking-v5': 44 | from or_gym.envs.classic_or.binpacking import BinPackingBW1 as env 45 | elif env_name.lower() == 'vmpacking-v0': 46 | from or_gym.envs.classic_or.vmpacking import VMPackingEnv as env 47 | elif env_name.lower() == 'vmpacking-v1': 48 | from or_gym.envs.classic_or.vmpacking import TempVMPackingEnv as env 49 | elif env_name.lower() == 'portfolioopt-v0': 50 | from or_gym.envs.finance.portfolio_opt import PortfolioOptEnv as env 51 | elif env_name.lower() == 'tsp-v0': 52 | from or_gym.envs.classic_or.tsp import TSPEnv as env 53 | elif env_name.lower() == 'tsp-v1': 54 | from or_gym.envs.classic_or.tsp import TSPDistCost as env 55 | elif env_name.lower() == 'vehiclerouting-v0': 56 | from or_gym.envs.classic_or.vehicle_routing import VehicleRoutingEnv as env 57 | elif env_name.lower() == 'newsvendor-v0': 58 | from or_gym.envs.classic_or.newsvendor import NewsvendorEnv as env 59 | elif env_name.lower() == 'invmanagement-v0': 60 | from or_gym.envs.supply_chain.inventory_management import InvManagementBacklogEnv as env 61 | elif env_name.lower() == 'invmanagement-v1': 62 | from or_gym.envs.supply_chain.inventory_management import InvManagementLostSalesEnv as env 63 | elif env_name.lower() == 'networkmanagement-v0': 64 | from or_gym.envs.supply_chain.network_management import NetInvMgmtBacklogEnv as env 65 | elif env_name.lower() == 'networkmanagement-v1': 66 | from or_gym.envs.supply_chain.network_management import NetInvMgmtLostSalesEnv as env 67 | else: 68 | raise NotImplementedError('Environment {} not recognized.'.format(env_name)) 69 | return env 70 | -------------------------------------------------------------------------------- /or_gym/envs/classic_or/newsvendor.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Example taken from Balaji et al. 3 | Paper: https://arxiv.org/abs/1911.10641 4 | GitHub: https://github.com/awslabs/or-rl-benchmarks 5 | ''' 6 | import gym 7 | from gym import spaces 8 | import itertools 9 | import numpy as np 10 | from collections.abc import Iterable 11 | from or_gym.utils import assign_env_config 12 | 13 | class NewsvendorEnv(gym.Env): 14 | ''' 15 | Multi-Period Newsvendor with Lead Times 16 | 17 | The MPNV requires meeting stochastic demand by having sufficient 18 | inventory on hand to satisfy customers. The inventory orders are not 19 | instantaneous and have multi-period leadtimes. Additionally, there are 20 | costs associated with holding unsold inventory, however unsold inventory 21 | expires at the end of each period. 22 | 23 | Observation: 24 | Type: Box 25 | State Vector: S = (p, c, h, k, mu, x_l, x_l-1) 26 | p = price 27 | c = cost 28 | h = holding cost 29 | k = lost sales penalty 30 | mu = mean of demand distribution 31 | x_l = order quantities in the queue 32 | 33 | Actions: 34 | Type: Box 35 | Amount of product to order. 36 | 37 | Reward: 38 | Sales minus discounted purchase price, minus holding costs for 39 | unsold product or penalties associated with insufficient inventory. 40 | 41 | Initial State: 42 | Parameters p, c, h, k, and mu, with no inventory in the pipeline. 43 | 44 | Episode Termination: 45 | Termination occurs after the maximum number of time steps is reached 46 | (40 by default). 
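    A minimal usage sketch (assuming the default settings above; the
    observation vector then has lead_time + 5 = 10 entries):

        import or_gym
        env = or_gym.make('Newsvendor-v0')
        state = env.reset()
        state, reward, done, info = env.step(env.action_space.sample())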
47 | ''' 48 | def __init__(self, *args, **kwargs): 49 | self.lead_time = 5 50 | self.max_inventory = 4000 51 | self.max_order_quantity = 2000 52 | self.step_limit = 40 53 | self.p_max = 100 # Max sale price 54 | self.h_max = 5 # Max holding cost 55 | self.k_max = 10 # Max lost sales penalty 56 | self.mu_max = 200 # Max mean of the demand distribution 57 | self.gamma = 1 # Discount factor 58 | assign_env_config(self, kwargs) 59 | 60 | self.obs_dim = self.lead_time + 5 61 | self.observation_space = spaces.Box( 62 | low=np.zeros(self.obs_dim, dtype=np.float32), 63 | high=np.array( 64 | [self.p_max, self.p_max, self.h_max, self.k_max, self.mu_max] + 65 | [self.max_order_quantity] * self.lead_time), 66 | dtype=np.float32) 67 | self.action_space = spaces.Box( 68 | low=np.zeros(1), high=np.array([self.max_order_quantity]), 69 | dtype=np.float32) 70 | 71 | self.reset() 72 | 73 | def _STEP(self, action): 74 | done = False 75 | order_qty = max(0, # Ensure order > 0 76 | min(action, self.max_inventory - self.state[5:].sum())) # Cap inventory 77 | demand = np.random.poisson(self.mu) 78 | inventory = self.state[5:] 79 | if self.lead_time == 0: # No lead time -> instant fulfillment 80 | inv_on_hand = order_qty 81 | else: 82 | inv_on_hand = inventory[0] 83 | sales = min(inv_on_hand, demand) * self.price 84 | excess_inventory = max(0, inv_on_hand - demand) 85 | short_inventory = max(0, demand - inv_on_hand) 86 | purchase_cost = excess_inventory * self.cost * order_qty * \ 87 | self.gamma ** self.lead_time 88 | holding_cost = excess_inventory * self.h 89 | lost_sales_penalty = short_inventory * self.k 90 | reward = sales - purchase_cost - holding_cost - lost_sales_penalty 91 | 92 | # Update state, note inventory on hand expires at each time step 93 | new_inventory = np.zeros(self.lead_time) 94 | new_inventory[:-1] += inventory[1:] 95 | new_inventory[-1] += order_qty 96 | self.state = np.hstack([self.state[:5], new_inventory] , dtype=np.float32) 97 | 98 | self.step_count += 1 99 | if self.step_count >= self.step_limit: 100 | done = True 101 | if isinstance(reward, Iterable): 102 | # TODO: Sometimes reward is np.array with one entry 103 | reward = sum(reward) 104 | 105 | return self.state, reward, done, {} 106 | 107 | def _RESET(self): 108 | # Randomize costs 109 | self.price = max(1, np.random.rand() * self.p_max) 110 | self.cost = max(1, np.random.rand() * self.price) 111 | self.h = np.random.rand() * min(self.cost, self.h_max) 112 | self.k = np.random.rand() * self.k_max 113 | self.mu = np.random.rand() * self.mu_max 114 | self.state = np.zeros(self.obs_dim, dtype=np.float32) 115 | self.state[:5] = np.array([self.price, self.cost, self.h, 116 | self.k, self.mu]) 117 | 118 | self.step_count = 0 119 | 120 | return self.state 121 | 122 | def reset(self): 123 | return self._RESET() 124 | 125 | def step(self, action): 126 | return self._STEP(action) -------------------------------------------------------------------------------- /examples/rllib-validate-env.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | 4 | This python script outlines the _validate_env logic for Rllib (v.1.9.1) and 5 | can be used for debugginf issues in environment configuration. 
6 | 7 | Created on Thu Dec 30 16:04:12 2021 8 | 9 | @author: Philipp Willms 10 | """ 11 | import or_gym 12 | import gym 13 | import numpy as np 14 | 15 | 16 | # Configuration for gym environment 17 | env_config = {'N': 200, 18 | 'max_weight': 60, 19 | # 'item_weights': np.array([1, 12, 2, 1, 4]), 20 | # 'item_values': np.array([2, 4, 2, 1, 10]), 21 | 'mask': True} 22 | 23 | env_name = 'Knapsack-v0' 24 | env = or_gym.make('Knapsack-v0', env_config=env_config) 25 | 26 | 27 | if isinstance(env, gym.Env) : 28 | # Make sure the gym.Env has the two space attributes properly set. 29 | assert hasattr(env, "observation_space") and hasattr( 30 | env, "action_space") 31 | # Get a dummy observation by resetting the env. 32 | dummy_obs = env.reset() 33 | # Convert lists to np.ndarrays. 34 | if type(dummy_obs) is list and isinstance(env.observation_space, 35 | gym.spaces.Box): 36 | dummy_obs = np.array(dummy_obs) 37 | print("Dummy obs after np array conversion: ") 38 | print(dummy_obs) 39 | # Ignore float32/float64 diffs. 40 | if isinstance(env.observation_space, gym.spaces.Box) and \ 41 | env.observation_space.dtype != dummy_obs.dtype: 42 | dummy_obs = dummy_obs.astype(env.observation_space.dtype) 43 | print("Dummy obs after ignore float diffs ") 44 | print(dummy_obs) 45 | # Check, if observation is ok (part of the observation space). If not, 46 | # error. 47 | 48 | determined_obs_space = env.observation_space 49 | 50 | # original code from box.py 51 | # def contains(self, x): 52 | # if not isinstance(x, np.ndarray): 53 | # logger.warn("Casting input x to numpy array.") 54 | # x = np.asarray(x, dtype=self.dtype) 55 | 56 | # return ( 57 | # np.can_cast(x.dtype, self.dtype) 58 | # and x.shape == self.shape 59 | # and np.all(x >= self.low) 60 | # and np.all(x <= self.high) 61 | # ) 62 | 63 | # action_mask check 64 | x = determined_obs_space["action_mask"] 65 | y = dummy_obs["action_mask"] 66 | print(x) 67 | print(y) 68 | print(np.can_cast(x.dtype, y.dtype)) 69 | print(x.shape == y.shape) 70 | print(np.all(y >= x.low)) 71 | print(np.all(y <= x.high)) 72 | 73 | # avail actions check 74 | x = determined_obs_space["avail_actions"] 75 | y = dummy_obs["avail_actions"] 76 | print(x) 77 | print(y) 78 | print(np.can_cast(x.dtype, y.dtype)) 79 | print(x.shape == y.shape) 80 | print(np.all(y >= x.low)) 81 | print(np.all(y <= x.high)) 82 | 83 | # state check 84 | x = determined_obs_space["state"] 85 | y = dummy_obs["state"] 86 | print(x) 87 | print(y) 88 | print(np.can_cast(x.dtype, y.dtype)) 89 | print(x.shape == y.shape) 90 | print(np.all(y >= x.low)) 91 | print(np.all(y <= x.high)) 92 | 93 | # original code from dict.py 94 | # def contains(self, x): 95 | # if not isinstance(x, dict) or len(x) != len(self.spaces): 96 | # return False 97 | # for k, space in self.spaces.items(): 98 | # if k not in x: 99 | # return False 100 | # if not space.contains(x[k]): 101 | # return False 102 | # return True 103 | 104 | x = determined_obs_space 105 | y = dummy_obs 106 | print("Dict check") 107 | print(isinstance(y, dict)) 108 | print("Length observation space: " + str(len(x.spaces))) 109 | print("Length dummy observation: " + str(len(y))) 110 | print(len(y) == len(x.spaces)) 111 | for k, space in x.spaces.items(): 112 | print(k) 113 | print(space) 114 | if k not in y: 115 | #return False 116 | print("Element not found in dummy observation") 117 | print(k) 118 | if not space.contains(y[k]): 119 | print("Contains check failed") 120 | print(y[k]) 121 | # return False 122 | 123 | # If there is a hard nut to crack with specific 
observation state, use the following 124 | x = determined_obs_space["state"] 125 | mal_state = y[k] 126 | print(np.can_cast(x.dtype, mal_state.dtype)) 127 | print(x.shape == mal_state.shape) 128 | print(np.all(mal_state >= x.low)) 129 | print(np.all(mal_state <= x.high)) 130 | print(isinstance(y[k], np.ndarray)) 131 | print(x.contains(mal_state)) 132 | 133 | # Copied from rollout_worker.py 134 | if not env.observation_space.contains(dummy_obs): 135 | print( 136 | f"Env's `observation_space` {env.observation_space} does not " 137 | f"contain returned observation after a reset ({dummy_obs})!") 138 | else: 139 | print("All checks passed") -------------------------------------------------------------------------------- /examples/ray_rllib_knapsack.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Spyder Editor 4 | 5 | This is a temporary script file. 6 | """ 7 | import ray 8 | from ray.rllib.agents.ppo import PPOTrainer 9 | from ray import tune 10 | from ray.rllib.models import ModelCatalog 11 | from ray.rllib.models.tf.tf_modelv2 import TFModelV2 12 | from ray.rllib.models.tf.fcnet import FullyConnectedNetwork 13 | from ray.rllib.utils import try_import_tf 14 | from gym import spaces 15 | import or_gym 16 | from or_gym.utils import create_env 17 | import numpy as np 18 | 19 | tf_api, tf_original, tf_version = try_import_tf(error = True) 20 | 21 | 22 | class KP0ActionMaskModel(TFModelV2): 23 | 24 | def __init__(self, obs_space, action_space, num_outputs, 25 | model_config, name, true_obs_shape=(11,), 26 | action_embed_size=5, *args, **kwargs): 27 | 28 | # true_obs_shape is going to match the size of the state. 29 | # If we stick with our reduced KP, that will be a vector with 11 entries. 30 | # The other value we need to provide is the action_embed_size, which is going to be the size of our action space (5) 31 | 32 | super(KP0ActionMaskModel, self).__init__(obs_space, 33 | action_space, num_outputs, model_config, name, 34 | *args, **kwargs) 35 | 36 | self.action_embed_model = FullyConnectedNetwork( 37 | spaces.Box(0, 1, shape=true_obs_shape), 38 | action_space, action_embed_size, 39 | model_config, name + "_action_embedding") 40 | self.register_variables(self.action_embed_model.variables()) 41 | 42 | def forward(self, input_dict, state, seq_lens): 43 | 44 | # The actual masking takes place in the forward method where we unpack the mask, actions, and state from 45 | # the observation dictionary provided by our environment. The state yields our action embeddings which gets 46 | # combined with our mask to provide logits with the smallest value we can provide. 47 | # This will get passed to a softmax output which will reduce the probability of selecting these actions to 0, 48 | # effectively blocking the agent from ever taking these illegal actions. 
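        # A small worked illustration of the masking arithmetic described
        # above (the numbers are hypothetical):
        #   action_mask          = [1, 0, 1]
        #   log(action_mask)     = [0, -inf, 0] -> clipped to [0, tf.float32.min, 0]
        #   action_logits + mask = [l0, ~-3.4e38, l2]
        # so the downstream softmax assigns the masked action a probability of
        # effectively zero.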
49 | 50 | avail_actions = input_dict["obs"]["avail_actions"] 51 | action_mask = input_dict["obs"]["action_mask"] 52 | action_embedding, _ = self.action_embed_model({ 53 | "obs": input_dict["obs"]["state"]}) 54 | # intent_vector = tf.expand_dims(action_embedding, 1) 55 | # action_logits = tf.reduce_sum(avail_actions * intent_vector, axis=1) 56 | # inf_mask = tf.maximum(tf.log(action_mask), tf.float32.min) 57 | 58 | intent_vector = tf_api.expand_dims(action_embedding, 1) 59 | action_logits = tf_api.reduce_sum(avail_actions * intent_vector, axis=1) 60 | inf_mask = tf_api.maximum(tf_api.log(action_mask), tf_api.float32.min) 61 | 62 | return action_logits + inf_mask, state 63 | 64 | def value_function(self): 65 | return self.action_embed_model.value_function() 66 | 67 | # Configuration for gym environment 68 | # Not to be used for Online Knapsack 69 | env_config = {'N': 5, 70 | 'max_weight': 15, 71 | 'item_weights': np.array([1, 12, 2, 1, 4]), 72 | 'item_values': np.array([2, 4, 2, 1, 10]), 73 | 'mask': True} 74 | 75 | env_name = 'Knapsack-v0' 76 | env = or_gym.make('Knapsack-v0', env_config=env_config) 77 | 78 | print("Max weight capacity:\t{}kg".format(env.max_weight)) 79 | print("Number of items:\t{}".format(env.N)) 80 | 81 | # Register the model for Rllib usage 82 | ModelCatalog.register_custom_model('kp_mask', KP0ActionMaskModel) 83 | # Register the environment 84 | # ATTENTION: Tune needs the base class, not an instance of the environment like we get from or_gym.make(env_name) to work with. So we need to pass this to register_env using a lambda function as shown below. 85 | env = create_env(env_name) 86 | tune.register_env(env_name, lambda env_name: env(env_name, env_config=env_config)) 87 | 88 | trainer_config = { 89 | "model": { 90 | "custom_model": "kp_mask" # Here we must use the custom model name taken in register process before 91 | }, 92 | "env_config": env_config # env config from (or_)gym 93 | } 94 | 95 | # ray.shutdown() 96 | # Ensure that a ray instance is running, e.g. 
via http://127.0.0.1:8265/#/ 97 | # ray.init(address="auto", ignore_reinit_error = True, local_mode=True) 98 | ray.init() 99 | trainer = PPOTrainer(env='Knapsack-v0', config=trainer_config) 100 | 101 | # The real action masking logic: disable the agent to take action 0 102 | env = trainer.env_creator('Knapsack-v0') 103 | state = env.state 104 | state['action_mask'][0] = 0 105 | 106 | # Train an agent for 1000 states and check if action 0 was not taken ever 107 | actions = np.array([trainer.compute_single_action(state) for i in range(10000)]) 108 | print(any(actions==0)) 109 | 110 | # Use tune for hyperparameter tuning 111 | tune_config = { 112 | 'env': 'Knapsack-v0' 113 | } 114 | stop = { 115 | 'timesteps_total': 10000 116 | } 117 | results = tune.run( 118 | 'PPO', # Specify the algorithm to train 119 | metric="score", 120 | config=tune_config, 121 | stop=stop 122 | ) 123 | 124 | ray.shutdown() 125 | -------------------------------------------------------------------------------- /examples/tf_orgym_examples.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "2244831d", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import gym\n", 11 | "import or_gym\n", 12 | "import numpy as np\n", 13 | "import random\n", 14 | "from tensorflow.keras.models import Sequential\n", 15 | "from tensorflow.keras.layers import Input, Dense, Flatten, Reshape\n", 16 | "from tensorflow.keras.optimizers import Adam\n", 17 | "from rl.agents import DQNAgent\n", 18 | "from rl.policy import BoltzmannQPolicy\n", 19 | "from rl.memory import SequentialMemory" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "id": "cc47ced5", 25 | "metadata": {}, 26 | "source": [ 27 | "# Binary Knapsack" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "id": "118ab57c", 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "env_config = {'N': 5,\n", 38 | " 'max_weight': 15,\n", 39 | " 'item_weights': np.array([1, 12, 2, 1, 4]),\n", 40 | " 'item_values': np.array([2, 4, 2, 1, 10]),\n", 41 | " 'mask': False}\n", 42 | "env = or_gym.make('Knapsack-v0', env_config=env_config) \n", 43 | "initial_state = env.reset()" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "id": "e05f8c49", 49 | "metadata": {}, 50 | "source": [ 51 | "The state variable must be read as the following:\n", 52 | "\n", 53 | " Observation:\n", 54 | " Type: Tuple, Discrete\n", 55 | " 0: list of item weights\n", 56 | " 1: list of item values\n", 57 | " 2: maximum weight of the knapsack\n", 58 | " 3: current weight in knapsack\n", 59 | "\n", 60 | " Actions:\n", 61 | " Type: Discrete\n", 62 | " 0: Place item 0 into knapsack\n", 63 | " 1: Place item 1 into knapsack\n", 64 | " 2: ...\n", 65 | "\n", 66 | " Reward:\n", 67 | " Value of item successfully placed into knapsack or 0 if the item\n", 68 | " doesn't fit, at which point the episode ends.\n", 69 | "\n", 70 | " Starting State:\n", 71 | " Lists of available items and empty knapsack.\n", 72 | "\n", 73 | " Episode Termination:\n", 74 | " Full knapsack or selection that puts the knapsack over the limit." 
75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "id": "70567316", 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "actions = env.action_space.n\n", 85 | "states = env.observation_space\n", 86 | "states.shape" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "id": "25fd9311", 92 | "metadata": {}, 93 | "source": [ 94 | "Simulate random item selection for 10 episodes" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "id": "14c8bf18", 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "env.reset()\n", 105 | "episode = 0\n", 106 | "done = False\n", 107 | "while not done :\n", 108 | " episode += 1\n", 109 | " print(\"Episode: \" + str(episode))\n", 110 | " action = np.random.randint(actions)\n", 111 | " print(\"Take element number: \" + str(action))\n", 112 | " next_state, reward, done, info = env.step(action)\n", 113 | " print(\"Reward: \" + str(reward))\n", 114 | " print(next_state)\n", 115 | " print(env.render())" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "id": "7375ad77", 121 | "metadata": {}, 122 | "source": [ 123 | "As we can see in the detailed print out of the observation space, it is just the last index value which changes from episode to episode. This index is equal to the current total weight of the knapsack. The observation space from the environment gives no indication on the total value collected, which is instead added by the render() function." 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "id": "4260a48b", 129 | "metadata": {}, 130 | "source": [ 131 | "# Keras model for the knapsack decision environment\n" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "id": "91d5ba79", 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "model = Sequential() \n", 142 | "model.add(Dense(24, activation='relu', input_shape=states.shape))\n", 143 | "model.add(Flatten())\n", 144 | "model.add(Dense(actions, activation='linear'))\n", 145 | "model.summary()" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "id": "34e842c5", 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "model.layers[0].get_input_shape_at(0) # get the input shape of desired layer" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "id": "52f787d2", 161 | "metadata": {}, 162 | "source": [ 163 | "# Agent training with Keras RL" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "id": "b3e0907c", 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "policy = BoltzmannQPolicy()\n", 174 | "memory = SequentialMemory(limit=50000, window_length=1)\n", 175 | "dqn = DQNAgent(model=model, memory=memory, policy=policy, \n", 176 | " nb_actions=actions, nb_steps_warmup=10, target_model_update=1e-2)\n", 177 | "dqn.compile(Adam(lr=1e-3), metrics=['mae'])\n", 178 | "dqn.fit(env, nb_steps=50000, visualize=False, verbose=1)" 179 | ] 180 | } 181 | ], 182 | "metadata": { 183 | "kernelspec": { 184 | "display_name": "Python 3 (ipykernel)", 185 | "language": "python", 186 | "name": "python3" 187 | }, 188 | "language_info": { 189 | "codemirror_mode": { 190 | "name": "ipython", 191 | "version": 3 192 | }, 193 | "file_extension": ".py", 194 | "mimetype": "text/x-python", 195 | "name": "python", 196 | "nbconvert_exporter": "python", 197 | "pygments_lexer": "ipython3", 198 | "version": "3.8.12" 199 | } 200 | }, 201 | "nbformat": 4, 202 | 
"nbformat_minor": 5 203 | } 204 | -------------------------------------------------------------------------------- /or_gym/envs/finance/portfolio_opt.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces, logger 4 | from gym.utils import seeding 5 | from or_gym.utils import assign_env_config 6 | from copy import copy 7 | 8 | class PortfolioOptEnv(gym.Env): 9 | ''' 10 | Portfolio Optimization Problem 11 | 12 | Instance: Multi-Period Asset Allocation Problem, Dantzing & Infager, 1993 13 | 14 | The Portfolio Optimization (PO) Problem is a problem that seeks to optimize 15 | the distribution of assets in a financial portfolio to with respect to a desired 16 | financial metric (e.g. maximal return, minimum risk, etc.). 17 | 18 | In this particular instance by Dantzing & Infager, the optimizer begins with a 19 | quantity of cash and has the opportunity to purchase or sell 3 other assets in each 20 | of 10 different investement periods. Each transaction incurs a cost and prices of 21 | the 3 assets are subject to change over time. Cash value is consant (price = 1). 22 | The objective is to maximize the amount of wealth (i.e. the sum total of asset values) 23 | at the end of the total investment horizon. 24 | 25 | The episodes proceed by the optimizer deciding whether to buy or sell each asset 26 | in each time period. The episode ends when either all 10 periods have passed or 27 | if the amount of any given asset held becomes negative. 28 | 29 | Observation: 30 | Type: Box(9) 31 | "asset prices" (idx 0, 1, 2, 3): array of asset prices [cash, asset1, asset2, asset3] 32 | "asset quantities" (idx 4, 5, 6, 7): array of asset quantities [cash, asset1, asset2, asset3] 33 | "total wealth" (idx 8): current total wealth (sum of price*quantity for each asset) 34 | 35 | 36 | Actions: 37 | Type: Box (3) 38 | "asset 1 transaction amount" (idx 0): x in [-2000, 2000]: Buy (positive) or sell (negative) x shares of asset 1; 39 | "asset 2 transaction amount" (idx 1): x in [-2000, 2000]: Buy (positive) or sell (negative) x shares of asset 2; 40 | "asset 3 transaction amount" (idx 2): x in [-2000, 2000]: Buy (positive) or sell (negative) x shares of asset 3; 41 | 42 | Reward: 43 | Change in total wealth from previous period or [-max(asset price of all assets) * maximum transaction size] 44 | if an asset quantity becomes negative, at which 45 | point the episode ends. 46 | 47 | Starting State: 48 | Starting amount of cash and wealth and prices. 49 | 50 | Episode Termination: 51 | Negative asset quantity or traversal of investment horizon. 
52 | ''' 53 | def __init__(self, *args, **kwargs): 54 | self.num_assets = 3 # Number of assets 55 | self.initial_cash = 100 # Starting amount of capital 56 | self.step_limit = 10 # Investment horizon 57 | 58 | self.cash = copy(self.initial_cash) 59 | 60 | #Transaction costs proportional to amount bought 61 | self.buy_cost = np.array([0.045, 0.025, 0.035]) 62 | self.sell_cost = np.array([0.04, 0.02, 0.03]) 63 | # self.step_limit = 10 64 | # assign_env_config(self, kwargs) 65 | # self.asset_price_means = asset_price_means.T 66 | # # self.asset_price_means = (np.random.randint(10, 50, self.num_assets) \ 67 | # # * np.ones((self.step_limit, self.num_assets))).T 68 | # self.asset_price_var = np.ones(self.asset_price_means.shape) 69 | 70 | # Prices of assets have a mean value in every period and vary according to a Gaussian distribution 71 | asset1mean = np.array([1.25, 2, 4, 5, 3, 2, 3, 6, 9, 7]).reshape(1, -1) # Up and down all the way 72 | asset2mean = np.array([5, 3, 2, 2, 1.25, 4, 5, 6, 7, 8]).reshape(1, -1) # Down intially then up 73 | asset3mean = np.array([3, 5, 6, 9, 10, 8, 4, 2, 1.25, 4]).reshape(1, -1) # Up initially then down 74 | self.asset_price_means = np.vstack([asset1mean, asset2mean, asset3mean]) 75 | self.asset_price_var = np.ones((self.asset_price_means.shape)) * 0.45 76 | 77 | # Cash on hand, asset prices, num of shares, portfolio value 78 | self.obs_length = 1 + 2 * self.num_assets 79 | 80 | self.observation_space = spaces.Box(-20000, 20000, shape=(self.obs_length,), dtype=np.float32) 81 | self.action_space = spaces.Box(-2000, 2000, shape=(self.num_assets,), dtype=np.float32) 82 | 83 | self.seed() 84 | self.reset() 85 | 86 | 87 | def _RESET(self): 88 | self.step_count = 0 89 | self.asset_prices = self._generate_asset_prices() 90 | self.holdings = np.zeros(self.num_assets) 91 | self.cash = copy(self.initial_cash) 92 | self.state = np.hstack([ 93 | self.initial_cash, 94 | self.asset_prices[:, self.step_count], 95 | self.holdings], 96 | dtype=np.float32) 97 | return self.state 98 | 99 | def _generate_asset_prices(self): 100 | asset_prices = np.array([self.np_random.normal(mu, sig) for mu, sig in 101 | zip(self.asset_price_means.flatten(), self.asset_price_var.flatten())] 102 | ).reshape(self.asset_price_means.shape) 103 | # Zero out negative asset prices and all following prices - implies 104 | # equity is bankrupt and worthless. 
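        # For example (hypothetical draws), a generated price row of
        #   [5.0, -1.0, 3.0, 2.0]
        # becomes
        #   [5.0, 0.0, 0.0, 0.0]
        # because every price from the first negative draw onward is zeroed out.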
105 | zero_vals = np.vstack(np.where(asset_prices<0)) 106 | cols = np.unique(zero_vals[0]) 107 | for c in cols: 108 | first_zero = zero_vals[1][np.where(zero_vals[0]==c)[0].min()] 109 | asset_prices[c,first_zero:] = 0 110 | return asset_prices 111 | 112 | def _STEP(self, action): 113 | 114 | assert self.action_space.contains(action) 115 | 116 | asset_prices = self.asset_prices[:, self.step_count].copy() 117 | 118 | for idx, a in enumerate(action): 119 | if a == 0: 120 | continue 121 | # Sell a shares of asset 122 | elif a < 0: 123 | a = np.abs(a) 124 | if a > self.holdings[idx]: 125 | a = self.holdings[idx] 126 | self.holdings[idx] -= a 127 | self.cash += asset_prices[idx] * a * (1 - self.sell_cost[idx]) 128 | # Buy a shares of asset 129 | elif a > 0: 130 | purchase_cost = asset_prices[idx] * a * (1 + self.buy_cost[idx]) 131 | if self.cash < purchase_cost: 132 | a = np.floor(self.cash / ( 133 | asset_prices[idx] * (1 + self.buy_cost[idx]))) 134 | purchase_cost = asset_prices[idx] * a * (1 + self.buy_cost[idx]) 135 | self.holdings[idx] += a 136 | self.cash -= purchase_cost 137 | 138 | # Return total portfolio value at the end of the horizon as reward 139 | if self.step_count + 1 == self.step_limit: 140 | reward = np.dot(asset_prices, self.holdings) + self.cash 141 | else: 142 | reward = 0 143 | self.step_count += 1 144 | 145 | # Finish if 10 periods have passed - end of investment horizon 146 | if self.step_count >= self.step_limit: 147 | done = True 148 | else: 149 | self._update_state() 150 | done = False 151 | 152 | return self.state, reward, done, {} 153 | 154 | def _update_state(self): 155 | self.state = np.hstack([ 156 | self.cash, 157 | self.asset_prices[:, self.step_count], 158 | self.holdings 159 | ], dtype=np.float32) 160 | 161 | def step(self, action): 162 | return self._STEP(action) 163 | 164 | def reset(self): 165 | return self._RESET() 166 | 167 | def seed(self, seed=None): 168 | self.np_random, seed = seeding.np_random(seed) 169 | return [seed] 170 | -------------------------------------------------------------------------------- /or_gym/envs/classic_or/binpacking.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Example taken from Balaji et al. 3 | Paper: https://arxiv.org/abs/1911.10641 4 | GitHub: https://github.com/awslabs/or-rl-benchmarks 5 | ''' 6 | import numpy as np 7 | import gym 8 | from gym import spaces, logger 9 | from gym.utils import seeding 10 | from or_gym.utils import assign_env_config 11 | import copy 12 | 13 | BIG_NEG_REWARD = -100 14 | BIG_POS_REWARD = 10 15 | 16 | class BinPackingEnv(gym.Env): 17 | ''' 18 | Small Bin Packing with Bounded Waste 19 | Env Registration: BinPacking-v0 20 | 21 | The Bin Packing Problem (BPP) is a combinatorial optimization problem in which 22 | items of different sizes must be packed into bins of fixed capacity so as to 23 | minimize wasted space (and thereby the number of bins used). This version is 24 | online, meaning each item is randomly presented to the algorithm one at a time, 25 | at which point the algorithm must either place it into an open bin at a given 26 | fill level or open a new bin for it. After a fixed number of items have been 27 | shown, the episode terminates; it ends early if an invalid placement is made, 28 | e.g. one that overflows a bin or targets a non-existent bin.
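    For example (with the default bin capacity of 9): opening a new bin for a
    size-2 item leaves 9 - 2 = 7 units of empty space and earns a reward of -7,
    whereas placing a size-2 item into an existing bin already filled to level 7
    closes that bin exactly and earns a reward of +2.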
29 | 30 | Observation: 31 | If mask == False: 32 | Type: Discrete 33 | 0 - bin_capacity: Count of bins at a given level h 34 | -1: Current item size 35 | if mask == True: 36 | Type: Dict 37 | 'state': vector of bins where 0 to bin capacity is the count of 38 | bins at that load level h and the last entry is the current 39 | item size. 40 | 'action_mask': binary vector where 0 indicates infeasible 41 | actions and 1 feasible actions. 42 | 'avail_actions': vector of values to be combined with mask. 43 | 44 | Actions: 45 | Type: Discrete 46 | 0: Open a new bin and place item into bin 47 | 1+: Attempt to place item into bin at the corresponding level 48 | 49 | Reward: 50 | Negative of the waste, which is the difference between the current 51 | size and excess space of the bin. 52 | 53 | Starting State: 54 | No available bins and random starting item 55 | 56 | Episode Termination: 57 | When invalid action is selected (e.g. attempt to place item in non-existent 58 | bin), bin limits are exceeded, or step limit is reached. 59 | ''' 60 | def __init__(self, *args, **kwargs): 61 | self.bin_capacity = 9 62 | self.item_sizes = [2, 3] 63 | self.item_probs = [0.8, 0.2] 64 | self.step_count = 0 65 | self.step_limit = 100 66 | self.mask = False 67 | assign_env_config(self, kwargs) 68 | self._build_obs_space() 69 | self._check_settings() 70 | self.seed() 71 | self.state = self.reset() 72 | 73 | def _STEP(self, action): 74 | done = False 75 | if action >= self.bin_capacity: 76 | raise ValueError('{} is an invalid action. Must be between {} and {}'.format( 77 | action, 0, self.bin_capacity)) 78 | elif action > (self.bin_capacity - self.item_size): 79 | # Bin overflows 80 | reward = BIG_NEG_REWARD - self.waste 81 | done = True 82 | elif action == 0: 83 | # Create new bin 84 | self.bin_levels[self.item_size] += 1 85 | self.waste = self.bin_capacity - self.item_size 86 | reward = -1 * self.waste 87 | elif self.bin_levels[action] == 0: 88 | # Can't insert item into non-existent bin 89 | reward = BIG_NEG_REWARD - self.waste 90 | done = True 91 | else: 92 | if action + self.item_size == self.bin_capacity: 93 | self.num_full_bins += 1 94 | else: 95 | self.bin_levels[action + self.item_size] += 1 96 | self.waste = -self.item_size 97 | reward = -1 * self.waste 98 | 99 | self.bin_levels[action] -= 1 100 | 101 | self.total_reward += reward 102 | 103 | self.step_count += 1 104 | 105 | if self.step_count >= self.step_limit: 106 | done = True 107 | 108 | self.state = self._update_state() 109 | 110 | return self.state, reward, done, {} 111 | 112 | def _update_state(self): 113 | self.item_size = self.get_item() 114 | state = np.array(self.bin_levels + [self.item_size], dtype=np.uint32) 115 | if self.mask: 116 | state_dict = { 117 | 'state': state, 118 | 'avail_actions': np.ones(self.bin_capacity, dtype=np.uint8)} 119 | # Mask actions for closed bins 120 | mask = np.ones(self.bin_capacity, dtype=np.uint8) * np.array(state[:-1]) 121 | # Mask actions where packing would exceed capacity 122 | overflow = self.bin_capacity - self.item_size 123 | mask[overflow+1:] = 0 124 | # Ensure open new bin is available 125 | mask[0] = 1 126 | state_dict['action_mask'] = mask 127 | return state_dict 128 | else: 129 | return state 130 | 131 | def get_item(self): 132 | return np.random.choice(self.item_sizes, p=self.item_probs) 133 | 134 | def sample_action(self): 135 | return self.action_space.sample() 136 | 137 | def _RESET(self): 138 | self.current_weight = 0 139 | self.step_count = 0 140 | self.num_full_bins = 0 141 | self.total_reward = 0 
142 | self.waste = 0 143 | self.bin_levels = [0] * self.bin_capacity 144 | self.item_size = self.get_item() 145 | self.state = self._update_state() 146 | return self.state 147 | 148 | def _build_obs_space(self): 149 | if self.mask: 150 | self.observation_space = spaces.Dict({ 151 | 'action_mask': spaces.Box(0, 1, 152 | shape=(self.bin_capacity,), 153 | dtype=np.uint8), 154 | 'avail_actions': spaces.Box(0, 1, 155 | shape=(self.bin_capacity,), 156 | dtype=np.uint8), 157 | 'state': spaces.Box( 158 | low=np.array([0] * (1 + self.bin_capacity)), 159 | high=np.array([self.step_limit] * self.bin_capacity + 160 | [max(self.item_sizes)]), 161 | dtype=np.uint32) 162 | }) 163 | else: 164 | self.observation_space = spaces.Box( 165 | low=np.array([0] * (1 + self.bin_capacity)), 166 | high=np.array([self.step_limit] * self.bin_capacity + 167 | [max(self.item_sizes)]), 168 | dtype=np.uint32) 169 | 170 | self.action_space = spaces.Discrete(self.bin_capacity) 171 | 172 | def _check_settings(self): 173 | # Ensure setting sizes and probs are correct at initialization 174 | assert sum(self.item_probs) == 1, 'Item probabilities do not sum to 1.' 175 | assert len(self.item_probs) == len(self.item_sizes), \ 176 | 'Dimension mismatch between item probabilities' + \ 177 | ' ({}) and sizes ({})'.format( 178 | len(self.item_probs), len(self.item_sizes)) 179 | 180 | def reset(self): 181 | return self._RESET() 182 | 183 | def step(self, action): 184 | return self._STEP(action) 185 | 186 | class BinPackingLW1(BinPackingEnv): 187 | ''' 188 | Large Bin Packing Probem with Bounded Waste 189 | Env Registration: BinPacking-v1 190 | ''' 191 | def __init__(self, *args, **kwargs): 192 | super().__init__() 193 | self.bin_capacity = 100 194 | self.item_probs = [0.14, 0.1, 0.06, 0.13, 0.11, 0.13, 0.03, 0.11, 0.19] 195 | self.item_sizes = np.arange(1, 10) 196 | self.step_limit = 1000 197 | assign_env_config(self, kwargs) 198 | self._build_obs_space() 199 | self._check_settings() 200 | self.seed() 201 | self.state = self.reset() 202 | 203 | class BinPackingPP0(BinPackingEnv): 204 | ''' 205 | Small Perfectly Packable Bin Packing with Linear Waste 206 | Env Registration: BinPacking-v2 207 | ''' 208 | def __init__(self, *args, **kwargs): 209 | super().__init__() 210 | self.item_probs = [0.75, 0.25] 211 | assign_env_config(self, kwargs) 212 | self._build_obs_space() 213 | self._check_settings() 214 | self.seed() 215 | self.state = self.reset() 216 | 217 | class BinPackingPP1(BinPackingPP0): 218 | ''' 219 | Large Bin Packing Probem with Bounded Waste 220 | Env Registration: BinPacking-v3 221 | ''' 222 | def __init__(self, *args, **kwargs): 223 | super().__init__() 224 | self.bin_capacity = 100 225 | self.item_probs = [0.06, 0.11, 0.11, 0.22, 0, 0.11, 0.06, 0, 0.33] 226 | self.item_sizes = np.arange(1, 10) 227 | self.step_limit = 1000 228 | assign_env_config(self, kwargs) 229 | self._build_obs_space() 230 | self._check_settings() 231 | self.seed() 232 | self.state = self.reset() 233 | 234 | class BinPackingBW0(BinPackingEnv): 235 | ''' 236 | Small Perfectly Packable Bin Packing Problem with Bounded Waste 237 | Env Registration: BinPacking-v4 238 | ''' 239 | def __init__(self, *args, **kwargs): 240 | super().__init__() 241 | self.item_probs = [0.5, 0.5] 242 | assign_env_config(self, kwargs) 243 | self._build_obs_space() 244 | self._check_settings() 245 | self.seed() 246 | self.state = self.reset() 247 | 248 | class BinPackingBW1(BinPackingBW0): 249 | ''' 250 | Large Perfectly Packable Bin Packing Problem with Bounded Waste 251 | Env 
Registration: BinPacking-v5 252 | ''' 253 | def __init__(self, *args, **kwargs): 254 | super().__init__() 255 | self.bin_capacity = 100 256 | self.item_probs = [0, 0, 0, 1/3, 0, 0, 0, 0, 2/3] 257 | self.item_sizes = np.arange(1, 10) 258 | self.step_limit = 1000 259 | assign_env_config(self, kwargs) 260 | self._build_obs_space() 261 | self._check_settings() 262 | self.seed() 263 | self.state = self.reset() -------------------------------------------------------------------------------- /or_gym/envs/classic_or/vmpacking.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces, logger 4 | from gym.utils import seeding 5 | from or_gym.utils import assign_env_config 6 | import copy 7 | 8 | class VMPackingEnv(gym.Env): 9 | ''' 10 | Online VM Packing Problem 11 | 12 | The VM Packing Problem (VMPP) is a combinatorial optimization problem which 13 | requires the user to select from a series of physical machines (PM's) to 14 | send a virtual machine process to. Each VM process is characterized by 15 | two values, the memory and compute of the process. These are normalized 16 | by the PM capacities to range between 0-1. 17 | 18 | Observation: 19 | Type: Tuple, Discrete 20 | [0][:, 0]: Binary indicator for open PM's 21 | [0][:, 1]: CPU load of PM's 22 | [0][:, 2]: Memory load of PM's 23 | [1][0]: Current CPU demand 24 | [1][1]: Current memory demand 25 | 26 | Actions: 27 | Type: Discrete 28 | Integer of PM number to send VM to that PM 29 | 30 | Reward: 31 | Negative of the waste, which is the difference between the current 32 | size and excess space on the PM. 33 | 34 | Starting State: 35 | No open PM's and random starting item 36 | 37 | Episode Termination: 38 | When invalid action is selected, attempt to overload VM, or step 39 | limit is reached. 
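    Example (an illustrative sketch only, relying on the defaults set in
    __init__ below; because mask=True by default, the observation is a dict
    with "state", "action_mask", and "avail_actions" entries):

        import numpy as np
        from or_gym.envs.classic_or.vmpacking import VMPackingEnv

        env = VMPackingEnv()
        state = env.reset()
        # Choose any PM the mask marks as feasible for the current request
        action = int(np.argmax(state["action_mask"]))
        state, reward, done, info = env.step(action)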
40 | ''' 41 | def __init__(self, *args, **kwargs): 42 | self.cpu_capacity = 1 43 | self.mem_capacity = 1 44 | self.t_interval = 20 45 | self.tol = 1e-5 46 | self.step_limit = int(60 * 24 / self.t_interval) 47 | self.n_pms = 50 48 | self.load_idx = np.array([1, 2]) 49 | self.seed = 0 50 | self.mask = True 51 | assign_env_config(self, kwargs) 52 | self.action_space = spaces.Discrete(self.n_pms) 53 | 54 | if self.mask: 55 | self.observation_space = spaces.Dict({ 56 | "action_mask": spaces.Box(0, 1, shape=(self.n_pms,), dtype=np.uint8), 57 | "avail_actions": spaces.Box(0, 1, shape=(self.n_pms,), dtype=np.uint8), 58 | "state": spaces.Box(0, 1, shape=(self.n_pms+1, 3), dtype=np.float32) 59 | }) 60 | else: 61 | self.observation_space = spaces.Box(0, 1, shape=(self.n_pms+1, 3), dtype=np.float32) 62 | self.reset() 63 | 64 | def _RESET(self): 65 | self.demand = self.generate_demand() 66 | self.current_step = 0 67 | self.state = { 68 | "action_mask": np.ones(self.n_pms, dtype=np.uint8), 69 | "avail_actions": np.ones(self.n_pms, dtype=np.uint8), 70 | "state": np.vstack([ 71 | np.zeros((self.n_pms, 3)), 72 | self.demand[self.current_step]], 73 | dtype=np.float32) 74 | } 75 | self.assignment = {} 76 | return self.state 77 | 78 | def _STEP(self, action): 79 | done = False 80 | pm_state = self.state["state"][:-1] 81 | demand = self.state["state"][-1, 1:] 82 | 83 | if action < 0 or action >= self.n_pms: 84 | raise ValueError("Invalid action: {}".format(action)) 85 | 86 | elif any(pm_state[action, 1:] + demand > 1 + self.tol): 87 | # Demand doesn't fit into PM 88 | reward = -1000 89 | done = True 90 | else: 91 | if pm_state[action, 0] == 0: 92 | # Open PM if closed 93 | pm_state[action, 0] = 1 94 | pm_state[action, self.load_idx] += demand 95 | reward = np.sum(pm_state[:, 0] * (pm_state[:,1:].sum(axis=1) - 2)) 96 | self.assignment[self.current_step] = action 97 | 98 | self.current_step += 1 99 | if self.current_step >= self.step_limit: 100 | done = True 101 | self.update_state(pm_state) 102 | return self.state, reward, done, {} 103 | 104 | def update_state(self, pm_state): 105 | # Make action selection impossible if the PM would exceed capacity 106 | step = self.current_step if self.current_step < self.step_limit else self.step_limit-1 107 | data_center = np.vstack([pm_state, self.demand[step]], dtype=np.float32) 108 | data_center = np.where(data_center>1,1,data_center) # Fix rounding errors 109 | self.state["state"] = data_center 110 | self.state["action_mask"] = np.ones(self.n_pms, dtype=np.uint8) 111 | self.state["avail_actions"] = np.ones(self.n_pms, dtype=np.uint8) 112 | if self.mask: 113 | action_mask = (pm_state[:, 1:] + self.demand[step, 1:]) <= 1 114 | self.state["action_mask"] = (action_mask.sum(axis=1)==2).astype(np.uint8) 115 | 116 | def sample_action(self): 117 | return self.action_space.sample() 118 | 119 | def generate_demand(self): 120 | n = self.step_limit 121 | # From Azure data 122 | mem_probs = np.array([0.12 , 0.165, 0.328, 0.287, 0.064, 0.036]) 123 | mem_bins = np.array([0.02857143, 0.05714286, 0.11428571, 0.45714286, 0.91428571, 124 | 1.]) # Normalized bin sizes 125 | mu_cpu = 16.08 126 | sigma_cpu = 1.26 127 | cpu_demand = np.random.normal(loc=mu_cpu, scale=sigma_cpu, size=n) 128 | cpu_demand = np.where(cpu_demand<=0, mu_cpu, cpu_demand) # Ensure demand isn't negative 129 | mem_demand = np.random.choice(mem_bins, p=mem_probs, size=n) 130 | return np.vstack([np.arange(n)/n, cpu_demand/100, mem_demand]).T 131 | 132 | def step(self, action): 133 | return self._STEP(action) 134 | 135 | 
def reset(self): 136 | return self._RESET() 137 | 138 | class TempVMPackingEnv(VMPackingEnv): 139 | ''' 140 | Online Temporary VM Packing Problem 141 | 142 | The VM Packing Problem (VMPP) is a combinatorial optimization problem which 143 | requires the user to select from a series of physical machines (PM's) to 144 | send a virtual machine process to. Each VM process is characterized by 145 | two values, the memory and compute of the process. These are normalized 146 | by the PM capacities to range between 0-1. 147 | 148 | Observation: 149 | Type: Tuple, Discrete 150 | [0][:, 0]: Binary indicator for open PM's 151 | [0][:, 1]: CPU load of PM's 152 | [0][:, 2]: Memory load of PM's 153 | [1][0]: Current CPU demand 154 | [1][1]: Current memory demand 155 | 156 | Actions: 157 | Type: Discrete 158 | Integer of PM number to send VM to that PM 159 | 160 | Reward: 161 | Negative of the waste, which is the difference between the current 162 | size and excess space on the PM. 163 | 164 | Starting State: 165 | No open PM's and random starting item 166 | 167 | Episode Termination: 168 | When invalid action is selected, attempt to overload VM, or step 169 | limit is reached. 170 | ''' 171 | def __init__(self, *args, **kwargs): 172 | super().__init__() 173 | self.state = self.reset() 174 | 175 | def step(self, action): 176 | done = False 177 | pm_state = self.state["state"][:-1] 178 | demand = self.state["state"][-1, 1:] 179 | 180 | if action < 0 or action >= self.n_pms: 181 | raise ValueError("Invalid action: {}".format(action)) 182 | 183 | elif any(pm_state[action, 1:] + demand > 1 + self.tol): 184 | # Demand doesn't fit into PM 185 | reward = -1000 186 | done = True 187 | else: 188 | if pm_state[action, 0] == 0: 189 | # Open PM if closed 190 | pm_state[action, 0] = 1 191 | pm_state[action, self.load_idx] += demand 192 | reward = np.sum(pm_state[:, 0] * (pm_state[:,1:].sum(axis=1) - 2)) 193 | self.assignment[self.current_step] = action 194 | 195 | # Remove processes 196 | if self.current_step in self.durations.values(): 197 | for process in self.durations.keys(): 198 | # Remove process from PM 199 | if self.durations[process] == self.current_step: 200 | pm = self.assignment[process] # Find PM where process was assigned 201 | pm_state[pm, self.load_idx] -= self.demand[process] 202 | # Shut down PM's if state is 0 203 | if pm_state[pm, self.load_idx].sum() == 0: 204 | pm_state[pm, 0] = 0 205 | 206 | self.current_step += 1 207 | if self.current_step >= self.step_limit: 208 | done = True 209 | self.update_state(pm_state) 210 | return self.state, reward, done, {} 211 | 212 | def update_state(self, pm_state): 213 | # Make action selection impossible if the PM would exceed capacity 214 | step = self.current_step if self.current_step < self.step_limit else self.step_limit-1 215 | data_center = np.vstack([pm_state, self.demand[step]]) 216 | data_center = np.where(data_center>1,1,data_center) # Fix rounding errors 217 | self.state["state"] = data_center 218 | self.state["action_mask"] = np.ones(self.n_pms) 219 | self.state["avail_actions"] = np.ones(self.n_pms) 220 | if self.mask: 221 | action_mask = (pm_state[:, 1:] + self.demand[step, 1:]) <= 1 222 | self.state["action_mask"] = (action_mask.sum(axis=1)==2).astype(int) 223 | 224 | def _RESET(self): 225 | self.current_step = 0 226 | self.assignment = {} 227 | self.demand = self.generate_demand() 228 | self.durations = generate_durations(self.demand) 229 | self.state = (np.zeros((self.n_pms, 3)), self.demand[0]) 230 | return self.state 231 | 232 | def step(self, 
action):
233 |         return self._STEP(action)
234 | 
235 |     def reset(self):
236 |         return self._RESET()
237 | 
238 | def generate_durations(demand):
239 |     # duration_params = np.array([ 6.53563303e-02, 5.16222242e+01, 4.05028032e+06, -4.04960880e+06])
240 |     return {i: np.random.randint(low=i+1, high=len(demand)+1)
241 |         for i, j in enumerate(demand)}
242 | 
243 | def gaussian_model(params, x):
244 |     return params[2] * np.exp(-0.5*((x - params[0]) / params[1]) ** 2) + params[3]
--------------------------------------------------------------------------------
/or_gym/envs/classic_or/tsp.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import gym
3 | from gym import spaces
4 | from or_gym import utils
5 | from copy import copy, deepcopy
6 | import matplotlib.pyplot as plt
7 | 
8 | class TSPEnv(gym.Env):
9 |     '''
10 |     Bi-directional connections and uniform cost
11 | 
12 |     This version of the TSP uses a sparse graph with uniform cost.
13 |     The goal is to minimize the cost to traverse all of the nodes in the
14 |     network. All connections are bi-directional meaning if a connection
15 |     between nodes n and m exists, then the agent can move in either direction.
16 |     The network is randomly generated with N nodes when the environment is
17 |     initialized using or_gym.make().
18 | 
19 |     TSP-v0 allows repeat visits to nodes with no additional penalty beyond
20 |     the nominal movement cost.
21 | 
22 |     Observation:
23 |         Type: Box; element 0 is the current node, elements 1+ are the flattened connection matrix (1: connection to an unvisited node, -1: connection to a visited node, 0: no connection)
24 | 
25 |     Actions:
26 |         Type: Discrete
27 |         0: move to node 0
28 |         1: move to node 1
29 |         2: ...
30 | 
31 |     Action Masking (optional):
32 |         Masks non-existent connections, otherwise a large penalty is imposed
33 |         on the agent.
34 | 
35 |     Reward:
36 |         Cost of moving from node to node or large negative penalty for
37 |         attempting to move to a node via a non-existent connection.
38 | 
39 |     Starting State:
40 |         Random node
41 | 
42 |     Episode Termination:
43 |         All nodes have been visited or the maximum number of steps (2N)
44 |         have been reached.
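    Example (an illustrative sketch only, using the defaults assigned in
    __init__ below; with mask=False the observation is a flat array):

        from or_gym.envs.classic_or.tsp import TSPEnv

        env = TSPEnv()               # N=50 nodes, sparse random graph
        state = env.reset()
        # Valid actions are the neighbours of the current node; any other
        # choice is penalised with invalid_action_cost (-100)
        action = int(env.node_dict[env.current_node][0])
        state, reward, done, info = env.step(action)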
45 | ''' 46 | def __init__(self, *args, **kwargs): 47 | self.N = 50 48 | self.move_cost = -1 49 | self.invalid_action_cost = -100 50 | self.mask = False 51 | utils.assign_env_config(self, kwargs) 52 | 53 | self.nodes = np.arange(self.N) 54 | self.step_limit = 2*self.N 55 | self.obs_dim = 1+self.N**2 56 | obs_space = spaces.Box(-1, self.N, shape=(self.obs_dim,), dtype=np.int32) 57 | if self.mask: 58 | self.observation_space = spaces.Dict({ 59 | "action_mask": spaces.Box(0, 1, shape=(self.N,), dtype=np.int8), 60 | "avail_actions": spaces.Box(0, 1, shape=(self.N,), dtype=np.int8), 61 | "state": obs_space 62 | }) 63 | else: 64 | self.observation_space = obs_space 65 | self.action_space = spaces.Discrete(self.N) 66 | 67 | self.reset() 68 | 69 | def _STEP(self, action): 70 | done = False 71 | connections = self.node_dict[self.current_node] 72 | # Invalid action 73 | if action not in connections: 74 | reward = self.invalid_action_cost 75 | # Move to new node 76 | else: 77 | self.current_node = action 78 | reward = self.move_cost 79 | self.visit_log[self.current_node] += 1 80 | 81 | self.state = self._update_state() 82 | self.step_count += 1 83 | # See if all nodes have been visited 84 | unique_visits = sum([1 if v > 0 else 0 85 | for v in self.visit_log.values()]) 86 | if unique_visits >= self.N: 87 | done = True 88 | reward += 1000 89 | if self.step_count >= self.step_limit: 90 | done = True 91 | 92 | return self.state, reward, done, {} 93 | 94 | def _RESET(self): 95 | self.step_count = 0 96 | self._generate_connections() 97 | self.current_node = np.random.choice(self.nodes) 98 | self.visit_log = {n: 0 for n in self.nodes} 99 | self.visit_log[self.current_node] += 1 100 | 101 | self.state = self._update_state() 102 | return self.state 103 | 104 | def _update_state(self): 105 | node_connections = self.adjacency_matrix.copy() 106 | # Set value to 1 for existing, un-visited nodes 107 | # Set value to -1 for existing, visited nodes 108 | # Set value to 0 if connection doesn't exist 109 | visited = np.array([bool(min(v, 1)) 110 | for v in self.visit_log.values()]) 111 | node_connections[:, visited] = -1 112 | node_connections[np.where(self.adjacency_matrix==0)] = 0 113 | 114 | connections = node_connections.flatten().astype(int) 115 | obs = np.hstack([self.current_node, connections], dtype=np.int32) 116 | if self.mask: 117 | mask = node_connections[self.current_node] 118 | # mask = np.array([1 if c==1 and v==0 else 0 119 | # for c, v in zip(cons_from_node, self.visit_log.values())]) 120 | state = { 121 | "action_mask": mask, 122 | "avail_actions": np.ones(self.N, dtype=np.uint8), 123 | "state": obs, 124 | } 125 | else: 126 | state = obs.copy() 127 | 128 | return state 129 | 130 | def _generate_connections(self): 131 | node_dict = {} 132 | for n in range(self.N): 133 | connections = np.random.randint(2, self.N - 1) 134 | node_dict[n] = np.sort( 135 | np.random.choice(self.nodes[np.where(self.nodes!=n)], 136 | size=connections, replace=False)) 137 | # Get unique, bi-directional connections 138 | for k, v in node_dict.items(): 139 | for k1, v1 in node_dict.items(): 140 | if k == k1: 141 | continue 142 | if k in v1 and k1 not in v: 143 | v = np.append(v, k1) 144 | 145 | node_dict[k] = np.sort(v.copy()) 146 | self.node_dict = deepcopy(node_dict) 147 | self._generate_adjacency_matrix() 148 | 149 | def _generate_adjacency_matrix(self): 150 | self.adjacency_matrix = np.zeros((self.N, self.N)) 151 | for k, v in self.node_dict.items(): 152 | self.adjacency_matrix[k][v] += 1 153 | 
self.adjacency_matrix.astype(int) 154 | 155 | def _generate_coordinates(self): 156 | n = np.linspace(0, 2*np.pi, self.N+1) 157 | x = np.cos(n) 158 | y = np.sin(n) 159 | return np.vstack([x, y]) 160 | 161 | def _get_node_distance(self, N0, N1): 162 | return np.sqrt(np.power(N0[0] - N1[0], 2) + np.power(N0[1] - N1[1], 2)) 163 | 164 | def plot_network(self, offset=(0.02, 0.02)): 165 | coords = self._generate_coordinates() 166 | fig, ax = plt.subplots(figsize=(12,8)) 167 | ax.scatter(coords[0], coords[1], s=40) 168 | for n, c in self.node_dict.items(): 169 | for k in c: 170 | line = np.vstack([coords[:, n], coords[:, k]]) 171 | dis = self._get_node_distance(line[0], line[1]) 172 | # dis = np.sqrt(np.power(line[0, 0] - line[1, 0], 2) + 173 | # np.power(line[0, 1] - line[1, 1], 2)) 174 | ax.plot(line[:,0], line[:,1], c='g', zorder=-1) 175 | # ax.arrow(line[0, 0], line[0, 1], line[1, 0], line[1, 1]) 176 | ax.annotate(r"$N_{:d}$".format(n), xy=(line[0]+offset), zorder=2) 177 | ax.xaxis.set_visible(False) 178 | ax.yaxis.set_visible(False) 179 | plt.show() 180 | 181 | def step(self, action): 182 | return self._STEP(action) 183 | 184 | def reset(self): 185 | return self._RESET() 186 | 187 | class TSPDistCost(TSPEnv): 188 | ''' 189 | Fully connected network with distance-based cost. 190 | 191 | This environment enables travel between all nodes in the network and 192 | incurs cost based on the Euclidean distance between nodes. The goal is to 193 | minimize the cost to traverse all of the nodes in the network exactly 194 | once. The agent incurs a large penalty and ends the episode if it moves to 195 | a node more than once. All connections are bi-directional meaning if a 196 | connection between nodes n and m exist, then the agent can move in either 197 | direction. The network is randomly generated with N nodes when the 198 | environment is initialized using or_gym.make(). 199 | 200 | Observation: 201 | Type: Box 202 | 0: Current Node 203 | 1: 0 or 1 if node 0 has been visited or not 204 | 2: 0 or 1 if node 1 has been visited or not 205 | 3: ... 206 | 207 | Actions: 208 | Type: Discrete 209 | 0: move to node 0 210 | 1: move to node 1 211 | 2: ... 212 | 213 | Action Masking (optional): 214 | Masks visited nodes. 215 | 216 | Reward: 217 | Cost of moving from node to node. 218 | 219 | Starting State: 220 | Random node 221 | 222 | Episode Termination: 223 | All nodes have been visited or a node has been visited again. 
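    Example (an illustrative sketch only, not part of the library: a greedy
    nearest-neighbour baseline built from the distance_matrix and visit_log
    attributes defined below):

        import numpy as np
        from or_gym.envs.classic_or.tsp import TSPDistCost

        env = TSPDistCost()                  # N=50 fully connected nodes
        state = env.reset()
        done, total_cost = False, 0.0
        while not done:
            # Move to the closest node that has not been visited yet
            dists = env.distance_matrix[env.current_node].copy()
            dists[env.visit_log > 0] = np.inf
            action = int(np.argmin(dists))
            state, reward, done, _ = env.step(action)
            total_cost += reward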
224 | ''' 225 | def __init__(self, *args, **kwargs): 226 | self.N = 50 227 | self.invalid_action_cost = -100 228 | self.mask = False 229 | utils.assign_env_config(self, kwargs) 230 | self.nodes = np.arange(self.N) 231 | self.coords = self._generate_coordinates() 232 | self.distance_matrix = self._get_distance_matrix() 233 | 234 | self.obs_dim = 1+self.N 235 | obs_space = spaces.Box(-1, self.N, shape=(self.obs_dim,), dtype=np.int32) 236 | if self.mask: 237 | self.observation_space = spaces.Dict({ 238 | "action_mask": spaces.Box(0, 1, shape=(self.N,), dtype=np.int8), 239 | "avail_actions": spaces.Box(0, 1, shape=(self.N,), dtype=np.int8), 240 | "state": obs_space 241 | }) 242 | else: 243 | self.observation_space = obs_space 244 | 245 | self.action_space = spaces.Discrete(self.N) 246 | 247 | self.reset() 248 | 249 | def _STEP(self, action): 250 | done = False 251 | if self.visit_log[action] > 0: 252 | # Node already visited 253 | reward = self.invalid_action_cost 254 | done = True 255 | else: 256 | reward = self.distance_matrix[self.current_node, action] 257 | self.current_node = action 258 | self.visit_log[self.current_node] = 1 259 | 260 | self.state = self._update_state() 261 | # See if all nodes have been visited 262 | unique_visits = self.visit_log.sum() 263 | if unique_visits == self.N: 264 | done = True 265 | 266 | return self.state, reward, done, {} 267 | 268 | def _RESET(self): 269 | self.step_count = 0 270 | self.current_node = np.random.choice(self.nodes) 271 | self.visit_log = np.zeros(self.N) 272 | self.visit_log[self.current_node] += 1 273 | 274 | self.state = self._update_state() 275 | return self.state 276 | 277 | def _generate_coordinates(self): 278 | return np.vstack([np.random.rand(self.N), np.random.rand(self.N)]) 279 | 280 | def _get_distance_matrix(self): 281 | # Distance matrix 282 | distance_matrix = np.zeros((self.N, self.N)) 283 | for i in range(self.N): 284 | # Take advantage of symmetrical matrix 285 | for j in range(self.N): 286 | if j <= i: 287 | continue 288 | d = self._get_node_distance(self.coords[:, i], self.coords[:, j]) 289 | distance_matrix[i, j] += d 290 | 291 | distance_matrix += distance_matrix.T 292 | return distance_matrix 293 | 294 | def _update_state(self): 295 | mask = np.where(self.visit_log==0, 0 , 1) 296 | obs = np.hstack([self.current_node, mask]) 297 | if self.mask: 298 | state = { 299 | "avail_actions": np.ones(self.N), 300 | "action_mask": mask, 301 | "state": obs 302 | } 303 | else: 304 | state = obs.copy() 305 | return state 306 | 307 | def step(self, action): 308 | return self._STEP(action) 309 | 310 | def reset(self): 311 | return self._RESET() 312 | -------------------------------------------------------------------------------- /examples/ray_rllib_knapsack.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "973f2613", 7 | "metadata": { 8 | "scrolled": true 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "import ray\n", 13 | "from ray.rllib import agents\n", 14 | "from ray import tune\n", 15 | "from ray.rllib.models import ModelCatalog\n", 16 | "from ray.rllib.models.tf.tf_modelv2 import TFModelV2\n", 17 | "from ray.rllib.models.tf.fcnet import FullyConnectedNetwork\n", 18 | "from ray.rllib.utils import try_import_tf\n", 19 | "from gym import spaces\n", 20 | "import or_gym\n", 21 | "from or_gym.utils import create_env\n", 22 | "import numpy as np\n", 23 | "import matplotlib.pyplot as plt\n", 24 | "import 
pandas as pd" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "id": "4201d91c", 30 | "metadata": {}, 31 | "source": [ 32 | "# Prepare Tensforflow and ray" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "id": "2ac8ec32", 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "# Establish Tensorflow API conncetion\n", 43 | "tf_api, tf_original, tf_version = try_import_tf(error = True) \n", 44 | "# Disable callback synch on Windows\n", 45 | "TUNE_DISABLE_AUTO_CALLBACK_SYNCER=1" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "id": "6acd05db", 51 | "metadata": {}, 52 | "source": [ 53 | "# Knapsack environment with action masking" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "id": "15bc4f2b", 59 | "metadata": {}, 60 | "source": [ 61 | "Class definition: customized Tensorflow-2-model for OR-Gym knapsack envrionemnt with action masking" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "id": "a2b629ed", 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "class KP0ActionMaskModel(TFModelV2):\n", 72 | " \n", 73 | " def __init__(self, obs_space, action_space, num_outputs,\n", 74 | " model_config, name, true_obs_shape=(11,),\n", 75 | " action_embed_size=5, *args, **kwargs):\n", 76 | " \n", 77 | " # true_obs_shape is going to match the size of the state. \n", 78 | " # If we stick with our reduced KP, that will be a vector with 11 entries. \n", 79 | " # The other value we need to provide is the action_embed_size, which is going to be the size of our action space (5)\n", 80 | " \n", 81 | " super(KP0ActionMaskModel, self).__init__(obs_space,\n", 82 | " action_space, num_outputs, model_config, name, \n", 83 | " *args, **kwargs)\n", 84 | " \n", 85 | " self.action_embed_model = FullyConnectedNetwork(\n", 86 | " spaces.Box(0, 1, shape=true_obs_shape), \n", 87 | " action_space, action_embed_size,\n", 88 | " model_config, name + \"_action_embedding\")\n", 89 | " self.register_variables(self.action_embed_model.variables())\n", 90 | " \n", 91 | " def forward(self, input_dict, state, seq_lens):\n", 92 | " \n", 93 | " # The actual masking takes place in the forward method where we unpack the mask, actions, and state from \n", 94 | " # the observation dictionary provided by our environment. The state yields our action embeddings which gets \n", 95 | " # combined with our mask to provide logits with the smallest value we can provide. 
\n", 96 | " # This will get passed to a softmax output which will reduce the probability of selecting these actions to 0, \n", 97 | " # effectively blocking the agent from ever taking these illegal actions.\n", 98 | " \n", 99 | " avail_actions = input_dict[\"obs\"][\"avail_actions\"]\n", 100 | " action_mask = input_dict[\"obs\"][\"action_mask\"]\n", 101 | " action_embedding, _ = self.action_embed_model({\n", 102 | " \"obs\": input_dict[\"obs\"][\"state\"]})\n", 103 | " intent_vector = tf_api.expand_dims(action_embedding, 1)\n", 104 | " action_logits = tf_api.reduce_sum(avail_actions * intent_vector, axis=1)\n", 105 | " inf_mask = tf_api.maximum(tf_api.log(action_mask), tf_api.float32.min)\n", 106 | " return action_logits + inf_mask, state\n", 107 | " \n", 108 | " def value_function(self):\n", 109 | " return self.action_embed_model.value_function()" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "id": "4246a847", 115 | "metadata": {}, 116 | "source": [ 117 | "Environment creation" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "id": "9d15617a", 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "# Configuration for gym environment\n", 128 | "env_config = {'N': 5,\n", 129 | " 'max_weight': 15,\n", 130 | " 'item_weights': np.array([1, 12, 2, 1, 4]),\n", 131 | " 'item_values': np.array([2, 4, 2, 1, 10]),\n", 132 | " 'mask': True}\n", 133 | " \n", 134 | "env_name = 'Knapsack-v0'\n", 135 | "env = or_gym.make('Knapsack-v0', env_config=env_config)\n", 136 | " \n", 137 | "print(\"Max weight capacity:\\t{}kg\".format(env.max_weight))\n", 138 | "print(\"Number of items:\\t{}\".format(env.N))" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "id": "f3710cae", 144 | "metadata": {}, 145 | "source": [ 146 | "Create Rllib trainable instance" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "id": "b78cf89e", 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "# Register the model for Rllib usage\n", 157 | "ModelCatalog.register_custom_model('kp_mask', KP0ActionMaskModel)\n", 158 | "# Register the environment, so that we have a Trainable instance later\n", 159 | "# ATTENTION: Tune needs the base class, not an instance of the environment like we get from or_gym.make(env_name) to work with. 
So we need to pass this to register_env using a lambda function as shown below.\n", 160 | "env = create_env(env_name)\n", 161 | "tune.register_env(env_name, lambda env_name: env(env_name, env_config=env_config))\n", 162 | "\n", 163 | "trainer_config = {\n", 164 | " \"model\": {\n", 165 | " \"custom_model\": \"kp_mask\" # Here we must use the custom model name taken in register process before\n", 166 | " },\n", 167 | " \"env_config\": env_config, # env config from (or_)gym\n", 168 | " #\"framework\" : \"tfe\" # tip by rllib to enable TensorFlow eager exection\n", 169 | " }\n", 170 | "\n", 171 | "# ray.shutdown() maybe necessary in case of blocking instance\n", 172 | "ray.init( ignore_reinit_error = True )\n", 173 | "trainer = agents.ppo.PPOTrainer(env='Knapsack-v0', config=trainer_config)" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "id": "65e87dff", 179 | "metadata": {}, 180 | "source": [ 181 | "Train the agent\n", 182 | "\n" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "id": "2da600b2", 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "env = trainer.env_creator('Knapsack-v0')\n", 193 | "state = env.state\n", 194 | "\n", 195 | "# Use the action masking to disable the agent to take specific actions, i.e. to avoid taking element in knapsack by index\n", 196 | "# state['action_mask'][0] = 0\n", 197 | "\n", 198 | "# Train an agent for 1000 states \n", 199 | "actions = np.array([trainer.compute_single_action(state) for i in range(10000)])\n", 200 | "\n", 201 | "# If action masking used, check that this action was never taken\n", 202 | "# print(any(actions==0))" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "id": "8848f472", 208 | "metadata": {}, 209 | "source": [ 210 | "# Tuning hyperparameters" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "id": "b07a0cb3", 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "# Use tune for hyperparameter tuning\n", 221 | "tune_config = {\n", 222 | " 'env': 'Knapsack-v0'\n", 223 | "}\n", 224 | "stop = {\n", 225 | " 'timesteps_total': 10000\n", 226 | "}\n", 227 | "results = tune.run(\n", 228 | " 'PPO', # Specify the algorithm to train\n", 229 | " config=tune_config,\n", 230 | " stop=stop\n", 231 | ") " 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "id": "8e31e7ab", 237 | "metadata": {}, 238 | "source": [ 239 | "Plot the results" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "id": "817660cf", 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "colors = plt.rcParams['axes.prop_cycle'].by_key()['color']\n", 250 | "df = results.dataframe()\n", 251 | "# Get column for total loss, policy loss, and value loss\n", 252 | "tl_col = [i for i, j in enumerate(df.columns)\n", 253 | " if 'total_loss' in j][0]\n", 254 | "pl_col = [i for i, j in enumerate(df.columns)\n", 255 | " if 'policy_loss' in j][0]\n", 256 | "vl_col = [i for i, j in enumerate(df.columns)\n", 257 | " if 'vf_loss' in j][0]\n", 258 | "labels = []\n", 259 | "fig, ax = plt.subplots(2, 2, figsize=(15, 15), sharex=True)\n", 260 | "for i, path in df['logdir'].iteritems():\n", 261 | " data = pd.read_csv(path + '/progress.csv')\n", 262 | " # Get labels for legend\n", 263 | " lr = data['experiment_id'][0]\n", 264 | " layers = data['training_iteration'][0]\n", 265 | " labels.append('LR={}; Shared Layers={}'.format(lr, layers))\n", 266 | " \n", 267 | " ax[0, 0].plot(data['timesteps_total'], 
\n", 268 | " data['episode_reward_mean'], c=colors[i],\n", 269 | " label=labels[-1])\n", 270 | " \n", 271 | " ax[0, 1].plot(data['timesteps_total'], \n", 272 | " data.iloc[:, tl_col], c=colors[i],\n", 273 | " label=labels[-1])\n", 274 | " \n", 275 | " ax[1, 0].plot(data['timesteps_total'], \n", 276 | " data.iloc[:, pl_col], c=colors[i],\n", 277 | " label=labels[-1])\n", 278 | " \n", 279 | " \n", 280 | " ax[1, 1].plot(data['timesteps_total'], \n", 281 | " data.iloc[:, vl_col], c=colors[i],\n", 282 | " label=labels[-1])\n", 283 | " \n", 284 | "ax[0, 0].set_ylabel('Mean Rewards')\n", 285 | "ax[0, 0].set_title('Training Rewards by Time Step')\n", 286 | "ax[0, 0].legend(labels=labels, loc='upper center',\n", 287 | " ncol=3, bbox_to_anchor=[0.75, 1.2])\n", 288 | " \n", 289 | " \n", 290 | "ax[0, 1].set_title('Total Loss by Time Step')\n", 291 | "ax[0, 1].set_ylabel('Total Loss')\n", 292 | "ax[0, 1].set_xlabel('Training Episodes')\n", 293 | " \n", 294 | "ax[1, 0].set_title('Policy Loss by Time Step')\n", 295 | "ax[1, 0].set_ylabel('Policy Loss')\n", 296 | "ax[1, 0].set_xlabel('Time Step')\n", 297 | " \n", 298 | "ax[1, 1].set_title('Value Loss by Time Step')\n", 299 | "ax[1, 1].set_ylabel('Value Loss')\n", 300 | "ax[1, 1].set_xlabel('Time Step')\n", 301 | " \n", 302 | "plt.show()" 303 | ] 304 | } 305 | ], 306 | "metadata": { 307 | "kernelspec": { 308 | "display_name": "or-gym", 309 | "language": "python", 310 | "name": "or-gym" 311 | }, 312 | "language_info": { 313 | "codemirror_mode": { 314 | "name": "ipython", 315 | "version": 3 316 | }, 317 | "file_extension": ".py", 318 | "mimetype": "text/x-python", 319 | "name": "python", 320 | "nbconvert_exporter": "python", 321 | "pygments_lexer": "ipython3", 322 | "version": "3.8.12" 323 | } 324 | }, 325 | "nbformat": 4, 326 | "nbformat_minor": 5 327 | } 328 | -------------------------------------------------------------------------------- /examples/inv-management-quickstart.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "937597e4", 6 | "metadata": {}, 7 | "source": [ 8 | "# How to Use Deep Reinforcement Learning to Improve your Supply Chain\n", 9 | "\n", 10 | "Full write up available [here](https://www.datahubbs.com/how-to-use-deep-reinforcement-learning-to-improve-your-supply-chain/).\n", 11 | "\n", 12 | "Note Ray is not a dependency of OR-Gym. We want OR-Gym to be able to stand independently of other RL libraries as much as possible.\n", 13 | "\n", 14 | "There have been breaking changes that have been introduced in later version of Ray which affect this environment in particular. 
To ensure no conflicts, please run:\n", 15 | "- `pip install ray==1.0.0`\n", 16 | "- `pip install ray[rllib]`\n", 17 | "- `pip install ray[tune]`\n", 18 | "- `pip install tensorflow==2.3.0`\n", 19 | "\n" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 1, 25 | "id": "fefefc51", 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "name": "stdout", 30 | "output_type": "stream", 31 | "text": [ 32 | "WARNING:tensorflow:From /home/christian/anaconda3/envs/or-gym-dev/lib/python3.8/site-packages/tensorflow/python/compat/v2_compat.py:96: disable_resource_variables (from tensorflow.python.ops.variable_scope) is deprecated and will be removed in a future version.\n", 33 | "Instructions for updating:\n", 34 | "non-resource variables are not supported in the long term\n" 35 | ] 36 | } 37 | ], 38 | "source": [ 39 | "import or_gym\n", 40 | "from or_gym.utils import create_env\n", 41 | "import ray\n", 42 | "from ray.rllib.agents.ppo import PPOTrainer\n", 43 | "from ray import tune\n", 44 | "import numpy as np\n", 45 | "import matplotlib.pyplot as plt\n", 46 | "from matplotlib import gridspec" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 2, 52 | "id": "40fa580e", 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "def register_env(env_name, env_config={}):\n", 57 | " env = create_env(env_name)\n", 58 | " tune.register_env(env_name, \n", 59 | " lambda env_name: env(env_name,\n", 60 | " env_config=env_config))\n", 61 | "\n", 62 | "# Environment and RL Configuration Settings\n", 63 | "env_name = 'InvManagement-v1'\n", 64 | "# env_name = \"Knapsack-v0\"\n", 65 | "env_config = {} # Change environment parameters here\n", 66 | "rl_config = dict(\n", 67 | " env=env_name,\n", 68 | " num_workers=2,\n", 69 | " env_config=env_config,\n", 70 | " model=dict(\n", 71 | " vf_share_layers=False,\n", 72 | " fcnet_activation='elu',\n", 73 | " fcnet_hiddens=[256, 256]\n", 74 | " ),\n", 75 | " lr=1e-5\n", 76 | ")\n", 77 | " \n", 78 | "# Register environment\n", 79 | "register_env(env_name, env_config)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 3, 85 | "id": "ea13304f", 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "name": "stderr", 90 | "output_type": "stream", 91 | "text": [ 92 | "2022-09-02 10:53:41,358\tINFO services.py:1164 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265\u001b[39m\u001b[22m\n", 93 | "2022-09-02 10:53:44,394\tINFO trainer.py:591 -- Tip: set framework=tfe or the --eager flag to enable TensorFlow eager execution\n", 94 | "2022-09-02 10:53:44,398\tINFO trainer.py:616 -- Current log_level is WARN. 
For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.\n", 95 | "\u001b[2m\u001b[36m(pid=9662)\u001b[0m WARNING:tensorflow:From /home/christian/anaconda3/envs/or-gym-dev/lib/python3.8/site-packages/tensorflow/python/compat/v2_compat.py:96: disable_resource_variables (from tensorflow.python.ops.variable_scope) is deprecated and will be removed in a future version.\n", 96 | "\u001b[2m\u001b[36m(pid=9662)\u001b[0m Instructions for updating:\n", 97 | "\u001b[2m\u001b[36m(pid=9662)\u001b[0m non-resource variables are not supported in the long term\n", 98 | "\u001b[2m\u001b[36m(pid=9660)\u001b[0m WARNING:tensorflow:From /home/christian/anaconda3/envs/or-gym-dev/lib/python3.8/site-packages/tensorflow/python/compat/v2_compat.py:96: disable_resource_variables (from tensorflow.python.ops.variable_scope) is deprecated and will be removed in a future version.\n", 99 | "\u001b[2m\u001b[36m(pid=9660)\u001b[0m Instructions for updating:\n", 100 | "\u001b[2m\u001b[36m(pid=9660)\u001b[0m non-resource variables are not supported in the long term\n", 101 | "2022-09-02 10:54:04,675\tINFO trainable.py:252 -- Trainable.setup took 20.284 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.\n", 102 | "2022-09-02 10:54:04,677\tWARNING util.py:39 -- Install gputil for GPU system monitoring.\n" 103 | ] 104 | }, 105 | { 106 | "name": "stdout", 107 | "output_type": "stream", 108 | "text": [ 109 | "WARNING:tensorflow:From /home/christian/anaconda3/envs/or-gym-dev/lib/python3.8/site-packages/ray/rllib/policy/tf_policy.py:872: Variable.load (from tensorflow.python.ops.variables) is deprecated and will be removed in a future version.\n", 110 | "Instructions for updating:\n", 111 | "Prefer Variable.assign which has equivalent behavior in 2.X.\n" 112 | ] 113 | }, 114 | { 115 | "name": "stderr", 116 | "output_type": "stream", 117 | "text": [ 118 | "\u001b[2m\u001b[36m(pid=9662)\u001b[0m WARNING:tensorflow:From /home/christian/anaconda3/envs/or-gym-dev/lib/python3.8/site-packages/ray/rllib/policy/tf_policy.py:872: Variable.load (from tensorflow.python.ops.variables) is deprecated and will be removed in a future version.\n", 119 | "\u001b[2m\u001b[36m(pid=9662)\u001b[0m Instructions for updating:\n", 120 | "\u001b[2m\u001b[36m(pid=9662)\u001b[0m Prefer Variable.assign which has equivalent behavior in 2.X.\n", 121 | "\u001b[2m\u001b[36m(pid=9660)\u001b[0m WARNING:tensorflow:From /home/christian/anaconda3/envs/or-gym-dev/lib/python3.8/site-packages/ray/rllib/policy/tf_policy.py:872: Variable.load (from tensorflow.python.ops.variables) is deprecated and will be removed in a future version.\n", 122 | "\u001b[2m\u001b[36m(pid=9660)\u001b[0m Instructions for updating:\n", 123 | "\u001b[2m\u001b[36m(pid=9660)\u001b[0m Prefer Variable.assign which has equivalent behavior in 2.X.\n" 124 | ] 125 | }, 126 | { 127 | "name": "stdout", 128 | "output_type": "stream", 129 | "text": [ 130 | "Iter: 100\tReward: 233.14" 131 | ] 132 | } 133 | ], 134 | "source": [ 135 | "# Initialize Ray and Build Agent\n", 136 | "ray.init(ignore_reinit_error=True)\n", 137 | "agent = PPOTrainer(env=env_name,\n", 138 | " config=rl_config)\n", 139 | " \n", 140 | "results = []\n", 141 | "for i in range(500):\n", 142 | " res = agent.train()\n", 143 | " results.append(res)\n", 144 | " if (i+1) % 5 == 0:\n", 145 | " print('\\rIter: {}\\tReward: {:.2f}'.format(\n", 146 | " i+1, res['episode_reward_mean']), end='')\n", 147 | "ray.shutdown()" 148 | ] 149 | }, 150 | { 151 | 
"cell_type": "code", 152 | "execution_count": null, 153 | "id": "793e41cf", 154 | "metadata": {}, 155 | "outputs": [ 156 | { 157 | "data": { 158 | "text/plain": [ 159 | "Box(3,)" 160 | ] 161 | }, 162 | "execution_count": 5, 163 | "metadata": {}, 164 | "output_type": "execute_result" 165 | } 166 | ], 167 | "source": [ 168 | "# Unpack values from each iteration\n", 169 | "rewards = np.hstack([i['hist_stats']['episode_reward'] \n", 170 | " for i in results])\n", 171 | "pol_loss = [\n", 172 | " i['info']['learner']['default_policy']['policy_loss'] \n", 173 | " for i in results]\n", 174 | "vf_loss = [\n", 175 | " i['info']['learner']['default_policy']['vf_loss'] \n", 176 | " for i in results]\n", 177 | "p = 100\n", 178 | "mean_rewards = np.array([np.mean(rewards[i-p:i+1]) \n", 179 | " if i >= p else np.mean(rewards[:i+1]) \n", 180 | " for i, _ in enumerate(rewards)])\n", 181 | "std_rewards = np.array([np.std(rewards[i-p:i+1])\n", 182 | " if i >= p else np.std(rewards[:i+1])\n", 183 | " for i, _ in enumerate(rewards)])\n", 184 | "fig = plt.figure(constrained_layout=True, figsize=(20, 10))\n", 185 | "gs = fig.add_gridspec(2, 4)\n", 186 | "ax0 = fig.add_subplot(gs[:, :-2])\n", 187 | "ax0.fill_between(np.arange(len(mean_rewards)), \n", 188 | " mean_rewards - std_rewards, \n", 189 | " mean_rewards + std_rewards, \n", 190 | " label='Standard Deviation', alpha=0.3)\n", 191 | "ax0.plot(mean_rewards, label='Mean Rewards')\n", 192 | "ax0.set_ylabel('Rewards')\n", 193 | "ax0.set_xlabel('Episode')\n", 194 | "ax0.set_title('Training Rewards')\n", 195 | "ax0.legend()\n", 196 | "ax1 = fig.add_subplot(gs[0, 2:])\n", 197 | "ax1.plot(pol_loss)\n", 198 | "ax1.set_ylabel('Loss')\n", 199 | "ax1.set_xlabel('Iteration')\n", 200 | "ax1.set_title('Policy Loss')\n", 201 | "ax2 = fig.add_subplot(gs[1, 2:])\n", 202 | "ax2.plot(vf_loss)\n", 203 | "ax2.set_ylabel('Loss')\n", 204 | "ax2.set_xlabel('Iteration')\n", 205 | "ax2.set_title('Value Function Loss')\n", 206 | "plt.show()" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "id": "9e494fbe", 212 | "metadata": {}, 213 | "source": [ 214 | "# Derivative Free Optimization" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "id": "44bb7398", 221 | "metadata": {}, 222 | "outputs": [ 223 | { 224 | "ename": "AssertionError", 225 | "evalue": "", 226 | "output_type": "error", 227 | "traceback": [ 228 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 229 | "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", 230 | "\u001b[1;32m/home/christian/GitHub/or-gym/examples/how-to-use-rl-to-improve-your-supply-chain.ipynb Cell 6\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2\u001b[0m high_ \u001b[39m=\u001b[39m np\u001b[39m.\u001b[39mmax(env\u001b[39m.\u001b[39maction_space\u001b[39m.\u001b[39mhigh)\n\u001b[1;32m 4\u001b[0m \u001b[39massert\u001b[39;00m np\u001b[39m.\u001b[39mall(env\u001b[39m.\u001b[39maction_space\u001b[39m.\u001b[39mlow \u001b[39m==\u001b[39m low_)\n\u001b[0;32m----> 5\u001b[0m \u001b[39massert\u001b[39;00m np\u001b[39m.\u001b[39mall(env\u001b[39m.\u001b[39maction_space\u001b[39m.\u001b[39mhigh \u001b[39m==\u001b[39m high_)\n", 231 | "\u001b[0;31mAssertionError\u001b[0m: " 232 | ] 233 | } 234 | ], 235 | "source": [ 236 | "from scipy.optimize import minimize" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "id": "741bbd80", 243 | "metadata": {}, 244 | "outputs": [ 245 | { 246 | "data": { 247 | 
"text/plain": [ 248 | "(100, array([100, 90, 80], dtype=int16), array([0, 0, 0], dtype=int16))" 249 | ] 250 | }, 251 | "execution_count": 9, 252 | "metadata": {}, 253 | "output_type": "execute_result" 254 | } 255 | ], 256 | "source": [ 257 | "def base_stock_policy(policy, env):\n", 258 | " '''\n", 259 | " Implements a re-order up-to policy. This means that for\n", 260 | " each node in the network, if the inventory at that node \n", 261 | " falls below the level denoted by the policy, we will \n", 262 | " re-order inventory to bring it to the policy level.\n", 263 | " \n", 264 | " For example, policy at a node is 10, current inventory\n", 265 | " is 5: the action is to order 5 units.\n", 266 | " '''\n", 267 | " assert len(policy) == len(env.init_inv), (\n", 268 | " 'Policy should match number of nodes in network' + \n", 269 | " '({}, {}).'.format(len(policy), len(env.init_inv)))\n", 270 | " \n", 271 | " # Get echelon inventory levels\n", 272 | " if env.period == 0:\n", 273 | " inv_ech = np.cumsum(env.I[env.period] +\n", 274 | " env.T[env.period])\n", 275 | " else:\n", 276 | " inv_ech = np.cumsum(env.I[env.period] +\n", 277 | " env.T[env.period] - env.B[env.period-1, :-1])\n", 278 | " \n", 279 | " # Get unconstrained actions\n", 280 | " unc_actions = policy - inv_ech\n", 281 | " unc_actions = np.where(unc_actions>0, unc_actions, 0)\n", 282 | " \n", 283 | " # Ensure that actions can be fulfilled by checking \n", 284 | " # constraints\n", 285 | " inv_const = np.hstack([env.I[env.period, 1:], np.Inf])\n", 286 | " actions = np.minimum(env.c, np.minimum(unc_actions, inv_const))\n", 287 | " return actions\n", 288 | "\n", 289 | "def dfo_func(policy, env, *args):\n", 290 | " '''\n", 291 | " Runs an episode based on current base-stock model \n", 292 | " settings. 
This allows us to use our environment for the \n", 293 | " DFO optimizer.\n", 294 | " '''\n", 295 | " env.reset() # Ensure env is fresh\n", 296 | " rewards = []\n", 297 | " done = False\n", 298 | " while not done:\n", 299 | " action = base_stock_policy(policy, env)\n", 300 | " state, reward, done, _ = env.step(action)\n", 301 | " rewards.append(reward)\n", 302 | " if done:\n", 303 | " break\n", 304 | " \n", 305 | " rewards = np.array(rewards)\n", 306 | " prob = env.demand_dist.pmf(env.D, **env.dist_param)\n", 307 | " \n", 308 | " # Return negative of expected profit\n", 309 | " return -1 / env.num_periods * np.sum(prob * rewards)\n", 310 | " \n", 311 | "def optimize_inventory_policy(env_name, fun,\n", 312 | " init_policy=None, env_config={}, method='Powell'):\n", 313 | " \n", 314 | " env = or_gym.make(env_name, env_config=env_config)\n", 315 | " \n", 316 | " if init_policy is None:\n", 317 | " init_policy = np.ones(env.num_stages-1)\n", 318 | " \n", 319 | " # Optimize policy\n", 320 | " out = minimize(fun=fun, x0=init_policy, args=env, \n", 321 | " method=method)\n", 322 | " policy = out.x.copy()\n", 323 | " \n", 324 | " # Policy must be positive integer\n", 325 | " policy = np.round(np.maximum(policy, 0), 0).astype(int)\n", 326 | " \n", 327 | " return policy, out" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": null, 333 | "id": "3e11da7d", 334 | "metadata": {}, 335 | "outputs": [ 336 | { 337 | "data": { 338 | "text/plain": [ 339 | "Box(3,)" 340 | ] 341 | }, 342 | "execution_count": 8, 343 | "metadata": {}, 344 | "output_type": "execute_result" 345 | } 346 | ], 347 | "source": [ 348 | "policy, out = optimize_inventory_policy('InvManagement-v1',\n", 349 | " dfo_func)\n", 350 | "print(\"Re-order levels: {}\".format(policy))\n", 351 | "print(\"DFO Info:\\n{}\".format(out))\n", 352 | "\n", 353 | "env = or_gym.make(env_name, env_config=env_config)\n", 354 | "eps = 1000\n", 355 | "rewards = []\n", 356 | "for i in range(eps):\n", 357 | " env.reset()\n", 358 | " reward = 0\n", 359 | " while True:\n", 360 | " action = base_stock_policy(policy, env)\n", 361 | " s, r, done, _ = env.step(action)\n", 362 | " reward += r\n", 363 | " if done:\n", 364 | " rewards.append(reward)\n", 365 | " break" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": null, 371 | "id": "def5147b", 372 | "metadata": {}, 373 | "outputs": [], 374 | "source": [] 375 | } 376 | ], 377 | "metadata": { 378 | "kernelspec": { 379 | "display_name": "Python 3.8.0 ('or-gym-dev')", 380 | "language": "python", 381 | "name": "python3" 382 | }, 383 | "language_info": { 384 | "codemirror_mode": { 385 | "name": "ipython", 386 | "version": 3 387 | }, 388 | "file_extension": ".py", 389 | "mimetype": "text/x-python", 390 | "name": "python", 391 | "nbconvert_exporter": "python", 392 | "pygments_lexer": "ipython3", 393 | "version": "3.8.13" 394 | }, 395 | "vscode": { 396 | "interpreter": { 397 | "hash": "bc8a2230aa8b659650bd48bf6a546b4d453aa64d7078ee0770a23a54a48157c8" 398 | } 399 | } 400 | }, 401 | "nbformat": 4, 402 | "nbformat_minor": 5 403 | } 404 | -------------------------------------------------------------------------------- /examples/how-to-use-rl-to-improve-your-supply-chain.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "937597e4", 6 | "metadata": {}, 7 | "source": [ 8 | "# How to Use Deep Reinforcement Learning to Improve your Supply Chain\n", 9 | "\n", 10 | "Full write up 
available [here](https://www.datahubbs.com/how-to-use-deep-reinforcement-learning-to-improve-your-supply-chain/).\n", 11 | "\n", 12 | "Note Ray is not a dependency of OR-Gym. We want OR-Gym to be able to stand independently of other RL libraries as much as possible.\n", 13 | "\n", 14 | "There have been breaking changes that have been introduced in later version of Ray which affect this environment in particular. To ensure no conflicts, please run:\n", 15 | "- `pip install ray==1.0.0`\n", 16 | "- `pip install ray[rllib]`\n", 17 | "- `pip install ray[tune]`\n", 18 | "- `pip install tensorflow==2.3.0`\n", 19 | "\n" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 1, 25 | "id": "fefefc51", 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "name": "stdout", 30 | "output_type": "stream", 31 | "text": [ 32 | "WARNING:tensorflow:From /home/christian/anaconda3/envs/or-gym-dev/lib/python3.8/site-packages/tensorflow/python/compat/v2_compat.py:96: disable_resource_variables (from tensorflow.python.ops.variable_scope) is deprecated and will be removed in a future version.\n", 33 | "Instructions for updating:\n", 34 | "non-resource variables are not supported in the long term\n" 35 | ] 36 | } 37 | ], 38 | "source": [ 39 | "import or_gym\n", 40 | "from or_gym.utils import create_env\n", 41 | "import ray\n", 42 | "from ray.rllib.agents.ppo import PPOTrainer\n", 43 | "from ray import tune\n", 44 | "import numpy as np\n", 45 | "import matplotlib.pyplot as plt\n", 46 | "from matplotlib import gridspec" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 2, 52 | "id": "40fa580e", 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "def register_env(env_name, env_config={}):\n", 57 | " env = create_env(env_name)\n", 58 | " tune.register_env(env_name, \n", 59 | " lambda env_name: env(env_name,\n", 60 | " env_config=env_config))\n", 61 | "\n", 62 | "# Environment and RL Configuration Settings\n", 63 | "env_name = 'InvManagement-v1'\n", 64 | "# env_name = \"Knapsack-v0\"\n", 65 | "env_config = {} # Change environment parameters here\n", 66 | "rl_config = dict(\n", 67 | " env=env_name,\n", 68 | " num_workers=2,\n", 69 | " env_config=env_config,\n", 70 | " model=dict(\n", 71 | " vf_share_layers=False,\n", 72 | " fcnet_activation='elu',\n", 73 | " fcnet_hiddens=[256, 256]\n", 74 | " ),\n", 75 | " lr=1e-5\n", 76 | ")\n", 77 | " \n", 78 | "# Register environment\n", 79 | "register_env(env_name, env_config)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 3, 85 | "id": "ea13304f", 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "name": "stderr", 90 | "output_type": "stream", 91 | "text": [ 92 | "2022-09-02 10:53:41,358\tINFO services.py:1164 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265\u001b[39m\u001b[22m\n", 93 | "2022-09-02 10:53:44,394\tINFO trainer.py:591 -- Tip: set framework=tfe or the --eager flag to enable TensorFlow eager execution\n", 94 | "2022-09-02 10:53:44,398\tINFO trainer.py:616 -- Current log_level is WARN. 
For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.\n", 95 | "\u001b[2m\u001b[36m(pid=9662)\u001b[0m WARNING:tensorflow:From /home/christian/anaconda3/envs/or-gym-dev/lib/python3.8/site-packages/tensorflow/python/compat/v2_compat.py:96: disable_resource_variables (from tensorflow.python.ops.variable_scope) is deprecated and will be removed in a future version.\n", 96 | "\u001b[2m\u001b[36m(pid=9662)\u001b[0m Instructions for updating:\n", 97 | "\u001b[2m\u001b[36m(pid=9662)\u001b[0m non-resource variables are not supported in the long term\n", 98 | "\u001b[2m\u001b[36m(pid=9660)\u001b[0m WARNING:tensorflow:From /home/christian/anaconda3/envs/or-gym-dev/lib/python3.8/site-packages/tensorflow/python/compat/v2_compat.py:96: disable_resource_variables (from tensorflow.python.ops.variable_scope) is deprecated and will be removed in a future version.\n", 99 | "\u001b[2m\u001b[36m(pid=9660)\u001b[0m Instructions for updating:\n", 100 | "\u001b[2m\u001b[36m(pid=9660)\u001b[0m non-resource variables are not supported in the long term\n", 101 | "2022-09-02 10:54:04,675\tINFO trainable.py:252 -- Trainable.setup took 20.284 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.\n", 102 | "2022-09-02 10:54:04,677\tWARNING util.py:39 -- Install gputil for GPU system monitoring.\n" 103 | ] 104 | }, 105 | { 106 | "name": "stdout", 107 | "output_type": "stream", 108 | "text": [ 109 | "WARNING:tensorflow:From /home/christian/anaconda3/envs/or-gym-dev/lib/python3.8/site-packages/ray/rllib/policy/tf_policy.py:872: Variable.load (from tensorflow.python.ops.variables) is deprecated and will be removed in a future version.\n", 110 | "Instructions for updating:\n", 111 | "Prefer Variable.assign which has equivalent behavior in 2.X.\n" 112 | ] 113 | }, 114 | { 115 | "name": "stderr", 116 | "output_type": "stream", 117 | "text": [ 118 | "\u001b[2m\u001b[36m(pid=9662)\u001b[0m WARNING:tensorflow:From /home/christian/anaconda3/envs/or-gym-dev/lib/python3.8/site-packages/ray/rllib/policy/tf_policy.py:872: Variable.load (from tensorflow.python.ops.variables) is deprecated and will be removed in a future version.\n", 119 | "\u001b[2m\u001b[36m(pid=9662)\u001b[0m Instructions for updating:\n", 120 | "\u001b[2m\u001b[36m(pid=9662)\u001b[0m Prefer Variable.assign which has equivalent behavior in 2.X.\n", 121 | "\u001b[2m\u001b[36m(pid=9660)\u001b[0m WARNING:tensorflow:From /home/christian/anaconda3/envs/or-gym-dev/lib/python3.8/site-packages/ray/rllib/policy/tf_policy.py:872: Variable.load (from tensorflow.python.ops.variables) is deprecated and will be removed in a future version.\n", 122 | "\u001b[2m\u001b[36m(pid=9660)\u001b[0m Instructions for updating:\n", 123 | "\u001b[2m\u001b[36m(pid=9660)\u001b[0m Prefer Variable.assign which has equivalent behavior in 2.X.\n" 124 | ] 125 | }, 126 | { 127 | "name": "stdout", 128 | "output_type": "stream", 129 | "text": [ 130 | "Iter: 155\tReward: 339.55" 131 | ] 132 | } 133 | ], 134 | "source": [ 135 | "# Initialize Ray and Build Agent\n", 136 | "ray.init(ignore_reinit_error=True)\n", 137 | "agent = PPOTrainer(env=env_name,\n", 138 | " config=rl_config)\n", 139 | " \n", 140 | "results = []\n", 141 | "for i in range(500):\n", 142 | " res = agent.train()\n", 143 | " results.append(res)\n", 144 | " if (i+1) % 5 == 0:\n", 145 | " print('\\rIter: {}\\tReward: {:.2f}'.format(\n", 146 | " i+1, res['episode_reward_mean']), end='')\n", 147 | "ray.shutdown()" 148 | ] 149 | }, 150 | { 151 | 
"cell_type": "code", 152 | "execution_count": null, 153 | "id": "793e41cf", 154 | "metadata": {}, 155 | "outputs": [ 156 | { 157 | "data": { 158 | "text/plain": [ 159 | "Box(3,)" 160 | ] 161 | }, 162 | "execution_count": 5, 163 | "metadata": {}, 164 | "output_type": "execute_result" 165 | } 166 | ], 167 | "source": [ 168 | "# Unpack values from each iteration\n", 169 | "rewards = np.hstack([i['hist_stats']['episode_reward'] \n", 170 | " for i in results])\n", 171 | "pol_loss = [\n", 172 | " i['info']['learner']['default_policy']['policy_loss'] \n", 173 | " for i in results]\n", 174 | "vf_loss = [\n", 175 | " i['info']['learner']['default_policy']['vf_loss'] \n", 176 | " for i in results]\n", 177 | "p = 100\n", 178 | "mean_rewards = np.array([np.mean(rewards[i-p:i+1]) \n", 179 | " if i >= p else np.mean(rewards[:i+1]) \n", 180 | " for i, _ in enumerate(rewards)])\n", 181 | "std_rewards = np.array([np.std(rewards[i-p:i+1])\n", 182 | " if i >= p else np.std(rewards[:i+1])\n", 183 | " for i, _ in enumerate(rewards)])\n", 184 | "fig = plt.figure(constrained_layout=True, figsize=(20, 10))\n", 185 | "gs = fig.add_gridspec(2, 4)\n", 186 | "ax0 = fig.add_subplot(gs[:, :-2])\n", 187 | "ax0.fill_between(np.arange(len(mean_rewards)), \n", 188 | " mean_rewards - std_rewards, \n", 189 | " mean_rewards + std_rewards, \n", 190 | " label='Standard Deviation', alpha=0.3)\n", 191 | "ax0.plot(mean_rewards, label='Mean Rewards')\n", 192 | "ax0.set_ylabel('Rewards')\n", 193 | "ax0.set_xlabel('Episode')\n", 194 | "ax0.set_title('Training Rewards')\n", 195 | "ax0.legend()\n", 196 | "ax1 = fig.add_subplot(gs[0, 2:])\n", 197 | "ax1.plot(pol_loss)\n", 198 | "ax1.set_ylabel('Loss')\n", 199 | "ax1.set_xlabel('Iteration')\n", 200 | "ax1.set_title('Policy Loss')\n", 201 | "ax2 = fig.add_subplot(gs[1, 2:])\n", 202 | "ax2.plot(vf_loss)\n", 203 | "ax2.set_ylabel('Loss')\n", 204 | "ax2.set_xlabel('Iteration')\n", 205 | "ax2.set_title('Value Function Loss')\n", 206 | "plt.show()" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "id": "9e494fbe", 212 | "metadata": {}, 213 | "source": [ 214 | "# Derivative Free Optimization" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "id": "44bb7398", 221 | "metadata": {}, 222 | "outputs": [ 223 | { 224 | "ename": "AssertionError", 225 | "evalue": "", 226 | "output_type": "error", 227 | "traceback": [ 228 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 229 | "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", 230 | "\u001b[1;32m/home/christian/GitHub/or-gym/examples/how-to-use-rl-to-improve-your-supply-chain.ipynb Cell 6\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2\u001b[0m high_ \u001b[39m=\u001b[39m np\u001b[39m.\u001b[39mmax(env\u001b[39m.\u001b[39maction_space\u001b[39m.\u001b[39mhigh)\n\u001b[1;32m 4\u001b[0m \u001b[39massert\u001b[39;00m np\u001b[39m.\u001b[39mall(env\u001b[39m.\u001b[39maction_space\u001b[39m.\u001b[39mlow \u001b[39m==\u001b[39m low_)\n\u001b[0;32m----> 5\u001b[0m \u001b[39massert\u001b[39;00m np\u001b[39m.\u001b[39mall(env\u001b[39m.\u001b[39maction_space\u001b[39m.\u001b[39mhigh \u001b[39m==\u001b[39m high_)\n", 231 | "\u001b[0;31mAssertionError\u001b[0m: " 232 | ] 233 | } 234 | ], 235 | "source": [ 236 | "from scipy.optimize import minimize" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "id": "741bbd80", 243 | "metadata": {}, 244 | "outputs": [ 245 | { 246 | "data": { 247 | 
"text/plain": [ 248 | "(100, array([100, 90, 80], dtype=int16), array([0, 0, 0], dtype=int16))" 249 | ] 250 | }, 251 | "execution_count": 9, 252 | "metadata": {}, 253 | "output_type": "execute_result" 254 | } 255 | ], 256 | "source": [ 257 | "def base_stock_policy(policy, env):\n", 258 | " '''\n", 259 | " Implements a re-order up-to policy. This means that for\n", 260 | " each node in the network, if the inventory at that node \n", 261 | " falls below the level denoted by the policy, we will \n", 262 | " re-order inventory to bring it to the policy level.\n", 263 | " \n", 264 | " For example, policy at a node is 10, current inventory\n", 265 | " is 5: the action is to order 5 units.\n", 266 | " '''\n", 267 | " assert len(policy) == len(env.init_inv), (\n", 268 | " 'Policy should match number of nodes in network' + \n", 269 | " '({}, {}).'.format(len(policy), len(env.init_inv)))\n", 270 | " \n", 271 | " # Get echelon inventory levels\n", 272 | " if env.period == 0:\n", 273 | " inv_ech = np.cumsum(env.I[env.period] +\n", 274 | " env.T[env.period])\n", 275 | " else:\n", 276 | " inv_ech = np.cumsum(env.I[env.period] +\n", 277 | " env.T[env.period] - env.B[env.period-1, :-1])\n", 278 | " \n", 279 | " # Get unconstrained actions\n", 280 | " unc_actions = policy - inv_ech\n", 281 | " unc_actions = np.where(unc_actions>0, unc_actions, 0)\n", 282 | " \n", 283 | " # Ensure that actions can be fulfilled by checking \n", 284 | " # constraints\n", 285 | " inv_const = np.hstack([env.I[env.period, 1:], np.Inf])\n", 286 | " actions = np.minimum(env.c, np.minimum(unc_actions, inv_const))\n", 287 | " return actions\n", 288 | "\n", 289 | "def dfo_func(policy, env, *args):\n", 290 | " '''\n", 291 | " Runs an episode based on current base-stock model \n", 292 | " settings. 
This allows us to use our environment for the \n", 293 | " DFO optimizer.\n", 294 | " '''\n", 295 | " env.reset() # Ensure env is fresh\n", 296 | " rewards = []\n", 297 | " done = False\n", 298 | " while not done:\n", 299 | " action = base_stock_policy(policy, env)\n", 300 | " state, reward, done, _ = env.step(action)\n", 301 | " rewards.append(reward)\n", 302 | " if done:\n", 303 | " break\n", 304 | " \n", 305 | " rewards = np.array(rewards)\n", 306 | " prob = env.demand_dist.pmf(env.D, **env.dist_param)\n", 307 | " \n", 308 | " # Return negative of expected profit\n", 309 | " return -1 / env.num_periods * np.sum(prob * rewards)\n", 310 | " \n", 311 | "def optimize_inventory_policy(env_name, fun,\n", 312 | " init_policy=None, env_config={}, method='Powell'):\n", 313 | " \n", 314 | " env = or_gym.make(env_name, env_config=env_config)\n", 315 | " \n", 316 | " if init_policy is None:\n", 317 | " init_policy = np.ones(env.num_stages-1)\n", 318 | " \n", 319 | " # Optimize policy\n", 320 | " out = minimize(fun=fun, x0=init_policy, args=env, \n", 321 | " method=method)\n", 322 | " policy = out.x.copy()\n", 323 | " \n", 324 | " # Policy must be positive integer\n", 325 | " policy = np.round(np.maximum(policy, 0), 0).astype(int)\n", 326 | " \n", 327 | " return policy, out" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": null, 333 | "id": "3e11da7d", 334 | "metadata": {}, 335 | "outputs": [ 336 | { 337 | "data": { 338 | "text/plain": [ 339 | "Box(3,)" 340 | ] 341 | }, 342 | "execution_count": 8, 343 | "metadata": {}, 344 | "output_type": "execute_result" 345 | } 346 | ], 347 | "source": [ 348 | "policy, out = optimize_inventory_policy('InvManagement-v1',\n", 349 | " dfo_func)\n", 350 | "print(\"Re-order levels: {}\".format(policy))\n", 351 | "print(\"DFO Info:\\n{}\".format(out))\n", 352 | "\n", 353 | "env = or_gym.make(env_name, env_config=env_config)\n", 354 | "eps = 1000\n", 355 | "rewards = []\n", 356 | "for i in range(eps):\n", 357 | " env.reset()\n", 358 | " reward = 0\n", 359 | " while True:\n", 360 | " action = base_stock_policy(policy, env)\n", 361 | " s, r, done, _ = env.step(action)\n", 362 | " reward += r\n", 363 | " if done:\n", 364 | " rewards.append(reward)\n", 365 | " break" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": null, 371 | "id": "def5147b", 372 | "metadata": {}, 373 | "outputs": [], 374 | "source": [] 375 | } 376 | ], 377 | "metadata": { 378 | "kernelspec": { 379 | "display_name": "Python 3.8.0 ('or-gym-dev')", 380 | "language": "python", 381 | "name": "python3" 382 | }, 383 | "language_info": { 384 | "codemirror_mode": { 385 | "name": "ipython", 386 | "version": 3 387 | }, 388 | "file_extension": ".py", 389 | "mimetype": "text/x-python", 390 | "name": "python", 391 | "nbconvert_exporter": "python", 392 | "pygments_lexer": "ipython3", 393 | "version": "3.8.13" 394 | }, 395 | "vscode": { 396 | "interpreter": { 397 | "hash": "bc8a2230aa8b659650bd48bf6a546b4d453aa64d7078ee0770a23a54a48157c8" 398 | } 399 | } 400 | }, 401 | "nbformat": 4, 402 | "nbformat_minor": 5 403 | } 404 | -------------------------------------------------------------------------------- /or_gym/envs/classic_or/vehicle_routing.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Example taken from Balaji et al. 
3 | Paper: https://arxiv.org/abs/1911.10641
4 | GitHub: https://github.com/awslabs/or-rl-benchmarks
5 | '''
6 | import gym
7 | from gym import spaces
8 | import or_gym
9 | from or_gym.utils import assign_env_config
10 | import random
11 | import numpy as np
12 | from scipy.stats import truncnorm
13 | 
14 | 
15 | class VehicleRoutingEnv(gym.Env):
16 |     '''
17 |     Dynamic Vehicle Routing Problem
18 | 
19 |     This environment simulates a driver working with a food delivery app
20 |     to move through a city, accept orders, pick them up from restaurants,
21 |     and deliver them to waiting customers. Each order has a specific
22 |     delivery value, restaurant, and delivery location, all of which are
23 |     known by the driver before he accepts the order. After accepting, the
24 |     driver must navigate to the restaurant to collect the order and then
25 |     deliver it. If an order isn't accepted, it may be taken by another
26 |     driver. Additionally, the driver has 60 minutes to make a delivery
27 |     from the time an order is created.
28 |     The city is represented as a grid with different zones that have
29 |     different statistics for order creation and value. At each time step,
30 |     new orders are created with a fixed probability unique to each zone.
31 |     The driver's vehicle also has a finite capacity limiting the number of
32 |     orders he can carry at a given time, although there is no limit on the
33 |     number of accepted orders.
34 |     The driver receives a penalty for time and distance spent during travel,
35 |     but receives rewards for accepting and delivering orders.
36 | 
37 |     Observation:
38 |         Type: Box
39 |         State Vector: S = (p, h, c, l, w, e, v)
40 |         p = pickup location
41 |         h = driver's current position
42 |         c = remaining vehicle capacity
43 |         l = order location
44 |         w = order status (open, accepted, picked up, delivered/inactive)
45 |         e = time elapsed since order generation
46 |         v = order value
47 | 
48 |     Action:
49 |         Type: Discrete
50 |         0 = wait
51 |         1:max_orders = accept order
52 |         max_orders:2*max_orders = pickup order
53 |         2*max_orders:3*max_orders = deliver order
54 |         3*max_orders:3*max_orders + n_restaurants = go to restaurant
55 | 
56 |     Action masking is available for this environment. Set mask=True
57 |     in the env_config dictionary.
58 | 
59 |     Reward:
60 |         The agent receives 1/3 of the order value for accepting an order,
61 |         picking it up, and delivering the order. The cost comprises
62 |         three elements: delivery time, delivery distance, and cost of failure
63 |         (if the driver does not deliver the item).
64 | 
65 |     Starting State:
66 |         Restaurant and driver locations are randomized at the start of each
67 |         episode. New orders are generated according to the order probability.
68 | 
69 |     Episode Termination:
70 |         Episode termination occurs when the total time has elapsed.
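    Example:
        An illustrative usage sketch (assumes the environment is registered
        under the id 'VehicleRouting-v0' and that parameters are passed
        through env_config, as done elsewhere in this package):

            import or_gym
            env = or_gym.make('VehicleRouting-v0', env_config={'mask': True})
            obs = env.reset()
            # With mask=True the observation is a dict with 'state',
            # 'action_mask', and 'avail_actions' entries.
            obs, reward, done, info = env.step(env.action_space.sample())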
71 | ''' 72 | 73 | def __init__(self, *args, **kwargs): 74 | self.n_restaurants = 2 75 | self.max_orders = 10 76 | self.order_prob = 0.5 77 | self.vehicle_capacity = 4 78 | self.grid = (5, 5) 79 | self.order_promise = 60 80 | self.order_timeout_prob = 0.15 81 | self.num_zones = 4 82 | self.order_probs_per_zone = [0.1, 0.5, 0.3, 0.1] 83 | self.order_reward_min = [8, 5, 2, 1] 84 | self.order_reward_max = [12, 8, 5, 3] 85 | self.half_norm_scale_reward_per_zone = [0.5, 0.5, 0.5, 0.5] 86 | self.penalty_per_timestep = 0.1 87 | self.penalty_per_move = 0.1 88 | self.order_miss_penalty = 50 89 | self.step_limit = 1000 90 | self.mask = False 91 | self.info = {} 92 | 93 | assign_env_config(self, kwargs) 94 | self._order_nums = np.arange(self.max_orders) 95 | self.loc_permutations = [(x, y) for x in range(self.grid[0]) 96 | for y in range(self.grid[1])] 97 | self.action_dim = 1 + 3 * self.max_orders + self.n_restaurants 98 | self.obs_dim = 2 * self.n_restaurants + 4 + 6 * self.max_orders 99 | box_low = np.zeros(self.obs_dim) 100 | box_high = np.hstack([ 101 | np.repeat( 102 | max(self.grid), 2 * self.n_restaurants + 2), # Locations 0-5 103 | np.repeat(self.vehicle_capacity, 2), # Vehicle capacities 6-7 104 | np.tile(np.hstack([4, self.n_restaurants, self.grid, 105 | self.order_promise, max(self.order_reward_max)]), self.max_orders) 106 | ]) 107 | 108 | if self.mask: 109 | self.observation_space = spaces.Dict({ 110 | 'action_mask': spaces.Box( 111 | low=np.zeros(self.action_dim), 112 | high=np.ones(self.action_dim), 113 | dtype=np.uint8), 114 | 'avail_actions': spaces.Box( 115 | low=np.zeros(self.action_dim), 116 | high=np.ones(self.action_dim), 117 | dtype=np.uint8), 118 | 'state': spaces.Box( 119 | low=box_low, 120 | high=box_high, 121 | dtype=np.float16) 122 | }) 123 | else: 124 | self.observation_space = spaces.Box( 125 | low=box_low, 126 | high=box_high, 127 | dtype=np.float16) 128 | 129 | self.action_space = spaces.Discrete(self.action_dim) 130 | 131 | self.reset() 132 | 133 | def _STEP(self, action): 134 | done = False 135 | self.reward = 0 136 | self.late_penalty = 0 137 | 138 | if action == 0: 139 | self.wait(action) 140 | elif action <= self.max_orders: 141 | self.accept_order(action) 142 | elif action <= 2 * self.max_orders: 143 | self.pickup_order(action) 144 | elif action <= 3 * self.max_orders: 145 | self.deliver_order(action) 146 | elif action <= 3 * self.max_orders + self.n_restaurants: 147 | self.return_to_restaurant(action) 148 | else: 149 | raise Exception( 150 | f"Selected action ({action}) outside of action space.") 151 | 152 | self.state = self._update_state() 153 | 154 | self.step_count += 1 155 | if self.step_count >= self.step_limit: 156 | done = True 157 | 158 | return self.state, self.reward, done, self.info 159 | 160 | def wait(self, action): 161 | # Do nothing 162 | pass 163 | 164 | def accept_order(self, action): 165 | # Accept order denoted by action 166 | order_idx = action - 1 167 | if order_idx not in self.order_dict.keys(): 168 | # Invalid action, do nothing 169 | pass 170 | elif self.order_dict[order_idx]['Status'] == 1: 171 | self.order_dict[order_idx]['Status'] = 2 172 | self.reward += self.order_dict[order_idx]['Value'] / 3 173 | 174 | def pickup_order(self, action): 175 | order_idx = action - self.max_orders - 1 176 | if order_idx not in self.order_dict.keys(): 177 | # Invalid action, do nothing 178 | pass 179 | else: 180 | restaurant = self.order_dict[order_idx]['RestaurantID'] 181 | restaurant_loc = self.restaurant_loc[restaurant] 182 | 
self._go_to_destination(restaurant_loc)
183 |             self.reward -= self.penalty_per_move
184 |             # Movement and pickup can occur during same time step
185 |             if self.order_dict[order_idx]['Status'] == 2 and self.driver_loc[0] == restaurant_loc[0] and self.driver_loc[1] == restaurant_loc[1]:
186 |                 if self.vehicle_load < self.vehicle_capacity:
187 |                     self.order_dict[order_idx]['Status'] = 3
188 |                     self.vehicle_load += 1
189 |                     self.reward += self.order_dict[order_idx]['Value'] / 3
190 | 
191 |     def deliver_order(self, action):
192 |         order_idx = action - 2 * self.max_orders - 1
193 |         if order_idx not in self.order_dict.keys():
194 |             # Invalid action, do nothing
195 |             pass
196 |         else:
197 |             order_loc = self.order_dict[order_idx]['DeliveryLoc']
198 |             self._go_to_destination(order_loc)
199 |             self.reward -= self.penalty_per_move
200 |             # Can deliver multiple orders simultaneously
201 |             for k, v in self.order_dict.items():
202 |                 if v['Status'] == 3 and v['DeliveryLoc'][0] == self.driver_loc[0] and v['DeliveryLoc'][1] == self.driver_loc[1]:
203 |                     if v['Time'] <= self.order_promise:
204 |                         self.reward += v['Value'] / 3  # accrue 1/3 of the value for each delivered order
205 |                         self.vehicle_load -= 1
206 |                         v['Status'] = 4 # Delivered
207 | 
208 |     def return_to_restaurant(self, action):
209 |         restaurant = action - 3 * self.max_orders - 1
210 |         restaurant_loc = self.restaurant_loc[restaurant]
211 |         self._go_to_destination(restaurant_loc)
212 |         self.reward -= self.penalty_per_move
213 | 
214 |     def _update_orders(self):
215 |         self._update_order_times()
216 |         self._remove_orders()
217 |         self._generate_orders()
218 | 
219 |     def _remove_orders(self):
220 |         # Remove orders if they're overdue
221 |         orders_to_delete = []
222 |         for k, v in self.order_dict.items():
223 |             if v['Time'] >= self.order_promise:
224 |                 if v['Status'] >= 2:
225 |                     # Apply penalty and remove associated rewards
226 |                     self.reward -= (self.order_miss_penalty +
227 |                         v['Value'] * (v['Status'] == 2)/3 +
228 |                         v['Value'] * (v['Status'] == 3) * 2/3)
229 |                     self.late_penalty += self.order_miss_penalty
230 |                     if v['Status'] == 3:
231 |                         self.vehicle_load -= 1  # expired order leaves the vehicle
232 |                 orders_to_delete.append(k)
233 | 
234 |             elif v['Status'] == 4:
235 |                 orders_to_delete.append(k)
236 | 
237 |             # Probabilistically remove open orders
238 |             elif v['Status'] == 1 and np.random.random() < self.order_timeout_prob:
239 |                 orders_to_delete.append(k)
240 | 
241 |         for k in orders_to_delete:
242 |             del self.order_dict[k]
243 | 
244 |     def _update_state(self):
245 |         self._update_orders()
246 |         # Placeholder for order data
247 |         order_array = np.zeros((self.max_orders, 6))
248 |         try:
249 |             order_data = np.hstack([v1 for v in self.order_dict.values()
250 |                 for v1 in v.values()]).reshape(-1, 7)
251 |             order_array[order_data[:, 0].astype(int)] += order_data[:, 1:]
252 |         except ValueError:
253 |             # Occurs when order_data is empty
254 |             pass
255 |         state = np.hstack([
256 |             np.hstack(self.restaurant_loc),
257 |             np.hstack(self.driver_loc),
258 |             np.hstack([self.vehicle_load, self.vehicle_capacity]),
259 |             order_array.flatten()
260 |             ], dtype=np.float16)
261 |         if self.mask:
262 |             action_mask = self._update_mask(state)
263 |             state = {
264 |                 'state': state,
265 |                 'action_mask': action_mask,
266 |                 'avail_actions': np.ones(self.action_dim, dtype=np.uint8)
267 |             }
268 |         return state
269 | 
270 |     def _update_mask(self, state):
271 |         action_mask = np.zeros(self.action_dim, dtype=np.uint8)
272 |         # Wait and return to restaurant are always allowed
273 |         action_mask[0] = 1
274 |         action_mask[(3 * self.max_orders + 1):(3 * self.max_orders + self.n_restaurants + 1)] = 1
275 | 
276 | 
for k, v in self.order_dict.items(): 277 | status = v['Status'] 278 | # Allow accepting an open order 279 | if status == 1: 280 | action_mask[k + 1] = 1 281 | # Allow navigating to accepted order for pickup 282 | elif status == 2 and self.vehicle_load < self.vehicle_capacity: 283 | action_mask[k + self.max_orders + 1] = 1 284 | # Allow delivery of picked up order 285 | elif status == 3: 286 | action_mask[k + 2 * self.max_orders + 1] = 1 287 | 288 | return action_mask 289 | 290 | def _RESET(self): 291 | self.step_count = 0 292 | self.vehicle_load = 0 293 | self.randomize_locations() 294 | self.zone_loc = self._get_zones() 295 | self.order_dict = {} 296 | self.state = self._update_state() 297 | return self.state 298 | 299 | def _update_order_times(self): 300 | for k, v in self.order_dict.items(): 301 | if v['Status'] >= 1: 302 | v['Time'] += 1 303 | 304 | def _generate_orders(self): 305 | open_slots = self._order_nums[~np.isin(self._order_nums, 306 | np.array([k for k in self.order_dict.keys()]))] 307 | try: 308 | order_num = open_slots.min() 309 | except ValueError: 310 | pass 311 | for n in open_slots: 312 | # Probabalistically create a new order 313 | if np.random.random() < self.order_prob: 314 | zone = np.random.choice( 315 | self.num_zones, p=self.order_probs_per_zone) 316 | order = self._get_order_from_zone(zone, order_num) 317 | self.order_dict[order_num] = order 318 | order_num += 1 319 | 320 | def _get_order_from_zone(self, zone, n): 321 | delivery_loc = random.choice(self.zone_loc[zone]) 322 | restaurant_idx = np.random.choice(self.n_restaurants) 323 | value = truncnorm.rvs(0, 324 | (self.order_reward_max[zone] - 325 | self.order_reward_min[zone]) 326 | / self.half_norm_scale_reward_per_zone[zone], 327 | self.order_reward_min[zone], 328 | self.half_norm_scale_reward_per_zone[zone]) 329 | return {'Number': n, 330 | 'Status': 1, 331 | 'RestaurantID': restaurant_idx, 332 | 'DeliveryLoc': delivery_loc, 333 | 'Time': 0, 334 | 'Value': value} 335 | 336 | def randomize_locations(self): 337 | self._place_restaurants() 338 | self._place_driver() 339 | 340 | def _place_restaurants(self): 341 | self.restaurant_loc = random.sample( 342 | self.loc_permutations, self.n_restaurants) 343 | 344 | def _place_driver(self): 345 | self.driver_loc = list(random.sample(self.loc_permutations, 1)[0]) 346 | 347 | def _move_driver(self, direction): 348 | if direction is None: 349 | return None 350 | # Receives direction from routing function 351 | if direction == 0: # Up 352 | self.driver_loc[1] += 1 353 | elif direction == 1: # Down 354 | self.driver_loc[1] -= 1 355 | elif direction == 2: # Right 356 | self.driver_loc[0] += 1 357 | elif direction == 3: # Left 358 | self.driver_loc[0] -= 1 359 | # Check boundaries 360 | if self.driver_loc[0] > self.grid[0]: 361 | self.driver_loc[0] = self.grid[0] 362 | if self.driver_loc[0] < 0: 363 | self.driver_loc[0] = 0 364 | if self.driver_loc[1] > self.grid[1]: 365 | self.driver_loc[1] = self.grid[1] 366 | if self.driver_loc[1] < 0: 367 | self.driver_loc[1] = 0 368 | 369 | def _go_to_destination(self, destination): 370 | # Automatically selects direction based on starting location and 371 | # destination. 
372 | # 0 -> Up; 1 -> Down; 2 -> Right; 3 -> Left 373 | x_diff = self.driver_loc[0] - destination[0] 374 | y_diff = self.driver_loc[1] - destination[1] 375 | if abs(x_diff) >= abs(y_diff): 376 | if x_diff > 0: 377 | direction = 3 378 | elif x_diff < 0: 379 | direction = 2 380 | elif abs(x_diff) == abs(y_diff): # 0 == 0 381 | # Do nothing 382 | direction = None 383 | else: 384 | if y_diff > 0: 385 | direction = 1 386 | elif y_diff < 0: 387 | direction = 0 388 | print('direction ',direction) 389 | self._move_driver(direction) 390 | 391 | def _get_num_spaces_per_zone(self): 392 | total_spaces = self.grid[0] * self.grid[1] 393 | spaces_per_zone = np.array([np.floor(total_spaces / self.num_zones) 394 | for i in range(self.num_zones)]) 395 | for i in range(total_spaces % self.num_zones): 396 | spaces_per_zone[i] += 1 397 | return spaces_per_zone.astype(int) 398 | 399 | def _get_zones(self): 400 | # Slices the grid into zones by row 401 | spaces_per_zone = self._get_num_spaces_per_zone() 402 | zones = {} 403 | for i, n in enumerate(spaces_per_zone): 404 | x = sum(spaces_per_zone[:i]) 405 | zones[i] = self.loc_permutations[x:x+n] 406 | 407 | zones = self._remove_restaurants_from_zone_locs(zones) 408 | return zones 409 | 410 | def _remove_restaurants_from_zone_locs(self, zones): 411 | for k, v in zones.items(): 412 | for r in self.restaurant_loc: 413 | try: 414 | loc_to_remove = v.index(r) 415 | del zones[k][loc_to_remove] 416 | except ValueError: 417 | pass 418 | return zones 419 | 420 | def step(self, action): 421 | return self._STEP(action) 422 | 423 | def reset(self): 424 | return self._RESET() 425 | -------------------------------------------------------------------------------- /or_gym/envs/supply_chain/inventory_management.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Multi-period inventory management 3 | Hector Perez, Christian Hubbs, Owais Sarwar 4 | 4/14/2020 5 | ''' 6 | 7 | import gym 8 | import itertools 9 | import numpy as np 10 | from scipy.stats import * 11 | from or_gym.utils import assign_env_config 12 | from collections import deque 13 | 14 | class InvManagementMasterEnv(gym.Env): 15 | ''' 16 | The supply chain environment is structured as follows: 17 | 18 | It is a multi-period multi-echelon production-inventory system for a single non-perishable product that is sold only 19 | in discrete quantities. Each stage in the supply chain consists of an inventory holding area and a production area. 20 | The exception are the first stage (retailer: only inventory area) and the last stage (raw material transformation 21 | plant: only production area, with unlimited raw material availability). The inventory holding area holds the inventory 22 | necessary to produce the material at that stage. One unit of inventory produces one unit of product at each stage. 23 | There are lead times between the transfer of material from one stage to the next. The outgoing material from stage i 24 | is the feed material for production at stage i-1. Stages are numbered in ascending order: Stages = {0, 1, ..., M} 25 | (i.e. m = 0 is the retailer). Production at each stage is bounded by the stage's production capacity and the available 26 | inventory. 27 | 28 | At the beginning of each time period, the following sequence of events occurs: 29 | 30 | 0) Stages 0 through M-1 place replenishment orders to their respective suppliers. Replenishment orders are filled 31 | according to available production capacity and available inventory at the respective suppliers. 
32 |     1) Stages 0 through M-1 receive incoming inventory replenishment shipments that have made it down the product pipeline
33 |         after the stage's respective lead time.
34 |     2) Customer demand occurs at stage 0 (retailer). It is sampled from a specified discrete probability distribution.
35 |     3) Demand is filled according to available inventory at stage 0.
36 |     4) Option: one of the following occurs,
37 |         a) Unfulfilled sales and replenishment orders are backlogged at a penalty.
38 |             Note: Backlogged sales take priority in the following period.
39 |         b) Unfulfilled sales and replenishment orders are lost with a goodwill loss penalty.
40 |     5) Surplus inventory is held at each stage at a holding cost.
41 | 
42 |     '''
43 |     def __init__(self, *args, **kwargs):
44 |         '''
45 |         periods = [positive integer] number of periods in simulation.
46 |         I0 = [non-negative integer; dimension |Stages|-1] initial inventories for each stage.
47 |         p = [positive float] unit price for final product.
48 |         r = [non-negative float; dimension |Stages|] unit cost for replenishment orders at each stage.
49 |         k = [non-negative float; dimension |Stages|] backlog cost or goodwill loss (per unit) for unfulfilled orders (demand or replenishment orders).
50 |         h = [non-negative float; dimension |Stages|-1] unit holding cost for excess on-hand inventory at each stage.
51 |             (Note: does not include pipeline inventory).
52 |         c = [positive integer; dimension |Stages|-1] production capacities for each supplier (stages 1 through |Stages|-1).
53 |         L = [non-negative integer; dimension |Stages|-1] lead times between stages.
54 |         backlog = [boolean] are unfulfilled orders backlogged? True = backlogged, False = lost sales.
55 |         dist = [integer] value between 1 and 5. Specifies distribution for customer demand.
56 |             1: poisson distribution
57 |             2: binomial distribution
58 |             3: uniform random integer
59 |             4: geometric distribution
60 |             5: user supplied demand values
61 |         dist_param = [dictionary] named values for parameters fed to statistical distribution.
62 |             poisson: {'mu': }
63 |             binom: {'n': , 'p': }
64 |             randint: {'low': , 'high': }
65 |             geom: {'p': }
66 |         alpha = [float in range (0,1]] discount factor to account for the time value of money
67 |         seed_int = [integer] seed for random state.
68 | user_D = [list] user specified demand for each time period in simulation 69 | ''' 70 | # set default (arbitrary) values when creating environment (if no args or kwargs are given) 71 | self.periods = 30 72 | self.I0 = [100, 100, 200] 73 | self.p = 2 74 | self.r = [1.5, 1.0, 0.75, 0.5] 75 | self.k = [0.10, 0.075, 0.05, 0.025] 76 | self.h = [0.15, 0.10, 0.05] 77 | self.c = [100, 90, 80] 78 | self.L = [3, 5, 10] 79 | self.backlog = True 80 | self.dist = 1 81 | self.dist_param = {'mu': 20} 82 | self.alpha = 0.97 83 | self.seed_int = 0 84 | self.user_D = np.zeros(self.periods) 85 | self._max_rewards = 2000 86 | 87 | # add environment configuration dictionary and keyword arguments 88 | assign_env_config(self, kwargs) 89 | 90 | # input parameters 91 | try: 92 | self.init_inv = np.array(list(self.I0)) 93 | except: 94 | self.init_inv = np.array([self.I0]) 95 | self.num_periods = self.periods 96 | self.unit_price = np.append(self.p,self.r[:-1]) # cost to stage 1 is price to stage 2 97 | self.unit_cost = np.array(self.r) 98 | self.demand_cost = np.array(self.k) 99 | self.holding_cost = np.append(self.h,0) # holding cost at last stage is 0 100 | try: 101 | self.supply_capacity = np.array(list(self.c)) 102 | except: 103 | self.supply_capacity = np.array([self.c]) 104 | try: 105 | self.lead_time = np.array(list(self.L)) 106 | except: 107 | self.lead_time = np.array([self.L]) 108 | self.discount = self.alpha 109 | self.user_D = np.array(list(self.user_D)) 110 | self.num_stages = len(self.init_inv) + 1 111 | m = self.num_stages 112 | lt_max = self.lead_time.max() 113 | 114 | # parameters 115 | # dictionary with options for demand distributions 116 | distributions = {1:poisson, 117 | 2:binom, 118 | 3:randint, 119 | 4:geom, 120 | 5:self.user_D} 121 | 122 | # check inputs 123 | assert np.all(self.init_inv) >=0, "The initial inventory cannot be negative" 124 | try: 125 | assert self.num_periods > 0, "The number of periods must be positive. Num Periods = {}".format(self.num_periods) 126 | except TypeError: 127 | print('\n{}\n'.format(self.num_periods)) 128 | assert np.all(self.unit_price >= 0), "The sales prices cannot be negative." 129 | assert np.all(self.unit_cost >= 0), "The procurement costs cannot be negative." 130 | assert np.all(self.demand_cost >= 0), "The unfulfilled demand costs cannot be negative." 131 | assert np.all(self.holding_cost >= 0), "The inventory holding costs cannot be negative." 132 | assert np.all(self.supply_capacity > 0), "The supply capacities must be positive." 133 | assert np.all(self.lead_time >= 0), "The lead times cannot be negative." 134 | assert (self.backlog == False) | (self.backlog == True), "The backlog parameter must be a boolean." 135 | assert m >= 2, "The minimum number of stages is 2. Please try again" 136 | assert len(self.unit_cost) == m, "The length of r is not equal to the number of stages." 137 | assert len(self.demand_cost) == m, "The length of k is not equal to the number of stages." 138 | assert len(self.holding_cost) == m, "The length of h is not equal to the number of stages - 1." 139 | assert len(self.supply_capacity) == m-1, "The length of c is not equal to the number of stages - 1." 140 | assert len(self.lead_time) == m-1, "The length of L is not equal to the number of stages - 1." 141 | assert self.dist in [1,2,3,4,5], "dist must be one of 1, 2, 3, 4, 5." 142 | if self.dist < 5: 143 | assert distributions[self.dist].cdf(0,**self.dist_param), "Wrong parameters given for distribution." 
144 | else: 145 | assert len(self.user_D) == self.num_periods, "The length of the user specified distribution is not equal to the number of periods." 146 | assert (self.alpha>0) & (self.alpha<=1), "alpha must be in the range (0,1]." 147 | 148 | # select distribution 149 | self.demand_dist = distributions[self.dist] 150 | 151 | # set random generation seed (unless using user demands) 152 | if self.dist < 5: 153 | self.seed(self.seed_int) 154 | 155 | # intialize 156 | self.reset() 157 | 158 | # action space (reorder quantities for each stage; list) 159 | # An action is defined for every stage (except last one) 160 | # self.action_space = gym.spaces.Tuple(tuple( 161 | # [gym.spaces.Box(0, i, shape=(1,)) for i in self.supply_capacity])) 162 | self.pipeline_length = (m-1)*(lt_max+1) 163 | self.action_space = gym.spaces.Box( 164 | low=np.zeros(m-1), high=self.supply_capacity, dtype=np.int16) 165 | # observation space (Inventory position at each echelon, which is any integer value) 166 | self.observation_space = gym.spaces.Box( 167 | low=-np.ones(self.pipeline_length)*self.supply_capacity.max()*self.num_periods*10, 168 | high=np.ones(self.pipeline_length)*self.supply_capacity.max()*self.num_periods, dtype=np.int32) 169 | 170 | # self.observation_space = gym.spaces.Box( 171 | # low=-np.ones(m-1)*self.supply_capacity.max()*self.num_periods*10, 172 | # high=self.supply_capacity*self.num_periods, dtype=np.int32) 173 | 174 | def seed(self,seed=None): 175 | ''' 176 | Set random number generation seed 177 | ''' 178 | # seed random state 179 | if seed != None: 180 | np.random.seed(seed=int(seed)) 181 | 182 | def _RESET(self): 183 | ''' 184 | Create and initialize all variables and containers. 185 | Nomenclature: 186 | I = On hand inventory at the start of each period at each stage (except last one). 187 | T = Pipeline inventory at the start of each period at each stage (except last one). 188 | R = Replenishment order placed at each period at each stage (except last one). 189 | D = Customer demand at each period (at the retailer) 190 | S = Sales performed at each period at each stage. 191 | B = Backlog at each period at each stage. 192 | LS = Lost sales at each period at each stage. 193 | P = Total profit at each stage. 
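        A brief usage sketch (illustrative only; assumes the default
        parameters set in __init__, e.g. lead times L = [3, 5, 10] and
        a 4-stage chain):

            env = InvManagementBacklogEnv()
            state = env.reset()
            # The state vector has length
            # (num_stages - 1) * (max lead time + 1) = 3 * 11 = 33.
            action = env.sample_action()
            state, reward, done, _ = env.step(action)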
194 | ''' 195 | periods = self.num_periods 196 | m = self.num_stages 197 | I0 = self.init_inv 198 | 199 | # simulation result lists 200 | self.I=np.zeros([periods + 1, m - 1]) # inventory at the beginning of each period (last stage not included since iventory is infinite) 201 | self.T=np.zeros([periods + 1, m - 1]) # pipeline inventory at the beginning of each period (no pipeline inventory for last stage) 202 | self.R=np.zeros([periods, m - 1]) # replenishment order (last stage places no replenishment orders) 203 | self.D=np.zeros(periods) # demand at retailer 204 | self.S=np.zeros([periods, m]) # units sold 205 | self.B=np.zeros([periods, m]) # backlog (includes top most production site in supply chain) 206 | self.LS=np.zeros([periods, m]) # lost sales 207 | self.P=np.zeros(periods) # profit 208 | 209 | # initializetion 210 | self.period = 0 # initialize time 211 | self.I[0,:]=np.array(I0) # initial inventory 212 | self.T[0,:]=np.zeros(m-1) # initial pipeline inventory 213 | self.action_log = np.zeros((periods, m-1), dtype=np.int32) 214 | # set state 215 | self._update_state() 216 | 217 | return self.state 218 | 219 | def _update_state(self): 220 | m = self.num_stages - 1 221 | t = self.period 222 | lt_max = self.lead_time.max() 223 | state = np.zeros(m*(lt_max + 1), dtype=np.int32) 224 | # state = np.zeros(m) 225 | if t == 0: 226 | state[:m] = self.I0 227 | else: 228 | state[:m] = self.I[t] 229 | 230 | if t == 0: 231 | pass 232 | elif t >= lt_max: 233 | state[-m*lt_max:] += self.action_log[t-lt_max:t].flatten() 234 | else: 235 | state[-m*(t):] += self.action_log[:t].flatten() 236 | 237 | self.state = state.copy() 238 | 239 | def _update_base_stock_policy_state(self): 240 | ''' 241 | Get current state of the system: Inventory position at each echelon 242 | Inventory at hand + Pipeline inventory - backlog up to the current stage 243 | (excludes last stage since no inventory there, nor replenishment orders placed there). 244 | ''' 245 | n = self.period 246 | m = self.num_stages 247 | if n>=1: 248 | IP = np.cumsum(self.I[n,:] + self.T[n,:] - self.B[n-1,:-1]) 249 | else: 250 | IP = np.cumsum(self.I[n,:] + self.T[n,:]) 251 | self.state = IP 252 | 253 | def _STEP(self,action): 254 | ''' 255 | Take a step in time in the multiperiod inventory management problem. 
256 | action = [integer; dimension |Stages|-1] number of units to request from suppliers (last stage makes no requests) 257 | ''' 258 | R = np.maximum(action, 0).astype(int) 259 | 260 | # get inventory at hand and pipeline inventory at beginning of the period 261 | n = self.period 262 | L = self.lead_time 263 | I = self.I[n,:].copy() # inventory at start of period n 264 | T = self.T[n,:].copy() # pipeline inventory at start of period n 265 | m = self.num_stages # number of stages 266 | 267 | # get production capacities 268 | c = self.supply_capacity # capacity 269 | self.action_log[n] = R.copy() 270 | # available inventory at the m+1 stage (note: last stage has unlimited supply) 271 | Im1 = np.append(I[1:], np.Inf) 272 | 273 | # place replenishment order 274 | if n>=1: # add backlogged replenishment orders to current request 275 | R = R + self.B[n-1,1:] 276 | Rcopy = R.copy() # copy original replenishment quantity 277 | R[R>=c] = c[R>=c] # enforce capacity constraint 278 | R[R>=Im1] = Im1[R>=Im1] # enforce available inventory constraint 279 | self.R[n,:] = R # store R[n] 280 | 281 | # receive inventory replenishment placed L periods ago 282 | RnL = np.zeros(m-1) # initialize 283 | for i in range(m-1): 284 | if n - L[i] >= 0: 285 | RnL[i] = self.R[n-L[i],i].copy() # replenishment placed at the end of period n-L-1 286 | I[i] = I[i] + RnL[i] 287 | 288 | # demand is realized 289 | if self.dist < 5: 290 | D0 = self.demand_dist.rvs(**self.dist_param) 291 | else: 292 | D0 = self.demand_dist[n] # user specified demand 293 | D = D0 # demand 294 | self.D[n] = D0 # store D[n] 295 | 296 | # add previous backlog to demand 297 | if n >= 1: 298 | D = D0 + self.B[n-1,0].copy() # add backlogs to demand 299 | 300 | # units sold 301 | S0 = min(I[0],D) # at retailer 302 | S = np.append(S0,R) # at each stage 303 | self.S[n,:] = S # store S[n] 304 | 305 | # update inventory on hand and pipeline inventory 306 | I = I - S[:-1] # updated inventory at all stages (exclude last stage) 307 | T = T - RnL + R # updated pipeline inventory at all stages (exclude last one) 308 | self.I[n+1,:] = I # store inventory available at start of period n + 1 (exclude last stage) 309 | self.T[n+1,:] = T # store pipeline inventory at start of period n + 1 310 | 311 | # unfulfilled orders 312 | U = np.append(D, Rcopy) - S # unfulfilled demand and replenishment orders 313 | 314 | # backlog and lost sales 315 | if self.backlog: 316 | B = U 317 | LS = np.zeros(m) 318 | else: 319 | LS = U # lost sales 320 | B = np.zeros(m) 321 | self.B[n,:] = B # store B[n] 322 | self.LS[n,:] = LS # store LS[n] 323 | 324 | # calculate profit 325 | p = self.unit_price 326 | r = self.unit_cost 327 | k = self.demand_cost 328 | h = self.holding_cost 329 | a = self.discount 330 | II = np.append(I,0) # augment inventory so that last has no onsite inventory 331 | RR = np.append(R,S[-1]) # augment replenishment orders to include production cost at last stage 332 | P = a**n*np.sum(p*S - (r*RR + k*U + h*II)) # discounted profit in period n 333 | # P = a**n*np.sum(p*S - (r*RR + k*U + h*I)) 334 | self.P[n] = P # store P 335 | 336 | # update period 337 | self.period += 1 338 | 339 | # update stae 340 | self._update_state() 341 | 342 | # set reward (profit from current timestep) 343 | reward = P 344 | 345 | # determine if simulation should terminate 346 | if self.period >= self.num_periods: 347 | done = True 348 | else: 349 | done = False 350 | 351 | return self.state, reward, done, {} 352 | 353 | def sample_action(self): 354 | ''' 355 | Generate an action by sampling 
from the action_space 356 | ''' 357 | return self.action_space.sample() 358 | 359 | def base_stock_action(self,z): 360 | ''' 361 | Sample action (number of units to request) based on a base-stock policy (order up to z policy) 362 | z = [integer list; dimension |Stages| - 1] base stock level (no inventory at the last stage) 363 | ''' 364 | n = self.period 365 | c = self.supply_capacity 366 | m = self.num_stages 367 | IP = self._update_base_stock_policy_state() # extract inventory position (current state) 368 | 369 | try: 370 | dimz = len(z) 371 | except: 372 | dimz = 1 373 | assert dimz == m-1, "Wrong dimension on base stock level vector. Should be # Stages - 1." 374 | 375 | # calculate total inventory position at the beginning of period n 376 | R = z - IP # replenishmet order to reach zopt 377 | 378 | # check if R can actually be fulfilled (capacity and inventory constraints) 379 | Im1 = np.append(self.I[n,1:], np.Inf) # available inventory at the m+1 stage 380 | # NOTE: last stage has unlimited raw materials 381 | Rpos = np.column_stack((np.zeros(len(R)),R)) # augmented materix to get replenishment only if positive 382 | A = np.column_stack((c, np.max(Rpos,axis=1), Im1)) # augmented matrix with c, R, and I_m+1 as columns 383 | 384 | R = np.min(A, axis = 1) # replenishmet order to reach zopt (capacity constrained) 385 | 386 | return R 387 | 388 | def step(self, action): 389 | return self._STEP(action) 390 | 391 | def reset(self): 392 | return self._RESET() 393 | 394 | class InvManagementBacklogEnv(InvManagementMasterEnv): 395 | def __init__(self, *args, **kwargs): 396 | super().__init__(*args, **kwargs) 397 | 398 | class InvManagementLostSalesEnv(InvManagementMasterEnv): 399 | def __init__(self, *args, **kwargs): 400 | super().__init__(*args, **kwargs) 401 | self.backlog = False 402 | self.observation_space = gym.spaces.Box( 403 | low=np.zeros(self.pipeline_length), # Never goes negative without backlog 404 | high=np.ones(self.pipeline_length)*self.supply_capacity.max()*self.num_periods, dtype=np.int32) -------------------------------------------------------------------------------- /or_gym/envs/classic_or/knapsack.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces, logger 4 | from gym.utils import seeding 5 | from or_gym.utils import assign_env_config 6 | import copy 7 | 8 | class KnapsackEnv(gym.Env): 9 | ''' 10 | Unbounded Knapsack Problem 11 | 12 | The Knapsack Problem (KP) is a combinatorial optimization problem which 13 | requires the user to select from a range of goods of different values and 14 | weights in order to maximize the value of the selected items within a 15 | given weight limit. This version is unbounded meaning that we can select 16 | items without limit. 17 | 18 | The episodes proceed by selecting items and placing them into the 19 | knapsack one at a time until the weight limit is reached or exceeded, at 20 | which point the episode ends. 21 | 22 | Observation: 23 | Type: Tuple, Discrete 24 | 0: list of item weights 25 | 1: list of item values 26 | 2: maximum weight of the knapsack 27 | 3: current weight in knapsack 28 | 29 | Actions: 30 | Type: Discrete 31 | 0: Place item 0 into knapsack 32 | 1: Place item 1 into knapsack 33 | 2: ... 34 | 35 | Reward: 36 | Value of item successfully placed into knapsack or 0 if the item 37 | doesn't fit, at which point the episode ends. 38 | 39 | Starting State: 40 | Lists of available items and empty knapsack. 
41 | 42 | Episode Termination: 43 | Full knapsack or selection that puts the knapsack over the limit. 44 | ''' 45 | 46 | # Internal list of placed items for better rendering 47 | _collected_items = [] 48 | 49 | def __init__(self, *args, **kwargs): 50 | # Generate data with consistent random seed to ensure reproducibility 51 | self.N = 200 52 | self.max_weight = 200 53 | self.current_weight = 0 54 | self._max_reward = 10000 55 | self.mask = True 56 | self.seed = 0 57 | self.item_numbers = np.arange(self.N) 58 | self.item_weights = np.random.randint(1, 100, size=self.N) 59 | self.item_values = np.random.randint(0, 100, size=self.N) 60 | self.over_packed_penalty = 0 61 | self.randomize_params_on_reset = False 62 | self._collected_items.clear() 63 | # Add env_config, if any 64 | assign_env_config(self, kwargs) 65 | self.set_seed() 66 | 67 | obs_space = spaces.Box( 68 | 0, self.max_weight, shape=(2*self.N + 1,), dtype=np.int32) 69 | self.action_space = spaces.Discrete(self.N) 70 | if self.mask: 71 | self.observation_space = spaces.Dict({ 72 | "action_mask": spaces.Box(0, 1, shape=(self.N,), dtype=np.uint8), 73 | "avail_actions": spaces.Box(0, 1, shape=(self.N,), dtype=np.uint8), 74 | "state": obs_space 75 | }) 76 | else: 77 | self.observation_space = spaces.Box( 78 | 0, self.max_weight, shape=(2, self.N + 1), dtype=np.int32) 79 | 80 | self.reset() 81 | 82 | def _STEP(self, item): 83 | # Check that item will fit 84 | if self.item_weights[item] + self.current_weight <= self.max_weight: 85 | self.current_weight += self.item_weights[item] 86 | reward = self.item_values[item] 87 | self._collected_items.append(item) 88 | if self.current_weight == self.max_weight: 89 | done = True 90 | else: 91 | done = False 92 | else: 93 | # End trial if over weight 94 | reward = self.over_packed_penalty 95 | done = True 96 | 97 | self._update_state() 98 | return self.state, reward, done, {} 99 | 100 | def _get_obs(self): 101 | return self.state 102 | 103 | def _update_state(self): 104 | if self.mask: 105 | mask = np.where(self.current_weight + self.item_weights > self.max_weight, 0, 1).astype(np.uint8) 106 | state = np.hstack([ 107 | self.item_weights, 108 | self.item_values, 109 | np.array([self.current_weight]) 110 | ], dtype=np.int32) 111 | self.state = { 112 | "action_mask": mask, 113 | "avail_actions": np.ones(self.N, dtype=np.uint8), 114 | "state": state 115 | } 116 | else: 117 | state = np.vstack([ 118 | self.item_weights, 119 | self.item_values], dtype=np.int32) 120 | self.state = np.hstack([ 121 | state, 122 | np.array([ 123 | [self.max_weight], 124 | [self.current_weight]]) 125 | ]) 126 | 127 | def _RESET(self): 128 | if self.randomize_params_on_reset: 129 | self.item_weights = np.random.randint(1, 100, size=self.N) 130 | self.item_values = np.random.randint(0, 100, size=self.N) 131 | self.current_weight = 0 132 | self._collected_items.clear() 133 | self._update_state() 134 | return self.state 135 | 136 | def sample_action(self): 137 | return np.random.choice(self.item_numbers) 138 | 139 | def set_seed(self, seed=None): 140 | if seed == None: 141 | seed = np.random.randint(0, np.iinfo(np.int32).max) 142 | self.np_random, seed = seeding.np_random(seed) 143 | return [seed] 144 | 145 | def reset(self): 146 | return self._RESET() 147 | 148 | def step(self, action): 149 | return self._STEP(action) 150 | 151 | def render(self): 152 | total_value = 0 153 | total_weight = 0 154 | for i in range(self.N) : 155 | if i in self._collected_items : 156 | total_value += self.item_values[i] 157 | total_weight += 
self.item_weights[i] 158 | print(self._collected_items, total_value, total_weight) 159 | 160 | # RlLib requirement: Make sure you either return a uint8/w x h x 3 (RGB) image or handle rendering in a window and then return `True`. 161 | return True 162 | 163 | class BinaryKnapsackEnv(KnapsackEnv): 164 | ''' 165 | Binary Knapsack Problem 166 | 167 | The Binary or 0-1 KP allows selection of each item only once or not at 168 | all. 169 | 170 | The episodes proceed by selecting items and placing them into the 171 | knapsack one at a time until the weight limit is reached or exceeded, at 172 | which point the episode ends. 173 | 174 | Observation: 175 | Type: Tuple, Discrete 176 | 0: list of item weights 177 | 1: list of item values 178 | 2: list of item limits 179 | 3: maximum weight of the knapsack 180 | 4: current weight in knapsack 181 | 182 | Actions: 183 | Type: Discrete 184 | 0: Place item 0 into knapsack 185 | 1: Place item 1 into knapsack 186 | 2: ... 187 | 188 | Reward: 189 | Value of item successfully placed into knapsack or 0 if the item 190 | doesn't fit, at which point the episode ends. 191 | 192 | Starting State: 193 | Lists of available items and empty knapsack. 194 | 195 | Episode Termination: 196 | Full knapsack or selection that puts the knapsack over the limit. 197 | ''' 198 | def __init__(self, *args, **kwargs): 199 | super().__init__() 200 | self.item_weights = np.random.randint(1, 100, size=self.N) 201 | self.item_values = np.random.randint(0, 100, size=self.N) 202 | assign_env_config(self, kwargs) 203 | 204 | obs_space = spaces.Box( 205 | 0, self.max_weight, shape=(3, self.N + 1), dtype=np.int32) 206 | if self.mask: 207 | self.observation_space = spaces.Dict({ 208 | "action_mask": spaces.Box(0, 1, shape=(len(self.item_limits),), dtype=np.uint8), 209 | "avail_actions": spaces.Box(0, 1, shape=(len(self.item_limits),), dtype=np.uint8), 210 | "state": obs_space 211 | }) 212 | else: 213 | self.observation_space = obs_space 214 | 215 | self.reset() 216 | 217 | def _STEP(self, item): 218 | # Check item limit 219 | if self.item_limits[item] > 0: 220 | # Check that item will fit 221 | if self.item_weights[item] + self.current_weight <= self.max_weight: 222 | self.current_weight += self.item_weights[item] 223 | reward = self.item_values[item] 224 | if self.current_weight == self.max_weight: 225 | done = True 226 | else: 227 | done = False 228 | self._update_state(item) 229 | else: 230 | # End if over weight 231 | reward = 0 232 | done = True 233 | else: 234 | # End if item is unavailable 235 | reward = 0 236 | done = True 237 | 238 | return self.state, reward, done, {} 239 | 240 | def _update_state(self, item=None): 241 | if item is not None: 242 | self.item_limits[item] -= 1 243 | state_items = np.vstack([ 244 | self.item_weights, 245 | self.item_values, 246 | self.item_limits 247 | ], dtype=np.int32) 248 | state = np.hstack([ 249 | state_items, 250 | np.array([[self.max_weight], 251 | [self.current_weight], 252 | [0] # Serves as place holder 253 | ]) 254 | ], dtype=np.int32) 255 | if self.mask: 256 | mask = np.where(self.current_weight + self.item_weights > self.max_weight, 0, 1).astype(np.uint8) 257 | mask = np.where(self.item_limits > 0, mask, 0) 258 | self.state = { 259 | "action_mask": mask, 260 | "avail_actions": np.ones(self.N, dtype=np.uint8), 261 | "state": state 262 | } 263 | else: 264 | self.state = state.copy() 265 | 266 | def sample_action(self): 267 | return np.random.choice( 268 | self.item_numbers[np.where(self.item_limits!=0)]) 269 | 270 | def _RESET(self): 271 | 
if self.randomize_params_on_reset: 272 | self.item_weights = np.random.randint(1, 100, size=self.N) 273 | self.item_values = np.random.randint(0, 100, size=self.N) 274 | self.current_weight = 0 275 | self.item_limits = np.ones(self.N, dtype=np.int32) 276 | self._update_state() 277 | return self.state 278 | 279 | class BoundedKnapsackEnv(KnapsackEnv): 280 | ''' 281 | Bounded Knapsack Problem 282 | 283 | The Knapsack Problem (KP) is a combinatorial optimization problem which 284 | requires the user to select from a range of goods of different values and 285 | weights in order to maximize the value of the selected items within a 286 | given weight limit. This version is bounded meaning each item can be 287 | selected a limited number of times. 288 | 289 | The episodes proceed by selecting items and placing them into the 290 | knapsack one at a time until the weight limit is reached or exceeded, at 291 | which point the episode ends. 292 | 293 | Observation: 294 | Type: Tuple, Discrete 295 | 0: list of item weights 296 | 1: list of item values 297 | 2: list of item limits 298 | 3: maximum weight of the knapsack 299 | 4: current weight in knapsack 300 | 301 | Actions: 302 | Type: Discrete 303 | 0: Place item 0 into knapsack 304 | 1: Place item 1 into knapsack 305 | 2: ... 306 | 307 | Reward: 308 | Value of item successfully placed into knapsack or 0 if the item 309 | doesn't fit, at which point the episode ends. 310 | 311 | Starting State: 312 | Lists of available items and empty knapsack. 313 | 314 | Episode Termination: 315 | Full knapsack or selection that puts the knapsack over the limit. 316 | ''' 317 | def __init__(self, *args, **kwargs): 318 | self.N = 200 319 | self.item_limits_init = np.random.randint(1, 10, size=self.N, dtype=np.int32) 320 | self.item_limits = self.item_limits_init.copy() 321 | super().__init__() 322 | self.item_weights = np.random.randint(1, 100, size=self.N, dtype=np.int32) 323 | self.item_values = np.random.randint(0, 100, size=self.N, dtype=np.int32) 324 | 325 | assign_env_config(self, kwargs) 326 | 327 | obs_space = spaces.Box( 328 | 0, self.max_weight, shape=(3, self.N + 1), dtype=np.int32) 329 | if self.mask: 330 | self.observation_space = spaces.Dict({ 331 | "action_mask": spaces.Box(0, 1, shape=(len(self.item_limits),), dtype=np.uint8), 332 | "avail_actions": spaces.Box(0, 1, shape=(len(self.item_limits),), dtype=np.uint8), 333 | "state": obs_space 334 | }) 335 | else: 336 | self.observation_space = obs_space 337 | 338 | def _STEP(self, item): 339 | # Check item limit 340 | if self.item_limits[item] > 0: 341 | # Check that item will fit 342 | if self.item_weights[item] + self.current_weight <= self.max_weight: 343 | self.current_weight += self.item_weights[item] 344 | reward = self.item_values[item] 345 | if self.current_weight == self.max_weight: 346 | done = True 347 | else: 348 | done = False 349 | self._update_state(item) 350 | else: 351 | # End if over weight 352 | reward = 0 353 | done = True 354 | else: 355 | # End if item is unavailable 356 | reward = 0 357 | done = True 358 | 359 | return self.state, reward, done, {} 360 | 361 | def _update_state(self, item=None): 362 | if item is not None: 363 | self.item_limits[item] -= 1 364 | state_items = np.vstack([ 365 | self.item_weights, 366 | self.item_values, 367 | self.item_limits 368 | ], dtype=np.int32) 369 | state = np.hstack([ 370 | state_items, 371 | np.array([[self.max_weight], 372 | [self.current_weight], 373 | [0] # Serves as place holder 374 | ], dtype=np.int32) 375 | ]) 376 | if self.mask: 377 | 
mask = np.where(self.current_weight + self.item_weights > self.max_weight, 0, 1).astype(np.uint8) 378 | mask = np.where(self.item_limits > 0, mask, 0) 379 | self.state = { 380 | "action_mask": mask, 381 | "avail_actions": np.ones(self.N, dtype=np.uint8), 382 | "state": state 383 | } 384 | else: 385 | self.state = state.copy() 386 | 387 | def sample_action(self): 388 | return np.random.choice( 389 | self.item_numbers[np.where(self.item_limits!=0)]) 390 | 391 | def _RESET(self): 392 | if self.randomize_params_on_reset: 393 | self.item_weights = np.random.randint(1, 100, size=self.N, dtype=np.int32) 394 | self.item_values = np.random.randint(0, 100, size=self.N, dtype=np.int32) 395 | self.item_limits = np.random.randint(1, 10, size=self.N, dtype=np.int32) 396 | else: 397 | self.item_limits = self.item_limits_init.copy() 398 | 399 | self.current_weight = 0 400 | self._update_state() 401 | return self.state 402 | 403 | class OnlineKnapsackEnv(BoundedKnapsackEnv): 404 | ''' 405 | Online Knapsack Problem 406 | 407 | The Knapsack Problem (KP) is a combinatorial optimization problem which 408 | requires the user to select from a range of goods of different values and 409 | weights in order to maximize the value of the selected items within a 410 | given weight limit. This version is online meaning each item is randomly 411 | presented to the algorithm one at a time, at which point the algorithm 412 | can either accept or reject the item. After seeing a fixed number of 413 | items are shown, the episode terminates. If the weight limit is reached 414 | before the episode ends, then it terminates early. 415 | 416 | Observation: 417 | Type: Tuple, Discrete 418 | 0: list of item weights 419 | 1: list of item values 420 | 2: list of item limits 421 | 3: maximum weight of the knapsack 422 | 4: current weight in knapsack 423 | 424 | 425 | Actions: 426 | Type: Discrete 427 | 0: Reject item 428 | 1: Place item into knapsack 429 | 430 | Reward: 431 | Value of item successfully placed into knapsack or 0 if the item 432 | doesn't fit, at which point the episode ends. 433 | 434 | Starting State: 435 | Lists of available items and empty knapsack. 436 | 437 | Episode Termination: 438 | Full knapsack, selection that puts the knapsack over the limit, or 439 | the number of items to be drawn has been reached. 
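    Example:
        An illustrative rollout sketch (assumes the default parameters;
        with the default mask=True the observation is a dict holding
        'state', 'action_mask', and 'avail_actions'):

            env = OnlineKnapsackEnv()
            obs, done = env.reset(), False
            while not done:
                # 1 accepts the currently offered item, 0 rejects it
                action = env.sample_action()
                obs, reward, done, _ = env.step(action)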
440 | ''' 441 | def __init__(self, *args, **kwargs): 442 | BoundedKnapsackEnv.__init__(self) 443 | assign_env_config(self, kwargs) 444 | self.action_space = spaces.Discrete(2) 445 | 446 | obs_space = spaces.Box(0, self.max_weight, shape=(4,), dtype=np.int32) 447 | if self.mask: 448 | self.observation_space = spaces.Dict({ 449 | 'state': obs_space, 450 | 'avail_actions': spaces.Box(0, 1, shape=(2,), dtype=np.uint8), 451 | 'action_mask': spaces.Box(0, 1, shape=(2,), dtype=np.uint8) 452 | }) 453 | else: 454 | self.observation_space = obs_space 455 | 456 | self.step_counter = 0 457 | self.step_limit = 50 458 | 459 | self.state = self.reset() 460 | self._max_reward = 600 461 | 462 | def _STEP(self, action): 463 | if bool(action): 464 | # Check that item will fit 465 | if self.item_weights[self.current_item] + self.current_weight <= self.max_weight: 466 | self.current_weight += self.item_weights[self.current_item] 467 | reward = self.item_values[self.current_item] 468 | if self.current_weight == self.max_weight: 469 | done = True 470 | else: 471 | done = False 472 | else: 473 | # End if over weight 474 | reward = 0 475 | done = True 476 | else: 477 | reward = 0 478 | done = False 479 | 480 | self._update_state() 481 | self.step_counter += 1 482 | if self.step_counter >= self.step_limit: 483 | done = True 484 | 485 | return self.state, reward, done, {} 486 | 487 | def _update_state(self): 488 | self.current_item = np.random.choice(self.item_numbers, p=self.item_probs) 489 | current_item_weight = self.item_weights[self.current_item] 490 | state = np.array([ 491 | self.current_weight, 492 | self.current_item, 493 | current_item_weight, 494 | self.item_values[self.current_item] 495 | ],) 496 | if self.mask: 497 | mask = np.ones(2, dtype=np.uint8) 498 | if current_item_weight + self.current_weight > self.max_weight: 499 | mask[1] = 0 500 | self.state = { 501 | 'state': state, 502 | 'avail_actions': np.ones(2, dtype=np.uint8), 503 | 'action_mask': mask 504 | } 505 | else: 506 | self.state = state 507 | 508 | def sample_action(self): 509 | return np.random.choice([0, 1]) 510 | 511 | def _RESET(self): 512 | if self.randomize_params_on_reset: 513 | self.item_weights = np.random.randint(1, 100, size=self.N, dtype=np.int32) 514 | self.item_values = np.random.randint(0, 100, size=self.N, dtype=np.int32) 515 | self.item_limits = np.random.randint(1, 10, size=self.N, dtype=np.int32) 516 | else: 517 | self.item_limits = self.item_limits_init.copy() 518 | 519 | if not hasattr(self, 'item_probs'): 520 | self.item_probs = self.item_limits_init / self.item_limits_init.sum() 521 | self.current_weight = 0 522 | self.step_counter = 0 523 | self._update_state() 524 | return self.state -------------------------------------------------------------------------------- /or_gym/envs/supply_chain/network_management.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Multi-period inventory management 3 | Hector Perez, Christian Hubbs, Can Li 4 | 9/14/2020 5 | ''' 6 | 7 | import gym 8 | import itertools 9 | import numpy as np 10 | import networkx as nx 11 | import pandas as pd 12 | from scipy.stats import * 13 | from or_gym.utils import assign_env_config 14 | from collections import deque 15 | import matplotlib.pyplot as plt 16 | 17 | class NetInvMgmtMasterEnv(gym.Env): 18 | ''' 19 | The supply network environment is structured as follows: 20 | 21 | It is a multi-period multi-node production-inventory system for 22 | a single non-perishable product that is sold in discrete 
quantities.
23 |     Two main types of nodes exist: 1) production nodes, which have an
24 |     inventory holding area and a manufacturing area, and 2) distribution
25 |     nodes, which only have an inventory holding area. Retail nodes are
26 |     considered distribution nodes. Other node types in the network are
27 |     raw material nodes (source nodes), which have an unlimited supply
28 |     of raw materials, and market nodes (sink nodes), which generate an
29 |     uncertain demand on their respective retailers in each period.
30 | 
31 |     Within production nodes, the inventory holding area holds the inventory
32 |     necessary to produce the respective intermediate material at that node.
33 |     Yield ratios are specified at each production stage, relating the amount
34 |     of material produced from one unit of inventory. Production at each node
35 |     is bounded by the node's production capacity and the available inventory.
36 | 
37 |     Lead times between neighboring nodes exist and are associated with the edges
38 |     connecting them.
39 | 
40 |     At the beginning of each time period, the following sequence of events occurs:
41 | 
42 |     0) Each node places replenishment orders with its immediate suppliers.
43 |     Replenishment orders are filled according to the available production capacity
44 |     and available inventory at the suppliers. There is a cost associated with
45 |     each order request.
46 |     1) Incoming inventory replenishment shipments that have made it down the network
47 |     pipeline (after the associated lead time) are received at each node.
48 |     2) Market demands occur at the retail nodes. Demands are sampled from a
49 |     specified discrete probability distribution. Demands are filled according
50 |     to the available inventory at the retailers.
51 |     3) Option: one of the following occurs,
52 |         a) Unfulfilled sales are backlogged at a penalty.
53 |             Note: Backlogged orders are added to the next period's market demand.
54 |         b) Unfulfilled sales and replenishment orders are lost with a
55 |             goodwill-loss penalty.
56 |     4) Surplus inventory is held at each stage at a holding cost.
57 |     Pipeline inventories also incur a cost for each period spent in the pipeline.
58 | 
59 |     '''
60 |     def __init__(self, *args, **kwargs):
61 |         '''
62 |         num_periods = number of periods in the simulation.
63 |         Node specific parameters:
64 |             - I0 = initial inventory.
65 |             - C = production capacity.
66 |             - v = production yield in the range (0, 1].
67 |             - o = unit operating cost (feed-based).
68 |             - h = unit holding cost for excess on-hand inventory.
69 |         Edge specific parameters:
70 |             - L = lead time between adjacent nodes.
71 |             - p = unit price to send material between adjacent nodes (purchase price/reorder cost).
72 |             - b = unit backlog cost or goodwill-loss penalty for unfulfilled market demand between an adjacent retailer and market.
73 |             - g = unit holding cost for pipeline inventory on a specified edge.
74 |             - prob_dist = probability distribution function on a (retailer, market) edge.
75 |             - demand_dist = demand distribution for a (retailer, market) edge. Two options:
76 |                 - use a scipy.stats distribution object whose rvs method is called each
77 |                   period with the parameters given in dist_param, e.g. poisson with dist_param = {'mu': 20}.
78 |                 - use a list of user specified demands for each period (see user_D).
79 |         backlog = Are unfulfilled orders backlogged? True = backlogged, False = lost sales.
80 |         demand_dist = distribution function for customer demand (e.g. poisson, binomial, uniform, geometric, etc.)
81 |         dist_param = named values for the parameters fed to the statistical distribution, e.g.
82 | poisson: {'mu': } 83 | binom: {'n': , 84 | 'p': } 85 | raindint: {'low' = , 'high': } 86 | geom: {'p': } 87 | alpha = discount factor in the range (0,1] that accounts for the time value of money 88 | seed_int = integer seed for random state. 89 | user_D = dictionary containing user specified demand (list) for each (retail, market) pair at 90 | each time period in the simulation. If all zeros, ignored; otherwise, demands will be taken from this list. 91 | sample_path = dictionary specifying if is user_D (for each (retail, market) pair) is sampled from demand_dist. 92 | ''' 93 | # set default (arbitrary) values when creating environment (if no args or kwargs are given) 94 | self._max_rewards = 2000 95 | self.num_periods = 30 96 | self.backlog = True 97 | self.alpha = 1.00 98 | self.seed_int = 0 99 | self.user_D = {(1,0): np.zeros(self.num_periods)} 100 | self.sample_path = {(1,0): False} 101 | self._max_rewards = 2000 102 | 103 | # create graph 104 | self.graph = nx.DiGraph() 105 | # Market 106 | self.graph.add_nodes_from([0]) 107 | # Retailer 108 | self.graph.add_nodes_from([1], I0 = 100, 109 | h = 0.030) 110 | # Distributors 111 | self.graph.add_nodes_from([2], I0 = 110, 112 | h = 0.020) 113 | self.graph.add_nodes_from([3], I0 = 80, 114 | h = 0.015) 115 | # Manufacturers 116 | self.graph.add_nodes_from([4], I0 = 400, 117 | C = 90, 118 | o = 0.010, 119 | v = 1.000, 120 | h = 0.012) 121 | self.graph.add_nodes_from([5], I0 = 350, 122 | C = 90, 123 | o = 0.015, 124 | v = 1.000, 125 | h = 0.013) 126 | self.graph.add_nodes_from([6], I0 = 380, 127 | C = 80, 128 | o = 0.012, 129 | v = 1.000, 130 | h = 0.011) 131 | # Raw materials 132 | self.graph.add_nodes_from([7, 8]) 133 | # Links 134 | self.graph.add_edges_from([(1,0,{'p': 2.000, 135 | 'b': 0.100, 136 | 'demand_dist': poisson, 137 | 'dist_param': {'mu': 20}}), 138 | (2,1,{'L': 5, 139 | 'p': 1.500, 140 | 'g': 0.010}), 141 | (3,1,{'L': 3, 142 | 'p': 1.600, 143 | 'g': 0.015}), 144 | (4,2,{'L': 8, 145 | 'p': 1.000, 146 | 'g': 0.008}), 147 | (4,3,{'L': 10, 148 | 'p': 0.800, 149 | 'g': 0.006}), 150 | (5,2,{'L': 9, 151 | 'p': 0.700, 152 | 'g': 0.005}), 153 | (6,2,{'L': 11, 154 | 'p': 0.750, 155 | 'g': 0.007}), 156 | (6,3,{'L': 12, 157 | 'p': 0.800, 158 | 'g': 0.004}), 159 | (7,4,{'L': 0, 160 | 'p': 0.150, 161 | 'g': 0.000}), 162 | (7,5,{'L': 1, 163 | 'p': 0.050, 164 | 'g': 0.005}), 165 | (8,5,{'L': 2, 166 | 'p': 0.070, 167 | 'g': 0.002}), 168 | (8,6,{'L': 0, 169 | 'p': 0.200, 170 | 'g': 0.000})]) 171 | 172 | # add environment configuration dictionary and keyword arguments 173 | assign_env_config(self, kwargs) 174 | 175 | # Save user_D and sample_path to graph metadata 176 | for link in self.user_D.keys(): 177 | d = self.user_D[link] 178 | if np.sum(d) != 0: 179 | self.graph.edges[link]['user_D'] = d 180 | if link in self.sample_path.keys(): 181 | self.graph.edges[link]['sample_path'] = self.sample_path[link] 182 | else: 183 | # Placeholder to avoid key errors 184 | self.graph.edges[link]['user_D'] = 0 185 | 186 | self.num_nodes = self.graph.number_of_nodes() 187 | self.adjacency_matrix = np.vstack(self.graph.edges()) 188 | # Set node levels 189 | self.levels = {} 190 | self.levels['retailer'] = np.array([1]) 191 | self.levels['distributor'] = np.unique(np.hstack( 192 | [list(self.graph.predecessors(i)) for i in self.levels['retailer']])) 193 | self.levels['manufacturer'] = np.unique(np.hstack( 194 | [list(self.graph.predecessors(i)) for i in self.levels['distributor']])) 195 | self.levels['raw_materials'] = np.unique(np.hstack( 196 | 
[list(self.graph.predecessors(i)) for i in self.levels['manufacturer']])) 197 | 198 | self.level_col = {'retailer': 0, 199 | 'distributor': 1, 200 | 'manufacturer': 2, 201 | 'raw_materials': 3} 202 | 203 | self.market = [j for j in self.graph.nodes() if len(list(self.graph.successors(j))) == 0] 204 | self.distrib = [j for j in self.graph.nodes() if 'C' not in self.graph.nodes[j] and 'I0' in self.graph.nodes[j]] 205 | self.retail = [j for j in self.graph.nodes() if len(set.intersection(set(self.graph.successors(j)), set(self.market))) > 0] 206 | self.factory = [j for j in self.graph.nodes() if 'C' in self.graph.nodes[j]] 207 | self.rawmat = [j for j in self.graph.nodes() if len(list(self.graph.predecessors(j))) == 0] 208 | self.main_nodes = np.sort(self.distrib + self.factory) 209 | self.reorder_links = [e for e in self.graph.edges() if 'L' in self.graph.edges[e]] #exclude links to markets (these cannot have lead time 'L') 210 | self.retail_links = [e for e in self.graph.edges() if 'L' not in self.graph.edges[e]] #links joining retailers to markets 211 | self.network_links = [e for e in self.graph.edges()] #all links involved in sale in the network 212 | 213 | # check inputs 214 | assert set(self.graph.nodes()) == set.union(set(self.market), 215 | set(self.distrib), 216 | set(self.factory), 217 | set(self.rawmat)), "The union of market, distribution, factory, and raw material nodes is not equal to the system nodes." 218 | for j in self.graph.nodes(): 219 | if 'I0' in self.graph.nodes[j]: 220 | assert self.graph.nodes[j]['I0'] >= 0, "The initial inventory cannot be negative for node {}.".format(j) 221 | if 'h' in self.graph.nodes[j]: 222 | assert self.graph.nodes[j]['h'] >= 0, "The inventory holding costs cannot be negative for node {}.".format(j) 223 | if 'C' in self.graph.nodes[j]: 224 | assert self.graph.nodes[j]['C'] > 0, "The production capacity must be positive for node {}.".format(j) 225 | if 'o' in self.graph.nodes[j]: 226 | assert self.graph.nodes[j]['o'] >= 0, "The operating costs cannot be negative for node {}.".format(j) 227 | if 'v' in self.graph.nodes[j]: 228 | assert self.graph.nodes[j]['v'] > 0 and self.graph.nodes[j]['v'] <= 1, "The production yield must be in the range (0, 1] for node {}.".format(j) 229 | for e in self.graph.edges(): 230 | if 'L' in self.graph.edges[e]: 231 | assert self.graph.edges[e]['L'] >= 0, "The lead time joining nodes {} cannot be negative.".format(e) 232 | if 'p' in self.graph.edges[e]: 233 | assert self.graph.edges[e]['p'] >= 0, "The sales price joining nodes {} cannot be negative.".format(e) 234 | if 'b' in self.graph.edges[e]: 235 | assert self.graph.edges[e]['b'] >= 0, "The unfulfilled demand costs joining nodes {} cannot be negative.".format(e) 236 | if 'g' in self.graph.edges[e]: 237 | assert self.graph.edges[e]['g'] >= 0, "The pipeline inventory holding costs joining nodes {} cannot be negative.".format(e) 238 | if 'sample_path' in self.graph.edges[e]: 239 | assert isinstance(self.graph.edges[e]['sample_path'], bool), "When specifying if a user specified demand joining (retailer, market): {} is sampled from a distribution, sample_path must be a Boolean.".format(e) 240 | if 'demand_dist' in self.graph.edges[e]: 241 | dist = self.graph.edges[e]['demand_dist'] #extract distribution 242 | assert dist.cdf(0,**self.graph.edges[e]['dist_param']), "Wrong parameters passed to the demand distribution joining (retailer, market): {}.".format(e) 243 | assert self.backlog == False or self.backlog == True, "The backlog parameter must be a boolean." 
244 | assert self.graph.number_of_nodes() >= 2, "The minimum number of nodes is 2. Please try again" 245 | assert self.alpha>0 and self.alpha<=1, "alpha must be in the range (0, 1]." 246 | 247 | # set random generation seed (unless using user demands) 248 | self.seed(self.seed_int) 249 | 250 | # action space (reorder quantities for each node for each supplier; list) 251 | # An action is defined for every node 252 | num_reorder_links = len(self.reorder_links) 253 | self.lt_max = np.max([self.graph.edges[e]['L'] for e in self.graph.edges() if 'L' in self.graph.edges[e]]) 254 | self.init_inv_max = np.max([self.graph.nodes[j]['I0'] for j in self.graph.nodes() if 'I0' in self.graph.nodes[j]]) 255 | self.capacity_max = np.max([self.graph.nodes[j]['C'] for j in self.graph.nodes() if 'C' in self.graph.nodes[j]]) 256 | self.pipeline_length = sum([self.graph.edges[e]['L'] 257 | for e in self.graph.edges() if 'L' in self.graph.edges[e]]) 258 | self.lead_times = {e: self.graph.edges[e]['L'] 259 | for e in self.graph.edges() if 'L' in self.graph.edges[e]} 260 | self.obs_dim = self.pipeline_length + len(self.main_nodes) + len(self.retail_links) 261 | # self.pipeline_length = len(self.main_nodes)*(self.lt_max+1) 262 | self.action_space = gym.spaces.Box( 263 | low=np.zeros(num_reorder_links), 264 | high=np.ones(num_reorder_links)*(self.init_inv_max + self.capacity_max*self.num_periods), 265 | dtype=np.float64) 266 | # observation space (total inventory at each node, which is any integer value) 267 | self.observation_space = gym.spaces.Box( 268 | low=np.ones(self.obs_dim)*np.iinfo(np.int32).min, 269 | high=np.ones(self.obs_dim)*np.iinfo(np.int32).max, 270 | dtype=np.float64) 271 | # low=-np.ones(self.pipeline_length)*(self.init_inv_max + self.capacity_max*self.num_periods)*10, 272 | # high=np.ones(self.pipeline_length)*(self.init_inv_max + self.capacity_max*self.num_periods), 273 | # dtype=np.int32) 274 | 275 | # intialize 276 | self.reset() 277 | 278 | def seed(self,seed=None): 279 | ''' 280 | Set random number generation seed 281 | ''' 282 | # seed random state 283 | if seed != None: 284 | np.random.seed(seed=int(seed)) 285 | 286 | def _RESET(self): 287 | ''' 288 | Create and initialize all variables and containers. 289 | Nomenclature: 290 | I = On hand inventory at the start of each period at each stage (except last one). 291 | T = Pipeline inventory at the start of each period at each stage (except last one). 292 | R = Replenishment order placed at each period at each stage (except last one). 293 | D = Customer demand at each period (at the retailer) 294 | S = Sales performed at each period at each stage. 295 | B = Backlog at each period at each stage. 296 | LS = Lost sales at each period at each stage. 297 | P = Total profit at each stage. 
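        Note: the observation returned to the agent is not these containers
        themselves but the concatenation built by _update_state(), i.e.
        [demand on each retail link, on-hand inventory at each main node,
        pipeline inventory over the lead time of each reorder link], a vector
        of length obs_dim = pipeline_length + len(main_nodes) + len(retail_links).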
298 | ''' 299 | T = self.num_periods 300 | J = len(self.main_nodes) 301 | RM = len(self.retail_links) # number of retailer-market pairs 302 | PS = len(self.reorder_links) # number of purchaser-supplier pairs in the network 303 | SL = len(self.network_links) # number of edges in the network (excluding links form raw material nodes) 304 | 305 | # simulation result lists 306 | self.X=pd.DataFrame(data = np.zeros([T + 1, J]), 307 | columns = self.main_nodes) # inventory at the beginning of each period 308 | self.Y=pd.DataFrame(data = np.zeros([T + 1, PS]), 309 | columns = pd.MultiIndex.from_tuples(self.reorder_links, 310 | names = ['Source','Receiver'])) # pipeline inventory at the beginning of each period 311 | self.R=pd.DataFrame(data = np.zeros([T, PS]), 312 | columns = pd.MultiIndex.from_tuples(self.reorder_links, 313 | names = ['Supplier','Requester'])) # replenishment orders 314 | self.S=pd.DataFrame(data = np.zeros([T, SL]), 315 | columns = pd.MultiIndex.from_tuples(self.network_links, 316 | names = ['Seller','Purchaser'])) # units sold 317 | self.D=pd.DataFrame(data = np.zeros([T, RM]), 318 | columns = pd.MultiIndex.from_tuples(self.retail_links, 319 | names = ['Retailer','Market'])) # demand at retailers 320 | self.U=pd.DataFrame(data = np.zeros([T, RM]), 321 | columns = pd.MultiIndex.from_tuples(self.retail_links, 322 | names = ['Retailer','Market'])) # unfulfilled demand for each market - retailer pair 323 | self.P=pd.DataFrame(data = np.zeros([T, J]), 324 | columns = self.main_nodes) # profit at each node 325 | 326 | # initializetion 327 | self.period = 0 # initialize time 328 | for j in self.main_nodes: 329 | self.X.loc[0,j]=self.graph.nodes[j]['I0'] # initial inventory 330 | self.Y.loc[0,:]=np.zeros(PS) # initial pipeline inventory 331 | self.action_log = np.zeros([T, PS]) 332 | 333 | # set state 334 | self._update_state() 335 | 336 | return self.state 337 | 338 | def _update_state(self): 339 | # State is a concatenation of demand, inventory, and pipeline at each time step 340 | demand = np.hstack([self.D[d].iloc[self.period] for d in self.retail_links]) 341 | inventory = np.hstack([self.X[n].iloc[self.period] for n in self.main_nodes]) 342 | 343 | # Pipeline values won't be of proper dimension if current 344 | # current period < lead time. We need to add 0's as padding. 345 | if self.period == 0: 346 | _pipeline = [[self.Y[k].iloc[0]] 347 | for k, v in self.lead_times.items()] 348 | else: 349 | _pipeline = [self.Y[k].iloc[max(self.period-v,0):self.period].values 350 | for k, v in self.lead_times.items()] 351 | pipeline = [] 352 | for p, v in zip(_pipeline, self.lead_times.values()): 353 | if v == 0: 354 | continue 355 | if len(p) <= v: 356 | pipe = np.zeros(v) 357 | pipe[-len(p):] += p 358 | pipeline.append(pipe) 359 | pipeline = np.hstack(pipeline) 360 | self.state = np.hstack([demand, inventory, pipeline]) 361 | 362 | def _STEP(self, action): 363 | ''' 364 | Take a step in time in the multiperiod inventory management problem. 365 | action = number of units to request from each supplier. 
366 | dictionary: keys are (supplier, purchaser) tuples 367 | values are number of units requested from supplier 368 | dimension = len(reorder_links) (number of edges joining all nodes, 369 | except market nodes) 370 | ''' 371 | t = self.period 372 | if type(action) != dict: # convert to dict if a list was given 373 | action = {key: action[i] for i, key in enumerate(self.reorder_links)} 374 | 375 | # Place Orders 376 | for key in action.keys(): 377 | request = round(max(action[key],0)) # force to integer value 378 | supplier = key[0] 379 | purchaser = key[1] 380 | if supplier in self.rawmat: 381 | self.R.loc[t,(supplier, purchaser)] = request # accept request since supply is unlimited 382 | self.S.loc[t,(supplier, purchaser)] = request 383 | elif supplier in self.distrib: 384 | X_supplier = self.X.loc[t,supplier] # request limited by available inventory at beginning of period 385 | self.R.loc[t,(supplier, purchaser)] = min(request, X_supplier) 386 | self.S.loc[t,(supplier, purchaser)] = min(request, X_supplier) 387 | elif supplier in self.factory: 388 | C = self.graph.nodes[supplier]['C'] # supplier capacity 389 | v = self.graph.nodes[supplier]['v'] # supplier yield 390 | X_supplier = self.X.loc[t,supplier] # on-hand inventory at beginning of period 391 | self.R.loc[t,(supplier, purchaser)] = min(request, C, v*X_supplier) 392 | self.S.loc[t,(supplier, purchaser)] = min(request, C, v*X_supplier) 393 | 394 | #Receive deliveries and update inventories 395 | for j in self.main_nodes: 396 | #update pipeline inventories 397 | incoming = [] 398 | for k in self.graph.predecessors(j): 399 | L = self.graph.edges[(k,j)]['L'] #extract lead time 400 | if t - L >= 0: #check if delivery has arrived 401 | delivery = self.R.loc[t-L,(k,j)] 402 | else: 403 | delivery = 0 404 | incoming += [delivery] #update incoming material 405 | self.Y.loc[t+1,(k,j)] = self.Y.loc[t,(k,j)] - delivery + self.R.loc[t,(k,j)] 406 | 407 | #update on-hand inventory 408 | if 'v' in self.graph.nodes[j]: #extract production yield 409 | v = self.graph.nodes[j]['v'] 410 | else: 411 | v = 1 412 | outgoing = 1/v * np.sum([self.S.loc[t,(j,k)] for k in self.graph.successors(j)]) #consumed inventory (for requests placed) 413 | self.X.loc[t+1,j] = self.X.loc[t,j] + np.sum(incoming) - outgoing 414 | 415 | # demand is realized 416 | for j in self.retail: 417 | for k in self.market: 418 | #read user specified demand. if all zeros, use demand_dist instead. 
419 | Demand = self.graph.edges[(j,k)]['user_D'] 420 | if np.sum(Demand) > 0: 421 | self.D.loc[t,(j,k)] = Demand[t] 422 | else: 423 | Demand = self.graph.edges[(j,k)]['demand_dist'] 424 | self.D.loc[t,(j,k)] = Demand.rvs( 425 | **self.graph.edges[(j,k)]['dist_param']) 426 | if self.backlog and t >= 1: 427 | D = self.D.loc[t,(j,k)] + self.U.loc[t-1,(j,k)] 428 | else: 429 | D = self.D.loc[t,(j,k)] 430 | #satisfy demand up to available level 431 | X_retail = self.X.loc[t+1,j] #get inventory at retail before demand was realized 432 | self.S.loc[t,(j,k)] = min(D, X_retail) #perform sale 433 | self.X.loc[t+1,j] -= self.S.loc[t,(j,k)] #update inventory 434 | self.U.loc[t,(j,k)] = D - self.S.loc[t,(j,k)] #update unfulfilled orders 435 | 436 | # calculate profit 437 | for j in self.main_nodes: 438 | a = self.alpha 439 | SR = np.sum([self.graph.edges[(j,k)]['p'] * self.S.loc[t,(j,k)] for k in self.graph.successors(j)]) #sales revenue 440 | PC = np.sum([self.graph.edges[(k,j)]['p'] * self.R.loc[t,(k,j)] for k in self.graph.predecessors(j)]) #purchasing costs 441 | if j not in self.rawmat: 442 | HC = self.graph.nodes[j]['h'] * self.X.loc[t+1,j] + np.sum([self.graph.edges[(k,j)]['g'] * self.Y.loc[t+1,(k,j)] for k in self.graph.predecessors(j)]) #holding costs 443 | else: 444 | HC = 0 445 | if j in self.factory: 446 | OC = self.graph.nodes[j]['o'] / self.graph.nodes[j]['v'] * np.sum([self.S.loc[t,(j,k)] for k in self.graph.successors(j)]) #operating costs 447 | else: 448 | OC = 0 449 | if j in self.retail: 450 | UP = np.sum([self.graph.edges[(j,k)]['b'] * self.U.loc[t,(j,k)] for k in self.graph.successors(j)]) #unfulfilled penalty 451 | else: 452 | UP = 0 453 | self.P.loc[t,j] = a**t * (SR - PC - OC - HC - UP) 454 | 455 | # update period 456 | self.period += 1 457 | 458 | # set reward (profit from current timestep) 459 | reward = self.P.loc[t,:].sum() 460 | 461 | # determine if simulation should terminate 462 | if self.period >= self.num_periods: 463 | done = True 464 | else: 465 | done = False 466 | # update stae 467 | self._update_state() 468 | 469 | return self.state, reward, done, {} 470 | 471 | def sample_action(self): 472 | ''' 473 | Generate an action by sampling from the action_space 474 | ''' 475 | return self.action_space.sample() 476 | 477 | def step(self, action): 478 | return self._STEP(action) 479 | 480 | def reset(self): 481 | return self._RESET() 482 | 483 | def plot_network(self): 484 | colors = plt.rcParams['axes.prop_cycle'].by_key()['color'] 485 | adjacency_matrix = np.vstack(self.graph.edges()) 486 | # Set level colors 487 | level_col = {'retailer': 0, 488 | 'distributor': 1, 489 | 'manufacturer': 2, 490 | 'raw_materials': 3} 491 | 492 | max_density = np.max([len(v) for v in self.levels.values()]) 493 | node_coords = {} 494 | node_num = 1 495 | plt.figure(figsize=(12,8)) 496 | for i, (level, nodes) in enumerate(self.levels.items()): 497 | n = len(nodes) 498 | node_y = max_density / 2 if n == 1 else np.linspace(0, max_density, n) 499 | node_y = np.atleast_1d(node_y) 500 | plt.scatter(np.repeat(i, n), node_y, label=level, s=50) 501 | for y in node_y: 502 | plt.annotate(r'$N_{}$'.format(node_num), xy=(i, y+0.05)) 503 | node_coords[node_num] = (i, y) 504 | node_num += 1 505 | 506 | # Draw edges 507 | for node_num, v in node_coords.items(): 508 | x, y = v 509 | sinks = adjacency_matrix[np.where(adjacency_matrix[:, 0]==node_num)][:, 1] 510 | for s in sinks: 511 | try: 512 | sink_coord = node_coords[s] 513 | except KeyError: 514 | continue 515 | for k, n in self.levels.items(): 516 | if 
node_num in n: 517 | color = colors[level_col[k]] 518 | x_ = np.hstack([x, sink_coord[0]]) 519 | y_ = np.hstack([y, sink_coord[1]]) 520 | plt.plot(x_, y_, color=color) 521 | 522 | plt.ylabel('Node') 523 | plt.yticks([0], ['']) 524 | plt.xlabel('Level') 525 | plt.xticks(np.arange(len(self.levels)), [k for k in self.levels.keys()]) 526 | plt.show() 527 | 528 | class NetInvMgmtBacklogEnv(NetInvMgmtMasterEnv): 529 | def __init__(self, *args, **kwargs): 530 | super().__init__(*args, **kwargs) 531 | 532 | class NetInvMgmtLostSalesEnv(NetInvMgmtMasterEnv): 533 | def __init__(self, *args, **kwargs): 534 | super().__init__(*args, **kwargs) 535 | self.backlog = False --------------------------------------------------------------------------------
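The network environments above follow the standard gym interaction loop. A minimal usage sketch (illustrative only; it instantiates the backlog variant directly with the default network defined in __init__ and relies on the step(), reset(), and sample_action() methods shown above):

from or_gym.envs.supply_chain.network_management import NetInvMgmtBacklogEnv

# Default configuration: 9-node network (market, retailer, distributors,
# manufacturers, raw materials) simulated over 30 periods.
env = NetInvMgmtBacklogEnv()
state = env.reset()
total_profit, done = 0.0, False
while not done:
    # sample_action() draws a random reorder quantity for each reorder link;
    # _STEP rounds these to non-negative integers before placing the orders.
    action = env.sample_action()
    state, reward, done, _ = env.step(action)
    total_profit += reward
print('Total discounted profit over the episode: {:.2f}'.format(total_profit))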