├── .gitignore ├── BGAgent.py ├── LICENSE ├── README.md ├── SRDQN.py ├── clBeergame.py ├── config.py ├── data.zip ├── main.py ├── plotting.py ├── requirements.txt └── utilities.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /BGAgent.py: -------------------------------------------------------------------------------- 1 | from SRDQN import DQN 2 | import numpy as np 3 | 4 | # Here we want to define the agent class for the BeerGame 5 | class Agent(object): 6 | 7 | # initializes the agents with initial values for IL, OO and saves self.agentNum for recognizing the agents. 8 | def __init__(self, agentNum, IL, AO, AS, c_h, c_p, eta, compuType, config): 9 | self.agentNum = agentNum 10 | self.IL = IL # Inventory level of each agent - changes during the game 11 | self.OO = 0 # Open order of each agent - changes during the game 12 | self.ASInitial = AS # the initial arriving shipment. 
13 | self.ILInitial = IL # IL at which we start each game with this number 14 | self.AOInitial = AO # OO at which we start each game with this number 15 | self.config = config # an instance of config is stored inside the class 16 | self.curState = [] # this function gets the current state of the game 17 | self.nextState = [] 18 | self.curReward = 0 # the reward observed at the current step 19 | self.cumReward = 0 # cumulative reward; reset at the begining of each episode 20 | self.totRew = 0 # it is reward of all players obtained for the current player. 21 | self.c_h=c_h # holding cost 22 | self.c_p = c_p # backorder cost 23 | self.eta = eta # the total cost regulazer 24 | self.AS = np.zeros((1,1)) # arriced shipment 25 | self.AO = np.zeros((1,1)) # arrived order 26 | self.action=0 # the action at time t 27 | self.compTypeTrain = compuType # rnd -> random / srdqn-> srdqn / Strm-> formula-Rong2008 / bs -> optimal policy if exists 28 | self.compTypeTest = compuType # rnd -> random / srdqn-> srdqn / Strm-> formula-Rong2008 / bs -> optimal policy if exists 29 | self.alpha_b = self.config.alpha_b[self.agentNum] # parameters for the formula 30 | self.betta_b = self.config.betta_b[self.agentNum] # parameters for the formula 31 | if self.config.demandDistribution == 0: 32 | self.a_b = np.mean((self.config.demandUp , self.config.demandLow)) # parameters for the formula 33 | self.b_b = np.mean((self.config.demandUp , self.config.demandLow))*(np.mean((self.config.leadRecItemLow[self.agentNum] , 34 | self.config.leadRecItemUp[self.agentNum])) + np.mean((self.config.leadRecOrderLow[self.agentNum] , self.config.leadRecOrderUp[self.agentNum]))) # parameters for the formula 35 | elif self.config.demandDistribution == 1 or self.config.demandDistribution == 3 or self.config.demandDistribution == 4: 36 | self.a_b = self.config.demandMu # parameters for the formula 37 | self.b_b = self.config.demandMu*(np.mean((self.config.leadRecItemLow[self.agentNum] , 38 | self.config.leadRecItemUp[self.agentNum])) + np.mean((self.config.leadRecOrderLow[self.agentNum] , self.config.leadRecOrderUp[self.agentNum]))) # parameters for the formula 39 | elif self.config.demandDistribution == 2: 40 | self.a_b = 8 # parameters for the formula 41 | self.b_b = (3/4.)*8*(np.mean((self.config.leadRecItemLow[self.agentNum] , 42 | self.config.leadRecItemUp[self.agentNum])) + np.mean((self.config.leadRecOrderLow[self.agentNum] , self.config.leadRecOrderUp[self.agentNum]))) # parameters for the formula 43 | elif self.config.demandDistribution == 3: 44 | self.a_b = 10 # parameters for the formula 45 | self.b_b = 7*(np.mean((self.config.leadRecItemLow[self.agentNum] , 46 | self.config.leadRecItemUp[self.agentNum])) + np.mean((self.config.leadRecOrderLow[self.agentNum] , self.config.leadRecOrderUp[self.agentNum]))) # parameters for the formula 47 | 48 | self.hist = [] # this is used for plotting - keeps the history for only one game 49 | self.hist2 = [] # this is used for animation usage 50 | self.srdqnBaseStock = [] # this holds the base stock levels that srdqn has came up with. 
added on Nov 8, 2017 51 | self.T = 0 52 | self.bsBaseStock = 0 53 | self.init_bsBaseStock = 0 54 | self.nextObservation = [] 55 | if self.compTypeTrain == 'srdqn': 56 | self.brain = DQN(self.agentNum,config) 57 | self.brain.setInitState(self.curState) # sets the initial input of the network 58 | 59 | # reset player information 60 | def resetPlayer(self, T): 61 | self.IL = self.ILInitial 62 | self.OO = 0 63 | self.AS = np.squeeze(np.zeros((1,T + max(self.config.leadRecItemUp) + max(self.config.leadRecOrderUp) + 10 ))) # arriced shipment 64 | self.AO = np.squeeze(np.zeros((1,T + max(self.config.leadRecItemUp) + max(self.config.leadRecOrderUp) + 10 ))) # arrived order 65 | if self.agentNum != 0: 66 | for i in range(self.config.leadRecOrderUp_aux[self.agentNum - 1]): 67 | self.AO[i] = self.AOInitial[self.agentNum - 1] 68 | for i in range(self.config.leadRecItemUp[self.agentNum]): 69 | self.AS[i] = self.ASInitial 70 | self.curReward = 0 # the reward observed at the current step 71 | self.cumReward = 0 # cumulative reward; reset at the begining of each episode 72 | self.action= [] 73 | self.hist = [] 74 | self.hist2 = [] 75 | self.srdqnBaseStock = [] # this holds the base stock levels that srdqn has came up with. added on Nov 8, 2017 76 | self.T = T 77 | self.curObservation = self.getCurState(1) # this function gets the current state of the game 78 | self.nextObservation = [] 79 | if self.compTypeTrain == 'srdqn': 80 | self.brain.setInitState(self.curObservation) # sets the initial input of the network 81 | 82 | 83 | # updates the IL and OO at time t, after recieving "rec" number of items 84 | def recieveItems(self, time): 85 | self.IL = self.IL + self.AS[time] # inverntory level update 86 | self.OO = self.OO - self.AS[time] # invertory in transient update 87 | 88 | 89 | # find action Value associated with the action list 90 | def actionValue(self,curTime,playType): 91 | if playType == "test": 92 | if self.config.fixedAction: 93 | a = self.config.actionList[np.argmax(self.action)] 94 | else: 95 | # "d + x" rule 96 | if self.compTypeTest == 'srdqn': 97 | a = max(0, self.config.actionList[np.argmax(self.action)]*self.config.action_step + self.AO[curTime]) 98 | elif self.compTypeTest == 'rnd': 99 | a = max(0, self.config.actionList[np.argmax(self.action)] + self.AO[curTime]) 100 | else: 101 | a = max(0, self.config.actionListOpt[np.argmax(self.action)]) 102 | 103 | elif playType == "train": 104 | if self.config.fixedAction: 105 | a = self.config.actionList[np.argmax(self.action)] 106 | else: 107 | if self.compTypeTrain == 'srdqn': 108 | a = max(0, self.config.actionList[np.argmax(self.action)]*self.config.action_step + self.AO[curTime]) 109 | elif self.compTypeTest == 'rnd': 110 | a = max(0, self.config.actionList[np.argmax(self.action)] + self.AO[curTime]) 111 | else: 112 | a = max(0, self.config.actionListOpt[np.argmax(self.action)]) 113 | 114 | return a 115 | 116 | 117 | # getReward returns the reward at the current state 118 | def getReward(self): 119 | # cost (holding + backorder) for one time unit 120 | self.curReward= (self.c_p * max(0,-self.IL) + self.c_h * max(0,self.IL))/200. 
# self.config.Ttest # 121 | self.curReward = -self.curReward; # make reward negative, because it is the cost 122 | 123 | # sum total reward of each agent 124 | self.cumReward = self.config.gamma*self.cumReward + self.curReward 125 | 126 | # This function returns a np.array of the current state of the agent 127 | def getCurState(self,t): 128 | if self.config.ifUseASAO: 129 | if self.config.if_use_AS_t_plus_1: 130 | curState= np.array([-1*(self.IL<0)*self.IL,1*(self.IL>0)*self.IL,self.OO,self.AS[t],self.AO[t]]) 131 | else: 132 | curState= np.array([-1*(self.IL<0)*self.IL,1*(self.IL>0)*self.IL,self.OO,self.AS[t-1],self.AO[t]]) 133 | else: 134 | curState= np.array([-1*(self.IL<0)*self.IL,1*(self.IL>0)*self.IL,self.OO]) 135 | 136 | if self.config.ifUseActionInD: 137 | a = self.config.actionList[np.argmax(self.action)] 138 | curState= np.concatenate((curState, np.array([a]))) 139 | 140 | return curState 141 | 142 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2020, Optimization and Machine Learning Group @ Lehigh 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # A Deep Q-Network for the Beer Game: Deep Reinforcement Learning for Inventory Optimization 2 | 3 | The code of the paper `A Deep Q-Network for the Beer Game: Deep Reinforcement Learning for Inventory Optimization` is presented at this repository. The paper is available online in https://pubsonline.informs.org/doi/abs/10.1287/msom.2020.0939. The code works with `Python2.7` and `Python3.4-Python3.7`. For more information see the list of the requirments (You can install them `pip install -r requirements.txt`). 4 | The `main.py` is the file to call to start the training. 
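A minimal quick start, assuming the default arguments (the basic `U[0,2]` case with an `srdqn` retailer and base-stock co-players, as described below) and that `data.zip` has been extracted first:

    pip install -r requirements.txt
    unzip data.zip
    python main.py
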
`BGAgent.py` implements the beer-game agent, with all the properties and functionality of an agent. `clBeergame.py` instantiates the agents and runs the beer-game simulation; once the number of observations in the replay buffer reaches the minimum requirement, it also calls the train step of the SRDQN algorithm. The DNN approximator and the SRDQN algorithm are implemented in `SRDQN.py`. `config.py` introduces all arguments and their default values, as well as some functions to properly build the simulation scenarios for the different instances of the game. The following sections describe how to run the training and how to set the different arguments. 5 | 6 | ### Play beer-game and compare your result with AI! 7 | You can play the beer-game and compare your result on the same game with the result that our RL algorithm achieves. See https://beergame.opexanalytics.com/ 8 | 9 | 10 | Note that this code does not work with TensorFlow 2+. 11 | ## Some Notations 12 | Each agent can use one of the `srdqn`, `bs`, `Strm`, or `rnd` algorithms to decide on its action (order quantity). So, there are 256 combinations of agent types, of which we consider 23 cases in this study. Each case is selected via `config.gameConfig`, which picks one of the pre-defined combinations of the four agent types. For example, `config.gameConfig=3` sets `config.agentTypes = ["srdqn", "bs","bs","bs"]`, in which the retailer follows the `srdqn` algorithm and the rest of the agents use the base-stock policy to decide on the order quantity. The main `gameConfig` values are as follows: 13 | 14 | Base-stock co-players 15 | 16 | if config.gameConfig == 3: 17 | config.agentTypes = ["srdqn", "bs","bs","bs"] 18 | if config.gameConfig == 4: 19 | config.agentTypes = ["bs", "srdqn","bs","bs"] 20 | if config.gameConfig == 5: 21 | config.agentTypes = ["bs", "bs","srdqn","bs"] 22 | if config.gameConfig == 6: 23 | config.agentTypes = ["bs", "bs","bs","srdqn"] 24 | Sterman co-players 25 | 26 | if config.gameConfig == 7: 27 | config.agentTypes = ["srdqn", "Strm","Strm","Strm"] 28 | if config.gameConfig == 8: 29 | config.agentTypes = ["Strm", "srdqn","Strm","Strm"] 30 | if config.gameConfig == 9: 31 | config.agentTypes = ["Strm", "Strm","srdqn","Strm"] 32 | if config.gameConfig == 10: 33 | config.agentTypes = ["Strm", "Strm","Strm","srdqn"] 34 | Random co-players 35 | 36 | if config.gameConfig == 11: 37 | config.agentTypes = ["srdqn", "rnd","rnd","rnd"] 38 | if config.gameConfig == 12: 39 | config.agentTypes = ["rnd", "srdqn","rnd","rnd"] 40 | if config.gameConfig == 13: 41 | config.agentTypes = ["rnd", "rnd","srdqn","rnd"] 42 | if config.gameConfig == 14: 43 | config.agentTypes = ["rnd", "rnd","rnd","srdqn"] 44 | 45 | The full list of `gameConfig` values is defined in the `setAgentType()` function in `config.py`. 46 | 47 | Since the `d + x` rule is used to train the `SRDQN` model (the agent orders its observed demand `d` plus a, possibly negative, adjustment `x` chosen by the network), lower and upper limits on `x` are set via `config.actionLow` and `config.actionUp`. 48 | 49 | In addition, for each agent one can set the lead time for receiving the order as well as for receiving the shipment via `config.leadRecItem1`, `config.leadRecItem2`, `config.leadRecItem3`, `config.leadRecItem4` and `config.leadRecOrder1`, `config.leadRecOrder2`, `config.leadRecOrder3`, `config.leadRecOrder4` for the four agents.
Similarly, the initial inventory level, initial arriving order, and initial arriving shipment can be set by `config.ILInit1`, `config.ILInit2`, `config.ILInit3`, `config.ILInit4`, `config.AOInit1`, `config.AOInit2`, `config.AOInit3`, `config.AOInit4`, `config.ASInit1`, `config.ASInit2`, `config.ASInit3`, `config.ASInit4`, respectively for the four agents. 50 | 51 | `config.maxEpisodesTrain` determines the number of episodes to train the `srdqn` agent. 52 | 53 | TO run the baseStock policy (`bs`), you need to set the value of the base-stock level for each agent by `config.f1`, `config.f2`, `config.f3`, `config.f4`. We obtained those values by running the Clark-Scarf algorithm for each instance. 54 | 55 | ## unzip the data 56 | `data.zip` includes all the required dataset to train the model on basic case, literature cases, basket dataset, and forecasting dataset. Unzipping this file creates `data` directory, in which there is a python file (`createDemand.py`) as well as the mentioned datasets. `createDemand.py` can be used to create datasets of any size for the literature cases. 57 | 58 | ## Train the basic model 59 | The basic model used the Uniform distribution `U[0,2]` with action space of `{-2, -1, 0, 1, 2}`. All the default values are set to run this experiment for the case that `srdqn` plays the retailer and other agents follow base-stock policy. For any other case the training can be started by setting the corresponding arguments. For example, to train a `srdqn` Warehouse with the initial inventory of 10 units which plays with Sterman co-players, the following line can be used to run the training for 50000 episodes: 60 | 61 | python main.py --gameConfig=8 --maxEpisodesTrain=50000 config.ILInit2=10 --batchSize=128 62 | 63 | ## Train the literature cases 64 | To train each of the literature cases, first you need to set `config.demandDistribution`, `actionUp`, and `actionLow`, as well as the other parameter for the agents as following: 65 | 66 | For U[0,8]: 67 | 68 | python main.py --demandDistribution=0 --demandUp=9 --actionUp=8 --actionLow=-8 --ch1=0.5 --ch2=0.5 --ch3=0.5 --ch4=0.5 --cp1=1.0 --cp2=1.0 --cp3=1.0 --cp4=1.0 --f1=19.0 --f2=20.0 --f3=20.0 --f4=14.0 --leadRecItem1=2 --leadRecItem2=2 --leadRecItem3=2 --leadRecItem4=2 --leadRecOrder1=2 --leadRecOrder2=2 --leadRecOrder3=2 --leadRecOrder4=1 --ILInit1=12 --ILInit2=12 --ILInit3=12 --ILInit4=12 --AOInit1=4 --AOInit2=4 --AOInit3=4 --AOInit4=4 --ASInit1=4 --ASInit2=4 --ASInit3=4 --ASInit4=4 --gameConfig=6 69 | 70 | For N(10,2): 71 | 72 | python main.py --demandDistribution=1 --demandMu=10 --demandSigma=2 --actionUp=5 --actionLow=-5 --ch1=1 --ch2=0.75 --ch3=0.5 --ch4=0.25 --cp1=10.0 --cp2=0 --cp3=0 --cp4=0 --f1=48.0 --f2=43.0 --f3=41.0 --f4=30.0 --leadRecItem1=2 --leadRecItem2=2 --leadRecItem3=2 --leadRecItem4=2 --leadRecOrder1=2 --leadRecOrder2=2 --leadRecOrder3=2 --leadRecOrder4=1 --ILInit1=10 --ILInit2=10 --ILInit3=10 --ILInit4=10 --AOInit1=10 --AOInit2=10 --AOInit3=10 --AOInit4=10 --ASInit1=10 --ASInit2=10 --ASInit3=10 --ASInit4=10 --gameConfig=6 73 | 74 | For C(4,8): 75 | 76 | python main.py --demandDistribution=2 --actionUp=8 --actionLow=-8 --ch1=0.5 --ch2=0.5 --ch3=0.5 --ch4=0.5 --cp1=1.0 --cp2=1.0 --cp3=1.0 --cp4=1.0 --demandUp=9 --f1=32.0 --f2=32.0 --f3=32.0 --f4=24.0 --leadRecItem1=2 --leadRecItem2=2 --leadRecItem3=2 --leadRecItem4=2 --leadRecOrder1=2 --leadRecOrder2=2 --leadRecOrder3=2 --leadRecOrder4=1 --ILInit1=12 --ILInit2=12 --ILInit3=12 --ILInit4=12 --AOInit1=4 --AOInit2=4 --AOInit3=4 --AOInit4=4 --ASInit1=4 
--ASInit2=4 --ASInit3=4 --ASInit4=4 --gameConfig=6 77 | 78 | ## Train the basket dataset 79 | For the basket dataset you need to set `config.demandDistribution=3`, and then `config.data_id` can be either `6, 13`, or `22`. For training with the scaled dataset, which is reported in the paper, `config.scaled=True` is required too. See the following commands for three cases: 80 | 81 | python main.py --demandDistribution=3 --data_id=6 --demandMu=3 --demandSigma=2 --demandUp=3 --actionUp=5 --actionLow=-5 --leadRecItem1=2 --leadRecItem2=2 --leadRecItem3=2 --leadRecItem4=2 --leadRecOrder1=2 --leadRecOrder2=2 --leadRecOrder3=2 --leadRecOrder4=1 --scaled=True --ch1=1.0 --ch2=0.75 --ch3=0.5 --ch4=0.25 --cp1=10.0 --cp2=0.0 --cp3=0.0 --cp4=0.0 --f1=19.0 --f2=12.0 --f3=12.0 --f4=8.0 --ILInit1=3 --ILInit2=3 --ILInit3=3 --ILInit4=3 --AOInit1=3 --AOInit2=3 --AOInit3=3 --AOInit4=3 --ASInit1=3 --ASInit2=3 --ASInit3=3 --ASInit4=3 82 | 83 | python main.py --demandDistribution=3 --data_id=13 --demandMu=3 --demandSigma=2 --demandUp=3 --actionUp=5 --actionLow=-5 --leadRecItem1=2 --leadRecItem2=2 --leadRecItem3=2 --leadRecItem4=2 --leadRecOrder1=2 --leadRecOrder2=2 --leadRecOrder3=2 --leadRecOrder4=1 --scaled=True --ch1=1.0 --ch2=0.75 --ch3=0.5 --ch4=0.25 --cp1=10.0 --cp2=0.0 --cp3=0.0 --cp4=0.0 --f1=19.0 --f2=13.0 --f3=11.0 --f4=8.0 --ILInit1=3 --ILInit2=3 --ILInit3=3 --ILInit4=3 --AOInit1=3 --AOInit2=3 --AOInit3=3 --AOInit4=3 --ASInit1=3 --ASInit2=3 --ASInit3=3 --ASInit4=3 84 | 85 | python main.py --demandDistribution=3 --data_id=22 --demandMu=2 --demandSigma=2 --demandUp=3 --actionUp=5 --actionLow=-5 --leadRecItem1=2 --leadRecItem2=2 --leadRecItem3=2 --leadRecItem4=2 --leadRecOrder1=2 --leadRecOrder2=2 --leadRecOrder3=2 --leadRecOrder4=1 --scaled=True --ch1=1.0 --ch2=0.75 --ch3=0.5 --ch4=0.25 --cp1=10.0 --cp2=0.0 --cp3=0.0 --cp4=0.0 --f1=14.0 --f2=9.0 --f3=9.0 --f4=5.0 --ILInit1=2 --ILInit2=2 --ILInit3=2 --ILInit4=2 --AOInit1=2 --AOInit2=2 --AOInit3=2 --AOInit4=2 --ASInit1=2 --ASInit2=2 --ASInit3=2 --ASInit4=2 86 | 87 | ## Train the forecasting dataset 88 | For the forecasting dataset you need to set `config.demandDistribution=4`, and then `config.data_id` can be either `5, 34`, or `46`. For training with the scaled dataset, which is reported in the paper, `config.scaled=True` is required too. 
See the following commands for three cases: 89 | 90 | python main.py --demandDistribution=4 --data_id=5 --demandMu=4 --demandSigma=2 --demandUp=3 --actionUp=5 --actionLow=-5 --leadRecItem1=2 --leadRecItem2=2 --leadRecItem3=2 --leadRecItem4=2 --leadRecOrder1=2 --leadRecOrder2=2 --leadRecOrder3=2 --leadRecOrder4=1 --scaled=True --ch1=1.0 --ch2=0.75 --ch3=0.5 --ch4=0.25 --cp1=10.0 --cp2=0.0 --cp3=0.0 --cp4=0.0 --f1=21.0 --f2=16.0 --f3=16.0 --f4=11.0 --ILInit1=4 --ILInit2=4 --ILInit3=4 --ILInit4=4 --AOInit1=4 --AOInit2=4 --AOInit3=4 --AOInit4=4 --ASInit1=4 --ASInit2=4 --ASInit3=4 --ASInit4=4 91 | 92 | python main.py --demandDistribution=4 --data_id=34 --demandMu=4 --demandSigma=2 --demandUp=3 --actionUp=5 --actionLow=-5 --leadRecItem1=2 --leadRecItem2=2 --leadRecItem3=2 --leadRecItem4=2 --leadRecOrder1=2 --leadRecOrder2=2 --leadRecOrder3=2 --leadRecOrder4=1 --scaled=True --ch1=1.0 --ch2=0.75 --ch3=0.5 --ch4=0.25 --cp1=10.0 --cp2=0.0 --cp3=0.0 --cp4=0.0 --f1=18.0 --f2=15.0 --f3=14.0 --f4=10.0 --ILInit1=4 --ILInit2=4 --ILInit3=4 --ILInit4=4 --AOInit1=4 --AOInit2=4 --AOInit3=4 --AOInit4=4 --ASInit1=4 --ASInit2=4 --ASInit3=4 --ASInit4=4 93 | 94 | python main.py --demandDistribution=4 --data_id=46 --demandMu=4 --demandSigma=2 --demandUp=3 --actionUp=5 --actionLow=-5 --leadRecItem1=2 --leadRecItem2=2 --leadRecItem3=2 --leadRecItem4=2 --leadRecOrder1=2 --leadRecOrder2=2 --leadRecOrder3=2 --leadRecOrder4=1 --scaled=True --ch1=1.0 --ch2=0.75 --ch3=0.5 --ch4=0.25 --cp1=10.0 --cp2=0.0 --cp3=0.0 --cp4=0.0 --f1=21.0 --f2=16.0 --f3=18.0 --f4=12.0 --ILInit1=4 --ILInit2=4 --ILInit3=4 --ILInit4=4 --AOInit1=4 --AOInit2=4 --AOInit3=4 --AOInit4=4 --ASInit1=4 --ASInit2=4 --ASInit3=4 --ASInit4=4 95 | 96 | ## Use Transfer Learning 97 | We have provided the trained model of the basic model which are used in the transfer learning section. The saved models are available in `pre_model\uniform\0-3\brainX` in which `X` is in `{3, 4, 5, 6}`. The value of `X` follows the same pattern as of `config.gameConfig`. To train a new with either of these trained models, you need to set `config.tlBaseBrain` that determines which trained should be used as the base model. For example: 98 | 99 | python main.py --gameConfig=3 --iftl=True --ifUsePreviousModel=True --tlBaseBrain=3 --baseDemandDistribution=0 100 | 101 | Besides, if you trained a model with another demand distribution, e.g., `N(10,2)`, you need to move the saved models into `pre_model\normal\10-2\brainX` and then for a new training set `config.baseDemandDistribution=1`. The `config.baseDemandDistribution` follows the same pattern as of `config.demandDistribution`. 102 | 103 | ## Other utilities 104 | If you set `config.ifSaveFigure=True`, it saves the trajectories of inventory-level, reward, action, open-order, and order-upto-level for each agent in an episode. `config.saveFigIntLow` and `config.saveFigIntUp` determine the range of eprisode to save the figures. 105 | 106 | Setting `config.ifsaveHistInterval=True`, activate saving of trajectory of the received order, received shipment, inventory-level, reward, action, open-order, and order-upto-level for each agent in an episode. With this argument, you need to determine the interval between every two epsiode to save the history with `config.saveHistInterval`. 
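For illustration only, and assuming these `config` options are exposed as command-line flags in the same way as the arguments used in the commands above (the values here are arbitrary examples), such a run could look like:

    python main.py --gameConfig=3 --ifSaveFigure=True --saveFigIntLow=1000 --saveFigIntUp=1010 --ifsaveHistInterval=True --saveHistInterval=10000
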
107 | 108 | 109 | ## Paper citation 110 | If you used this code for your experiments or found it helpful, consider citing the following paper: 111 | 112 | @article{oroojlooyjadid2017deep, 113 | title={A Deep Q-Network for the Beer Game: Deep Reinforcement Learning for Inventory Optimization}, 114 | author={Oroojlooyjadid, Afshin and Nazari, MohammadReza and Snyder, Lawrence and Tak{\'a}{\v{c}}, Martin}, 115 | journal = {Manufacturing \& Service Operations Management}, 116 | volume = {0}, 117 | number = {0}, 118 | pages = {null}, 119 | year = {0}, 120 | doi = {10.1287/msom.2020.0939}, 121 | 122 | URL = { 123 | https://doi.org/10.1287/msom.2020.0939 124 | 125 | }, 126 | eprint = { 127 | https://doi.org/10.1287/msom.2020.0939 128 | 129 | } 130 | year={2021} 131 | } 132 | -------------------------------------------------------------------------------- /SRDQN.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from time import gmtime, strftime 4 | import tensorflow as tf 5 | import numpy as np 6 | import random 7 | from collections import deque 8 | 9 | class DQN: 10 | def __init__(self,agentNum,config): 11 | if agentNum==0: 12 | graph_dqn1 = tf.Graph() 13 | graph_dqn = graph_dqn1 14 | elif agentNum==1: 15 | graph_dqn2 = tf.Graph() 16 | graph_dqn = graph_dqn2 17 | elif agentNum==2: 18 | graph_dqn3 = tf.Graph() 19 | graph_dqn = graph_dqn3 20 | elif agentNum==3: 21 | graph_dqn4 = tf.Graph() 22 | graph_dqn = graph_dqn4 23 | 24 | with graph_dqn.as_default(): 25 | 26 | tf.set_random_seed(1) 27 | self.agentNum = agentNum 28 | self.global_step = tf.Variable(0, trainable=False) 29 | # Hyper Parameters Link: 30 | self.config = config 31 | modelNumber = 'model'+str(agentNum+1) 32 | #self.addressName = 'model'+str(agentNum+1)+'/savetrained' + str(self.config.address) + '/network-' 33 | self.address = os.path.join(self.config.model_dir, modelNumber) # 'model'+str(agentNum+1)+'/savetrained'+ str(self.config.address) 34 | self.addressName = self.address + '/network-' 35 | if self.config.maxEpisodesTrain != 0: 36 | self.epsilon = config.epsilonBeg 37 | else: 38 | self.epsilon = 0 39 | self.epsilonRed = self.epsilonBuild() 40 | self.inputSize = self.config.stateDim * self.config.multPerdInpt 41 | self.timeStep = 0 42 | self.learning_rate = 0 # this is used when we have decaying 43 | self.iflrReseted = False # this is used to manage the scale of lr 44 | 45 | # init replay memory 46 | self.replayMemory = deque() 47 | self.replaySize = 0 48 | 49 | # create input placeholders 50 | self.createInputs() 51 | 52 | we = [] 53 | be = [] 54 | # create a network same as the saved network, to use some of its weight values. It is used 55 | # when the number of output in the loaded network is different than the current model. 
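# Note: in this transfer-learning path, the saved network is first rebuilt with its original
# output size (createQNetworkForTL uses config.baseActionSize), its weights and biases are read
# from the checkpoint, and they are then passed to createQNetwork() below to initialize every
# layer except the final Q-value layer, whose dimension differs.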
56 | if self.config.ifUsePreviousModel and self.config.ifTransferFromSmallerActionSpace: 57 | # with tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=self.config.gpu_memory_fraction))) as sess: 58 | with tf.Session(config=tf.ConfigProto(intra_op_parallelism_threads=self.config.number_cpu_active, gpu_options=tf.GPUOptions(allow_growth=True))) as sess: 59 | weights, biases = self.createQNetworkForTL() 60 | sess.run(tf.global_variables_initializer()) 61 | if self.config.baseDemandDistribution == 0: 62 | directory=os.path.join(self.config.pre_model_dir,'uniform/'+str(int(self.config.demandLow))+'-'+str(int(self.config.demandUp))) 63 | elif self.config.baseDemandDistribution == 1: 64 | directory=os.path.join(self.config.pre_model_dir,'normal/'+str(int(self.config.demandMu))+'-'+str(int(self.config.demandSigma))) 65 | elif self.config.baseDemandDistribution == 2: 66 | directory=os.path.join(self.config.pre_model_dir,'classic') 67 | elif self.config.baseDemandDistribution == 3: 68 | directory=os.path.join(self.config.pre_model_dir,'basket'+str(self.config.data_id)) 69 | elif self.config.baseDemandDistribution == 4: 70 | directory=os.path.join(self.config.pre_model_dir,'forecast'+str(self.config.data_id)) 71 | 72 | if self.config.gameConfig == 1: 73 | # the Sterman case. 74 | base_brain = 7 + self.config.tlBaseBrain 75 | elif self.config.gameConfig == 2: 76 | # the BS case. 77 | base_brain = 3 + self.config.tlBaseBrain 78 | else: 79 | base_brain = self.config.tlBaseBrain 80 | checkpoint = tf.train.get_checkpoint_state(os.path.join(directory, 'brain'+str(base_brain))) 81 | # checkpoint = tf.train.get_checkpoint_state(os.path.join(self.config.pre_model_dir, 'brain'+str(self.config.tlBaseBrain))) 82 | if checkpoint and checkpoint.model_checkpoint_path: 83 | saver = tf.train.Saver() 84 | saver.restore(sess, checkpoint.model_checkpoint_path) 85 | we = sess.run(weights) 86 | np.save('weights',we) 87 | be=sess.run(biases) 88 | if self.config.INFO_print: 89 | print("Successfully loaded:", checkpoint.model_checkpoint_path) 90 | ifLoadedModel = True 91 | else: 92 | ifLoadedModel = False 93 | if self.config.INFO_print: 94 | print("Could not find old network weights") 95 | 96 | 97 | # init Q network 98 | self.QValue,self.W_fc,self.b_fc = self.createQNetwork('Q', we, be) 99 | # init Target Q Network 100 | self.QValueT,self.W_fcT,self.b_fcT = self.createQNetwork('TQ') 101 | 102 | # copy the network to target network 103 | self.copyTargetQNetworkOperation = self.copyTargetQNetworkFunc() 104 | 105 | # create the placeholders and training model 106 | self.createTrainingMethod() 107 | self.currentState = [] 108 | 109 | # saving and loading networks 110 | self.saver = tf.train.Saver() 111 | config_tf = tf.ConfigProto() 112 | # config_tf.log_device_placement=True 113 | config_tf.gpu_options.per_process_gpu_memory_fraction = self.config.gpu_memory_fraction 114 | config_tf.gpu_options.allow_growth = True 115 | config_tf.intra_op_parallelism_threads = self.config.number_cpu_active 116 | 117 | # create the session 118 | # self.session = tf.InteractiveSession(config=config_tf) 119 | self.session = tf.Session(config=config_tf) 120 | 121 | # call tensor board 122 | self.merged = [] 123 | if self.config.TB: 124 | self.merged = tf.summary.merge_all() 125 | 126 | # create summary writer 127 | self.train_writer = tf.summary.FileWriter(self.config.model_dir + '/tb', self.session.graph) 128 | 129 | # initialize the variables 130 | self.session.run(tf.global_variables_initializer()) 131 | 
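# Note: the block below optionally warm-starts the model from a saved checkpoint. The checkpoint
# directory is chosen from baseDemandDistribution (uniform/normal/classic/basket/forecast) and the
# brain index is derived from gameConfig and tlBaseBrain; on a successful restore, the target
# network is immediately synchronized with the restored Q network.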
132 | if self.config.ifUsePreviousModel: 133 | if not self.config.ifTransferFromSmallerActionSpace: 134 | # check if all agents are dnn, use the save network by each of them. 135 | 136 | if self.config.ifSinglePathExist: 137 | directory=self.config.pre_model_dir 138 | elif self.config.baseDemandDistribution == 0: 139 | directory=os.path.join(self.config.pre_model_dir,'uniform/'+str(int(self.config.demandLow))+'-'+str(int(self.config.demandUp))) 140 | elif self.config.baseDemandDistribution == 1: 141 | directory=os.path.join(self.config.pre_model_dir,'normal/'+str(int(self.config.demandMu))+'-'+str(int(self.config.demandSigma))) 142 | elif self.config.baseDemandDistribution == 2: 143 | directory=os.path.join(self.config.pre_model_dir,'classic') 144 | elif self.config.baseDemandDistribution == 3: 145 | directory=os.path.join(self.config.pre_model_dir,'basket'+str(self.config.data_id)) 146 | elif self.config.baseDemandDistribution == 4: 147 | directory=os.path.join(self.config.pre_model_dir,'forecast'+str(self.config.data_id)) 148 | 149 | if self.config.ifSinglePathExist: 150 | base_brain = self.config.tlBaseBrain + 1 151 | else: 152 | if self.config.gameConfig == 1: 153 | # the Sterman case. 154 | base_brain = 7 + self.config.tlBaseBrain 155 | elif self.config.gameConfig == 2: 156 | base_brain = 3 + self.config.tlBaseBrain 157 | else: 158 | # the BS case. 159 | base_brain = self.config.tlBaseBrain 160 | # checkpoint = tf.train.get_checkpoint_state(os.path.join(self.config.pre_model_dir, 'brain'+str(self.config.gameConfig))) 161 | if self.config.ifSinglePathExist: 162 | model_address = os.path.join(directory, 'model'+str(base_brain)) 163 | else: 164 | model_address = os.path.join(directory, 'brain'+str(base_brain)) 165 | 166 | checkpoint = tf.train.get_checkpoint_state(model_address) 167 | if checkpoint and checkpoint.model_checkpoint_path: 168 | self.saver.restore(self.session, checkpoint.model_checkpoint_path) 169 | if self.config.INFO_print: 170 | print("Successfully loaded:", checkpoint.model_checkpoint_path) 171 | 172 | # copy the network to target network 173 | self.session.run(self.copyTargetQNetworkOperation) 174 | else: 175 | if self.config.INFO_print: 176 | print("Could not find old network weights in ", model_address) 177 | else: 178 | if ifLoadedModel: 179 | # copy the network to target network 180 | self.session.run(self.copyTargetQNetworkOperation) 181 | else: 182 | if self.config.INFO_print: 183 | print("Could not find old network weights") 184 | else: 185 | if self.config.INFO_print: 186 | print("Previous models will not be used") 187 | 188 | 189 | # returns the operator which copies the Q network to the target network 190 | def copyTargetQNetworkFunc(self): 191 | operation = [] 192 | for i in range(self.config.NoHiLayer+1): 193 | operation += [ self.W_fcT[i].assign(self.W_fc[i]),self.b_fcT[i].assign(self.b_fc[i])] 194 | return operation 195 | 196 | def copyBaseNetworkFunc(self, weights, biases): 197 | operation = [] 198 | for i in range(self.config.NoHiLayer): # we ignored the last layer (Q-value) that its dimension is different 199 | operation += [ self.W_fc[i].assign(weights[i]),self.b_fc[i].assign(biases[i])] 200 | return operation 201 | 202 | def createInputs(self): 203 | # input layer 204 | with tf.name_scope('input'): 205 | self.stateInput = tf.placeholder("float",[None,self.config.multPerdInpt,self.config.stateDim]) 206 | with tf.name_scope('input_reshape'): 207 | self.stateInputFlat = tf.reshape(self.stateInput,[-1,self.inputSize]) 208 | 209 | def 
createQNetworkForTL(self, graph_name='Q'): 210 | # input layer 211 | W = [] 212 | b = [] 213 | layer = [] 214 | 215 | for j in range(self.config.NoHiLayer+1): 216 | # var = np.sqrt(1.0/(self.config.nodes[j] + 0.0)) 217 | if j == 0: 218 | # hidden layers 219 | name=graph_name+'-layer'+str(j+1) 220 | hidden, weights, biases = self.fc_layer(self.stateInputFlat, self.config.nodes[j], 221 | self.config.nodes[j+1], name, j) # act=tf.sigmoid 222 | elif j == self.config.NoHiLayer: 223 | # output value 224 | name=graph_name+'-layer'+str(j+1) 225 | QValue, weights, biases = self.fc_layer(layer[j-1], self.config.nodes[j], 226 | self.config.baseActionSize, name,j ,act=tf.identity) 227 | else: 228 | # hidden layers 229 | name=graph_name+'-layer'+str(j+1) 230 | hidden, weights, biases = self.fc_layer(layer[j-1], 231 | self.config.nodes[j], self.config.nodes[j+1], name, j) 232 | 233 | layer += [hidden] 234 | W += [weights] 235 | b += [biases] 236 | 237 | return W, b 238 | 239 | def createQNetwork(self, graph_name, initial_w=[], initial_b=[]): 240 | # initiate the weight variables 241 | W = [] 242 | b = [] 243 | layer = [] 244 | 245 | for j in range(self.config.NoHiLayer+1): 246 | # var = np.sqrt(1.0/(self.config.nodes[j] + 0.0)) 247 | if list(initial_w): 248 | w_init = initial_w[j] 249 | b_init = initial_b[j] 250 | else: 251 | w_init = [] 252 | b_init = [] 253 | 254 | if j == 0: 255 | # hidden layers 256 | name=graph_name+'-layer'+str(j+1) 257 | hidden, weights, biases = self.fc_layer(self.stateInputFlat, self.config.nodes[j], 258 | self.config.nodes[j+1], name, j, w_init, b_init) # act=tf.sigmoid 259 | elif j == self.config.NoHiLayer: 260 | # output value 261 | name=graph_name+'-layer'+str(j+1) 262 | QValue, weights, biases = self.fc_layer(layer[j-1], self.config.nodes[j], 263 | self.config.nodes[j+1], name,j, init_w=[], init_b=[] ,act=tf.identity) 264 | else: 265 | # hidden layers 266 | name=graph_name+'-layer'+str(j+1) 267 | hidden, weights, biases = self.fc_layer(layer[j-1], 268 | self.config.nodes[j], self.config.nodes[j+1], name, j, w_init, b_init) 269 | 270 | layer += [hidden] 271 | W += [weights] 272 | b += [biases] 273 | 274 | return QValue,W,b 275 | 276 | 277 | def copyTargetQNetwork(self): 278 | self.session.run(self.copyTargetQNetworkOperation) 279 | 280 | def createTrainingMethod(self): 281 | self.actionInput = tf.placeholder("float",[None,self.config.actionListLen]) 282 | self.yInput = tf.placeholder("float", [None]) 283 | Q_Action = tf.reduce_sum(tf.multiply(self.QValue, self.actionInput), reduction_indices = 1) # dim: batchSize *1 284 | with tf.name_scope('cost'): 285 | self.cost = tf.reduce_mean(tf.square(self.yInput - Q_Action)) 286 | tf.summary.scalar('cost', self.cost) 287 | #self.trainStep = tf.train.RMSPropOptimizer(self.config.lr0,self.config.decay,self.config.momentum,1e-6).minimize(self.cost) 288 | if self.config.ifDecayAdam: 289 | with tf.name_scope('train'): 290 | self.learning_rate = tf.train.exponential_decay(self.config.lr0, self.global_step, self.config.decayStep, self.config.decayRate, staircase=True) 291 | self.trainStep = tf.train.AdamOptimizer(self.learning_rate,0.9,0.999,1e-8).minimize(self.cost, global_step=self.global_step) 292 | else: 293 | with tf.name_scope('train'): 294 | self.trainStep = tf.train.AdamOptimizer(self.config.lr0,0.9,0.999,1e-8).minimize(self.cost) 295 | 296 | def trainQNetwork(self): 297 | # Step 1: obtain random minibatch from replay memory 298 | minibatch = random.sample(self.replayMemory,self.config.batchSize) 299 | state_batch = [data[0] for 
data in minibatch] #dim: each item is multPerInput*stateDim 300 | action_batch = [data[1] for data in minibatch] 301 | reward_batch = [data[2] for data in minibatch] 302 | nextState_batch = [data[3] for data in minibatch] 303 | 304 | # Step 2: calculate y 305 | y_batch = [] 306 | QValue_batch = self.QValueT.eval(feed_dict={self.stateInput:nextState_batch},session = self.session) 307 | # for i in range(0,self.config.batchSize): 308 | # terminal = minibatch[i][4] 309 | # if terminal: 310 | # y_batch.append(reward_batch[i]) 311 | # else: 312 | # y_batch.append(reward_batch[i] + self.config.gamma * np.max(QValue_batch[i])) 313 | y_batch = reward_batch + (1-np.array(minibatch)[:,4])*self.config.gamma * np.max(QValue_batch, axis=1) 314 | # dim yInput: batchSize*1 315 | # dim actionInput: batchSize*actionListLen 316 | # dim stateInput: batchSize**multPerInput*stateDim 317 | 318 | # check if lr < Minlr, stop its decreasing procedure 319 | lr = self.learning_rate.eval(session=self.session) 320 | if lr < self.config.Minlr and not self.iflrReseted: 321 | self.iflrReseted = True 322 | self.learning_rate = tf.train.exponential_decay(lr, self.global_step, 10000000, 1, staircase=True) 323 | 324 | feed_dict={ 325 | self.yInput : y_batch, 326 | self.actionInput : action_batch, 327 | self.stateInput : state_batch 328 | } 329 | if self.config.TB and (self.timeStep % self.config.tbLogInterval == 1): 330 | # run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) 331 | run_metadata = tf.RunMetadata() 332 | # summary, _ = self.session.run([self.merged, self.trainStep], feed_dict, feed_dictoptions=run_options, 333 | # run_metadata=run_metadata) 334 | summary, _ = self.session.run([self.merged, self.trainStep], feed_dict, 335 | run_metadata=run_metadata) 336 | self.train_writer.add_run_metadata(run_metadata, 'step%03d' % self.timeStep) 337 | self.train_writer.add_summary(summary, self.timeStep) 338 | if self.config.INFO_print: 339 | print('Adding run metadata for', self.timeStep) 340 | else: 341 | summary, _ = self.session.run([self.merged, self.trainStep], feed_dict) 342 | if self.config.TB and (self.timeStep%self.config.tbLogInterval==1): 343 | self.train_writer.add_summary(summary, self.timeStep) 344 | # self.trainStep.run(feed_dict, session=self.session) 345 | # self.session.run([self.trainStep], feed_dict) 346 | 347 | 348 | # grad_w= self.session.run([tf.norm(tf.gradients(self.cost, self.W_fc[3]))], feed_dict) 349 | # grad_b= self.session.run([tf.norm(tf.gradients(self.cost, self.b_fc[3]))], feed_dict) 350 | # print('grad is ', grad_w, grad_b) 351 | """trainResult = self.session.run(self.cost,feed_dict={ 352 | self.yInput : y_batch, 353 | self.actionInput : action_batch, 354 | self.stateInput : state_batch 355 | },session = self.session) 356 | print("TRAIN_RESULT", trainResult) """ 357 | 358 | # save network every saveInterval iteration 359 | if (self.timeStep+1) % self.config.saveInterval == 0: 360 | self.saver.save(self.session, self.addressName, global_step = self.timeStep) 361 | print("network weights are saved") 362 | 363 | if self.timeStep % self.config.dnnUpCnt == 0: 364 | self.copyTargetQNetwork() 365 | 366 | def train(self,nextObservation,action,reward,terminal,playType): 367 | # Considering the multi-period observation idea, merges the last $m-1$ periods with the new state. 
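# currentState has shape (multPerdInpt, stateDim): drop the oldest of the stacked observations
# and append the newest one, so the network always sees the most recent multPerdInpt periods.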
368 | newState = np.append(self.currentState[1:,:],[nextObservation],axis = 0) 369 | 370 | if playType == "train": 371 | if self.config.MultiAgent: 372 | if self.config.MultiAgentRun[self.agentNum]: 373 | self.replayMemory.append([self.currentState,action,reward,newState,terminal]) 374 | self.replaySize = len(self.replayMemory) 375 | else: 376 | self.replayMemory.append([self.currentState,action,reward,newState,terminal]) 377 | self.replaySize = len(self.replayMemory) 378 | 379 | if self.replaySize > self.config.maxReplayMem and self.config.MultiAgentRun[self.agentNum]: 380 | self.replayMemory.popleft() 381 | self.trainQNetwork() 382 | state = "train" 383 | self.timeStep += 1 384 | 385 | elif self.replaySize >= self.config.minReplayMem and self.config.MultiAgentRun[self.agentNum]: 386 | # Train the network 387 | state = "train" 388 | self.trainQNetwork() 389 | self.timeStep += 1 390 | else: 391 | state = "observe" 392 | 393 | if terminal and state == "train": 394 | self.epsilonReduce() 395 | 396 | # print(info) 397 | #print("AGENT", self.agentNum,"/TRAINING_ITER", self.timeStep, "/ STATE", state, \) 398 | #"/ EPSILON", self.epsilon 399 | 400 | self.currentState = newState 401 | 402 | def getDNNAction(self,playType): 403 | action = np.zeros(self.config.actionListLen) 404 | action_index = 0 405 | if playType == "train": 406 | if (random.random() <= self.epsilon) or (self.replaySize < self.config.minReplayMem): 407 | action_index = random.randrange(self.config.actionListLen) 408 | action[action_index] = 1 409 | else: 410 | QValue = self.QValue.eval(feed_dict= {self.stateInput:[self.currentState]},session = self.session)[0] 411 | action_index = np.argmax(QValue) 412 | action[action_index] = 1 413 | elif playType == "test" : 414 | QValue = self.QValue.eval(feed_dict= {self.stateInput:[self.currentState]},session = self.session)[0] 415 | action_index = np.argmax(QValue) 416 | action[action_index] = 1 417 | 418 | return action 419 | 420 | # this functions sets the current state of the game in the begining of each game 421 | def setInitState(self,observation): 422 | self.currentState = np.stack([observation for _ in range(self.config.multPerdInpt)], axis = 0) # multPerdInpt observations stacked. each row is an observation 423 | 424 | 425 | def epsilonBuild(self): # this function specifies how much we should deduct from /epsilon at each game 426 | betta = 0.8 427 | if self.config.maxEpisodesTrain != 0: 428 | epsilon_red = (self.config.epsilonBeg - self.config.epsilonEnd)/(self.config.maxEpisodesTrain*betta) 429 | else: 430 | epsilon_red = 0 431 | return epsilon_red 432 | 433 | def epsilonReduce(self): 434 | # Reduces the values of epsilon at each iteration of episode 435 | if self.epsilon >self.config.epsilonEnd: 436 | self.epsilon -= self.epsilonRed 437 | 438 | def deleteGraph(self): 439 | tf.reset_default_graph() 440 | self.sess.close() 441 | 442 | 443 | 444 | def fc_layer(self, input_tensor, input_dim, output_dim, layer_name, j_, init_w=[], init_b=[], act=tf.nn.relu): 445 | """Reusable code for making a simple fully connected neural net layer. 446 | 447 | It does a matrix multiply, bias add, and then uses relu to nonlinearize. 448 | It also sets up name scoping so that the resultant graph is easy to read, 449 | and adds a number of summary ops. 
450 | """ 451 | def variable_summaries(var): 452 | """Attach a lot of summaries to a Tensor (for TensorBoard visualization).""" 453 | with tf.name_scope('summaries'): 454 | mean = tf.reduce_mean(var) 455 | tf.summary.scalar('mean', mean) 456 | with tf.name_scope('stddev'): 457 | stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean))) 458 | tf.summary.scalar('stddev', stddev) 459 | tf.summary.scalar('max', tf.reduce_max(var)) 460 | tf.summary.scalar('min', tf.reduce_min(var)) 461 | tf.summary.histogram('histogram', var) 462 | 463 | def weight_variable(shape, j_, init_w=None): 464 | """Create a weight variable with appropriate initialization.""" 465 | if not list(init_w): 466 | initial = tf.random.truncated_normal(shape, stddev = 0.1) 467 | else: 468 | initial = tf.constant(init_w) 469 | if self.config.iftl and j_ < self.config.NoFixedLayer: 470 | return tf.Variable(initial, trainable=False) 471 | else: 472 | return tf.Variable(initial, trainable=True) 473 | 474 | def bias_variable(shape, j_, init_b=None): 475 | """Create a bias variable with appropriate initialization.""" 476 | if not list(init_b): 477 | initial = tf.constant(0.1, shape = shape) 478 | else: 479 | initial = tf.constant(init_b) 480 | if self.config.iftl and j_ < self.config.NoFixedLayer: 481 | return tf.Variable(initial, trainable=False) 482 | else: 483 | return tf.Variable(initial, trainable=True) 484 | 485 | # Adding a name scope ensures logical grouping of the layers in the graph. 486 | with tf.name_scope(layer_name): 487 | # This Variable will hold the state of the weights for the layer 488 | with tf.name_scope('weights'): 489 | weights = weight_variable([input_dim, output_dim], j_, init_w) 490 | variable_summaries(weights) 491 | with tf.name_scope('biases'): 492 | biases = bias_variable([output_dim], j_, init_b) 493 | variable_summaries(biases) 494 | with tf.name_scope('Wx_plus_b'): 495 | preactivate = tf.matmul(input_tensor, weights) + biases 496 | tf.summary.histogram('pre_activations', preactivate) 497 | activations = act(preactivate, name='activation') 498 | tf.summary.histogram('activations', activations) 499 | return activations, weights, biases 500 | 501 | -------------------------------------------------------------------------------- /clBeergame.py: -------------------------------------------------------------------------------- 1 | import time 2 | from time import gmtime, strftime 3 | import numpy as np 4 | import random 5 | from random import randint 6 | from BGAgent import Agent 7 | from plotting import plotting, savePlot, plotBaseStock 8 | import matplotlib.pyplot as plt 9 | import os 10 | from matplotlib import rc 11 | rc('text', usetex=True) 12 | import tensorflow as tf 13 | 14 | class clBeerGame(object): 15 | def __init__(self, config): 16 | self.config = config 17 | self.curGame = 0 # The number associated with the current game (counter of the game) 18 | self.curTime = 0 19 | self.totIterPlayed = 0 # total iterations of the game, played so far in this and previous games 20 | self.players = self.createAgent() # create the agents 21 | self.T = 0 22 | self.demand = [] 23 | self.playType = [] # "train" or "test" 24 | self.ifOptimalSolExist = self.config.ifOptimalSolExist 25 | self.getOptimalSol() 26 | self.totRew = 0 # it is reward of all players obtained for the current player. 
27 | self.resultTest = [] 28 | self.runnerMidlResults = [] # stores the results to use in runner comparisons 29 | self.runnerFinlResults = [] # stores the results to use in runner comparisons 30 | self.middleTestResult = [] # stores the whole middle results of bs, Strm, and random to avoid doing same tests multiple of times. 31 | self.runNumber = 0 # the runNumber which is used when use runner 32 | self.strNum = 0 # the runNumber which is used when use runner 33 | 34 | # createAgent : Create agent objects (agentNum,IL,OO,c_h,c_p,type,config) 35 | def createAgent(self): 36 | agentTypes = self.config.agentTypes 37 | return [Agent(i,self.config.ILInit[i], self.config.AOInit, self.config.ASInit[i], 38 | self.config.c_h[i], self.config.c_p[i], self.config.eta[i], 39 | agentTypes[i],self.config) for i in range(self.config.NoAgent)] 40 | 41 | # planHorizon : Find a random planning horizon 42 | def planHorizon(self): 43 | # TLow: minimum number for the planning horizon # TUp: maximum number for the planning horizon 44 | #output: The planning horizon which is chosen randomly. 45 | return randint(self.config.TLow,self.config.TUp) 46 | 47 | # this function resets the game for start of the new game 48 | def resetGame(self, demand, playType): 49 | self.playType = playType #"train" or "test" 50 | self.demand = demand 51 | self.curTime = 0 52 | if playType == "train": 53 | self.curGame += 1 54 | self.totIterPlayed += self.T 55 | self.T = self.planHorizon() 56 | else: 57 | self.T = self.config.Ttest 58 | 59 | # reset the required information of player for each episode 60 | for k in range(0,self.config.NoAgent): 61 | self.players[k].resetPlayer(self.T) 62 | 63 | # update OO when there are initial IL,AO,AS 64 | self.update_OO() 65 | 66 | # correction on cost at time T according to the cost of the other players 67 | def getTotRew(self): 68 | totRew = 0 69 | for i in range(self.config.NoAgent): 70 | # sum all rewards for the agents and make correction 71 | totRew += self.players[i].cumReward 72 | 73 | for i in range(self.config.NoAgent): 74 | self.players[i].curReward += self.players[i].eta*(totRew - self.players[i].cumReward) #/(self.T) 75 | 76 | # make correction to the rewards in the experience replay for all iterations of current game 77 | def distTotReward(self): 78 | totRew = 0 79 | optRew = 0.1 80 | for i in range(self.config.NoAgent): 81 | # sum all rewards for the agents and make correction 82 | totRew += self.players[i].cumReward 83 | totRew += optRew 84 | 85 | for i in range(self.config.NoAgent): 86 | for j in range(self.T): 87 | if self.config.NoAgent>1 and hasattr(self.players[i], 'brain') and (len(self.players[i].brain.replayMemory)>0): 88 | #self.players[i].brain.replayMemory[-1*(j+1)][2] += (np.power(self.config.alpha,j)/(self.config.NoAgent-1))*((totRew - self.players[i].cumReward)/(self.T)) # changes the last T periods in the replayMemory 89 | self.players[i].brain.replayMemory[-1*(j+1)][2] += (self.config.distCoeff/(self.config.NoAgent-1))*((totRew - self.players[i].cumReward)/(self.T)) # changes the last T periods in the replayMemory 90 | 91 | def getAction(self, k): 92 | 93 | # get action for training run 94 | if self.playType == "train": 95 | if self.players[k].compTypeTrain == "srdqn": 96 | self.players[k].action = np.zeros(self.config.actionListLen) 97 | self.players[k].action = self.players[k].brain.getDNNAction(self.playType) 98 | elif self.players[k].compTypeTrain == "Strm": 99 | self.players[k].action = np.zeros(self.config.actionListLenOpt) 100 | 
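# Sterman (Strm) heuristic: order the currently arrived order AO_t plus a correction
# alpha_b*(IL - a_b) for the inventory gap and betta_b*(OO - b_b) for the supply-line gap,
# truncated at zero and mapped to the closest value in actionListOpt.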
self.players[k].action[np.argmin(np.abs(np.array(self.config.actionListOpt)\ 101 | -max(0,round(self.players[k].AO[self.curTime] +\ 102 | self.players[k].alpha_b*(self.players[k].IL - self.players[k].a_b) +\ 103 | self.players[k].betta_b*(self.players[k].OO - self.players[k].b_b)))))] = 1 104 | elif self.players[k].compTypeTest == "rnd": 105 | self.players[k].action = np.zeros(self.config.actionListLen) 106 | a = np.random.randint(self.config.actionListLen) 107 | self.players[k].action[a] = 1 108 | elif self.players[k].compTypeTrain == "bs": 109 | self.players[k].action = np.zeros(self.config.actionListLenOpt) 110 | if self.config.demandDistribution == 2: 111 | if self.curTime and self.config.use_initial_BS <= 4: 112 | self.players[k].action [np.argmin(np.abs(np.array(self.config.actionListOpt)-\ 113 | max(0,(self.players[k].int_bslBaseStock - (self.players[k].IL + self.players[k].OO - self.players[k].AO[self.curTime]))) ))] = 1 114 | else: 115 | self.players[k].action [np.argmin(np.abs(np.array(self.config.actionListOpt)-\ 116 | max(0,(self.players[k].bsBaseStock - (self.players[k].IL + self.players[k].OO - self.players[k].AO[self.curTime]))) ))] = 1 117 | else: 118 | self.players[k].action [np.argmin(np.abs(np.array(self.config.actionListOpt)-\ 119 | max(0,(self.players[k].bsBaseStock - (self.players[k].IL + self.players[k].OO - self.players[k].AO[self.curTime]))) ))] = 1 120 | else: 121 | # not a valid player is defined. 122 | raise Exception('The player type is not defined or it is not a valid type.!') 123 | 124 | # get action for test runs 125 | elif self.playType == "test": 126 | if self.players[k].compTypeTest == "srdqn": 127 | self.players[k].action = np.zeros(self.config.actionListLen) 128 | if self.config.ifPlaySavedData: 129 | self.players[k].action[int(self.loaded_dqn_actions[self.curTime])] = 1 130 | else: 131 | self.players[k].action = self.players[k].brain.getDNNAction(self.playType) 132 | elif self.players[k].compTypeTest == "Strm": 133 | self.players[k].action = np.zeros(self.config.actionListLenOpt) 134 | 135 | self.players[k].action[np.argmin(np.abs(np.array(self.config.actionListOpt)-\ 136 | max(0,round(self.players[k].AO[self.curTime] +\ 137 | self.players[k].alpha_b*(self.players[k].IL - self.players[k].a_b) +\ 138 | self.players[k].betta_b*(self.players[k].OO - self.players[k].b_b)))))] = 1 139 | elif self.players[k].compTypeTest == "rnd": 140 | self.players[k].action = np.zeros(self.config.actionListLen) 141 | a = np.random.randint(self.config.actionListLen) 142 | self.players[k].action[a] = 1 143 | elif self.players[k].compTypeTest == "bs": 144 | self.players[k].action = np.zeros(self.config.actionListLenOpt) 145 | 146 | if self.config.demandDistribution == 2: 147 | if self.curTime and self.config.use_initial_BS <= 4: 148 | self.players[k].action [np.argmin(np.abs(np.array(self.config.actionListOpt)-\ 149 | max(0,(self.players[k].int_bslBaseStock - (self.players[k].IL + self.players[k].OO - self.players[k].AO[self.curTime]))) ))] = 1 150 | else: 151 | self.players[k].action [np.argmin(np.abs(np.array(self.config.actionListOpt)-\ 152 | max(0,(self.players[k].bsBaseStock - (self.players[k].IL + self.players[k].OO - self.players[k].AO[self.curTime]))) ))] = 1 153 | else: 154 | self.players[k].action [np.argmin(np.abs(np.array(self.config.actionListOpt)-\ 155 | max(0,(self.players[k].bsBaseStock - (self.players[k].IL + self.players[k].OO - self.players[k].AO[self.curTime]))) ))] = 1 156 | else: 157 | # not a valid player is defined. 
158 | raise Exception('The player type is not defined or it is not a valid type.!') 159 | # print(self.curTime, self.players[k].agentNum, "IL", self.players[k].IL, "OO", self.players[k].OO, "Op", self.players[k].bsBaseStock, self.players[k].bsBaseStock - (self.players[k].IL + self.players[k].OO)) 160 | 161 | # next action 162 | def next(self): 163 | # get a random leadtime 164 | leadTimeIn = randint(self.config.leadRecItemLow[self.config.NoAgent-1], self.config.leadRecItemUp[self.config.NoAgent-1]) 165 | # handle the most upstream recieved shipment 166 | self.players[self.config.NoAgent-1].AS[self.curTime + leadTimeIn] += self.players[self.config.NoAgent-1].actionValue(self.curTime, self.playType) 167 | 168 | for k in range(self.config.NoAgent-1,-1,-1): # [3,2,1,0] 169 | 170 | # get current IL and Backorder 171 | current_IL = max(0, self.players[k].IL) 172 | current_backorder = max(0, -self.players[k].IL) 173 | 174 | # TODO: We have get the AS and AO from the UI and update our AS and AO, so that code update the corresponding variables 175 | 176 | # increase IL and decrease OO based on the action, for the next period 177 | self.players[k].recieveItems(self.curTime) 178 | 179 | # observe the reward 180 | possible_shipment = min(current_IL + self.players[k].AS[self.curTime], current_backorder + self.players[k].AO[self.curTime]) 181 | 182 | # plan arrivals of the items to the downstream agent 183 | if self.players[k].agentNum > 0: 184 | leadTimeIn = randint(self.config.leadRecItemLow[k-1], self.config.leadRecItemUp[k-1]) 185 | self.players[k-1].AS[self.curTime + leadTimeIn] += possible_shipment 186 | 187 | # update IL 188 | self.players[k].IL -= self.players[k].AO[self.curTime] 189 | # observe the reward 190 | self.players[k].getReward() 191 | self.players[k].hist[-1][-2] = self.players[k].curReward 192 | self.players[k].hist2[-1][-2] = self.players[k].curReward 193 | 194 | # update next observation 195 | self.players[k].nextObservation = self.players[k].getCurState(self.curTime+1) 196 | 197 | if self.config.ifUseTotalReward: 198 | # correction on cost at time T 199 | if self.curTime == self.T: 200 | self.getTotRew() 201 | 202 | self.curTime +=1 203 | 204 | def handelAction(self): 205 | # get random lead time 206 | leadTime = randint(self.config.leadRecOrderLow[0], self.config.leadRecOrderUp[0]) 207 | # set AO 208 | self.players[0].AO[self.curTime] += self.demand[self.curTime] 209 | for k in range(0,self.config.NoAgent): 210 | self.getAction(k) 211 | 212 | self.players[k].srdqnBaseStock += [self.players[k].actionValue( \ 213 | self.curTime, self.playType) + self.players[k].IL + self.players[k].OO] 214 | 215 | # update hist for the plots 216 | self.players[k].hist += [[self.curTime,self.players[k].IL, self.players[k].OO,\ 217 | self.players[k].actionValue(self.curTime,self.playType),self.players[k].curReward, self.players[k].srdqnBaseStock[-1]]] 218 | 219 | if (self.players[k].compTypeTrain == "srdqn" and self.playType == "train") or (self.players[k].compTypeTest == "srdqn" and self.playType == "test"): 220 | self.players[k].hist2 += [[self.curTime,self.players[k].IL, self.players[k].OO, self.players[k].AO[self.curTime], self.players[k].AS[self.curTime], \ 221 | self.players[k].actionValue(self.curTime,self.playType), self.players[k].curReward, \ 222 | self.config.actionList[np.argmax(self.players[k].action)]]] 223 | 224 | else: 225 | self.players[k].hist2 += [[self.curTime,self.players[k].IL, self.players[k].OO, self.players[k].AO[self.curTime], self.players[k].AS[self.curTime], \ 226 | 
self.players[k].actionValue(self.curTime,self.playType), self.players[k].curReward, 0]] 227 | 228 | # updates OO and AO at time t+1 229 | self.players[k].OO += self.players[k].actionValue(self.curTime, self.playType) # open order level update 230 | leadTime = randint(self.config.leadRecOrderLow[k], self.config.leadRecOrderUp[k]) 231 | if self.players[k].agentNum < self.config.NoAgent-1: 232 | self.players[k+1].AO[self.curTime + leadTime] += self.players[k].actionValue(self.curTime, self.playType) # open order level update 233 | 234 | 235 | def playGame(self, demand, playType): 236 | self.resetGame(demand, playType) 237 | 238 | # run the game 239 | while self.curTime <= self.T: 240 | self.handelAction() 241 | self.next() 242 | 243 | 244 | for k in range(0,self.config.NoAgent): 245 | if (self.players[k].compTypeTrain == "srdqn" and playType == "train") or (self.players[k].compTypeTest == "srdqn" and playType == "test"): 246 | # control the learner agent 247 | 248 | self.players[k].brain.train(self.players[k].nextObservation,self.players[k].action, \ 249 | self.players[k].curReward,self.curTime == self.T,self.playType) 250 | if self.config.ifUsedistTotReward and playType == "train": 251 | self.distTotReward() 252 | return [-1*self.players[i].cumReward for i in range(0,self.config.NoAgent)] 253 | 254 | # check the Shang and Song (2003) condition, and if it works, obtains the base stock policy values for each agent 255 | def getOptimalSol(self): 256 | # if self.config.NoAgent !=1: 257 | if self.config.NoAgent !=1 and 1 == 2: 258 | # check the Shang and Song (2003) condition. 259 | for k in range(self.config.NoAgent-1): 260 | if not (self.players[k].c_h == self.players[k+1].c_h and self.players[k+1].c_p == 0): 261 | self.ifOptimalSolExist = False 262 | 263 | # if the Shang and Song (2003) condition satisfied, it runs the algorithm 264 | if self.ifOptimalSolExist == True: 265 | calculations = np.zeros((7,self.config.NoAgent)) 266 | for k in range(self.config.NoAgent): 267 | # DL_high 268 | calculations[0][k] = ((self.config.leadRecItemLow +self.config.leadRecItemUp + 2)/2 \ 269 | + (self.config.leadRecOrderLow+self.config.leadRecOrderUp + 2)/2)* \ 270 | (self.config.demandUp - self.config.demandLow- 1) 271 | if k > 0: 272 | calculations[0][k] += calculations[0][k-1] 273 | # probability_high 274 | nominator_ch = 0 275 | low_denominator_ch = 0 276 | for j in range(k,self.config.NoAgent): 277 | if j < self.config.NoAgent-1: 278 | nominator_ch += self.players[j+1].c_h 279 | low_denominator_ch += self.players[j].c_h 280 | if k == 0: 281 | high_denominator_ch = low_denominator_ch 282 | calculations[2][k] = (self.players[0].c_p + nominator_ch)/(self.players[0].c_p + low_denominator_ch + 0.0) 283 | # probability_low 284 | calculations[3][k] = (self.players[0].c_p + nominator_ch)/(self.players[0].c_p + high_denominator_ch + 0.0) 285 | # S_high 286 | calculations[4] = np.round(np.multiply(calculations[0],calculations[2])) 287 | # S_low 288 | calculations[5] = np.round(np.multiply(calculations[0],calculations[3])) 289 | # S_avg 290 | calculations[6] = np.round(np.mean(calculations[4:6], axis=0)) 291 | # S', set the base stock values into each agent. 
292 | for k in range(self.config.NoAgent): 293 | if k == 0: 294 | self.players[k].bsBaseStock = calculations[6][k] 295 | 296 | else: 297 | self.players[k].bsBaseStock = calculations[6][k] - calculations[6][k-1] 298 | if self.players[k].bsBaseStock < 0: 299 | self.players[k].bsBaseStock = 0 300 | elif self.config.NoAgent ==1: 301 | if self.config.demandDistribution==0: 302 | self.players[0].bsBaseStock = np.ceil(self.config.c_h[0]/(self.config.c_h[0]+self.config.c_p[0]+ 0.0))*((self.config.demandUp-self.config.demandLow-1)/2)*self.config.leadRecItemUp 303 | elif 1 == 1: 304 | f = self.config.f 305 | f_init = self.config.f_init 306 | for k in range(self.config.NoAgent): 307 | self.players[k].bsBaseStock = f[k] 308 | self.players[k].int_bslBaseStock = f_init[k] 309 | 310 | def doTestMid(self, demandTs): 311 | if self.config.ifPlaySavedData: 312 | for c,i in enumerate(self.config.agentTypes): 313 | if i == "srdqn": 314 | dnn_agent = c 315 | break 316 | 317 | self.resultTest = [] 318 | for i in range(self.config.testRepeatMid): 319 | if self.config.ifPlaySavedData: 320 | hist2 = np.load(os.path.join(self.config.model_dir,'DQN-0-player-'+str(dnn_agent)+'-'+str(i)+'.npy')) 321 | self.loaded_dqn_actions = hist2[:,7] 322 | self.doTest(i,demandTs[i]) 323 | 324 | print("---------------------------------------------------------------------------------------") 325 | resultSummary = np.array(self.resultTest).mean(axis=0).tolist() 326 | 327 | 328 | 329 | result_srdqn= ', '.join(map("{:.2f}".format, resultSummary[0])) 330 | result_rand= ', '.join(map("{:.2f}".format, resultSummary[1])) 331 | result_strm= ', '.join(map("{:.2f}".format, resultSummary[2])) 332 | if self.ifOptimalSolExist: 333 | result_bs= ', '.join(map("{:.2f}".format, resultSummary[3])) 334 | print('SUMMARY; {0:s}; ITER= {1:d}; SRDQN= [{2:s}]; SUM = {3:2.4f}; Rand= [{4:s}]; SUM = {5:2.4f}; STRM= [{6:s}]; SUM = {7:2.4f}; BS= [{8:s}]; SUM = {9:2.4f}'.format(strftime("%Y-%m-%d %H:%M:%S", gmtime()) , 335 | self.curGame, result_srdqn, sum(resultSummary[0]), 336 | result_rand, sum(resultSummary[1]), 337 | result_strm, sum(resultSummary[2]), 338 | result_bs, sum(resultSummary[3]))) 339 | 340 | else: 341 | print('SUMMARY; {0:s}; ITER= {1:d}; SRDQN= [{2:s}]; SUM = {3:2.4f}; Rand= [{4:s}]; SUM = {5:2.4f}; STRM= [{6:s}]; SUM = {7:2.4f}'.format(strftime("%Y-%m-%d %H:%M:%S", gmtime()) , 342 | self.curGame, result_srdqn, sum(resultSummary[0]), 343 | result_rand, sum(resultSummary[1]), 344 | result_strm, sum(resultSummary[2]))) 345 | 346 | print("=======================================================================================") 347 | 348 | 349 | 350 | 351 | def doTest(self, m,demand): 352 | import matplotlib.pyplot as plt 353 | 354 | if (self.config.ifSaveFigure) and (self.curGame in range(self.config.saveFigInt[0],self.config.saveFigInt[1])): 355 | plt.figure(self.curGame, figsize=(12, 8), dpi=80, facecolor='w', edgecolor='k') 356 | 357 | self.demand = demand 358 | # use dnn to get output. 359 | Rsltdnn,plt = self.tester(self.config.agentTypes ,plt, 'b', 'DQN' ,m) 360 | baseStockdata = self.players[0].srdqnBaseStock 361 | 362 | # check some condition to avoid doing same test middle again. 363 | if ((self.config.ifSaveFigure) and (self.curGame in range(self.config.saveFigInt[0],self.config.saveFigInt[1]))) \ 364 | or (self.curGame >= self.config.maxEpisodesTrain-1) or (len(self.middleTestResult) < self.config.testRepeatMid): 365 | 366 | # use random to get output. 
367 | RsltRnd ,plt= self.tester(["rnd","rnd","rnd","rnd"], plt,'y21', 'RAND' ,m) 368 | 369 | # use formual to get output. 370 | RsltStrm ,plt= self.tester(["Strm","Strm","Strm","Strm"],plt, 'g', 'Strm' ,m) 371 | 372 | # use optimal strategy to get output, if it works. 373 | if self.ifOptimalSolExist: 374 | if self.config.agentTypes == ["srdqn", "Strm","Strm","Strm"]: 375 | Rsltbs ,plt= self.tester(["bs","Strm","Strm","Strm"],plt, 'r', 'Strm-BS' ,m) 376 | elif self.config.agentTypes == ["Strm", "srdqn","Strm","Strm"]: 377 | Rsltbs ,plt= self.tester(["Strm","bs","Strm","Strm"],plt, 'r', 'Strm-BS' ,m) 378 | elif self.config.agentTypes == ["Strm", "Strm","srdqn","Strm"]: 379 | Rsltbs ,plt= self.tester(["Strm","Strm","bs","Strm"],plt, 'r', 'Strm-BS' ,m) 380 | elif self.config.agentTypes == ["Strm", "Strm","Strm","srdqn"]: 381 | Rsltbs ,plt= self.tester(["Strm","Strm","Strm","bs"],plt, 'r', 'Strm-BS' ,m) 382 | elif self.config.agentTypes == ["srdqn", "rnd","rnd","rnd"]: 383 | Rsltbs ,plt= self.tester(["bs","rnd","rnd","rnd"],plt, 'r', 'RND-BS' ,m) 384 | elif self.config.agentTypes == ["rnd", "srdqn","rnd","rnd"]: 385 | Rsltbs ,plt= self.tester(["rnd","bs","rnd","rnd"],plt, 'r', 'RND-BS' ,m) 386 | elif self.config.agentTypes == ["rnd", "rnd","srdqn","rnd"]: 387 | Rsltbs ,plt= self.tester(["rnd","rnd","bs","rnd"],plt, 'r', 'RND-BS' ,m) 388 | elif self.config.agentTypes == ["rnd", "rnd","rnd","srdqn"]: 389 | Rsltbs ,plt= self.tester(["rnd","rnd","rnd","bs"],plt, 'r', 'RND-BS' ,m) 390 | else: 391 | Rsltbs ,plt= self.tester(["bs","bs","bs","bs"],plt, 'r', 'BS' ,m) 392 | # hold the results of the optimal solution 393 | self.middleTestResult += [[RsltRnd,RsltStrm,Rsltbs]] 394 | else: 395 | self.middleTestResult += [[RsltRnd,RsltStrm]] 396 | 397 | else: 398 | # return the obtained results into their lists 399 | RsltRnd = self.middleTestResult[m][0] 400 | RsltStrm = self.middleTestResult[m][1] 401 | if self.ifOptimalSolExist: 402 | Rsltbs = self.middleTestResult[m][2] 403 | 404 | # save the figure 405 | if self.config.ifSaveFigure and (self.curGame in range(self.config.saveFigInt[0],self.config.saveFigInt[1])): 406 | savePlot(self.players, self.curGame, Rsltdnn ,RsltStrm, Rsltbs , self.config, m) 407 | 408 | result_srdqn = ', '.join(map("{:.2f}".format, Rsltdnn)) 409 | result_rand = ', '.join(map("{:.2f}".format, RsltRnd)) 410 | result_strm = ', '.join(map("{:.2f}".format, RsltStrm)) 411 | if self.ifOptimalSolExist: 412 | result_bs = ', '.join(map("{:.2f}".format, Rsltbs)) 413 | print('output; {0:s}; Iter= {1:s}; SRDQN= [{2:s}]; sum = {3:2.4f}; Rand= [{4:s}]; sum = {5:2.4f}; Strm= [{6:s}]; sum = {7:2.4f}; BS= [{8:s}]; sum = {9:2.4f}'.format( 414 | strftime("%Y-%m-%d %H:%M:%S", gmtime()) , str(str(self.curGame)+"-"+str(m)), result_srdqn , sum(Rsltdnn), 415 | result_rand, sum(RsltRnd), 416 | result_strm, sum(RsltStrm), 417 | result_bs, sum(Rsltbs))) 418 | self.resultTest += [[Rsltdnn,RsltRnd,RsltStrm,Rsltbs]] 419 | 420 | else: 421 | print('output; {0:s}; Iter= {1:s}; SRDQN= [{2:s}]; sum = {3:2.4f}; Rand= [{4:s}]; sum = {5:2.4f}; Strm= [{6:s}]; sum = {7:2.4f}'.format(strftime("%Y-%m-%d %H:%M:%S", gmtime()) , 422 | str(str(self.curGame)+"-"+str(m)), result_srdqn, sum(Rsltdnn), 423 | result_rand, sum(RsltRnd), 424 | result_strm, sum(RsltStrm))) 425 | 426 | self.resultTest += [[Rsltdnn,RsltRnd,RsltStrm]] 427 | 428 | return sum(Rsltdnn) 429 | 430 | def tester(self,testType,plt, colori, labeli ,m): 431 | 432 | # set computation type for test 433 | for k in range(0,self.config.NoAgent): 434 | 
self.players[k].compTypeTest = testType[k] 435 | # run the episode to get the results. 436 | result = self.playGame(self.demand,"test") 437 | # add the results into the figure 438 | if self.config.ifSaveFigure and (self.curGame in range(self.config.saveFigInt[0],self.config.saveFigInt[1])) and (testType[0] != "rnd"): 439 | plt = plotting(plt,[np.array(self.players[i].hist) for i in range(0,self.config.NoAgent)],colori, labeli) 440 | if self.config.ifsaveHistInterval and ((self.curGame == 0) or (self.curGame == 1) or (self.curGame == 2) or(self.curGame == 3) or ((self.curGame - 1) % self.config.saveHistInterval == 0)\ 441 | or ((self.curGame) % self.config.saveHistInterval == 0) or ((self.curGame) % self.config.saveHistInterval == 1) \ 442 | or ((self.curGame) % self.config.saveHistInterval == 2)) : 443 | for k in range(0,self.config.NoAgent): 444 | name = labeli + "-" + str(self.curGame) + "-" + "player" + "-" + str(k)+ "-" + str(m) 445 | np.save(os.path.join(self.config.model_dir,name), np.array(self.players[k].hist2)) 446 | 447 | # save the figure of base stocks 448 | # if self.config.ifSaveFigure and (self.curGame in range(self.config.saveFigInt[0],self.config.saveFigInt[1])): 449 | # for k in range(self.config.NoAgent): 450 | # if self.players[k].compTypeTest == 'dnn': 451 | # plotBaseStock(self.players[k].srdqnBaseStock, 'b', 'base stock of agent '+ str(self.players[k].agentNum), self.curGame, self.config, m) 452 | 453 | return result,plt 454 | 455 | 456 | def update_OO(self): 457 | for k in range(0,self.config.NoAgent): 458 | if k < self.config.NoAgent - 1: 459 | self.players[k].OO = sum(self.players[k+1].AO) + sum(self.players[k].AS) 460 | else: 461 | self.players[k].OO = sum(self.players[k].AS) 462 | 463 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import numpy as np 4 | 5 | def str2bool(v): 6 | return v.lower() in ('true', '1') 7 | 8 | arg_lists = [] 9 | parser = argparse.ArgumentParser() 10 | 11 | def add_argument_group(name): 12 | arg = parser.add_argument_group(name) 13 | arg_lists.append(arg) 14 | return arg 15 | 16 | # crm 17 | game_arg = add_argument_group('BeerGame') 18 | game_arg.add_argument('--task', type=str, default='bg') 19 | game_arg.add_argument('--fixedAction', type=str2bool, default='False', help='if you want to have actions in [0,actionMax] set it to True. 
With False it will be set to [actionLow, actionUp]')
20 | game_arg.add_argument('--observation_data', type=str2bool, default=False, help='if it is True, then it uses the data that is generated based on a few real-world observations')
21 | game_arg.add_argument('--data_id', type=int, default=22, help='the default item id for the basket dataset')
22 | game_arg.add_argument('--TLow', type=int, default=100, help='duration of one GAME (lower bound)')
23 | game_arg.add_argument('--TUp', type=int, default=100, help='duration of one GAME (upper bound)')
24 | game_arg.add_argument('--demandDistribution', type=int, default=0, help='0=uniform, 1=normal distribution, 2=the sequence of 4,4,4,4,8,..., 3= basket data, 4= forecast data')
25 | game_arg.add_argument('--scaled', type=str2bool, default=False, help='if true, it uses the existing scaled parameters (if any)')
26 | game_arg.add_argument('--demandLow', type=int, default=0, help='the lower bound of random demand')
27 | game_arg.add_argument('--demandUp', type=int, default=3, help='the upper bound of random demand')
28 | game_arg.add_argument('--demandMu', type=float, default=10, help='the mu of the normal distribution for demand')
29 | game_arg.add_argument('--demandSigma', type=float, default=2, help='the sigma of the normal distribution for demand')
30 | game_arg.add_argument('--actionMax', type=int, default=2, help='it works when fixedAction is True')
31 | game_arg.add_argument('--actionUp', type=int, default=2, help='upper bound on the order decision, it works when fixedAction is False')
32 | game_arg.add_argument('--actionLow', type=int, default=-2, help='lower bound on the order decision, it works when fixedAction is False')
33 | game_arg.add_argument('--action_step', type=int, default=1, help='The action value obtained by the dnn is multiplied by this value')
34 | game_arg.add_argument('--actionList', type=list, default=[], help='The list of the available actions')
35 | game_arg.add_argument('--actionListLen', type=int, default=0, help='the length of the action list')
36 | game_arg.add_argument('--actionListOpt', type=int, default=0 , help='the action list which is used by the optimal (bs) and Sterman players')
37 | game_arg.add_argument('--actionListLenOpt', type=int, default=0, help='the length of actionListOpt')
38 | game_arg.add_argument('--agentTypes', type=list, default=['dnn','dnn','dnn','dnn'], help='the player types')
39 | game_arg.add_argument('--agent_type1', type=str, default='dnn', help='the player type for agent 1, it can be dnn, Strm, bs, rnd')
40 | game_arg.add_argument('--agent_type2', type=str, default='dnn', help='the player type for agent 2, it can be dnn, Strm, bs, rnd')
41 | game_arg.add_argument('--agent_type3', type=str, default='dnn', help='the player type for agent 3, it can be dnn, Strm, bs, rnd')
42 | game_arg.add_argument('--agent_type4', type=str, default='dnn', help='the player type for agent 4, it can be dnn, Strm, bs, rnd')
43 | game_arg.add_argument('--NoAgent', type=int, default=1, help='number of agents, currently it should be in {1,2,3,4}')
44 | game_arg.add_argument('--cp1', type=float, default=2.0, help='shortage cost of player 1')
45 | game_arg.add_argument('--cp2', type=float, default=0.0, help='shortage cost of player 2')
46 | game_arg.add_argument('--cp3', type=float, default=0.0, help='shortage cost of player 3')
47 | game_arg.add_argument('--cp4', type=float, default=0.0, help='shortage cost of player 4')
48 | game_arg.add_argument('--ch1', type=float, default=2.0, help='holding cost of player 1')
49 | game_arg.add_argument('--ch2', type=float, default=2.0, help='holding cost of player 2')
50 | game_arg.add_argument('--ch3', type=float, default=2.0, help='holding cost of player 3')
51 | game_arg.add_argument('--ch4', type=float, default=2.0, help='holding cost of player 4')
52 | game_arg.add_argument('--alpha_b1', type=float, default=-0.5, help='alpha of Sterman formula parameter for player 1')
53 | game_arg.add_argument('--alpha_b2', type=float, default=-0.5, help='alpha of Sterman formula parameter for player 2')
54 | game_arg.add_argument('--alpha_b3', type=float, default=-0.5, help='alpha of Sterman formula parameter for player 3')
55 | game_arg.add_argument('--alpha_b4', type=float, default=-0.5, help='alpha of Sterman formula parameter for player 4')
56 | game_arg.add_argument('--betta_b1', type=float, default=-0.2, help='beta of Sterman formula parameter for player 1')
57 | game_arg.add_argument('--betta_b2', type=float, default=-0.2, help='beta of Sterman formula parameter for player 2')
58 | game_arg.add_argument('--betta_b3', type=float, default=-0.2, help='beta of Sterman formula parameter for player 3')
59 | game_arg.add_argument('--betta_b4', type=float, default=-0.2, help='beta of Sterman formula parameter for player 4')
60 | game_arg.add_argument('--eta', type=list, default=[0,4,4,4], help='the total cost regularizer')
61 | game_arg.add_argument('--distCoeff', type=int, default=20, help='the total cost regularizer')
62 | game_arg.add_argument('--gameConfig', type=int, default=3, help='if it is "0", it uses the current "agentType", otherwise sets agent types according to the function setAgentType() in this file.')
63 | game_arg.add_argument('--ifUseTotalReward', type=str2bool, default='False', help='if you want to have the total rewards in the experience replay, set it to true.')
64 | game_arg.add_argument('--ifUsedistTotReward', type=str2bool, default='True', help='If True, it applies a correction to the rewards in the experience replay for all iterations of the current game')
65 | game_arg.add_argument('--ifUseASAO', type=str2bool, default='True', help='if use AS and AO, i.e., received shipment and received orders in the input of DNN')
66 | game_arg.add_argument('--ifUseActionInD', type=str2bool, default='False', help='if use action in the input of DNN')
67 | game_arg.add_argument('--stateDim', type=int, default=5, help='Number of elements in the state descriptor - Depends on ifUseASAO')
68 | game_arg.add_argument('--iftl', type=str2bool, default=False, help='if apply transfer learning')
69 | game_arg.add_argument('--ifTransferFromSmallerActionSpace', type=str2bool, default=False, help='if you want to transfer knowledge from a network with a different action space size.')
70 | game_arg.add_argument('--baseActionSize', type=int, default=5, help='if ifTransferFromSmallerActionSpace is true, this determines the size of the action space of the saved network')
71 | game_arg.add_argument('--tlBaseBrain', type=int, default=3, help='the gameConfig of the base network for re-training with transfer-learning')
72 | game_arg.add_argument('--baseDemandDistribution', type=int, default=0, help='same as the demandDistribution')
73 | game_arg.add_argument('--MultiAgent', type=str2bool, default=False, help='if run multi-agent RL model, not fully operational')
74 | game_arg.add_argument('--MultiAgentRun', type=list, default=[True, True, True, True], help='In the multi-RL setting, it determines which agents should get training.')
75 | game_arg.add_argument('--if_use_AS_t_plus_1', type=str2bool, default='False', help='if use 
AS[t+1], not AS[t] in the input of DNN') 76 | game_arg.add_argument('--ifSinglePathExist', type=str2bool, default=False, help='If true it uses the predefined path in pre_model_dir and does not merge it with demandDistribution.') 77 | game_arg.add_argument('--ifPlaySavedData', type=str2bool, default=False, help='If true it uses the saved actions which are read from file.') 78 | 79 | #################### parameters of the leadtimes ######################## 80 | leadtimes_arg = add_argument_group('leadtimes') 81 | leadtimes_arg.add_argument('--leadRecItemLow', type=list, default=[2,2,2,4], help='the min lead time for receiving items') 82 | leadtimes_arg.add_argument('--leadRecItemUp', type=list, default=[2,2,2,4], help='the max lead time for receiving items') 83 | leadtimes_arg.add_argument('--leadRecOrderLow', type=int, default=[2,2,2,0], help='the min lead time for receiving orders') 84 | leadtimes_arg.add_argument('--leadRecOrderUp', type=int, default=[2,2,2,0], help='the max lead time for receiving orders') 85 | leadtimes_arg.add_argument('--ILInit', type=list, default=[0,0,0,0], help='') 86 | leadtimes_arg.add_argument('--AOInit', type=list, default=[0,0,0,0], help='') 87 | leadtimes_arg.add_argument('--ASInit', type=list, default=[0,0,0,0], help='the initial shipment of each agent') 88 | leadtimes_arg.add_argument('--leadRecItem1', type=int, default=2, help='the min lead time for receiving items') 89 | leadtimes_arg.add_argument('--leadRecItem2', type=int, default=2, help='the min lead time for receiving items') 90 | leadtimes_arg.add_argument('--leadRecItem3', type=int, default=2, help='the min lead time for receiving items') 91 | leadtimes_arg.add_argument('--leadRecItem4', type=int, default=2, help='the min lead time for receiving items') 92 | leadtimes_arg.add_argument('--leadRecOrder1', type=int, default=2, help='the min lead time for receiving order') 93 | leadtimes_arg.add_argument('--leadRecOrder2', type=int, default=2, help='the min lead time for receiving order') 94 | leadtimes_arg.add_argument('--leadRecOrder3', type=int, default=2, help='the min lead time for receiving order') 95 | leadtimes_arg.add_argument('--leadRecOrder4', type=int, default=2, help='the min lead time for receiving order') 96 | leadtimes_arg.add_argument('--ILInit1', type=int, default=0, help='the initial inventory level of the agent') 97 | leadtimes_arg.add_argument('--ILInit2', type=int, default=0, help='the initial inventory level of the agent') 98 | leadtimes_arg.add_argument('--ILInit3', type=int, default=0, help='the initial inventory level of the agent') 99 | leadtimes_arg.add_argument('--ILInit4', type=int, default=0, help='the initial inventory level of the agent') 100 | leadtimes_arg.add_argument('--AOInit1', type=int, default=0, help='the initial arriving order of the agent') 101 | leadtimes_arg.add_argument('--AOInit2', type=int, default=0, help='the initial arriving order of the agent') 102 | leadtimes_arg.add_argument('--AOInit3', type=int, default=0, help='the initial arriving order of the agent') 103 | leadtimes_arg.add_argument('--AOInit4', type=int, default=0, help='the initial arriving order of the agent') 104 | leadtimes_arg.add_argument('--ASInit1', type=int, default=0, help='the initial arriving shipment of the agent') 105 | leadtimes_arg.add_argument('--ASInit2', type=int, default=0, help='the initial arriving shipment of the agent') 106 | leadtimes_arg.add_argument('--ASInit3', type=int, default=0, help='the initial arriving shipment of the agent') 107 | 
leadtimes_arg.add_argument('--ASInit4', type=int, default=0, help='the initial arriving shipment of the agent')
108 |
109 |
110 | #################### DQN setting ####################
111 | DQN_arg = add_argument_group('DQN')
112 | DQN_arg.add_argument('--maxEpisodesTrain', type=int, default=60100, help='number of GAMES to be trained')
113 | DQN_arg.add_argument('--NoHiLayer', type=int, default=3, help='number of hidden layers')
114 | DQN_arg.add_argument('--NoFixedLayer', type=int, default=1, help='number of fixed hidden layers')
115 | DQN_arg.add_argument('--node1', type=int, default=180, help='the number of nodes in the first hidden layer')
116 | DQN_arg.add_argument('--node2', type=int, default=130, help='the number of nodes in the second hidden layer')
117 | DQN_arg.add_argument('--node3', type=int, default=61, help='the number of nodes in the third hidden layer')
118 | DQN_arg.add_argument('--nodes', type=list, default=[], help='')
119 |
120 | DQN_arg.add_argument('--seed', type=int, default=4, help='the random seed for the DNN')
121 | DQN_arg.add_argument('--batchSize', type=int, default=64, help='the batch size which is used in training the DNN')
122 | DQN_arg.add_argument('--minReplayMem', type=int, default=50000, help='the minimum experience replay size at which dnn training starts')
123 | DQN_arg.add_argument('--maxReplayMem', type=int, default=1000000, help='the maximum size of the replay memory')
124 | DQN_arg.add_argument('--alpha', type=float, default=.97, help='learning rate for total reward distribution')
125 | DQN_arg.add_argument('--gamma', type=float, default=.99, help='discount factor for Q-learning')
126 | DQN_arg.add_argument('--saveInterval', type=int, default=10000, help='every xx training iterations, saves the network of the game')
127 | DQN_arg.add_argument('--epsilonBeg', type=float, default=0.9, help='')
128 | DQN_arg.add_argument('--epsilonEnd', type=float, default=0.1, help='')
129 |
130 | DQN_arg.add_argument('--lr0', type=float, default=0.00025 , help='the learning rate')
131 | DQN_arg.add_argument('--Minlr', type=float, default=1e-8, help='the minimum learning rate; if the learning rate drops below it, it is fixed there')
132 | DQN_arg.add_argument('--ifDecayAdam', type=str2bool, default=True, help='decays the learning rate of the adam optimizer')
133 | DQN_arg.add_argument('--decayStep', type=int, default=10000, help='the decay step of the learning rate')
134 | DQN_arg.add_argument('--decayRate', type=float, default=0.98, help='the rate to reduce the lr at every decayStep')
135 |
136 | DQN_arg.add_argument('--display', type=int, default=1000, help='the number of iterations between two displays of results.')
137 | DQN_arg.add_argument('--momentum', type=float, default=0.9, help='the momentum value')
138 | DQN_arg.add_argument('--dnnUpCnt', type=int, default=10000, help='the number of iterations between updates of the dnn weights')
139 | DQN_arg.add_argument('--multPerdInpt', type=int, default=10, help='Number of history records which we feed into the DNN')
140 |
141 |
142 | #################### Utilities ####################
143 | utility_arg = add_argument_group('Utilities')
144 | utility_arg.add_argument('--address', type=str, default="", help='the address which is used to save the model files')
145 | utility_arg.add_argument('--ifUsePreviousModel', type=str2bool, default='False', help='if there is a saved model, a False value of this parameter will overwrite it.')
146 | utility_arg.add_argument('--number_cpu_active', type=int, default=5, help='number of cpu cores')
147 | utility_arg.add_argument('--gpu_memory_fraction', type=float, default=0.1, help='the fraction of gpu memory to use')
148 | # Dirs
149 | utility_arg.add_argument('--load_path', type=str, default='', help='The directory to load the models')
150 | utility_arg.add_argument('--log_dir', type=str, default=os.path.expanduser('./logs/'), help='')
151 | utility_arg.add_argument('--pre_model_dir', type=str, default=os.path.expanduser('./pre_model'),help='')
152 | utility_arg.add_argument('--action_dir', type=str, default=os.path.expanduser('./'),help='if ifPlaySavedData is true, it uses this path to load actions')
153 | utility_arg.add_argument('--model_dir', type=str, default='./',help='')
154 | utility_arg.add_argument('--TB', type=str2bool, default=False, help='set to True to use TensorBoard and save the required data for it.')
155 | utility_arg.add_argument('--INFO_print', type=str2bool, default=True, help='if true, it does not print anything at all.')
156 | utility_arg.add_argument('--tbLogInterval', type=int, default=80000, help='the interval (in games) between TensorBoard logs')
157 |
158 | #################### testing ####################
159 | test_arg = add_argument_group('testing')
160 | test_arg.add_argument('--testRepeatMid', type=int, default=50, help='the number of episodes which is used for testing in the middle of training')
161 | test_arg.add_argument('--testInterval', type=int, default=100, help='every xx games compute "test error"')
162 | test_arg.add_argument('--ifSaveFigure', type=str2bool, default=True, help='if it is True, save the figures in each testing.')
163 | test_arg.add_argument('--if_titled_figure', type=str2bool, default='True', help='if it is True, save the figures with details in the title.')
164 | test_arg.add_argument('--saveFigInt', type=list, default=[59990,60000], help='')
165 | test_arg.add_argument('--saveFigIntLow', type=int, default=59990, help='')
166 | test_arg.add_argument('--saveFigIntUp', type=int, default=60000, help='')
167 | test_arg.add_argument('--ifsaveHistInterval', type=str2bool, default=False, help='if True, every xx games save details of the episode')
168 | test_arg.add_argument('--saveHistInterval', type=int, default=50000, help='every xx games save details of the play')
169 | test_arg.add_argument('--Ttest', type=int, default=100, help='it defines the number of periods in the test cases')
170 | test_arg.add_argument('--ifOptimalSolExist', type=str2bool, default=True, help='if the instance has an optimal base stock policy, set it to True, otherwise it should be False.')
171 | test_arg.add_argument('--f1', type=float, default=8, help='base stock policy decision of player 1')
172 | test_arg.add_argument('--f2', type=float, default=8, help='base stock policy decision of player 2')
173 | test_arg.add_argument('--f3', type=float, default=0, help='base stock policy decision of player 3')
174 | test_arg.add_argument('--f4', type=float, default=0, help='base stock policy decision of player 4')
175 | test_arg.add_argument('--f_init', type=list, default=[32,32,32,24], help='base stock policy decision for 4 time-steps on the C(4,8) demand distribution')
176 | test_arg.add_argument('--use_initial_BS', type=str2bool, default=False, help='set it to True to use f_init')
177 |
178 | #################### reporting ####################
179 | reporting_arg = add_argument_group('reporting')
180 | reporting_arg.add_argument('--Rsltdnn', type=list, default=[], help='the result of dnn play tests will be saved here')
181 | reporting_arg.add_argument('--RsltRnd', type=list, default=[], help='the result of random 
play tests will be saved here') 182 | reporting_arg.add_argument('--RsltStrm', type=list, default=[], help='the result of heuristic fomula play tests will be saved here') 183 | reporting_arg.add_argument('--Rsltbs', type=list, default=[], help='the result of optimal play tests will be saved here') 184 | reporting_arg.add_argument('--ifSaveHist', type=str2bool, default='False', help='if it is true, saves history, prediction, and the randBatch in each period, WARNING: just make it True in small runs, it saves huge amount of files.') 185 | 186 | 187 | #buildActionList: actions for the beer game problem 188 | def buildActionList(config): 189 | aDiv = 1 # difference in the action list 190 | if config.fixedAction: 191 | actions = list(range(0,config.actionMax+1,aDiv)) # If you put the second argument =11, creates an actionlist from 0..xx 192 | else: 193 | actions = list(range(config.actionLow,config.actionUp+1,aDiv) ) 194 | return actions 195 | 196 | # specify the dimension of the state of the game 197 | def getStateDim(config): 198 | if config.ifUseASAO: 199 | stateDim=5 200 | else: 201 | stateDim=3 202 | 203 | if config.ifUseActionInD: 204 | stateDim += 1 205 | 206 | return stateDim 207 | 208 | # agents 1=[dnn,dnn,dnn,dnn]; 2=[dnn,Strm,Strm,Strm]; 3=[dnn,bs,bs,bs] 209 | def setAgentType(config): 210 | if config.gameConfig == 1: # all agents are run by DNN- Also, load-model loads from brain-3+agentNum- 211 | # Also multi-agent with double target uses this gameConfig. 212 | config.agentTypes = ["srdqn", "srdqn","srdqn","srdqn"] 213 | config.to_prev_ai = [3,-1,-1,-1] 214 | elif config.gameConfig == 2: # one agent is run by DNN- Also, load-model loads from brain-3+agentNum- 215 | # Also multi-agent with double target uses this gameConfig. 216 | config.agentTypes = ["srdqn", "srdqn","srdqn","srdqn"] 217 | config.to_prev_ai = [3,-1,-1,-1] 218 | elif config.gameConfig == 3: 219 | config.agentTypes = ["srdqn", "bs","bs","bs"] 220 | elif config.gameConfig == 4: 221 | config.agentTypes = ["bs", "srdqn","bs","bs"] 222 | elif config.gameConfig == 5: 223 | config.agentTypes = ["bs", "bs","srdqn","bs"] 224 | elif config.gameConfig == 6: 225 | config.agentTypes = ["bs", "bs","bs","srdqn"] 226 | elif config.gameConfig == 7: 227 | config.agentTypes = ["srdqn", "Strm","Strm","Strm"] 228 | elif config.gameConfig == 8: 229 | config.agentTypes = ["Strm", "srdqn","Strm","Strm"] 230 | elif config.gameConfig == 9: 231 | config.agentTypes = ["Strm", "Strm","srdqn","Strm"] 232 | elif config.gameConfig == 10: 233 | config.agentTypes = ["Strm", "Strm","Strm","srdqn"] 234 | elif config.gameConfig == 11: 235 | config.agentTypes = ["srdqn", "rnd","rnd","rnd"] 236 | elif config.gameConfig == 12: 237 | config.agentTypes = ["rnd", "srdqn","rnd","rnd"] 238 | elif config.gameConfig == 13: 239 | config.agentTypes = ["rnd", "rnd","srdqn","rnd"] 240 | elif config.gameConfig == 14: 241 | config.agentTypes = ["rnd", "rnd","rnd","srdqn"] 242 | elif config.gameConfig == 15: 243 | config.agentTypes = ["Strm", "bs","bs","bs"] 244 | elif config.gameConfig == 16: 245 | config.agentTypes = ["bs", "Strm","bs","bs"] 246 | elif config.gameConfig == 17: 247 | config.agentTypes = ["bs", "bs","Strm","bs"] 248 | elif config.gameConfig == 18: 249 | config.agentTypes = ["bs", "bs","bs","Strm"] 250 | elif config.gameConfig == 19: 251 | config.agentTypes = ["rnd", "bs","bs","bs"] 252 | elif config.gameConfig == 20: 253 | config.agentTypes = ["bs", "rnd","bs","bs"] 254 | elif config.gameConfig == 21: 255 | config.agentTypes = ["bs", "bs","rnd","bs"] 
256 | elif config.gameConfig == 22: 257 | config.agentTypes = ["bs", "bs","bs","rnd"] 258 | elif config.gameConfig == 23: 259 | config.agentTypes = ["Strm", "Strm","Strm","Strm"] 260 | elif config.gameConfig == 24: 261 | config.agentTypes = ["rnd", "rnd","rnd","rnd"] 262 | elif config.gameConfig == 25: 263 | config.agentTypes = ["bs", "bs","bs","bs"] 264 | elif config.gameConfig == 26: 265 | config.agentTypes = ["bs", "Strm","Strm","Strm"] 266 | elif config.gameConfig == 27: 267 | config.agentTypes = ["Strm", "bs","Strm","Strm"] 268 | elif config.gameConfig == 28: 269 | config.agentTypes = ["Strm", "Strm","bs","Strm"] 270 | elif config.gameConfig == 29: 271 | config.agentTypes = ["Strm", "Strm","Strm","bs"] 272 | elif config.gameConfig == 30: 273 | config.agentTypes = ["bs", "rnd","rnd","rnd"] 274 | elif config.gameConfig == 31: 275 | config.agentTypes = ["rnd", "bs","rnd","rnd"] 276 | elif config.gameConfig == 32: 277 | config.agentTypes = ["rnd", "rnd","bs","rnd"] 278 | elif config.gameConfig == 33: 279 | config.agentTypes = ["rnd", "rnd","rnd","bs"] 280 | else: 281 | config.agentTypes = ["bs", "bs","bs","bs"] 282 | 283 | def fillnodes(config): 284 | if config.NoHiLayer == 2: 285 | config.nodes = [config.stateDim * config.multPerdInpt, config.node1,config.node2,config.actionListLen] 286 | elif config.NoHiLayer == 3: 287 | config.nodes = [config.stateDim * config.multPerdInpt, config.node1,config.node2,config.node3,config.actionListLen] 288 | 289 | 290 | def setSavedDimentionPerBrain(config): 291 | if config.ifUsePreviousModel and not config.iftl: 292 | if config.demandDistribution == 0 and config.demandUp == 9 and config.demandLow == 0 and config.actionUp == 8: 293 | if config.gameConfig == 3: 294 | config.multPerdInpt = 5 295 | config.NoHiLayer = 3 296 | config.node1=180 297 | config.node2=130 298 | config.node3=61 299 | elif config.gameConfig == 4: 300 | config.multPerdInpt = 10 301 | config.NoHiLayer = 3 302 | config.node1=180 303 | config.node2=130 304 | config.node3=61 305 | elif config.gameConfig == 5: 306 | config.multPerdInpt = 5 307 | config.NoHiLayer = 3 308 | config.node1=180 309 | config.node2=130 310 | config.node3=61 311 | elif config.gameConfig == 6: 312 | config.multPerdInpt = 5 313 | config.NoHiLayer = 3 314 | config.node1=180 315 | config.node2=130 316 | config.node3=61 317 | elif config.gameConfig == 7: 318 | config.multPerdInpt = 10 319 | config.NoHiLayer = 3 320 | config.node1=180 321 | config.node2=130 322 | config.node3=61 323 | elif config.gameConfig == 8: 324 | config.multPerdInpt = 10 325 | config.NoHiLayer = 3 326 | config.node1=180 327 | config.node2=130 328 | config.node3=61 329 | elif config.gameConfig == 9: 330 | config.multPerdInpt = 10 331 | config.NoHiLayer = 3 332 | config.node1=180 333 | config.node2=130 334 | config.node3=61 335 | elif config.gameConfig == 10: 336 | config.multPerdInpt = 10 337 | config.NoHiLayer = 3 338 | config.node1=180 339 | config.node2=130 340 | config.node3=61 341 | elif config.gameConfig == 11: 342 | config.multPerdInpt = 5 343 | config.NoHiLayer = 3 344 | config.node1=180 345 | config.node2=130 346 | config.node3=61 347 | elif config.gameConfig == 12: 348 | config.multPerdInpt = 5 349 | config.NoHiLayer = 3 350 | config.node1=180 351 | config.node2=130 352 | config.node3=61 353 | elif config.gameConfig == 13: 354 | config.multPerdInpt = 10 355 | config.NoHiLayer = 3 356 | config.node1=180 357 | config.node2=130 358 | config.node3=61 359 | elif config.gameConfig == 14: 360 | config.multPerdInpt = 5 361 | config.NoHiLayer = 3 
362 | config.node1=180 363 | config.node2=130 364 | config.node3=61 365 | 366 | elif config.demandDistribution == 1 and config.demandMu == 10 and config.demandSigma == 2 and config.actionUp == 5: 367 | if config.gameConfig == 3: 368 | config.multPerdInpt = 5 369 | config.NoHiLayer = 3 370 | config.node1=180 371 | config.node2=130 372 | config.node3=61 373 | elif config.gameConfig == 4: 374 | config.multPerdInpt = 5 375 | config.NoHiLayer = 3 376 | config.node1=180 377 | config.node2=130 378 | config.node3=61 379 | elif config.gameConfig == 5: 380 | config.multPerdInpt = 5 381 | config.NoHiLayer = 3 382 | config.node1=180 383 | config.node2=130 384 | config.node3=61 385 | elif config.gameConfig == 6: 386 | config.multPerdInpt = 10 387 | config.NoHiLayer = 3 388 | config.node1=180 389 | config.node2=130 390 | config.node3=61 391 | elif config.gameConfig == 7: 392 | config.multPerdInpt = 10 393 | config.NoHiLayer = 3 394 | config.node1=180 395 | config.node2=130 396 | config.node3=61 397 | elif config.gameConfig == 8: 398 | config.multPerdInpt = 10 399 | config.NoHiLayer = 3 400 | config.node1=180 401 | config.node2=130 402 | config.node3=61 403 | elif config.gameConfig == 9: 404 | config.multPerdInpt = 10 405 | config.NoHiLayer = 3 406 | config.node1=180 407 | config.node2=130 408 | config.node3=61 409 | elif config.gameConfig == 10: 410 | config.multPerdInpt = 10 411 | config.NoHiLayer = 3 412 | config.node1=180 413 | config.node2=130 414 | config.node3=61 415 | elif config.gameConfig == 11: 416 | config.multPerdInpt = 10 417 | config.NoHiLayer = 3 418 | config.node1=180 419 | config.node2=130 420 | config.node3=61 421 | elif config.gameConfig == 12: 422 | config.multPerdInpt = 10 423 | config.NoHiLayer = 3 424 | config.node1=180 425 | config.node2=130 426 | config.node3=61 427 | elif config.gameConfig == 13: 428 | config.multPerdInpt = 5 429 | config.NoHiLayer = 3 430 | config.node1=180 431 | config.node2=130 432 | config.node3=61 433 | elif config.gameConfig == 14: 434 | config.multPerdInpt = 10 435 | config.NoHiLayer = 3 436 | config.node1=180 437 | config.node2=130 438 | config.node3=61 439 | 440 | elif config.demandDistribution == 2 and config.demandUp == 9 and config.demandLow == 0 and config.actionUp == 8: 441 | if config.gameConfig == 3: 442 | config.multPerdInpt = 10 443 | config.NoHiLayer = 3 444 | config.node1=180 445 | config.node2=130 446 | config.node3=61 447 | elif config.gameConfig == 4: 448 | config.multPerdInpt = 10 449 | config.NoHiLayer = 3 450 | config.node1=180 451 | config.node2=130 452 | config.node3=61 453 | elif config.gameConfig == 5: 454 | config.multPerdInpt = 10 455 | config.NoHiLayer = 3 456 | config.node1=180 457 | config.node2=130 458 | config.node3=61 459 | elif config.gameConfig == 6: 460 | config.multPerdInpt = 5 461 | config.NoHiLayer = 3 462 | config.node1=180 463 | config.node2=130 464 | config.node3=61 465 | elif config.gameConfig == 7: 466 | config.multPerdInpt = 5 467 | config.NoHiLayer = 3 468 | config.node1=180 469 | config.node2=130 470 | config.node3=61 471 | elif config.gameConfig == 8: 472 | config.multPerdInpt = 10 473 | config.NoHiLayer = 3 474 | config.node1=180 475 | config.node2=130 476 | config.node3=61 477 | elif config.gameConfig == 9: 478 | config.multPerdInpt = 5 479 | config.NoHiLayer = 3 480 | config.node1=180 481 | config.node2=130 482 | config.node3=61 483 | elif config.gameConfig == 10: 484 | config.multPerdInpt = 10 485 | config.NoHiLayer = 3 486 | config.node1=180 487 | config.node2=130 488 | config.node3=61 489 | elif 
config.gameConfig == 11: 490 | config.multPerdInpt = 10 491 | config.NoHiLayer = 3 492 | config.node1=180 493 | config.node2=130 494 | config.node3=61 495 | elif config.gameConfig == 12: 496 | config.multPerdInpt = 5 497 | config.NoHiLayer = 3 498 | config.node1=180 499 | config.node2=130 500 | config.node3=61 501 | elif config.gameConfig == 13: 502 | config.multPerdInpt = 5 503 | config.NoHiLayer = 3 504 | config.node1=180 505 | config.node2=130 506 | config.node3=61 507 | elif config.gameConfig == 14: 508 | config.multPerdInpt = 10 509 | config.NoHiLayer = 3 510 | config.node1=180 511 | config.node2=130 512 | config.node3=61 513 | 514 | elif config.demandDistribution != 3 and config.demandDistribution != 4: 515 | if config.gameConfig == 7: 516 | config.dnnUpCnt = 10000 517 | config.multPerdInpt = 5 518 | config.NoHiLayer = 2 519 | config.lr0 = 0.001 520 | elif config.gameConfig == 8: 521 | config.dnnUpCnt = 5000 522 | config.multPerdInpt = 5 523 | config.NoHiLayer = 2 # this should be 3 524 | config.lr0 = 0.00025 525 | elif config.gameConfig == 9: 526 | config.dnnUpCnt = 5000 527 | config.multPerdInpt = 3 528 | config.NoHiLayer = 2 529 | config.lr0 = 0.001 530 | elif config.gameConfig == 10: 531 | config.dnnUpCnt = 5000 532 | config.multPerdInpt = 3 # it should be 5 533 | config.NoHiLayer = 2 534 | config.lr0 = 0.001 535 | 536 | def set_optimal(config): 537 | if config.demandDistribution == 0: 538 | if config.cp1==2 and config.ch1==2 and config.ch2==2 and config.ch3==2 and config.ch4==2 : 539 | config.f1 = 8. 540 | config.f2 = 8. 541 | config.f3 = 0. 542 | config.f4 = 0. 543 | 544 | def get_config(): 545 | config, unparsed = parser.parse_known_args() 546 | config = update_config(config) 547 | 548 | return config, unparsed 549 | 550 | def fill_leadtime_initial_values(config): 551 | config.leadRecItemLow = [config.leadRecItem1, config.leadRecItem2, config.leadRecItem3, config.leadRecItem4] 552 | config.leadRecItemUp = [config.leadRecItem1, config.leadRecItem2, config.leadRecItem3, config.leadRecItem4] 553 | config.leadRecOrderLow = [config.leadRecOrder1, config.leadRecOrder2, config.leadRecOrder3, config.leadRecOrder4] 554 | config.leadRecOrderUp = [config.leadRecOrder1, config.leadRecOrder2, config.leadRecOrder3, config.leadRecOrder4] 555 | config.ILInit = [config.ILInit1, config.ILInit2, config.ILInit3, config.ILInit4] 556 | config.AOInit = [config.AOInit1, config.AOInit2, config.AOInit3, config.AOInit4] 557 | config.ASInit = [config.ASInit1, config.ASInit2, config.ASInit3, config.ASInit4] 558 | 559 | def get_auxuliary_leadtime_initial_values(config): 560 | config.leadRecOrderUp_aux = [config.leadRecOrder1, config.leadRecOrder2, config.leadRecOrder3, config.leadRecOrder4] 561 | config.leadRecItemUp_aux = [config.leadRecItem1, config.leadRecItem2, config.leadRecItem3, config.leadRecItem4] 562 | 563 | def fix_lead_time_manufacturer(config): 564 | if config.leadRecOrder4 > 0: 565 | config.leadRecItem4 += config.leadRecOrder4 566 | config.leadRecOrder4 = 0 567 | 568 | def set_sterman_parameters(config): 569 | config.alpha_b =[config.alpha_b1,config.alpha_b2,config.alpha_b3,config.alpha_b4] 570 | config.betta_b =[config.betta_b1,config.betta_b2,config.betta_b3,config.betta_b4] 571 | 572 | 573 | def update_config(config): 574 | config.actionList = buildActionList(config) # The list of the available actions 575 | config.actionListLen = len(config.actionList) # the length of the action list 576 | 577 | # set_optimal(config) 578 | config.f = [config.f1, config.f2, config.f3, config.f4] # [6.4, 
2.88, 2.08, 0.8] 579 | 580 | config.actionListLen=len(config.actionList) 581 | if config.demandDistribution == 0: 582 | config.actionListOpt=list(range(0,int(max(config.actionUp*30+1, 3*sum(config.f))),1)) 583 | else: 584 | config.actionListOpt=list(range(0,int(max(config.actionUp*30+1, 7*sum(config.f))),1)) 585 | config.actionListLenOpt=len(config.actionListOpt) 586 | config.agentTypes=['dnn','dnn','dnn','dnn'] 587 | config.saveFigInt = [config.saveFigIntLow, config.saveFigIntUp] 588 | 589 | if config.gameConfig == 0: 590 | config.NoAgent=min(config.NoAgent,len(config.agentTypes)) 591 | config.agentTypes=[config.agent_type1,config.agent_type2,config.agent_type3,config.agent_type4] 592 | else: 593 | config.NoAgent=4 594 | setAgentType(config) # set the agent brain types according to ifFourDNNtrain, ... 595 | 596 | config.c_h =[config.ch1, config.ch2, config.ch3, config.ch4] 597 | config.c_p =[config.cp1, config.cp2, config.cp3, config.cp4] 598 | 599 | config.stateDim= getStateDim(config) # Number of elements in the state description - Depends on ifUseASAO 600 | np.random.seed(seed = config.seed) 601 | setSavedDimentionPerBrain(config) # set the parameters of pre_trained model. 602 | fillnodes(config) # create the structure of network nodes 603 | get_auxuliary_leadtime_initial_values(config) 604 | fix_lead_time_manufacturer(config) 605 | fill_leadtime_initial_values(config) 606 | set_sterman_parameters(config) 607 | 608 | return config 609 | 610 | -------------------------------------------------------------------------------- /data.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OptMLGroup/DeepBeerInventory-RL/ca2bb90a5ee3a45fa89cfadf56354369a62bf5a8/data.zip -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from clBeergame import * 3 | from utilities import * 4 | import numpy as np 5 | #from clGeneralParameters import generalParameters 6 | import random 7 | from config import get_config, update_config 8 | import tensorflow as tf 9 | 10 | config = None 11 | 12 | #def main(config, beerGame): 13 | def main(config): 14 | random.seed(10) 15 | 16 | # prepare loggers and directories 17 | prepare_dirs_and_logger(config) 18 | config = update_config(config) 19 | # save the current configuration of the problem in a json file 20 | save_config(config) 21 | 22 | # get the address of data 23 | if config.observation_data: 24 | adsr = 'data/demandTr-obs-' 25 | elif config.demandDistribution == 3: 26 | if config.scaled: 27 | adsr = 'data/basket_data/scaled' 28 | else: 29 | adsr = 'data/basket_data' 30 | elif config.demandDistribution == 4: 31 | if config.scaled: 32 | adsr = 'data/forecast_data/scaled' 33 | else: 34 | adsr = 'data/forecast_data' 35 | else: 36 | adsr = 'data/demandTr' 37 | 38 | 39 | # load demands 40 | # demandTr = np.load('demandTr'+str(config.demandDistribution)+'-'+str(config.demandUp)+'.npy') 41 | if config.demandDistribution == 0: 42 | direc = os.path.realpath(adsr+str(config.demandDistribution)+'-'+str(config.demandUp)+'-'+str(config.maxEpisodesTrain)+'.npy') 43 | if not os.path.exists(direc): 44 | direc = os.path.realpath(adsr+str(config.demandDistribution)+'-'+str(config.demandUp)+'.npy') 45 | elif config.demandDistribution == 1: 46 | direc = 
os.path.realpath(adsr+str(config.demandDistribution)+'-'+str(int(config.demandMu))+'-'+str(int(config.demandSigma))+'.npy') 47 | elif config.demandDistribution == 2: 48 | direc = os.path.realpath(adsr+str(config.demandDistribution)+'.npy') 49 | elif config.demandDistribution == 3: 50 | direc = os.path.realpath(adsr+'/demandTr-'+str(config.data_id)+'.npy') 51 | elif config.demandDistribution == 4: 52 | direc = os.path.realpath(adsr+'/demandTr-'+str(config.data_id)+'.npy') 53 | demandTr = np.load(direc) 54 | print("loaded training set=", direc) 55 | if config.demandDistribution == 0: 56 | direc = os.path.realpath('data/demandTs'+str(config.demandDistribution)+'-'+str(config.demandUp)+'-'+str(config.maxEpisodesTrain)+'.npy') 57 | if not os.path.exists(direc): 58 | direc = os.path.realpath('data/demandTs'+str(config.demandDistribution)+'-'+str(config.demandUp)+'.npy') 59 | elif config.demandDistribution == 1: 60 | direc = os.path.realpath('data/demandTs'+str(config.demandDistribution)+'-'+str(int(config.demandMu))+'-'+str(int(config.demandSigma))+'.npy') 61 | elif config.demandDistribution == 2: 62 | direc = os.path.realpath('data/demandTs'+str(config.demandDistribution)+'.npy') 63 | elif config.demandDistribution == 3: 64 | direc = os.path.realpath(adsr+'/demandTs-'+str(config.data_id)+'.npy') 65 | direcVl = os.path.realpath(adsr+'/demandVl-'+str(config.data_id)+'.npy') 66 | demandVl = np.load(direcVl) 67 | elif config.demandDistribution == 4: 68 | direc = os.path.realpath(adsr+'/demandTs-'+str(config.data_id)+'.npy') 69 | direcVl = os.path.realpath(adsr+'/demandVl-'+str(config.data_id)+'.npy') 70 | demandVl = np.load(direcVl) 71 | demandTs = np.load(direc) 72 | print("loaded test set=", direc) 73 | 74 | 75 | # initilize an instance of Beergame 76 | beerGame = clBeerGame(config) 77 | 78 | # get the length of the demand. 79 | demand_len = np.shape(demandTr)[0] 80 | # Do Initial tests 81 | beerGame.doTestMid(demandTs[0:config.testRepeatMid]) 82 | 83 | # train the specified number of games 84 | for i in range(0, config.maxEpisodesTrain): 85 | beerGame.playGame(demandTr[i%demand_len],"train") 86 | # get the test results 87 | if (np.mod(beerGame.curGame,config.testInterval) == 0) and (beerGame.curGame>500): 88 | beerGame.doTestMid(demandTs[0:config.testRepeatMid]) 89 | 90 | # do the last test on the middle test data set. 
91 | beerGame.doTestMid(demandTs[0:config.testRepeatMid]) 92 | if config.demandDistribution == 3: 93 | beerGame.doTestMid(demandVl[0:config.testRepeatMid]) 94 | 95 | if __name__ == '__main__': 96 | # load parameters 97 | config, unparsed = get_config() 98 | 99 | # run main 100 | main(config) 101 | -------------------------------------------------------------------------------- /plotting.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import matplotlib 4 | matplotlib.use('Agg') 5 | import matplotlib.pyplot as plt 6 | from pylab import * 7 | 8 | # plotting 9 | def plotting(plt, data, colori, pltLabel): 10 | plt.hold(True) 11 | 12 | for i in range (np.shape(data)[0]): 13 | plt.subplot(4,5,5*i+1) 14 | plt.plot(np.transpose(data[i])[0,:], np.transpose(data[i])[1,:],colori,label=pltLabel) 15 | plt.xlabel('Time') 16 | plt.ylabel('IL') 17 | plt.grid(True) 18 | 19 | 20 | plt.subplot(4,5,5*i+2) 21 | plt.plot(np.transpose(data[i])[0,:], np.transpose(data[i])[2,:],colori, label=pltLabel) 22 | plt.xlabel('Time') 23 | plt.ylabel('OO') 24 | plt.grid(True) 25 | 26 | plt.subplot(4,5,5*i+3) 27 | plt.plot(np.transpose(data[i])[0,:], np.transpose(data[i])[3,:],colori, label=pltLabel) 28 | plt.xlabel('Time') 29 | plt.ylabel('a') 30 | plt.grid(True) 31 | 32 | plt.subplot(4,5,5*i+4) 33 | plt.plot(np.transpose(data[i])[0,:], np.transpose(data[i])[5,:],colori,label=pltLabel) 34 | plt.xlabel('Time') 35 | plt.ylabel('OUTL') 36 | plt.grid(True) 37 | 38 | plt.subplot(4,5,5*i+5) 39 | plt.plot(np.transpose(data[i])[0,:], -1*np.transpose(data[i])[4,:],colori,label=pltLabel) 40 | plt.xlabel('Time') 41 | plt.ylabel('r') 42 | plt.grid(True) 43 | 44 | return plt 45 | 46 | def savePlot(players, curGame, Rsltdnn, RsltFrmu, RsltOptm, config, m): 47 | node1 = config.node1 48 | node2 = config.node2 49 | node3 = config.node3 50 | #add title to plot 51 | if config.if_titled_figure: 52 | if config.NoHiLayer==2: 53 | plt.suptitle("Game No="+str(curGame)+";" + str(config.agentTypes.count("srdqn"))+ " SRDQN Agents; SRDQN nodes="+str(node1)+ 54 | "-"+str(node2)+ "; sum SRDQN=" + str(round(sum(Rsltdnn),2)) + "; sum Strm=" 55 | + str(round(sum(RsltFrmu),2)) +"; sum BS=" + str(round(sum(RsltOptm),2))+ "\n"+ 56 | "Ag SRDQN="+str([round(Rsltdnn[i],2) for i in range(config.NoAgent)])+ 57 | "; Ag Strm="+str([round(RsltFrmu[i],2) for i in range(config.NoAgent)])+ 58 | "; Ag BS="+str([round(RsltOptm[i],2) for i in range(config.NoAgent)]), fontsize=12) 59 | elif config.NoHiLayer==3: 60 | plt.suptitle("Game No="+str(curGame)+";" + str(config.agentTypes.count("srdqn"))+ " SRDQN Agents; SRDQN nodes="+str(node1)+ 61 | "-"+str(node2)+"-"+str(node3)+ "; sum SRDQN=" + str(round(sum(Rsltdnn),2)) + 62 | "; sum Strm=" + str(round(sum(RsltFrmu),2)) +"; sum BS=" + str(round(sum(RsltOptm),2))+"\n"+ 63 | "Ag SRDQN="+str([round(Rsltdnn[i],2) for i in range(config.NoAgent)])+ 64 | "; Ag Strm="+str([round(RsltFrmu[i],2) for i in range(config.NoAgent)])+ 65 | "; Ag BS="+str([round(RsltOptm[i],2) for i in range(config.NoAgent)]), fontsize=12) 66 | 67 | 68 | #insert legend to the figure 69 | legend = plt.legend(bbox_to_anchor=(-1.4, -.165, 1., -.102), shadow=True, ncol=4) 70 | 71 | # configures spaces between subplots 72 | plt.subplots_adjust(left=None, bottom=None, right=None, top=None,wspace=.5, hspace=.5) 73 | # save the figure 74 | plt.savefig(os.path.join(config.model_dir,'saved_figures/') + str(curGame)+ '-' + str(m)+'.pdf', format='pdf') 75 | print("figure"+str(curGame)+".pdf saved 
in folder \"saved_figures\"") 76 | plt.close(curGame) 77 | 78 | 79 | def plotBaseStock(data, colori, pltLabel, curGame, config, m): 80 | plt.figure(104, figsize=(12, 8), dpi=80, facecolor='w', edgecolor='k') 81 | plt.plot(range(len(data)), data, colori, label=pltLabel) 82 | plt.xlabel('Time') 83 | plt.ylabel('Order-up-to level') 84 | plt.grid(True) 85 | plt.savefig(os.path.join(config.model_dir,'saved_figures/') + "dnnBaseStock" + str(curGame)+ '-' + str(m)+'.pdf', format='pdf') 86 | print("base stock figure"+str(curGame)+ '-' + str(m)+".pdf saved in folder \"saved_figures\"") 87 | plt.close(104) 88 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow==1.15 2 | matplotlib 3 | numpyencoder 4 | -------------------------------------------------------------------------------- /utilities.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import json 4 | import random 5 | import matplotlib 6 | if not True: 7 | matplotlib.use('Agg') 8 | import matplotlib.pyplot as plt 9 | import os 10 | import logging 11 | from datetime import datetime 12 | import tensorflow as tf 13 | import tensorflow.contrib.slim as slim 14 | from numpyencoder import NumpyEncoder 15 | 16 | import sys 17 | 18 | 19 | class Logger(object): 20 | # writes the outputs to a file 21 | def __init__(self,config): 22 | self.terminal = sys.stdout 23 | self.log = open(os.path.join(config.model_dir,"logfile.log"), "a",0) 24 | 25 | def write(self, message): 26 | self.terminal.write(message) 27 | self.log.write(message) 28 | 29 | def flush(self): 30 | #this flush method is needed for python 3 compatibility. 31 | #this handles the flush command by doing nothing. 32 | #you might want to specify some extra behavior here. 
33 | pass 34 | 35 | 36 | def prepare_dirs_and_logger(config): 37 | # set format of the logger 38 | formatter = logging.Formatter( 39 | "%(asctime)s:%(levelname)s::%(message)s") 40 | logger = logging.getLogger('tensorflow') 41 | 42 | for hdlr in logger.handlers: 43 | logger.removeHandler(hdlr) 44 | 45 | handler = logging.StreamHandler() 46 | handler.setFormatter(formatter) 47 | 48 | logger.addHandler(handler) 49 | logger.setLevel(tf.logging.INFO) 50 | 51 | # create load paths, if they don't exist 52 | if config.load_path: 53 | if config.load_path.startswith(config.task): 54 | config.model_name = config.load_path 55 | else: 56 | config.model_name = "{}_{}".format(config.task, config.load_path) 57 | else: 58 | if config.iftl: 59 | tl = 1 60 | else: 61 | tl = 0 62 | config.model_name = "{}_{}_{}_{}_{}_{}_{}_{}_{}_{}_{}_{}_{}_{}_{}_{}_{}_{}_{}_{}_{}_{}_{}_{}_{}".format(config.task, get_time(), 63 | config.gameConfig, config.tlBaseBrain, config.NoHiLayer, 64 | config.demandUp, config.cp1, config.cp2, config.cp3, config.cp4, config.ch1, config.ch2, config.ch3, config.ch4, 65 | config.distCoeff, config.NoAgent, config.maxEpisodesTrain, config.lr0, config.multPerdInpt, config.dnnUpCnt, tl, 66 | config.actionUp, config.demandDistribution, config.action_step, config.data_id) 67 | 68 | config.model_dir = os.path.join(config.log_dir, config.model_name) 69 | 70 | for path in [config.pre_model_dir, config.log_dir, config.model_dir, 71 | os.path.join(config.model_dir,'saved_figures'), 72 | os.path.join(config.model_dir,'model1'), 73 | os.path.join(config.model_dir,'model2'), 74 | os.path.join(config.model_dir,'model3'), 75 | os.path.join(config.model_dir,'model4'), 76 | os.path.join(config.pre_model_dir,'brain3'), 77 | os.path.join(config.pre_model_dir,'brain4'), 78 | os.path.join(config.pre_model_dir,'brain5'), 79 | os.path.join(config.pre_model_dir,'brain6'), 80 | os.path.join(config.pre_model_dir,'brain7'), 81 | os.path.join(config.pre_model_dir,'brain8'), 82 | os.path.join(config.pre_model_dir,'brain9'), 83 | os.path.join(config.pre_model_dir,'brain10'), 84 | os.path.join(config.log_dir,'reports')]: 85 | 86 | if not os.path.exists(path): 87 | os.makedirs(path) 88 | 89 | 90 | def get_time(): 91 | return datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 92 | 93 | 94 | def save_config(config): 95 | param_path = os.path.join(config.model_dir, "params.json") 96 | 97 | tf.logging.info("MODEL dir: %s" % config.model_dir) 98 | tf.logging.info("PARAM path: %s" % param_path) 99 | save_json(config.__dict__,param_path) 100 | 101 | 102 | # input-output functions 103 | def save(obj,name): 104 | pickle.dump(obj, open(name, "wb")) 105 | 106 | def load(name): 107 | return pickle.load(open(name, "rb")) 108 | 109 | def save_json(obj,name): 110 | with open(name, 'w') as outfile: 111 | json.dump(obj, outfile, indent=4, sort_keys=True, cls=NumpyEncoder) 112 | --------------------------------------------------------------------------------
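
Note on the ordering rules: the Strm and bs branches of clBeerGame.getAction in clBeergame.py encode two hand-written policies inline. The sketch below restates them as standalone functions purely for illustration; it is a minimal, hypothetical rewrite (the names sterman_order, base_stock_order, and snap_to_action_list do not exist in the repository), assuming scalar inputs that correspond to the agent attributes AO[t], IL, OO, a_b, b_b, alpha_b, and betta_b used in the real code.

import numpy as np

def sterman_order(AO_t, IL, OO, a_b, b_b, alpha_b, betta_b):
    # Sterman (Strm) heuristic: reorder the incoming order plus corrections for the
    # inventory gap (IL - a_b) and the supply-line gap (OO - b_b), floored at zero.
    return max(0, round(AO_t + alpha_b * (IL - a_b) + betta_b * (OO - b_b)))

def base_stock_order(base_stock, IL, OO, AO_t):
    # Base-stock (bs) policy: order up to the base-stock level, given the current
    # inventory position IL + OO - AO_t, floored at zero.
    return max(0, base_stock - (IL + OO - AO_t))

def snap_to_action_list(order_quantity, action_list_opt):
    # getAction then picks the closest admissible quantity from actionListOpt
    # and stores the choice as a one-hot action vector.
    action = np.zeros(len(action_list_opt))
    action[np.argmin(np.abs(np.array(action_list_opt) - order_quantity))] = 1
    return action

For example, with the default Sterman parameters alpha_b = -0.5 and betta_b = -0.2 from config.py, an agent with IL = 2, OO = 6, AO_t = 4, and illustrative anchors a_b = b_b = 8 would order max(0, round(4 - 0.5*(2 - 8) - 0.2*(6 - 8))) = 7 before snapping to the action list.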