├── .gitignore ├── BGAgent.py ├── LICENSE ├── README.md ├── SRDQN.py ├── clBeergame.py ├── config.py ├── data.zip ├── main.py ├── plotting.py ├── requirements.txt └── utilities.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /BGAgent.py: -------------------------------------------------------------------------------- 1 | from SRDQN import DQN 2 | import numpy as np 3 | 4 | # Here we want to define the agent class for the BeerGame 5 | class Agent(object): 6 | 7 | # initializes the agents with initial values for IL, OO and saves self.agentNum for recognizing the agents. 8 | def __init__(self, agentNum, IL, AO, AS, c_h, c_p, eta, compuType, config): 9 | self.agentNum = agentNum 10 | self.IL = IL # Inventory level of each agent - changes during the game 11 | self.OO = 0 # Open order of each agent - changes during the game 12 | self.ASInitial = AS # the initial arriving shipment. 
13 | self.ILInitial = IL # IL at which we start each game with this number 14 | self.AOInitial = AO # OO at which we start each game with this number 15 | self.config = config # an instance of config is stored inside the class 16 | self.curState = [] # this function gets the current state of the game 17 | self.nextState = [] 18 | self.curReward = 0 # the reward observed at the current step 19 | self.cumReward = 0 # cumulative reward; reset at the begining of each episode 20 | self.totRew = 0 # it is reward of all players obtained for the current player. 21 | self.c_h=c_h # holding cost 22 | self.c_p = c_p # backorder cost 23 | self.eta = eta # the total cost regulazer 24 | self.AS = np.zeros((1,1)) # arriced shipment 25 | self.AO = np.zeros((1,1)) # arrived order 26 | self.action=0 # the action at time t 27 | self.compTypeTrain = compuType # rnd -> random / srdqn-> srdqn / Strm-> formula-Rong2008 / bs -> optimal policy if exists 28 | self.compTypeTest = compuType # rnd -> random / srdqn-> srdqn / Strm-> formula-Rong2008 / bs -> optimal policy if exists 29 | self.alpha_b = self.config.alpha_b[self.agentNum] # parameters for the formula 30 | self.betta_b = self.config.betta_b[self.agentNum] # parameters for the formula 31 | if self.config.demandDistribution == 0: 32 | self.a_b = np.mean((self.config.demandUp , self.config.demandLow)) # parameters for the formula 33 | self.b_b = np.mean((self.config.demandUp , self.config.demandLow))*(np.mean((self.config.leadRecItemLow[self.agentNum] , 34 | self.config.leadRecItemUp[self.agentNum])) + np.mean((self.config.leadRecOrderLow[self.agentNum] , self.config.leadRecOrderUp[self.agentNum]))) # parameters for the formula 35 | elif self.config.demandDistribution == 1 or self.config.demandDistribution == 3 or self.config.demandDistribution == 4: 36 | self.a_b = self.config.demandMu # parameters for the formula 37 | self.b_b = self.config.demandMu*(np.mean((self.config.leadRecItemLow[self.agentNum] , 38 | self.config.leadRecItemUp[self.agentNum])) + np.mean((self.config.leadRecOrderLow[self.agentNum] , self.config.leadRecOrderUp[self.agentNum]))) # parameters for the formula 39 | elif self.config.demandDistribution == 2: 40 | self.a_b = 8 # parameters for the formula 41 | self.b_b = (3/4.)*8*(np.mean((self.config.leadRecItemLow[self.agentNum] , 42 | self.config.leadRecItemUp[self.agentNum])) + np.mean((self.config.leadRecOrderLow[self.agentNum] , self.config.leadRecOrderUp[self.agentNum]))) # parameters for the formula 43 | elif self.config.demandDistribution == 3: 44 | self.a_b = 10 # parameters for the formula 45 | self.b_b = 7*(np.mean((self.config.leadRecItemLow[self.agentNum] , 46 | self.config.leadRecItemUp[self.agentNum])) + np.mean((self.config.leadRecOrderLow[self.agentNum] , self.config.leadRecOrderUp[self.agentNum]))) # parameters for the formula 47 | 48 | self.hist = [] # this is used for plotting - keeps the history for only one game 49 | self.hist2 = [] # this is used for animation usage 50 | self.srdqnBaseStock = [] # this holds the base stock levels that srdqn has came up with. 
added on Nov 8, 2017 51 | self.T = 0 52 | self.bsBaseStock = 0 53 | self.init_bsBaseStock = 0 54 | self.nextObservation = [] 55 | if self.compTypeTrain == 'srdqn': 56 | self.brain = DQN(self.agentNum,config) 57 | self.brain.setInitState(self.curState) # sets the initial input of the network 58 | 59 | # reset player information 60 | def resetPlayer(self, T): 61 | self.IL = self.ILInitial 62 | self.OO = 0 63 | self.AS = np.squeeze(np.zeros((1,T + max(self.config.leadRecItemUp) + max(self.config.leadRecOrderUp) + 10 ))) # arriced shipment 64 | self.AO = np.squeeze(np.zeros((1,T + max(self.config.leadRecItemUp) + max(self.config.leadRecOrderUp) + 10 ))) # arrived order 65 | if self.agentNum != 0: 66 | for i in range(self.config.leadRecOrderUp_aux[self.agentNum - 1]): 67 | self.AO[i] = self.AOInitial[self.agentNum - 1] 68 | for i in range(self.config.leadRecItemUp[self.agentNum]): 69 | self.AS[i] = self.ASInitial 70 | self.curReward = 0 # the reward observed at the current step 71 | self.cumReward = 0 # cumulative reward; reset at the begining of each episode 72 | self.action= [] 73 | self.hist = [] 74 | self.hist2 = [] 75 | self.srdqnBaseStock = [] # this holds the base stock levels that srdqn has came up with. added on Nov 8, 2017 76 | self.T = T 77 | self.curObservation = self.getCurState(1) # this function gets the current state of the game 78 | self.nextObservation = [] 79 | if self.compTypeTrain == 'srdqn': 80 | self.brain.setInitState(self.curObservation) # sets the initial input of the network 81 | 82 | 83 | # updates the IL and OO at time t, after recieving "rec" number of items 84 | def recieveItems(self, time): 85 | self.IL = self.IL + self.AS[time] # inverntory level update 86 | self.OO = self.OO - self.AS[time] # invertory in transient update 87 | 88 | 89 | # find action Value associated with the action list 90 | def actionValue(self,curTime,playType): 91 | if playType == "test": 92 | if self.config.fixedAction: 93 | a = self.config.actionList[np.argmax(self.action)] 94 | else: 95 | # "d + x" rule 96 | if self.compTypeTest == 'srdqn': 97 | a = max(0, self.config.actionList[np.argmax(self.action)]*self.config.action_step + self.AO[curTime]) 98 | elif self.compTypeTest == 'rnd': 99 | a = max(0, self.config.actionList[np.argmax(self.action)] + self.AO[curTime]) 100 | else: 101 | a = max(0, self.config.actionListOpt[np.argmax(self.action)]) 102 | 103 | elif playType == "train": 104 | if self.config.fixedAction: 105 | a = self.config.actionList[np.argmax(self.action)] 106 | else: 107 | if self.compTypeTrain == 'srdqn': 108 | a = max(0, self.config.actionList[np.argmax(self.action)]*self.config.action_step + self.AO[curTime]) 109 | elif self.compTypeTest == 'rnd': 110 | a = max(0, self.config.actionList[np.argmax(self.action)] + self.AO[curTime]) 111 | else: 112 | a = max(0, self.config.actionListOpt[np.argmax(self.action)]) 113 | 114 | return a 115 | 116 | 117 | # getReward returns the reward at the current state 118 | def getReward(self): 119 | # cost (holding + backorder) for one time unit 120 | self.curReward= (self.c_p * max(0,-self.IL) + self.c_h * max(0,self.IL))/200. 
# self.config.Ttest # 121 | self.curReward = -self.curReward; # make reward negative, because it is the cost 122 | 123 | # sum total reward of each agent 124 | self.cumReward = self.config.gamma*self.cumReward + self.curReward 125 | 126 | # This function returns a np.array of the current state of the agent 127 | def getCurState(self,t): 128 | if self.config.ifUseASAO: 129 | if self.config.if_use_AS_t_plus_1: 130 | curState= np.array([-1*(self.IL<0)*self.IL,1*(self.IL>0)*self.IL,self.OO,self.AS[t],self.AO[t]]) 131 | else: 132 | curState= np.array([-1*(self.IL<0)*self.IL,1*(self.IL>0)*self.IL,self.OO,self.AS[t-1],self.AO[t]]) 133 | else: 134 | curState= np.array([-1*(self.IL<0)*self.IL,1*(self.IL>0)*self.IL,self.OO]) 135 | 136 | if self.config.ifUseActionInD: 137 | a = self.config.actionList[np.argmax(self.action)] 138 | curState= np.concatenate((curState, np.array([a]))) 139 | 140 | return curState 141 | 142 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2020, Optimization and Machine Learning Group @ Lehigh 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # A Deep Q-Network for the Beer Game: Deep Reinforcement Learning for Inventory Optimization 2 | 3 | The code of the paper `A Deep Q-Network for the Beer Game: Deep Reinforcement Learning for Inventory Optimization` is presented at this repository. The paper is available online in https://pubsonline.informs.org/doi/abs/10.1287/msom.2020.0939. The code works with `Python2.7` and `Python3.4-Python3.7`. For more information see the list of the requirments (You can install them `pip install -r requirements.txt`). 4 | The `main.py` is the file to call to start the training. 
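A minimal quick start, assuming the default arguments (the basic `U[0,2]` case with an `srdqn` retailer and base-stock co-players, as described below) and that `data.zip` has been extracted first:

    pip install -r requirements.txt
    unzip data.zip
    python main.py
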
`BGAgent.py` implements the beer-game agent, with all the properties and functionality of an agent. `clBeergame.py` instantiates the agents and runs the beer-game simulation; once the number of observations in the replay buffer reaches the minimum requirement, it also calls the train step of the SRDQN algorithm. The DNN approximator and the SRDQN algorithm are implemented in `SRDQN.py`. `config.py` introduces all arguments and their default values, as well as some functions to properly build the simulation scenarios for the different instances of the game. The following sections describe how to run the training and how to set the different arguments. 5 | 6 | ### Play beer-game and compare your result with AI! 7 | You can play the beer-game and compare your result on the same game with the result that our RL algorithm achieves. See https://beergame.opexanalytics.com/ 8 | 9 | 10 | Note that this code does not work with TensorFlow 2+. 11 | ## Some Notations 12 | Each agent can use one of the `srdqn`, `bs`, `Strm`, or `rnd` algorithms to decide on its action (order quantity). So, there are 256 combinations of agent types, of which we consider 23 cases in this study. Each case is selected via `config.gameConfig`, which picks one of the pre-defined combinations of the four agent types. For example, `config.gameConfig=3` sets `config.agentTypes = ["srdqn", "bs","bs","bs"]`, in which the retailer follows the `srdqn` algorithm and the rest of the agents use the base-stock policy to decide on the order quantity. The main `gameConfig` values are as follows: 13 | 14 | Base-stock co-players 15 | 16 | if config.gameConfig == 3: 17 | config.agentTypes = ["srdqn", "bs","bs","bs"] 18 | if config.gameConfig == 4: 19 | config.agentTypes = ["bs", "srdqn","bs","bs"] 20 | if config.gameConfig == 5: 21 | config.agentTypes = ["bs", "bs","srdqn","bs"] 22 | if config.gameConfig == 6: 23 | config.agentTypes = ["bs", "bs","bs","srdqn"] 24 | Sterman co-players 25 | 26 | if config.gameConfig == 7: 27 | config.agentTypes = ["srdqn", "Strm","Strm","Strm"] 28 | if config.gameConfig == 8: 29 | config.agentTypes = ["Strm", "srdqn","Strm","Strm"] 30 | if config.gameConfig == 9: 31 | config.agentTypes = ["Strm", "Strm","srdqn","Strm"] 32 | if config.gameConfig == 10: 33 | config.agentTypes = ["Strm", "Strm","Strm","srdqn"] 34 | Random co-players 35 | 36 | if config.gameConfig == 11: 37 | config.agentTypes = ["srdqn", "rnd","rnd","rnd"] 38 | if config.gameConfig == 12: 39 | config.agentTypes = ["rnd", "srdqn","rnd","rnd"] 40 | if config.gameConfig == 13: 41 | config.agentTypes = ["rnd", "rnd","srdqn","rnd"] 42 | if config.gameConfig == 14: 43 | config.agentTypes = ["rnd", "rnd","rnd","srdqn"] 44 | 45 | The full list of `gameConfig` values is defined in the `setAgentType()` function in `config.py`. 46 | 47 | Since the `d + x` rule is used to train the `SRDQN` model (the agent orders its observed demand `d` plus a, possibly negative, adjustment `x` chosen by the network), lower and upper limits on `x` are set via `config.actionLow` and `config.actionUp`. 48 | 49 | In addition, for each agent one can set the lead time for receiving the order as well as for receiving the shipment via `config.leadRecItem1`, `config.leadRecItem2`, `config.leadRecItem3`, `config.leadRecItem4` and `config.leadRecOrder1`, `config.leadRecOrder2`, `config.leadRecOrder3`, `config.leadRecOrder4` for the four agents.
Similarly, the initial inventory level, initial arriving order, and initial arriving shipment can be set by `config.ILInit1`, `config.ILInit2`, `config.ILInit3`, `config.ILInit4`, `config.AOInit1`, `config.AOInit2`, `config.AOInit3`, `config.AOInit4`, `config.ASInit1`, `config.ASInit2`, `config.ASInit3`, `config.ASInit4`, respectively for the four agents. 50 | 51 | `config.maxEpisodesTrain` determines the number of episodes to train the `srdqn` agent. 52 | 53 | TO run the baseStock policy (`bs`), you need to set the value of the base-stock level for each agent by `config.f1`, `config.f2`, `config.f3`, `config.f4`. We obtained those values by running the Clark-Scarf algorithm for each instance. 54 | 55 | ## unzip the data 56 | `data.zip` includes all the required dataset to train the model on basic case, literature cases, basket dataset, and forecasting dataset. Unzipping this file creates `data` directory, in which there is a python file (`createDemand.py`) as well as the mentioned datasets. `createDemand.py` can be used to create datasets of any size for the literature cases. 57 | 58 | ## Train the basic model 59 | The basic model used the Uniform distribution `U[0,2]` with action space of `{-2, -1, 0, 1, 2}`. All the default values are set to run this experiment for the case that `srdqn` plays the retailer and other agents follow base-stock policy. For any other case the training can be started by setting the corresponding arguments. For example, to train a `srdqn` Warehouse with the initial inventory of 10 units which plays with Sterman co-players, the following line can be used to run the training for 50000 episodes: 60 | 61 | python main.py --gameConfig=8 --maxEpisodesTrain=50000 config.ILInit2=10 --batchSize=128 62 | 63 | ## Train the literature cases 64 | To train each of the literature cases, first you need to set `config.demandDistribution`, `actionUp`, and `actionLow`, as well as the other parameter for the agents as following: 65 | 66 | For U[0,8]: 67 | 68 | python main.py --demandDistribution=0 --demandUp=9 --actionUp=8 --actionLow=-8 --ch1=0.5 --ch2=0.5 --ch3=0.5 --ch4=0.5 --cp1=1.0 --cp2=1.0 --cp3=1.0 --cp4=1.0 --f1=19.0 --f2=20.0 --f3=20.0 --f4=14.0 --leadRecItem1=2 --leadRecItem2=2 --leadRecItem3=2 --leadRecItem4=2 --leadRecOrder1=2 --leadRecOrder2=2 --leadRecOrder3=2 --leadRecOrder4=1 --ILInit1=12 --ILInit2=12 --ILInit3=12 --ILInit4=12 --AOInit1=4 --AOInit2=4 --AOInit3=4 --AOInit4=4 --ASInit1=4 --ASInit2=4 --ASInit3=4 --ASInit4=4 --gameConfig=6 69 | 70 | For N(10,2): 71 | 72 | python main.py --demandDistribution=1 --demandMu=10 --demandSigma=2 --actionUp=5 --actionLow=-5 --ch1=1 --ch2=0.75 --ch3=0.5 --ch4=0.25 --cp1=10.0 --cp2=0 --cp3=0 --cp4=0 --f1=48.0 --f2=43.0 --f3=41.0 --f4=30.0 --leadRecItem1=2 --leadRecItem2=2 --leadRecItem3=2 --leadRecItem4=2 --leadRecOrder1=2 --leadRecOrder2=2 --leadRecOrder3=2 --leadRecOrder4=1 --ILInit1=10 --ILInit2=10 --ILInit3=10 --ILInit4=10 --AOInit1=10 --AOInit2=10 --AOInit3=10 --AOInit4=10 --ASInit1=10 --ASInit2=10 --ASInit3=10 --ASInit4=10 --gameConfig=6 73 | 74 | For C(4,8): 75 | 76 | python main.py --demandDistribution=2 --actionUp=8 --actionLow=-8 --ch1=0.5 --ch2=0.5 --ch3=0.5 --ch4=0.5 --cp1=1.0 --cp2=1.0 --cp3=1.0 --cp4=1.0 --demandUp=9 --f1=32.0 --f2=32.0 --f3=32.0 --f4=24.0 --leadRecItem1=2 --leadRecItem2=2 --leadRecItem3=2 --leadRecItem4=2 --leadRecOrder1=2 --leadRecOrder2=2 --leadRecOrder3=2 --leadRecOrder4=1 --ILInit1=12 --ILInit2=12 --ILInit3=12 --ILInit4=12 --AOInit1=4 --AOInit2=4 --AOInit3=4 --AOInit4=4 --ASInit1=4 
--ASInit2=4 --ASInit3=4 --ASInit4=4 --gameConfig=6 77 | 78 | ## Train the basket dataset 79 | For the basket dataset you need to set `config.demandDistribution=3`, and then `config.data_id` can be either `6, 13`, or `22`. For training with the scaled dataset, which is reported in the paper, `config.scaled=True` is required too. See the following commands for three cases: 80 | 81 | python main.py --demandDistribution=3 --data_id=6 --demandMu=3 --demandSigma=2 --demandUp=3 --actionUp=5 --actionLow=-5 --leadRecItem1=2 --leadRecItem2=2 --leadRecItem3=2 --leadRecItem4=2 --leadRecOrder1=2 --leadRecOrder2=2 --leadRecOrder3=2 --leadRecOrder4=1 --scaled=True --ch1=1.0 --ch2=0.75 --ch3=0.5 --ch4=0.25 --cp1=10.0 --cp2=0.0 --cp3=0.0 --cp4=0.0 --f1=19.0 --f2=12.0 --f3=12.0 --f4=8.0 --ILInit1=3 --ILInit2=3 --ILInit3=3 --ILInit4=3 --AOInit1=3 --AOInit2=3 --AOInit3=3 --AOInit4=3 --ASInit1=3 --ASInit2=3 --ASInit3=3 --ASInit4=3 82 | 83 | python main.py --demandDistribution=3 --data_id=13 --demandMu=3 --demandSigma=2 --demandUp=3 --actionUp=5 --actionLow=-5 --leadRecItem1=2 --leadRecItem2=2 --leadRecItem3=2 --leadRecItem4=2 --leadRecOrder1=2 --leadRecOrder2=2 --leadRecOrder3=2 --leadRecOrder4=1 --scaled=True --ch1=1.0 --ch2=0.75 --ch3=0.5 --ch4=0.25 --cp1=10.0 --cp2=0.0 --cp3=0.0 --cp4=0.0 --f1=19.0 --f2=13.0 --f3=11.0 --f4=8.0 --ILInit1=3 --ILInit2=3 --ILInit3=3 --ILInit4=3 --AOInit1=3 --AOInit2=3 --AOInit3=3 --AOInit4=3 --ASInit1=3 --ASInit2=3 --ASInit3=3 --ASInit4=3 84 | 85 | python main.py --demandDistribution=3 --data_id=22 --demandMu=2 --demandSigma=2 --demandUp=3 --actionUp=5 --actionLow=-5 --leadRecItem1=2 --leadRecItem2=2 --leadRecItem3=2 --leadRecItem4=2 --leadRecOrder1=2 --leadRecOrder2=2 --leadRecOrder3=2 --leadRecOrder4=1 --scaled=True --ch1=1.0 --ch2=0.75 --ch3=0.5 --ch4=0.25 --cp1=10.0 --cp2=0.0 --cp3=0.0 --cp4=0.0 --f1=14.0 --f2=9.0 --f3=9.0 --f4=5.0 --ILInit1=2 --ILInit2=2 --ILInit3=2 --ILInit4=2 --AOInit1=2 --AOInit2=2 --AOInit3=2 --AOInit4=2 --ASInit1=2 --ASInit2=2 --ASInit3=2 --ASInit4=2 86 | 87 | ## Train the forecasting dataset 88 | For the forecasting dataset you need to set `config.demandDistribution=4`, and then `config.data_id` can be either `5, 34`, or `46`. For training with the scaled dataset, which is reported in the paper, `config.scaled=True` is required too. 
See the following commands for three cases: 89 | 90 | python main.py --demandDistribution=4 --data_id=5 --demandMu=4 --demandSigma=2 --demandUp=3 --actionUp=5 --actionLow=-5 --leadRecItem1=2 --leadRecItem2=2 --leadRecItem3=2 --leadRecItem4=2 --leadRecOrder1=2 --leadRecOrder2=2 --leadRecOrder3=2 --leadRecOrder4=1 --scaled=True --ch1=1.0 --ch2=0.75 --ch3=0.5 --ch4=0.25 --cp1=10.0 --cp2=0.0 --cp3=0.0 --cp4=0.0 --f1=21.0 --f2=16.0 --f3=16.0 --f4=11.0 --ILInit1=4 --ILInit2=4 --ILInit3=4 --ILInit4=4 --AOInit1=4 --AOInit2=4 --AOInit3=4 --AOInit4=4 --ASInit1=4 --ASInit2=4 --ASInit3=4 --ASInit4=4 91 | 92 | python main.py --demandDistribution=4 --data_id=34 --demandMu=4 --demandSigma=2 --demandUp=3 --actionUp=5 --actionLow=-5 --leadRecItem1=2 --leadRecItem2=2 --leadRecItem3=2 --leadRecItem4=2 --leadRecOrder1=2 --leadRecOrder2=2 --leadRecOrder3=2 --leadRecOrder4=1 --scaled=True --ch1=1.0 --ch2=0.75 --ch3=0.5 --ch4=0.25 --cp1=10.0 --cp2=0.0 --cp3=0.0 --cp4=0.0 --f1=18.0 --f2=15.0 --f3=14.0 --f4=10.0 --ILInit1=4 --ILInit2=4 --ILInit3=4 --ILInit4=4 --AOInit1=4 --AOInit2=4 --AOInit3=4 --AOInit4=4 --ASInit1=4 --ASInit2=4 --ASInit3=4 --ASInit4=4 93 | 94 | python main.py --demandDistribution=4 --data_id=46 --demandMu=4 --demandSigma=2 --demandUp=3 --actionUp=5 --actionLow=-5 --leadRecItem1=2 --leadRecItem2=2 --leadRecItem3=2 --leadRecItem4=2 --leadRecOrder1=2 --leadRecOrder2=2 --leadRecOrder3=2 --leadRecOrder4=1 --scaled=True --ch1=1.0 --ch2=0.75 --ch3=0.5 --ch4=0.25 --cp1=10.0 --cp2=0.0 --cp3=0.0 --cp4=0.0 --f1=21.0 --f2=16.0 --f3=18.0 --f4=12.0 --ILInit1=4 --ILInit2=4 --ILInit3=4 --ILInit4=4 --AOInit1=4 --AOInit2=4 --AOInit3=4 --AOInit4=4 --ASInit1=4 --ASInit2=4 --ASInit3=4 --ASInit4=4 95 | 96 | ## Use Transfer Learning 97 | We have provided the trained model of the basic model which are used in the transfer learning section. The saved models are available in `pre_model\uniform\0-3\brainX` in which `X` is in `{3, 4, 5, 6}`. The value of `X` follows the same pattern as of `config.gameConfig`. To train a new with either of these trained models, you need to set `config.tlBaseBrain` that determines which trained should be used as the base model. For example: 98 | 99 | python main.py --gameConfig=3 --iftl=True --ifUsePreviousModel=True --tlBaseBrain=3 --baseDemandDistribution=0 100 | 101 | Besides, if you trained a model with another demand distribution, e.g., `N(10,2)`, you need to move the saved models into `pre_model\normal\10-2\brainX` and then for a new training set `config.baseDemandDistribution=1`. The `config.baseDemandDistribution` follows the same pattern as of `config.demandDistribution`. 102 | 103 | ## Other utilities 104 | If you set `config.ifSaveFigure=True`, it saves the trajectories of inventory-level, reward, action, open-order, and order-upto-level for each agent in an episode. `config.saveFigIntLow` and `config.saveFigIntUp` determine the range of eprisode to save the figures. 105 | 106 | Setting `config.ifsaveHistInterval=True`, activate saving of trajectory of the received order, received shipment, inventory-level, reward, action, open-order, and order-upto-level for each agent in an episode. With this argument, you need to determine the interval between every two epsiode to save the history with `config.saveHistInterval`. 
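For illustration only, and assuming these `config` options are exposed as command-line flags in the same way as the arguments used in the commands above (the values here are arbitrary examples), such a run could look like:

    python main.py --gameConfig=3 --ifSaveFigure=True --saveFigIntLow=1000 --saveFigIntUp=1010 --ifsaveHistInterval=True --saveHistInterval=10000
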
107 | 108 | 109 | ## Paper citation 110 | If you used this code for your experiments or found it helpful, consider citing the following paper: 111 | 112 | @article{oroojlooyjadid2017deep, 113 | title={A Deep Q-Network for the Beer Game: Deep Reinforcement Learning for Inventory Optimization}, 114 | author={Oroojlooyjadid, Afshin and Nazari, MohammadReza and Snyder, Lawrence and Tak{\'a}{\v{c}}, Martin}, 115 | journal = {Manufacturing \& Service Operations Management}, 116 | volume = {0}, 117 | number = {0}, 118 | pages = {null}, 119 | year = {0}, 120 | doi = {10.1287/msom.2020.0939}, 121 | 122 | URL = { 123 | https://doi.org/10.1287/msom.2020.0939 124 | 125 | }, 126 | eprint = { 127 | https://doi.org/10.1287/msom.2020.0939 128 | 129 | } 130 | year={2021} 131 | } 132 | -------------------------------------------------------------------------------- /SRDQN.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from time import gmtime, strftime 4 | import tensorflow as tf 5 | import numpy as np 6 | import random 7 | from collections import deque 8 | 9 | class DQN: 10 | def __init__(self,agentNum,config): 11 | if agentNum==0: 12 | graph_dqn1 = tf.Graph() 13 | graph_dqn = graph_dqn1 14 | elif agentNum==1: 15 | graph_dqn2 = tf.Graph() 16 | graph_dqn = graph_dqn2 17 | elif agentNum==2: 18 | graph_dqn3 = tf.Graph() 19 | graph_dqn = graph_dqn3 20 | elif agentNum==3: 21 | graph_dqn4 = tf.Graph() 22 | graph_dqn = graph_dqn4 23 | 24 | with graph_dqn.as_default(): 25 | 26 | tf.set_random_seed(1) 27 | self.agentNum = agentNum 28 | self.global_step = tf.Variable(0, trainable=False) 29 | # Hyper Parameters Link: 30 | self.config = config 31 | modelNumber = 'model'+str(agentNum+1) 32 | #self.addressName = 'model'+str(agentNum+1)+'/savetrained' + str(self.config.address) + '/network-' 33 | self.address = os.path.join(self.config.model_dir, modelNumber) # 'model'+str(agentNum+1)+'/savetrained'+ str(self.config.address) 34 | self.addressName = self.address + '/network-' 35 | if self.config.maxEpisodesTrain != 0: 36 | self.epsilon = config.epsilonBeg 37 | else: 38 | self.epsilon = 0 39 | self.epsilonRed = self.epsilonBuild() 40 | self.inputSize = self.config.stateDim * self.config.multPerdInpt 41 | self.timeStep = 0 42 | self.learning_rate = 0 # this is used when we have decaying 43 | self.iflrReseted = False # this is used to manage the scale of lr 44 | 45 | # init replay memory 46 | self.replayMemory = deque() 47 | self.replaySize = 0 48 | 49 | # create input placeholders 50 | self.createInputs() 51 | 52 | we = [] 53 | be = [] 54 | # create a network same as the saved network, to use some of its weight values. It is used 55 | # when the number of output in the loaded network is different than the current model. 
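# Note: in this transfer-learning path, the saved network is first rebuilt with its original
# output size (createQNetworkForTL uses config.baseActionSize), its weights and biases are read
# from the checkpoint, and they are then passed to createQNetwork() below to initialize every
# layer except the final Q-value layer, whose dimension differs.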
56 | if self.config.ifUsePreviousModel and self.config.ifTransferFromSmallerActionSpace: 57 | # with tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=self.config.gpu_memory_fraction))) as sess: 58 | with tf.Session(config=tf.ConfigProto(intra_op_parallelism_threads=self.config.number_cpu_active, gpu_options=tf.GPUOptions(allow_growth=True))) as sess: 59 | weights, biases = self.createQNetworkForTL() 60 | sess.run(tf.global_variables_initializer()) 61 | if self.config.baseDemandDistribution == 0: 62 | directory=os.path.join(self.config.pre_model_dir,'uniform/'+str(int(self.config.demandLow))+'-'+str(int(self.config.demandUp))) 63 | elif self.config.baseDemandDistribution == 1: 64 | directory=os.path.join(self.config.pre_model_dir,'normal/'+str(int(self.config.demandMu))+'-'+str(int(self.config.demandSigma))) 65 | elif self.config.baseDemandDistribution == 2: 66 | directory=os.path.join(self.config.pre_model_dir,'classic') 67 | elif self.config.baseDemandDistribution == 3: 68 | directory=os.path.join(self.config.pre_model_dir,'basket'+str(self.config.data_id)) 69 | elif self.config.baseDemandDistribution == 4: 70 | directory=os.path.join(self.config.pre_model_dir,'forecast'+str(self.config.data_id)) 71 | 72 | if self.config.gameConfig == 1: 73 | # the Sterman case. 74 | base_brain = 7 + self.config.tlBaseBrain 75 | elif self.config.gameConfig == 2: 76 | # the BS case. 77 | base_brain = 3 + self.config.tlBaseBrain 78 | else: 79 | base_brain = self.config.tlBaseBrain 80 | checkpoint = tf.train.get_checkpoint_state(os.path.join(directory, 'brain'+str(base_brain))) 81 | # checkpoint = tf.train.get_checkpoint_state(os.path.join(self.config.pre_model_dir, 'brain'+str(self.config.tlBaseBrain))) 82 | if checkpoint and checkpoint.model_checkpoint_path: 83 | saver = tf.train.Saver() 84 | saver.restore(sess, checkpoint.model_checkpoint_path) 85 | we = sess.run(weights) 86 | np.save('weights',we) 87 | be=sess.run(biases) 88 | if self.config.INFO_print: 89 | print("Successfully loaded:", checkpoint.model_checkpoint_path) 90 | ifLoadedModel = True 91 | else: 92 | ifLoadedModel = False 93 | if self.config.INFO_print: 94 | print("Could not find old network weights") 95 | 96 | 97 | # init Q network 98 | self.QValue,self.W_fc,self.b_fc = self.createQNetwork('Q', we, be) 99 | # init Target Q Network 100 | self.QValueT,self.W_fcT,self.b_fcT = self.createQNetwork('TQ') 101 | 102 | # copy the network to target network 103 | self.copyTargetQNetworkOperation = self.copyTargetQNetworkFunc() 104 | 105 | # create the placeholders and training model 106 | self.createTrainingMethod() 107 | self.currentState = [] 108 | 109 | # saving and loading networks 110 | self.saver = tf.train.Saver() 111 | config_tf = tf.ConfigProto() 112 | # config_tf.log_device_placement=True 113 | config_tf.gpu_options.per_process_gpu_memory_fraction = self.config.gpu_memory_fraction 114 | config_tf.gpu_options.allow_growth = True 115 | config_tf.intra_op_parallelism_threads = self.config.number_cpu_active 116 | 117 | # create the session 118 | # self.session = tf.InteractiveSession(config=config_tf) 119 | self.session = tf.Session(config=config_tf) 120 | 121 | # call tensor board 122 | self.merged = [] 123 | if self.config.TB: 124 | self.merged = tf.summary.merge_all() 125 | 126 | # create summary writer 127 | self.train_writer = tf.summary.FileWriter(self.config.model_dir + '/tb', self.session.graph) 128 | 129 | # initialize the variables 130 | self.session.run(tf.global_variables_initializer()) 131 | 
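# Note: the block below optionally warm-starts the model from a saved checkpoint. The checkpoint
# directory is chosen from baseDemandDistribution (uniform/normal/classic/basket/forecast) and the
# brain index is derived from gameConfig and tlBaseBrain; on a successful restore, the target
# network is immediately synchronized with the restored Q network.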
132 | if self.config.ifUsePreviousModel: 133 | if not self.config.ifTransferFromSmallerActionSpace: 134 | # check if all agents are dnn, use the save network by each of them. 135 | 136 | if self.config.ifSinglePathExist: 137 | directory=self.config.pre_model_dir 138 | elif self.config.baseDemandDistribution == 0: 139 | directory=os.path.join(self.config.pre_model_dir,'uniform/'+str(int(self.config.demandLow))+'-'+str(int(self.config.demandUp))) 140 | elif self.config.baseDemandDistribution == 1: 141 | directory=os.path.join(self.config.pre_model_dir,'normal/'+str(int(self.config.demandMu))+'-'+str(int(self.config.demandSigma))) 142 | elif self.config.baseDemandDistribution == 2: 143 | directory=os.path.join(self.config.pre_model_dir,'classic') 144 | elif self.config.baseDemandDistribution == 3: 145 | directory=os.path.join(self.config.pre_model_dir,'basket'+str(self.config.data_id)) 146 | elif self.config.baseDemandDistribution == 4: 147 | directory=os.path.join(self.config.pre_model_dir,'forecast'+str(self.config.data_id)) 148 | 149 | if self.config.ifSinglePathExist: 150 | base_brain = self.config.tlBaseBrain + 1 151 | else: 152 | if self.config.gameConfig == 1: 153 | # the Sterman case. 154 | base_brain = 7 + self.config.tlBaseBrain 155 | elif self.config.gameConfig == 2: 156 | base_brain = 3 + self.config.tlBaseBrain 157 | else: 158 | # the BS case. 159 | base_brain = self.config.tlBaseBrain 160 | # checkpoint = tf.train.get_checkpoint_state(os.path.join(self.config.pre_model_dir, 'brain'+str(self.config.gameConfig))) 161 | if self.config.ifSinglePathExist: 162 | model_address = os.path.join(directory, 'model'+str(base_brain)) 163 | else: 164 | model_address = os.path.join(directory, 'brain'+str(base_brain)) 165 | 166 | checkpoint = tf.train.get_checkpoint_state(model_address) 167 | if checkpoint and checkpoint.model_checkpoint_path: 168 | self.saver.restore(self.session, checkpoint.model_checkpoint_path) 169 | if self.config.INFO_print: 170 | print("Successfully loaded:", checkpoint.model_checkpoint_path) 171 | 172 | # copy the network to target network 173 | self.session.run(self.copyTargetQNetworkOperation) 174 | else: 175 | if self.config.INFO_print: 176 | print("Could not find old network weights in ", model_address) 177 | else: 178 | if ifLoadedModel: 179 | # copy the network to target network 180 | self.session.run(self.copyTargetQNetworkOperation) 181 | else: 182 | if self.config.INFO_print: 183 | print("Could not find old network weights") 184 | else: 185 | if self.config.INFO_print: 186 | print("Previous models will not be used") 187 | 188 | 189 | # returns the operator which copies the Q network to the target network 190 | def copyTargetQNetworkFunc(self): 191 | operation = [] 192 | for i in range(self.config.NoHiLayer+1): 193 | operation += [ self.W_fcT[i].assign(self.W_fc[i]),self.b_fcT[i].assign(self.b_fc[i])] 194 | return operation 195 | 196 | def copyBaseNetworkFunc(self, weights, biases): 197 | operation = [] 198 | for i in range(self.config.NoHiLayer): # we ignored the last layer (Q-value) that its dimension is different 199 | operation += [ self.W_fc[i].assign(weights[i]),self.b_fc[i].assign(biases[i])] 200 | return operation 201 | 202 | def createInputs(self): 203 | # input layer 204 | with tf.name_scope('input'): 205 | self.stateInput = tf.placeholder("float",[None,self.config.multPerdInpt,self.config.stateDim]) 206 | with tf.name_scope('input_reshape'): 207 | self.stateInputFlat = tf.reshape(self.stateInput,[-1,self.inputSize]) 208 | 209 | def 
createQNetworkForTL(self, graph_name='Q'): 210 | # input layer 211 | W = [] 212 | b = [] 213 | layer = [] 214 | 215 | for j in range(self.config.NoHiLayer+1): 216 | # var = np.sqrt(1.0/(self.config.nodes[j] + 0.0)) 217 | if j == 0: 218 | # hidden layers 219 | name=graph_name+'-layer'+str(j+1) 220 | hidden, weights, biases = self.fc_layer(self.stateInputFlat, self.config.nodes[j], 221 | self.config.nodes[j+1], name, j) # act=tf.sigmoid 222 | elif j == self.config.NoHiLayer: 223 | # output value 224 | name=graph_name+'-layer'+str(j+1) 225 | QValue, weights, biases = self.fc_layer(layer[j-1], self.config.nodes[j], 226 | self.config.baseActionSize, name,j ,act=tf.identity) 227 | else: 228 | # hidden layers 229 | name=graph_name+'-layer'+str(j+1) 230 | hidden, weights, biases = self.fc_layer(layer[j-1], 231 | self.config.nodes[j], self.config.nodes[j+1], name, j) 232 | 233 | layer += [hidden] 234 | W += [weights] 235 | b += [biases] 236 | 237 | return W, b 238 | 239 | def createQNetwork(self, graph_name, initial_w=[], initial_b=[]): 240 | # initiate the weight variables 241 | W = [] 242 | b = [] 243 | layer = [] 244 | 245 | for j in range(self.config.NoHiLayer+1): 246 | # var = np.sqrt(1.0/(self.config.nodes[j] + 0.0)) 247 | if list(initial_w): 248 | w_init = initial_w[j] 249 | b_init = initial_b[j] 250 | else: 251 | w_init = [] 252 | b_init = [] 253 | 254 | if j == 0: 255 | # hidden layers 256 | name=graph_name+'-layer'+str(j+1) 257 | hidden, weights, biases = self.fc_layer(self.stateInputFlat, self.config.nodes[j], 258 | self.config.nodes[j+1], name, j, w_init, b_init) # act=tf.sigmoid 259 | elif j == self.config.NoHiLayer: 260 | # output value 261 | name=graph_name+'-layer'+str(j+1) 262 | QValue, weights, biases = self.fc_layer(layer[j-1], self.config.nodes[j], 263 | self.config.nodes[j+1], name,j, init_w=[], init_b=[] ,act=tf.identity) 264 | else: 265 | # hidden layers 266 | name=graph_name+'-layer'+str(j+1) 267 | hidden, weights, biases = self.fc_layer(layer[j-1], 268 | self.config.nodes[j], self.config.nodes[j+1], name, j, w_init, b_init) 269 | 270 | layer += [hidden] 271 | W += [weights] 272 | b += [biases] 273 | 274 | return QValue,W,b 275 | 276 | 277 | def copyTargetQNetwork(self): 278 | self.session.run(self.copyTargetQNetworkOperation) 279 | 280 | def createTrainingMethod(self): 281 | self.actionInput = tf.placeholder("float",[None,self.config.actionListLen]) 282 | self.yInput = tf.placeholder("float", [None]) 283 | Q_Action = tf.reduce_sum(tf.multiply(self.QValue, self.actionInput), reduction_indices = 1) # dim: batchSize *1 284 | with tf.name_scope('cost'): 285 | self.cost = tf.reduce_mean(tf.square(self.yInput - Q_Action)) 286 | tf.summary.scalar('cost', self.cost) 287 | #self.trainStep = tf.train.RMSPropOptimizer(self.config.lr0,self.config.decay,self.config.momentum,1e-6).minimize(self.cost) 288 | if self.config.ifDecayAdam: 289 | with tf.name_scope('train'): 290 | self.learning_rate = tf.train.exponential_decay(self.config.lr0, self.global_step, self.config.decayStep, self.config.decayRate, staircase=True) 291 | self.trainStep = tf.train.AdamOptimizer(self.learning_rate,0.9,0.999,1e-8).minimize(self.cost, global_step=self.global_step) 292 | else: 293 | with tf.name_scope('train'): 294 | self.trainStep = tf.train.AdamOptimizer(self.config.lr0,0.9,0.999,1e-8).minimize(self.cost) 295 | 296 | def trainQNetwork(self): 297 | # Step 1: obtain random minibatch from replay memory 298 | minibatch = random.sample(self.replayMemory,self.config.batchSize) 299 | state_batch = [data[0] for 
data in minibatch] #dim: each item is multPerInput*stateDim 300 | action_batch = [data[1] for data in minibatch] 301 | reward_batch = [data[2] for data in minibatch] 302 | nextState_batch = [data[3] for data in minibatch] 303 | 304 | # Step 2: calculate y 305 | y_batch = [] 306 | QValue_batch = self.QValueT.eval(feed_dict={self.stateInput:nextState_batch},session = self.session) 307 | # for i in range(0,self.config.batchSize): 308 | # terminal = minibatch[i][4] 309 | # if terminal: 310 | # y_batch.append(reward_batch[i]) 311 | # else: 312 | # y_batch.append(reward_batch[i] + self.config.gamma * np.max(QValue_batch[i])) 313 | y_batch = reward_batch + (1-np.array(minibatch)[:,4])*self.config.gamma * np.max(QValue_batch, axis=1) 314 | # dim yInput: batchSize*1 315 | # dim actionInput: batchSize*actionListLen 316 | # dim stateInput: batchSize**multPerInput*stateDim 317 | 318 | # check if lr < Minlr, stop its decreasing procedure 319 | lr = self.learning_rate.eval(session=self.session) 320 | if lr < self.config.Minlr and not self.iflrReseted: 321 | self.iflrReseted = True 322 | self.learning_rate = tf.train.exponential_decay(lr, self.global_step, 10000000, 1, staircase=True) 323 | 324 | feed_dict={ 325 | self.yInput : y_batch, 326 | self.actionInput : action_batch, 327 | self.stateInput : state_batch 328 | } 329 | if self.config.TB and (self.timeStep % self.config.tbLogInterval == 1): 330 | # run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) 331 | run_metadata = tf.RunMetadata() 332 | # summary, _ = self.session.run([self.merged, self.trainStep], feed_dict, feed_dictoptions=run_options, 333 | # run_metadata=run_metadata) 334 | summary, _ = self.session.run([self.merged, self.trainStep], feed_dict, 335 | run_metadata=run_metadata) 336 | self.train_writer.add_run_metadata(run_metadata, 'step%03d' % self.timeStep) 337 | self.train_writer.add_summary(summary, self.timeStep) 338 | if self.config.INFO_print: 339 | print('Adding run metadata for', self.timeStep) 340 | else: 341 | summary, _ = self.session.run([self.merged, self.trainStep], feed_dict) 342 | if self.config.TB and (self.timeStep%self.config.tbLogInterval==1): 343 | self.train_writer.add_summary(summary, self.timeStep) 344 | # self.trainStep.run(feed_dict, session=self.session) 345 | # self.session.run([self.trainStep], feed_dict) 346 | 347 | 348 | # grad_w= self.session.run([tf.norm(tf.gradients(self.cost, self.W_fc[3]))], feed_dict) 349 | # grad_b= self.session.run([tf.norm(tf.gradients(self.cost, self.b_fc[3]))], feed_dict) 350 | # print('grad is ', grad_w, grad_b) 351 | """trainResult = self.session.run(self.cost,feed_dict={ 352 | self.yInput : y_batch, 353 | self.actionInput : action_batch, 354 | self.stateInput : state_batch 355 | },session = self.session) 356 | print("TRAIN_RESULT", trainResult) """ 357 | 358 | # save network every saveInterval iteration 359 | if (self.timeStep+1) % self.config.saveInterval == 0: 360 | self.saver.save(self.session, self.addressName, global_step = self.timeStep) 361 | print("network weights are saved") 362 | 363 | if self.timeStep % self.config.dnnUpCnt == 0: 364 | self.copyTargetQNetwork() 365 | 366 | def train(self,nextObservation,action,reward,terminal,playType): 367 | # Considering the multi-period observation idea, merges the last $m-1$ periods with the new state. 
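# currentState has shape (multPerdInpt, stateDim): drop the oldest of the stacked observations
# and append the newest one, so the network always sees the most recent multPerdInpt periods.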
368 | newState = np.append(self.currentState[1:,:],[nextObservation],axis = 0) 369 | 370 | if playType == "train": 371 | if self.config.MultiAgent: 372 | if self.config.MultiAgentRun[self.agentNum]: 373 | self.replayMemory.append([self.currentState,action,reward,newState,terminal]) 374 | self.replaySize = len(self.replayMemory) 375 | else: 376 | self.replayMemory.append([self.currentState,action,reward,newState,terminal]) 377 | self.replaySize = len(self.replayMemory) 378 | 379 | if self.replaySize > self.config.maxReplayMem and self.config.MultiAgentRun[self.agentNum]: 380 | self.replayMemory.popleft() 381 | self.trainQNetwork() 382 | state = "train" 383 | self.timeStep += 1 384 | 385 | elif self.replaySize >= self.config.minReplayMem and self.config.MultiAgentRun[self.agentNum]: 386 | # Train the network 387 | state = "train" 388 | self.trainQNetwork() 389 | self.timeStep += 1 390 | else: 391 | state = "observe" 392 | 393 | if terminal and state == "train": 394 | self.epsilonReduce() 395 | 396 | # print(info) 397 | #print("AGENT", self.agentNum,"/TRAINING_ITER", self.timeStep, "/ STATE", state, \) 398 | #"/ EPSILON", self.epsilon 399 | 400 | self.currentState = newState 401 | 402 | def getDNNAction(self,playType): 403 | action = np.zeros(self.config.actionListLen) 404 | action_index = 0 405 | if playType == "train": 406 | if (random.random() <= self.epsilon) or (self.replaySize < self.config.minReplayMem): 407 | action_index = random.randrange(self.config.actionListLen) 408 | action[action_index] = 1 409 | else: 410 | QValue = self.QValue.eval(feed_dict= {self.stateInput:[self.currentState]},session = self.session)[0] 411 | action_index = np.argmax(QValue) 412 | action[action_index] = 1 413 | elif playType == "test" : 414 | QValue = self.QValue.eval(feed_dict= {self.stateInput:[self.currentState]},session = self.session)[0] 415 | action_index = np.argmax(QValue) 416 | action[action_index] = 1 417 | 418 | return action 419 | 420 | # this functions sets the current state of the game in the begining of each game 421 | def setInitState(self,observation): 422 | self.currentState = np.stack([observation for _ in range(self.config.multPerdInpt)], axis = 0) # multPerdInpt observations stacked. each row is an observation 423 | 424 | 425 | def epsilonBuild(self): # this function specifies how much we should deduct from /epsilon at each game 426 | betta = 0.8 427 | if self.config.maxEpisodesTrain != 0: 428 | epsilon_red = (self.config.epsilonBeg - self.config.epsilonEnd)/(self.config.maxEpisodesTrain*betta) 429 | else: 430 | epsilon_red = 0 431 | return epsilon_red 432 | 433 | def epsilonReduce(self): 434 | # Reduces the values of epsilon at each iteration of episode 435 | if self.epsilon >self.config.epsilonEnd: 436 | self.epsilon -= self.epsilonRed 437 | 438 | def deleteGraph(self): 439 | tf.reset_default_graph() 440 | self.sess.close() 441 | 442 | 443 | 444 | def fc_layer(self, input_tensor, input_dim, output_dim, layer_name, j_, init_w=[], init_b=[], act=tf.nn.relu): 445 | """Reusable code for making a simple fully connected neural net layer. 446 | 447 | It does a matrix multiply, bias add, and then uses relu to nonlinearize. 448 | It also sets up name scoping so that the resultant graph is easy to read, 449 | and adds a number of summary ops. 
450 | """ 451 | def variable_summaries(var): 452 | """Attach a lot of summaries to a Tensor (for TensorBoard visualization).""" 453 | with tf.name_scope('summaries'): 454 | mean = tf.reduce_mean(var) 455 | tf.summary.scalar('mean', mean) 456 | with tf.name_scope('stddev'): 457 | stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean))) 458 | tf.summary.scalar('stddev', stddev) 459 | tf.summary.scalar('max', tf.reduce_max(var)) 460 | tf.summary.scalar('min', tf.reduce_min(var)) 461 | tf.summary.histogram('histogram', var) 462 | 463 | def weight_variable(shape, j_, init_w=None): 464 | """Create a weight variable with appropriate initialization.""" 465 | if not list(init_w): 466 | initial = tf.random.truncated_normal(shape, stddev = 0.1) 467 | else: 468 | initial = tf.constant(init_w) 469 | if self.config.iftl and j_ < self.config.NoFixedLayer: 470 | return tf.Variable(initial, trainable=False) 471 | else: 472 | return tf.Variable(initial, trainable=True) 473 | 474 | def bias_variable(shape, j_, init_b=None): 475 | """Create a bias variable with appropriate initialization.""" 476 | if not list(init_b): 477 | initial = tf.constant(0.1, shape = shape) 478 | else: 479 | initial = tf.constant(init_b) 480 | if self.config.iftl and j_ < self.config.NoFixedLayer: 481 | return tf.Variable(initial, trainable=False) 482 | else: 483 | return tf.Variable(initial, trainable=True) 484 | 485 | # Adding a name scope ensures logical grouping of the layers in the graph. 486 | with tf.name_scope(layer_name): 487 | # This Variable will hold the state of the weights for the layer 488 | with tf.name_scope('weights'): 489 | weights = weight_variable([input_dim, output_dim], j_, init_w) 490 | variable_summaries(weights) 491 | with tf.name_scope('biases'): 492 | biases = bias_variable([output_dim], j_, init_b) 493 | variable_summaries(biases) 494 | with tf.name_scope('Wx_plus_b'): 495 | preactivate = tf.matmul(input_tensor, weights) + biases 496 | tf.summary.histogram('pre_activations', preactivate) 497 | activations = act(preactivate, name='activation') 498 | tf.summary.histogram('activations', activations) 499 | return activations, weights, biases 500 | 501 | -------------------------------------------------------------------------------- /clBeergame.py: -------------------------------------------------------------------------------- 1 | import time 2 | from time import gmtime, strftime 3 | import numpy as np 4 | import random 5 | from random import randint 6 | from BGAgent import Agent 7 | from plotting import plotting, savePlot, plotBaseStock 8 | import matplotlib.pyplot as plt 9 | import os 10 | from matplotlib import rc 11 | rc('text', usetex=True) 12 | import tensorflow as tf 13 | 14 | class clBeerGame(object): 15 | def __init__(self, config): 16 | self.config = config 17 | self.curGame = 0 # The number associated with the current game (counter of the game) 18 | self.curTime = 0 19 | self.totIterPlayed = 0 # total iterations of the game, played so far in this and previous games 20 | self.players = self.createAgent() # create the agents 21 | self.T = 0 22 | self.demand = [] 23 | self.playType = [] # "train" or "test" 24 | self.ifOptimalSolExist = self.config.ifOptimalSolExist 25 | self.getOptimalSol() 26 | self.totRew = 0 # it is reward of all players obtained for the current player. 
27 | self.resultTest = [] 28 | self.runnerMidlResults = [] # stores the results to use in runner comparisons 29 | self.runnerFinlResults = [] # stores the results to use in runner comparisons 30 | self.middleTestResult = [] # stores the whole middle results of bs, Strm, and random to avoid doing same tests multiple of times. 31 | self.runNumber = 0 # the runNumber which is used when use runner 32 | self.strNum = 0 # the runNumber which is used when use runner 33 | 34 | # createAgent : Create agent objects (agentNum,IL,OO,c_h,c_p,type,config) 35 | def createAgent(self): 36 | agentTypes = self.config.agentTypes 37 | return [Agent(i,self.config.ILInit[i], self.config.AOInit, self.config.ASInit[i], 38 | self.config.c_h[i], self.config.c_p[i], self.config.eta[i], 39 | agentTypes[i],self.config) for i in range(self.config.NoAgent)] 40 | 41 | # planHorizon : Find a random planning horizon 42 | def planHorizon(self): 43 | # TLow: minimum number for the planning horizon # TUp: maximum number for the planning horizon 44 | #output: The planning horizon which is chosen randomly. 45 | return randint(self.config.TLow,self.config.TUp) 46 | 47 | # this function resets the game for start of the new game 48 | def resetGame(self, demand, playType): 49 | self.playType = playType #"train" or "test" 50 | self.demand = demand 51 | self.curTime = 0 52 | if playType == "train": 53 | self.curGame += 1 54 | self.totIterPlayed += self.T 55 | self.T = self.planHorizon() 56 | else: 57 | self.T = self.config.Ttest 58 | 59 | # reset the required information of player for each episode 60 | for k in range(0,self.config.NoAgent): 61 | self.players[k].resetPlayer(self.T) 62 | 63 | # update OO when there are initial IL,AO,AS 64 | self.update_OO() 65 | 66 | # correction on cost at time T according to the cost of the other players 67 | def getTotRew(self): 68 | totRew = 0 69 | for i in range(self.config.NoAgent): 70 | # sum all rewards for the agents and make correction 71 | totRew += self.players[i].cumReward 72 | 73 | for i in range(self.config.NoAgent): 74 | self.players[i].curReward += self.players[i].eta*(totRew - self.players[i].cumReward) #/(self.T) 75 | 76 | # make correction to the rewards in the experience replay for all iterations of current game 77 | def distTotReward(self): 78 | totRew = 0 79 | optRew = 0.1 80 | for i in range(self.config.NoAgent): 81 | # sum all rewards for the agents and make correction 82 | totRew += self.players[i].cumReward 83 | totRew += optRew 84 | 85 | for i in range(self.config.NoAgent): 86 | for j in range(self.T): 87 | if self.config.NoAgent>1 and hasattr(self.players[i], 'brain') and (len(self.players[i].brain.replayMemory)>0): 88 | #self.players[i].brain.replayMemory[-1*(j+1)][2] += (np.power(self.config.alpha,j)/(self.config.NoAgent-1))*((totRew - self.players[i].cumReward)/(self.T)) # changes the last T periods in the replayMemory 89 | self.players[i].brain.replayMemory[-1*(j+1)][2] += (self.config.distCoeff/(self.config.NoAgent-1))*((totRew - self.players[i].cumReward)/(self.T)) # changes the last T periods in the replayMemory 90 | 91 | def getAction(self, k): 92 | 93 | # get action for training run 94 | if self.playType == "train": 95 | if self.players[k].compTypeTrain == "srdqn": 96 | self.players[k].action = np.zeros(self.config.actionListLen) 97 | self.players[k].action = self.players[k].brain.getDNNAction(self.playType) 98 | elif self.players[k].compTypeTrain == "Strm": 99 | self.players[k].action = np.zeros(self.config.actionListLenOpt) 100 | 
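# Sterman (Strm) heuristic: order the currently arrived order AO_t plus a correction
# alpha_b*(IL - a_b) for the inventory gap and betta_b*(OO - b_b) for the supply-line gap,
# truncated at zero and mapped to the closest value in actionListOpt.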
self.players[k].action[np.argmin(np.abs(np.array(self.config.actionListOpt)\ 101 | -max(0,round(self.players[k].AO[self.curTime] +\ 102 | self.players[k].alpha_b*(self.players[k].IL - self.players[k].a_b) +\ 103 | self.players[k].betta_b*(self.players[k].OO - self.players[k].b_b)))))] = 1 104 | elif self.players[k].compTypeTest == "rnd": 105 | self.players[k].action = np.zeros(self.config.actionListLen) 106 | a = np.random.randint(self.config.actionListLen) 107 | self.players[k].action[a] = 1 108 | elif self.players[k].compTypeTrain == "bs": 109 | self.players[k].action = np.zeros(self.config.actionListLenOpt) 110 | if self.config.demandDistribution == 2: 111 | if self.curTime and self.config.use_initial_BS <= 4: 112 | self.players[k].action [np.argmin(np.abs(np.array(self.config.actionListOpt)-\ 113 | max(0,(self.players[k].int_bslBaseStock - (self.players[k].IL + self.players[k].OO - self.players[k].AO[self.curTime]))) ))] = 1 114 | else: 115 | self.players[k].action [np.argmin(np.abs(np.array(self.config.actionListOpt)-\ 116 | max(0,(self.players[k].bsBaseStock - (self.players[k].IL + self.players[k].OO - self.players[k].AO[self.curTime]))) ))] = 1 117 | else: 118 | self.players[k].action [np.argmin(np.abs(np.array(self.config.actionListOpt)-\ 119 | max(0,(self.players[k].bsBaseStock - (self.players[k].IL + self.players[k].OO - self.players[k].AO[self.curTime]))) ))] = 1 120 | else: 121 | # not a valid player is defined. 122 | raise Exception('The player type is not defined or it is not a valid type.!') 123 | 124 | # get action for test runs 125 | elif self.playType == "test": 126 | if self.players[k].compTypeTest == "srdqn": 127 | self.players[k].action = np.zeros(self.config.actionListLen) 128 | if self.config.ifPlaySavedData: 129 | self.players[k].action[int(self.loaded_dqn_actions[self.curTime])] = 1 130 | else: 131 | self.players[k].action = self.players[k].brain.getDNNAction(self.playType) 132 | elif self.players[k].compTypeTest == "Strm": 133 | self.players[k].action = np.zeros(self.config.actionListLenOpt) 134 | 135 | self.players[k].action[np.argmin(np.abs(np.array(self.config.actionListOpt)-\ 136 | max(0,round(self.players[k].AO[self.curTime] +\ 137 | self.players[k].alpha_b*(self.players[k].IL - self.players[k].a_b) +\ 138 | self.players[k].betta_b*(self.players[k].OO - self.players[k].b_b)))))] = 1 139 | elif self.players[k].compTypeTest == "rnd": 140 | self.players[k].action = np.zeros(self.config.actionListLen) 141 | a = np.random.randint(self.config.actionListLen) 142 | self.players[k].action[a] = 1 143 | elif self.players[k].compTypeTest == "bs": 144 | self.players[k].action = np.zeros(self.config.actionListLenOpt) 145 | 146 | if self.config.demandDistribution == 2: 147 | if self.curTime and self.config.use_initial_BS <= 4: 148 | self.players[k].action [np.argmin(np.abs(np.array(self.config.actionListOpt)-\ 149 | max(0,(self.players[k].int_bslBaseStock - (self.players[k].IL + self.players[k].OO - self.players[k].AO[self.curTime]))) ))] = 1 150 | else: 151 | self.players[k].action [np.argmin(np.abs(np.array(self.config.actionListOpt)-\ 152 | max(0,(self.players[k].bsBaseStock - (self.players[k].IL + self.players[k].OO - self.players[k].AO[self.curTime]))) ))] = 1 153 | else: 154 | self.players[k].action [np.argmin(np.abs(np.array(self.config.actionListOpt)-\ 155 | max(0,(self.players[k].bsBaseStock - (self.players[k].IL + self.players[k].OO - self.players[k].AO[self.curTime]))) ))] = 1 156 | else: 157 | # not a valid player is defined. 
158 | raise Exception('The player type is not defined or it is not a valid type.!') 159 | # print(self.curTime, self.players[k].agentNum, "IL", self.players[k].IL, "OO", self.players[k].OO, "Op", self.players[k].bsBaseStock, self.players[k].bsBaseStock - (self.players[k].IL + self.players[k].OO)) 160 | 161 | # next action 162 | def next(self): 163 | # get a random leadtime 164 | leadTimeIn = randint(self.config.leadRecItemLow[self.config.NoAgent-1], self.config.leadRecItemUp[self.config.NoAgent-1]) 165 | # handle the most upstream recieved shipment 166 | self.players[self.config.NoAgent-1].AS[self.curTime + leadTimeIn] += self.players[self.config.NoAgent-1].actionValue(self.curTime, self.playType) 167 | 168 | for k in range(self.config.NoAgent-1,-1,-1): # [3,2,1,0] 169 | 170 | # get current IL and Backorder 171 | current_IL = max(0, self.players[k].IL) 172 | current_backorder = max(0, -self.players[k].IL) 173 | 174 | # TODO: We have get the AS and AO from the UI and update our AS and AO, so that code update the corresponding variables 175 | 176 | # increase IL and decrease OO based on the action, for the next period 177 | self.players[k].recieveItems(self.curTime) 178 | 179 | # observe the reward 180 | possible_shipment = min(current_IL + self.players[k].AS[self.curTime], current_backorder + self.players[k].AO[self.curTime]) 181 | 182 | # plan arrivals of the items to the downstream agent 183 | if self.players[k].agentNum > 0: 184 | leadTimeIn = randint(self.config.leadRecItemLow[k-1], self.config.leadRecItemUp[k-1]) 185 | self.players[k-1].AS[self.curTime + leadTimeIn] += possible_shipment 186 | 187 | # update IL 188 | self.players[k].IL -= self.players[k].AO[self.curTime] 189 | # observe the reward 190 | self.players[k].getReward() 191 | self.players[k].hist[-1][-2] = self.players[k].curReward 192 | self.players[k].hist2[-1][-2] = self.players[k].curReward 193 | 194 | # update next observation 195 | self.players[k].nextObservation = self.players[k].getCurState(self.curTime+1) 196 | 197 | if self.config.ifUseTotalReward: 198 | # correction on cost at time T 199 | if self.curTime == self.T: 200 | self.getTotRew() 201 | 202 | self.curTime +=1 203 | 204 | def handelAction(self): 205 | # get random lead time 206 | leadTime = randint(self.config.leadRecOrderLow[0], self.config.leadRecOrderUp[0]) 207 | # set AO 208 | self.players[0].AO[self.curTime] += self.demand[self.curTime] 209 | for k in range(0,self.config.NoAgent): 210 | self.getAction(k) 211 | 212 | self.players[k].srdqnBaseStock += [self.players[k].actionValue( \ 213 | self.curTime, self.playType) + self.players[k].IL + self.players[k].OO] 214 | 215 | # update hist for the plots 216 | self.players[k].hist += [[self.curTime,self.players[k].IL, self.players[k].OO,\ 217 | self.players[k].actionValue(self.curTime,self.playType),self.players[k].curReward, self.players[k].srdqnBaseStock[-1]]] 218 | 219 | if (self.players[k].compTypeTrain == "srdqn" and self.playType == "train") or (self.players[k].compTypeTest == "srdqn" and self.playType == "test"): 220 | self.players[k].hist2 += [[self.curTime,self.players[k].IL, self.players[k].OO, self.players[k].AO[self.curTime], self.players[k].AS[self.curTime], \ 221 | self.players[k].actionValue(self.curTime,self.playType), self.players[k].curReward, \ 222 | self.config.actionList[np.argmax(self.players[k].action)]]] 223 | 224 | else: 225 | self.players[k].hist2 += [[self.curTime,self.players[k].IL, self.players[k].OO, self.players[k].AO[self.curTime], self.players[k].AS[self.curTime], \ 226 | 
self.players[k].actionValue(self.curTime,self.playType), self.players[k].curReward, 0]] 227 | 228 | # updates OO and AO at time t+1 229 | self.players[k].OO += self.players[k].actionValue(self.curTime, self.playType) # open order level update 230 | leadTime = randint(self.config.leadRecOrderLow[k], self.config.leadRecOrderUp[k]) 231 | if self.players[k].agentNum < self.config.NoAgent-1: 232 | self.players[k+1].AO[self.curTime + leadTime] += self.players[k].actionValue(self.curTime, self.playType) # open order level update 233 | 234 | 235 | def playGame(self, demand, playType): 236 | self.resetGame(demand, playType) 237 | 238 | # run the game 239 | while self.curTime <= self.T: 240 | self.handelAction() 241 | self.next() 242 | 243 | 244 | for k in range(0,self.config.NoAgent): 245 | if (self.players[k].compTypeTrain == "srdqn" and playType == "train") or (self.players[k].compTypeTest == "srdqn" and playType == "test"): 246 | # control the learner agent 247 | 248 | self.players[k].brain.train(self.players[k].nextObservation,self.players[k].action, \ 249 | self.players[k].curReward,self.curTime == self.T,self.playType) 250 | if self.config.ifUsedistTotReward and playType == "train": 251 | self.distTotReward() 252 | return [-1*self.players[i].cumReward for i in range(0,self.config.NoAgent)] 253 | 254 | # check the Shang and Song (2003) condition, and if it works, obtains the base stock policy values for each agent 255 | def getOptimalSol(self): 256 | # if self.config.NoAgent !=1: 257 | if self.config.NoAgent !=1 and 1 == 2: 258 | # check the Shang and Song (2003) condition. 259 | for k in range(self.config.NoAgent-1): 260 | if not (self.players[k].c_h == self.players[k+1].c_h and self.players[k+1].c_p == 0): 261 | self.ifOptimalSolExist = False 262 | 263 | # if the Shang and Song (2003) condition satisfied, it runs the algorithm 264 | if self.ifOptimalSolExist == True: 265 | calculations = np.zeros((7,self.config.NoAgent)) 266 | for k in range(self.config.NoAgent): 267 | # DL_high 268 | calculations[0][k] = ((self.config.leadRecItemLow +self.config.leadRecItemUp + 2)/2 \ 269 | + (self.config.leadRecOrderLow+self.config.leadRecOrderUp + 2)/2)* \ 270 | (self.config.demandUp - self.config.demandLow- 1) 271 | if k > 0: 272 | calculations[0][k] += calculations[0][k-1] 273 | # probability_high 274 | nominator_ch = 0 275 | low_denominator_ch = 0 276 | for j in range(k,self.config.NoAgent): 277 | if j < self.config.NoAgent-1: 278 | nominator_ch += self.players[j+1].c_h 279 | low_denominator_ch += self.players[j].c_h 280 | if k == 0: 281 | high_denominator_ch = low_denominator_ch 282 | calculations[2][k] = (self.players[0].c_p + nominator_ch)/(self.players[0].c_p + low_denominator_ch + 0.0) 283 | # probability_low 284 | calculations[3][k] = (self.players[0].c_p + nominator_ch)/(self.players[0].c_p + high_denominator_ch + 0.0) 285 | # S_high 286 | calculations[4] = np.round(np.multiply(calculations[0],calculations[2])) 287 | # S_low 288 | calculations[5] = np.round(np.multiply(calculations[0],calculations[3])) 289 | # S_avg 290 | calculations[6] = np.round(np.mean(calculations[4:6], axis=0)) 291 | # S', set the base stock values into each agent. 
292 | for k in range(self.config.NoAgent): 293 | if k == 0: 294 | self.players[k].bsBaseStock = calculations[6][k] 295 | 296 | else: 297 | self.players[k].bsBaseStock = calculations[6][k] - calculations[6][k-1] 298 | if self.players[k].bsBaseStock < 0: 299 | self.players[k].bsBaseStock = 0 300 | elif self.config.NoAgent ==1: 301 | if self.config.demandDistribution==0: 302 | self.players[0].bsBaseStock = np.ceil(self.config.c_h[0]/(self.config.c_h[0]+self.config.c_p[0]+ 0.0))*((self.config.demandUp-self.config.demandLow-1)/2)*self.config.leadRecItemUp 303 | elif 1 == 1: 304 | f = self.config.f 305 | f_init = self.config.f_init 306 | for k in range(self.config.NoAgent): 307 | self.players[k].bsBaseStock = f[k] 308 | self.players[k].int_bslBaseStock = f_init[k] 309 | 310 | def doTestMid(self, demandTs): 311 | if self.config.ifPlaySavedData: 312 | for c,i in enumerate(self.config.agentTypes): 313 | if i == "srdqn": 314 | dnn_agent = c 315 | break 316 | 317 | self.resultTest = [] 318 | for i in range(self.config.testRepeatMid): 319 | if self.config.ifPlaySavedData: 320 | hist2 = np.load(os.path.join(self.config.model_dir,'DQN-0-player-'+str(dnn_agent)+'-'+str(i)+'.npy')) 321 | self.loaded_dqn_actions = hist2[:,7] 322 | self.doTest(i,demandTs[i]) 323 | 324 | print("---------------------------------------------------------------------------------------") 325 | resultSummary = np.array(self.resultTest).mean(axis=0).tolist() 326 | 327 | 328 | 329 | result_srdqn= ', '.join(map("{:.2f}".format, resultSummary[0])) 330 | result_rand= ', '.join(map("{:.2f}".format, resultSummary[1])) 331 | result_strm= ', '.join(map("{:.2f}".format, resultSummary[2])) 332 | if self.ifOptimalSolExist: 333 | result_bs= ', '.join(map("{:.2f}".format, resultSummary[3])) 334 | print('SUMMARY; {0:s}; ITER= {1:d}; SRDQN= [{2:s}]; SUM = {3:2.4f}; Rand= [{4:s}]; SUM = {5:2.4f}; STRM= [{6:s}]; SUM = {7:2.4f}; BS= [{8:s}]; SUM = {9:2.4f}'.format(strftime("%Y-%m-%d %H:%M:%S", gmtime()) , 335 | self.curGame, result_srdqn, sum(resultSummary[0]), 336 | result_rand, sum(resultSummary[1]), 337 | result_strm, sum(resultSummary[2]), 338 | result_bs, sum(resultSummary[3]))) 339 | 340 | else: 341 | print('SUMMARY; {0:s}; ITER= {1:d}; SRDQN= [{2:s}]; SUM = {3:2.4f}; Rand= [{4:s}]; SUM = {5:2.4f}; STRM= [{6:s}]; SUM = {7:2.4f}'.format(strftime("%Y-%m-%d %H:%M:%S", gmtime()) , 342 | self.curGame, result_srdqn, sum(resultSummary[0]), 343 | result_rand, sum(resultSummary[1]), 344 | result_strm, sum(resultSummary[2]))) 345 | 346 | print("=======================================================================================") 347 | 348 | 349 | 350 | 351 | def doTest(self, m,demand): 352 | import matplotlib.pyplot as plt 353 | 354 | if (self.config.ifSaveFigure) and (self.curGame in range(self.config.saveFigInt[0],self.config.saveFigInt[1])): 355 | plt.figure(self.curGame, figsize=(12, 8), dpi=80, facecolor='w', edgecolor='k') 356 | 357 | self.demand = demand 358 | # use dnn to get output. 359 | Rsltdnn,plt = self.tester(self.config.agentTypes ,plt, 'b', 'DQN' ,m) 360 | baseStockdata = self.players[0].srdqnBaseStock 361 | 362 | # check some condition to avoid doing same test middle again. 363 | if ((self.config.ifSaveFigure) and (self.curGame in range(self.config.saveFigInt[0],self.config.saveFigInt[1]))) \ 364 | or (self.curGame >= self.config.maxEpisodesTrain-1) or (len(self.middleTestResult) < self.config.testRepeatMid): 365 | 366 | # use random to get output. 
367 | RsltRnd ,plt= self.tester(["rnd","rnd","rnd","rnd"], plt,'y21', 'RAND' ,m) 368 | 369 | # use formual to get output. 370 | RsltStrm ,plt= self.tester(["Strm","Strm","Strm","Strm"],plt, 'g', 'Strm' ,m) 371 | 372 | # use optimal strategy to get output, if it works. 373 | if self.ifOptimalSolExist: 374 | if self.config.agentTypes == ["srdqn", "Strm","Strm","Strm"]: 375 | Rsltbs ,plt= self.tester(["bs","Strm","Strm","Strm"],plt, 'r', 'Strm-BS' ,m) 376 | elif self.config.agentTypes == ["Strm", "srdqn","Strm","Strm"]: 377 | Rsltbs ,plt= self.tester(["Strm","bs","Strm","Strm"],plt, 'r', 'Strm-BS' ,m) 378 | elif self.config.agentTypes == ["Strm", "Strm","srdqn","Strm"]: 379 | Rsltbs ,plt= self.tester(["Strm","Strm","bs","Strm"],plt, 'r', 'Strm-BS' ,m) 380 | elif self.config.agentTypes == ["Strm", "Strm","Strm","srdqn"]: 381 | Rsltbs ,plt= self.tester(["Strm","Strm","Strm","bs"],plt, 'r', 'Strm-BS' ,m) 382 | elif self.config.agentTypes == ["srdqn", "rnd","rnd","rnd"]: 383 | Rsltbs ,plt= self.tester(["bs","rnd","rnd","rnd"],plt, 'r', 'RND-BS' ,m) 384 | elif self.config.agentTypes == ["rnd", "srdqn","rnd","rnd"]: 385 | Rsltbs ,plt= self.tester(["rnd","bs","rnd","rnd"],plt, 'r', 'RND-BS' ,m) 386 | elif self.config.agentTypes == ["rnd", "rnd","srdqn","rnd"]: 387 | Rsltbs ,plt= self.tester(["rnd","rnd","bs","rnd"],plt, 'r', 'RND-BS' ,m) 388 | elif self.config.agentTypes == ["rnd", "rnd","rnd","srdqn"]: 389 | Rsltbs ,plt= self.tester(["rnd","rnd","rnd","bs"],plt, 'r', 'RND-BS' ,m) 390 | else: 391 | Rsltbs ,plt= self.tester(["bs","bs","bs","bs"],plt, 'r', 'BS' ,m) 392 | # hold the results of the optimal solution 393 | self.middleTestResult += [[RsltRnd,RsltStrm,Rsltbs]] 394 | else: 395 | self.middleTestResult += [[RsltRnd,RsltStrm]] 396 | 397 | else: 398 | # return the obtained results into their lists 399 | RsltRnd = self.middleTestResult[m][0] 400 | RsltStrm = self.middleTestResult[m][1] 401 | if self.ifOptimalSolExist: 402 | Rsltbs = self.middleTestResult[m][2] 403 | 404 | # save the figure 405 | if self.config.ifSaveFigure and (self.curGame in range(self.config.saveFigInt[0],self.config.saveFigInt[1])): 406 | savePlot(self.players, self.curGame, Rsltdnn ,RsltStrm, Rsltbs , self.config, m) 407 | 408 | result_srdqn = ', '.join(map("{:.2f}".format, Rsltdnn)) 409 | result_rand = ', '.join(map("{:.2f}".format, RsltRnd)) 410 | result_strm = ', '.join(map("{:.2f}".format, RsltStrm)) 411 | if self.ifOptimalSolExist: 412 | result_bs = ', '.join(map("{:.2f}".format, Rsltbs)) 413 | print('output; {0:s}; Iter= {1:s}; SRDQN= [{2:s}]; sum = {3:2.4f}; Rand= [{4:s}]; sum = {5:2.4f}; Strm= [{6:s}]; sum = {7:2.4f}; BS= [{8:s}]; sum = {9:2.4f}'.format( 414 | strftime("%Y-%m-%d %H:%M:%S", gmtime()) , str(str(self.curGame)+"-"+str(m)), result_srdqn , sum(Rsltdnn), 415 | result_rand, sum(RsltRnd), 416 | result_strm, sum(RsltStrm), 417 | result_bs, sum(Rsltbs))) 418 | self.resultTest += [[Rsltdnn,RsltRnd,RsltStrm,Rsltbs]] 419 | 420 | else: 421 | print('output; {0:s}; Iter= {1:s}; SRDQN= [{2:s}]; sum = {3:2.4f}; Rand= [{4:s}]; sum = {5:2.4f}; Strm= [{6:s}]; sum = {7:2.4f}'.format(strftime("%Y-%m-%d %H:%M:%S", gmtime()) , 422 | str(str(self.curGame)+"-"+str(m)), result_srdqn, sum(Rsltdnn), 423 | result_rand, sum(RsltRnd), 424 | result_strm, sum(RsltStrm))) 425 | 426 | self.resultTest += [[Rsltdnn,RsltRnd,RsltStrm]] 427 | 428 | return sum(Rsltdnn) 429 | 430 | def tester(self,testType,plt, colori, labeli ,m): 431 | 432 | # set computation type for test 433 | for k in range(0,self.config.NoAgent): 434 | 
self.players[k].compTypeTest = testType[k] 435 | # run the episode to get the results. 436 | result = self.playGame(self.demand,"test") 437 | # add the results into the figure 438 | if self.config.ifSaveFigure and (self.curGame in range(self.config.saveFigInt[0],self.config.saveFigInt[1])) and (testType[0] != "rnd"): 439 | plt = plotting(plt,[np.array(self.players[i].hist) for i in range(0,self.config.NoAgent)],colori, labeli) 440 | if self.config.ifsaveHistInterval and ((self.curGame == 0) or (self.curGame == 1) or (self.curGame == 2) or(self.curGame == 3) or ((self.curGame - 1) % self.config.saveHistInterval == 0)\ 441 | or ((self.curGame) % self.config.saveHistInterval == 0) or ((self.curGame) % self.config.saveHistInterval == 1) \ 442 | or ((self.curGame) % self.config.saveHistInterval == 2)) : 443 | for k in range(0,self.config.NoAgent): 444 | name = labeli + "-" + str(self.curGame) + "-" + "player" + "-" + str(k)+ "-" + str(m) 445 | np.save(os.path.join(self.config.model_dir,name), np.array(self.players[k].hist2)) 446 | 447 | # save the figure of base stocks 448 | # if self.config.ifSaveFigure and (self.curGame in range(self.config.saveFigInt[0],self.config.saveFigInt[1])): 449 | # for k in range(self.config.NoAgent): 450 | # if self.players[k].compTypeTest == 'dnn': 451 | # plotBaseStock(self.players[k].srdqnBaseStock, 'b', 'base stock of agent '+ str(self.players[k].agentNum), self.curGame, self.config, m) 452 | 453 | return result,plt 454 | 455 | 456 | def update_OO(self): 457 | for k in range(0,self.config.NoAgent): 458 | if k < self.config.NoAgent - 1: 459 | self.players[k].OO = sum(self.players[k+1].AO) + sum(self.players[k].AS) 460 | else: 461 | self.players[k].OO = sum(self.players[k].AS) 462 | 463 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import numpy as np 4 | 5 | def str2bool(v): 6 | return v.lower() in ('true', '1') 7 | 8 | arg_lists = [] 9 | parser = argparse.ArgumentParser() 10 | 11 | def add_argument_group(name): 12 | arg = parser.add_argument_group(name) 13 | arg_lists.append(arg) 14 | return arg 15 | 16 | # crm 17 | game_arg = add_argument_group('BeerGame') 18 | game_arg.add_argument('--task', type=str, default='bg') 19 | game_arg.add_argument('--fixedAction', type=str2bool, default='False', help='if you want to have actions in [0,actionMax] set it to True. 
With False it will be set to [actionLow, actionUp]')
20 | game_arg.add_argument('--observation_data', type=str2bool, default=False, help='if it is True, then it uses the data that is generated based on a few real-world observations')
21 | game_arg.add_argument('--data_id', type=int, default=22, help='the default item id for the basket dataset')
22 | game_arg.add_argument('--TLow', type=int, default=100, help='duration of one GAME (lower bound)')
23 | game_arg.add_argument('--TUp', type=int, default=100, help='duration of one GAME (upper bound)')
24 | game_arg.add_argument('--demandDistribution', type=int, default=0, help='0=uniform, 1=normal distribution, 2=the sequence of 4,4,4,4,8,..., 3= basket data, 4= forecast data')
25 | game_arg.add_argument('--scaled', type=str2bool, default=False, help='if true, it uses the existing scaled parameters (if any)')
26 | game_arg.add_argument('--demandLow', type=int, default=0, help='the lower bound of random demand')
27 | game_arg.add_argument('--demandUp', type=int, default=3, help='the upper bound of random demand')
28 | game_arg.add_argument('--demandMu', type=float, default=10, help='the mu of the normal distribution for demand')
29 | game_arg.add_argument('--demandSigma', type=float, default=2, help='the sigma of the normal distribution for demand')
30 | game_arg.add_argument('--actionMax', type=int, default=2, help='it works when fixedAction is True')
31 | game_arg.add_argument('--actionUp', type=int, default=2, help='upper bound on the order decision, it works when fixedAction is False')
32 | game_arg.add_argument('--actionLow', type=int, default=-2, help='lower bound on the order decision, it works when fixedAction is False')
33 | game_arg.add_argument('--action_step', type=int, default=1, help='The action value obtained by the dnn is multiplied by this value')
34 | game_arg.add_argument('--actionList', type=list, default=[], help='The list of the available actions')
35 | game_arg.add_argument('--actionListLen', type=int, default=0, help='the length of the action list')
36 | game_arg.add_argument('--actionListOpt', type=int, default=0 , help='the action list which is used by the optimal (bs) and Sterman players')
37 | game_arg.add_argument('--actionListLenOpt', type=int, default=0, help='the length of actionListOpt')
38 | game_arg.add_argument('--agentTypes', type=list, default=['dnn','dnn','dnn','dnn'], help='the player types')
39 | game_arg.add_argument('--agent_type1', type=str, default='dnn', help='the player type for agent 1, it can be dnn, Strm, bs, rnd')
40 | game_arg.add_argument('--agent_type2', type=str, default='dnn', help='the player type for agent 2, it can be dnn, Strm, bs, rnd')
41 | game_arg.add_argument('--agent_type3', type=str, default='dnn', help='the player type for agent 3, it can be dnn, Strm, bs, rnd')
42 | game_arg.add_argument('--agent_type4', type=str, default='dnn', help='the player type for agent 4, it can be dnn, Strm, bs, rnd')
43 | game_arg.add_argument('--NoAgent', type=int, default=1, help='number of agents, currently it should be in {1,2,3,4}')
44 | game_arg.add_argument('--cp1', type=float, default=2.0, help='shortage cost of player 1')
45 | game_arg.add_argument('--cp2', type=float, default=0.0, help='shortage cost of player 2')
46 | game_arg.add_argument('--cp3', type=float, default=0.0, help='shortage cost of player 3')
47 | game_arg.add_argument('--cp4', type=float, default=0.0, help='shortage cost of player 4')
48 | game_arg.add_argument('--ch1', type=float, default=2.0, help='holding cost of player 1')
49 | game_arg.add_argument('--ch2', type=float, default=2.0, help='holding cost of player 2')
50 | game_arg.add_argument('--ch3', type=float, default=2.0, help='holding cost of player 3')
51 | game_arg.add_argument('--ch4', type=float, default=2.0, help='holding cost of player 4')
52 | game_arg.add_argument('--alpha_b1', type=float, default=-0.5, help='alpha of Sterman formula parameter for player 1')
53 | game_arg.add_argument('--alpha_b2', type=float, default=-0.5, help='alpha of Sterman formula parameter for player 2')
54 | game_arg.add_argument('--alpha_b3', type=float, default=-0.5, help='alpha of Sterman formula parameter for player 3')
55 | game_arg.add_argument('--alpha_b4', type=float, default=-0.5, help='alpha of Sterman formula parameter for player 4')
56 | game_arg.add_argument('--betta_b1', type=float, default=-0.2, help='beta of Sterman formula parameter for player 1')
57 | game_arg.add_argument('--betta_b2', type=float, default=-0.2, help='beta of Sterman formula parameter for player 2')
58 | game_arg.add_argument('--betta_b3', type=float, default=-0.2, help='beta of Sterman formula parameter for player 3')
59 | game_arg.add_argument('--betta_b4', type=float, default=-0.2, help='beta of Sterman formula parameter for player 4')
60 | game_arg.add_argument('--eta', type=list, default=[0,4,4,4], help='the total cost regularizer')
61 | game_arg.add_argument('--distCoeff', type=int, default=20, help='the total cost regularizer')
62 | game_arg.add_argument('--gameConfig', type=int, default=3, help='if it is "0", it uses the current "agentType", otherwise sets agent types according to the function setAgentType() in this file.')
63 | game_arg.add_argument('--ifUseTotalReward', type=str2bool, default='False', help='if you want to have the total rewards in the experience replay, set it to true.')
64 | game_arg.add_argument('--ifUsedistTotReward', type=str2bool, default='True', help='If True, it applies a correction to the rewards in the experience replay for all iterations of the current game')
65 | game_arg.add_argument('--ifUseASAO', type=str2bool, default='True', help='if use AS and AO, i.e., received shipment and received orders in the input of DNN')
66 | game_arg.add_argument('--ifUseActionInD', type=str2bool, default='False', help='if use action in the input of DNN')
67 | game_arg.add_argument('--stateDim', type=int, default=5, help='Number of elements in the state descriptor - Depends on ifUseASAO')
68 | game_arg.add_argument('--iftl', type=str2bool, default=False, help='if apply transfer learning')
69 | game_arg.add_argument('--ifTransferFromSmallerActionSpace', type=str2bool, default=False, help='if you want to transfer knowledge from a network with a different action space size.')
70 | game_arg.add_argument('--baseActionSize', type=int, default=5, help='if ifTransferFromSmallerActionSpace is true, this determines the size of the action space of the saved network')
71 | game_arg.add_argument('--tlBaseBrain', type=int, default=3, help='the gameConfig of the base network for re-training with transfer-learning')
72 | game_arg.add_argument('--baseDemandDistribution', type=int, default=0, help='same as the demandDistribution')
73 | game_arg.add_argument('--MultiAgent', type=str2bool, default=False, help='if run multi-agent RL model, not fully operational')
74 | game_arg.add_argument('--MultiAgentRun', type=list, default=[True, True, True, True], help='In the multi-RL setting, it determines which agents should get training.')
75 | game_arg.add_argument('--if_use_AS_t_plus_1', type=str2bool, default='False', help='if use 
AS[t+1], not AS[t] in the input of DNN') 76 | game_arg.add_argument('--ifSinglePathExist', type=str2bool, default=False, help='If true it uses the predefined path in pre_model_dir and does not merge it with demandDistribution.') 77 | game_arg.add_argument('--ifPlaySavedData', type=str2bool, default=False, help='If true it uses the saved actions which are read from file.') 78 | 79 | #################### parameters of the leadtimes ######################## 80 | leadtimes_arg = add_argument_group('leadtimes') 81 | leadtimes_arg.add_argument('--leadRecItemLow', type=list, default=[2,2,2,4], help='the min lead time for receiving items') 82 | leadtimes_arg.add_argument('--leadRecItemUp', type=list, default=[2,2,2,4], help='the max lead time for receiving items') 83 | leadtimes_arg.add_argument('--leadRecOrderLow', type=int, default=[2,2,2,0], help='the min lead time for receiving orders') 84 | leadtimes_arg.add_argument('--leadRecOrderUp', type=int, default=[2,2,2,0], help='the max lead time for receiving orders') 85 | leadtimes_arg.add_argument('--ILInit', type=list, default=[0,0,0,0], help='') 86 | leadtimes_arg.add_argument('--AOInit', type=list, default=[0,0,0,0], help='') 87 | leadtimes_arg.add_argument('--ASInit', type=list, default=[0,0,0,0], help='the initial shipment of each agent') 88 | leadtimes_arg.add_argument('--leadRecItem1', type=int, default=2, help='the min lead time for receiving items') 89 | leadtimes_arg.add_argument('--leadRecItem2', type=int, default=2, help='the min lead time for receiving items') 90 | leadtimes_arg.add_argument('--leadRecItem3', type=int, default=2, help='the min lead time for receiving items') 91 | leadtimes_arg.add_argument('--leadRecItem4', type=int, default=2, help='the min lead time for receiving items') 92 | leadtimes_arg.add_argument('--leadRecOrder1', type=int, default=2, help='the min lead time for receiving order') 93 | leadtimes_arg.add_argument('--leadRecOrder2', type=int, default=2, help='the min lead time for receiving order') 94 | leadtimes_arg.add_argument('--leadRecOrder3', type=int, default=2, help='the min lead time for receiving order') 95 | leadtimes_arg.add_argument('--leadRecOrder4', type=int, default=2, help='the min lead time for receiving order') 96 | leadtimes_arg.add_argument('--ILInit1', type=int, default=0, help='the initial inventory level of the agent') 97 | leadtimes_arg.add_argument('--ILInit2', type=int, default=0, help='the initial inventory level of the agent') 98 | leadtimes_arg.add_argument('--ILInit3', type=int, default=0, help='the initial inventory level of the agent') 99 | leadtimes_arg.add_argument('--ILInit4', type=int, default=0, help='the initial inventory level of the agent') 100 | leadtimes_arg.add_argument('--AOInit1', type=int, default=0, help='the initial arriving order of the agent') 101 | leadtimes_arg.add_argument('--AOInit2', type=int, default=0, help='the initial arriving order of the agent') 102 | leadtimes_arg.add_argument('--AOInit3', type=int, default=0, help='the initial arriving order of the agent') 103 | leadtimes_arg.add_argument('--AOInit4', type=int, default=0, help='the initial arriving order of the agent') 104 | leadtimes_arg.add_argument('--ASInit1', type=int, default=0, help='the initial arriving shipment of the agent') 105 | leadtimes_arg.add_argument('--ASInit2', type=int, default=0, help='the initial arriving shipment of the agent') 106 | leadtimes_arg.add_argument('--ASInit3', type=int, default=0, help='the initial arriving shipment of the agent') 107 | 
leadtimes_arg.add_argument('--ASInit4', type=int, default=0, help='the initial arriving shipment of the agent')
108 |
109 |
110 | #################### DQN setting ####################
111 | DQN_arg = add_argument_group('DQN')
112 | DQN_arg.add_argument('--maxEpisodesTrain', type=int, default=60100, help='number of GAMES to be trained')
113 | DQN_arg.add_argument('--NoHiLayer', type=int, default=3, help='number of hidden layers')
114 | DQN_arg.add_argument('--NoFixedLayer', type=int, default=1, help='number of fixed hidden layers')
115 | DQN_arg.add_argument('--node1', type=int, default=180, help='the number of nodes in the first hidden layer')
116 | DQN_arg.add_argument('--node2', type=int, default=130, help='the number of nodes in the second hidden layer')
117 | DQN_arg.add_argument('--node3', type=int, default=61, help='the number of nodes in the third hidden layer')
118 | DQN_arg.add_argument('--nodes', type=list, default=[], help='')
119 |
120 | DQN_arg.add_argument('--seed', type=int, default=4, help='the random seed for the DNN')
121 | DQN_arg.add_argument('--batchSize', type=int, default=64, help='the batch size which is used in training the DNN')
122 | DQN_arg.add_argument('--minReplayMem', type=int, default=50000, help='the minimum experience replay size at which dnn training starts')
123 | DQN_arg.add_argument('--maxReplayMem', type=int, default=1000000, help='the maximum size of the replay memory')
124 | DQN_arg.add_argument('--alpha', type=float, default=.97, help='learning rate for total reward distribution')
125 | DQN_arg.add_argument('--gamma', type=float, default=.99, help='discount factor for Q-learning')
126 | DQN_arg.add_argument('--saveInterval', type=int, default=10000, help='every xx training iterations, saves the network of the game')
127 | DQN_arg.add_argument('--epsilonBeg', type=float, default=0.9, help='')
128 | DQN_arg.add_argument('--epsilonEnd', type=float, default=0.1, help='')
129 |
130 | DQN_arg.add_argument('--lr0', type=float, default=0.00025 , help='the learning rate')
131 | DQN_arg.add_argument('--Minlr', type=float, default=1e-8, help='the minimum learning rate; if the learning rate drops below it, it is fixed there')
132 | DQN_arg.add_argument('--ifDecayAdam', type=str2bool, default=True, help='decays the learning rate of the adam optimizer')
133 | DQN_arg.add_argument('--decayStep', type=int, default=10000, help='the decay step of the learning rate')
134 | DQN_arg.add_argument('--decayRate', type=float, default=0.98, help='the rate to reduce the lr at every decayStep')
135 |
136 | DQN_arg.add_argument('--display', type=int, default=1000, help='the number of iterations between two displays of results.')
137 | DQN_arg.add_argument('--momentum', type=float, default=0.9, help='the momentum value')
138 | DQN_arg.add_argument('--dnnUpCnt', type=int, default=10000, help='the number of iterations between updates of the dnn weights')
139 | DQN_arg.add_argument('--multPerdInpt', type=int, default=10, help='Number of history records which we feed into the DNN')
140 |
141 |
142 | #################### Utilities ####################
143 | utility_arg = add_argument_group('Utilities')
144 | utility_arg.add_argument('--address', type=str, default="", help='the address which is used to save the model files')
145 | utility_arg.add_argument('--ifUsePreviousModel', type=str2bool, default='False', help='if there is a saved model, a False value of this parameter will overwrite it.')
146 | utility_arg.add_argument('--number_cpu_active', type=int, default=5, help='number of cpu cores')
147 | utility_arg.add_argument('--gpu_memory_fraction', type=float, default=0.1, help='the fraction of gpu memory to use')
148 | # Dirs
149 | utility_arg.add_argument('--load_path', type=str, default='', help='The directory to load the models')
150 | utility_arg.add_argument('--log_dir', type=str, default=os.path.expanduser('./logs/'), help='')
151 | utility_arg.add_argument('--pre_model_dir', type=str, default=os.path.expanduser('./pre_model'),help='')
152 | utility_arg.add_argument('--action_dir', type=str, default=os.path.expanduser('./'),help='if ifPlaySavedData is true, it uses this path to load actions')
153 | utility_arg.add_argument('--model_dir', type=str, default='./',help='')
154 | utility_arg.add_argument('--TB', type=str2bool, default=False, help='set to True to use TensorBoard and save the required data for it.')
155 | utility_arg.add_argument('--INFO_print', type=str2bool, default=True, help='if true, it does not print anything at all.')
156 | utility_arg.add_argument('--tbLogInterval', type=int, default=80000, help='the interval (in games) between TensorBoard logs')
157 |
158 | #################### testing ####################
159 | test_arg = add_argument_group('testing')
160 | test_arg.add_argument('--testRepeatMid', type=int, default=50, help='the number of episodes which is used for testing in the middle of training')
161 | test_arg.add_argument('--testInterval', type=int, default=100, help='every xx games compute "test error"')
162 | test_arg.add_argument('--ifSaveFigure', type=str2bool, default=True, help='if it is True, save the figures in each testing.')
163 | test_arg.add_argument('--if_titled_figure', type=str2bool, default='True', help='if it is True, save the figures with details in the title.')
164 | test_arg.add_argument('--saveFigInt', type=list, default=[59990,60000], help='')
165 | test_arg.add_argument('--saveFigIntLow', type=int, default=59990, help='')
166 | test_arg.add_argument('--saveFigIntUp', type=int, default=60000, help='')
167 | test_arg.add_argument('--ifsaveHistInterval', type=str2bool, default=False, help='if True, every xx games save details of the episode')
168 | test_arg.add_argument('--saveHistInterval', type=int, default=50000, help='every xx games save details of the play')
169 | test_arg.add_argument('--Ttest', type=int, default=100, help='it defines the number of periods in the test cases')
170 | test_arg.add_argument('--ifOptimalSolExist', type=str2bool, default=True, help='if the instance has an optimal base stock policy, set it to True, otherwise it should be False.')
171 | test_arg.add_argument('--f1', type=float, default=8, help='base stock policy decision of player 1')
172 | test_arg.add_argument('--f2', type=float, default=8, help='base stock policy decision of player 2')
173 | test_arg.add_argument('--f3', type=float, default=0, help='base stock policy decision of player 3')
174 | test_arg.add_argument('--f4', type=float, default=0, help='base stock policy decision of player 4')
175 | test_arg.add_argument('--f_init', type=list, default=[32,32,32,24], help='base stock policy decision for 4 time-steps on the C(4,8) demand distribution')
176 | test_arg.add_argument('--use_initial_BS', type=str2bool, default=False, help='set it to True to use f_init')
177 |
178 | #################### reporting ####################
179 | reporting_arg = add_argument_group('reporting')
180 | reporting_arg.add_argument('--Rsltdnn', type=list, default=[], help='the result of dnn play tests will be saved here')
181 | reporting_arg.add_argument('--RsltRnd', type=list, default=[], help='the result of random 
play tests will be saved here') 182 | reporting_arg.add_argument('--RsltStrm', type=list, default=[], help='the result of heuristic fomula play tests will be saved here') 183 | reporting_arg.add_argument('--Rsltbs', type=list, default=[], help='the result of optimal play tests will be saved here') 184 | reporting_arg.add_argument('--ifSaveHist', type=str2bool, default='False', help='if it is true, saves history, prediction, and the randBatch in each period, WARNING: just make it True in small runs, it saves huge amount of files.') 185 | 186 | 187 | #buildActionList: actions for the beer game problem 188 | def buildActionList(config): 189 | aDiv = 1 # difference in the action list 190 | if config.fixedAction: 191 | actions = list(range(0,config.actionMax+1,aDiv)) # If you put the second argument =11, creates an actionlist from 0..xx 192 | else: 193 | actions = list(range(config.actionLow,config.actionUp+1,aDiv) ) 194 | return actions 195 | 196 | # specify the dimension of the state of the game 197 | def getStateDim(config): 198 | if config.ifUseASAO: 199 | stateDim=5 200 | else: 201 | stateDim=3 202 | 203 | if config.ifUseActionInD: 204 | stateDim += 1 205 | 206 | return stateDim 207 | 208 | # agents 1=[dnn,dnn,dnn,dnn]; 2=[dnn,Strm,Strm,Strm]; 3=[dnn,bs,bs,bs] 209 | def setAgentType(config): 210 | if config.gameConfig == 1: # all agents are run by DNN- Also, load-model loads from brain-3+agentNum- 211 | # Also multi-agent with double target uses this gameConfig. 212 | config.agentTypes = ["srdqn", "srdqn","srdqn","srdqn"] 213 | config.to_prev_ai = [3,-1,-1,-1] 214 | elif config.gameConfig == 2: # one agent is run by DNN- Also, load-model loads from brain-3+agentNum- 215 | # Also multi-agent with double target uses this gameConfig. 216 | config.agentTypes = ["srdqn", "srdqn","srdqn","srdqn"] 217 | config.to_prev_ai = [3,-1,-1,-1] 218 | elif config.gameConfig == 3: 219 | config.agentTypes = ["srdqn", "bs","bs","bs"] 220 | elif config.gameConfig == 4: 221 | config.agentTypes = ["bs", "srdqn","bs","bs"] 222 | elif config.gameConfig == 5: 223 | config.agentTypes = ["bs", "bs","srdqn","bs"] 224 | elif config.gameConfig == 6: 225 | config.agentTypes = ["bs", "bs","bs","srdqn"] 226 | elif config.gameConfig == 7: 227 | config.agentTypes = ["srdqn", "Strm","Strm","Strm"] 228 | elif config.gameConfig == 8: 229 | config.agentTypes = ["Strm", "srdqn","Strm","Strm"] 230 | elif config.gameConfig == 9: 231 | config.agentTypes = ["Strm", "Strm","srdqn","Strm"] 232 | elif config.gameConfig == 10: 233 | config.agentTypes = ["Strm", "Strm","Strm","srdqn"] 234 | elif config.gameConfig == 11: 235 | config.agentTypes = ["srdqn", "rnd","rnd","rnd"] 236 | elif config.gameConfig == 12: 237 | config.agentTypes = ["rnd", "srdqn","rnd","rnd"] 238 | elif config.gameConfig == 13: 239 | config.agentTypes = ["rnd", "rnd","srdqn","rnd"] 240 | elif config.gameConfig == 14: 241 | config.agentTypes = ["rnd", "rnd","rnd","srdqn"] 242 | elif config.gameConfig == 15: 243 | config.agentTypes = ["Strm", "bs","bs","bs"] 244 | elif config.gameConfig == 16: 245 | config.agentTypes = ["bs", "Strm","bs","bs"] 246 | elif config.gameConfig == 17: 247 | config.agentTypes = ["bs", "bs","Strm","bs"] 248 | elif config.gameConfig == 18: 249 | config.agentTypes = ["bs", "bs","bs","Strm"] 250 | elif config.gameConfig == 19: 251 | config.agentTypes = ["rnd", "bs","bs","bs"] 252 | elif config.gameConfig == 20: 253 | config.agentTypes = ["bs", "rnd","bs","bs"] 254 | elif config.gameConfig == 21: 255 | config.agentTypes = ["bs", "bs","rnd","bs"] 
256 | elif config.gameConfig == 22: 257 | config.agentTypes = ["bs", "bs","bs","rnd"] 258 | elif config.gameConfig == 23: 259 | config.agentTypes = ["Strm", "Strm","Strm","Strm"] 260 | elif config.gameConfig == 24: 261 | config.agentTypes = ["rnd", "rnd","rnd","rnd"] 262 | elif config.gameConfig == 25: 263 | config.agentTypes = ["bs", "bs","bs","bs"] 264 | elif config.gameConfig == 26: 265 | config.agentTypes = ["bs", "Strm","Strm","Strm"] 266 | elif config.gameConfig == 27: 267 | config.agentTypes = ["Strm", "bs","Strm","Strm"] 268 | elif config.gameConfig == 28: 269 | config.agentTypes = ["Strm", "Strm","bs","Strm"] 270 | elif config.gameConfig == 29: 271 | config.agentTypes = ["Strm", "Strm","Strm","bs"] 272 | elif config.gameConfig == 30: 273 | config.agentTypes = ["bs", "rnd","rnd","rnd"] 274 | elif config.gameConfig == 31: 275 | config.agentTypes = ["rnd", "bs","rnd","rnd"] 276 | elif config.gameConfig == 32: 277 | config.agentTypes = ["rnd", "rnd","bs","rnd"] 278 | elif config.gameConfig == 33: 279 | config.agentTypes = ["rnd", "rnd","rnd","bs"] 280 | else: 281 | config.agentTypes = ["bs", "bs","bs","bs"] 282 | 283 | def fillnodes(config): 284 | if config.NoHiLayer == 2: 285 | config.nodes = [config.stateDim * config.multPerdInpt, config.node1,config.node2,config.actionListLen] 286 | elif config.NoHiLayer == 3: 287 | config.nodes = [config.stateDim * config.multPerdInpt, config.node1,config.node2,config.node3,config.actionListLen] 288 | 289 | 290 | def setSavedDimentionPerBrain(config): 291 | if config.ifUsePreviousModel and not config.iftl: 292 | if config.demandDistribution == 0 and config.demandUp == 9 and config.demandLow == 0 and config.actionUp == 8: 293 | if config.gameConfig == 3: 294 | config.multPerdInpt = 5 295 | config.NoHiLayer = 3 296 | config.node1=180 297 | config.node2=130 298 | config.node3=61 299 | elif config.gameConfig == 4: 300 | config.multPerdInpt = 10 301 | config.NoHiLayer = 3 302 | config.node1=180 303 | config.node2=130 304 | config.node3=61 305 | elif config.gameConfig == 5: 306 | config.multPerdInpt = 5 307 | config.NoHiLayer = 3 308 | config.node1=180 309 | config.node2=130 310 | config.node3=61 311 | elif config.gameConfig == 6: 312 | config.multPerdInpt = 5 313 | config.NoHiLayer = 3 314 | config.node1=180 315 | config.node2=130 316 | config.node3=61 317 | elif config.gameConfig == 7: 318 | config.multPerdInpt = 10 319 | config.NoHiLayer = 3 320 | config.node1=180 321 | config.node2=130 322 | config.node3=61 323 | elif config.gameConfig == 8: 324 | config.multPerdInpt = 10 325 | config.NoHiLayer = 3 326 | config.node1=180 327 | config.node2=130 328 | config.node3=61 329 | elif config.gameConfig == 9: 330 | config.multPerdInpt = 10 331 | config.NoHiLayer = 3 332 | config.node1=180 333 | config.node2=130 334 | config.node3=61 335 | elif config.gameConfig == 10: 336 | config.multPerdInpt = 10 337 | config.NoHiLayer = 3 338 | config.node1=180 339 | config.node2=130 340 | config.node3=61 341 | elif config.gameConfig == 11: 342 | config.multPerdInpt = 5 343 | config.NoHiLayer = 3 344 | config.node1=180 345 | config.node2=130 346 | config.node3=61 347 | elif config.gameConfig == 12: 348 | config.multPerdInpt = 5 349 | config.NoHiLayer = 3 350 | config.node1=180 351 | config.node2=130 352 | config.node3=61 353 | elif config.gameConfig == 13: 354 | config.multPerdInpt = 10 355 | config.NoHiLayer = 3 356 | config.node1=180 357 | config.node2=130 358 | config.node3=61 359 | elif config.gameConfig == 14: 360 | config.multPerdInpt = 5 361 | config.NoHiLayer = 3 
362 | config.node1=180 363 | config.node2=130 364 | config.node3=61 365 | 366 | elif config.demandDistribution == 1 and config.demandMu == 10 and config.demandSigma == 2 and config.actionUp == 5: 367 | if config.gameConfig == 3: 368 | config.multPerdInpt = 5 369 | config.NoHiLayer = 3 370 | config.node1=180 371 | config.node2=130 372 | config.node3=61 373 | elif config.gameConfig == 4: 374 | config.multPerdInpt = 5 375 | config.NoHiLayer = 3 376 | config.node1=180 377 | config.node2=130 378 | config.node3=61 379 | elif config.gameConfig == 5: 380 | config.multPerdInpt = 5 381 | config.NoHiLayer = 3 382 | config.node1=180 383 | config.node2=130 384 | config.node3=61 385 | elif config.gameConfig == 6: 386 | config.multPerdInpt = 10 387 | config.NoHiLayer = 3 388 | config.node1=180 389 | config.node2=130 390 | config.node3=61 391 | elif config.gameConfig == 7: 392 | config.multPerdInpt = 10 393 | config.NoHiLayer = 3 394 | config.node1=180 395 | config.node2=130 396 | config.node3=61 397 | elif config.gameConfig == 8: 398 | config.multPerdInpt = 10 399 | config.NoHiLayer = 3 400 | config.node1=180 401 | config.node2=130 402 | config.node3=61 403 | elif config.gameConfig == 9: 404 | config.multPerdInpt = 10 405 | config.NoHiLayer = 3 406 | config.node1=180 407 | config.node2=130 408 | config.node3=61 409 | elif config.gameConfig == 10: 410 | config.multPerdInpt = 10 411 | config.NoHiLayer = 3 412 | config.node1=180 413 | config.node2=130 414 | config.node3=61 415 | elif config.gameConfig == 11: 416 | config.multPerdInpt = 10 417 | config.NoHiLayer = 3 418 | config.node1=180 419 | config.node2=130 420 | config.node3=61 421 | elif config.gameConfig == 12: 422 | config.multPerdInpt = 10 423 | config.NoHiLayer = 3 424 | config.node1=180 425 | config.node2=130 426 | config.node3=61 427 | elif config.gameConfig == 13: 428 | config.multPerdInpt = 5 429 | config.NoHiLayer = 3 430 | config.node1=180 431 | config.node2=130 432 | config.node3=61 433 | elif config.gameConfig == 14: 434 | config.multPerdInpt = 10 435 | config.NoHiLayer = 3 436 | config.node1=180 437 | config.node2=130 438 | config.node3=61 439 | 440 | elif config.demandDistribution == 2 and config.demandUp == 9 and config.demandLow == 0 and config.actionUp == 8: 441 | if config.gameConfig == 3: 442 | config.multPerdInpt = 10 443 | config.NoHiLayer = 3 444 | config.node1=180 445 | config.node2=130 446 | config.node3=61 447 | elif config.gameConfig == 4: 448 | config.multPerdInpt = 10 449 | config.NoHiLayer = 3 450 | config.node1=180 451 | config.node2=130 452 | config.node3=61 453 | elif config.gameConfig == 5: 454 | config.multPerdInpt = 10 455 | config.NoHiLayer = 3 456 | config.node1=180 457 | config.node2=130 458 | config.node3=61 459 | elif config.gameConfig == 6: 460 | config.multPerdInpt = 5 461 | config.NoHiLayer = 3 462 | config.node1=180 463 | config.node2=130 464 | config.node3=61 465 | elif config.gameConfig == 7: 466 | config.multPerdInpt = 5 467 | config.NoHiLayer = 3 468 | config.node1=180 469 | config.node2=130 470 | config.node3=61 471 | elif config.gameConfig == 8: 472 | config.multPerdInpt = 10 473 | config.NoHiLayer = 3 474 | config.node1=180 475 | config.node2=130 476 | config.node3=61 477 | elif config.gameConfig == 9: 478 | config.multPerdInpt = 5 479 | config.NoHiLayer = 3 480 | config.node1=180 481 | config.node2=130 482 | config.node3=61 483 | elif config.gameConfig == 10: 484 | config.multPerdInpt = 10 485 | config.NoHiLayer = 3 486 | config.node1=180 487 | config.node2=130 488 | config.node3=61 489 | elif 
config.gameConfig == 11: 490 | config.multPerdInpt = 10 491 | config.NoHiLayer = 3 492 | config.node1=180 493 | config.node2=130 494 | config.node3=61 495 | elif config.gameConfig == 12: 496 | config.multPerdInpt = 5 497 | config.NoHiLayer = 3 498 | config.node1=180 499 | config.node2=130 500 | config.node3=61 501 | elif config.gameConfig == 13: 502 | config.multPerdInpt = 5 503 | config.NoHiLayer = 3 504 | config.node1=180 505 | config.node2=130 506 | config.node3=61 507 | elif config.gameConfig == 14: 508 | config.multPerdInpt = 10 509 | config.NoHiLayer = 3 510 | config.node1=180 511 | config.node2=130 512 | config.node3=61 513 | 514 | elif config.demandDistribution != 3 and config.demandDistribution != 4: 515 | if config.gameConfig == 7: 516 | config.dnnUpCnt = 10000 517 | config.multPerdInpt = 5 518 | config.NoHiLayer = 2 519 | config.lr0 = 0.001 520 | elif config.gameConfig == 8: 521 | config.dnnUpCnt = 5000 522 | config.multPerdInpt = 5 523 | config.NoHiLayer = 2 # this should be 3 524 | config.lr0 = 0.00025 525 | elif config.gameConfig == 9: 526 | config.dnnUpCnt = 5000 527 | config.multPerdInpt = 3 528 | config.NoHiLayer = 2 529 | config.lr0 = 0.001 530 | elif config.gameConfig == 10: 531 | config.dnnUpCnt = 5000 532 | config.multPerdInpt = 3 # it should be 5 533 | config.NoHiLayer = 2 534 | config.lr0 = 0.001 535 | 536 | def set_optimal(config): 537 | if config.demandDistribution == 0: 538 | if config.cp1==2 and config.ch1==2 and config.ch2==2 and config.ch3==2 and config.ch4==2 : 539 | config.f1 = 8. 540 | config.f2 = 8. 541 | config.f3 = 0. 542 | config.f4 = 0. 543 | 544 | def get_config(): 545 | config, unparsed = parser.parse_known_args() 546 | config = update_config(config) 547 | 548 | return config, unparsed 549 | 550 | def fill_leadtime_initial_values(config): 551 | config.leadRecItemLow = [config.leadRecItem1, config.leadRecItem2, config.leadRecItem3, config.leadRecItem4] 552 | config.leadRecItemUp = [config.leadRecItem1, config.leadRecItem2, config.leadRecItem3, config.leadRecItem4] 553 | config.leadRecOrderLow = [config.leadRecOrder1, config.leadRecOrder2, config.leadRecOrder3, config.leadRecOrder4] 554 | config.leadRecOrderUp = [config.leadRecOrder1, config.leadRecOrder2, config.leadRecOrder3, config.leadRecOrder4] 555 | config.ILInit = [config.ILInit1, config.ILInit2, config.ILInit3, config.ILInit4] 556 | config.AOInit = [config.AOInit1, config.AOInit2, config.AOInit3, config.AOInit4] 557 | config.ASInit = [config.ASInit1, config.ASInit2, config.ASInit3, config.ASInit4] 558 | 559 | def get_auxuliary_leadtime_initial_values(config): 560 | config.leadRecOrderUp_aux = [config.leadRecOrder1, config.leadRecOrder2, config.leadRecOrder3, config.leadRecOrder4] 561 | config.leadRecItemUp_aux = [config.leadRecItem1, config.leadRecItem2, config.leadRecItem3, config.leadRecItem4] 562 | 563 | def fix_lead_time_manufacturer(config): 564 | if config.leadRecOrder4 > 0: 565 | config.leadRecItem4 += config.leadRecOrder4 566 | config.leadRecOrder4 = 0 567 | 568 | def set_sterman_parameters(config): 569 | config.alpha_b =[config.alpha_b1,config.alpha_b2,config.alpha_b3,config.alpha_b4] 570 | config.betta_b =[config.betta_b1,config.betta_b2,config.betta_b3,config.betta_b4] 571 | 572 | 573 | def update_config(config): 574 | config.actionList = buildActionList(config) # The list of the available actions 575 | config.actionListLen = len(config.actionList) # the length of the action list 576 | 577 | # set_optimal(config) 578 | config.f = [config.f1, config.f2, config.f3, config.f4] # [6.4, 
2.88, 2.08, 0.8] 579 | 580 | config.actionListLen=len(config.actionList) 581 | if config.demandDistribution == 0: 582 | config.actionListOpt=list(range(0,int(max(config.actionUp*30+1, 3*sum(config.f))),1)) 583 | else: 584 | config.actionListOpt=list(range(0,int(max(config.actionUp*30+1, 7*sum(config.f))),1)) 585 | config.actionListLenOpt=len(config.actionListOpt) 586 | config.agentTypes=['dnn','dnn','dnn','dnn'] 587 | config.saveFigInt = [config.saveFigIntLow, config.saveFigIntUp] 588 | 589 | if config.gameConfig == 0: 590 | config.NoAgent=min(config.NoAgent,len(config.agentTypes)) 591 | config.agentTypes=[config.agent_type1,config.agent_type2,config.agent_type3,config.agent_type4] 592 | else: 593 | config.NoAgent=4 594 | setAgentType(config) # set the agent brain types according to ifFourDNNtrain, ... 595 | 596 | config.c_h =[config.ch1, config.ch2, config.ch3, config.ch4] 597 | config.c_p =[config.cp1, config.cp2, config.cp3, config.cp4] 598 | 599 | config.stateDim= getStateDim(config) # Number of elements in the state description - Depends on ifUseASAO 600 | np.random.seed(seed = config.seed) 601 | setSavedDimentionPerBrain(config) # set the parameters of pre_trained model. 602 | fillnodes(config) # create the structure of network nodes 603 | get_auxuliary_leadtime_initial_values(config) 604 | fix_lead_time_manufacturer(config) 605 | fill_leadtime_initial_values(config) 606 | set_sterman_parameters(config) 607 | 608 | return config 609 | 610 | -------------------------------------------------------------------------------- /data.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OptMLGroup/DeepBeerInventory-RL/ca2bb90a5ee3a45fa89cfadf56354369a62bf5a8/data.zip -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from clBeergame import * 3 | from utilities import * 4 | import numpy as np 5 | #from clGeneralParameters import generalParameters 6 | import random 7 | from config import get_config, update_config 8 | import tensorflow as tf 9 | 10 | config = None 11 | 12 | #def main(config, beerGame): 13 | def main(config): 14 | random.seed(10) 15 | 16 | # prepare loggers and directories 17 | prepare_dirs_and_logger(config) 18 | config = update_config(config) 19 | # save the current configuration of the problem in a json file 20 | save_config(config) 21 | 22 | # get the address of data 23 | if config.observation_data: 24 | adsr = 'data/demandTr-obs-' 25 | elif config.demandDistribution == 3: 26 | if config.scaled: 27 | adsr = 'data/basket_data/scaled' 28 | else: 29 | adsr = 'data/basket_data' 30 | elif config.demandDistribution == 4: 31 | if config.scaled: 32 | adsr = 'data/forecast_data/scaled' 33 | else: 34 | adsr = 'data/forecast_data' 35 | else: 36 | adsr = 'data/demandTr' 37 | 38 | 39 | # load demands 40 | # demandTr = np.load('demandTr'+str(config.demandDistribution)+'-'+str(config.demandUp)+'.npy') 41 | if config.demandDistribution == 0: 42 | direc = os.path.realpath(adsr+str(config.demandDistribution)+'-'+str(config.demandUp)+'-'+str(config.maxEpisodesTrain)+'.npy') 43 | if not os.path.exists(direc): 44 | direc = os.path.realpath(adsr+str(config.demandDistribution)+'-'+str(config.demandUp)+'.npy') 45 | elif config.demandDistribution == 1: 46 | direc = 
os.path.realpath(adsr+str(config.demandDistribution)+'-'+str(int(config.demandMu))+'-'+str(int(config.demandSigma))+'.npy') 47 | elif config.demandDistribution == 2: 48 | direc = os.path.realpath(adsr+str(config.demandDistribution)+'.npy') 49 | elif config.demandDistribution == 3: 50 | direc = os.path.realpath(adsr+'/demandTr-'+str(config.data_id)+'.npy') 51 | elif config.demandDistribution == 4: 52 | direc = os.path.realpath(adsr+'/demandTr-'+str(config.data_id)+'.npy') 53 | demandTr = np.load(direc) 54 | print("loaded training set=", direc) 55 | if config.demandDistribution == 0: 56 | direc = os.path.realpath('data/demandTs'+str(config.demandDistribution)+'-'+str(config.demandUp)+'-'+str(config.maxEpisodesTrain)+'.npy') 57 | if not os.path.exists(direc): 58 | direc = os.path.realpath('data/demandTs'+str(config.demandDistribution)+'-'+str(config.demandUp)+'.npy') 59 | elif config.demandDistribution == 1: 60 | direc = os.path.realpath('data/demandTs'+str(config.demandDistribution)+'-'+str(int(config.demandMu))+'-'+str(int(config.demandSigma))+'.npy') 61 | elif config.demandDistribution == 2: 62 | direc = os.path.realpath('data/demandTs'+str(config.demandDistribution)+'.npy') 63 | elif config.demandDistribution == 3: 64 | direc = os.path.realpath(adsr+'/demandTs-'+str(config.data_id)+'.npy') 65 | direcVl = os.path.realpath(adsr+'/demandVl-'+str(config.data_id)+'.npy') 66 | demandVl = np.load(direcVl) 67 | elif config.demandDistribution == 4: 68 | direc = os.path.realpath(adsr+'/demandTs-'+str(config.data_id)+'.npy') 69 | direcVl = os.path.realpath(adsr+'/demandVl-'+str(config.data_id)+'.npy') 70 | demandVl = np.load(direcVl) 71 | demandTs = np.load(direc) 72 | print("loaded test set=", direc) 73 | 74 | 75 | # initilize an instance of Beergame 76 | beerGame = clBeerGame(config) 77 | 78 | # get the length of the demand. 79 | demand_len = np.shape(demandTr)[0] 80 | # Do Initial tests 81 | beerGame.doTestMid(demandTs[0:config.testRepeatMid]) 82 | 83 | # train the specified number of games 84 | for i in range(0, config.maxEpisodesTrain): 85 | beerGame.playGame(demandTr[i%demand_len],"train") 86 | # get the test results 87 | if (np.mod(beerGame.curGame,config.testInterval) == 0) and (beerGame.curGame>500): 88 | beerGame.doTestMid(demandTs[0:config.testRepeatMid]) 89 | 90 | # do the last test on the middle test data set. 
91 | beerGame.doTestMid(demandTs[0:config.testRepeatMid]) 92 | if config.demandDistribution == 3: 93 | beerGame.doTestMid(demandVl[0:config.testRepeatMid]) 94 | 95 | if __name__ == '__main__': 96 | # load parameters 97 | config, unparsed = get_config() 98 | 99 | # run main 100 | main(config) 101 | -------------------------------------------------------------------------------- /plotting.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import matplotlib 4 | matplotlib.use('Agg') 5 | import matplotlib.pyplot as plt 6 | from pylab import * 7 | 8 | # plotting 9 | def plotting(plt, data, colori, pltLabel): 10 | plt.hold(True) 11 | 12 | for i in range (np.shape(data)[0]): 13 | plt.subplot(4,5,5*i+1) 14 | plt.plot(np.transpose(data[i])[0,:], np.transpose(data[i])[1,:],colori,label=pltLabel) 15 | plt.xlabel('Time') 16 | plt.ylabel('IL') 17 | plt.grid(True) 18 | 19 | 20 | plt.subplot(4,5,5*i+2) 21 | plt.plot(np.transpose(data[i])[0,:], np.transpose(data[i])[2,:],colori, label=pltLabel) 22 | plt.xlabel('Time') 23 | plt.ylabel('OO') 24 | plt.grid(True) 25 | 26 | plt.subplot(4,5,5*i+3) 27 | plt.plot(np.transpose(data[i])[0,:], np.transpose(data[i])[3,:],colori, label=pltLabel) 28 | plt.xlabel('Time') 29 | plt.ylabel('a') 30 | plt.grid(True) 31 | 32 | plt.subplot(4,5,5*i+4) 33 | plt.plot(np.transpose(data[i])[0,:], np.transpose(data[i])[5,:],colori,label=pltLabel) 34 | plt.xlabel('Time') 35 | plt.ylabel('OUTL') 36 | plt.grid(True) 37 | 38 | plt.subplot(4,5,5*i+5) 39 | plt.plot(np.transpose(data[i])[0,:], -1*np.transpose(data[i])[4,:],colori,label=pltLabel) 40 | plt.xlabel('Time') 41 | plt.ylabel('r') 42 | plt.grid(True) 43 | 44 | return plt 45 | 46 | def savePlot(players, curGame, Rsltdnn, RsltFrmu, RsltOptm, config, m): 47 | node1 = config.node1 48 | node2 = config.node2 49 | node3 = config.node3 50 | #add title to plot 51 | if config.if_titled_figure: 52 | if config.NoHiLayer==2: 53 | plt.suptitle("Game No="+str(curGame)+";" + str(config.agentTypes.count("srdqn"))+ " SRDQN Agents; SRDQN nodes="+str(node1)+ 54 | "-"+str(node2)+ "; sum SRDQN=" + str(round(sum(Rsltdnn),2)) + "; sum Strm=" 55 | + str(round(sum(RsltFrmu),2)) +"; sum BS=" + str(round(sum(RsltOptm),2))+ "\n"+ 56 | "Ag SRDQN="+str([round(Rsltdnn[i],2) for i in range(config.NoAgent)])+ 57 | "; Ag Strm="+str([round(RsltFrmu[i],2) for i in range(config.NoAgent)])+ 58 | "; Ag BS="+str([round(RsltOptm[i],2) for i in range(config.NoAgent)]), fontsize=12) 59 | elif config.NoHiLayer==3: 60 | plt.suptitle("Game No="+str(curGame)+";" + str(config.agentTypes.count("srdqn"))+ " SRDQN Agents; SRDQN nodes="+str(node1)+ 61 | "-"+str(node2)+"-"+str(node3)+ "; sum SRDQN=" + str(round(sum(Rsltdnn),2)) + 62 | "; sum Strm=" + str(round(sum(RsltFrmu),2)) +"; sum BS=" + str(round(sum(RsltOptm),2))+"\n"+ 63 | "Ag SRDQN="+str([round(Rsltdnn[i],2) for i in range(config.NoAgent)])+ 64 | "; Ag Strm="+str([round(RsltFrmu[i],2) for i in range(config.NoAgent)])+ 65 | "; Ag BS="+str([round(RsltOptm[i],2) for i in range(config.NoAgent)]), fontsize=12) 66 | 67 | 68 | #insert legend to the figure 69 | legend = plt.legend(bbox_to_anchor=(-1.4, -.165, 1., -.102), shadow=True, ncol=4) 70 | 71 | # configures spaces between subplots 72 | plt.subplots_adjust(left=None, bottom=None, right=None, top=None,wspace=.5, hspace=.5) 73 | # save the figure 74 | plt.savefig(os.path.join(config.model_dir,'saved_figures/') + str(curGame)+ '-' + str(m)+'.pdf', format='pdf') 75 | print("figure"+str(curGame)+".pdf saved 
in folder \"saved_figures\"") 76 | plt.close(curGame) 77 | 78 | 79 | def plotBaseStock(data, colori, pltLabel, curGame, config, m): 80 | plt.figure(104, figsize=(12, 8), dpi=80, facecolor='w', edgecolor='k') 81 | plt.plot(range(len(data)), data, colori, label=pltLabel) 82 | plt.xlabel('Time') 83 | plt.ylabel('Order-up-to level') 84 | plt.grid(True) 85 | plt.savefig(os.path.join(config.model_dir,'saved_figures/') + "dnnBaseStock" + str(curGame)+ '-' + str(m)+'.pdf', format='pdf') 86 | print("base stock figure"+str(curGame)+ '-' + str(m)+".pdf saved in folder \"saved_figures\"") 87 | plt.close(104) 88 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow==1.15 2 | matplotlib 3 | numpyencoder 4 | -------------------------------------------------------------------------------- /utilities.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import json 4 | import random 5 | import matplotlib 6 | if not True: 7 | matplotlib.use('Agg') 8 | import matplotlib.pyplot as plt 9 | import os 10 | import logging 11 | from datetime import datetime 12 | import tensorflow as tf 13 | import tensorflow.contrib.slim as slim 14 | from numpyencoder import NumpyEncoder 15 | 16 | import sys 17 | 18 | 19 | class Logger(object): 20 | # writes the outputs to a file 21 | def __init__(self,config): 22 | self.terminal = sys.stdout 23 | self.log = open(os.path.join(config.model_dir,"logfile.log"), "a",0) 24 | 25 | def write(self, message): 26 | self.terminal.write(message) 27 | self.log.write(message) 28 | 29 | def flush(self): 30 | #this flush method is needed for python 3 compatibility. 31 | #this handles the flush command by doing nothing. 32 | #you might want to specify some extra behavior here. 
33 | pass 34 | 35 | 36 | def prepare_dirs_and_logger(config): 37 | # set format of the logger 38 | formatter = logging.Formatter( 39 | "%(asctime)s:%(levelname)s::%(message)s") 40 | logger = logging.getLogger('tensorflow') 41 | 42 | for hdlr in logger.handlers: 43 | logger.removeHandler(hdlr) 44 | 45 | handler = logging.StreamHandler() 46 | handler.setFormatter(formatter) 47 | 48 | logger.addHandler(handler) 49 | logger.setLevel(tf.logging.INFO) 50 | 51 | # create load paths, if they don't exist 52 | if config.load_path: 53 | if config.load_path.startswith(config.task): 54 | config.model_name = config.load_path 55 | else: 56 | config.model_name = "{}_{}".format(config.task, config.load_path) 57 | else: 58 | if config.iftl: 59 | tl = 1 60 | else: 61 | tl = 0 62 | config.model_name = "{}_{}_{}_{}_{}_{}_{}_{}_{}_{}_{}_{}_{}_{}_{}_{}_{}_{}_{}_{}_{}_{}_{}_{}_{}".format(config.task, get_time(), 63 | config.gameConfig, config.tlBaseBrain, config.NoHiLayer, 64 | config.demandUp, config.cp1, config.cp2, config.cp3, config.cp4, config.ch1, config.ch2, config.ch3, config.ch4, 65 | config.distCoeff, config.NoAgent, config.maxEpisodesTrain, config.lr0, config.multPerdInpt, config.dnnUpCnt, tl, 66 | config.actionUp, config.demandDistribution, config.action_step, config.data_id) 67 | 68 | config.model_dir = os.path.join(config.log_dir, config.model_name) 69 | 70 | for path in [config.pre_model_dir, config.log_dir, config.model_dir, 71 | os.path.join(config.model_dir,'saved_figures'), 72 | os.path.join(config.model_dir,'model1'), 73 | os.path.join(config.model_dir,'model2'), 74 | os.path.join(config.model_dir,'model3'), 75 | os.path.join(config.model_dir,'model4'), 76 | os.path.join(config.pre_model_dir,'brain3'), 77 | os.path.join(config.pre_model_dir,'brain4'), 78 | os.path.join(config.pre_model_dir,'brain5'), 79 | os.path.join(config.pre_model_dir,'brain6'), 80 | os.path.join(config.pre_model_dir,'brain7'), 81 | os.path.join(config.pre_model_dir,'brain8'), 82 | os.path.join(config.pre_model_dir,'brain9'), 83 | os.path.join(config.pre_model_dir,'brain10'), 84 | os.path.join(config.log_dir,'reports')]: 85 | 86 | if not os.path.exists(path): 87 | os.makedirs(path) 88 | 89 | 90 | def get_time(): 91 | return datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 92 | 93 | 94 | def save_config(config): 95 | param_path = os.path.join(config.model_dir, "params.json") 96 | 97 | tf.logging.info("MODEL dir: %s" % config.model_dir) 98 | tf.logging.info("PARAM path: %s" % param_path) 99 | save_json(config.__dict__,param_path) 100 | 101 | 102 | # input-output functions 103 | def save(obj,name): 104 | pickle.dump(obj, open(name, "wb")) 105 | 106 | def load(name): 107 | return pickle.load(open(name, "rb")) 108 | 109 | def save_json(obj,name): 110 | with open(name, 'w') as outfile: 111 | json.dump(obj, outfile, indent=4, sort_keys=True, cls=NumpyEncoder) 112 | --------------------------------------------------------------------------------
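
Note on the ordering rules: the Strm and bs branches of clBeerGame.getAction in clBeergame.py encode two hand-written policies inline. The sketch below restates them as standalone functions purely for illustration; it is a minimal, hypothetical rewrite (the names sterman_order, base_stock_order, and snap_to_action_list do not exist in the repository), assuming scalar inputs that correspond to the agent attributes AO[t], IL, OO, a_b, b_b, alpha_b, and betta_b used in the real code.

import numpy as np

def sterman_order(AO_t, IL, OO, a_b, b_b, alpha_b, betta_b):
    # Sterman (Strm) heuristic: reorder the incoming order plus corrections for the
    # inventory gap (IL - a_b) and the supply-line gap (OO - b_b), floored at zero.
    return max(0, round(AO_t + alpha_b * (IL - a_b) + betta_b * (OO - b_b)))

def base_stock_order(base_stock, IL, OO, AO_t):
    # Base-stock (bs) policy: order up to the base-stock level, given the current
    # inventory position IL + OO - AO_t, floored at zero.
    return max(0, base_stock - (IL + OO - AO_t))

def snap_to_action_list(order_quantity, action_list_opt):
    # getAction then picks the closest admissible quantity from actionListOpt
    # and stores the choice as a one-hot action vector.
    action = np.zeros(len(action_list_opt))
    action[np.argmin(np.abs(np.array(action_list_opt) - order_quantity))] = 1
    return action

For example, with the default Sterman parameters alpha_b = -0.5 and betta_b = -0.2 from config.py, an agent with IL = 2, OO = 6, AO_t = 4, and illustrative anchors a_b = b_b = 8 would order max(0, round(4 - 0.5*(2 - 8) - 0.2*(6 - 8))) = 7 before snapping to the action list.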