├── pybrainSG ├── __init__.py ├── rl │ ├── __init__.py │ ├── agents │ │ ├── __init__.py │ │ ├── ceqa.pyc │ │ ├── faphc.pyc │ │ ├── nfceqa.pyc │ │ ├── sgspa.pyc │ │ ├── __init__.pyc │ │ ├── indexable.pyc │ │ ├── loggingSG.pyc │ │ ├── learningSG.pyc │ │ ├── linearfaSG.pyc │ │ ├── multiAgent.pyc │ │ ├── indexable.py │ │ ├── nfceqa.py │ │ ├── learningSG.py │ │ ├── sgspa.py │ │ ├── faphc.py │ │ ├── multiAgent.py │ │ ├── linearfaSG.py │ │ ├── ceqa.py │ │ └── loggingSG.py │ ├── examples │ │ ├── __init__.py │ │ ├── ceq │ │ │ ├── __init__.py │ │ │ ├── example_huntingGame.pyc │ │ │ ├── example_gridgames_CEQ_NFQ.pyc │ │ │ ├── example_huntinggame_CEQ-NFQ.pyc │ │ │ ├── example_staticGame.py │ │ │ ├── example_gridgames.py │ │ │ ├── example_huntingGame.py │ │ │ ├── example_gridgames_CEQ_NFQ.py │ │ │ └── example_huntinggame_CEQ-NFQ.py │ │ ├── phc │ │ │ ├── __init__.py │ │ │ ├── example_gridgames.py │ │ │ ├── example_staticGame.py │ │ │ └── example_huntingGame.py │ │ ├── sgsp │ │ │ ├── __init__.py │ │ │ ├── example_staticgame.py │ │ │ └── example_gridgames.py │ │ ├── nfqSG │ │ │ ├── __init__.py │ │ │ ├── __init__.pyc │ │ │ ├── example_gridgames.pyc │ │ │ ├── example_huntingGame.pyc │ │ │ ├── example_gridgames.py │ │ │ └── example_huntingGame.py │ │ ├── tasks │ │ │ ├── __init__.py │ │ │ ├── __init__.pyc │ │ │ ├── gridgames.pyc │ │ │ ├── huntinggame.pyc │ │ │ ├── staticgame.pyc │ │ │ ├── staticgame.py │ │ │ ├── huntinggame.py │ │ │ └── gridgames.py │ │ ├── linearfaSG │ │ │ ├── __init__.py │ │ │ ├── __init__.pyc │ │ │ ├── example_huntingGame.pyc │ │ │ ├── example_staticGame.py │ │ │ ├── example_gridgames.py │ │ │ └── example_huntingGame.py │ │ └── __init__.pyc │ ├── leaners │ │ ├── __init__.py │ │ ├── valuebased │ │ │ ├── __init__.py │ │ │ ├── ceq.pyc │ │ │ ├── phc.pyc │ │ │ ├── sgsp.pyc │ │ │ ├── nfqSG.pyc │ │ │ ├── __init__.pyc │ │ │ ├── learnerfaSG.pyc │ │ │ ├── indexablevaluebased.pyc │ │ │ ├── indexablevaluebased.py │ │ │ ├── nfqSG.py │ │ │ ├── learnerfaSG.py │ │ │ ├── sgsp.py │ │ │ ├── phc.py │ │ │ └── ceq.py │ │ └── __init__.pyc │ ├── environments │ │ ├── __init__.py │ │ ├── __init__.pyc │ │ ├── episodicSG.pyc │ │ └── episodicSG.py │ ├── experiments │ │ ├── __init__.py │ │ ├── __init__.pyc │ │ ├── episodicSG.pyc │ │ └── episodicSG.py │ └── __init__.pyc └── __init__.pyc ├── Images └── MDPsandGSSGs.jpg ├── .mamemose.rb ├── .pydevproject ├── .project └── README.md /pybrainSG/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pybrainSG/rl/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pybrainSG/rl/agents/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pybrainSG/rl/examples/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pybrainSG/rl/leaners/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pybrainSG/rl/environments/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /pybrainSG/rl/examples/ceq/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pybrainSG/rl/examples/phc/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pybrainSG/rl/examples/sgsp/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pybrainSG/rl/experiments/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pybrainSG/rl/examples/nfqSG/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pybrainSG/rl/examples/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pybrainSG/rl/leaners/valuebased/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pybrainSG/rl/examples/linearfaSG/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Images/MDPsandGSSGs.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/Images/MDPsandGSSGs.jpg -------------------------------------------------------------------------------- /pybrainSG/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/__init__.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/__init__.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/agents/ceqa.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/agents/ceqa.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/agents/faphc.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/agents/faphc.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/agents/nfceqa.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/agents/nfceqa.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/agents/sgspa.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/agents/sgspa.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/agents/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/agents/__init__.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/agents/indexable.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/agents/indexable.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/agents/loggingSG.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/agents/loggingSG.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/leaners/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/leaners/__init__.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/agents/learningSG.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/agents/learningSG.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/agents/linearfaSG.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/agents/linearfaSG.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/agents/multiAgent.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/agents/multiAgent.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/examples/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/examples/__init__.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/environments/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/environments/__init__.pyc 
-------------------------------------------------------------------------------- /pybrainSG/rl/experiments/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/experiments/__init__.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/environments/episodicSG.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/environments/episodicSG.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/examples/nfqSG/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/examples/nfqSG/__init__.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/examples/tasks/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/examples/tasks/__init__.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/experiments/episodicSG.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/experiments/episodicSG.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/leaners/valuebased/ceq.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/leaners/valuebased/ceq.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/leaners/valuebased/phc.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/leaners/valuebased/phc.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/leaners/valuebased/sgsp.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/leaners/valuebased/sgsp.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/examples/tasks/gridgames.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/examples/tasks/gridgames.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/examples/tasks/huntinggame.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/examples/tasks/huntinggame.pyc 
-------------------------------------------------------------------------------- /pybrainSG/rl/examples/tasks/staticgame.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/examples/tasks/staticgame.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/leaners/valuebased/nfqSG.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/leaners/valuebased/nfqSG.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/examples/linearfaSG/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/examples/linearfaSG/__init__.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/leaners/valuebased/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/leaners/valuebased/__init__.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/leaners/valuebased/learnerfaSG.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/leaners/valuebased/learnerfaSG.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/examples/ceq/example_huntingGame.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/examples/ceq/example_huntingGame.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/examples/nfqSG/example_gridgames.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/examples/nfqSG/example_gridgames.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/examples/nfqSG/example_huntingGame.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/examples/nfqSG/example_huntingGame.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/examples/ceq/example_gridgames_CEQ_NFQ.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/examples/ceq/example_gridgames_CEQ_NFQ.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/leaners/valuebased/indexablevaluebased.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/leaners/valuebased/indexablevaluebased.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/examples/ceq/example_huntinggame_CEQ-NFQ.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/examples/ceq/example_huntinggame_CEQ-NFQ.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/examples/linearfaSG/example_huntingGame.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/examples/linearfaSG/example_huntingGame.pyc -------------------------------------------------------------------------------- /.mamemose.rb: -------------------------------------------------------------------------------- 1 | DOCUMENT_ROOT = "~/memo" 2 | 3 | PORT = 8888 4 | 5 | MARKDOWN_PATTERN = /\.(md|markdown|txt)$/ 6 | 7 | RECENT_NUM = 0 8 | 9 | # RECENT_PATTERN = MARKDOWN_PATTERN 10 | 11 | CUSTOM_HEADER = <
13 | 14 | HEADER -------------------------------------------------------------------------------- /.pydevproject: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | /${PROJECT_DIR_NAME} 5 | 6 | python 2.7 7 | Default 8 | 9 | -------------------------------------------------------------------------------- /.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | Pybrain_StochasticGames 4 | 5 | 6 | 7 | 8 | 9 | org.python.pydev.PyDevBuilder 10 | 11 | 12 | 13 | 14 | 15 | org.python.pydev.pythonNature 16 | 17 | 18 | -------------------------------------------------------------------------------- /pybrainSG/rl/agents/indexable.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/02/19 3 | 4 | @author: takuya-hv2 5 | ''' 6 | __author__ = 'Takuya Hiraoka, takuya-h@is.naist.jp' 7 | 8 | from pybrain.rl.agents.agent import Agent 9 | 10 | class IndexableAgent(Agent): 11 | ''' 12 | Agent which can be indexed. 13 | ''' 14 | indexOfAgent=None 15 | 16 | def __init__(self, index=None): 17 | self.setIndexOfAgent(index) 18 | 19 | def setIndexOfAgent(self,index): 20 | """ set index to agent. 21 | :key index: index of agent 22 | :type index: integer 23 | """ 24 | self.indexOfAgent=index 25 | 26 | -------------------------------------------------------------------------------- /pybrainSG/rl/environments/episodicSG.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/02/19 3 | 4 | @author: takuya-hv2 5 | ''' 6 | from pybrain.utilities import abstractMethod 7 | from pybrain.rl.environments.task import Task 8 | 9 | class EpisodicTaskSG(Task): 10 | """Stochastic game version of EpisodicTask class""" 11 | 12 | def __init__(self, environment): 13 | Task.__init__(self,environment) 14 | 15 | def reset(self): 16 | """ Re-initialize the environment """ 17 | self.env.reset() 18 | 19 | def isFinished(self): 20 | """ Is the current episode over? """ 21 | abstractMethod() 22 | 23 | def performAction(self, jointAction): 24 | """ Execute joint action of all agents. """ 25 | Task.performAction(self, jointAction) 26 | -------------------------------------------------------------------------------- /pybrainSG/rl/leaners/valuebased/indexablevaluebased.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/02/19 3 | 4 | @author: takuya-hv2 5 | ''' 6 | from pybrain.rl.learners.valuebased.valuebased import ValueBasedLearner 7 | 8 | class IndexableValueBasedLearner(ValueBasedLearner): 9 | indexOfAgent=None 10 | ownerAgentProperties={ 11 | "requireOtherAgentsState": None, #Define if learner require, in addition to owner's state information, other agent state information as well. 12 | "requireJointAction":None, #Define if learner require, in addition to owner's state information, other agent action information as well. 13 | "requireJointReward":None}#Define if learner require, in addition to owner's state information, other agent reward information as well. 14 | 15 | def __init__(self, indexOfAgent=None, **kwargs): 16 | ValueBasedLearner.__init__(self) 17 | self.indexOfAgent=indexOfAgent 18 | 19 | def setIndexOfAgent(self, indexOfAgent): 20 | self.indexOfAgent=indexOfAgent 21 | 22 | def getProperty(self): 23 | for elem in self.ownerAgentProperties.values(): 24 | assert isinstance(elem,bool), "All property should be initialize with boolian." 
25 | return self.ownerAgentProperties -------------------------------------------------------------------------------- /pybrainSG/rl/examples/phc/example_gridgames.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/03/07 3 | 4 | @author: takuya-hv2 5 | ''' 6 | from pybrainSG.rl.experiments.episodicSG import EpisodicExperimentSG 7 | from pybrainSG.rl.examples.tasks.gridgames import * 8 | from pybrainSG.rl.agents.multiAgent import MultiAgent 9 | from pybrainSG.rl.leaners.valuebased.phc import * 10 | from pybrainSG.rl.agents.faphc import PHC_Agent 11 | 12 | if __name__ == '__main__': 13 | ma=MultiAgent() 14 | for i in range(GridGame.numberofAgents): 15 | # learner= PHC_NN( 16 | # num_features=(GridGame.numberofAgents*2), 17 | # num_actions=len(GridGame.availableActions)) 18 | learner= PHC_WoLF_NN( 19 | num_features=(GridGame.numberofAgents*2), 20 | num_actions=len(GridGame.availableActions)) 21 | agent= PHC_Agent(learner,numAgents=GridGame.numberofAgents,index=i) 22 | ma.addAgent(agent) 23 | task=GridGameTask() 24 | 25 | # task=GridGameTask(gameType="GG1") 26 | # task=GridGameTask(gameType="GG2") 27 | task=GridGameTask(gameType="GG3") 28 | exp=EpisodicExperimentSG(task,ma) 29 | print "Rewards for agents at the end of episode:" 30 | for i in range(40000): 31 | rewards=exp.doEpisodes(number=1) 32 | print str(rewards[0][-1]) 33 | -------------------------------------------------------------------------------- /pybrainSG/rl/examples/phc/example_staticGame.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/02/19 3 | 4 | @author: takuya-hv2 5 | ''' 6 | 7 | from pybrainSG.rl.experiments.episodicSG import EpisodicExperimentSG 8 | from pybrainSG.rl.agents.multiAgent import MultiAgent 9 | from pybrainSG.rl.agents.faphc import PHC_Agent 10 | from pybrainSG.rl.leaners.valuebased.phc import * 11 | from pybrainSG.rl.examples.tasks.staticgame import SimpleMatrixGame, StaticGameTask 12 | 13 | if __name__ == '__main__': 14 | ma=MultiAgent() 15 | for i in range(2): 16 | # learner= PHC_NN( 17 | # num_features=1, 18 | # num_actions=len(SimpleMatrixGame.availableActions)) 19 | learner= PHC_WoLF_NN( 20 | num_features=1, 21 | num_actions=len(SimpleMatrixGame.availableActions), 22 | ) 23 | learner.rewardDiscount=0.0 24 | agent= PHC_Agent(learner,numAgents=2,index=i) 25 | ma.addAgent(agent) 26 | task=StaticGameTask() 27 | 28 | exp=EpisodicExperimentSG(task,ma) 29 | rewards=exp.doEpisodes(number=1000) 30 | print "Given reward for " + str(len(rewards)) + " episodes:" 31 | print "Reward for Agent 1, Reward for Agent 2" 32 | for i in range(len(rewards)): 33 | print str(rewards[i][-1][0])+", "+str(rewards[i][-1][1]) 34 | -------------------------------------------------------------------------------- /pybrainSG/rl/examples/linearfaSG/example_staticGame.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/02/19 3 | 4 | @author: takuya-hv2 5 | ''' 6 | 7 | from pybrainSG.rl.experiments.episodicSG import EpisodicExperimentSG 8 | from pybrainSG.rl.leaners.valuebased.learnerfaSG import Q_LinFA_SG 9 | from pybrainSG.rl.agents.linearfaSG import LinearFA_AgentSG 10 | from pybrainSG.rl.examples.tasks.staticgame import SimpleMatrixGame, StaticGameTask 11 | from pybrainSG.rl.agents.multiAgent import MultiAgent 12 | import numpy as np 13 | if __name__ == '__main__': 14 | ma=MultiAgent() 15 | for i in range(2): 16 | learner= Q_LinFA_SG( 17 | 
num_features=1, 18 | num_actions=len(SimpleMatrixGame.availableActions)) 19 | agent= LinearFA_AgentSG(learner, 20 | num_features=np.ones((2,1)), 21 | num_actions=(np.ones(2)*len(SimpleMatrixGame.availableActions)), 22 | num_agents=2, 23 | index=i) 24 | ma.addAgent(agent) 25 | 26 | task=StaticGameTask() 27 | 28 | exp=EpisodicExperimentSG(task,ma) 29 | rewards=exp.doEpisodes(number=500) 30 | print "Given reward for " + str(len(rewards)) + " episodes:" 31 | print "Reward for Agent 1, Reward for Agent 2" 32 | for i in range(len(rewards)): 33 | print str(rewards[i][-1][0])+", "+str(rewards[i][-1][1]) 34 | -------------------------------------------------------------------------------- /pybrainSG/rl/examples/phc/example_huntingGame.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/02/19 3 | 4 | @author: takuya-hv2 5 | ''' 6 | from pybrainSG.rl.experiments.episodicSG import EpisodicExperimentSG 7 | from pybrainSG.rl.examples.tasks.huntinggame import HuntingGame, HuntingGameTask 8 | from pybrainSG.rl.agents.multiAgent import MultiAgent 9 | from pybrainSG.rl.leaners.valuebased.phc import * 10 | from pybrainSG.rl.agents.faphc import PHC_Agent 11 | 12 | if __name__ == '__main__': 13 | ma=MultiAgent() 14 | HuntingGame.numberofAgents=2 15 | for i in range(HuntingGame.numberofAgents): 16 | # learner= PHC_NN( 17 | # num_features=(HuntingGame.numberofAgents*2+HuntingGame.numberofAnimals*2+1), 18 | # num_actions=len(HuntingGame.availableActions)) 19 | learner= PHC_WoLF_NN( 20 | num_features=(HuntingGame.numberofAgents*2+HuntingGame.numberofAnimals*2+1), 21 | num_actions=len(HuntingGame.availableActions)) 22 | agent= PHC_Agent(learner,numAgents=HuntingGame.numberofAgents,index=i) 23 | ma.addAgent(agent) 24 | 25 | task=HuntingGameTask() 26 | 27 | print "Given reward for Agents" 28 | for i in range(10000): 29 | exp=EpisodicExperimentSG(task,ma) 30 | rewards=exp.doEpisodes(number=1) 31 | for i in range(len(rewards)): 32 | print str(rewards[i][-1][0]) 33 | 34 | -------------------------------------------------------------------------------- /pybrainSG/rl/examples/ceq/example_staticGame.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/02/19 3 | 4 | @author: takuya-hv2 5 | ''' 6 | 7 | from pybrainSG.rl.experiments.episodicSG import EpisodicExperimentSG 8 | from pybrainSG.rl.leaners.valuebased.ceq import * 9 | from pybrainSG.rl.agents.ceqa import * 10 | from pybrainSG.rl.examples.tasks.staticgame import SimpleMatrixGame, StaticGameTask 11 | from pybrainSG.rl.agents.multiAgent import MultiAgent 12 | import numpy as np 13 | if __name__ == '__main__': 14 | ma=MultiAgent() 15 | for i in range(2): 16 | 17 | learner= CEQ_Lin( 18 | num_features=1, 19 | num_actions=np.ones(2,dtype=np.int8)*len(SimpleMatrixGame.availableActions), 20 | num_agents=2, 21 | indexOfAgent=i) 22 | learner.rewardDiscount=0.0 23 | agent= CEQ_Agent(learner, 24 | num_features=1, 25 | num_actions=(np.ones(2)*len(SimpleMatrixGame.availableActions)), 26 | num_agents=2, 27 | index=i) 28 | ma.addAgent(agent) 29 | 30 | task=StaticGameTask() 31 | 32 | exp=EpisodicExperimentSG(task,ma) 33 | print "Reward for Agent 1, Reward for Agent 2" 34 | for i in range(50000): 35 | rewards=exp.doEpisodes(number=1) 36 | print str(rewards[0][-1][0])+", "+str(rewards[0][-1][1]) 37 | -------------------------------------------------------------------------------- /pybrainSG/rl/examples/sgsp/example_staticgame.py: 
-------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/03/10 3 | 4 | @author: takuya-hv2 5 | ''' 6 | 7 | from pybrainSG.rl.experiments.episodicSG import EpisodicExperimentSG 8 | from pybrainSG.rl.agents.multiAgent import MultiAgent 9 | from pybrainSG.rl.leaners.valuebased.sgsp import * 10 | from pybrainSG.rl.agents.sgspa import SGSP_Agent 11 | from pybrainSG.rl.examples.tasks.staticgame import SimpleMatrixGame, StaticGameTask 12 | 13 | if __name__ == '__main__': 14 | ma=MultiAgent() 15 | for i in range(2): 16 | learner= ON_SGSP_NN( 17 | num_features=1, 18 | num_actions=np.ones(2,dtype=np.int8)*len(SimpleMatrixGame.availableActions), 19 | num_agents=2, 20 | index=i) 21 | learner.rewardDiscount=0.0 22 | agent= SGSP_Agent( 23 | learner, 24 | num_actions=np.ones(2,dtype=np.int8)*len(SimpleMatrixGame.availableActions), 25 | numAgents=2, 26 | index=i) 27 | ma.addAgent(agent) 28 | task=StaticGameTask() 29 | 30 | exp=EpisodicExperimentSG(task,ma) 31 | rewards=exp.doEpisodes(number=1000) 32 | print "Given reward for " + str(len(rewards)) + " episodes:" 33 | print "Reward for Agent 1, Reward for Agent 2" 34 | for i in range(len(rewards)): 35 | print str(rewards[i][-1][0])+", "+str(rewards[i][-1][1]) 36 | -------------------------------------------------------------------------------- /pybrainSG/rl/examples/sgsp/example_gridgames.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/03/07 3 | 4 | @author: takuya-hv2 5 | ''' 6 | from pybrainSG.rl.experiments.episodicSG import EpisodicExperimentSG 7 | from pybrainSG.rl.examples.tasks.gridgames import * 8 | from pybrainSG.rl.agents.multiAgent import MultiAgent 9 | from pybrainSG.rl.leaners.valuebased.sgsp import * 10 | from pybrainSG.rl.agents.sgspa import SGSP_Agent 11 | 12 | if __name__ == '__main__': 13 | ma=MultiAgent() 14 | for i in range(GridGame.numberofAgents): 15 | learner= ON_SGSP_NN( 16 | num_features=(GridGame.numberofAgents*2), 17 | num_actions=np.ones(GridGame.numberofAgents,dtype=np.int8)*len(GridGame.availableActions), 18 | num_agents=GridGame.numberofAgents, 19 | index=i) 20 | agent= SGSP_Agent( 21 | learner, 22 | num_actions=np.ones(GridGame.numberofAgents,dtype=np.int8)*len(GridGame.availableActions), 23 | numAgents=GridGame.numberofAgents, 24 | index=i) 25 | ma.addAgent(agent) 26 | task=GridGameTask() 27 | 28 | # task=GridGameTask(gameType="GG1") 29 | # task=GridGameTask(gameType="GG2") 30 | task=GridGameTask(gameType="GG3") 31 | exp=EpisodicExperimentSG(task,ma) 32 | print "Rewards for agents at the end of episode:" 33 | for i in range(40000): 34 | rewards=exp.doEpisodes(number=1) 35 | print str(rewards[0][-1]) 36 | -------------------------------------------------------------------------------- /pybrainSG/rl/examples/linearfaSG/example_gridgames.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/03/07 3 | 4 | @author: takuya-hv2 5 | ''' 6 | from pybrainSG.rl.experiments.episodicSG import EpisodicExperimentSG 7 | from pybrainSG.rl.examples.tasks.gridgames import * 8 | from pybrainSG.rl.agents.multiAgent import MultiAgent 9 | from pybrainSG.rl.leaners.valuebased.learnerfaSG import Q_LinFA_SG 10 | from pybrainSG.rl.agents.linearfaSG import LinearFA_AgentSG 11 | 12 | if __name__ == '__main__': 13 | ma=MultiAgent() 14 | for i in range(GridGame.numberofAgents): 15 | learner= Q_LinFA_SG( 16 | num_features=(GridGame.numberofAgents*2), 17 | 
num_actions=len(GridGame.availableActions)) 18 | agent= LinearFA_AgentSG( 19 | learner, 20 | num_features=np.ones(GridGame.numberofAgents,dtype=np.int8)*(GridGame.numberofAgents*2), 21 | num_actions=np.ones(GridGame.numberofAgents,dtype=np.int8)*len(GridGame.availableActions), 22 | num_agents=GridGame.numberofAgents, 23 | index=i) 24 | ma.addAgent(agent) 25 | task=GridGameTask() 26 | 27 | # task=GridGameTask(gameType="GG1") 28 | # task=GridGameTask(gameType="GG2") 29 | task=GridGameTask(gameType="GG3") 30 | exp=EpisodicExperimentSG(task,ma) 31 | print "Rewards for agents at the end of episode:" 32 | for i in range(40000): 33 | rewards=exp.doEpisodes(number=1) 34 | print str(rewards[0][-1]) 35 | -------------------------------------------------------------------------------- /pybrainSG/rl/examples/linearfaSG/example_huntingGame.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/02/19 3 | 4 | @author: takuya-hv2 5 | ''' 6 | 7 | from pybrainSG.rl.experiments.episodicSG import EpisodicExperimentSG 8 | from pybrainSG.rl.leaners.valuebased.learnerfaSG import Q_LinFA_SG 9 | from pybrainSG.rl.agents.linearfaSG import LinearFA_AgentSG 10 | from pybrainSG.rl.examples.tasks.huntinggame import HuntingGame, HuntingGameTask 11 | from pybrainSG.rl.agents.multiAgent import MultiAgent 12 | import numpy as np 13 | if __name__ == '__main__': 14 | ma=MultiAgent() 15 | for i in range(HuntingGame.numberofAgents): 16 | learner= Q_LinFA_SG( 17 | num_features=(HuntingGame.numberofAgents*2+HuntingGame.numberofAnimals*2+1), 18 | num_actions=len(HuntingGame.availableActions)) 19 | agent= LinearFA_AgentSG( 20 | learner, 21 | num_features=np.ones(HuntingGame.numberofAgents)*(HuntingGame.numberofAgents*2+HuntingGame.numberofAnimals*2+1), 22 | num_actions=np.ones(HuntingGame.numberofAgents)*len(HuntingGame.availableActions), 23 | num_agents=HuntingGame.numberofAgents, 24 | index=i) 25 | ma.addAgent(agent) 26 | task=HuntingGameTask() 27 | exp=EpisodicExperimentSG(task,ma) 28 | rewards=exp.doEpisodes(number=1000) 29 | print "Given reward for " + str(len(rewards)) + " episodes:" 30 | print "Reward for Agents" 31 | for i in range(len(rewards)): 32 | print str(rewards[i][-1][0]) 33 | -------------------------------------------------------------------------------- /pybrainSG/rl/experiments/episodicSG.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/02/19 3 | 4 | @author: takuya-hv2 5 | ''' 6 | __author__ = 'Takuya Hiraoka, takuya-h@is.naist.jp' 7 | 8 | from pybrain.rl.experiments.experiment import Experiment 9 | from pybrainSG.rl.agents.multiAgent import MultiAgent 10 | from pybrainSG.rl.environments.episodicSG import EpisodicTaskSG 11 | 12 | class EpisodicExperimentSG(Experiment): 13 | """ Stochastic version of EpisodicExperiment class. """ 14 | def __init__(self, task, multiAgent): 15 | assert isinstance(task, EpisodicTaskSG), "task should be the subclass of EpisodicTaskSG." 16 | assert isinstance(multiAgent, MultiAgent), "task should be MultAgent." 17 | Experiment.__init__(self, task, multiAgent) 18 | 19 | 20 | def _oneInteraction(self): 21 | """ Do an interaction between the Task and Agents. 
""" 22 | self.stepid += 1 23 | self.agent.integrateObservation(self.task.getObservation()) 24 | self.task.performAction(self.agent.getJointAction()) 25 | reward = self.task.getReward() 26 | self.agent.giveJointReward(reward) 27 | return reward 28 | 29 | def doEpisodes(self, number = 1): 30 | """ Do one episode, and return the joint rewards of each step as a list. """ 31 | all_rewards = [] 32 | for dummy in range(number): 33 | self.agent.newEpisode() 34 | rewards = [] 35 | self.stepid = 0 36 | self.task.reset() 37 | while not self.task.isFinished(): 38 | r = self._oneInteraction() 39 | rewards.append(r) 40 | all_rewards.append(rewards) 41 | return all_rewards 42 | -------------------------------------------------------------------------------- /pybrainSG/rl/examples/ceq/example_gridgames.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/03/07 3 | 4 | @author: takuya-hv2 5 | ''' 6 | from pybrainSG.rl.experiments.episodicSG import EpisodicExperimentSG 7 | from pybrainSG.rl.agents.multiAgent import MultiAgent 8 | from pybrainSG.rl.leaners.valuebased.ceq import * 9 | from pybrainSG.rl.agents.ceqa import * 10 | from pybrainSG.rl.examples.tasks.gridgames import GridGameTask, GridGame 11 | import numpy as np 12 | import warnings 13 | 14 | if __name__ == '__main__': 15 | warnings.simplefilter("ignore") 16 | ma=MultiAgent() 17 | for i in range(GridGame.numberofAgents): 18 | learner= CEQ_Lin( 19 | num_features=(GridGame.numberofAgents*2), 20 | num_actions=np.ones(GridGame.numberofAgents,dtype=np.int8)*len(GridGame.availableActions), 21 | num_agents=GridGame.numberofAgents, 22 | indexOfAgent=i) 23 | agent= CEQ_Agent( 24 | learner, 25 | num_features=np.ones(GridGame.numberofAgents,dtype=np.int8)*(GridGame.numberofAgents*2), 26 | num_actions=np.ones(GridGame.numberofAgents,dtype=np.int8)*len(GridGame.availableActions), 27 | num_agents=GridGame.numberofAgents, 28 | index=i) 29 | ma.addAgent(agent) 30 | # task=GridGameTask(gameType="GG1") 31 | # task=GridGameTask(gameType="GG2") 32 | task=GridGameTask(gameType="GG3") 33 | exp=EpisodicExperimentSG(task,ma) 34 | print "Rewards for agents at the end of episode:" 35 | for i in range(40000): 36 | rewards=exp.doEpisodes(number=1) 37 | print str(rewards[0][-1]) 38 | -------------------------------------------------------------------------------- /pybrainSG/rl/examples/ceq/example_huntingGame.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/02/19 3 | 4 | @author: takuya-hv2 5 | ''' 6 | 7 | from pybrainSG.rl.experiments.episodicSG import EpisodicExperimentSG 8 | from pybrainSG.rl.leaners.valuebased.ceq import * 9 | from pybrainSG.rl.agents.ceqa import * 10 | from pybrainSG.rl.examples.tasks.huntinggame import HuntingGame, HuntingGameTask 11 | from pybrainSG.rl.agents.multiAgent import MultiAgent 12 | #from pybrain.unsupervised.trainers.deepbelief 13 | 14 | import numpy as np 15 | if __name__ == '__main__': 16 | # warnings.simplefilter("ignore") 17 | ma=MultiAgent() 18 | HuntingGame.numberofAgents=2 19 | for i in range(HuntingGame.numberofAgents): 20 | learner= CEQ_Lin( 21 | num_features=(HuntingGame.numberofAgents*2+HuntingGame.numberofAnimals*2+1), 22 | num_actions=np.ones(HuntingGame.numberofAgents,dtype=np.int8)*len(HuntingGame.availableActions), 23 | num_agents=HuntingGame.numberofAgents, 24 | indexOfAgent=i) 25 | agent= CEQ_Agent( 26 | learner, 27 | 
num_features=np.ones(HuntingGame.numberofAgents,dtype=np.int8)*(HuntingGame.numberofAgents*2+HuntingGame.numberofAnimals*2+1), 28 | num_actions=np.ones(HuntingGame.numberofAgents,dtype=np.int8)*len(HuntingGame.availableActions), 29 | num_agents=HuntingGame.numberofAgents, 30 | index=i) 31 | ma.addAgent(agent) 32 | task=HuntingGameTask() 33 | exp=EpisodicExperimentSG(task,ma) 34 | print "Reward for Agents" 35 | for i in range(40000): 36 | rewards=exp.doEpisodes(number=1) 37 | print str(rewards[0][-1][0]) 38 | -------------------------------------------------------------------------------- /pybrainSG/rl/examples/nfqSG/example_gridgames.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/03/06 3 | 4 | @author: takuya-hv2 5 | ''' 6 | from pybrain.rl.learners.valuebased.interface import ActionValueNetwork 7 | from pybrainSG.rl.experiments.episodicSG import EpisodicExperimentSG 8 | from pybrainSG.rl.agents.multiAgent import MultiAgent 9 | from pybrainSG.rl.leaners.valuebased.nfqSG import NFQ_SG 10 | from pybrainSG.rl.agents.learningSG import LearningAgentSG 11 | from pybrainSG.rl.examples.tasks.gridgames import GridGameTask, GridGame 12 | import numpy as np 13 | import warnings 14 | 15 | if __name__ == '__main__': 16 | warnings.simplefilter("ignore") 17 | for _ in range(500): 18 | ma=MultiAgent() 19 | for i in range(GridGame.numberofAgents): 20 | net=ActionValueNetwork(dimState=(GridGame.numberofAgents*2), 21 | numActions=len(GridGame.availableActions)) 22 | learner= NFQ_SG(maxEpochs=100) 23 | agent = LearningAgentSG(net, 24 | num_features=(np.ones(GridGame.numberofAgents)*(GridGame.numberofAgents*2)), 25 | num_actions=(np.ones(GridGame.numberofAgents)*len(GridGame.availableActions)), 26 | num_agents=GridGame.numberofAgents, 27 | learner=learner, 28 | index=i) 29 | ma.addAgent(agent) 30 | # task=GridGameTask(gameType="GG1") 31 | # task=GridGameTask(gameType="GG2") 32 | task=GridGameTask(gameType="GG3") 33 | exp=EpisodicExperimentSG(task,ma) 34 | print "Average reward for agents at the end of episode:" 35 | #Two phase learning 36 | rewards=exp.doEpisodes(number=10)#first phase 37 | ma.learn() 38 | for numBatch in range(40): 39 | avr=np.array([0.0,0.0]) 40 | for i in range(len(rewards)): 41 | avr+=rewards[i][-1] 42 | avr/=float(np.size(rewards,axis=0)) 43 | print avr 44 | rewards=exp.doEpisodes(number=10) 45 | ma.learn() 46 | -------------------------------------------------------------------------------- /pybrainSG/rl/examples/ceq/example_gridgames_CEQ_NFQ.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/03/06 3 | 4 | @author: takuya-hv2 5 | ''' 6 | from pybrainSG.rl.experiments.episodicSG import EpisodicExperimentSG 7 | from pybrainSG.rl.agents.multiAgent import MultiAgent 8 | from pybrainSG.rl.leaners.valuebased.ceq import * 9 | from pybrainSG.rl.agents.nfceqa import * 10 | from pybrainSG.rl.examples.tasks.gridgames import GridGameTask, GridGame 11 | import numpy as np 12 | import warnings 13 | 14 | if __name__ == '__main__': 15 | warnings.simplefilter("ignore") 16 | for _ in range(500): 17 | ma=MultiAgent() 18 | for i in range(GridGame.numberofAgents): 19 | learner= NFCEQ( 20 | num_features=(GridGame.numberofAgents*2), 21 | num_actions=np.ones(GridGame.numberofAgents,dtype=np.int8)*len(GridGame.availableActions), 22 | num_agents=GridGame.numberofAgents, 23 | max_epochs=100, 24 | indexOfAgent=i) 25 | agent= NFCEQ_Agent( 26 | learner, 27 | 
num_features=np.ones(GridGame.numberofAgents,dtype=np.int8)*(GridGame.numberofAgents*2), 28 | num_actions=np.ones(GridGame.numberofAgents,dtype=np.int8)*len(GridGame.availableActions), 29 | num_agents=GridGame.numberofAgents, 30 | index=i) 31 | ma.addAgent(agent) 32 | # task=GridGameTask(gameType="GG1") 33 | # task=GridGameTask(gameType="GG2") 34 | task=GridGameTask(gameType="GG3") 35 | exp=EpisodicExperimentSG(task,ma) 36 | print "Average reward for agents at the end of episode:" 37 | #Two phase learning 38 | rewards=exp.doEpisodes(number=30)#first phase 39 | ma.learn() 40 | for numBatch in range(40): 41 | avr=np.array([0.0,0.0]) 42 | for i in range(len(rewards)): 43 | avr+=rewards[i][-1] 44 | avr/=float(np.size(rewards,axis=0)) 45 | print avr 46 | rewards=exp.doEpisodes(number=10) 47 | ma.learn() 48 | -------------------------------------------------------------------------------- /pybrainSG/rl/examples/ceq/example_huntinggame_CEQ-NFQ.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/02/28 3 | 4 | @author: takuya-hv2 5 | ''' 6 | from pybrainSG.rl.experiments.episodicSG import EpisodicExperimentSG 7 | from pybrainSG.rl.leaners.valuebased.ceq import * 8 | from pybrainSG.rl.agents.nfceqa import * 9 | from pybrainSG.rl.examples.tasks.huntinggame import HuntingGame, HuntingGameTask 10 | from pybrainSG.rl.agents.multiAgent import MultiAgent 11 | 12 | import numpy as np 13 | if __name__ == '__main__': 14 | warnings.simplefilter("ignore") 15 | 16 | for _ in range(500): 17 | ma=MultiAgent() 18 | HuntingGame.numberofAgents=2 19 | for i in range(HuntingGame.numberofAgents): 20 | learner= NFCEQ( 21 | num_features=(HuntingGame.numberofAgents*2+HuntingGame.numberofAnimals*2+1), 22 | num_actions=np.ones(HuntingGame.numberofAgents,dtype=np.int8)*len(HuntingGame.availableActions), 23 | num_agents=HuntingGame.numberofAgents, 24 | max_epochs=100, 25 | indexOfAgent=i) 26 | agent= NFCEQ_Agent( 27 | learner, 28 | num_features=np.ones(HuntingGame.numberofAgents,dtype=np.int8)*(HuntingGame.numberofAgents*2+HuntingGame.numberofAnimals*2+1), 29 | num_actions=np.ones(HuntingGame.numberofAgents,dtype=np.int8)*len(HuntingGame.availableActions), 30 | num_agents=HuntingGame.numberofAgents, 31 | index=i) 32 | ma.addAgent(agent) 33 | task=HuntingGameTask() 34 | exp=EpisodicExperimentSG(task,ma) 35 | print "Reward for Agents" 36 | print "Average Reward for Agents (at the end of episode)" 37 | #Two phase leanring 38 | rewards=exp.doEpisodes(number=10)#firstphase 39 | ma.learn() 40 | for numBatch in range(40): 41 | avr=0.0 42 | for i in range(len(rewards)): 43 | #print str(rewards[i][-1][0]) 44 | #average 45 | avr+=rewards[i][-1][0] 46 | avr/=float(len(rewards)) 47 | print avr 48 | rewards=exp.doEpisodes(number=10) 49 | ma.learn() 50 | -------------------------------------------------------------------------------- /pybrainSG/rl/examples/tasks/staticgame.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/02/20 3 | 4 | @author: takuya-hv2 5 | ''' 6 | 7 | from pybrain.rl.environments import Environment 8 | from pybrainSG.rl.experiments.episodicSG import EpisodicTaskSG 9 | import numpy as np 10 | 11 | 12 | class StaticGameTask(EpisodicTaskSG): 13 | '''All agent make decision (Head or Tail) simultaneously only one time. 
''' 14 | 15 | isGameFinished=False 16 | def __init__(self): 17 | EpisodicTaskSG.__init__(self, SimpleMatrixGame()) 18 | 19 | def reset(self): 20 | EpisodicTaskSG.reset(self) 21 | self.isGameFinished=False 22 | 23 | def isFinished(self): 24 | return self.isGameFinished 25 | 26 | def getReward(self): 27 | self.isGameFinished=True 28 | return self.env.getJointReward() 29 | 30 | 31 | class SimpleMatrixGame(Environment): 32 | '''Corresponding to Heads and Tails respectively.''' 33 | availableActions=[0,1] 34 | '''payoff matrix of each agent in cooperative task scenario''' 35 | payoffMatricForAgent1=[[10,-10], 36 | [-10,-10]] 37 | payoffMatricForAgent2=[[10,-10], 38 | [-10,-10]] 39 | # '''payoff matrix of zero-sumgame scenario. nash equilibrium: (Agenat1's action=0,Agent2's action=1)''' 40 | # payoffMatricForAgent1=[[5,2], 41 | # [-1,6]] 42 | # payoffMatricForAgent2=[[-5,-2], 43 | # [1,-6]] 44 | # '''payoff matrix of zero-sumgame scenario. matching pennies''' 45 | # payoffMatricForAgent1=[[1,-1], 46 | # [-1,1]] 47 | # payoffMatricForAgent2=[[-1,1], 48 | # [1,-1]] 49 | 50 | outcomeForAfgenet1=None 51 | outcomeForAfgenet2=None 52 | 53 | def getSensors(self): 54 | return np.ones((2,1))#Static state (i.e., no state transition) 55 | 56 | def performAction(self, action): 57 | self.outcomeForAfgenet1=SimpleMatrixGame.payoffMatricForAgent1[action[0]][action[1]] 58 | self.outcomeForAfgenet2=SimpleMatrixGame.payoffMatricForAgent2[action[0]][action[1]] 59 | 60 | def reset(self): 61 | self.outcomeForAfgenet1=None 62 | self.outcomeForAfgenet2=None 63 | 64 | def getJointReward(self): 65 | return np.array([self.outcomeForAfgenet1,self.outcomeForAfgenet2]) 66 | 67 | -------------------------------------------------------------------------------- /pybrainSG/rl/examples/nfqSG/example_huntingGame.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/02/19 3 | 4 | @author: takuya-hv2 5 | ''' 6 | from pybrain.rl.learners.valuebased.interface import ActionValueNetwork 7 | from pybrainSG.rl.experiments.episodicSG import EpisodicExperimentSG 8 | from pybrainSG.rl.agents.multiAgent import MultiAgent 9 | from pybrainSG.rl.leaners.valuebased.nfqSG import NFQ_SG 10 | from pybrainSG.rl.agents.learningSG import LearningAgentSG 11 | from pybrainSG.rl.examples.tasks.huntinggame import HuntingGame, HuntingGameTask 12 | import numpy as np 13 | import warnings 14 | if __name__ == '__main__': 15 | warnings.simplefilter("ignore") 16 | for _ in range(500): 17 | ma=MultiAgent() 18 | # 19 | HuntingGame.numberofAgents=2 20 | for i in range(HuntingGame.numberofAgents): 21 | #dimState=# position of each agent in grid world + # position of each niman in grid world + bias 22 | net=ActionValueNetwork(dimState=(HuntingGame.numberofAgents*2+HuntingGame.numberofAnimals*2+1), 23 | numActions=len(HuntingGame.availableActions)) 24 | learner= NFQ_SG(maxEpochs=100)#hopefully, more than 100. 25 | # learner._explorer.epsilon=0.1#In one player case, that too small. 
26 |             #print learner.explorer
27 |             agent = LearningAgentSG(net,
28 |                                num_features=(np.ones(HuntingGame.numberofAgents)*(HuntingGame.numberofAgents*2+HuntingGame.numberofAnimals*2+1)),
29 |                                num_actions=(np.ones(HuntingGame.numberofAgents)*len(HuntingGame.availableActions)),
30 |                                num_agents=HuntingGame.numberofAgents,
31 |                                learner=learner,
32 |                                index=i)
33 |             ma.addAgent(agent)
34 | 
35 |         task=HuntingGameTask()
36 | 
37 |         exp=EpisodicExperimentSG(task,ma)
38 |         print "Reward for Agents"
39 |         print "Average Reward for Agents (at the end of episode)"
40 |         #Two-phase learning
41 |         rewards=exp.doEpisodes(number=10)# first phase
42 |         ma.learn()
43 |         for numBatch in range(40):
44 |             avr=0.0
45 |             for i in range(len(rewards)):
46 |                 #print str(rewards[i][-1][0])
47 |                 #average
48 |                 avr+=rewards[i][-1][0]
49 |             avr/=float(len(rewards))
50 |             print avr
51 |             rewards=exp.doEpisodes(number=10)
52 |             ma.learn()
53 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Multi-agent reinforcement learning in stochastic games
2 | ====
3 | 
4 | # What is this package?
5 | This package is an unofficial PyBrain extension for __multi-agent reinforcement learning__ in __general sum stochastic games__.
6 | The package provides 1) a framework for modeling general sum stochastic games and 2) multi-agent reinforcement learning algorithms for them.
7 | 
8 | 
9 | ## General sum stochastic games (GSSGs)
10 | GSSGs are a generalization of Markov decision processes (MDPs) to multi-agent situations, represented as a tuple <_D,_ _S,_ ___A,___ _T,_ ___R___> (right side of the following figure).
11 | _D_ represents the set of agents, _S_ represents the state of the environment, ___A___ represents a joint action of all agents, _T_ represents the state transition function, and ___R___ represents a joint reward for the agents. In contrast to MDPs, GSSGs allow multiple agents to affect the environment and receive rewards simultaneously.
12 | We can model many real-world phenomena with GSSGs (e.g., trading in a marketplace, negotiation among stakeholders, or collaborative tasks of robots).
13 | 
14 | ![img](./Images/MDPsandGSSGs.jpg "MDPs and GSSGs")
15 | 
16 | 
17 | ## Multi-agent reinforcement learning (MARL)
18 | MARL is used to learn agent policies $\pi$ concurrently.
19 | $\pi$ is a mapping function from an observed state _S_ to an agent action _A_ (see the figure above).
20 | Each agent's policy is learnt to maximize its own expected cumulative reward, converging to an equilibrium (typically a Nash equilibrium) in which no agent can improve its outcome by unilaterally changing its policy.
21 | This package provides variations of [PHC, PHC-WoLF](http://www.cs.cmu.edu/~mmv/papers/01ijcai-mike.pdf), [Correlated-Q Learning](https://www.aaai.org/Papers/ICML/2003/ICML03-034.pdf), and [SGSP](http://www.ifaamas.org/Proceedings/aamas2015/aamas/p1371.pdf), in addition to GSSG versions of the single-agent reinforcement learning algorithms implemented in PyBrain.
22 | 
23 | 
24 | # How to use this package?
25 | To use this package, we need to 1) install all requirements, 2) implement a GSSG that specifies the target domain, and 3) apply MARL to the implemented GSSG to learn agent policies.
26 | 
27 | ## 1. Install Requirements
28 | * Python 2.7.6
29 | * Numpy 1.11.0rc1+
30 | * Scipy 0.17.0+
31 | * PyBrain 0.3.3+
32 | 
33 | ## 2. Implement GSSGs
34 | Implement a class extending EpisodicTaskSG (pybrainSG.rl.environments.episodicSG) and a class extending Environment (pybrain.rl.environments.environment).
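For instance, a one-shot coin-matching game could be modeled roughly as follows. This is a minimal sketch patterned after `staticgame.py` in pybrainSG.rl.examples.tasks; the `CoinGame`/`CoinGameTask` names and the payoff rule are made up for illustration.

```python
# Minimal GSSG sketch (hypothetical example; names and payoffs are illustrative only).
from pybrain.rl.environments import Environment
from pybrainSG.rl.environments.episodicSG import EpisodicTaskSG
import numpy as np

class CoinGame(Environment):
    '''Two agents pick 0 or 1 simultaneously; matching choices are rewarded.'''
    availableActions = [0, 1]
    jointReward = None

    def getSensors(self):
        return np.ones((2, 1))  # static state (i.e., no state transition)

    def performAction(self, jointAction):
        # resolve the joint action of both agents into a joint reward
        if jointAction[0] == jointAction[1]:
            self.jointReward = np.array([1.0, 1.0])
        else:
            self.jointReward = np.array([-1.0, -1.0])

    def reset(self):
        self.jointReward = None

    def getJointReward(self):
        return self.jointReward

class CoinGameTask(EpisodicTaskSG):
    '''Episode ends after a single joint decision, as in StaticGameTask.'''
    isGameFinished = False

    def __init__(self):
        EpisodicTaskSG.__init__(self, CoinGame())

    def reset(self):
        EpisodicTaskSG.reset(self)
        self.isGameFinished = False

    def isFinished(self):
        return self.isGameFinished

    def getReward(self):
        self.isGameFinished = True  # one-shot game: finish after the first joint action
        return self.env.getJointReward()
```

StaticGameTask and SimpleMatrixGame in `staticgame.py` follow the same pattern, with explicit payoff matrices instead of the matching rule used here.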
35 | Some example implementations are provided in the following package:
36 | 
37 | * pybrainSG.rl.examples.tasks
38 | 
39 | For example, "gridgames.py" provides examples for the grid-world domain, and "staticgame.py" provides examples for the bi-matrix game domain.
40 | 
41 | ## 3. Apply MARL to implemented GSSGs
42 | To apply MARL to an implemented GSSG, we need to construct an agent set and an experiment.
43 | You can find construction examples in the following package:
44 | 
45 | * pybrainSG.rl.examples
46 | 
47 | For example, "example_gridgames.py" in the "ceq" package shows how to use one of the Correlated-Q learning implementations in the grid game domain.
48 | 
49 | # Future work
50 | Refactoring and cleaning up the source code.
51 | Introducing inverse reinforcement learning to estimate other agents' reward structures.
52 | 
53 | # Author
54 | [Takuya Hiraoka](http://isw3.naist.jp/~takuya-h/)
55 | 
--------------------------------------------------------------------------------
/pybrainSG/rl/leaners/valuebased/nfqSG.py:
--------------------------------------------------------------------------------
1 | from scipy import r_
2 | from pybrain.rl.learners.valuebased.valuebased import ValueBasedLearner
3 | from pybrain.datasets import SupervisedDataSet
4 | from pybrain.supervised.trainers.rprop import RPropMinusTrainer
5 | from pybrain.utilities import one_to_n
6 | from pybrainSG.rl.leaners.valuebased.learnerfaSG import IndexableValueBasedLearner
7 | from pybrain.tools.shortcuts import buildNetwork
8 | 
9 | class NFQ_SG(IndexableValueBasedLearner):#Mod. version
10 |     """
11 |     Stochastic game version of Neural-fitted Q-iteration
12 |     """
13 | 
14 |     def __init__(self, maxEpochs=20, indexOfAgent=None,):
15 |         ValueBasedLearner.__init__(self)
16 |         self.gamma = 0.9
17 |         self.maxEpochs = maxEpochs
18 |         #
19 |         self.ownerAgentProperties["requireOtherAgentsState"]=False
20 |         self.ownerAgentProperties["requireJointAction"]=False
21 |         self.ownerAgentProperties["requireJointReward"]=False
22 |         self.isFirstLerning=True
23 | 
24 |     def learn(self):
25 |         # convert reinforcement dataset to NFQ supervised dataset
26 |         supervised = SupervisedDataSet(self.module.network.indim, 1)
27 |         for seq in self.dataset[self.indexOfAgent]:
28 |             lastexperience = None
29 |             for state, action, reward in seq:
30 |                 if not lastexperience:
31 |                     # delay each experience in sequence by one
32 |                     lastexperience = (state, action, reward)
33 |                     continue
34 | 
35 |                 # use experience from last timestep to do Q update
36 |                 (state_, action_, reward_) = lastexperience
37 | 
38 |                 Q = self.module.getValue(state_, action_[0])
39 | 
40 |                 inp = r_[state_, one_to_n(action_[0], self.module.numActions)]
41 |                 if self.isFirstLerning:
42 |                     tgt = reward_
43 |                 else:
44 |                     tgt = Q + 0.5*(reward_ + self.gamma * max(self.module.getActionValues(state)) - Q)
45 |                 supervised.addSample(inp, tgt)
46 | 
47 |                 #for reward normalization
48 | 
49 |                 # update last experience with current one
50 |                 lastexperience = (state, action, reward)
51 | 
52 |         #Re-building networks is required in multiprocessing environments.
53 | params=self.module.network.params 54 | self.module.network=buildNetwork(self.module.indim+self.module.numActions, 55 | self.module.indim+self.module.numActions, 56 | 1) 57 | self.module.network._setParameters(params) 58 | 59 | # train module with backprop/rprop on dataset 60 | trainer = RPropMinusTrainer(self.module.network, dataset=supervised, batchlearning=True, verbose=False)#, weightdecay=0.01) 61 | trainer.trainUntilConvergence(maxEpochs=self.maxEpochs) 62 | if self.isFirstLerning: 63 | self.isFirstLerning=False 64 | # alternative: backprop, was not as stable as rprop 65 | # trainer = BackpropTrainer(self.module.network, dataset=supervised, learningrate=0.005, batchlearning=True, verbose=True) 66 | # trainer.trainUntilConvergence(maxEpochs=self.maxEpochs) 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /pybrainSG/rl/agents/nfceqa.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/02/28 3 | 4 | @author: takuya-hv2 5 | ''' 6 | from pybrainSG.rl.agents.loggingSG import LoggingAgentSG 7 | from pybrain.utilities import drawIndex 8 | from pybrainSG.rl.leaners.valuebased.ceq import NFCEQ 9 | from scipy import array 10 | 11 | class NFCEQ_Agent(LoggingAgentSG): 12 | """ 13 | Agent based on NFCEQ put on: 14 | pybrainSG.rl.leaners.valuebased.ceq 15 | """ 16 | init_exploration = 0.3 # aka epsilon 17 | exploration_decay = 0.98 # per episode 18 | 19 | init_temperature = 1. 20 | temperature_decay = 0.99 # per episode 21 | 22 | # flags for exploration strategies 23 | epsilonGreedy = True 24 | learning = True 25 | greedy = False 26 | 27 | def __init__(self, learner, num_features, num_actions, num_agents, index, **kwargs): 28 | assert isinstance(learner, NFCEQ), "learner should be instance of NFCEQ." 29 | self.learner = learner 30 | LoggingAgentSG.__init__(self, num_features, num_actions, num_agents, index, **kwargs) 31 | # if learner is available, tell it the module and data 32 | if self.learner is not None: 33 | self.learner.dataset = self.history 34 | self.learning = True 35 | self.learner._behaviorPolicy = self._actionProbs 36 | self.reset() 37 | 38 | self.agentProperties["requireOtherAgentsState"]=False 39 | self.agentProperties["requireJointAction"]=True 40 | self.agentProperties["requireJointReward"]=True 41 | for prop in self.learner.getProperty().keys(): 42 | if learner.getProperty()[prop]: 43 | assert self.getProperty()[prop], "learners property should same to that of agents." 44 | 45 | def _actionProbs(self, state): 46 | if self.greedy: 47 | return self.learner._greedyPolicy(state) 48 | elif self.epsilonGreedy: 49 | return (self.learner._greedyPolicy(state) * (1 - self._expl_proportion) 50 | + self._expl_proportion / float(self.learner.num_actions[self.indexOfAgent])) 51 | else: 52 | return self.learner._boltzmannPolicy(state, self._temperature) 53 | 54 | def getAction(self): 55 | self.lastaction = drawIndex(self._actionProbs(self.lastobs), True) 56 | return array([self.lastaction]) 57 | 58 | def integrateObservation(self, obs): 59 | LoggingAgentSG.integrateObservation(self, obs) 60 | 61 | def reset(self): 62 | LoggingAgentSG.reset(self) 63 | self._temperature = self.init_temperature 64 | self._expl_proportion = self.init_exploration 65 | self.learner.reset() 66 | self.newEpisode() 67 | 68 | def newEpisode(self): 69 | """ Indicate the beginning of a new episode in the training cycle. 
""" 70 | if self.logging: 71 | for i in range(self.numAgents): 72 | self.history[i].newSequence() 73 | if self.learning and not self.learner.batchMode: 74 | self.learner.newEpisode() 75 | else: 76 | self._temperature *= self.temperature_decay 77 | self._expl_proportion *= self.exploration_decay 78 | self.learner.newEpisode() 79 | 80 | 81 | def learn(self,episodes): 82 | assert isinstance(self.learner,NFCEQ), "learner should be an instance of CEQ-NFQ" 83 | self.learner.learn() 84 | 85 | def setIndexOfAgent(self,index): 86 | """ set index to agent. 87 | :key index: index of agent 88 | :type index: integer 89 | """ 90 | super(NFCEQ_Agent, self).setIndexOfAgent(index) 91 | self.learner.setIndexOfAgent(index) 92 | -------------------------------------------------------------------------------- /pybrainSG/rl/agents/learningSG.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/02/19 3 | 4 | @author: takuya-hv2 5 | ''' 6 | from pybrainSG.rl.agents.loggingSG import LoggingAgentSG 7 | from pybrainSG.rl.leaners.valuebased.indexablevaluebased import IndexableValueBasedLearner 8 | 9 | class LearningAgentSG(LoggingAgentSG): 10 | """ 11 | Variation of LearningAgent (pybrain.rl.agents.learning) for stochastic game, 12 | which can use some single-agent reinforcement learnings (currently only NFQ) put on: 13 | pybrainSG.rl.leaners.valuebased.nfqSG 14 | """ 15 | 16 | def __init__(self, module, num_features, num_actions, num_agents, index, learner): 17 | """ 18 | :key module: the acting module 19 | :key learner: the learner (optional) """ 20 | assert isinstance(learner, IndexableValueBasedLearner), "learner should be indexable." 21 | self.module = module 22 | self.learner = learner 23 | LoggingAgentSG.__init__(self, num_features, num_actions, num_agents,index) 24 | 25 | # if learner is available, tell it the module and data 26 | if self.learner is not None: 27 | self.learner.module = self.module 28 | self.learner.dataset = self.history 29 | 30 | self.learning = True 31 | 32 | self.agentProperties["requireOtherAgentsState"]=False 33 | self.agentProperties["requireJointAction"]=False 34 | self.agentProperties["requireJointReward"]=False 35 | #parity check 36 | for prop in self.learner.getProperty().keys(): 37 | if learner.getProperty()[prop]: 38 | assert self.getProperty()[prop], "learners property should same to that of agents." 39 | 40 | def _getLearning(self): 41 | """ Return whether the agent currently learns from experience or not. """ 42 | return self.__learning 43 | 44 | 45 | def _setLearning(self, flag): 46 | """ Set whether or not the agent should learn from its experience """ 47 | if self.learner is not None: 48 | self.__learning = flag 49 | else: 50 | self.__learning = False 51 | 52 | learning = property(_getLearning, _setLearning) 53 | 54 | 55 | def getAction(self): 56 | """ Activate the module with the last observation, add the exploration from 57 | the explorer object and store the result as last action. """ 58 | LoggingAgentSG.getAction(self) 59 | 60 | self.lastaction = self.module.activate(self.lastobs) 61 | 62 | if self.learning: 63 | self.lastaction = self.learner.explore(self.lastobs, self.lastaction) 64 | 65 | return self.lastaction 66 | 67 | 68 | def newEpisode(self): 69 | """ Indicate the beginning of a new episode in the training cycle. """ 70 | # reset the module when a new episode starts. 
71 | self.module.reset() 72 | 73 | if self.logging: 74 | for i in range(self.numAgents): 75 | self.history[i].newSequence() 76 | 77 | # inform learner about the start of a new episode 78 | if self.learning: 79 | self.learner.newEpisode() 80 | 81 | def reset(self): 82 | """ Clear the history of the agent and resets the module and learner. """ 83 | LoggingAgentSG.reset(self) 84 | self.module.reset() 85 | if self.learning: 86 | self.learner.reset() 87 | 88 | 89 | def learn(self, episodes=1): 90 | """ Call the learner's learn method, which has access to both module and history. """ 91 | if self.learning: 92 | self.learner.learnEpisodes(episodes) 93 | 94 | def setIndexOfAgent(self,index): 95 | """ set index to agent. 96 | :key index: index of agent 97 | :type index: integer 98 | """ 99 | super(LearningAgentSG, self).setIndexOfAgent(index) 100 | self.learner.setIndexOfAgent(index) 101 | 102 | -------------------------------------------------------------------------------- /pybrainSG/rl/leaners/valuebased/learnerfaSG.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/02/19 3 | 4 | @author: takuya-hv2 5 | ''' 6 | from pybrainSG.rl.leaners.valuebased.indexablevaluebased import IndexableValueBasedLearner 7 | from scipy import zeros, dot, exp, clip, randn 8 | from pybrain.utilities import r_argmax, setAllArgs 9 | 10 | class LinearFALearnerSG(IndexableValueBasedLearner): 11 | """ 12 | Stochastic game version of LinearFALearner 13 | """ 14 | 15 | learningRate = 0.5 # aka alpha: make sure this is being decreased by calls from the learning agent! 16 | learningRateDecay = 100 # aka n_0, but counting decay-calls 17 | 18 | randomInit = True 19 | 20 | rewardDiscount = 0.99 # aka gamma 21 | 22 | batchMode = False 23 | passNextAction = False # for the _updateWeights method 24 | # 25 | 26 | 27 | def __init__(self, num_features, num_actions, indexOfAgent=None, **kwargs): 28 | IndexableValueBasedLearner.__init__(self, indexOfAgent) 29 | setAllArgs(self, kwargs) 30 | self.explorer = None 31 | self.indexOfAgent=indexOfAgent 32 | self.num_actions = num_actions 33 | self.num_features = num_features 34 | if self.randomInit: 35 | self._theta = randn(self.num_actions, self.num_features) / 10. 36 | else: 37 | self._theta = zeros((self.num_actions, self.num_features)) 38 | self._additionalInit() 39 | self._behaviorPolicy = self._boltzmannPolicy 40 | self.reset() 41 | # 42 | self.ownerAgentProperties["requireOtherAgentsState"]=False 43 | self.ownerAgentProperties["requireJointAction"]=False 44 | self.ownerAgentProperties["requireJointReward"]=False 45 | 46 | 47 | def _additionalInit(self): 48 | pass 49 | 50 | def _qValues(self, state): 51 | """ Return vector of q-values for all actions, 52 | given the state(-features). """ 53 | return dot(self._theta, state) 54 | 55 | def _greedyAction(self, state): 56 | return r_argmax(self._qValues(state)) 57 | 58 | def _greedyPolicy(self, state): 59 | tmp = zeros(self.num_actions) 60 | tmp[self._greedyAction(state)] = 1 61 | return tmp 62 | 63 | def _boltzmannPolicy(self, state, temperature=1.): 64 | tmp = self._qValues(state) 65 | return LinearFALearnerSG._boltzmannProbs(tmp, temperature) 66 | 67 | @staticmethod 68 | def _boltzmannProbs(qvalues, temperature=1.): 69 | if temperature == 0: 70 | tmp = zeros(len(qvalues)) 71 | tmp[r_argmax(qvalues)] = 1. 
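        # temperature > 0: numerically stable softmax over the Q-values -- divide by the temperature,
        # shift by the maximum, clip the exponent to [-20, 0] so exp() cannot overflow, then normalize
        # to a probability vector.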
72 | else: 73 | tmp = qvalues / temperature 74 | tmp -= max(tmp) 75 | tmp = exp(clip(tmp, -20, 0)) 76 | return tmp / sum(tmp) 77 | 78 | def reset(self): 79 | IndexableValueBasedLearner.reset(self) 80 | self._callcount = 0 81 | self.newEpisode() 82 | 83 | def newEpisode(self): 84 | IndexableValueBasedLearner.newEpisode(self) 85 | self._callcount += 1 86 | self.learningRate *= ((self.learningRateDecay + self._callcount) 87 | / (self.learningRateDecay + self._callcount + 1.)) 88 | 89 | 90 | class Q_LinFA_SG(LinearFALearnerSG): 91 | """ Standard Q-learning with linear FA. """ 92 | 93 | def _updateWeights(self, state, action, reward, next_state): 94 | """ state and next_state are vectors, action is an integer. """ 95 | td_error = reward + self.rewardDiscount * max(dot(self._theta, next_state)) - dot(self._theta[action], state) 96 | #print(action, reward, td_error,self._theta[action], state, dot(self._theta[action], state)) 97 | #print(self.learningRate * td_error * state) 98 | #print() 99 | self._theta[action] += self.learningRate * td_error * state 100 | 101 | -------------------------------------------------------------------------------- /pybrainSG/rl/agents/sgspa.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/03/10 3 | 4 | @author: takuya-hv2 5 | ''' 6 | from pybrainSG.rl.agents.loggingSG import LoggingAgentSG 7 | from pybrain.utilities import drawIndex 8 | from pybrainSG.rl.leaners.valuebased.indexablevaluebased import IndexableValueBasedLearner 9 | from scipy import array 10 | import numpy as np 11 | 12 | #Implmenting now 13 | class SGSP_Agent(LoggingAgentSG): 14 | """ 15 | Agent based on SPSG RL algorithms put on: 16 | pybrainSG.rl.leaners.valuebased.spsg 17 | """ 18 | init_exploration = 0.005 # aka epsilon 19 | exploration_decay = 0.9999 # per episode 20 | 21 | # flags for exploration strategies 22 | epsilonGreedy = True 23 | learning = True 24 | 25 | def __init__(self, learner, num_actions, numAgents, index, **kwargs): 26 | assert isinstance(learner, IndexableValueBasedLearner), "learner should be indexable." 27 | self.learner = learner 28 | LoggingAgentSG.__init__(self, np.ones(numAgents)*learner.num_features, num_actions, numAgents, index, **kwargs) 29 | self.learner._behaviorPolicy = self._actionProbs 30 | self.reset() 31 | self.agentProperties["requireOtherAgentsState"]=False 32 | self.agentProperties["requireJointAction"]=True 33 | self.agentProperties["requireJointReward"]=True 34 | for prop in self.learner.getProperty().keys(): 35 | if learner.getProperty()[prop]: 36 | assert self.getProperty()[prop], "learners property should same to that of agents." 
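        # Note that the check above is one-directional: every property the learner requires must also
        # be enabled on the agent, while the agent itself may require additional information.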
37 | 38 | def _actionProbs(self, state): 39 | if not self.epsilonGreedy: 40 | return self.learner._softmaxPolicy(state) 41 | elif self.epsilonGreedy: 42 | return (self.learner._softmaxPolicy(state) * (1 - self._expl_proportion) 43 | + self._expl_proportion / float(self.learner.num_actions[self.indexOfAgent])) 44 | 45 | def getAction(self): 46 | self.lastaction = drawIndex(self._actionProbs(self.lastobs), True) 47 | if self.learning and not self.learner.batchMode and self._oaro is not None: 48 | self.learner._updateWeights(*(self._oaro + [self.lastaction])) 49 | self._oaro = None 50 | # print "Agent " + str(self.indexOfAgent) + ": " + str(self.lastaction) 51 | return array([self.lastaction]) 52 | 53 | def integrateObservation(self, obs): 54 | if self.learning and not self.learner.batchMode and self.lastobs is not None: 55 | if self.learner.passNextAction: 56 | self._oaro = [self.lastobs, self.lastaction, self.lastreward, obs] 57 | else: 58 | self.learner._updateWeights(self.lastobs, self.lastaction, self.lastreward, obs) 59 | LoggingAgentSG.integrateObservation(self, obs) 60 | 61 | def reset(self): 62 | LoggingAgentSG.reset(self) 63 | self._expl_proportion = self.init_exploration 64 | self.learner.reset() 65 | self._oaro = None 66 | self.newEpisode() 67 | 68 | def newEpisode(self): 69 | if self.logging: 70 | for i in range(self.numAgents): 71 | self.history[i].newSequence() 72 | if self.learning and not self.learner.batchMode: 73 | self.learner.newEpisode() 74 | else: 75 | self._expl_proportion *= self.exploration_decay 76 | self.learner.newEpisode() 77 | 78 | def learn(self): 79 | if not self.learning: 80 | return 81 | if not self.learner.batchMode: 82 | print('Learning is done online, and already finished.') 83 | return 84 | for seq in self.history[self.indexOfAgent]: 85 | for obs, action, reward in seq: 86 | if self.laststate is not None: 87 | self.learner._updateWeights(self.lastobs, self.lastaction, self.lastreward, obs) 88 | self.lastobs = obs 89 | self.lastaction = action[0] 90 | self.lastreward = reward 91 | self.learner.newEpisode() 92 | 93 | def setIndexOfAgent(self,index): 94 | """ indexing agent and its learner. 95 | :key index: index of agent 96 | :type index: integer 97 | """ 98 | super(SGSP_Agent, self).setIndexOfAgent(index) 99 | self.learner.setIndexOfAgent(index) 100 | -------------------------------------------------------------------------------- /pybrainSG/rl/agents/faphc.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/02/19 3 | 4 | @author: takuya-hv2 5 | ''' 6 | from pybrainSG.rl.agents.loggingSG import LoggingAgentSG 7 | from pybrain.utilities import drawIndex 8 | from pybrainSG.rl.leaners.valuebased.indexablevaluebased import IndexableValueBasedLearner 9 | from scipy import array 10 | import numpy as np 11 | class PHC_Agent(LoggingAgentSG): 12 | """ 13 | Agent based on PHC RL algorithms put on: 14 | pybrainSG.rl.leaners.valuebased.phc 15 | """ 16 | init_exploration = 0.01 # aka epsilon 17 | exploration_decay = 0.99 # per episode 18 | 19 | init_temperature = 1. 20 | temperature_decay = 0.99 # per episode 21 | 22 | # flags for exploration strategies 23 | epsilonGreedy = True 24 | learning = True 25 | 26 | def __init__(self, learner, numAgents, index, **kwargs): 27 | assert isinstance(learner, IndexableValueBasedLearner), "learner should be indexable." 
28 | self.learner = learner 29 | LoggingAgentSG.__init__(self, np.ones(numAgents)*learner.num_features, np.ones(numAgents), numAgents, index, **kwargs) 30 | self.learner._behaviorPolicy = self._actionProbs 31 | self.reset() 32 | self.agentProperties["requireOtherAgentsState"]=False 33 | self.agentProperties["requireJointAction"]=False 34 | self.agentProperties["requireJointReward"]=False 35 | for prop in self.learner.getProperty().keys(): 36 | if learner.getProperty()[prop]: 37 | assert self.getProperty()[prop], "learners property should same to that of agents." 38 | 39 | def _actionProbs(self, state): 40 | if not self.epsilonGreedy: 41 | return self.learner._softmaxPolicy(state) 42 | elif self.epsilonGreedy: 43 | return (self.learner._softmaxPolicy(state) * (1 - self._expl_proportion) 44 | + self._expl_proportion / float(self.learner.num_actions)) 45 | 46 | def getAction(self): 47 | self.lastaction = drawIndex(self._actionProbs(self.lastobs), True) 48 | if self.learning and not self.learner.batchMode and self._oaro is not None: 49 | self.learner._updateWeights(*(self._oaro + [self.lastaction])) 50 | self._oaro = None 51 | return array([self.lastaction]) 52 | 53 | def integrateObservation(self, obs): 54 | if self.learning and not self.learner.batchMode and self.lastobs is not None: 55 | if self.learner.passNextAction: 56 | self._oaro = [self.lastobs, self.lastaction, self.lastreward, obs] 57 | else: 58 | self.learner._updateWeights(self.lastobs, self.lastaction, self.lastreward, obs) 59 | LoggingAgentSG.integrateObservation(self, obs) 60 | 61 | def reset(self): 62 | LoggingAgentSG.reset(self) 63 | self._temperature = self.init_temperature 64 | self._expl_proportion = self.init_exploration 65 | self.learner.reset() 66 | self._oaro = None 67 | self.newEpisode() 68 | 69 | def newEpisode(self): 70 | if self.logging: 71 | for i in range(self.numAgents): 72 | self.history[i].newSequence() 73 | if self.learning and not self.learner.batchMode: 74 | self.learner.newEpisode() 75 | else: 76 | self._temperature *= self.temperature_decay 77 | self._expl_proportion *= self.exploration_decay 78 | self.learner.newEpisode() 79 | 80 | def learn(self): 81 | if not self.learning: 82 | return 83 | if not self.learner.batchMode: 84 | print('Learning is done online, and already finished.') 85 | return 86 | for seq in self.history[self.indexOfAgent]: 87 | for obs, action, reward in seq: 88 | if self.laststate is not None: 89 | self.learner._updateWeights(self.lastobs, self.lastaction, self.lastreward, obs) 90 | self.lastobs = obs 91 | self.lastaction = action[0] 92 | self.lastreward = reward 93 | self.learner.newEpisode() 94 | 95 | def setIndexOfAgent(self,index): 96 | """ indexing agent and its learner. 
97 | :key index: index of agent 98 | :type index: integer 99 | """ 100 | super(PHC_Agent, self).setIndexOfAgent(index) 101 | self.learner.setIndexOfAgent(index) 102 | -------------------------------------------------------------------------------- /pybrainSG/rl/agents/multiAgent.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/02/19 3 | 4 | @author: takuya-hv2 5 | ''' 6 | 7 | from pybrain.rl.agents.agent import Agent 8 | from pybrainSG.rl.agents.indexable import IndexableAgent 9 | from pybrainSG.rl.agents.loggingSG import LoggingAgentSG 10 | import numpy as np 11 | from multiprocessing import Process, Queue 12 | import copy_reg 13 | import types 14 | 15 | def _pickle_method(m): 16 | if m.im_self is None: 17 | return getattr, (m.im_class, m.im_func.func_name) 18 | else: 19 | return getattr, (m.im_self, m.im_func.func_name) 20 | 21 | copy_reg.pickle(types.MethodType, _pickle_method) 22 | 23 | class MultiAgent(Agent): 24 | ''' 25 | This class defines set of agents. 26 | Each agent should be instance of IndexableAgent or its subclass. 27 | ''' 28 | agentSet=[] 29 | 30 | def __init__(self): 31 | Agent.__init__(self) 32 | self.agentSet=[] 33 | 34 | def integrateObservation(self, obs): 35 | """ Integrate the current observation of the environment. 36 | :arg obs: The last observation returned from the environment 37 | :type obs: by default, this is assumed to be a numpy array of doubles 38 | """ 39 | for index in range(len(self.agentSet)): 40 | if self.agentSet[index].getProperty()["requireOtherAgentsState"]: 41 | self.agentSet[index].integrateObservation(obs) 42 | else: 43 | self.agentSet[index].integrateObservation(obs[index]) 44 | 45 | def getJointAction(self): 46 | """ Return a chosen joint-action. 47 | :rtype: by default, this is assumed to ba a numpy array of integers that correspond to particular action at each. 48 | """ 49 | jointAction=np.zeros(len(self.agentSet), dtype=np.int) 50 | for index in range(len(self.agentSet)): 51 | jointAction[index]=self.agentSet[index].getAction() 52 | for index in range(len(self.agentSet)): 53 | if isinstance(self.agentSet[index], LoggingAgentSG) and self.agentSet[index].getProperty()["requireJointAction"]: 54 | self.agentSet[index].lastaction=jointAction 55 | else: 56 | self.agentSet[index].lastaction=jointAction[index] 57 | return jointAction 58 | 59 | def _getAction(self,q, agent, index): 60 | act=agent.getAction() 61 | q.put([index,act]) 62 | 63 | def giveJointReward(self, r): 64 | """ give joint-teward to all agents. 
65 | :key r: joint reward 66 | :type r: numpy array of doubles 67 | """ 68 | for index in range(len(self.agentSet)): 69 | if self.agentSet[index].getProperty()["requireJointReward"]: 70 | self.agentSet[index].giveReward(r) 71 | else: 72 | self.agentSet[index].giveReward(r[index]) 73 | 74 | def reset(self): 75 | for agent in self.agentSet: 76 | agent.reset() 77 | 78 | def learn(self, episodes=1): 79 | procs=[] 80 | i=0 81 | qResult=Queue() 82 | for agent in self.agentSet: 83 | procs.append(Process(target=self._paraLearn, kwargs={"agent":agent,"episodes":episodes,"qResult":qResult})) 84 | i+=1 85 | for proc in procs: 86 | proc.start() 87 | for _ in range(len(self.agentSet)): 88 | res=qResult.get() 89 | self.agentSet[res[0]]=res[1] 90 | 91 | def _paraLearn(self, agent, episodes, qResult): 92 | agent.learn(episodes) 93 | qResult.put([agent.indexOfAgent, agent]) 94 | 95 | def newEpisode(self): 96 | for agent in self.agentSet: 97 | agent.newEpisode() 98 | 99 | def addAgent(self, agent): 100 | assert isinstance(agent, IndexableAgent), "agent should be IndxableAgent class or its subclass." 101 | assert agent.indexOfAgent is not None, "Index should be identified" 102 | if len(self.agentSet) ==0: 103 | assert agent.indexOfAgent==0, "Illegal indexing." 104 | else: 105 | ind=0 106 | for elem in self.agentSet: 107 | assert ind == (elem.indexOfAgent), "Illegal indexing." 108 | ind+=1 109 | assert agent.indexOfAgent==ind, "Illegal indexing." 110 | self.agentSet.append(agent) 111 | 112 | def popAgent(self, index): 113 | agent=self.agentSet.pop(index) 114 | agent.setIndexOfAgent(None) 115 | 116 | -------------------------------------------------------------------------------- /pybrainSG/rl/examples/tasks/huntinggame.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/02/20 3 | 4 | @author: takuya-hv2 5 | ''' 6 | 7 | from pybrain.rl.environments import Environment 8 | from pybrainSG.rl.environments.episodicSG import EpisodicTaskSG 9 | import numpy as np 10 | 11 | #Integrated to gridgames.py in future. 12 | class HuntingGameTask(EpisodicTaskSG): 13 | ''''Agents hunt animals in grid world. If all agents gather into particular place where more than one animal stay, 14 | hunting is succeeded and agents are rewarded. 15 | Agents are punished as turn passes. 16 | ''' 17 | isGameFinished=False 18 | maximumTurn=10 19 | currentTurn=0 20 | def __init__(self,task=None): 21 | if task == None: 22 | task=HuntingGame() 23 | EpisodicTaskSG.__init__(self, task) 24 | 25 | def reset(self): 26 | EpisodicTaskSG.reset(self) 27 | self.isGameFinished=False 28 | self.currentTurn=0 29 | 30 | def isFinished(self): 31 | return self.isGameFinished 32 | 33 | def getReward(self): 34 | jointReward=self.env.getJointReward() 35 | if (self.env.getJointReward()[0] > 0): 36 | self.isGameFinished=True 37 | 38 | #Time pressure 39 | jointReward=jointReward-1 40 | if (self.currentTurn >= HuntingGameTask.maximumTurn): 41 | self.isGameFinished=True 42 | #print str(jointReward[0])#+", " + str(jointReward[1]) 43 | self.currentTurn+=1 44 | return jointReward 45 | 46 | 47 | class HuntingGame(Environment): 48 | availableActions=[0,1,2,3,4]#Corresponding to forward north, west, south, east, stay respectively. 
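    # Every agent receives the same observation from getSensors(): the flattened animal positions,
    # the flattened agent positions, and a constant bias feature of 1.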
49 | sizeofGlidWorld=3 50 | numberofAnimals=1 51 | numberofAgents=2 52 | animals=None 53 | agents=None 54 | 55 | def getSensors(self): 56 | for i in range(HuntingGame.numberofAnimals): 57 | if np.random.rand() > 0.8: 58 | self.animals[i]=self.__move__(self.animals[i], np.random.randint(5)) 59 | stateTemp1=np.append(self.animals.flatten(),self.agents.flatten()) 60 | stateTemp2=np.append(stateTemp1,np.ones(1)) 61 | stateTemp3=[] 62 | for _ in range(self.numberofAgents): 63 | stateTemp3.append(stateTemp2) 64 | 65 | return stateTemp3 66 | 67 | def performAction(self, action): 68 | for i in range(HuntingGame.numberofAgents): 69 | self.agents[i]=self.__move__(self.agents[i], action[i]) 70 | 71 | def isSucceedHunting(self): 72 | #return true only if all agent gather in one place where animal exists 73 | for i in range(HuntingGame.numberofAgents): 74 | for j in range(HuntingGame.numberofAgents): 75 | if(self.agents[i][0] != self.agents[j][0]) or (self.agents[i][1] != self.agents[j][1]): 76 | return False 77 | for k in range(HuntingGame.numberofAnimals): 78 | if(self.agents[0][0] == self.animals[k][0]) and (self.agents[0][1] == self.animals[k][1]): 79 | return True 80 | return False 81 | 82 | def __move__(self,position, forward): 83 | if forward == 0:#Move North 84 | position[1]+=1 85 | elif forward == 1:#Move west 86 | position[0]-=1 87 | elif forward == 2:#Move south 88 | position[1]-=1 89 | elif forward == 3:#Move east 90 | position[0]+=1 91 | elif forward == 4:#stay here 92 | return position 93 | else: 94 | assert False, "Unexpected action" 95 | 96 | if position[0] >= HuntingGame.sizeofGlidWorld: 97 | position[0]=HuntingGame.sizeofGlidWorld-1 98 | if position[0] < 0: 99 | position[0]=0 100 | if position[1] >= HuntingGame.sizeofGlidWorld: 101 | position[1]=HuntingGame.sizeofGlidWorld-1 102 | if position[1] < 0: 103 | position[1]=0 104 | return position 105 | 106 | def reset(self): 107 | # self.animals=np.random.randint(HuntingGame.sizeofGlidWorld,size=(HuntingGame.numberofAnimals,2)) 108 | self.animals=np.zeros((HuntingGame.numberofAnimals,2)) 109 | self.agents=np.random.randint(HuntingGame.sizeofGlidWorld,size=(HuntingGame.numberofAgents,2)) 110 | # self.agents=np.ones((HuntingGame.numberofAgents,2)) 111 | 112 | 113 | def getJointReward(self): 114 | if self.isSucceedHunting(): 115 | return np.ones(self.numberofAgents)*10 116 | return np.zeros(self.numberofAgents) 117 | 118 | 119 | 120 | 121 | -------------------------------------------------------------------------------- /pybrainSG/rl/agents/linearfaSG.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/02/19 3 | 4 | @author: takuya-hv2 5 | ''' 6 | from pybrainSG.rl.agents.loggingSG import LoggingAgentSG 7 | from pybrain.utilities import drawIndex 8 | from pybrainSG.rl.leaners.valuebased.indexablevaluebased import IndexableValueBasedLearner 9 | from scipy import array 10 | import numpy as np 11 | class LinearFA_AgentSG(LoggingAgentSG): 12 | """ 13 | Agent based on simple Q-learning put on: 14 | pybrainSG.rl.leaners.valuebased.learnerfaSG 15 | """ 16 | 17 | init_exploration = 0.1 # aka epsilon 18 | exploration_decay = 0.99 # per episode 19 | 20 | init_temperature = 1. 
21 | temperature_decay = 0.99 # per episode 22 | 23 | # flags for exploration strategies 24 | epsilonGreedy = False 25 | learning = True 26 | greedy = False 27 | 28 | def __init__(self, learner, num_features, num_actions, num_agents, index, **kwargs): 29 | assert isinstance(learner, IndexableValueBasedLearner), "learner should be indexable." 30 | self.learner = learner 31 | LoggingAgentSG.__init__(self, num_features, num_actions, num_agents, index, **kwargs) 32 | self.learner._behaviorPolicy = self._actionProbs 33 | self.reset() 34 | self.agentProperties["requireOtherAgentsState"]=False 35 | self.agentProperties["requireJointAction"]=False 36 | self.agentProperties["requireJointReward"]=False 37 | for prop in self.learner.getProperty().keys(): 38 | if learner.getProperty()[prop]: 39 | assert self.getProperty()[prop], "learners property should same to that of agents." 40 | 41 | def _actionProbs(self, state): 42 | if self.greedy: 43 | return self.learner._greedyPolicy(state) 44 | elif self.epsilonGreedy: 45 | return (self.learner._greedyPolicy(state) * (1 - self._expl_proportion) 46 | + self._expl_proportion / float(self.learner.num_actions)) 47 | else: 48 | return self.learner._boltzmannPolicy(state, self._temperature) 49 | 50 | def getAction(self): 51 | self.lastaction = drawIndex(self._actionProbs(self.lastobs), True) 52 | if self.learning and not self.learner.batchMode and self._oaro is not None: 53 | self.learner._updateWeights(*(self._oaro + [self.lastaction])) 54 | self._oaro = None 55 | return array([self.lastaction]) 56 | 57 | def integrateObservation(self, obs): 58 | if self.learning and not self.learner.batchMode and self.lastobs is not None: 59 | if self.learner.passNextAction: 60 | self._oaro = [self.lastobs, self.lastaction, self.lastreward, obs] 61 | else: 62 | self.learner._updateWeights(self.lastobs, self.lastaction, self.lastreward, obs) 63 | LoggingAgentSG.integrateObservation(self, obs) 64 | 65 | def reset(self): 66 | LoggingAgentSG.reset(self) 67 | self._temperature = self.init_temperature 68 | self._expl_proportion = self.init_exploration 69 | self.learner.reset() 70 | self._oaro = None 71 | self.newEpisode() 72 | 73 | def newEpisode(self): 74 | if self.logging: 75 | for i in range(self.numAgents): 76 | self.history[i].newSequence() 77 | if self.learning and not self.learner.batchMode: 78 | self.learner.newEpisode() 79 | else: 80 | self._temperature *= self.temperature_decay 81 | self._expl_proportion *= self.exploration_decay 82 | self.learner.newEpisode() 83 | 84 | 85 | def learn(self): 86 | if not self.learning: 87 | return 88 | if not self.learner.batchMode: 89 | print('Learning is done online, and already finished.') 90 | return 91 | for seq in self.history[self.indexOfAgent]: 92 | for obs, action, reward in seq: 93 | if self.laststate is not None: 94 | self.learner._updateWeights(self.lastobs, self.lastaction, self.lastreward, obs) 95 | self.lastobs = obs 96 | self.lastaction = action[0] 97 | self.lastreward = reward 98 | self.learner.newEpisode() 99 | 100 | def setIndexOfAgent(self,index): 101 | """ indexing agent and its learner. 
102 | :key index: index of agent 103 | :type index: integer 104 | """ 105 | super(LinearFA_AgentSG, self).setIndexOfAgent(index) 106 | self.learner.setIndexOfAgent(index) 107 | -------------------------------------------------------------------------------- /pybrainSG/rl/agents/ceqa.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/02/26 3 | 4 | @author: takuya-hv2 5 | ''' 6 | from pybrainSG.rl.agents.loggingSG import LoggingAgentSG 7 | from pybrain.utilities import drawIndex 8 | from pybrainSG.rl.leaners.valuebased.indexablevaluebased import IndexableValueBasedLearner 9 | from scipy import array 10 | import numpy as np 11 | class CEQ_Agent(LoggingAgentSG): 12 | """ 13 | Agent based on CE-Q RL algorithms put on: 14 | pybrainSG.rl.leaners.valuebased.ceq 15 | """ 16 | init_exploration = 0.3 # aka epsilon 17 | exploration_decay = 0.99 # per episode 18 | 19 | init_temperature = 1. 20 | temperature_decay = 0.99 # per episode 21 | 22 | # flags for exploration strategies 23 | epsilonGreedy = True 24 | learning = True 25 | greedy = False 26 | 27 | def __init__(self, learner, num_features, num_actions, num_agents, index, **kwargs): 28 | self.learner = learner 29 | LoggingAgentSG.__init__(self, np.ones(num_agents, dtype=np.int8)*num_features, num_actions, num_agents, index, **kwargs) 30 | assert isinstance(learner, IndexableValueBasedLearner), "learner should be indexable." 31 | self.learner._behaviorPolicy = self._actionProbs 32 | self.reset() 33 | self.agentProperties["requireOtherAgentsState"]=False 34 | self.agentProperties["requireJointAction"]=True 35 | self.agentProperties["requireJointReward"]=True 36 | for prop in self.learner.getProperty().keys(): 37 | if learner.getProperty()[prop]: 38 | assert self.getProperty()[prop], "learners property should same to that of agents." 
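        # Action selection below follows the exploration flags: greedy uses the learner's greedy
        # policy, epsilonGreedy mixes it with uniform exploration, and otherwise a Boltzmann policy
        # with the current temperature is used.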
39 | 40 | def _actionProbs(self, state): 41 | if self.greedy: 42 | return self.learner._greedyPolicy(state) 43 | elif self.epsilonGreedy: 44 | return (self.learner._greedyPolicy(state) * (1 - self._expl_proportion) 45 | + self._expl_proportion / float(self.learner.num_actions[self.indexOfAgent])) 46 | else: 47 | return self.learner._boltzmannPolicy(state, self._temperature) 48 | 49 | def getAction(self): 50 | self.lastaction = drawIndex(self._actionProbs(self.lastobs), True) 51 | if self.learning and not self.learner.batchMode and self._oaro is not None: 52 | self.learner._updateWeights(*(self._oaro + [self.lastaction])) 53 | self._oaro = None 54 | return array([self.lastaction]) 55 | 56 | def integrateObservation(self, obs): 57 | if self.learning and not self.learner.batchMode and self.lastobs is not None: 58 | if self.learner.passNextAction: 59 | self._oaro = [self.lastobs, self.lastaction, self.lastreward, obs] 60 | else: 61 | self.learner._updateWeights(self.lastobs, self.lastaction, self.lastreward, obs) 62 | LoggingAgentSG.integrateObservation(self, obs) 63 | 64 | def reset(self): 65 | LoggingAgentSG.reset(self) 66 | self._temperature = self.init_temperature 67 | self._expl_proportion = self.init_exploration 68 | self.learner.reset() 69 | self._oaro = None 70 | self.newEpisode() 71 | 72 | def newEpisode(self): 73 | if self.logging: 74 | for i in range(self.numAgents): 75 | self.history[i].newSequence() 76 | if self.learning and not self.learner.batchMode: 77 | self.learner.newEpisode() 78 | else: 79 | self._temperature *= self.temperature_decay 80 | self._expl_proportion *= self.exploration_decay 81 | self.learner.newEpisode() 82 | 83 | def learn(self): 84 | if not self.learning: 85 | return 86 | if not self.learner.batchMode: 87 | print('Learning is done online, and already finished.') 88 | return 89 | for seq in self.history[self.indexOfAgent]: 90 | for obs, action, reward in seq: 91 | if self.laststate is not None: 92 | self.learner._updateWeights(self.lastobs, self.lastaction, self.lastreward, obs) 93 | self.lastobs = obs 94 | self.lastaction = action[0] 95 | self.lastreward = reward 96 | self.learner.newEpisode() 97 | 98 | def setIndexOfAgent(self,index): 99 | """ indexing agent and its learner. 100 | :key index: index of agent 101 | :type index: integer 102 | """ 103 | super(CEQ_Agent, self).setIndexOfAgent(index) 104 | self.learner.setIndexOfAgent(index) 105 | -------------------------------------------------------------------------------- /pybrainSG/rl/agents/loggingSG.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/02/19 3 | 4 | @author: takuya-hv2 5 | ''' 6 | from pybrainSG.rl.agents.indexable import IndexableAgent 7 | from pybrain.datasets.reinforcement import ReinforcementDataSet 8 | import numpy as np 9 | class LoggingAgentSG(IndexableAgent): 10 | """ This agent stores actions, states, and rewards encountered during 11 | interaction with an environment in a ReinforcementDataSet (which is 12 | a variation of SequentialDataSet). 13 | The stored history can be used for learning and is erased by resetting 14 | the agent. It also makes sure that integrateObservation, getAction and 15 | giveReward are called in exactly that order. 16 | """ 17 | 18 | logging = True 19 | 20 | lastobs = None 21 | lastaction = None 22 | lastreward = None 23 | 24 | agentProperties={ 25 | "requireOtherAgentsState": None, #Define if agent require other agent state information. 
26 | "requireJointAction":None, #Define if agent require other agent action information. 27 | "requireJointReward":None}#Define if agent require other agent reward information. 28 | 29 | 30 | def __init__(self, indims, outdims, numAgents, index=None, **kwargs): 31 | IndexableAgent.__init__(self, index) 32 | self.setArgs(**kwargs) 33 | 34 | # store input and output dimension #input, output dimension for each agent 35 | self.indim = indims 36 | self.outdim = outdims 37 | self.numAgents=numAgents 38 | # create the history dataset 39 | self.history=[] 40 | for i in range(self.numAgents): 41 | self.history.append(ReinforcementDataSet(self.indim[i], self.outdim[i])) 42 | 43 | 44 | def integrateObservation(self, obs): 45 | """Step 1: store the observation received in a temporary variable until action is called and 46 | reward is given. """ 47 | self.lastobs = obs 48 | self.lastaction = None 49 | self.lastreward = None 50 | 51 | 52 | def getAction(self): 53 | """Step 2: store the action in a temporary variable until reward is given. """ 54 | assert self.lastobs != None 55 | assert self.lastaction == None 56 | assert self.lastreward == None 57 | # implement getAction in subclass and set self.lastaction 58 | 59 | 60 | def giveReward(self, r): 61 | """Step 3: store observation, action and reward in the history dataset. """ 62 | # step 3: assume that state and action have been set 63 | assert self.lastobs != None 64 | assert self.lastaction != None 65 | assert self.lastreward == None 66 | 67 | self.lastreward = r 68 | 69 | # store state, action and reward in dataset if logging is enabled 70 | if self.logging: 71 | for i in range(self.numAgents): 72 | tlastobs=None 73 | tlastaction=None 74 | tlastreward=None 75 | 76 | if self.getProperty()["requireOtherAgentsState"]: 77 | tlastobs=self.lastobs[i] 78 | elif i==self.indexOfAgent: 79 | tlastobs=self.lastobs 80 | else: 81 | tlastobs=np.zeros(self.indim[i]) 82 | if self.getProperty()["requireJointAction"]: 83 | tlastaction=self.lastaction[i] 84 | elif i==self.indexOfAgent: 85 | tlastaction=self.lastaction 86 | else: 87 | tlastaction=np.zeros(self.outdim[i]) 88 | if self.getProperty()["requireJointReward"]: 89 | tlastreward=self.lastreward[i] 90 | elif i==self.indexOfAgent: 91 | tlastreward=self.lastreward 92 | else: 93 | tlastreward=np.zeros(1) 94 | self.history[i].addSample(tlastobs, tlastaction, tlastreward) 95 | 96 | def newEpisode(self): 97 | """ Indicate the beginning of a new episode in the training cycle. """ 98 | if self.logging: 99 | for i in range(self.numAgents): 100 | self.history[i].newSequence() 101 | 102 | 103 | def reset(self): 104 | """ Clear the history of the agent. """ 105 | self.lastobs = None 106 | self.lastaction = None 107 | self.lastreward = None 108 | for i in range(self.numAgents): 109 | self.history[i].clear() 110 | 111 | def getProperty(self): 112 | for elem in self.agentProperties.values(): 113 | assert isinstance(elem,bool), "All property should be initialize with proper boolean values." 
114 | return self.agentProperties 115 | -------------------------------------------------------------------------------- /pybrainSG/rl/examples/tasks/gridgames.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/03/06 3 | 4 | @author: takuya-hv2 5 | ''' 6 | ''' 7 | Created on 2016/02/20 8 | 9 | @author: takuya-hv2 10 | ''' 11 | 12 | from pybrain.rl.environments import Environment 13 | from pybrainSG.rl.environments.episodicSG import EpisodicTaskSG 14 | import numpy as np 15 | import copy 16 | 17 | class GridGameTask(EpisodicTaskSG): 18 | '''' ''' 19 | maximumTurn=30 20 | def __init__(self,gameType="GG1"): 21 | ''' 22 | gameType: indicates game type an experiment perform: 23 | [GG1:] simple coordinate game 24 | [GG2:] "Battle of the Sexes" 25 | [GG3:] "Chicken" 26 | See the following paper for detailed descriptions: 27 | https://www.aaai.org/Papers/ICML/2003/ICML03-034.pdf 28 | ''' 29 | EpisodicTaskSG.__init__(self, GridGame(gameType)) 30 | 31 | def reset(self): 32 | EpisodicTaskSG.reset(self) 33 | self.isGameFinished=False 34 | self.currentTurn=0 35 | 36 | def isFinished(self): 37 | if self.currentTurn > self.maximumTurn: 38 | return True 39 | return self.env.isReachGoal 40 | 41 | def getReward(self): 42 | self.currentTurn+=1 43 | # print "rew:"+str(self.env.getJointReward()) 44 | return self.env.getJointReward() 45 | 46 | class GridGame(Environment): 47 | availableActions=[0,1,2,3]#Corresponding to forward north, west, south, east respectively. 48 | sizeofGlidWorld=3 49 | numberofGoals=2 50 | numberofAgents=2 51 | 52 | def __init__(self,gameType="GG1"): 53 | Environment.__init__(self) 54 | assert (gameType == "GG1") or (gameType == "GG2") or (gameType == "GG3"), "gameType should be either GG1, GG2, or GG3." 
55 | self.gameType=gameType 56 | print "Game type: " + str(self.gameType) 57 | 58 | def getSensors(self): 59 | #State: locations of all agents 60 | state=[np.r_[self.agents[0],self.agents[1]], 61 | np.r_[self.agents[0],self.agents[1]]] 62 | # print "state:" + str(state) 63 | return state 64 | 65 | def performAction(self, action): 66 | tempPos=[] 67 | self.prevAgents=copy.deepcopy(self.agents) 68 | # print "act:" + str(action) 69 | self.isColide=False 70 | for i in range(GridGame.numberofAgents): 71 | tempPos.append(self.__move__(copy.deepcopy(self.agents[i]), action[i])) 72 | if not self.__isColideWithEachOther(tempPos): 73 | self.agents=tempPos 74 | 75 | 76 | def __move__(self,position, forward): 77 | if forward == 0:#Move North 78 | if self.gameType=="GG2": 79 | if (position[0] != 1 and position[1]==0) and (np.random.rand() < 0.5): 80 | return position 81 | position[1]+=1 82 | elif forward == 1:#Move west 83 | position[0]-=1 84 | elif forward == 2:#Move south 85 | if self.gameType=="GG2": 86 | if (position[0] != 1 and position[1]==1) and (np.random.rand() < 0.5): 87 | return position 88 | position[1]-=1 89 | elif forward == 3:#Move east 90 | position[0]+=1 91 | else: 92 | assert False, "Unexpected action" 93 | 94 | if position[0] >= GridGame.sizeofGlidWorld: 95 | position[0]=GridGame.sizeofGlidWorld-1 96 | if position[0] < 0: 97 | position[0]=0 98 | if position[1] >= GridGame.sizeofGlidWorld: 99 | position[1]=GridGame.sizeofGlidWorld-1 100 | if position[1] < 0: 101 | position[1]=0 102 | return position 103 | 104 | def __isColideWithEachOther(self,tempPos): 105 | if (tempPos[0][0] == tempPos[1][0]) and (tempPos[0][1] == tempPos[1][1]): 106 | if (tempPos[0][0] != self.goals[0][0]) or (tempPos[0][1] != self.goals[0][1]): 107 | self.isColide=True 108 | return True 109 | else: 110 | return False 111 | else: 112 | return False 113 | 114 | def __isReachGoal(self): 115 | #return boolean list, that determine if each agent reach each goal. 116 | irGoal=[False,False] 117 | if (self.agents[0][0] == self.goals[0][0]) and (self.agents[0][1] == self.goals[0][1]):#For the first agent. 118 | irGoal[0]=True 119 | self.isReachGoal=True 120 | if (self.agents[1][0] == self.goals[1][0]) and (self.agents[1][1] == self.goals[1][1]):#For the second agent. 
121 | irGoal[1]=True 122 | self.isReachGoal=True 123 | return irGoal 124 | 125 | def reset(self): 126 | self.agents=[np.array([0,0]), 127 | np.array([2,0])] 128 | self.prevAgents=[np.array([0,0]), 129 | np.array([2,0])] 130 | 131 | self.isReachGoal=False 132 | if self.gameType == "GG1": 133 | self.goals=[np.array([2,2]), 134 | np.array([0,2])] 135 | else: 136 | self.goals=[np.array([1,2]), 137 | np.array([1,2])] 138 | 139 | def getJointReward(self): 140 | jointRew=[0,0] 141 | irGoal=self.__isReachGoal() 142 | if not (self.gameType == "GG3"): 143 | if irGoal[0]: 144 | jointRew[0]=100 145 | if irGoal[1]: 146 | jointRew[1]=100 147 | else: 148 | if irGoal[0] and irGoal[1]: 149 | if self.prevAgents[0][0] == 1: 150 | jointRew[0]= 125 151 | jointRew[1]=100 152 | elif self.prevAgents[1][0] == 1: 153 | jointRew[0]= 100 154 | jointRew[1]=125 155 | else: 156 | jointRew[0]= 120 157 | jointRew[1]=120 158 | elif irGoal[0]: 159 | jointRew[0]=100 160 | elif irGoal[1]: 161 | jointRew[1]=100 162 | if self.isColide: 163 | jointRew[0]-=1 164 | jointRew[1]-=1 165 | 166 | return np.array(jointRew) 167 | 168 | 169 | 170 | 171 | -------------------------------------------------------------------------------- /pybrainSG/rl/leaners/valuebased/sgsp.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/03/09 3 | 4 | @author: takuya-hv2 5 | ''' 6 | from pybrainSG.rl.leaners.valuebased.indexablevaluebased import IndexableValueBasedLearner 7 | from scipy import zeros 8 | from pybrain.utilities import r_argmax 9 | import numpy as np 10 | from pybrain.utilities import abstractMethod 11 | from pybrain.datasets import SupervisedDataSet 12 | from pybrain.supervised.trainers import BackpropTrainer 13 | from pybrain.utilities import one_to_n 14 | from pybrain.structure.modules import SigmoidLayer, LinearLayer 15 | from pybrain.tools.shortcuts import buildNetwork 16 | from scipy import r_, asarray 17 | import copy 18 | 19 | class ON_SGSP_FA(IndexableValueBasedLearner): 20 | """ 21 | Stochastic game sub-problem (with function approximation for Q-value and policy): 22 | http://www.ifaamas.org/Proceedings/aamas2015/aamas/p1371.pdf 23 | """ 24 | randomInit = True 25 | 26 | rewardDiscount = 0.99 # aka gamma 27 | 28 | batchMode = False 29 | passNextAction = False # for the _updateWeights method 30 | 31 | def __init__(self, num_features, num_actions, indexOfAgent=None): 32 | IndexableValueBasedLearner.__init__(self, indexOfAgent) 33 | self.explorer = None 34 | self.num_actions = num_actions 35 | self.num_features = num_features 36 | self.indexOfAgent=indexOfAgent 37 | self._behaviorPolicy = self._softmaxPolicy 38 | self.reset() 39 | self.ownerAgentProperties["requireOtherAgentsState"]=False 40 | self.ownerAgentProperties["requireJointAction"]=True 41 | self.ownerAgentProperties["requireJointReward"]=True 42 | 43 | def _pi(self, state): 44 | """ Return vector of probability of policy for all actions, 45 | given the state(-features). 
""" 46 | abstractMethod() 47 | 48 | def _softmaxPolicy(self, state): 49 | tmp = zeros(self.num_actions[self.indexOfAgent]) 50 | pi=self._pi(state) 51 | rand=np.random.rand() 52 | cum=0.0 53 | for i in range(self.num_actions[self.indexOfAgent]): 54 | cum+=pi[i] 55 | if rand < cum: 56 | tmp[i] = 1 57 | return tmp 58 | 59 | def reset(self): 60 | IndexableValueBasedLearner.reset(self) 61 | self._callcount = 0 62 | self.newEpisode() 63 | 64 | def newEpisode(self): 65 | IndexableValueBasedLearner.newEpisode(self) 66 | 67 | def _updateWeights(self, state, action, reward, next_state): 68 | ''' 69 | Expected to update approximator. 70 | ''' 71 | abstractMethod() 72 | 73 | 74 | class ON_SGSP_NN(ON_SGSP_FA): 75 | '''ON_SGSP with neural function approximation. ''' 76 | weightdecay=0.01 77 | zeta=0.00001 78 | # 79 | cn=0.05 80 | bn=0.05 81 | # 82 | decayCn=0.9999 83 | decayBn=0.9995 84 | 85 | 86 | def __init__(self, num_features, num_actions, num_agents, index): 87 | ON_SGSP_FA.__init__(self, num_features, num_actions, index) 88 | self.num_agents= num_agents 89 | self.linQ = [] 90 | for iAgent in range(self.num_agents): 91 | self.linQ.append(buildNetwork(num_features + num_actions[iAgent], 92 | num_features*2, 93 | 1, 94 | hiddenclass = SigmoidLayer, 95 | outclass = LinearLayer)) 96 | self.linGradient = buildNetwork(num_features + num_actions[self.indexOfAgent], 97 | (num_features + num_actions[self.indexOfAgent])*2, 98 | 1, 99 | hiddenclass = SigmoidLayer, 100 | outclass = LinearLayer) 101 | self.linPolicy = buildNetwork(num_features, 102 | (num_features + num_actions[self.indexOfAgent])*2, 103 | num_actions[self.indexOfAgent], 104 | hiddenclass = SigmoidLayer, 105 | outclass = SigmoidLayer) 106 | assert self.decayBn < self.decayCn, "Cn shold be bigger than Bn." 107 | 108 | def _pi(self, state): 109 | """Given state, compute probabilities for each action.""" 110 | values = np.array(self.linPolicy.activate(r_[state])) 111 | z=np.sum(values) 112 | return (values/z).flatten() 113 | 114 | def _qValues(self, state, iAgent): 115 | """ Return vector of q-values for all actions, 116 | given the state(-features). """ 117 | values = np.array([self.linQ[iAgent].activate(r_[state, one_to_n(i, self.num_actions[iAgent])]) for i in range(self.num_actions[iAgent])]) 118 | return values.flatten() 119 | 120 | def _sgn(self, val): 121 | if val > self.zeta: 122 | return 1.0 123 | elif val < (-1.0*self.zeta): 124 | return -1.0 125 | else: 126 | return val 127 | 128 | def _gamma(self, val): 129 | if val > 1.0: 130 | return 1.0 131 | elif val < 0: 132 | return 0.0 133 | else: 134 | return val 135 | 136 | 137 | def _updateWeights(self, state, action, reward, next_state): 138 | """ state and next_state are vectors, action is an integer. 
""" 139 | #update Q-value function approximator (estimate Q-value instead of V) 140 | BellmanErrors=np.zeros(self.num_agents) 141 | for iAgent in range(self.num_agents): 142 | vValC=self._qValues(state,iAgent) 143 | vValN=self._qValues(next_state,iAgent) 144 | vArgMaxValC=r_argmax(vValC) 145 | vArgMaxValN=r_argmax(vValN) 146 | BellmanError=(reward[iAgent] + self.rewardDiscount * vValN[vArgMaxValN]) - vValC[vArgMaxValC] 147 | target=vValC[action[iAgent]]+self.cn*((reward[iAgent] + self.rewardDiscount * vValN[vArgMaxValN]) - vValC[action[iAgent]]) 148 | BellmanErrors[iAgent]=BellmanError 149 | inp=r_[state, one_to_n(action[iAgent], self.num_actions[iAgent])] 150 | ds = SupervisedDataSet(self.num_features+self.num_actions[iAgent],1) 151 | ds.addSample(inp, target) 152 | BackpropTrainer(self.linQ[iAgent], learningrate=1.0, weightdecay=self.weightdecay).trainOnDataset(ds) 153 | 154 | #Estimate gradient 155 | grad=self.linGradient.activate(np.r_[asarray(state), one_to_n(action[self.indexOfAgent], self.num_actions[self.indexOfAgent])])[0] 156 | target=grad+self.cn*(np.sum(BellmanErrors, axis=0)-grad) 157 | inp=np.r_[asarray(state), one_to_n(action[self.indexOfAgent], self.num_actions[self.indexOfAgent])] 158 | ds = SupervisedDataSet(self.num_features+self.num_actions[self.indexOfAgent],1) 159 | ds.addSample(inp, target) 160 | BackpropTrainer(self.linGradient, learningrate=1.0,weightdecay=self.weightdecay).trainOnDataset(ds) 161 | # print str(self.indexOfAgent) + "-th agents optimization info.:" 162 | # print "All Bellman errors: "+str(np.sum(BellmanErrors, axis=0)) 163 | # print "Self Bellman error: " + str(np.absolute(BellmanErrors[self.indexOfAgent])) 164 | # print "Self Q-value: " + str(self._qValues(state,self.indexOfAgent)) 165 | #Update policy 166 | c_pi=self._pi(state) 167 | # print "Policy: " + str(c_pi) 168 | firstTerm=c_pi[action[self.indexOfAgent]] 169 | secondTerm=(np.sqrt(firstTerm) 170 | * np.absolute(BellmanErrors[self.indexOfAgent]) 171 | * self._sgn(-1.0*self.linGradient.activate(np.r_[asarray(state), one_to_n(action[self.indexOfAgent], self.num_actions[self.indexOfAgent])])[0]) 172 | ) 173 | target=c_pi 174 | target[action[self.indexOfAgent]]=self._gamma(firstTerm - self.bn * secondTerm) 175 | inp=r_[asarray(state)] 176 | ds = SupervisedDataSet(self.num_features, self.num_actions[self.indexOfAgent]) 177 | ds.addSample(inp, target) 178 | BackpropTrainer(self.linPolicy, learningrate=1.0,weightdecay=self.weightdecay).trainOnDataset(ds) 179 | 180 | #update bn, cn 181 | self.bn = self.bn * self.decayBn 182 | self.cn = self.cn * self.decayCn 183 | 184 | 185 | 186 | 187 | -------------------------------------------------------------------------------- /pybrainSG/rl/leaners/valuebased/phc.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/02/19 3 | 4 | @author: takuya-hv2 5 | ''' 6 | from pybrainSG.rl.leaners.valuebased.indexablevaluebased import IndexableValueBasedLearner 7 | from scipy import zeros 8 | from pybrain.utilities import r_argmax 9 | import numpy as np 10 | from pybrain.utilities import abstractMethod 11 | from pybrain.datasets import SupervisedDataSet 12 | from pybrain.supervised.trainers import BackpropTrainer 13 | from pybrain.utilities import one_to_n 14 | from pybrain.structure.modules import SigmoidLayer, LinearLayer 15 | from pybrain.tools.shortcuts import buildNetwork 16 | from scipy import r_, asarray 17 | import copy 18 | 19 | class PHC_FA(IndexableValueBasedLearner): 20 | """ 21 | Policy hill climbing 
algorithm (with function approximation for Q-value and policy): 22 | http://www.cs.cmu.edu/~mmv/papers/01ijcai-mike.pdf 23 | """ 24 | 25 | learningRate = 0.5 # aka alpha: make sure this is being decreased by calls from the learning agent! 26 | learningRateDecay = 100 # aka n_0, but counting decay-calls 27 | 28 | randomInit = True 29 | 30 | rewardDiscount = 0.99 # aka gamma 31 | 32 | batchMode = False 33 | passNextAction = False # for the _updateWeights method 34 | 35 | def __init__(self, num_features, num_actions, indexOfAgent=None): 36 | IndexableValueBasedLearner.__init__(self, indexOfAgent) 37 | self.explorer = None 38 | self.num_actions = num_actions 39 | self.num_features = num_features 40 | self.indexOfAgent=indexOfAgent 41 | self._behaviorPolicy = self._softmaxPolicy 42 | self.reset() 43 | self.ownerAgentProperties["requireOtherAgentsState"]=False 44 | self.ownerAgentProperties["requireJointAction"]=False 45 | self.ownerAgentProperties["requireJointReward"]=False 46 | 47 | def _pi(self, state): 48 | """ Return vector of probability of policy for all actions, 49 | given the state(-features). """ 50 | abstractMethod() 51 | 52 | def _softmaxPolicy(self, state): 53 | tmp = zeros(self.num_actions) 54 | pi=self._pi(state) 55 | rand=np.random.rand() 56 | cum=0.0 57 | for i in range(self.num_actions): 58 | cum+=pi[i] 59 | if rand < cum: 60 | tmp[i] = 1 61 | return tmp 62 | 63 | def reset(self): 64 | IndexableValueBasedLearner.reset(self) 65 | self._callcount = 0 66 | self.newEpisode() 67 | 68 | def newEpisode(self): 69 | IndexableValueBasedLearner.newEpisode(self) 70 | 71 | def _updateWeights(self, state, action, reward, next_state): 72 | ''' 73 | Expected to update approximator. 74 | ''' 75 | abstractMethod() 76 | 77 | 78 | class PHC_NN(PHC_FA): 79 | '''PHC with neural function approximation. ''' 80 | delta=0.1 81 | maxNumberofAverage=30 82 | weightdecay=0.001 83 | trainingEpochPerUpdateWight=2 84 | 85 | def __init__(self, num_features, num_actions, indexOfAgent=None): 86 | PHC_FA.__init__(self, num_features, num_actions, indexOfAgent) 87 | self.linQ = buildNetwork(num_features + num_actions, (num_features + num_actions), 1, hiddenclass = SigmoidLayer, outclass = LinearLayer) 88 | self.linPolicy = buildNetwork(num_features, (num_features + num_actions), num_actions, hiddenclass = SigmoidLayer,outclass = SigmoidLayer) 89 | self.trainer4LinQ=BackpropTrainer(self.linQ,weightdecay=self.weightdecay) 90 | self.trainer4LinPolicy=BackpropTrainer(self.linPolicy,weightdecay=self.weightdecay) 91 | 92 | def _pi(self, state): 93 | """Given state, compute probabilities for each action.""" 94 | values = np.array(self.linPolicy.activate(r_[state])) 95 | z=np.sum(values) 96 | return (values/z).flatten() 97 | 98 | def _qValues(self, state): 99 | """ Return vector of q-values for all actions, 100 | given the state(-features). """ 101 | values = np.array([self.linQ.activate(r_[state, one_to_n(i, self.num_actions)]) for i in range(self.num_actions)]) 102 | return values.flatten() 103 | 104 | 105 | def _updateWeights(self, state, action, reward, next_state): 106 | """ state and next_state are vectors, action is an integer. 
""" 107 | #update Q-value function approximator 108 | target=reward + self.rewardDiscount * max(self._qValues(next_state)) 109 | inp=r_[asarray(state), one_to_n(action, self.num_actions)] 110 | self.trainer4LinQ=BackpropTrainer(self.linQ,weightdecay=self.weightdecay) 111 | ds = SupervisedDataSet(self.num_features+self.num_actions,1) 112 | ds.addSample(inp, target) 113 | self.trainer4LinQ.trainOnDataset(ds) 114 | #Update policy 115 | bestAction=r_argmax(self._qValues(state)) 116 | target= one_to_n(bestAction, self.num_actions) 117 | inp=r_[asarray(state)] 118 | ds = SupervisedDataSet(self.num_features,self.num_actions) 119 | ds.addSample(inp, target) 120 | self.trainer4LinPolicy=BackpropTrainer(self.linPolicy, 121 | learningrate=self.delta, 122 | weightdecay=self.weightdecay) 123 | self.trainer4LinPolicy.setData(ds) 124 | self.trainer4LinPolicy.trainEpochs(epochs=self.trainingEpochPerUpdateWight) 125 | 126 | 127 | 128 | 129 | 130 | class PHC_WoLF_NN(PHC_FA): 131 | '''PHC_WoLF with neural function ''' 132 | deltaW=0.05 133 | deltaL=0.2 134 | maxNumberofAverage=30 135 | weightdecay=0.001 136 | trainingEpochPerUpdateWight=1 137 | 138 | def __init__(self, num_features, num_actions, indexOfAgent=None): 139 | PHC_FA.__init__(self, num_features, num_actions, indexOfAgent) 140 | self.linQ = buildNetwork(num_features + num_actions, (num_features + num_actions), 1, hiddenclass = SigmoidLayer, outclass = LinearLayer) 141 | self.linPolicy = buildNetwork(num_features, (num_features + num_actions), num_actions, hiddenclass = SigmoidLayer,outclass = SigmoidLayer) 142 | self.averagePolicy=[] 143 | self.trainer4LinQ=BackpropTrainer(self.linQ,weightdecay=self.weightdecay) 144 | self.trainer4LinPolicy=BackpropTrainer(self.linPolicy,weightdecay=self.weightdecay) 145 | 146 | def _pi(self, state): 147 | """Given state, compute softmax probability for each action.""" 148 | values = np.array(self.linPolicy.activate(r_[state])) 149 | z=np.sum(values) 150 | return (values/z).flatten() 151 | 152 | def _qValues(self, state): 153 | """ Return vector of q-values for all actions, 154 | given the state(-features). """ 155 | values = np.array([self.linQ.activate(r_[state, one_to_n(i, self.num_actions)]) for i in range(self.num_actions)]) 156 | return values.flatten() 157 | 158 | def _piAvr(self, state): 159 | pi=np.zeros(self.num_actions) 160 | for elem in self.averagePolicy: 161 | values = np.array(elem.activate(r_[state])) 162 | pi=np.add(pi.flatten(),values.flatten()) 163 | z=np.sum(pi) 164 | pi=pi/z 165 | return pi.flatten() 166 | 167 | def _updateWeights(self, state, action, reward, next_state): 168 | """ state and next_state are vectors, action is an integer. 
""" 169 | #update Q-value function approximator 170 | target=reward + self.rewardDiscount * max(self._qValues(next_state)) 171 | inp=r_[asarray(state), one_to_n(action, self.num_actions)] 172 | self.trainer4LinQ=BackpropTrainer(self.linQ,weightdecay=self.weightdecay) 173 | ds = SupervisedDataSet(self.num_features+self.num_actions,1) 174 | ds.addSample(inp, target) 175 | self.trainer4LinQ.trainOnDataset(ds) 176 | 177 | #update estimate of average policy 178 | self.averagePolicy.append(copy.deepcopy(self.linPolicy)) 179 | if len(self.averagePolicy) > self.maxNumberofAverage: 180 | self.averagePolicy.pop(np.random.randint(len(self.averagePolicy))) 181 | 182 | #update policy function approximator 183 | delta=None 184 | cumRewardOfCurrentPolicy=0.0 185 | values=self._qValues(state) 186 | pi=self._pi(state) 187 | for elem_action in range(self.num_actions): 188 | cumRewardOfCurrentPolicy=pi[elem_action]*values[elem_action] 189 | cumRewardOfAveragePolicy=0.0 190 | api=self._piAvr(state) 191 | for elem_action in range(self.num_actions): 192 | cumRewardOfAveragePolicy=api[elem_action]*values[elem_action] 193 | if cumRewardOfCurrentPolicy > cumRewardOfAveragePolicy: 194 | delta=self.deltaW 195 | else: 196 | delta=self.deltaL 197 | 198 | #Update policy 199 | bestAction=r_argmax(self._qValues(state)) 200 | target=one_to_n(bestAction, self.num_actions) 201 | inp=r_[asarray(state)] 202 | ds = SupervisedDataSet(self.num_features,self.num_actions) 203 | ds.addSample(inp, target) 204 | self.trainer4LinPolicy=BackpropTrainer(self.linPolicy, 205 | learningrate=(delta), 206 | weightdecay=self.weightdecay) 207 | self.trainer4LinPolicy.setData(ds) 208 | self.trainer4LinPolicy.trainEpochs(epochs=self.trainingEpochPerUpdateWight) 209 | 210 | -------------------------------------------------------------------------------- /pybrainSG/rl/leaners/valuebased/ceq.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/02/26 3 | 4 | @author: takuya-hv2 5 | ''' 6 | from pybrainSG.rl.leaners.valuebased.indexablevaluebased import IndexableValueBasedLearner 7 | from pybrain.utilities import r_argmax 8 | import numpy as np 9 | from pybrain.utilities import abstractMethod 10 | from pybrain.datasets import SupervisedDataSet 11 | from pybrain.supervised.trainers.rprop import RPropMinusTrainer 12 | from pybrain.supervised.trainers import BackpropTrainer 13 | from pybrain.utilities import one_to_n 14 | from pybrain.structure.modules import SigmoidLayer, LinearLayer 15 | from pybrain.tools.shortcuts import buildNetwork 16 | import copy 17 | from scipy import zeros, exp, clip 18 | from scipy.optimize._linprog import linprog 19 | import warnings 20 | from multiprocessing import Process, Queue 21 | 22 | 23 | class CEQ_FA(IndexableValueBasedLearner): 24 | """ 25 | Correlated Q (with function approximation): 26 | http://www.aaai.org/Papers/Symposia/Spring/2002/SS-02-02/SS02-02-012.pdf 27 | """ 28 | 29 | learningRate = 0.2 # aka alpha: make sure this is being decreased by calls from the learning agent! 
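    # newEpisode() decays learningRate by the factor (learningRateDecay + k) / (learningRateDecay + k + 1)
    # per call, where k is the call count, so after k episodes it has shrunk roughly like n_0 / (n_0 + k).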
30 | learningRateDecay = 100 # aka n_0, but counting decay-calls 31 | 32 | randomInit = True 33 | 34 | rewardDiscount = 0.99 # aka gamma 35 | 36 | batchMode = False 37 | passNextAction = False # for the _updateWeights method 38 | 39 | def __init__(self, num_features, num_actions, num_agents, indexOfAgent): 40 | IndexableValueBasedLearner.__init__(self, indexOfAgent) 41 | self.explorer = None 42 | self.num_actions = num_actions 43 | self.num_features = num_features 44 | self.num_agents=num_agents 45 | self.reset() 46 | self.ownerAgentProperties["requireOtherAgentsState"]=False 47 | self.ownerAgentProperties["requireJointAction"]=True 48 | self.ownerAgentProperties["requireJointReward"]=True 49 | assert self.num_agents == np.size(self.num_actions, axis=0), "Length of the per-agent action-count array must equal the number of agents." 50 | 51 | def _qValues(self, state): 52 | """ Return vector of Q-values for this agent's actions, 53 | given the state(-features). """ 54 | abstractMethod() 55 | 56 | def _greedyAction(self, state): 57 | return r_argmax(self._qValues(state)) 58 | 59 | def _greedyPolicy(self, state): 60 | tmp = zeros(self.num_actions[self.indexOfAgent]) 61 | tmp[self._greedyAction(state)] = 1 62 | return tmp 63 | 64 | def _boltzmannPolicy(self, state, temperature=1.): 65 | tmp = self._qValues(state) 66 | return CEQ_FA._boltzmannProbs(tmp, temperature) 67 | 68 | @staticmethod 69 | def _boltzmannProbs(qvalues, temperature=1.): 70 | if temperature == 0: 71 | tmp = zeros(len(qvalues)) 72 | tmp[r_argmax(qvalues)] = 1. 73 | else: 74 | tmp = qvalues / temperature 75 | tmp -= max(tmp) 76 | tmp = exp(clip(tmp, -20, 0)) 77 | return tmp / sum(tmp) 78 | 79 | def reset(self): 80 | IndexableValueBasedLearner.reset(self) 81 | self._callcount = 0 82 | self.newEpisode() 83 | 84 | def newEpisode(self): 85 | IndexableValueBasedLearner.newEpisode(self) 86 | self._callcount += 1 87 | self.learningRate *= ((self.learningRateDecay + self._callcount) 88 | / (self.learningRateDecay + self._callcount + 1.)) 89 | 90 | def _updateWeights(self, state, action, reward, next_state): 91 | ''' 92 | Expected to update the Q-value function approximator. 93 | ''' 94 | abstractMethod() 95 | 96 | 97 | class CEQ_Lin(CEQ_FA): 98 | ''' 99 | CEQ with linear function approximation.
100 | ''' 101 | def __init__(self, num_features, num_actions, num_agents, indexOfAgent=None): 102 | CEQ_FA.__init__(self, num_features, num_actions, num_agents, indexOfAgent) 103 | self.possibleJointAction, self.w4ActIndexing = self._initJointActAndItsIndex(num_agents, num_actions) 104 | self.numJointAct=np.size(self.possibleJointAction, axis=0) 105 | self.linQ=[] 106 | self.actionDiminInput=0 107 | for i in range(self.num_agents): 108 | self.actionDiminInput+=self.num_actions[i] 109 | for i in range(self.num_agents): 110 | self.linQ.append(buildNetwork(num_features + self.actionDiminInput, 1, outclass = LinearLayer)) 111 | self.actionVecDic={} 112 | 113 | def _initJointActAndItsIndex(self, num_agents, num_actions): 114 | numJointAct=1 115 | w4ActIndexing=np.zeros(num_agents) 116 | for index in range(len(num_actions)): 117 | numJointAct*=num_actions[index] 118 | temp=numJointAct 119 | for index in range(np.size(num_actions,axis=0)): 120 | temp/=num_actions[index] 121 | w4ActIndexing[index]=(temp) 122 | possibleJointAction=[[]] 123 | for i in range(num_agents): 124 | temp=[] 125 | for j in range(num_actions[i]): 126 | for k in range(len(possibleJointAction)): 127 | temp2=copy.deepcopy(possibleJointAction[k]) 128 | temp2.append(j) 129 | temp.append(temp2) 130 | possibleJointAction=temp 131 | possibleJointAction.sort() 132 | possibleJointAction=np.array(possibleJointAction) 133 | return possibleJointAction, w4ActIndexing 134 | 135 | def _qValues(self, state): 136 | """ Return vector of q-values for all actions, 137 | given the state(-features). """ 138 | qValues=self._qValuesForAllPossibleJointAction(state) 139 | eq=findCorrelatedEquilibrium(self.num_agents, self.num_actions, qValues, self.possibleJointAction, self.w4ActIndexing) 140 | return np.array(self._qValuesForEachActionOfAgent(state, eq, self.indexOfAgent)).reshape(self.num_actions[self.indexOfAgent]) 141 | 142 | def _updateWeights(self, state, action, reward, next_state): 143 | """ state and next_state are vectors, action is an integer. 
""" 144 | #update Q-value function approximator 145 | qValuesNext=self._qValuesForAllPossibleJointAction(next_state) 146 | eqNext=findCorrelatedEquilibrium(self.num_agents, self.num_actions, qValuesNext, self.possibleJointAction,self.w4ActIndexing) 147 | #Learn 148 | inp=self._EncodeStateAndJointActionIntoInputVector(state, action) 149 | for i in range(self.num_agents): 150 | target=reward[i] + self.rewardDiscount * max(self._qValuesForEachActionOfAgent(next_state, eqNext, i)) 151 | self.trainer4LinQ=BackpropTrainer(self.linQ[i],learningrate=self.learningRate,weightdecay=0.0) 152 | ds = SupervisedDataSet(self.num_features+self.actionDiminInput,1) 153 | ds.addSample(inp, target) 154 | self.trainer4LinQ.trainOnDataset(ds) 155 | 156 | def _qValuesForAllPossibleJointAction(self, state): 157 | qValues=[] 158 | for iAgent in range(self.num_agents): 159 | qValuesIthAgent=[] 160 | for jointAct in self.possibleJointAction: 161 | val=np.array(self.linQ[iAgent].activate(self._EncodeStateAndJointActionIntoInputVector(state, jointAct))) 162 | qValuesIthAgent.append(val) 163 | qValues.append(qValuesIthAgent) 164 | return qValues#QValues for all possible joint actions for each agents [numAgents][index of joint act in list] 165 | 166 | def _qValuesForEachActionOfAgent(self, state, CEq, iAgent): 167 | qValuesForeachAct=[] 168 | for iAct in range(self.num_actions[iAgent]): 169 | expQ=0.0 170 | sumP=0.0 171 | numPJA=0.0 172 | for jointAct in self.possibleJointAction: 173 | if iAct == int(jointAct[iAgent]): 174 | sumP+=CEq[int(np.dot(self.w4ActIndexing, jointAct))] 175 | numPJA+=1.0 176 | for jointAct in self.possibleJointAction: 177 | if iAct == int(jointAct[iAgent]): 178 | if sumP > 0.00001: 179 | prob=CEq[int(np.dot(self.w4ActIndexing, jointAct))] 180 | if prob > 0.0: 181 | Q=self.linQ[iAgent].activate(self._EncodeStateAndJointActionIntoInputVector(state, jointAct)) 182 | expQ+=(prob/sumP)*Q[0] 183 | else: 184 | Q=self.linQ[iAgent].activate(self._EncodeStateAndJointActionIntoInputVector(state, jointAct)) 185 | expQ+=(1.0/numPJA)*Q[0] 186 | qValuesForeachAct.append(expQ) 187 | return qValuesForeachAct 188 | 189 | 190 | def _EncodeStateAndJointActionIntoInputVector(self, state, jointAct): 191 | index=int(np.dot(self.w4ActIndexing, jointAct)) 192 | if index in self.actionVecDic: 193 | return np.r_[state, self.actionVecDic[index]] 194 | else: 195 | iVector=np.array([]) 196 | for iAgent in range(len(jointAct)): 197 | iVector=np.r_[iVector, one_to_n(jointAct[iAgent], self.num_actions[iAgent])] 198 | self.actionVecDic[index]=iVector 199 | return np.r_[state, self.actionVecDic[index]] 200 | 201 | 202 | 203 | class NFCEQ(CEQ_Lin): 204 | '''Neural fitted Q iteration version. ''' 205 | def __init__(self, num_features, num_actions, num_agents, max_epochs=20, indexOfAgent=None, validateMultiProc=True): 206 | CEQ_Lin.__init__(self, num_features, num_actions, num_agents, indexOfAgent) 207 | self.max_epochs=max_epochs 208 | self.linQ=[]#update 209 | for _ in range(self.num_agents): 210 | self.linQ.append(buildNetwork(num_features + self.actionDiminInput, (num_features + self.actionDiminInput), 1, hiddenclass=SigmoidLayer, outclass = LinearLayer)) 211 | self.isFirstLerning=True 212 | self.validateMultiProc=validateMultiProc 213 | 214 | def _updateWeights(self, state, action, reward, next_state): 215 | """ state and next_state are vectors, action is an integer. 
""" 216 | pass 217 | def learn(self): 218 | # convert reinforcement dataset to NFQ supervised dataset 219 | supervised = [] 220 | dats=[]#[seq index][turn]=[state,jointAct,jointReward] 221 | for i in range(self.num_agents): 222 | supervised.append(SupervisedDataSet(self.num_features+self.actionDiminInput, 1)) 223 | for i in range(self.dataset[self.indexOfAgent].getNumSequences()): 224 | seq=[] 225 | for j in range(len(self.dataset[self.indexOfAgent].getSequence(i)[0])): 226 | state=self.dataset[self.indexOfAgent].getSequence(i)[0][j] 227 | jointAct=[] 228 | jointReward=[] 229 | for k in range(self.num_agents): 230 | jointAct.append(self.dataset[k].getSequence(i)[1][j][0]) 231 | jointReward.append(self.dataset[k].getSequence(i)[2][j][0]) 232 | seq.append([state, jointAct, jointReward]) 233 | dats.append(seq) 234 | #prepare data set 235 | for i in range(self.num_agents): 236 | for seq in dats: 237 | lastexperience = None 238 | for sarPair in seq: 239 | state = sarPair[0] 240 | action = sarPair[1] 241 | reward = sarPair[2] 242 | if not lastexperience: 243 | # delay each experience in sequence by one 244 | lastexperience = (state, action, reward) 245 | continue 246 | # use experience from last timestep to do Q update 247 | (state_, action_, reward_) = lastexperience 248 | 249 | #update Q-value function approximator 250 | qValuesNext=self._qValuesForAllPossibleJointAction(state) 251 | eqNext=findCorrelatedEquilibrium(self.num_agents, self.num_actions, qValuesNext, self.possibleJointAction,self.w4ActIndexing) 252 | #Learn 253 | inp=self._EncodeStateAndJointActionIntoInputVector(state_, action_) 254 | if self.isFirstLerning: 255 | target=reward_[i] 256 | else: 257 | target=reward_[i] + self.rewardDiscount * max(self._qValuesForEachActionOfAgent(state, eqNext, i)) 258 | target=np.array([target]) 259 | supervised[i].addSample(inp, target) 260 | # update last experience with current one 261 | lastexperience = (state, action, reward) 262 | if self.isFirstLerning: 263 | self.isFirstLerning=False 264 | 265 | procTrainers=[] 266 | qResult=Queue() 267 | for i in range(self.num_agents): 268 | trainer=RPropMinusTrainer(self.linQ[i],dataset=supervised[i], 269 | batchlearning=True, 270 | verbose=False, 271 | ) 272 | if not self.validateMultiProc: 273 | trainer.trainUntilConvergence(maxEpochs=self.max_epochs,verbose=False) 274 | else: 275 | procTrainers.append(Process(target=self._learningQfunction, kwargs={"trainer":trainer,"i":i,"q":qResult})) 276 | if self.validateMultiProc: 277 | for proc in procTrainers: 278 | proc.start() 279 | for i in range(self.num_agents): 280 | res=qResult.get() 281 | self.linQ[res[0]]=res[1] 282 | 283 | def _learningQfunction(self, trainer,i,q): 284 | #Re-builde networks is required in multiprocessing environments. 285 | params=trainer.module.params 286 | trainer.module=buildNetwork(self.num_features + self.actionDiminInput, (self.num_features + self.actionDiminInput), 1, hiddenclass=SigmoidLayer, outclass = LinearLayer) 287 | trainer.module._setParameters(params) 288 | trainer.trainUntilConvergence(maxEpochs=self.max_epochs,verbose=False) 289 | q.put([i,trainer.module]) 290 | 291 | 292 | 293 | 294 | 295 | 296 | def findCorrelatedEquilibrium(numAgent, numAction, Qvalues, possibleJointAction, w4ActIndexing): 297 | ''' 298 | Given a list of all possible joint action, and its QValue table, 299 | this function find correlated equilibrium based on the linear programming. 
300 | #In the current implementation, the objective function used to select one equilibrium is the "republican" criterion (maximize the maximum of the agents' values). 301 | ''' 302 | numJointAct=np.size(possibleJointAction,axis=0) 303 | STs=[]#constraints for LP 304 | for iAgent in range(numAgent): 305 | # print "==================== Agent " + str(iAgent) + "===============" 306 | vecQ=Qvalues[iAgent] 307 | eCumdeltaOutCome=np.zeros(numJointAct) 308 | for ithAgentsOptAct in range(numAction[iAgent]): 309 | #Expected Q-value when the agent follows the recommended action "ithAgentsOptAct". 310 | eOutcomeInOpt=np.zeros(numJointAct) 311 | for jointAction in possibleJointAction: 312 | if ithAgentsOptAct == jointAction[iAgent]: 313 | index=int(np.dot(w4ActIndexing, jointAction)) 314 | eOutcomeInOpt[index]=vecQ[index] 315 | #Expected Q-value when the agent deviates from "ithAgentsOptAct" to the non-recommended action "ithAgentsNonOptAct". 316 | for ithAgentsNonOptAct in range(numAction[iAgent]): 317 | if ithAgentsNonOptAct == ithAgentsOptAct: 318 | continue 319 | eOutcomeInNonOpt=np.zeros(numJointAct) 320 | for jointAction in possibleJointAction: 321 | if (ithAgentsOptAct != jointAction[iAgent]) and (ithAgentsNonOptAct == jointAction[iAgent]): 322 | jointActionWithNonOptimal=copy.deepcopy(jointAction) 323 | jointActionWithNonOptimal[iAgent]=ithAgentsOptAct 324 | index1=int(np.dot(w4ActIndexing, jointActionWithNonOptimal)) 325 | index2=int(np.dot(w4ActIndexing, jointAction)) 326 | eOutcomeInNonOpt[index1]=vecQ[index2] 327 | eCumdeltaOutCome = eCumdeltaOutCome + (eOutcomeInOpt - eOutcomeInNonOpt) 328 | STs.append(eCumdeltaOutCome) 329 | 330 | #Non-negativity: the probability of each joint action must be >= 0 331 | for i in range(numJointAct): 332 | t=np.zeros(numJointAct) 333 | t[i]=1.0 334 | STs.append(t) 335 | STs=np.array(STs)*(-1) 336 | #Constraints (inequality) 337 | b_ub=np.zeros(np.size(STs,axis=0)) 338 | #Constraints (equality): probabilities sum to one 339 | A_eq=np.ones((1,numJointAct)) 340 | b_eq=np.ones(1) 341 | #Objective function 342 | c=np.zeros(numJointAct) 343 | for iAgent in range(numAgent): 344 | #take the maximum Q-value over agents for each joint action ("republican" objective) 345 | vecQ=np.array(Qvalues[iAgent]).reshape(numJointAct) 346 | for jointAction in possibleJointAction: 347 | index=int(np.dot(w4ActIndexing, jointAction)) 348 | if c[index] < vecQ[index]: 349 | c[index] = vecQ[index] 350 | c*=-1 351 | #Solve the linear program with SciPy 352 | res=linprog(c=c, A_ub=STs, b_ub=b_ub, A_eq=A_eq, b_eq=b_eq, bounds=None, method='simplex', callback=None, options=None) 353 | if not res.success: 354 | warnings.warn("LP failed; falling back to a uniform probability distribution.") 355 | res.x = np.ones(numJointAct)/(numJointAct) 356 | return res.x 357 | --------------------------------------------------------------------------------
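A minimal usage sketch of findCorrelatedEquilibrium (not part of the repository) may make its expected inputs clearer. It assumes a two-agent, two-action game whose per-joint-action Q-values are written out by hand; the payoff numbers are purely illustrative, while possibleJointAction and w4ActIndexing follow the same sorted ordering and index weighting that CEQ_Lin._initJointActAndItsIndex would produce for num_actions=[2, 2].

import numpy as np
from pybrainSG.rl.leaners.valuebased.ceq import findCorrelatedEquilibrium

numAgent = 2
numAction = [2, 2]
# Joint actions in sorted order; joint action (a0, a1) maps to index 2*a0 + 1*a1.
possibleJointAction = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
w4ActIndexing = np.array([2.0, 1.0])
# Illustrative Q-values: one row per agent, one entry per joint action.
Qvalues = [[6.0, 2.0, 7.0, 0.0],
           [6.0, 7.0, 2.0, 0.0]]

ce = findCorrelatedEquilibrium(numAgent, numAction, Qvalues,
                               possibleJointAction, w4ActIndexing)
print(ce)  # probability distribution over the four joint actions, summing to 1

The returned vector is indexed the same way as possibleJointAction, so the probability assigned to joint action (a0, a1) is ce[int(np.dot(w4ActIndexing, [a0, a1]))].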