├── pybrainSG ├── __init__.py ├── rl │ ├── __init__.py │ ├── agents │ │ ├── __init__.py │ │ ├── ceqa.pyc │ │ ├── faphc.pyc │ │ ├── nfceqa.pyc │ │ ├── sgspa.pyc │ │ ├── __init__.pyc │ │ ├── indexable.pyc │ │ ├── loggingSG.pyc │ │ ├── learningSG.pyc │ │ ├── linearfaSG.pyc │ │ ├── multiAgent.pyc │ │ ├── indexable.py │ │ ├── nfceqa.py │ │ ├── learningSG.py │ │ ├── sgspa.py │ │ ├── faphc.py │ │ ├── multiAgent.py │ │ ├── linearfaSG.py │ │ ├── ceqa.py │ │ └── loggingSG.py │ ├── examples │ │ ├── __init__.py │ │ ├── ceq │ │ │ ├── __init__.py │ │ │ ├── example_huntingGame.pyc │ │ │ ├── example_gridgames_CEQ_NFQ.pyc │ │ │ ├── example_huntinggame_CEQ-NFQ.pyc │ │ │ ├── example_staticGame.py │ │ │ ├── example_gridgames.py │ │ │ ├── example_huntingGame.py │ │ │ ├── example_gridgames_CEQ_NFQ.py │ │ │ └── example_huntinggame_CEQ-NFQ.py │ │ ├── phc │ │ │ ├── __init__.py │ │ │ ├── example_gridgames.py │ │ │ ├── example_staticGame.py │ │ │ └── example_huntingGame.py │ │ ├── sgsp │ │ │ ├── __init__.py │ │ │ ├── example_staticgame.py │ │ │ └── example_gridgames.py │ │ ├── nfqSG │ │ │ ├── __init__.py │ │ │ ├── __init__.pyc │ │ │ ├── example_gridgames.pyc │ │ │ ├── example_huntingGame.pyc │ │ │ ├── example_gridgames.py │ │ │ └── example_huntingGame.py │ │ ├── tasks │ │ │ ├── __init__.py │ │ │ ├── __init__.pyc │ │ │ ├── gridgames.pyc │ │ │ ├── huntinggame.pyc │ │ │ ├── staticgame.pyc │ │ │ ├── staticgame.py │ │ │ ├── huntinggame.py │ │ │ └── gridgames.py │ │ ├── linearfaSG │ │ │ ├── __init__.py │ │ │ ├── __init__.pyc │ │ │ ├── example_huntingGame.pyc │ │ │ ├── example_staticGame.py │ │ │ ├── example_gridgames.py │ │ │ └── example_huntingGame.py │ │ └── __init__.pyc │ ├── leaners │ │ ├── __init__.py │ │ ├── valuebased │ │ │ ├── __init__.py │ │ │ ├── ceq.pyc │ │ │ ├── phc.pyc │ │ │ ├── sgsp.pyc │ │ │ ├── nfqSG.pyc │ │ │ ├── __init__.pyc │ │ │ ├── learnerfaSG.pyc │ │ │ ├── indexablevaluebased.pyc │ │ │ ├── indexablevaluebased.py │ │ │ ├── nfqSG.py │ │ │ ├── learnerfaSG.py │ │ │ ├── sgsp.py │ │ │ ├── phc.py │ │ │ └── ceq.py │ │ └── __init__.pyc │ ├── environments │ │ ├── __init__.py │ │ ├── __init__.pyc │ │ ├── episodicSG.pyc │ │ └── episodicSG.py │ ├── experiments │ │ ├── __init__.py │ │ ├── __init__.pyc │ │ ├── episodicSG.pyc │ │ └── episodicSG.py │ └── __init__.pyc └── __init__.pyc ├── Images └── MDPsandGSSGs.jpg ├── .mamemose.rb ├── .pydevproject ├── .project └── README.md /pybrainSG/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pybrainSG/rl/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pybrainSG/rl/agents/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pybrainSG/rl/examples/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pybrainSG/rl/leaners/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pybrainSG/rl/environments/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /pybrainSG/rl/examples/ceq/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pybrainSG/rl/examples/phc/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pybrainSG/rl/examples/sgsp/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pybrainSG/rl/experiments/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pybrainSG/rl/examples/nfqSG/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pybrainSG/rl/examples/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pybrainSG/rl/leaners/valuebased/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pybrainSG/rl/examples/linearfaSG/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Images/MDPsandGSSGs.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/Images/MDPsandGSSGs.jpg -------------------------------------------------------------------------------- /pybrainSG/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/__init__.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/__init__.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/agents/ceqa.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/agents/ceqa.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/agents/faphc.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/agents/faphc.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/agents/nfceqa.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/agents/nfceqa.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/agents/sgspa.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/agents/sgspa.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/agents/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/agents/__init__.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/agents/indexable.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/agents/indexable.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/agents/loggingSG.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/agents/loggingSG.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/leaners/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/leaners/__init__.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/agents/learningSG.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/agents/learningSG.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/agents/linearfaSG.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/agents/linearfaSG.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/agents/multiAgent.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/agents/multiAgent.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/examples/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/examples/__init__.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/environments/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/environments/__init__.pyc 
-------------------------------------------------------------------------------- /pybrainSG/rl/experiments/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/experiments/__init__.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/environments/episodicSG.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/environments/episodicSG.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/examples/nfqSG/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/examples/nfqSG/__init__.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/examples/tasks/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/examples/tasks/__init__.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/experiments/episodicSG.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/experiments/episodicSG.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/leaners/valuebased/ceq.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/leaners/valuebased/ceq.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/leaners/valuebased/phc.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/leaners/valuebased/phc.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/leaners/valuebased/sgsp.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/leaners/valuebased/sgsp.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/examples/tasks/gridgames.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/examples/tasks/gridgames.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/examples/tasks/huntinggame.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/examples/tasks/huntinggame.pyc 
-------------------------------------------------------------------------------- /pybrainSG/rl/examples/tasks/staticgame.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/examples/tasks/staticgame.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/leaners/valuebased/nfqSG.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/leaners/valuebased/nfqSG.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/examples/linearfaSG/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/examples/linearfaSG/__init__.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/leaners/valuebased/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/leaners/valuebased/__init__.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/leaners/valuebased/learnerfaSG.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/leaners/valuebased/learnerfaSG.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/examples/ceq/example_huntingGame.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/examples/ceq/example_huntingGame.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/examples/nfqSG/example_gridgames.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/examples/nfqSG/example_gridgames.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/examples/nfqSG/example_huntingGame.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/examples/nfqSG/example_huntingGame.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/examples/ceq/example_gridgames_CEQ_NFQ.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/examples/ceq/example_gridgames_CEQ_NFQ.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/leaners/valuebased/indexablevaluebased.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/leaners/valuebased/indexablevaluebased.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/examples/ceq/example_huntinggame_CEQ-NFQ.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/examples/ceq/example_huntinggame_CEQ-NFQ.pyc -------------------------------------------------------------------------------- /pybrainSG/rl/examples/linearfaSG/example_huntingGame.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Multi-Agent-Reinforcement-Learning-in-Stochastic-Games/master/pybrainSG/rl/examples/linearfaSG/example_huntingGame.pyc -------------------------------------------------------------------------------- /.mamemose.rb: -------------------------------------------------------------------------------- 1 | DOCUMENT_ROOT = "~/memo" 2 | 3 | PORT = 8888 4 | 5 | MARKDOWN_PATTERN = /\.(md|markdown|txt)$/ 6 | 7 | RECENT_NUM = 0 8 | 9 | # RECENT_PATTERN = MARKDOWN_PATTERN 10 | 11 | CUSTOM_HEADER = <
13 | 14 | HEADER -------------------------------------------------------------------------------- /.pydevproject: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | /${PROJECT_DIR_NAME} 5 | 6 | python 2.7 7 | Default 8 | 9 | -------------------------------------------------------------------------------- /.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | Pybrain_StochasticGames 4 | 5 | 6 | 7 | 8 | 9 | org.python.pydev.PyDevBuilder 10 | 11 | 12 | 13 | 14 | 15 | org.python.pydev.pythonNature 16 | 17 | 18 | -------------------------------------------------------------------------------- /pybrainSG/rl/agents/indexable.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/02/19 3 | 4 | @author: takuya-hv2 5 | ''' 6 | __author__ = 'Takuya Hiraoka, takuya-h@is.naist.jp' 7 | 8 | from pybrain.rl.agents.agent import Agent 9 | 10 | class IndexableAgent(Agent): 11 | ''' 12 | Agent which can be indexed. 13 | ''' 14 | indexOfAgent=None 15 | 16 | def __init__(self, index=None): 17 | self.setIndexOfAgent(index) 18 | 19 | def setIndexOfAgent(self,index): 20 | """ set index to agent. 21 | :key index: index of agent 22 | :type index: integer 23 | """ 24 | self.indexOfAgent=index 25 | 26 | -------------------------------------------------------------------------------- /pybrainSG/rl/environments/episodicSG.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/02/19 3 | 4 | @author: takuya-hv2 5 | ''' 6 | from pybrain.utilities import abstractMethod 7 | from pybrain.rl.environments.task import Task 8 | 9 | class EpisodicTaskSG(Task): 10 | """Stochastic game version of EpisodicTask class""" 11 | 12 | def __init__(self, environment): 13 | Task.__init__(self,environment) 14 | 15 | def reset(self): 16 | """ Re-initialize the environment """ 17 | self.env.reset() 18 | 19 | def isFinished(self): 20 | """ Is the current episode over? """ 21 | abstractMethod() 22 | 23 | def performAction(self, jointAction): 24 | """ Execute joint action of all agents. """ 25 | Task.performAction(self, jointAction) 26 | -------------------------------------------------------------------------------- /pybrainSG/rl/leaners/valuebased/indexablevaluebased.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/02/19 3 | 4 | @author: takuya-hv2 5 | ''' 6 | from pybrain.rl.learners.valuebased.valuebased import ValueBasedLearner 7 | 8 | class IndexableValueBasedLearner(ValueBasedLearner): 9 | indexOfAgent=None 10 | ownerAgentProperties={ 11 | "requireOtherAgentsState": None, #Define if learner require, in addition to owner's state information, other agent state information as well. 12 | "requireJointAction":None, #Define if learner require, in addition to owner's state information, other agent action information as well. 13 | "requireJointReward":None}#Define if learner require, in addition to owner's state information, other agent reward information as well. 14 | 15 | def __init__(self, indexOfAgent=None, **kwargs): 16 | ValueBasedLearner.__init__(self) 17 | self.indexOfAgent=indexOfAgent 18 | 19 | def setIndexOfAgent(self, indexOfAgent): 20 | self.indexOfAgent=indexOfAgent 21 | 22 | def getProperty(self): 23 | for elem in self.ownerAgentProperties.values(): 24 | assert isinstance(elem,bool), "All property should be initialize with boolian." 
25 | return self.ownerAgentProperties -------------------------------------------------------------------------------- /pybrainSG/rl/examples/phc/example_gridgames.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/03/07 3 | 4 | @author: takuya-hv2 5 | ''' 6 | from pybrainSG.rl.experiments.episodicSG import EpisodicExperimentSG 7 | from pybrainSG.rl.examples.tasks.gridgames import * 8 | from pybrainSG.rl.agents.multiAgent import MultiAgent 9 | from pybrainSG.rl.leaners.valuebased.phc import * 10 | from pybrainSG.rl.agents.faphc import PHC_Agent 11 | 12 | if __name__ == '__main__': 13 | ma=MultiAgent() 14 | for i in range(GridGame.numberofAgents): 15 | # learner= PHC_NN( 16 | # num_features=(GridGame.numberofAgents*2), 17 | # num_actions=len(GridGame.availableActions)) 18 | learner= PHC_WoLF_NN( 19 | num_features=(GridGame.numberofAgents*2), 20 | num_actions=len(GridGame.availableActions)) 21 | agent= PHC_Agent(learner,numAgents=GridGame.numberofAgents,index=i) 22 | ma.addAgent(agent) 23 | task=GridGameTask() 24 | 25 | # task=GridGameTask(gameType="GG1") 26 | # task=GridGameTask(gameType="GG2") 27 | task=GridGameTask(gameType="GG3") 28 | exp=EpisodicExperimentSG(task,ma) 29 | print "Rewards for agents at the end of episode:" 30 | for i in range(40000): 31 | rewards=exp.doEpisodes(number=1) 32 | print str(rewards[0][-1]) 33 | -------------------------------------------------------------------------------- /pybrainSG/rl/examples/phc/example_staticGame.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/02/19 3 | 4 | @author: takuya-hv2 5 | ''' 6 | 7 | from pybrainSG.rl.experiments.episodicSG import EpisodicExperimentSG 8 | from pybrainSG.rl.agents.multiAgent import MultiAgent 9 | from pybrainSG.rl.agents.faphc import PHC_Agent 10 | from pybrainSG.rl.leaners.valuebased.phc import * 11 | from pybrainSG.rl.examples.tasks.staticgame import SimpleMatrixGame, StaticGameTask 12 | 13 | if __name__ == '__main__': 14 | ma=MultiAgent() 15 | for i in range(2): 16 | # learner= PHC_NN( 17 | # num_features=1, 18 | # num_actions=len(SimpleMatrixGame.availableActions)) 19 | learner= PHC_WoLF_NN( 20 | num_features=1, 21 | num_actions=len(SimpleMatrixGame.availableActions), 22 | ) 23 | learner.rewardDiscount=0.0 24 | agent= PHC_Agent(learner,numAgents=2,index=i) 25 | ma.addAgent(agent) 26 | task=StaticGameTask() 27 | 28 | exp=EpisodicExperimentSG(task,ma) 29 | rewards=exp.doEpisodes(number=1000) 30 | print "Given reward for " + str(len(rewards)) + " episodes:" 31 | print "Reward for Agent 1, Reward for Agent 2" 32 | for i in range(len(rewards)): 33 | print str(rewards[i][-1][0])+", "+str(rewards[i][-1][1]) 34 | -------------------------------------------------------------------------------- /pybrainSG/rl/examples/linearfaSG/example_staticGame.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/02/19 3 | 4 | @author: takuya-hv2 5 | ''' 6 | 7 | from pybrainSG.rl.experiments.episodicSG import EpisodicExperimentSG 8 | from pybrainSG.rl.leaners.valuebased.learnerfaSG import Q_LinFA_SG 9 | from pybrainSG.rl.agents.linearfaSG import LinearFA_AgentSG 10 | from pybrainSG.rl.examples.tasks.staticgame import SimpleMatrixGame, StaticGameTask 11 | from pybrainSG.rl.agents.multiAgent import MultiAgent 12 | import numpy as np 13 | if __name__ == '__main__': 14 | ma=MultiAgent() 15 | for i in range(2): 16 | learner= Q_LinFA_SG( 17 | 
num_features=1, 18 | num_actions=len(SimpleMatrixGame.availableActions)) 19 | agent= LinearFA_AgentSG(learner, 20 | num_features=np.ones((2,1)), 21 | num_actions=(np.ones(2)*len(SimpleMatrixGame.availableActions)), 22 | num_agents=2, 23 | index=i) 24 | ma.addAgent(agent) 25 | 26 | task=StaticGameTask() 27 | 28 | exp=EpisodicExperimentSG(task,ma) 29 | rewards=exp.doEpisodes(number=500) 30 | print "Given reward for " + str(len(rewards)) + " episodes:" 31 | print "Reward for Agent 1, Reward for Agent 2" 32 | for i in range(len(rewards)): 33 | print str(rewards[i][-1][0])+", "+str(rewards[i][-1][1]) 34 | -------------------------------------------------------------------------------- /pybrainSG/rl/examples/phc/example_huntingGame.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/02/19 3 | 4 | @author: takuya-hv2 5 | ''' 6 | from pybrainSG.rl.experiments.episodicSG import EpisodicExperimentSG 7 | from pybrainSG.rl.examples.tasks.huntinggame import HuntingGame, HuntingGameTask 8 | from pybrainSG.rl.agents.multiAgent import MultiAgent 9 | from pybrainSG.rl.leaners.valuebased.phc import * 10 | from pybrainSG.rl.agents.faphc import PHC_Agent 11 | 12 | if __name__ == '__main__': 13 | ma=MultiAgent() 14 | HuntingGame.numberofAgents=2 15 | for i in range(HuntingGame.numberofAgents): 16 | # learner= PHC_NN( 17 | # num_features=(HuntingGame.numberofAgents*2+HuntingGame.numberofAnimals*2+1), 18 | # num_actions=len(HuntingGame.availableActions)) 19 | learner= PHC_WoLF_NN( 20 | num_features=(HuntingGame.numberofAgents*2+HuntingGame.numberofAnimals*2+1), 21 | num_actions=len(HuntingGame.availableActions)) 22 | agent= PHC_Agent(learner,numAgents=HuntingGame.numberofAgents,index=i) 23 | ma.addAgent(agent) 24 | 25 | task=HuntingGameTask() 26 | 27 | print "Given reward for Agents" 28 | for i in range(10000): 29 | exp=EpisodicExperimentSG(task,ma) 30 | rewards=exp.doEpisodes(number=1) 31 | for i in range(len(rewards)): 32 | print str(rewards[i][-1][0]) 33 | 34 | -------------------------------------------------------------------------------- /pybrainSG/rl/examples/ceq/example_staticGame.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/02/19 3 | 4 | @author: takuya-hv2 5 | ''' 6 | 7 | from pybrainSG.rl.experiments.episodicSG import EpisodicExperimentSG 8 | from pybrainSG.rl.leaners.valuebased.ceq import * 9 | from pybrainSG.rl.agents.ceqa import * 10 | from pybrainSG.rl.examples.tasks.staticgame import SimpleMatrixGame, StaticGameTask 11 | from pybrainSG.rl.agents.multiAgent import MultiAgent 12 | import numpy as np 13 | if __name__ == '__main__': 14 | ma=MultiAgent() 15 | for i in range(2): 16 | 17 | learner= CEQ_Lin( 18 | num_features=1, 19 | num_actions=np.ones(2,dtype=np.int8)*len(SimpleMatrixGame.availableActions), 20 | num_agents=2, 21 | indexOfAgent=i) 22 | learner.rewardDiscount=0.0 23 | agent= CEQ_Agent(learner, 24 | num_features=1, 25 | num_actions=(np.ones(2)*len(SimpleMatrixGame.availableActions)), 26 | num_agents=2, 27 | index=i) 28 | ma.addAgent(agent) 29 | 30 | task=StaticGameTask() 31 | 32 | exp=EpisodicExperimentSG(task,ma) 33 | print "Reward for Agent 1, Reward for Agent 2" 34 | for i in range(50000): 35 | rewards=exp.doEpisodes(number=1) 36 | print str(rewards[0][-1][0])+", "+str(rewards[0][-1][1]) 37 | -------------------------------------------------------------------------------- /pybrainSG/rl/examples/sgsp/example_staticgame.py: 
-------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/03/10 3 | 4 | @author: takuya-hv2 5 | ''' 6 | 7 | from pybrainSG.rl.experiments.episodicSG import EpisodicExperimentSG 8 | from pybrainSG.rl.agents.multiAgent import MultiAgent 9 | from pybrainSG.rl.leaners.valuebased.sgsp import * 10 | from pybrainSG.rl.agents.sgspa import SGSP_Agent 11 | from pybrainSG.rl.examples.tasks.staticgame import SimpleMatrixGame, StaticGameTask 12 | 13 | if __name__ == '__main__': 14 | ma=MultiAgent() 15 | for i in range(2): 16 | learner= ON_SGSP_NN( 17 | num_features=1, 18 | num_actions=np.ones(2,dtype=np.int8)*len(SimpleMatrixGame.availableActions), 19 | num_agents=2, 20 | index=i) 21 | learner.rewardDiscount=0.0 22 | agent= SGSP_Agent( 23 | learner, 24 | num_actions=np.ones(2,dtype=np.int8)*len(SimpleMatrixGame.availableActions), 25 | numAgents=2, 26 | index=i) 27 | ma.addAgent(agent) 28 | task=StaticGameTask() 29 | 30 | exp=EpisodicExperimentSG(task,ma) 31 | rewards=exp.doEpisodes(number=1000) 32 | print "Given reward for " + str(len(rewards)) + " episodes:" 33 | print "Reward for Agent 1, Reward for Agent 2" 34 | for i in range(len(rewards)): 35 | print str(rewards[i][-1][0])+", "+str(rewards[i][-1][1]) 36 | -------------------------------------------------------------------------------- /pybrainSG/rl/examples/sgsp/example_gridgames.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/03/07 3 | 4 | @author: takuya-hv2 5 | ''' 6 | from pybrainSG.rl.experiments.episodicSG import EpisodicExperimentSG 7 | from pybrainSG.rl.examples.tasks.gridgames import * 8 | from pybrainSG.rl.agents.multiAgent import MultiAgent 9 | from pybrainSG.rl.leaners.valuebased.sgsp import * 10 | from pybrainSG.rl.agents.sgspa import SGSP_Agent 11 | 12 | if __name__ == '__main__': 13 | ma=MultiAgent() 14 | for i in range(GridGame.numberofAgents): 15 | learner= ON_SGSP_NN( 16 | num_features=(GridGame.numberofAgents*2), 17 | num_actions=np.ones(GridGame.numberofAgents,dtype=np.int8)*len(GridGame.availableActions), 18 | num_agents=GridGame.numberofAgents, 19 | index=i) 20 | agent= SGSP_Agent( 21 | learner, 22 | num_actions=np.ones(GridGame.numberofAgents,dtype=np.int8)*len(GridGame.availableActions), 23 | numAgents=GridGame.numberofAgents, 24 | index=i) 25 | ma.addAgent(agent) 26 | task=GridGameTask() 27 | 28 | # task=GridGameTask(gameType="GG1") 29 | # task=GridGameTask(gameType="GG2") 30 | task=GridGameTask(gameType="GG3") 31 | exp=EpisodicExperimentSG(task,ma) 32 | print "Rewards for agents at the end of episode:" 33 | for i in range(40000): 34 | rewards=exp.doEpisodes(number=1) 35 | print str(rewards[0][-1]) 36 | -------------------------------------------------------------------------------- /pybrainSG/rl/examples/linearfaSG/example_gridgames.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/03/07 3 | 4 | @author: takuya-hv2 5 | ''' 6 | from pybrainSG.rl.experiments.episodicSG import EpisodicExperimentSG 7 | from pybrainSG.rl.examples.tasks.gridgames import * 8 | from pybrainSG.rl.agents.multiAgent import MultiAgent 9 | from pybrainSG.rl.leaners.valuebased.learnerfaSG import Q_LinFA_SG 10 | from pybrainSG.rl.agents.linearfaSG import LinearFA_AgentSG 11 | 12 | if __name__ == '__main__': 13 | ma=MultiAgent() 14 | for i in range(GridGame.numberofAgents): 15 | learner= Q_LinFA_SG( 16 | num_features=(GridGame.numberofAgents*2), 17 | 
num_actions=len(GridGame.availableActions)) 18 | agent= LinearFA_AgentSG( 19 | learner, 20 | num_features=np.ones(GridGame.numberofAgents,dtype=np.int8)*(GridGame.numberofAgents*2), 21 | num_actions=np.ones(GridGame.numberofAgents,dtype=np.int8)*len(GridGame.availableActions), 22 | num_agents=GridGame.numberofAgents, 23 | index=i) 24 | ma.addAgent(agent) 25 | task=GridGameTask() 26 | 27 | # task=GridGameTask(gameType="GG1") 28 | # task=GridGameTask(gameType="GG2") 29 | task=GridGameTask(gameType="GG3") 30 | exp=EpisodicExperimentSG(task,ma) 31 | print "Rewards for agents at the end of episode:" 32 | for i in range(40000): 33 | rewards=exp.doEpisodes(number=1) 34 | print str(rewards[0][-1]) 35 | -------------------------------------------------------------------------------- /pybrainSG/rl/examples/linearfaSG/example_huntingGame.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/02/19 3 | 4 | @author: takuya-hv2 5 | ''' 6 | 7 | from pybrainSG.rl.experiments.episodicSG import EpisodicExperimentSG 8 | from pybrainSG.rl.leaners.valuebased.learnerfaSG import Q_LinFA_SG 9 | from pybrainSG.rl.agents.linearfaSG import LinearFA_AgentSG 10 | from pybrainSG.rl.examples.tasks.huntinggame import HuntingGame, HuntingGameTask 11 | from pybrainSG.rl.agents.multiAgent import MultiAgent 12 | import numpy as np 13 | if __name__ == '__main__': 14 | ma=MultiAgent() 15 | for i in range(HuntingGame.numberofAgents): 16 | learner= Q_LinFA_SG( 17 | num_features=(HuntingGame.numberofAgents*2+HuntingGame.numberofAnimals*2+1), 18 | num_actions=len(HuntingGame.availableActions)) 19 | agent= LinearFA_AgentSG( 20 | learner, 21 | num_features=np.ones(HuntingGame.numberofAgents)*(HuntingGame.numberofAgents*2+HuntingGame.numberofAnimals*2+1), 22 | num_actions=np.ones(HuntingGame.numberofAgents)*len(HuntingGame.availableActions), 23 | num_agents=HuntingGame.numberofAgents, 24 | index=i) 25 | ma.addAgent(agent) 26 | task=HuntingGameTask() 27 | exp=EpisodicExperimentSG(task,ma) 28 | rewards=exp.doEpisodes(number=1000) 29 | print "Given reward for " + str(len(rewards)) + " episodes:" 30 | print "Reward for Agents" 31 | for i in range(len(rewards)): 32 | print str(rewards[i][-1][0]) 33 | -------------------------------------------------------------------------------- /pybrainSG/rl/experiments/episodicSG.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/02/19 3 | 4 | @author: takuya-hv2 5 | ''' 6 | __author__ = 'Takuya Hiraoka, takuya-h@is.naist.jp' 7 | 8 | from pybrain.rl.experiments.experiment import Experiment 9 | from pybrainSG.rl.agents.multiAgent import MultiAgent 10 | from pybrainSG.rl.environments.episodicSG import EpisodicTaskSG 11 | 12 | class EpisodicExperimentSG(Experiment): 13 | """ Stochastic version of EpisodicExperiment class. """ 14 | def __init__(self, task, multiAgent): 15 | assert isinstance(task, EpisodicTaskSG), "task should be the subclass of EpisodicTaskSG." 16 | assert isinstance(multiAgent, MultiAgent), "task should be MultAgent." 17 | Experiment.__init__(self, task, multiAgent) 18 | 19 | 20 | def _oneInteraction(self): 21 | """ Do an interaction between the Task and Agents. 
""" 22 | self.stepid += 1 23 | self.agent.integrateObservation(self.task.getObservation()) 24 | self.task.performAction(self.agent.getJointAction()) 25 | reward = self.task.getReward() 26 | self.agent.giveJointReward(reward) 27 | return reward 28 | 29 | def doEpisodes(self, number = 1): 30 | """ Do one episode, and return the joint rewards of each step as a list. """ 31 | all_rewards = [] 32 | for dummy in range(number): 33 | self.agent.newEpisode() 34 | rewards = [] 35 | self.stepid = 0 36 | self.task.reset() 37 | while not self.task.isFinished(): 38 | r = self._oneInteraction() 39 | rewards.append(r) 40 | all_rewards.append(rewards) 41 | return all_rewards 42 | -------------------------------------------------------------------------------- /pybrainSG/rl/examples/ceq/example_gridgames.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/03/07 3 | 4 | @author: takuya-hv2 5 | ''' 6 | from pybrainSG.rl.experiments.episodicSG import EpisodicExperimentSG 7 | from pybrainSG.rl.agents.multiAgent import MultiAgent 8 | from pybrainSG.rl.leaners.valuebased.ceq import * 9 | from pybrainSG.rl.agents.ceqa import * 10 | from pybrainSG.rl.examples.tasks.gridgames import GridGameTask, GridGame 11 | import numpy as np 12 | import warnings 13 | 14 | if __name__ == '__main__': 15 | warnings.simplefilter("ignore") 16 | ma=MultiAgent() 17 | for i in range(GridGame.numberofAgents): 18 | learner= CEQ_Lin( 19 | num_features=(GridGame.numberofAgents*2), 20 | num_actions=np.ones(GridGame.numberofAgents,dtype=np.int8)*len(GridGame.availableActions), 21 | num_agents=GridGame.numberofAgents, 22 | indexOfAgent=i) 23 | agent= CEQ_Agent( 24 | learner, 25 | num_features=np.ones(GridGame.numberofAgents,dtype=np.int8)*(GridGame.numberofAgents*2), 26 | num_actions=np.ones(GridGame.numberofAgents,dtype=np.int8)*len(GridGame.availableActions), 27 | num_agents=GridGame.numberofAgents, 28 | index=i) 29 | ma.addAgent(agent) 30 | # task=GridGameTask(gameType="GG1") 31 | # task=GridGameTask(gameType="GG2") 32 | task=GridGameTask(gameType="GG3") 33 | exp=EpisodicExperimentSG(task,ma) 34 | print "Rewards for agents at the end of episode:" 35 | for i in range(40000): 36 | rewards=exp.doEpisodes(number=1) 37 | print str(rewards[0][-1]) 38 | -------------------------------------------------------------------------------- /pybrainSG/rl/examples/ceq/example_huntingGame.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/02/19 3 | 4 | @author: takuya-hv2 5 | ''' 6 | 7 | from pybrainSG.rl.experiments.episodicSG import EpisodicExperimentSG 8 | from pybrainSG.rl.leaners.valuebased.ceq import * 9 | from pybrainSG.rl.agents.ceqa import * 10 | from pybrainSG.rl.examples.tasks.huntinggame import HuntingGame, HuntingGameTask 11 | from pybrainSG.rl.agents.multiAgent import MultiAgent 12 | #from pybrain.unsupervised.trainers.deepbelief 13 | 14 | import numpy as np 15 | if __name__ == '__main__': 16 | # warnings.simplefilter("ignore") 17 | ma=MultiAgent() 18 | HuntingGame.numberofAgents=2 19 | for i in range(HuntingGame.numberofAgents): 20 | learner= CEQ_Lin( 21 | num_features=(HuntingGame.numberofAgents*2+HuntingGame.numberofAnimals*2+1), 22 | num_actions=np.ones(HuntingGame.numberofAgents,dtype=np.int8)*len(HuntingGame.availableActions), 23 | num_agents=HuntingGame.numberofAgents, 24 | indexOfAgent=i) 25 | agent= CEQ_Agent( 26 | learner, 27 | 
num_features=np.ones(HuntingGame.numberofAgents,dtype=np.int8)*(HuntingGame.numberofAgents*2+HuntingGame.numberofAnimals*2+1), 28 | num_actions=np.ones(HuntingGame.numberofAgents,dtype=np.int8)*len(HuntingGame.availableActions), 29 | num_agents=HuntingGame.numberofAgents, 30 | index=i) 31 | ma.addAgent(agent) 32 | task=HuntingGameTask() 33 | exp=EpisodicExperimentSG(task,ma) 34 | print "Reward for Agents" 35 | for i in range(40000): 36 | rewards=exp.doEpisodes(number=1) 37 | print str(rewards[0][-1][0]) 38 | -------------------------------------------------------------------------------- /pybrainSG/rl/examples/nfqSG/example_gridgames.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/03/06 3 | 4 | @author: takuya-hv2 5 | ''' 6 | from pybrain.rl.learners.valuebased.interface import ActionValueNetwork 7 | from pybrainSG.rl.experiments.episodicSG import EpisodicExperimentSG 8 | from pybrainSG.rl.agents.multiAgent import MultiAgent 9 | from pybrainSG.rl.leaners.valuebased.nfqSG import NFQ_SG 10 | from pybrainSG.rl.agents.learningSG import LearningAgentSG 11 | from pybrainSG.rl.examples.tasks.gridgames import GridGameTask, GridGame 12 | import numpy as np 13 | import warnings 14 | 15 | if __name__ == '__main__': 16 | warnings.simplefilter("ignore") 17 | for _ in range(500): 18 | ma=MultiAgent() 19 | for i in range(GridGame.numberofAgents): 20 | net=ActionValueNetwork(dimState=(GridGame.numberofAgents*2), 21 | numActions=len(GridGame.availableActions)) 22 | learner= NFQ_SG(maxEpochs=100) 23 | agent = LearningAgentSG(net, 24 | num_features=(np.ones(GridGame.numberofAgents)*(GridGame.numberofAgents*2)), 25 | num_actions=(np.ones(GridGame.numberofAgents)*len(GridGame.availableActions)), 26 | num_agents=GridGame.numberofAgents, 27 | learner=learner, 28 | index=i) 29 | ma.addAgent(agent) 30 | # task=GridGameTask(gameType="GG1") 31 | # task=GridGameTask(gameType="GG2") 32 | task=GridGameTask(gameType="GG3") 33 | exp=EpisodicExperimentSG(task,ma) 34 | print "Average reward for agents at the end of episode:" 35 | #Two phase learning 36 | rewards=exp.doEpisodes(number=10)#first phase 37 | ma.learn() 38 | for numBatch in range(40): 39 | avr=np.array([0.0,0.0]) 40 | for i in range(len(rewards)): 41 | avr+=rewards[i][-1] 42 | avr/=float(np.size(rewards,axis=0)) 43 | print avr 44 | rewards=exp.doEpisodes(number=10) 45 | ma.learn() 46 | -------------------------------------------------------------------------------- /pybrainSG/rl/examples/ceq/example_gridgames_CEQ_NFQ.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/03/06 3 | 4 | @author: takuya-hv2 5 | ''' 6 | from pybrainSG.rl.experiments.episodicSG import EpisodicExperimentSG 7 | from pybrainSG.rl.agents.multiAgent import MultiAgent 8 | from pybrainSG.rl.leaners.valuebased.ceq import * 9 | from pybrainSG.rl.agents.nfceqa import * 10 | from pybrainSG.rl.examples.tasks.gridgames import GridGameTask, GridGame 11 | import numpy as np 12 | import warnings 13 | 14 | if __name__ == '__main__': 15 | warnings.simplefilter("ignore") 16 | for _ in range(500): 17 | ma=MultiAgent() 18 | for i in range(GridGame.numberofAgents): 19 | learner= NFCEQ( 20 | num_features=(GridGame.numberofAgents*2), 21 | num_actions=np.ones(GridGame.numberofAgents,dtype=np.int8)*len(GridGame.availableActions), 22 | num_agents=GridGame.numberofAgents, 23 | max_epochs=100, 24 | indexOfAgent=i) 25 | agent= NFCEQ_Agent( 26 | learner, 27 | 
num_features=np.ones(GridGame.numberofAgents,dtype=np.int8)*(GridGame.numberofAgents*2), 28 | num_actions=np.ones(GridGame.numberofAgents,dtype=np.int8)*len(GridGame.availableActions), 29 | num_agents=GridGame.numberofAgents, 30 | index=i) 31 | ma.addAgent(agent) 32 | # task=GridGameTask(gameType="GG1") 33 | # task=GridGameTask(gameType="GG2") 34 | task=GridGameTask(gameType="GG3") 35 | exp=EpisodicExperimentSG(task,ma) 36 | print "Average reward for agents at the end of episode:" 37 | #Two phase learning 38 | rewards=exp.doEpisodes(number=30)#first phase 39 | ma.learn() 40 | for numBatch in range(40): 41 | avr=np.array([0.0,0.0]) 42 | for i in range(len(rewards)): 43 | avr+=rewards[i][-1] 44 | avr/=float(np.size(rewards,axis=0)) 45 | print avr 46 | rewards=exp.doEpisodes(number=10) 47 | ma.learn() 48 | -------------------------------------------------------------------------------- /pybrainSG/rl/examples/ceq/example_huntinggame_CEQ-NFQ.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/02/28 3 | 4 | @author: takuya-hv2 5 | ''' 6 | from pybrainSG.rl.experiments.episodicSG import EpisodicExperimentSG 7 | from pybrainSG.rl.leaners.valuebased.ceq import * 8 | from pybrainSG.rl.agents.nfceqa import * 9 | from pybrainSG.rl.examples.tasks.huntinggame import HuntingGame, HuntingGameTask 10 | from pybrainSG.rl.agents.multiAgent import MultiAgent 11 | 12 | import numpy as np 13 | if __name__ == '__main__': 14 | warnings.simplefilter("ignore") 15 | 16 | for _ in range(500): 17 | ma=MultiAgent() 18 | HuntingGame.numberofAgents=2 19 | for i in range(HuntingGame.numberofAgents): 20 | learner= NFCEQ( 21 | num_features=(HuntingGame.numberofAgents*2+HuntingGame.numberofAnimals*2+1), 22 | num_actions=np.ones(HuntingGame.numberofAgents,dtype=np.int8)*len(HuntingGame.availableActions), 23 | num_agents=HuntingGame.numberofAgents, 24 | max_epochs=100, 25 | indexOfAgent=i) 26 | agent= NFCEQ_Agent( 27 | learner, 28 | num_features=np.ones(HuntingGame.numberofAgents,dtype=np.int8)*(HuntingGame.numberofAgents*2+HuntingGame.numberofAnimals*2+1), 29 | num_actions=np.ones(HuntingGame.numberofAgents,dtype=np.int8)*len(HuntingGame.availableActions), 30 | num_agents=HuntingGame.numberofAgents, 31 | index=i) 32 | ma.addAgent(agent) 33 | task=HuntingGameTask() 34 | exp=EpisodicExperimentSG(task,ma) 35 | print "Reward for Agents" 36 | print "Average Reward for Agents (at the end of episode)" 37 | #Two phase leanring 38 | rewards=exp.doEpisodes(number=10)#firstphase 39 | ma.learn() 40 | for numBatch in range(40): 41 | avr=0.0 42 | for i in range(len(rewards)): 43 | #print str(rewards[i][-1][0]) 44 | #average 45 | avr+=rewards[i][-1][0] 46 | avr/=float(len(rewards)) 47 | print avr 48 | rewards=exp.doEpisodes(number=10) 49 | ma.learn() 50 | -------------------------------------------------------------------------------- /pybrainSG/rl/examples/tasks/staticgame.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/02/20 3 | 4 | @author: takuya-hv2 5 | ''' 6 | 7 | from pybrain.rl.environments import Environment 8 | from pybrainSG.rl.experiments.episodicSG import EpisodicTaskSG 9 | import numpy as np 10 | 11 | 12 | class StaticGameTask(EpisodicTaskSG): 13 | '''All agent make decision (Head or Tail) simultaneously only one time. 
''' 14 | 15 | isGameFinished=False 16 | def __init__(self): 17 | EpisodicTaskSG.__init__(self, SimpleMatrixGame()) 18 | 19 | def reset(self): 20 | EpisodicTaskSG.reset(self) 21 | self.isGameFinished=False 22 | 23 | def isFinished(self): 24 | return self.isGameFinished 25 | 26 | def getReward(self): 27 | self.isGameFinished=True 28 | return self.env.getJointReward() 29 | 30 | 31 | class SimpleMatrixGame(Environment): 32 | '''Corresponding to Heads and Tails respectively.''' 33 | availableActions=[0,1] 34 | '''payoff matrix of each agent in cooperative task scenario''' 35 | payoffMatricForAgent1=[[10,-10], 36 | [-10,-10]] 37 | payoffMatricForAgent2=[[10,-10], 38 | [-10,-10]] 39 | # '''payoff matrix of zero-sumgame scenario. nash equilibrium: (Agenat1's action=0,Agent2's action=1)''' 40 | # payoffMatricForAgent1=[[5,2], 41 | # [-1,6]] 42 | # payoffMatricForAgent2=[[-5,-2], 43 | # [1,-6]] 44 | # '''payoff matrix of zero-sumgame scenario. matching pennies''' 45 | # payoffMatricForAgent1=[[1,-1], 46 | # [-1,1]] 47 | # payoffMatricForAgent2=[[-1,1], 48 | # [1,-1]] 49 | 50 | outcomeForAfgenet1=None 51 | outcomeForAfgenet2=None 52 | 53 | def getSensors(self): 54 | return np.ones((2,1))#Static state (i.e., no state transition) 55 | 56 | def performAction(self, action): 57 | self.outcomeForAfgenet1=SimpleMatrixGame.payoffMatricForAgent1[action[0]][action[1]] 58 | self.outcomeForAfgenet2=SimpleMatrixGame.payoffMatricForAgent2[action[0]][action[1]] 59 | 60 | def reset(self): 61 | self.outcomeForAfgenet1=None 62 | self.outcomeForAfgenet2=None 63 | 64 | def getJointReward(self): 65 | return np.array([self.outcomeForAfgenet1,self.outcomeForAfgenet2]) 66 | 67 | -------------------------------------------------------------------------------- /pybrainSG/rl/examples/nfqSG/example_huntingGame.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/02/19 3 | 4 | @author: takuya-hv2 5 | ''' 6 | from pybrain.rl.learners.valuebased.interface import ActionValueNetwork 7 | from pybrainSG.rl.experiments.episodicSG import EpisodicExperimentSG 8 | from pybrainSG.rl.agents.multiAgent import MultiAgent 9 | from pybrainSG.rl.leaners.valuebased.nfqSG import NFQ_SG 10 | from pybrainSG.rl.agents.learningSG import LearningAgentSG 11 | from pybrainSG.rl.examples.tasks.huntinggame import HuntingGame, HuntingGameTask 12 | import numpy as np 13 | import warnings 14 | if __name__ == '__main__': 15 | warnings.simplefilter("ignore") 16 | for _ in range(500): 17 | ma=MultiAgent() 18 | # 19 | HuntingGame.numberofAgents=2 20 | for i in range(HuntingGame.numberofAgents): 21 | #dimState=# position of each agent in grid world + # position of each niman in grid world + bias 22 | net=ActionValueNetwork(dimState=(HuntingGame.numberofAgents*2+HuntingGame.numberofAnimals*2+1), 23 | numActions=len(HuntingGame.availableActions)) 24 | learner= NFQ_SG(maxEpochs=100)#hopefully, more than 100. 25 | # learner._explorer.epsilon=0.1#In one player case, that too small. 
26 |             #print learner.explorer
27 |             agent = LearningAgentSG(net,
28 |                                num_features=(np.ones(HuntingGame.numberofAgents)*(HuntingGame.numberofAgents*2+HuntingGame.numberofAnimals*2+1)),
29 |                                num_actions=(np.ones(HuntingGame.numberofAgents)*len(HuntingGame.availableActions)),
30 |                                num_agents=HuntingGame.numberofAgents,
31 |                                learner=learner,
32 |                                index=i)
33 |             ma.addAgent(agent)
34 | 
35 |         task=HuntingGameTask()
36 | 
37 |         exp=EpisodicExperimentSG(task,ma)
38 |         print "Reward for Agents"
39 |         print "Average Reward for Agents (at the end of episode)"
40 |         #Two-phase learning
41 |         rewards=exp.doEpisodes(number=10)# first phase
42 |         ma.learn()
43 |         for numBatch in range(40):
44 |             avr=0.0
45 |             for i in range(len(rewards)):
46 |                 #print str(rewards[i][-1][0])
47 |                 #average
48 |                 avr+=rewards[i][-1][0]
49 |             avr/=float(len(rewards))
50 |             print avr
51 |             rewards=exp.doEpisodes(number=10)
52 |             ma.learn()
53 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Multi-agent reinforcement learning in stochastic games
2 | ====
3 | 
4 | # What is this package?
5 | This package is an unofficial PyBrain extension for __multi-agent reinforcement learning__ in __general sum stochastic games__.
6 | The package provides 1) a framework for modeling general sum stochastic games and 2) multi-agent reinforcement learning algorithms for them.
7 | 
8 | 
9 | ## General sum stochastic games (GSSGs)
10 | GSSGs are a generalization of Markov decision processes (MDPs) to multi-agent situations, represented as a tuple <_D,_ _S,_ ___A,___ _T,_ ___R___> (right side of the following figure).
11 | _D_ represents the set of agents, _S_ represents the state of the environment, ___A___ represents a joint action of all agents, _T_ represents the state transition function, and ___R___ represents a joint reward for the agents. In contrast to MDPs, GSSGs allow multiple agents to affect the environment and receive rewards simultaneously.
12 | We can model many real-world phenomena with GSSGs (e.g., trading in a marketplace, negotiation among stakeholders, or collaborative tasks of robots).
13 | 
14 | ![img](./Images/MDPsandGSSGs.jpg "MDPs and GSSGs")
15 | 
16 | 
17 | ## Multi-agent reinforcement learning (MARL)
18 | MARL is used to learn agent policies $\pi$ concurrently.
19 | $\pi$ is a mapping function from an observed state _S_ to an agent action _A_ (see the figure above).
20 | Each agent's policy is learnt to maximize its own expected cumulative reward, converging to an equilibrium (typically a Nash equilibrium) in which no agent can improve its outcome by unilaterally changing its policy.
21 | This package provides variations of [PHC, PHC-WoLF](http://www.cs.cmu.edu/~mmv/papers/01ijcai-mike.pdf), [Correlated-Q Learning](https://www.aaai.org/Papers/ICML/2003/ICML03-034.pdf), and [SGSP](http://www.ifaamas.org/Proceedings/aamas2015/aamas/p1371.pdf), in addition to GSSG versions of the single-agent reinforcement learning algorithms implemented in PyBrain.
22 | 
23 | 
24 | # How to use this package?
25 | To use this package, we need to 1) install all requirements, 2) implement a GSSG that specifies the target domain, and 3) apply MARL to the implemented GSSG to learn agent policies.
26 | 
27 | ## 1. Install Requirements
28 | * Python 2.7.6
29 | * Numpy 1.11.0rc1+
30 | * Scipy 0.17.0+
31 | * PyBrain 0.3.3+
32 | 
33 | ## 2. Implement GSSGs
34 | Implement a class extending EpisodicTaskSG (pybrainSG.rl.environments.episodicSG) and a class extending Environment (pybrain.rl.environments.environment).
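For instance, a one-shot coin-matching game could be modeled roughly as follows. This is a minimal sketch patterned after `staticgame.py` in pybrainSG.rl.examples.tasks; the `CoinGame`/`CoinGameTask` names and the payoff rule are made up for illustration.

```python
# Minimal GSSG sketch (hypothetical example; names and payoffs are illustrative only).
from pybrain.rl.environments import Environment
from pybrainSG.rl.environments.episodicSG import EpisodicTaskSG
import numpy as np

class CoinGame(Environment):
    '''Two agents pick 0 or 1 simultaneously; matching choices are rewarded.'''
    availableActions = [0, 1]
    jointReward = None

    def getSensors(self):
        return np.ones((2, 1))  # static state (i.e., no state transition)

    def performAction(self, jointAction):
        # resolve the joint action of both agents into a joint reward
        if jointAction[0] == jointAction[1]:
            self.jointReward = np.array([1.0, 1.0])
        else:
            self.jointReward = np.array([-1.0, -1.0])

    def reset(self):
        self.jointReward = None

    def getJointReward(self):
        return self.jointReward

class CoinGameTask(EpisodicTaskSG):
    '''Episode ends after a single joint decision, as in StaticGameTask.'''
    isGameFinished = False

    def __init__(self):
        EpisodicTaskSG.__init__(self, CoinGame())

    def reset(self):
        EpisodicTaskSG.reset(self)
        self.isGameFinished = False

    def isFinished(self):
        return self.isGameFinished

    def getReward(self):
        self.isGameFinished = True  # one-shot game: finish after the first joint action
        return self.env.getJointReward()
```

StaticGameTask and SimpleMatrixGame in `staticgame.py` follow the same pattern, with explicit payoff matrices instead of the matching rule used here.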
35 | Some example implementations are provided in the following package:
36 | 
37 | * pybrainSG.rl.examples.tasks
38 | 
39 | For example, "gridgames.py" provides examples for the grid-world domain, and "staticgame.py" provides examples for the bi-matrix game domain.
40 | 
41 | ## 3. Apply MARL to implemented GSSGs
42 | To apply MARL to an implemented GSSG, we need to construct an agent set and an experiment.
43 | You can find construction examples in the following package:
44 | 
45 | * pybrainSG.rl.examples
46 | 
47 | For example, "example_gridgames.py" in the "ceq" package shows how to use one of the Correlated-Q learning implementations in the grid game domain.
48 | 
49 | # Future work
50 | Refactoring and cleaning up the source code.
51 | Introducing inverse reinforcement learning to estimate other agents' reward structures.
52 | 
53 | # Author
54 | [Takuya Hiraoka](http://isw3.naist.jp/~takuya-h/)
55 | 
--------------------------------------------------------------------------------
/pybrainSG/rl/leaners/valuebased/nfqSG.py:
--------------------------------------------------------------------------------
1 | from scipy import r_
2 | from pybrain.rl.learners.valuebased.valuebased import ValueBasedLearner
3 | from pybrain.datasets import SupervisedDataSet
4 | from pybrain.supervised.trainers.rprop import RPropMinusTrainer
5 | from pybrain.utilities import one_to_n
6 | from pybrainSG.rl.leaners.valuebased.learnerfaSG import IndexableValueBasedLearner
7 | from pybrain.tools.shortcuts import buildNetwork
8 | 
9 | class NFQ_SG(IndexableValueBasedLearner):#Mod. version
10 |     """
11 |     Stochastic game version of Neural-fitted Q-iteration
12 |     """
13 | 
14 |     def __init__(self, maxEpochs=20, indexOfAgent=None,):
15 |         ValueBasedLearner.__init__(self)
16 |         self.gamma = 0.9
17 |         self.maxEpochs = maxEpochs
18 |         #
19 |         self.ownerAgentProperties["requireOtherAgentsState"]=False
20 |         self.ownerAgentProperties["requireJointAction"]=False
21 |         self.ownerAgentProperties["requireJointReward"]=False
22 |         self.isFirstLerning=True
23 | 
24 |     def learn(self):
25 |         # convert reinforcement dataset to NFQ supervised dataset
26 |         supervised = SupervisedDataSet(self.module.network.indim, 1)
27 |         for seq in self.dataset[self.indexOfAgent]:
28 |             lastexperience = None
29 |             for state, action, reward in seq:
30 |                 if not lastexperience:
31 |                     # delay each experience in sequence by one
32 |                     lastexperience = (state, action, reward)
33 |                     continue
34 | 
35 |                 # use experience from last timestep to do Q update
36 |                 (state_, action_, reward_) = lastexperience
37 | 
38 |                 Q = self.module.getValue(state_, action_[0])
39 | 
40 |                 inp = r_[state_, one_to_n(action_[0], self.module.numActions)]
41 |                 if self.isFirstLerning:
42 |                     tgt = reward_
43 |                 else:
44 |                     tgt = Q + 0.5*(reward_ + self.gamma * max(self.module.getActionValues(state)) - Q)
45 |                 supervised.addSample(inp, tgt)
46 | 
47 |                 #for reward normalization
48 | 
49 |                 # update last experience with current one
50 |                 lastexperience = (state, action, reward)
51 | 
52 |         #Re-building networks is required in multiprocessing environments.
53 | params=self.module.network.params 54 | self.module.network=buildNetwork(self.module.indim+self.module.numActions, 55 | self.module.indim+self.module.numActions, 56 | 1) 57 | self.module.network._setParameters(params) 58 | 59 | # train module with backprop/rprop on dataset 60 | trainer = RPropMinusTrainer(self.module.network, dataset=supervised, batchlearning=True, verbose=False)#, weightdecay=0.01) 61 | trainer.trainUntilConvergence(maxEpochs=self.maxEpochs) 62 | if self.isFirstLerning: 63 | self.isFirstLerning=False 64 | # alternative: backprop, was not as stable as rprop 65 | # trainer = BackpropTrainer(self.module.network, dataset=supervised, learningrate=0.005, batchlearning=True, verbose=True) 66 | # trainer.trainUntilConvergence(maxEpochs=self.maxEpochs) 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /pybrainSG/rl/agents/nfceqa.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/02/28 3 | 4 | @author: takuya-hv2 5 | ''' 6 | from pybrainSG.rl.agents.loggingSG import LoggingAgentSG 7 | from pybrain.utilities import drawIndex 8 | from pybrainSG.rl.leaners.valuebased.ceq import NFCEQ 9 | from scipy import array 10 | 11 | class NFCEQ_Agent(LoggingAgentSG): 12 | """ 13 | Agent based on NFCEQ put on: 14 | pybrainSG.rl.leaners.valuebased.ceq 15 | """ 16 | init_exploration = 0.3 # aka epsilon 17 | exploration_decay = 0.98 # per episode 18 | 19 | init_temperature = 1. 20 | temperature_decay = 0.99 # per episode 21 | 22 | # flags for exploration strategies 23 | epsilonGreedy = True 24 | learning = True 25 | greedy = False 26 | 27 | def __init__(self, learner, num_features, num_actions, num_agents, index, **kwargs): 28 | assert isinstance(learner, NFCEQ), "learner should be instance of NFCEQ." 29 | self.learner = learner 30 | LoggingAgentSG.__init__(self, num_features, num_actions, num_agents, index, **kwargs) 31 | # if learner is available, tell it the module and data 32 | if self.learner is not None: 33 | self.learner.dataset = self.history 34 | self.learning = True 35 | self.learner._behaviorPolicy = self._actionProbs 36 | self.reset() 37 | 38 | self.agentProperties["requireOtherAgentsState"]=False 39 | self.agentProperties["requireJointAction"]=True 40 | self.agentProperties["requireJointReward"]=True 41 | for prop in self.learner.getProperty().keys(): 42 | if learner.getProperty()[prop]: 43 | assert self.getProperty()[prop], "learners property should same to that of agents." 44 | 45 | def _actionProbs(self, state): 46 | if self.greedy: 47 | return self.learner._greedyPolicy(state) 48 | elif self.epsilonGreedy: 49 | return (self.learner._greedyPolicy(state) * (1 - self._expl_proportion) 50 | + self._expl_proportion / float(self.learner.num_actions[self.indexOfAgent])) 51 | else: 52 | return self.learner._boltzmannPolicy(state, self._temperature) 53 | 54 | def getAction(self): 55 | self.lastaction = drawIndex(self._actionProbs(self.lastobs), True) 56 | return array([self.lastaction]) 57 | 58 | def integrateObservation(self, obs): 59 | LoggingAgentSG.integrateObservation(self, obs) 60 | 61 | def reset(self): 62 | LoggingAgentSG.reset(self) 63 | self._temperature = self.init_temperature 64 | self._expl_proportion = self.init_exploration 65 | self.learner.reset() 66 | self.newEpisode() 67 | 68 | def newEpisode(self): 69 | """ Indicate the beginning of a new episode in the training cycle. 
""" 70 | if self.logging: 71 | for i in range(self.numAgents): 72 | self.history[i].newSequence() 73 | if self.learning and not self.learner.batchMode: 74 | self.learner.newEpisode() 75 | else: 76 | self._temperature *= self.temperature_decay 77 | self._expl_proportion *= self.exploration_decay 78 | self.learner.newEpisode() 79 | 80 | 81 | def learn(self,episodes): 82 | assert isinstance(self.learner,NFCEQ), "learner should be an instance of CEQ-NFQ" 83 | self.learner.learn() 84 | 85 | def setIndexOfAgent(self,index): 86 | """ set index to agent. 87 | :key index: index of agent 88 | :type index: integer 89 | """ 90 | super(NFCEQ_Agent, self).setIndexOfAgent(index) 91 | self.learner.setIndexOfAgent(index) 92 | -------------------------------------------------------------------------------- /pybrainSG/rl/agents/learningSG.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/02/19 3 | 4 | @author: takuya-hv2 5 | ''' 6 | from pybrainSG.rl.agents.loggingSG import LoggingAgentSG 7 | from pybrainSG.rl.leaners.valuebased.indexablevaluebased import IndexableValueBasedLearner 8 | 9 | class LearningAgentSG(LoggingAgentSG): 10 | """ 11 | Variation of LearningAgent (pybrain.rl.agents.learning) for stochastic game, 12 | which can use some single-agent reinforcement learnings (currently only NFQ) put on: 13 | pybrainSG.rl.leaners.valuebased.nfqSG 14 | """ 15 | 16 | def __init__(self, module, num_features, num_actions, num_agents, index, learner): 17 | """ 18 | :key module: the acting module 19 | :key learner: the learner (optional) """ 20 | assert isinstance(learner, IndexableValueBasedLearner), "learner should be indexable." 21 | self.module = module 22 | self.learner = learner 23 | LoggingAgentSG.__init__(self, num_features, num_actions, num_agents,index) 24 | 25 | # if learner is available, tell it the module and data 26 | if self.learner is not None: 27 | self.learner.module = self.module 28 | self.learner.dataset = self.history 29 | 30 | self.learning = True 31 | 32 | self.agentProperties["requireOtherAgentsState"]=False 33 | self.agentProperties["requireJointAction"]=False 34 | self.agentProperties["requireJointReward"]=False 35 | #parity check 36 | for prop in self.learner.getProperty().keys(): 37 | if learner.getProperty()[prop]: 38 | assert self.getProperty()[prop], "learners property should same to that of agents." 39 | 40 | def _getLearning(self): 41 | """ Return whether the agent currently learns from experience or not. """ 42 | return self.__learning 43 | 44 | 45 | def _setLearning(self, flag): 46 | """ Set whether or not the agent should learn from its experience """ 47 | if self.learner is not None: 48 | self.__learning = flag 49 | else: 50 | self.__learning = False 51 | 52 | learning = property(_getLearning, _setLearning) 53 | 54 | 55 | def getAction(self): 56 | """ Activate the module with the last observation, add the exploration from 57 | the explorer object and store the result as last action. """ 58 | LoggingAgentSG.getAction(self) 59 | 60 | self.lastaction = self.module.activate(self.lastobs) 61 | 62 | if self.learning: 63 | self.lastaction = self.learner.explore(self.lastobs, self.lastaction) 64 | 65 | return self.lastaction 66 | 67 | 68 | def newEpisode(self): 69 | """ Indicate the beginning of a new episode in the training cycle. """ 70 | # reset the module when a new episode starts. 
71 | self.module.reset() 72 | 73 | if self.logging: 74 | for i in range(self.numAgents): 75 | self.history[i].newSequence() 76 | 77 | # inform learner about the start of a new episode 78 | if self.learning: 79 | self.learner.newEpisode() 80 | 81 | def reset(self): 82 | """ Clear the history of the agent and resets the module and learner. """ 83 | LoggingAgentSG.reset(self) 84 | self.module.reset() 85 | if self.learning: 86 | self.learner.reset() 87 | 88 | 89 | def learn(self, episodes=1): 90 | """ Call the learner's learn method, which has access to both module and history. """ 91 | if self.learning: 92 | self.learner.learnEpisodes(episodes) 93 | 94 | def setIndexOfAgent(self,index): 95 | """ set index to agent. 96 | :key index: index of agent 97 | :type index: integer 98 | """ 99 | super(LearningAgentSG, self).setIndexOfAgent(index) 100 | self.learner.setIndexOfAgent(index) 101 | 102 | -------------------------------------------------------------------------------- /pybrainSG/rl/leaners/valuebased/learnerfaSG.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/02/19 3 | 4 | @author: takuya-hv2 5 | ''' 6 | from pybrainSG.rl.leaners.valuebased.indexablevaluebased import IndexableValueBasedLearner 7 | from scipy import zeros, dot, exp, clip, randn 8 | from pybrain.utilities import r_argmax, setAllArgs 9 | 10 | class LinearFALearnerSG(IndexableValueBasedLearner): 11 | """ 12 | Stochastic game version of LinearFALearner 13 | """ 14 | 15 | learningRate = 0.5 # aka alpha: make sure this is being decreased by calls from the learning agent! 16 | learningRateDecay = 100 # aka n_0, but counting decay-calls 17 | 18 | randomInit = True 19 | 20 | rewardDiscount = 0.99 # aka gamma 21 | 22 | batchMode = False 23 | passNextAction = False # for the _updateWeights method 24 | # 25 | 26 | 27 | def __init__(self, num_features, num_actions, indexOfAgent=None, **kwargs): 28 | IndexableValueBasedLearner.__init__(self, indexOfAgent) 29 | setAllArgs(self, kwargs) 30 | self.explorer = None 31 | self.indexOfAgent=indexOfAgent 32 | self.num_actions = num_actions 33 | self.num_features = num_features 34 | if self.randomInit: 35 | self._theta = randn(self.num_actions, self.num_features) / 10. 36 | else: 37 | self._theta = zeros((self.num_actions, self.num_features)) 38 | self._additionalInit() 39 | self._behaviorPolicy = self._boltzmannPolicy 40 | self.reset() 41 | # 42 | self.ownerAgentProperties["requireOtherAgentsState"]=False 43 | self.ownerAgentProperties["requireJointAction"]=False 44 | self.ownerAgentProperties["requireJointReward"]=False 45 | 46 | 47 | def _additionalInit(self): 48 | pass 49 | 50 | def _qValues(self, state): 51 | """ Return vector of q-values for all actions, 52 | given the state(-features). """ 53 | return dot(self._theta, state) 54 | 55 | def _greedyAction(self, state): 56 | return r_argmax(self._qValues(state)) 57 | 58 | def _greedyPolicy(self, state): 59 | tmp = zeros(self.num_actions) 60 | tmp[self._greedyAction(state)] = 1 61 | return tmp 62 | 63 | def _boltzmannPolicy(self, state, temperature=1.): 64 | tmp = self._qValues(state) 65 | return LinearFALearnerSG._boltzmannProbs(tmp, temperature) 66 | 67 | @staticmethod 68 | def _boltzmannProbs(qvalues, temperature=1.): 69 | if temperature == 0: 70 | tmp = zeros(len(qvalues)) 71 | tmp[r_argmax(qvalues)] = 1. 
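        # temperature > 0: numerically stable softmax over the Q-values -- divide by the temperature,
        # shift by the maximum, clip the exponent to [-20, 0] so exp() cannot overflow, then normalize
        # to a probability vector.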
72 | else: 73 | tmp = qvalues / temperature 74 | tmp -= max(tmp) 75 | tmp = exp(clip(tmp, -20, 0)) 76 | return tmp / sum(tmp) 77 | 78 | def reset(self): 79 | IndexableValueBasedLearner.reset(self) 80 | self._callcount = 0 81 | self.newEpisode() 82 | 83 | def newEpisode(self): 84 | IndexableValueBasedLearner.newEpisode(self) 85 | self._callcount += 1 86 | self.learningRate *= ((self.learningRateDecay + self._callcount) 87 | / (self.learningRateDecay + self._callcount + 1.)) 88 | 89 | 90 | class Q_LinFA_SG(LinearFALearnerSG): 91 | """ Standard Q-learning with linear FA. """ 92 | 93 | def _updateWeights(self, state, action, reward, next_state): 94 | """ state and next_state are vectors, action is an integer. """ 95 | td_error = reward + self.rewardDiscount * max(dot(self._theta, next_state)) - dot(self._theta[action], state) 96 | #print(action, reward, td_error,self._theta[action], state, dot(self._theta[action], state)) 97 | #print(self.learningRate * td_error * state) 98 | #print() 99 | self._theta[action] += self.learningRate * td_error * state 100 | 101 | -------------------------------------------------------------------------------- /pybrainSG/rl/agents/sgspa.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/03/10 3 | 4 | @author: takuya-hv2 5 | ''' 6 | from pybrainSG.rl.agents.loggingSG import LoggingAgentSG 7 | from pybrain.utilities import drawIndex 8 | from pybrainSG.rl.leaners.valuebased.indexablevaluebased import IndexableValueBasedLearner 9 | from scipy import array 10 | import numpy as np 11 | 12 | #Implmenting now 13 | class SGSP_Agent(LoggingAgentSG): 14 | """ 15 | Agent based on SPSG RL algorithms put on: 16 | pybrainSG.rl.leaners.valuebased.spsg 17 | """ 18 | init_exploration = 0.005 # aka epsilon 19 | exploration_decay = 0.9999 # per episode 20 | 21 | # flags for exploration strategies 22 | epsilonGreedy = True 23 | learning = True 24 | 25 | def __init__(self, learner, num_actions, numAgents, index, **kwargs): 26 | assert isinstance(learner, IndexableValueBasedLearner), "learner should be indexable." 27 | self.learner = learner 28 | LoggingAgentSG.__init__(self, np.ones(numAgents)*learner.num_features, num_actions, numAgents, index, **kwargs) 29 | self.learner._behaviorPolicy = self._actionProbs 30 | self.reset() 31 | self.agentProperties["requireOtherAgentsState"]=False 32 | self.agentProperties["requireJointAction"]=True 33 | self.agentProperties["requireJointReward"]=True 34 | for prop in self.learner.getProperty().keys(): 35 | if learner.getProperty()[prop]: 36 | assert self.getProperty()[prop], "learners property should same to that of agents." 
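        # Note that the check above is one-directional: every property the learner requires must also
        # be enabled on the agent, while the agent itself may require additional information.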
37 | 38 | def _actionProbs(self, state): 39 | if not self.epsilonGreedy: 40 | return self.learner._softmaxPolicy(state) 41 | elif self.epsilonGreedy: 42 | return (self.learner._softmaxPolicy(state) * (1 - self._expl_proportion) 43 | + self._expl_proportion / float(self.learner.num_actions[self.indexOfAgent])) 44 | 45 | def getAction(self): 46 | self.lastaction = drawIndex(self._actionProbs(self.lastobs), True) 47 | if self.learning and not self.learner.batchMode and self._oaro is not None: 48 | self.learner._updateWeights(*(self._oaro + [self.lastaction])) 49 | self._oaro = None 50 | # print "Agent " + str(self.indexOfAgent) + ": " + str(self.lastaction) 51 | return array([self.lastaction]) 52 | 53 | def integrateObservation(self, obs): 54 | if self.learning and not self.learner.batchMode and self.lastobs is not None: 55 | if self.learner.passNextAction: 56 | self._oaro = [self.lastobs, self.lastaction, self.lastreward, obs] 57 | else: 58 | self.learner._updateWeights(self.lastobs, self.lastaction, self.lastreward, obs) 59 | LoggingAgentSG.integrateObservation(self, obs) 60 | 61 | def reset(self): 62 | LoggingAgentSG.reset(self) 63 | self._expl_proportion = self.init_exploration 64 | self.learner.reset() 65 | self._oaro = None 66 | self.newEpisode() 67 | 68 | def newEpisode(self): 69 | if self.logging: 70 | for i in range(self.numAgents): 71 | self.history[i].newSequence() 72 | if self.learning and not self.learner.batchMode: 73 | self.learner.newEpisode() 74 | else: 75 | self._expl_proportion *= self.exploration_decay 76 | self.learner.newEpisode() 77 | 78 | def learn(self): 79 | if not self.learning: 80 | return 81 | if not self.learner.batchMode: 82 | print('Learning is done online, and already finished.') 83 | return 84 | for seq in self.history[self.indexOfAgent]: 85 | for obs, action, reward in seq: 86 | if self.laststate is not None: 87 | self.learner._updateWeights(self.lastobs, self.lastaction, self.lastreward, obs) 88 | self.lastobs = obs 89 | self.lastaction = action[0] 90 | self.lastreward = reward 91 | self.learner.newEpisode() 92 | 93 | def setIndexOfAgent(self,index): 94 | """ indexing agent and its learner. 95 | :key index: index of agent 96 | :type index: integer 97 | """ 98 | super(SGSP_Agent, self).setIndexOfAgent(index) 99 | self.learner.setIndexOfAgent(index) 100 | -------------------------------------------------------------------------------- /pybrainSG/rl/agents/faphc.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/02/19 3 | 4 | @author: takuya-hv2 5 | ''' 6 | from pybrainSG.rl.agents.loggingSG import LoggingAgentSG 7 | from pybrain.utilities import drawIndex 8 | from pybrainSG.rl.leaners.valuebased.indexablevaluebased import IndexableValueBasedLearner 9 | from scipy import array 10 | import numpy as np 11 | class PHC_Agent(LoggingAgentSG): 12 | """ 13 | Agent based on PHC RL algorithms put on: 14 | pybrainSG.rl.leaners.valuebased.phc 15 | """ 16 | init_exploration = 0.01 # aka epsilon 17 | exploration_decay = 0.99 # per episode 18 | 19 | init_temperature = 1. 20 | temperature_decay = 0.99 # per episode 21 | 22 | # flags for exploration strategies 23 | epsilonGreedy = True 24 | learning = True 25 | 26 | def __init__(self, learner, numAgents, index, **kwargs): 27 | assert isinstance(learner, IndexableValueBasedLearner), "learner should be indexable." 
28 | self.learner = learner 29 | LoggingAgentSG.__init__(self, np.ones(numAgents)*learner.num_features, np.ones(numAgents), numAgents, index, **kwargs) 30 | self.learner._behaviorPolicy = self._actionProbs 31 | self.reset() 32 | self.agentProperties["requireOtherAgentsState"]=False 33 | self.agentProperties["requireJointAction"]=False 34 | self.agentProperties["requireJointReward"]=False 35 | for prop in self.learner.getProperty().keys(): 36 | if learner.getProperty()[prop]: 37 | assert self.getProperty()[prop], "learners property should same to that of agents." 38 | 39 | def _actionProbs(self, state): 40 | if not self.epsilonGreedy: 41 | return self.learner._softmaxPolicy(state) 42 | elif self.epsilonGreedy: 43 | return (self.learner._softmaxPolicy(state) * (1 - self._expl_proportion) 44 | + self._expl_proportion / float(self.learner.num_actions)) 45 | 46 | def getAction(self): 47 | self.lastaction = drawIndex(self._actionProbs(self.lastobs), True) 48 | if self.learning and not self.learner.batchMode and self._oaro is not None: 49 | self.learner._updateWeights(*(self._oaro + [self.lastaction])) 50 | self._oaro = None 51 | return array([self.lastaction]) 52 | 53 | def integrateObservation(self, obs): 54 | if self.learning and not self.learner.batchMode and self.lastobs is not None: 55 | if self.learner.passNextAction: 56 | self._oaro = [self.lastobs, self.lastaction, self.lastreward, obs] 57 | else: 58 | self.learner._updateWeights(self.lastobs, self.lastaction, self.lastreward, obs) 59 | LoggingAgentSG.integrateObservation(self, obs) 60 | 61 | def reset(self): 62 | LoggingAgentSG.reset(self) 63 | self._temperature = self.init_temperature 64 | self._expl_proportion = self.init_exploration 65 | self.learner.reset() 66 | self._oaro = None 67 | self.newEpisode() 68 | 69 | def newEpisode(self): 70 | if self.logging: 71 | for i in range(self.numAgents): 72 | self.history[i].newSequence() 73 | if self.learning and not self.learner.batchMode: 74 | self.learner.newEpisode() 75 | else: 76 | self._temperature *= self.temperature_decay 77 | self._expl_proportion *= self.exploration_decay 78 | self.learner.newEpisode() 79 | 80 | def learn(self): 81 | if not self.learning: 82 | return 83 | if not self.learner.batchMode: 84 | print('Learning is done online, and already finished.') 85 | return 86 | for seq in self.history[self.indexOfAgent]: 87 | for obs, action, reward in seq: 88 | if self.laststate is not None: 89 | self.learner._updateWeights(self.lastobs, self.lastaction, self.lastreward, obs) 90 | self.lastobs = obs 91 | self.lastaction = action[0] 92 | self.lastreward = reward 93 | self.learner.newEpisode() 94 | 95 | def setIndexOfAgent(self,index): 96 | """ indexing agent and its learner. 
97 | :key index: index of agent 98 | :type index: integer 99 | """ 100 | super(PHC_Agent, self).setIndexOfAgent(index) 101 | self.learner.setIndexOfAgent(index) 102 | -------------------------------------------------------------------------------- /pybrainSG/rl/agents/multiAgent.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/02/19 3 | 4 | @author: takuya-hv2 5 | ''' 6 | 7 | from pybrain.rl.agents.agent import Agent 8 | from pybrainSG.rl.agents.indexable import IndexableAgent 9 | from pybrainSG.rl.agents.loggingSG import LoggingAgentSG 10 | import numpy as np 11 | from multiprocessing import Process, Queue 12 | import copy_reg 13 | import types 14 | 15 | def _pickle_method(m): 16 | if m.im_self is None: 17 | return getattr, (m.im_class, m.im_func.func_name) 18 | else: 19 | return getattr, (m.im_self, m.im_func.func_name) 20 | 21 | copy_reg.pickle(types.MethodType, _pickle_method) 22 | 23 | class MultiAgent(Agent): 24 | ''' 25 | This class defines set of agents. 26 | Each agent should be instance of IndexableAgent or its subclass. 27 | ''' 28 | agentSet=[] 29 | 30 | def __init__(self): 31 | Agent.__init__(self) 32 | self.agentSet=[] 33 | 34 | def integrateObservation(self, obs): 35 | """ Integrate the current observation of the environment. 36 | :arg obs: The last observation returned from the environment 37 | :type obs: by default, this is assumed to be a numpy array of doubles 38 | """ 39 | for index in range(len(self.agentSet)): 40 | if self.agentSet[index].getProperty()["requireOtherAgentsState"]: 41 | self.agentSet[index].integrateObservation(obs) 42 | else: 43 | self.agentSet[index].integrateObservation(obs[index]) 44 | 45 | def getJointAction(self): 46 | """ Return a chosen joint-action. 47 | :rtype: by default, this is assumed to ba a numpy array of integers that correspond to particular action at each. 48 | """ 49 | jointAction=np.zeros(len(self.agentSet), dtype=np.int) 50 | for index in range(len(self.agentSet)): 51 | jointAction[index]=self.agentSet[index].getAction() 52 | for index in range(len(self.agentSet)): 53 | if isinstance(self.agentSet[index], LoggingAgentSG) and self.agentSet[index].getProperty()["requireJointAction"]: 54 | self.agentSet[index].lastaction=jointAction 55 | else: 56 | self.agentSet[index].lastaction=jointAction[index] 57 | return jointAction 58 | 59 | def _getAction(self,q, agent, index): 60 | act=agent.getAction() 61 | q.put([index,act]) 62 | 63 | def giveJointReward(self, r): 64 | """ give joint-teward to all agents. 
65 | :key r: joint reward 66 | :type r: numpy array of doubles 67 | """ 68 | for index in range(len(self.agentSet)): 69 | if self.agentSet[index].getProperty()["requireJointReward"]: 70 | self.agentSet[index].giveReward(r) 71 | else: 72 | self.agentSet[index].giveReward(r[index]) 73 | 74 | def reset(self): 75 | for agent in self.agentSet: 76 | agent.reset() 77 | 78 | def learn(self, episodes=1): 79 | procs=[] 80 | i=0 81 | qResult=Queue() 82 | for agent in self.agentSet: 83 | procs.append(Process(target=self._paraLearn, kwargs={"agent":agent,"episodes":episodes,"qResult":qResult})) 84 | i+=1 85 | for proc in procs: 86 | proc.start() 87 | for _ in range(len(self.agentSet)): 88 | res=qResult.get() 89 | self.agentSet[res[0]]=res[1] 90 | 91 | def _paraLearn(self, agent, episodes, qResult): 92 | agent.learn(episodes) 93 | qResult.put([agent.indexOfAgent, agent]) 94 | 95 | def newEpisode(self): 96 | for agent in self.agentSet: 97 | agent.newEpisode() 98 | 99 | def addAgent(self, agent): 100 | assert isinstance(agent, IndexableAgent), "agent should be IndxableAgent class or its subclass." 101 | assert agent.indexOfAgent is not None, "Index should be identified" 102 | if len(self.agentSet) ==0: 103 | assert agent.indexOfAgent==0, "Illegal indexing." 104 | else: 105 | ind=0 106 | for elem in self.agentSet: 107 | assert ind == (elem.indexOfAgent), "Illegal indexing." 108 | ind+=1 109 | assert agent.indexOfAgent==ind, "Illegal indexing." 110 | self.agentSet.append(agent) 111 | 112 | def popAgent(self, index): 113 | agent=self.agentSet.pop(index) 114 | agent.setIndexOfAgent(None) 115 | 116 | -------------------------------------------------------------------------------- /pybrainSG/rl/examples/tasks/huntinggame.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/02/20 3 | 4 | @author: takuya-hv2 5 | ''' 6 | 7 | from pybrain.rl.environments import Environment 8 | from pybrainSG.rl.environments.episodicSG import EpisodicTaskSG 9 | import numpy as np 10 | 11 | #Integrated to gridgames.py in future. 12 | class HuntingGameTask(EpisodicTaskSG): 13 | ''''Agents hunt animals in grid world. If all agents gather into particular place where more than one animal stay, 14 | hunting is succeeded and agents are rewarded. 15 | Agents are punished as turn passes. 16 | ''' 17 | isGameFinished=False 18 | maximumTurn=10 19 | currentTurn=0 20 | def __init__(self,task=None): 21 | if task == None: 22 | task=HuntingGame() 23 | EpisodicTaskSG.__init__(self, task) 24 | 25 | def reset(self): 26 | EpisodicTaskSG.reset(self) 27 | self.isGameFinished=False 28 | self.currentTurn=0 29 | 30 | def isFinished(self): 31 | return self.isGameFinished 32 | 33 | def getReward(self): 34 | jointReward=self.env.getJointReward() 35 | if (self.env.getJointReward()[0] > 0): 36 | self.isGameFinished=True 37 | 38 | #Time pressure 39 | jointReward=jointReward-1 40 | if (self.currentTurn >= HuntingGameTask.maximumTurn): 41 | self.isGameFinished=True 42 | #print str(jointReward[0])#+", " + str(jointReward[1]) 43 | self.currentTurn+=1 44 | return jointReward 45 | 46 | 47 | class HuntingGame(Environment): 48 | availableActions=[0,1,2,3,4]#Corresponding to forward north, west, south, east, stay respectively. 
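    # Every agent receives the same observation from getSensors(): the flattened animal positions,
    # the flattened agent positions, and a constant bias feature of 1.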
49 | sizeofGlidWorld=3 50 | numberofAnimals=1 51 | numberofAgents=2 52 | animals=None 53 | agents=None 54 | 55 | def getSensors(self): 56 | for i in range(HuntingGame.numberofAnimals): 57 | if np.random.rand() > 0.8: 58 | self.animals[i]=self.__move__(self.animals[i], np.random.randint(5)) 59 | stateTemp1=np.append(self.animals.flatten(),self.agents.flatten()) 60 | stateTemp2=np.append(stateTemp1,np.ones(1)) 61 | stateTemp3=[] 62 | for _ in range(self.numberofAgents): 63 | stateTemp3.append(stateTemp2) 64 | 65 | return stateTemp3 66 | 67 | def performAction(self, action): 68 | for i in range(HuntingGame.numberofAgents): 69 | self.agents[i]=self.__move__(self.agents[i], action[i]) 70 | 71 | def isSucceedHunting(self): 72 | #return true only if all agent gather in one place where animal exists 73 | for i in range(HuntingGame.numberofAgents): 74 | for j in range(HuntingGame.numberofAgents): 75 | if(self.agents[i][0] != self.agents[j][0]) or (self.agents[i][1] != self.agents[j][1]): 76 | return False 77 | for k in range(HuntingGame.numberofAnimals): 78 | if(self.agents[0][0] == self.animals[k][0]) and (self.agents[0][1] == self.animals[k][1]): 79 | return True 80 | return False 81 | 82 | def __move__(self,position, forward): 83 | if forward == 0:#Move North 84 | position[1]+=1 85 | elif forward == 1:#Move west 86 | position[0]-=1 87 | elif forward == 2:#Move south 88 | position[1]-=1 89 | elif forward == 3:#Move east 90 | position[0]+=1 91 | elif forward == 4:#stay here 92 | return position 93 | else: 94 | assert False, "Unexpected action" 95 | 96 | if position[0] >= HuntingGame.sizeofGlidWorld: 97 | position[0]=HuntingGame.sizeofGlidWorld-1 98 | if position[0] < 0: 99 | position[0]=0 100 | if position[1] >= HuntingGame.sizeofGlidWorld: 101 | position[1]=HuntingGame.sizeofGlidWorld-1 102 | if position[1] < 0: 103 | position[1]=0 104 | return position 105 | 106 | def reset(self): 107 | # self.animals=np.random.randint(HuntingGame.sizeofGlidWorld,size=(HuntingGame.numberofAnimals,2)) 108 | self.animals=np.zeros((HuntingGame.numberofAnimals,2)) 109 | self.agents=np.random.randint(HuntingGame.sizeofGlidWorld,size=(HuntingGame.numberofAgents,2)) 110 | # self.agents=np.ones((HuntingGame.numberofAgents,2)) 111 | 112 | 113 | def getJointReward(self): 114 | if self.isSucceedHunting(): 115 | return np.ones(self.numberofAgents)*10 116 | return np.zeros(self.numberofAgents) 117 | 118 | 119 | 120 | 121 | -------------------------------------------------------------------------------- /pybrainSG/rl/agents/linearfaSG.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/02/19 3 | 4 | @author: takuya-hv2 5 | ''' 6 | from pybrainSG.rl.agents.loggingSG import LoggingAgentSG 7 | from pybrain.utilities import drawIndex 8 | from pybrainSG.rl.leaners.valuebased.indexablevaluebased import IndexableValueBasedLearner 9 | from scipy import array 10 | import numpy as np 11 | class LinearFA_AgentSG(LoggingAgentSG): 12 | """ 13 | Agent based on simple Q-learning put on: 14 | pybrainSG.rl.leaners.valuebased.learnerfaSG 15 | """ 16 | 17 | init_exploration = 0.1 # aka epsilon 18 | exploration_decay = 0.99 # per episode 19 | 20 | init_temperature = 1. 
21 | temperature_decay = 0.99 # per episode 22 | 23 | # flags for exploration strategies 24 | epsilonGreedy = False 25 | learning = True 26 | greedy = False 27 | 28 | def __init__(self, learner, num_features, num_actions, num_agents, index, **kwargs): 29 | assert isinstance(learner, IndexableValueBasedLearner), "learner should be indexable." 30 | self.learner = learner 31 | LoggingAgentSG.__init__(self, num_features, num_actions, num_agents, index, **kwargs) 32 | self.learner._behaviorPolicy = self._actionProbs 33 | self.reset() 34 | self.agentProperties["requireOtherAgentsState"]=False 35 | self.agentProperties["requireJointAction"]=False 36 | self.agentProperties["requireJointReward"]=False 37 | for prop in self.learner.getProperty().keys(): 38 | if learner.getProperty()[prop]: 39 | assert self.getProperty()[prop], "learners property should same to that of agents." 40 | 41 | def _actionProbs(self, state): 42 | if self.greedy: 43 | return self.learner._greedyPolicy(state) 44 | elif self.epsilonGreedy: 45 | return (self.learner._greedyPolicy(state) * (1 - self._expl_proportion) 46 | + self._expl_proportion / float(self.learner.num_actions)) 47 | else: 48 | return self.learner._boltzmannPolicy(state, self._temperature) 49 | 50 | def getAction(self): 51 | self.lastaction = drawIndex(self._actionProbs(self.lastobs), True) 52 | if self.learning and not self.learner.batchMode and self._oaro is not None: 53 | self.learner._updateWeights(*(self._oaro + [self.lastaction])) 54 | self._oaro = None 55 | return array([self.lastaction]) 56 | 57 | def integrateObservation(self, obs): 58 | if self.learning and not self.learner.batchMode and self.lastobs is not None: 59 | if self.learner.passNextAction: 60 | self._oaro = [self.lastobs, self.lastaction, self.lastreward, obs] 61 | else: 62 | self.learner._updateWeights(self.lastobs, self.lastaction, self.lastreward, obs) 63 | LoggingAgentSG.integrateObservation(self, obs) 64 | 65 | def reset(self): 66 | LoggingAgentSG.reset(self) 67 | self._temperature = self.init_temperature 68 | self._expl_proportion = self.init_exploration 69 | self.learner.reset() 70 | self._oaro = None 71 | self.newEpisode() 72 | 73 | def newEpisode(self): 74 | if self.logging: 75 | for i in range(self.numAgents): 76 | self.history[i].newSequence() 77 | if self.learning and not self.learner.batchMode: 78 | self.learner.newEpisode() 79 | else: 80 | self._temperature *= self.temperature_decay 81 | self._expl_proportion *= self.exploration_decay 82 | self.learner.newEpisode() 83 | 84 | 85 | def learn(self): 86 | if not self.learning: 87 | return 88 | if not self.learner.batchMode: 89 | print('Learning is done online, and already finished.') 90 | return 91 | for seq in self.history[self.indexOfAgent]: 92 | for obs, action, reward in seq: 93 | if self.laststate is not None: 94 | self.learner._updateWeights(self.lastobs, self.lastaction, self.lastreward, obs) 95 | self.lastobs = obs 96 | self.lastaction = action[0] 97 | self.lastreward = reward 98 | self.learner.newEpisode() 99 | 100 | def setIndexOfAgent(self,index): 101 | """ indexing agent and its learner. 
102 | :key index: index of agent 103 | :type index: integer 104 | """ 105 | super(LinearFA_AgentSG, self).setIndexOfAgent(index) 106 | self.learner.setIndexOfAgent(index) 107 | -------------------------------------------------------------------------------- /pybrainSG/rl/agents/ceqa.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/02/26 3 | 4 | @author: takuya-hv2 5 | ''' 6 | from pybrainSG.rl.agents.loggingSG import LoggingAgentSG 7 | from pybrain.utilities import drawIndex 8 | from pybrainSG.rl.leaners.valuebased.indexablevaluebased import IndexableValueBasedLearner 9 | from scipy import array 10 | import numpy as np 11 | class CEQ_Agent(LoggingAgentSG): 12 | """ 13 | Agent based on CE-Q RL algorithms put on: 14 | pybrainSG.rl.leaners.valuebased.ceq 15 | """ 16 | init_exploration = 0.3 # aka epsilon 17 | exploration_decay = 0.99 # per episode 18 | 19 | init_temperature = 1. 20 | temperature_decay = 0.99 # per episode 21 | 22 | # flags for exploration strategies 23 | epsilonGreedy = True 24 | learning = True 25 | greedy = False 26 | 27 | def __init__(self, learner, num_features, num_actions, num_agents, index, **kwargs): 28 | self.learner = learner 29 | LoggingAgentSG.__init__(self, np.ones(num_agents, dtype=np.int8)*num_features, num_actions, num_agents, index, **kwargs) 30 | assert isinstance(learner, IndexableValueBasedLearner), "learner should be indexable." 31 | self.learner._behaviorPolicy = self._actionProbs 32 | self.reset() 33 | self.agentProperties["requireOtherAgentsState"]=False 34 | self.agentProperties["requireJointAction"]=True 35 | self.agentProperties["requireJointReward"]=True 36 | for prop in self.learner.getProperty().keys(): 37 | if learner.getProperty()[prop]: 38 | assert self.getProperty()[prop], "learners property should same to that of agents." 
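        # Action selection below follows the exploration flags: greedy uses the learner's greedy
        # policy, epsilonGreedy mixes it with uniform exploration, and otherwise a Boltzmann policy
        # with the current temperature is used.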
39 | 40 | def _actionProbs(self, state): 41 | if self.greedy: 42 | return self.learner._greedyPolicy(state) 43 | elif self.epsilonGreedy: 44 | return (self.learner._greedyPolicy(state) * (1 - self._expl_proportion) 45 | + self._expl_proportion / float(self.learner.num_actions[self.indexOfAgent])) 46 | else: 47 | return self.learner._boltzmannPolicy(state, self._temperature) 48 | 49 | def getAction(self): 50 | self.lastaction = drawIndex(self._actionProbs(self.lastobs), True) 51 | if self.learning and not self.learner.batchMode and self._oaro is not None: 52 | self.learner._updateWeights(*(self._oaro + [self.lastaction])) 53 | self._oaro = None 54 | return array([self.lastaction]) 55 | 56 | def integrateObservation(self, obs): 57 | if self.learning and not self.learner.batchMode and self.lastobs is not None: 58 | if self.learner.passNextAction: 59 | self._oaro = [self.lastobs, self.lastaction, self.lastreward, obs] 60 | else: 61 | self.learner._updateWeights(self.lastobs, self.lastaction, self.lastreward, obs) 62 | LoggingAgentSG.integrateObservation(self, obs) 63 | 64 | def reset(self): 65 | LoggingAgentSG.reset(self) 66 | self._temperature = self.init_temperature 67 | self._expl_proportion = self.init_exploration 68 | self.learner.reset() 69 | self._oaro = None 70 | self.newEpisode() 71 | 72 | def newEpisode(self): 73 | if self.logging: 74 | for i in range(self.numAgents): 75 | self.history[i].newSequence() 76 | if self.learning and not self.learner.batchMode: 77 | self.learner.newEpisode() 78 | else: 79 | self._temperature *= self.temperature_decay 80 | self._expl_proportion *= self.exploration_decay 81 | self.learner.newEpisode() 82 | 83 | def learn(self): 84 | if not self.learning: 85 | return 86 | if not self.learner.batchMode: 87 | print('Learning is done online, and already finished.') 88 | return 89 | for seq in self.history[self.indexOfAgent]: 90 | for obs, action, reward in seq: 91 | if self.laststate is not None: 92 | self.learner._updateWeights(self.lastobs, self.lastaction, self.lastreward, obs) 93 | self.lastobs = obs 94 | self.lastaction = action[0] 95 | self.lastreward = reward 96 | self.learner.newEpisode() 97 | 98 | def setIndexOfAgent(self,index): 99 | """ indexing agent and its learner. 100 | :key index: index of agent 101 | :type index: integer 102 | """ 103 | super(CEQ_Agent, self).setIndexOfAgent(index) 104 | self.learner.setIndexOfAgent(index) 105 | -------------------------------------------------------------------------------- /pybrainSG/rl/agents/loggingSG.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/02/19 3 | 4 | @author: takuya-hv2 5 | ''' 6 | from pybrainSG.rl.agents.indexable import IndexableAgent 7 | from pybrain.datasets.reinforcement import ReinforcementDataSet 8 | import numpy as np 9 | class LoggingAgentSG(IndexableAgent): 10 | """ This agent stores actions, states, and rewards encountered during 11 | interaction with an environment in a ReinforcementDataSet (which is 12 | a variation of SequentialDataSet). 13 | The stored history can be used for learning and is erased by resetting 14 | the agent. It also makes sure that integrateObservation, getAction and 15 | giveReward are called in exactly that order. 16 | """ 17 | 18 | logging = True 19 | 20 | lastobs = None 21 | lastaction = None 22 | lastreward = None 23 | 24 | agentProperties={ 25 | "requireOtherAgentsState": None, #Define if agent require other agent state information. 
26 | "requireJointAction":None, #Define if agent require other agent action information. 27 | "requireJointReward":None}#Define if agent require other agent reward information. 28 | 29 | 30 | def __init__(self, indims, outdims, numAgents, index=None, **kwargs): 31 | IndexableAgent.__init__(self, index) 32 | self.setArgs(**kwargs) 33 | 34 | # store input and output dimension #input, output dimension for each agent 35 | self.indim = indims 36 | self.outdim = outdims 37 | self.numAgents=numAgents 38 | # create the history dataset 39 | self.history=[] 40 | for i in range(self.numAgents): 41 | self.history.append(ReinforcementDataSet(self.indim[i], self.outdim[i])) 42 | 43 | 44 | def integrateObservation(self, obs): 45 | """Step 1: store the observation received in a temporary variable until action is called and 46 | reward is given. """ 47 | self.lastobs = obs 48 | self.lastaction = None 49 | self.lastreward = None 50 | 51 | 52 | def getAction(self): 53 | """Step 2: store the action in a temporary variable until reward is given. """ 54 | assert self.lastobs != None 55 | assert self.lastaction == None 56 | assert self.lastreward == None 57 | # implement getAction in subclass and set self.lastaction 58 | 59 | 60 | def giveReward(self, r): 61 | """Step 3: store observation, action and reward in the history dataset. """ 62 | # step 3: assume that state and action have been set 63 | assert self.lastobs != None 64 | assert self.lastaction != None 65 | assert self.lastreward == None 66 | 67 | self.lastreward = r 68 | 69 | # store state, action and reward in dataset if logging is enabled 70 | if self.logging: 71 | for i in range(self.numAgents): 72 | tlastobs=None 73 | tlastaction=None 74 | tlastreward=None 75 | 76 | if self.getProperty()["requireOtherAgentsState"]: 77 | tlastobs=self.lastobs[i] 78 | elif i==self.indexOfAgent: 79 | tlastobs=self.lastobs 80 | else: 81 | tlastobs=np.zeros(self.indim[i]) 82 | if self.getProperty()["requireJointAction"]: 83 | tlastaction=self.lastaction[i] 84 | elif i==self.indexOfAgent: 85 | tlastaction=self.lastaction 86 | else: 87 | tlastaction=np.zeros(self.outdim[i]) 88 | if self.getProperty()["requireJointReward"]: 89 | tlastreward=self.lastreward[i] 90 | elif i==self.indexOfAgent: 91 | tlastreward=self.lastreward 92 | else: 93 | tlastreward=np.zeros(1) 94 | self.history[i].addSample(tlastobs, tlastaction, tlastreward) 95 | 96 | def newEpisode(self): 97 | """ Indicate the beginning of a new episode in the training cycle. """ 98 | if self.logging: 99 | for i in range(self.numAgents): 100 | self.history[i].newSequence() 101 | 102 | 103 | def reset(self): 104 | """ Clear the history of the agent. """ 105 | self.lastobs = None 106 | self.lastaction = None 107 | self.lastreward = None 108 | for i in range(self.numAgents): 109 | self.history[i].clear() 110 | 111 | def getProperty(self): 112 | for elem in self.agentProperties.values(): 113 | assert isinstance(elem,bool), "All property should be initialize with proper boolean values." 
114 | return self.agentProperties 115 | -------------------------------------------------------------------------------- /pybrainSG/rl/examples/tasks/gridgames.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/03/06 3 | 4 | @author: takuya-hv2 5 | ''' 6 | ''' 7 | Created on 2016/02/20 8 | 9 | @author: takuya-hv2 10 | ''' 11 | 12 | from pybrain.rl.environments import Environment 13 | from pybrainSG.rl.environments.episodicSG import EpisodicTaskSG 14 | import numpy as np 15 | import copy 16 | 17 | class GridGameTask(EpisodicTaskSG): 18 | '''' ''' 19 | maximumTurn=30 20 | def __init__(self,gameType="GG1"): 21 | ''' 22 | gameType: indicates game type an experiment perform: 23 | [GG1:] simple coordinate game 24 | [GG2:] "Battle of the Sexes" 25 | [GG3:] "Chicken" 26 | See the following paper for detailed descriptions: 27 | https://www.aaai.org/Papers/ICML/2003/ICML03-034.pdf 28 | ''' 29 | EpisodicTaskSG.__init__(self, GridGame(gameType)) 30 | 31 | def reset(self): 32 | EpisodicTaskSG.reset(self) 33 | self.isGameFinished=False 34 | self.currentTurn=0 35 | 36 | def isFinished(self): 37 | if self.currentTurn > self.maximumTurn: 38 | return True 39 | return self.env.isReachGoal 40 | 41 | def getReward(self): 42 | self.currentTurn+=1 43 | # print "rew:"+str(self.env.getJointReward()) 44 | return self.env.getJointReward() 45 | 46 | class GridGame(Environment): 47 | availableActions=[0,1,2,3]#Corresponding to forward north, west, south, east respectively. 48 | sizeofGlidWorld=3 49 | numberofGoals=2 50 | numberofAgents=2 51 | 52 | def __init__(self,gameType="GG1"): 53 | Environment.__init__(self) 54 | assert (gameType == "GG1") or (gameType == "GG2") or (gameType == "GG3"), "gameType should be either GG1, GG2, or GG3." 
55 | self.gameType=gameType 56 | print "Game type: " + str(self.gameType) 57 | 58 | def getSensors(self): 59 | #State: locations of all agents 60 | state=[np.r_[self.agents[0],self.agents[1]], 61 | np.r_[self.agents[0],self.agents[1]]] 62 | # print "state:" + str(state) 63 | return state 64 | 65 | def performAction(self, action): 66 | tempPos=[] 67 | self.prevAgents=copy.deepcopy(self.agents) 68 | # print "act:" + str(action) 69 | self.isColide=False 70 | for i in range(GridGame.numberofAgents): 71 | tempPos.append(self.__move__(copy.deepcopy(self.agents[i]), action[i])) 72 | if not self.__isColideWithEachOther(tempPos): 73 | self.agents=tempPos 74 | 75 | 76 | def __move__(self,position, forward): 77 | if forward == 0:#Move North 78 | if self.gameType=="GG2": 79 | if (position[0] != 1 and position[1]==0) and (np.random.rand() < 0.5): 80 | return position 81 | position[1]+=1 82 | elif forward == 1:#Move west 83 | position[0]-=1 84 | elif forward == 2:#Move south 85 | if self.gameType=="GG2": 86 | if (position[0] != 1 and position[1]==1) and (np.random.rand() < 0.5): 87 | return position 88 | position[1]-=1 89 | elif forward == 3:#Move east 90 | position[0]+=1 91 | else: 92 | assert False, "Unexpected action" 93 | 94 | if position[0] >= GridGame.sizeofGlidWorld: 95 | position[0]=GridGame.sizeofGlidWorld-1 96 | if position[0] < 0: 97 | position[0]=0 98 | if position[1] >= GridGame.sizeofGlidWorld: 99 | position[1]=GridGame.sizeofGlidWorld-1 100 | if position[1] < 0: 101 | position[1]=0 102 | return position 103 | 104 | def __isColideWithEachOther(self,tempPos): 105 | if (tempPos[0][0] == tempPos[1][0]) and (tempPos[0][1] == tempPos[1][1]): 106 | if (tempPos[0][0] != self.goals[0][0]) or (tempPos[0][1] != self.goals[0][1]): 107 | self.isColide=True 108 | return True 109 | else: 110 | return False 111 | else: 112 | return False 113 | 114 | def __isReachGoal(self): 115 | #return boolean list, that determine if each agent reach each goal. 116 | irGoal=[False,False] 117 | if (self.agents[0][0] == self.goals[0][0]) and (self.agents[0][1] == self.goals[0][1]):#For the first agent. 118 | irGoal[0]=True 119 | self.isReachGoal=True 120 | if (self.agents[1][0] == self.goals[1][0]) and (self.agents[1][1] == self.goals[1][1]):#For the second agent. 
121 | irGoal[1]=True 122 | self.isReachGoal=True 123 | return irGoal 124 | 125 | def reset(self): 126 | self.agents=[np.array([0,0]), 127 | np.array([2,0])] 128 | self.prevAgents=[np.array([0,0]), 129 | np.array([2,0])] 130 | 131 | self.isReachGoal=False 132 | if self.gameType == "GG1": 133 | self.goals=[np.array([2,2]), 134 | np.array([0,2])] 135 | else: 136 | self.goals=[np.array([1,2]), 137 | np.array([1,2])] 138 | 139 | def getJointReward(self): 140 | jointRew=[0,0] 141 | irGoal=self.__isReachGoal() 142 | if not (self.gameType == "GG3"): 143 | if irGoal[0]: 144 | jointRew[0]=100 145 | if irGoal[1]: 146 | jointRew[1]=100 147 | else: 148 | if irGoal[0] and irGoal[1]: 149 | if self.prevAgents[0][0] == 1: 150 | jointRew[0]= 125 151 | jointRew[1]=100 152 | elif self.prevAgents[1][0] == 1: 153 | jointRew[0]= 100 154 | jointRew[1]=125 155 | else: 156 | jointRew[0]= 120 157 | jointRew[1]=120 158 | elif irGoal[0]: 159 | jointRew[0]=100 160 | elif irGoal[1]: 161 | jointRew[1]=100 162 | if self.isColide: 163 | jointRew[0]-=1 164 | jointRew[1]-=1 165 | 166 | return np.array(jointRew) 167 | 168 | 169 | 170 | 171 | -------------------------------------------------------------------------------- /pybrainSG/rl/leaners/valuebased/sgsp.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/03/09 3 | 4 | @author: takuya-hv2 5 | ''' 6 | from pybrainSG.rl.leaners.valuebased.indexablevaluebased import IndexableValueBasedLearner 7 | from scipy import zeros 8 | from pybrain.utilities import r_argmax 9 | import numpy as np 10 | from pybrain.utilities import abstractMethod 11 | from pybrain.datasets import SupervisedDataSet 12 | from pybrain.supervised.trainers import BackpropTrainer 13 | from pybrain.utilities import one_to_n 14 | from pybrain.structure.modules import SigmoidLayer, LinearLayer 15 | from pybrain.tools.shortcuts import buildNetwork 16 | from scipy import r_, asarray 17 | import copy 18 | 19 | class ON_SGSP_FA(IndexableValueBasedLearner): 20 | """ 21 | Stochastic game sub-problem (with function approximation for Q-value and policy): 22 | http://www.ifaamas.org/Proceedings/aamas2015/aamas/p1371.pdf 23 | """ 24 | randomInit = True 25 | 26 | rewardDiscount = 0.99 # aka gamma 27 | 28 | batchMode = False 29 | passNextAction = False # for the _updateWeights method 30 | 31 | def __init__(self, num_features, num_actions, indexOfAgent=None): 32 | IndexableValueBasedLearner.__init__(self, indexOfAgent) 33 | self.explorer = None 34 | self.num_actions = num_actions 35 | self.num_features = num_features 36 | self.indexOfAgent=indexOfAgent 37 | self._behaviorPolicy = self._softmaxPolicy 38 | self.reset() 39 | self.ownerAgentProperties["requireOtherAgentsState"]=False 40 | self.ownerAgentProperties["requireJointAction"]=True 41 | self.ownerAgentProperties["requireJointReward"]=True 42 | 43 | def _pi(self, state): 44 | """ Return vector of probability of policy for all actions, 45 | given the state(-features). 
""" 46 | abstractMethod() 47 | 48 | def _softmaxPolicy(self, state): 49 | tmp = zeros(self.num_actions[self.indexOfAgent]) 50 | pi=self._pi(state) 51 | rand=np.random.rand() 52 | cum=0.0 53 | for i in range(self.num_actions[self.indexOfAgent]): 54 | cum+=pi[i] 55 | if rand < cum: 56 | tmp[i] = 1 57 | return tmp 58 | 59 | def reset(self): 60 | IndexableValueBasedLearner.reset(self) 61 | self._callcount = 0 62 | self.newEpisode() 63 | 64 | def newEpisode(self): 65 | IndexableValueBasedLearner.newEpisode(self) 66 | 67 | def _updateWeights(self, state, action, reward, next_state): 68 | ''' 69 | Expected to update approximator. 70 | ''' 71 | abstractMethod() 72 | 73 | 74 | class ON_SGSP_NN(ON_SGSP_FA): 75 | '''ON_SGSP with neural function approximation. ''' 76 | weightdecay=0.01 77 | zeta=0.00001 78 | # 79 | cn=0.05 80 | bn=0.05 81 | # 82 | decayCn=0.9999 83 | decayBn=0.9995 84 | 85 | 86 | def __init__(self, num_features, num_actions, num_agents, index): 87 | ON_SGSP_FA.__init__(self, num_features, num_actions, index) 88 | self.num_agents= num_agents 89 | self.linQ = [] 90 | for iAgent in range(self.num_agents): 91 | self.linQ.append(buildNetwork(num_features + num_actions[iAgent], 92 | num_features*2, 93 | 1, 94 | hiddenclass = SigmoidLayer, 95 | outclass = LinearLayer)) 96 | self.linGradient = buildNetwork(num_features + num_actions[self.indexOfAgent], 97 | (num_features + num_actions[self.indexOfAgent])*2, 98 | 1, 99 | hiddenclass = SigmoidLayer, 100 | outclass = LinearLayer) 101 | self.linPolicy = buildNetwork(num_features, 102 | (num_features + num_actions[self.indexOfAgent])*2, 103 | num_actions[self.indexOfAgent], 104 | hiddenclass = SigmoidLayer, 105 | outclass = SigmoidLayer) 106 | assert self.decayBn < self.decayCn, "Cn shold be bigger than Bn." 107 | 108 | def _pi(self, state): 109 | """Given state, compute probabilities for each action.""" 110 | values = np.array(self.linPolicy.activate(r_[state])) 111 | z=np.sum(values) 112 | return (values/z).flatten() 113 | 114 | def _qValues(self, state, iAgent): 115 | """ Return vector of q-values for all actions, 116 | given the state(-features). """ 117 | values = np.array([self.linQ[iAgent].activate(r_[state, one_to_n(i, self.num_actions[iAgent])]) for i in range(self.num_actions[iAgent])]) 118 | return values.flatten() 119 | 120 | def _sgn(self, val): 121 | if val > self.zeta: 122 | return 1.0 123 | elif val < (-1.0*self.zeta): 124 | return -1.0 125 | else: 126 | return val 127 | 128 | def _gamma(self, val): 129 | if val > 1.0: 130 | return 1.0 131 | elif val < 0: 132 | return 0.0 133 | else: 134 | return val 135 | 136 | 137 | def _updateWeights(self, state, action, reward, next_state): 138 | """ state and next_state are vectors, action is an integer. 
""" 139 | #update Q-value function approximator (estimate Q-value instead of V) 140 | BellmanErrors=np.zeros(self.num_agents) 141 | for iAgent in range(self.num_agents): 142 | vValC=self._qValues(state,iAgent) 143 | vValN=self._qValues(next_state,iAgent) 144 | vArgMaxValC=r_argmax(vValC) 145 | vArgMaxValN=r_argmax(vValN) 146 | BellmanError=(reward[iAgent] + self.rewardDiscount * vValN[vArgMaxValN]) - vValC[vArgMaxValC] 147 | target=vValC[action[iAgent]]+self.cn*((reward[iAgent] + self.rewardDiscount * vValN[vArgMaxValN]) - vValC[action[iAgent]]) 148 | BellmanErrors[iAgent]=BellmanError 149 | inp=r_[state, one_to_n(action[iAgent], self.num_actions[iAgent])] 150 | ds = SupervisedDataSet(self.num_features+self.num_actions[iAgent],1) 151 | ds.addSample(inp, target) 152 | BackpropTrainer(self.linQ[iAgent], learningrate=1.0, weightdecay=self.weightdecay).trainOnDataset(ds) 153 | 154 | #Estimate gradient 155 | grad=self.linGradient.activate(np.r_[asarray(state), one_to_n(action[self.indexOfAgent], self.num_actions[self.indexOfAgent])])[0] 156 | target=grad+self.cn*(np.sum(BellmanErrors, axis=0)-grad) 157 | inp=np.r_[asarray(state), one_to_n(action[self.indexOfAgent], self.num_actions[self.indexOfAgent])] 158 | ds = SupervisedDataSet(self.num_features+self.num_actions[self.indexOfAgent],1) 159 | ds.addSample(inp, target) 160 | BackpropTrainer(self.linGradient, learningrate=1.0,weightdecay=self.weightdecay).trainOnDataset(ds) 161 | # print str(self.indexOfAgent) + "-th agents optimization info.:" 162 | # print "All Bellman errors: "+str(np.sum(BellmanErrors, axis=0)) 163 | # print "Self Bellman error: " + str(np.absolute(BellmanErrors[self.indexOfAgent])) 164 | # print "Self Q-value: " + str(self._qValues(state,self.indexOfAgent)) 165 | #Update policy 166 | c_pi=self._pi(state) 167 | # print "Policy: " + str(c_pi) 168 | firstTerm=c_pi[action[self.indexOfAgent]] 169 | secondTerm=(np.sqrt(firstTerm) 170 | * np.absolute(BellmanErrors[self.indexOfAgent]) 171 | * self._sgn(-1.0*self.linGradient.activate(np.r_[asarray(state), one_to_n(action[self.indexOfAgent], self.num_actions[self.indexOfAgent])])[0]) 172 | ) 173 | target=c_pi 174 | target[action[self.indexOfAgent]]=self._gamma(firstTerm - self.bn * secondTerm) 175 | inp=r_[asarray(state)] 176 | ds = SupervisedDataSet(self.num_features, self.num_actions[self.indexOfAgent]) 177 | ds.addSample(inp, target) 178 | BackpropTrainer(self.linPolicy, learningrate=1.0,weightdecay=self.weightdecay).trainOnDataset(ds) 179 | 180 | #update bn, cn 181 | self.bn = self.bn * self.decayBn 182 | self.cn = self.cn * self.decayCn 183 | 184 | 185 | 186 | 187 | -------------------------------------------------------------------------------- /pybrainSG/rl/leaners/valuebased/phc.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/02/19 3 | 4 | @author: takuya-hv2 5 | ''' 6 | from pybrainSG.rl.leaners.valuebased.indexablevaluebased import IndexableValueBasedLearner 7 | from scipy import zeros 8 | from pybrain.utilities import r_argmax 9 | import numpy as np 10 | from pybrain.utilities import abstractMethod 11 | from pybrain.datasets import SupervisedDataSet 12 | from pybrain.supervised.trainers import BackpropTrainer 13 | from pybrain.utilities import one_to_n 14 | from pybrain.structure.modules import SigmoidLayer, LinearLayer 15 | from pybrain.tools.shortcuts import buildNetwork 16 | from scipy import r_, asarray 17 | import copy 18 | 19 | class PHC_FA(IndexableValueBasedLearner): 20 | """ 21 | Policy hill climbing 
algorithm (with function approximation for Q-value and policy): 22 | http://www.cs.cmu.edu/~mmv/papers/01ijcai-mike.pdf 23 | """ 24 | 25 | learningRate = 0.5 # aka alpha: make sure this is being decreased by calls from the learning agent! 26 | learningRateDecay = 100 # aka n_0, but counting decay-calls 27 | 28 | randomInit = True 29 | 30 | rewardDiscount = 0.99 # aka gamma 31 | 32 | batchMode = False 33 | passNextAction = False # for the _updateWeights method 34 | 35 | def __init__(self, num_features, num_actions, indexOfAgent=None): 36 | IndexableValueBasedLearner.__init__(self, indexOfAgent) 37 | self.explorer = None 38 | self.num_actions = num_actions 39 | self.num_features = num_features 40 | self.indexOfAgent=indexOfAgent 41 | self._behaviorPolicy = self._softmaxPolicy 42 | self.reset() 43 | self.ownerAgentProperties["requireOtherAgentsState"]=False 44 | self.ownerAgentProperties["requireJointAction"]=False 45 | self.ownerAgentProperties["requireJointReward"]=False 46 | 47 | def _pi(self, state): 48 | """ Return vector of probability of policy for all actions, 49 | given the state(-features). """ 50 | abstractMethod() 51 | 52 | def _softmaxPolicy(self, state): 53 | tmp = zeros(self.num_actions) 54 | pi=self._pi(state) 55 | rand=np.random.rand() 56 | cum=0.0 57 | for i in range(self.num_actions): 58 | cum+=pi[i] 59 | if rand < cum: 60 | tmp[i] = 1 61 | return tmp 62 | 63 | def reset(self): 64 | IndexableValueBasedLearner.reset(self) 65 | self._callcount = 0 66 | self.newEpisode() 67 | 68 | def newEpisode(self): 69 | IndexableValueBasedLearner.newEpisode(self) 70 | 71 | def _updateWeights(self, state, action, reward, next_state): 72 | ''' 73 | Expected to update approximator. 74 | ''' 75 | abstractMethod() 76 | 77 | 78 | class PHC_NN(PHC_FA): 79 | '''PHC with neural function approximation. ''' 80 | delta=0.1 81 | maxNumberofAverage=30 82 | weightdecay=0.001 83 | trainingEpochPerUpdateWight=2 84 | 85 | def __init__(self, num_features, num_actions, indexOfAgent=None): 86 | PHC_FA.__init__(self, num_features, num_actions, indexOfAgent) 87 | self.linQ = buildNetwork(num_features + num_actions, (num_features + num_actions), 1, hiddenclass = SigmoidLayer, outclass = LinearLayer) 88 | self.linPolicy = buildNetwork(num_features, (num_features + num_actions), num_actions, hiddenclass = SigmoidLayer,outclass = SigmoidLayer) 89 | self.trainer4LinQ=BackpropTrainer(self.linQ,weightdecay=self.weightdecay) 90 | self.trainer4LinPolicy=BackpropTrainer(self.linPolicy,weightdecay=self.weightdecay) 91 | 92 | def _pi(self, state): 93 | """Given state, compute probabilities for each action.""" 94 | values = np.array(self.linPolicy.activate(r_[state])) 95 | z=np.sum(values) 96 | return (values/z).flatten() 97 | 98 | def _qValues(self, state): 99 | """ Return vector of q-values for all actions, 100 | given the state(-features). """ 101 | values = np.array([self.linQ.activate(r_[state, one_to_n(i, self.num_actions)]) for i in range(self.num_actions)]) 102 | return values.flatten() 103 | 104 | 105 | def _updateWeights(self, state, action, reward, next_state): 106 | """ state and next_state are vectors, action is an integer. 
""" 107 | #update Q-value function approximator 108 | target=reward + self.rewardDiscount * max(self._qValues(next_state)) 109 | inp=r_[asarray(state), one_to_n(action, self.num_actions)] 110 | self.trainer4LinQ=BackpropTrainer(self.linQ,weightdecay=self.weightdecay) 111 | ds = SupervisedDataSet(self.num_features+self.num_actions,1) 112 | ds.addSample(inp, target) 113 | self.trainer4LinQ.trainOnDataset(ds) 114 | #Update policy 115 | bestAction=r_argmax(self._qValues(state)) 116 | target= one_to_n(bestAction, self.num_actions) 117 | inp=r_[asarray(state)] 118 | ds = SupervisedDataSet(self.num_features,self.num_actions) 119 | ds.addSample(inp, target) 120 | self.trainer4LinPolicy=BackpropTrainer(self.linPolicy, 121 | learningrate=self.delta, 122 | weightdecay=self.weightdecay) 123 | self.trainer4LinPolicy.setData(ds) 124 | self.trainer4LinPolicy.trainEpochs(epochs=self.trainingEpochPerUpdateWight) 125 | 126 | 127 | 128 | 129 | 130 | class PHC_WoLF_NN(PHC_FA): 131 | '''PHC_WoLF with neural function ''' 132 | deltaW=0.05 133 | deltaL=0.2 134 | maxNumberofAverage=30 135 | weightdecay=0.001 136 | trainingEpochPerUpdateWight=1 137 | 138 | def __init__(self, num_features, num_actions, indexOfAgent=None): 139 | PHC_FA.__init__(self, num_features, num_actions, indexOfAgent) 140 | self.linQ = buildNetwork(num_features + num_actions, (num_features + num_actions), 1, hiddenclass = SigmoidLayer, outclass = LinearLayer) 141 | self.linPolicy = buildNetwork(num_features, (num_features + num_actions), num_actions, hiddenclass = SigmoidLayer,outclass = SigmoidLayer) 142 | self.averagePolicy=[] 143 | self.trainer4LinQ=BackpropTrainer(self.linQ,weightdecay=self.weightdecay) 144 | self.trainer4LinPolicy=BackpropTrainer(self.linPolicy,weightdecay=self.weightdecay) 145 | 146 | def _pi(self, state): 147 | """Given state, compute softmax probability for each action.""" 148 | values = np.array(self.linPolicy.activate(r_[state])) 149 | z=np.sum(values) 150 | return (values/z).flatten() 151 | 152 | def _qValues(self, state): 153 | """ Return vector of q-values for all actions, 154 | given the state(-features). """ 155 | values = np.array([self.linQ.activate(r_[state, one_to_n(i, self.num_actions)]) for i in range(self.num_actions)]) 156 | return values.flatten() 157 | 158 | def _piAvr(self, state): 159 | pi=np.zeros(self.num_actions) 160 | for elem in self.averagePolicy: 161 | values = np.array(elem.activate(r_[state])) 162 | pi=np.add(pi.flatten(),values.flatten()) 163 | z=np.sum(pi) 164 | pi=pi/z 165 | return pi.flatten() 166 | 167 | def _updateWeights(self, state, action, reward, next_state): 168 | """ state and next_state are vectors, action is an integer. 
""" 169 | #update Q-value function approximator 170 | target=reward + self.rewardDiscount * max(self._qValues(next_state)) 171 | inp=r_[asarray(state), one_to_n(action, self.num_actions)] 172 | self.trainer4LinQ=BackpropTrainer(self.linQ,weightdecay=self.weightdecay) 173 | ds = SupervisedDataSet(self.num_features+self.num_actions,1) 174 | ds.addSample(inp, target) 175 | self.trainer4LinQ.trainOnDataset(ds) 176 | 177 | #update estimate of average policy 178 | self.averagePolicy.append(copy.deepcopy(self.linPolicy)) 179 | if len(self.averagePolicy) > self.maxNumberofAverage: 180 | self.averagePolicy.pop(np.random.randint(len(self.averagePolicy))) 181 | 182 | #update policy function approximator 183 | delta=None 184 | cumRewardOfCurrentPolicy=0.0 185 | values=self._qValues(state) 186 | pi=self._pi(state) 187 | for elem_action in range(self.num_actions): 188 | cumRewardOfCurrentPolicy=pi[elem_action]*values[elem_action] 189 | cumRewardOfAveragePolicy=0.0 190 | api=self._piAvr(state) 191 | for elem_action in range(self.num_actions): 192 | cumRewardOfAveragePolicy=api[elem_action]*values[elem_action] 193 | if cumRewardOfCurrentPolicy > cumRewardOfAveragePolicy: 194 | delta=self.deltaW 195 | else: 196 | delta=self.deltaL 197 | 198 | #Update policy 199 | bestAction=r_argmax(self._qValues(state)) 200 | target=one_to_n(bestAction, self.num_actions) 201 | inp=r_[asarray(state)] 202 | ds = SupervisedDataSet(self.num_features,self.num_actions) 203 | ds.addSample(inp, target) 204 | self.trainer4LinPolicy=BackpropTrainer(self.linPolicy, 205 | learningrate=(delta), 206 | weightdecay=self.weightdecay) 207 | self.trainer4LinPolicy.setData(ds) 208 | self.trainer4LinPolicy.trainEpochs(epochs=self.trainingEpochPerUpdateWight) 209 | 210 | -------------------------------------------------------------------------------- /pybrainSG/rl/leaners/valuebased/ceq.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2016/02/26 3 | 4 | @author: takuya-hv2 5 | ''' 6 | from pybrainSG.rl.leaners.valuebased.indexablevaluebased import IndexableValueBasedLearner 7 | from pybrain.utilities import r_argmax 8 | import numpy as np 9 | from pybrain.utilities import abstractMethod 10 | from pybrain.datasets import SupervisedDataSet 11 | from pybrain.supervised.trainers.rprop import RPropMinusTrainer 12 | from pybrain.supervised.trainers import BackpropTrainer 13 | from pybrain.utilities import one_to_n 14 | from pybrain.structure.modules import SigmoidLayer, LinearLayer 15 | from pybrain.tools.shortcuts import buildNetwork 16 | import copy 17 | from scipy import zeros, exp, clip 18 | from scipy.optimize._linprog import linprog 19 | import warnings 20 | from multiprocessing import Process, Queue 21 | 22 | 23 | class CEQ_FA(IndexableValueBasedLearner): 24 | """ 25 | Correlated Q (with function approximation): 26 | http://www.aaai.org/Papers/Symposia/Spring/2002/SS-02-02/SS02-02-012.pdf 27 | """ 28 | 29 | learningRate = 0.2 # aka alpha: make sure this is being decreased by calls from the learning agent! 
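    # newEpisode() decays learningRate by the factor (learningRateDecay + k) / (learningRateDecay + k + 1)
    # per call, where k is the call count, so after k episodes it has shrunk roughly like n_0 / (n_0 + k).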
30 | learningRateDecay = 100 # aka n_0, but counting decay-calls 31 | 32 | randomInit = True 33 | 34 | rewardDiscount = 0.99 # aka gamma 35 | 36 | batchMode = False 37 | passNextAction = False # for the _updateWeights method 38 | 39 | def __init__(self, num_features, num_actions, num_agents, indexOfAgent): 40 | IndexableValueBasedLearner.__init__(self, indexOfAgent) 41 | self.explorer = None 42 | self.num_actions = num_actions 43 | self.num_features = num_features 44 | self.num_agents=num_agents 45 | self.reset() 46 | self.ownerAgentProperties["requireOtherAgentsState"]=False 47 | self.ownerAgentProperties["requireJointAction"]=True 48 | self.ownerAgentProperties["requireJointReward"]=True 49 | assert self.num_agents == np.size(self.num_actions, axis=0), "Length of the per-agent action-count array must equal the number of agents." 50 | 51 | def _qValues(self, state): 52 | """ Return vector of Q-values for this agent's actions, 53 | given the state(-features). """ 54 | abstractMethod() 55 | 56 | def _greedyAction(self, state): 57 | return r_argmax(self._qValues(state)) 58 | 59 | def _greedyPolicy(self, state): 60 | tmp = zeros(self.num_actions[self.indexOfAgent]) 61 | tmp[self._greedyAction(state)] = 1 62 | return tmp 63 | 64 | def _boltzmannPolicy(self, state, temperature=1.): 65 | tmp = self._qValues(state) 66 | return CEQ_FA._boltzmannProbs(tmp, temperature) 67 | 68 | @staticmethod 69 | def _boltzmannProbs(qvalues, temperature=1.): 70 | if temperature == 0: 71 | tmp = zeros(len(qvalues)) 72 | tmp[r_argmax(qvalues)] = 1. 73 | else: 74 | tmp = qvalues / temperature 75 | tmp -= max(tmp) 76 | tmp = exp(clip(tmp, -20, 0)) 77 | return tmp / sum(tmp) 78 | 79 | def reset(self): 80 | IndexableValueBasedLearner.reset(self) 81 | self._callcount = 0 82 | self.newEpisode() 83 | 84 | def newEpisode(self): 85 | IndexableValueBasedLearner.newEpisode(self) 86 | self._callcount += 1 87 | self.learningRate *= ((self.learningRateDecay + self._callcount) 88 | / (self.learningRateDecay + self._callcount + 1.)) 89 | 90 | def _updateWeights(self, state, action, reward, next_state): 91 | ''' 92 | Expected to update the Q-value function approximator. 93 | ''' 94 | abstractMethod() 95 | 96 | 97 | class CEQ_Lin(CEQ_FA): 98 | ''' 99 | CEQ with linear function approximation.
100 | ''' 101 | def __init__(self, num_features, num_actions, num_agents, indexOfAgent=None): 102 | CEQ_FA.__init__(self, num_features, num_actions, num_agents, indexOfAgent) 103 | self.possibleJointAction, self.w4ActIndexing = self._initJointActAndItsIndex(num_agents, num_actions) 104 | self.numJointAct=np.size(self.possibleJointAction, axis=0) 105 | self.linQ=[] 106 | self.actionDiminInput=0 107 | for i in range(self.num_agents): 108 | self.actionDiminInput+=self.num_actions[i] 109 | for i in range(self.num_agents): 110 | self.linQ.append(buildNetwork(num_features + self.actionDiminInput, 1, outclass = LinearLayer)) 111 | self.actionVecDic={} 112 | 113 | def _initJointActAndItsIndex(self, num_agents, num_actions): 114 | numJointAct=1 115 | w4ActIndexing=np.zeros(num_agents) 116 | for index in range(len(num_actions)): 117 | numJointAct*=num_actions[index] 118 | temp=numJointAct 119 | for index in range(np.size(num_actions,axis=0)): 120 | temp/=num_actions[index] 121 | w4ActIndexing[index]=(temp) 122 | possibleJointAction=[[]] 123 | for i in range(num_agents): 124 | temp=[] 125 | for j in range(num_actions[i]): 126 | for k in range(len(possibleJointAction)): 127 | temp2=copy.deepcopy(possibleJointAction[k]) 128 | temp2.append(j) 129 | temp.append(temp2) 130 | possibleJointAction=temp 131 | possibleJointAction.sort() 132 | possibleJointAction=np.array(possibleJointAction) 133 | return possibleJointAction, w4ActIndexing 134 | 135 | def _qValues(self, state): 136 | """ Return vector of q-values for all actions, 137 | given the state(-features). """ 138 | qValues=self._qValuesForAllPossibleJointAction(state) 139 | eq=findCorrelatedEquilibrium(self.num_agents, self.num_actions, qValues, self.possibleJointAction, self.w4ActIndexing) 140 | return np.array(self._qValuesForEachActionOfAgent(state, eq, self.indexOfAgent)).reshape(self.num_actions[self.indexOfAgent]) 141 | 142 | def _updateWeights(self, state, action, reward, next_state): 143 | """ state and next_state are vectors, action is an integer. 
""" 144 | #update Q-value function approximator 145 | qValuesNext=self._qValuesForAllPossibleJointAction(next_state) 146 | eqNext=findCorrelatedEquilibrium(self.num_agents, self.num_actions, qValuesNext, self.possibleJointAction,self.w4ActIndexing) 147 | #Learn 148 | inp=self._EncodeStateAndJointActionIntoInputVector(state, action) 149 | for i in range(self.num_agents): 150 | target=reward[i] + self.rewardDiscount * max(self._qValuesForEachActionOfAgent(next_state, eqNext, i)) 151 | self.trainer4LinQ=BackpropTrainer(self.linQ[i],learningrate=self.learningRate,weightdecay=0.0) 152 | ds = SupervisedDataSet(self.num_features+self.actionDiminInput,1) 153 | ds.addSample(inp, target) 154 | self.trainer4LinQ.trainOnDataset(ds) 155 | 156 | def _qValuesForAllPossibleJointAction(self, state): 157 | qValues=[] 158 | for iAgent in range(self.num_agents): 159 | qValuesIthAgent=[] 160 | for jointAct in self.possibleJointAction: 161 | val=np.array(self.linQ[iAgent].activate(self._EncodeStateAndJointActionIntoInputVector(state, jointAct))) 162 | qValuesIthAgent.append(val) 163 | qValues.append(qValuesIthAgent) 164 | return qValues#QValues for all possible joint actions for each agents [numAgents][index of joint act in list] 165 | 166 | def _qValuesForEachActionOfAgent(self, state, CEq, iAgent): 167 | qValuesForeachAct=[] 168 | for iAct in range(self.num_actions[iAgent]): 169 | expQ=0.0 170 | sumP=0.0 171 | numPJA=0.0 172 | for jointAct in self.possibleJointAction: 173 | if iAct == int(jointAct[iAgent]): 174 | sumP+=CEq[int(np.dot(self.w4ActIndexing, jointAct))] 175 | numPJA+=1.0 176 | for jointAct in self.possibleJointAction: 177 | if iAct == int(jointAct[iAgent]): 178 | if sumP > 0.00001: 179 | prob=CEq[int(np.dot(self.w4ActIndexing, jointAct))] 180 | if prob > 0.0: 181 | Q=self.linQ[iAgent].activate(self._EncodeStateAndJointActionIntoInputVector(state, jointAct)) 182 | expQ+=(prob/sumP)*Q[0] 183 | else: 184 | Q=self.linQ[iAgent].activate(self._EncodeStateAndJointActionIntoInputVector(state, jointAct)) 185 | expQ+=(1.0/numPJA)*Q[0] 186 | qValuesForeachAct.append(expQ) 187 | return qValuesForeachAct 188 | 189 | 190 | def _EncodeStateAndJointActionIntoInputVector(self, state, jointAct): 191 | index=int(np.dot(self.w4ActIndexing, jointAct)) 192 | if index in self.actionVecDic: 193 | return np.r_[state, self.actionVecDic[index]] 194 | else: 195 | iVector=np.array([]) 196 | for iAgent in range(len(jointAct)): 197 | iVector=np.r_[iVector, one_to_n(jointAct[iAgent], self.num_actions[iAgent])] 198 | self.actionVecDic[index]=iVector 199 | return np.r_[state, self.actionVecDic[index]] 200 | 201 | 202 | 203 | class NFCEQ(CEQ_Lin): 204 | '''Neural fitted Q iteration version. ''' 205 | def __init__(self, num_features, num_actions, num_agents, max_epochs=20, indexOfAgent=None, validateMultiProc=True): 206 | CEQ_Lin.__init__(self, num_features, num_actions, num_agents, indexOfAgent) 207 | self.max_epochs=max_epochs 208 | self.linQ=[]#update 209 | for _ in range(self.num_agents): 210 | self.linQ.append(buildNetwork(num_features + self.actionDiminInput, (num_features + self.actionDiminInput), 1, hiddenclass=SigmoidLayer, outclass = LinearLayer)) 211 | self.isFirstLerning=True 212 | self.validateMultiProc=validateMultiProc 213 | 214 | def _updateWeights(self, state, action, reward, next_state): 215 | """ state and next_state are vectors, action is an integer. 
""" 216 | pass 217 | def learn(self): 218 | # convert reinforcement dataset to NFQ supervised dataset 219 | supervised = [] 220 | dats=[]#[seq index][turn]=[state,jointAct,jointReward] 221 | for i in range(self.num_agents): 222 | supervised.append(SupervisedDataSet(self.num_features+self.actionDiminInput, 1)) 223 | for i in range(self.dataset[self.indexOfAgent].getNumSequences()): 224 | seq=[] 225 | for j in range(len(self.dataset[self.indexOfAgent].getSequence(i)[0])): 226 | state=self.dataset[self.indexOfAgent].getSequence(i)[0][j] 227 | jointAct=[] 228 | jointReward=[] 229 | for k in range(self.num_agents): 230 | jointAct.append(self.dataset[k].getSequence(i)[1][j][0]) 231 | jointReward.append(self.dataset[k].getSequence(i)[2][j][0]) 232 | seq.append([state, jointAct, jointReward]) 233 | dats.append(seq) 234 | #prepare data set 235 | for i in range(self.num_agents): 236 | for seq in dats: 237 | lastexperience = None 238 | for sarPair in seq: 239 | state = sarPair[0] 240 | action = sarPair[1] 241 | reward = sarPair[2] 242 | if not lastexperience: 243 | # delay each experience in sequence by one 244 | lastexperience = (state, action, reward) 245 | continue 246 | # use experience from last timestep to do Q update 247 | (state_, action_, reward_) = lastexperience 248 | 249 | #update Q-value function approximator 250 | qValuesNext=self._qValuesForAllPossibleJointAction(state) 251 | eqNext=findCorrelatedEquilibrium(self.num_agents, self.num_actions, qValuesNext, self.possibleJointAction,self.w4ActIndexing) 252 | #Learn 253 | inp=self._EncodeStateAndJointActionIntoInputVector(state_, action_) 254 | if self.isFirstLerning: 255 | target=reward_[i] 256 | else: 257 | target=reward_[i] + self.rewardDiscount * max(self._qValuesForEachActionOfAgent(state, eqNext, i)) 258 | target=np.array([target]) 259 | supervised[i].addSample(inp, target) 260 | # update last experience with current one 261 | lastexperience = (state, action, reward) 262 | if self.isFirstLerning: 263 | self.isFirstLerning=False 264 | 265 | procTrainers=[] 266 | qResult=Queue() 267 | for i in range(self.num_agents): 268 | trainer=RPropMinusTrainer(self.linQ[i],dataset=supervised[i], 269 | batchlearning=True, 270 | verbose=False, 271 | ) 272 | if not self.validateMultiProc: 273 | trainer.trainUntilConvergence(maxEpochs=self.max_epochs,verbose=False) 274 | else: 275 | procTrainers.append(Process(target=self._learningQfunction, kwargs={"trainer":trainer,"i":i,"q":qResult})) 276 | if self.validateMultiProc: 277 | for proc in procTrainers: 278 | proc.start() 279 | for i in range(self.num_agents): 280 | res=qResult.get() 281 | self.linQ[res[0]]=res[1] 282 | 283 | def _learningQfunction(self, trainer,i,q): 284 | #Re-builde networks is required in multiprocessing environments. 285 | params=trainer.module.params 286 | trainer.module=buildNetwork(self.num_features + self.actionDiminInput, (self.num_features + self.actionDiminInput), 1, hiddenclass=SigmoidLayer, outclass = LinearLayer) 287 | trainer.module._setParameters(params) 288 | trainer.trainUntilConvergence(maxEpochs=self.max_epochs,verbose=False) 289 | q.put([i,trainer.module]) 290 | 291 | 292 | 293 | 294 | 295 | 296 | def findCorrelatedEquilibrium(numAgent, numAction, Qvalues, possibleJointAction, w4ActIndexing): 297 | ''' 298 | Given a list of all possible joint action, and its QValue table, 299 | this function find correlated equilibrium based on the linear programming. 
300 | #In the current implementation, the objective function used to select one equilibrium is the "republican" criterion (maximize the maximum of the agents' values). 301 | ''' 302 | numJointAct=np.size(possibleJointAction,axis=0) 303 | STs=[]#constraints for LP 304 | for iAgent in range(numAgent): 305 | # print "==================== Agent " + str(iAgent) + "===============" 306 | vecQ=Qvalues[iAgent] 307 | eCumdeltaOutCome=np.zeros(numJointAct) 308 | for ithAgentsOptAct in range(numAction[iAgent]): 309 | #Expected Q-value when the agent follows the recommended action "ithAgentsOptAct". 310 | eOutcomeInOpt=np.zeros(numJointAct) 311 | for jointAction in possibleJointAction: 312 | if ithAgentsOptAct == jointAction[iAgent]: 313 | index=int(np.dot(w4ActIndexing, jointAction)) 314 | eOutcomeInOpt[index]=vecQ[index] 315 | #Expected Q-value when the agent deviates from "ithAgentsOptAct" to the non-recommended action "ithAgentsNonOptAct". 316 | for ithAgentsNonOptAct in range(numAction[iAgent]): 317 | if ithAgentsNonOptAct == ithAgentsOptAct: 318 | continue 319 | eOutcomeInNonOpt=np.zeros(numJointAct) 320 | for jointAction in possibleJointAction: 321 | if (ithAgentsOptAct != jointAction[iAgent]) and (ithAgentsNonOptAct == jointAction[iAgent]): 322 | jointActionWithNonOptimal=copy.deepcopy(jointAction) 323 | jointActionWithNonOptimal[iAgent]=ithAgentsOptAct 324 | index1=int(np.dot(w4ActIndexing, jointActionWithNonOptimal)) 325 | index2=int(np.dot(w4ActIndexing, jointAction)) 326 | eOutcomeInNonOpt[index1]=vecQ[index2] 327 | eCumdeltaOutCome = eCumdeltaOutCome + (eOutcomeInOpt - eOutcomeInNonOpt) 328 | STs.append(eCumdeltaOutCome) 329 | 330 | #Non-negativity: the probability of each joint action must be >= 0 331 | for i in range(numJointAct): 332 | t=np.zeros(numJointAct) 333 | t[i]=1.0 334 | STs.append(t) 335 | STs=np.array(STs)*(-1) 336 | #Constraints (inequality) 337 | b_ub=np.zeros(np.size(STs,axis=0)) 338 | #Constraints (equality): probabilities sum to one 339 | A_eq=np.ones((1,numJointAct)) 340 | b_eq=np.ones(1) 341 | #Objective function 342 | c=np.zeros(numJointAct) 343 | for iAgent in range(numAgent): 344 | #take the maximum Q-value over agents for each joint action ("republican" objective) 345 | vecQ=np.array(Qvalues[iAgent]).reshape(numJointAct) 346 | for jointAction in possibleJointAction: 347 | index=int(np.dot(w4ActIndexing, jointAction)) 348 | if c[index] < vecQ[index]: 349 | c[index] = vecQ[index] 350 | c*=-1 351 | #Solve the linear program with SciPy 352 | res=linprog(c=c, A_ub=STs, b_ub=b_ub, A_eq=A_eq, b_eq=b_eq, bounds=None, method='simplex', callback=None, options=None) 353 | if not res.success: 354 | warnings.warn("LP failed; falling back to a uniform probability distribution.") 355 | res.x = np.ones(numJointAct)/(numJointAct) 356 | return res.x 357 | --------------------------------------------------------------------------------
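A minimal usage sketch of findCorrelatedEquilibrium (not part of the repository) may make its expected inputs clearer. It assumes a two-agent, two-action game whose per-joint-action Q-values are written out by hand; the payoff numbers are purely illustrative, while possibleJointAction and w4ActIndexing follow the same sorted ordering and index weighting that CEQ_Lin._initJointActAndItsIndex would produce for num_actions=[2, 2].

import numpy as np
from pybrainSG.rl.leaners.valuebased.ceq import findCorrelatedEquilibrium

numAgent = 2
numAction = [2, 2]
# Joint actions in sorted order; joint action (a0, a1) maps to index 2*a0 + 1*a1.
possibleJointAction = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
w4ActIndexing = np.array([2.0, 1.0])
# Illustrative Q-values: one row per agent, one entry per joint action.
Qvalues = [[6.0, 2.0, 7.0, 0.0],
           [6.0, 7.0, 2.0, 0.0]]

ce = findCorrelatedEquilibrium(numAgent, numAction, Qvalues,
                               possibleJointAction, w4ActIndexing)
print(ce)  # probability distribution over the four joint actions, summing to 1

The returned vector is indexed the same way as possibleJointAction, so the probability assigned to joint action (a0, a1) is ce[int(np.dot(w4ActIndexing, [a0, a1]))].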