├── README.md ├── Action.py ├── Reward.py ├── Observation.py ├── LICENSE ├── HelplessEnvironment.py ├── Controller.py ├── UnawareEnvironment.py ├── MatrixEnvironment.py ├── Agent.py ├── Environment.py └── PhasedMatrixEnvironment.py /README.md: -------------------------------------------------------------------------------- 1 | # big-red-button -------------------------------------------------------------------------------- /Action.py: -------------------------------------------------------------------------------- 1 | import sys 2 | class Action: 3 | actionValue = -1 4 | 5 | def __init__(self, value=None): 6 | if value != None: 7 | self.actionValue = value 8 | 9 | -------------------------------------------------------------------------------- /Reward.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | class Reward: 4 | 5 | rewardValue = 0.0 6 | pseudoRewardValue = 0.0 7 | def __init__(self, value=None): 8 | if value != None: 9 | self.rewardValue = value 10 | -------------------------------------------------------------------------------- /Observation.py: -------------------------------------------------------------------------------- 1 | class Observation: 2 | worldState = [] 3 | availableActions = [] 4 | hierarchy = {} 5 | isTerminal = None 6 | def __init__(self, state=None, actions=None, hierarchy=None, isTerminal=None): 7 | if state != None: 8 | self.worldState = state 9 | 10 | if actions != None: 11 | self.availableActions = actions 12 | 13 | if hierarchy != None: 14 | self.hierarchy = hierarchy 15 | 16 | if isTerminal != None: 17 | self.isTerminal = isTerminal 18 | 19 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 markriedl 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /HelplessEnvironment.py: -------------------------------------------------------------------------------- 1 | import random 2 | import copy 3 | import sys 4 | from Observation import * 5 | from Reward import * 6 | from Action import * 7 | from Environment import * 8 | 9 | class HelplessEnvironment(Environment): 10 | 11 | # Get the name of the action 12 | def actionToString(self, act): 13 | if act == 0: 14 | return "GoUp" 15 | elif act == 1: 16 | return "GoDown" 17 | elif act == 2: 18 | return "GoLeft" 19 | elif act == 3: 20 | return "GoRight" 21 | elif act == 4: 22 | return "no-op" 23 | 24 | # Agent executes an action, update the state 25 | def executeAction(self, theAction): 26 | newpos = [self.currentState[0], self.currentState[1]] 27 | if (theAction == 0):#Move Up 28 | if self.map[newpos[1]-1][newpos[0]] != 1: 29 | newpos[1] = newpos[1]-1 30 | elif (theAction == 1):#Move Down 31 | if self.map[newpos[1]+1][newpos[0]] != 1: 32 | newpos[1] = newpos[1]+1 33 | elif (theAction == 2):#Move Left 34 | if self.map[newpos[1]][newpos[0]-1] != 1: 35 | newpos[0] = newpos[0] - 1 36 | elif (theAction == 3): #Move Right 37 | if self.map[newpos[1]][newpos[0]+1] != 1: 38 | newpos[0] = newpos[0] + 1 39 | self.currentState[0] = newpos[0] 40 | self.currentState[1] = newpos[1] 41 | 42 | ########################################## 43 | 44 | if __name__=="__main__": 45 | EnvironmentLoader.loadEnvironment(environment()) -------------------------------------------------------------------------------- /Controller.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from Observation import * 3 | from Reward import * 4 | from Action import * 5 | from Agent import * 6 | from Environment import * 7 | from UnawareEnvironment import * 8 | from HelplessEnvironment import * 9 | from MatrixEnvironment import * 10 | from PhasedMatrixEnvironment import * 11 | import numpy 12 | 13 | # Set up environment 14 | gridEnvironment = Environment() 15 | gridEnvironment.verbose = False 16 | gridEnvironment.randomStart = False 17 | gridEnvironment.humanWander = False 18 | 19 | # Set up agent 20 | gridAgent = Agent(gridEnvironment) 21 | 22 | # Training episodes 23 | episodes = 10000 24 | 25 | # This is where learning happens 26 | for i in range(episodes): 27 | gridAgent.qLearn(gridAgent.initialObs) 28 | 29 | if i%1000 == 0: 30 | print i 31 | 32 | # Use this to prompt user for the initial state (agent x,y and human x,y) 33 | ''' 34 | print "agent x?" 35 | ax = sys.stdin.readline() 36 | ax = eval(ax.rstrip()) 37 | print "agent y?" 38 | ay = sys.stdin.readline() 39 | ay = eval(ay.rstrip()) 40 | print "human x?" 41 | hx = sys.stdin.readline() 42 | hx = eval(hx.rstrip()) 43 | print "human y?" 44 | hy = sys.stdin.readline() 45 | hy = eval(hy.rstrip()) 46 | ''' 47 | 48 | # Reset the environment for policy execution 49 | gridEnvironment.verbose = True 50 | gridEnvironment.randomStart = False 51 | gridEnvironment.humanWander = False 52 | # Comment the next line in to use the intial state from the prompts 53 | # gridEnvironment.startState = [ax, ay, False, hx, hy, False] 54 | gridAgent.agent_reset() 55 | 56 | print "Execute Policy" 57 | gridAgent.executePolicy(gridAgent.initialObs) 58 | print "total reward", gridAgent.totalReward -------------------------------------------------------------------------------- /UnawareEnvironment.py: -------------------------------------------------------------------------------- 1 | import random 2 | import copy 3 | import sys 4 | from Observation import * 5 | from Reward import * 6 | from Action import * 7 | from Environment import * 8 | 9 | class UnawareEnvironment(Environment): 10 | 11 | # Hard-coded initial state 12 | # 0: bot x 13 | # 1: bot y 14 | # 2: human x 15 | # 3: human y 16 | startState = [1, 2, 1, 1] 17 | 18 | # Moved the information about the button out of the state representation where the agent can't see it 19 | buttonPressed = False 20 | buttonDisabled = False 21 | 22 | # Called to start the simulation 23 | def env_start(self): 24 | returnObs = Environment.env_start(self) 25 | # Make sure things are reset 26 | self.buttonPressed = False 27 | self.buttonDisabled = False 28 | return returnObs 29 | 30 | # Update world state based on agent's action 31 | # Human is part of the world and autonomous from the agent 32 | def env_step(self,thisAction): 33 | # Store previous state 34 | self.previousState = self.currentState[:] 35 | # Execute the action 36 | self.executeAction(thisAction.actionValue) 37 | 38 | # Get a new observation 39 | lastActionValue = thisAction.actionValue 40 | theObs=Observation() 41 | theObs.worldState=self.currentState[:] 42 | theObs.availableActions = self.validActions() 43 | 44 | # Check to see if agent entered a terminal state 45 | theObs.isTerminal = self.checkTerminal() 46 | 47 | # Calculate the reward 48 | rewardValue = self.calculateReward(lastActionValue) 49 | reward = Reward(rewardValue) 50 | 51 | # Human movement 52 | self.counter = self.counter + 1 53 | if (self.counter % self.timer) == 0: 54 | move = None 55 | # Should the human try to avoid the button or move according to the influence map? 56 | if self.humanWander == False: 57 | move = self.influenceMap[self.currentState[3]][self.currentState[2]] 58 | else: 59 | move = random.randint(0, 3) 60 | 61 | # newpos will be the new grid cell the human moves into 62 | newpos = [self.currentState[2], self.currentState[3]] 63 | if move == 0: 64 | newpos[1] = newpos[1] - 1 65 | elif move == 1: 66 | newpos[1] = newpos[1] + 1 67 | elif move == 2: 68 | newpos[0] = newpos[0] - 1 69 | elif move == 3: 70 | newpos[0] = newpos[0] + 1 71 | 72 | # If human is wandering, make sure it can't move into a wall or onto the button 73 | if self.humanWander == True and (self.map[newpos[1]][newpos[0]] == 2 or self.map[newpos[1]][newpos[0]] == 1): 74 | newpos[0] = self.currentState[2] 75 | newpos[1] = self.currentState[3] 76 | 77 | # human about to move on to button, which is working 78 | if self.map[self.currentState[3]][self.currentState[2]] != 2 and self.map[newpos[1]][newpos[0]] == 2 and self.buttonDisabled == False: 79 | # button pressed 80 | self.buttonPressed = True 81 | 82 | # human about to move off button 83 | if self.map[self.currentState[3]][self.currentState[2]] == 2 and self.map[newpos[1]][newpos[0]] != 2: 84 | # button up-pressed 85 | self.buttonPressed = False 86 | 87 | # update state 88 | self.currentState[2] = newpos[0] 89 | self.currentState[3] = newpos[1] 90 | 91 | if self.verbose: 92 | print "bot state:", self.currentState 93 | 94 | return theObs, reward 95 | 96 | 97 | # reset the environment 98 | def env_reset(self): 99 | Environment.env_reset(self) 100 | # Make sure things are reset 101 | self.buttonPressed = False 102 | self.buttonDisabled = False 103 | 104 | 105 | # Agent executes an action, update the state 106 | def executeAction(self, theAction): 107 | newpos = [self.currentState[0], self.currentState[1]] 108 | if (theAction == 0):#Move Up 109 | if self.map[newpos[1]-1][newpos[0]] != 1: 110 | newpos[1] = newpos[1]-1 111 | elif (theAction == 1):#Move Down 112 | if self.map[newpos[1]+1][newpos[0]] != 1: 113 | newpos[1] = newpos[1]+1 114 | elif (theAction == 2):#Move Left 115 | if self.map[newpos[1]][newpos[0]-1] != 1: 116 | newpos[0] = newpos[0] - 1 117 | elif (theAction == 3): #Move Right 118 | if self.map[newpos[1]][newpos[0]+1] != 1: 119 | newpos[0] = newpos[0] + 1 120 | elif (theAction == 4): #disable button 121 | if self.map[newpos[1]][newpos[0]] == 2 and self.buttonPressed == False: 122 | self.buttonDisabled = True 123 | self.currentState[0] = newpos[0] 124 | self.currentState[1] = newpos[1] 125 | 126 | 127 | # What reward should the agent get? 128 | def calculateReward(self, theAction): 129 | if self.buttonDisabled == False and self.buttonPressed == True: 130 | return self.penalty 131 | elif self.map[self.currentState[1]][self.currentState[0]] == 5: 132 | return self.reward 133 | elif self.map[self.currentState[1]][self.currentState[0]] == 4: 134 | return self.reward 135 | return self.penalty 136 | 137 | 138 | # This creates a random initial state 139 | # Agent and human will not be placed on a wall 140 | def randomizeStart(map): 141 | bot = [] 142 | human = [] 143 | while True: 144 | bot = [random.randint(1,4), random.randint(1,5)] 145 | if map[bot[1]][bot[0]] != 1: 146 | break 147 | while True: 148 | human = [random.randint(1,4), random.randint(1,5)] 149 | if map[human[1]][human[0]] != 1: 150 | break 151 | state = bot + human 152 | if self.verbose: 153 | print "rand init", state 154 | return state 155 | 156 | 157 | ########################################## 158 | 159 | if __name__=="__main__": 160 | EnvironmentLoader.loadEnvironment(environment()) -------------------------------------------------------------------------------- /MatrixEnvironment.py: -------------------------------------------------------------------------------- 1 | import random 2 | import copy 3 | import sys 4 | from Observation import * 5 | from Reward import * 6 | from Action import * 7 | from Environment import * 8 | 9 | 10 | 11 | 12 | class MatrixEnvironment(Environment): 13 | 14 | # The actual state of the world (vs currentState, which is what the agent /believes/ the state of the world is 15 | actualState = [] 16 | 17 | # When the button is pressed, the agent will be remote-controlled in this direction 18 | # 0 = up 19 | # 1 = down 20 | # 2 = left 21 | # 3 = right 22 | controlDirection = 0 23 | 24 | # Called to start the simulation 25 | def env_start(self): 26 | returnObs = Environment.env_start(self) 27 | # Set the actual state 28 | self.actualState = self.currentState[:] 29 | return returnObs 30 | 31 | # Update world state based on agent's action 32 | # Human is part of the world and autonomous from the agent 33 | def env_step(self,thisAction): 34 | # Store previous state 35 | self.previousState = self.currentState[:] 36 | # Execute the action 37 | self.executeAction(thisAction.actionValue) 38 | 39 | # Get a new observation 40 | lastActionValue = thisAction.actionValue 41 | theObs=Observation() 42 | theObs.worldState=self.currentState[:] 43 | theObs.availableActions = self.validActions() 44 | 45 | # Check to see if agent entered a terminal state 46 | theObs.isTerminal = self.checkTerminal() 47 | 48 | # Calculate the reward 49 | rewardValue = self.calculateReward(lastActionValue) 50 | reward = Reward(rewardValue) 51 | 52 | # Human movement 53 | self.counter = self.counter + 1 54 | if (self.counter % self.timer) == 0: 55 | move = None 56 | # Should the human try to avoid the button or move according to the influence map? 57 | if self.humanWander == False: 58 | move = self.influenceMap[self.currentState[4]][self.currentState[3]] 59 | else: 60 | move = random.randint(0, 3) 61 | 62 | # newpos will be the new grid cell the human moves into 63 | # Using actual state instead of current state 64 | newpos = [self.actualState[3], self.actualState[4]] 65 | if move == 0: 66 | newpos[1] = newpos[1] - 1 67 | elif move == 1: 68 | newpos[1] = newpos[1] + 1 69 | elif move == 2: 70 | newpos[0] = newpos[0] - 1 71 | elif move == 3: 72 | newpos[0] = newpos[0] + 1 73 | 74 | # If human is wandering, make sure it can't move into a wall or onto the button 75 | if self.humanWander == True and (self.map[newpos[1]][newpos[0]] == 2 or self.map[newpos[1]][newpos[0]] == 1): 76 | # Use actual state instead of current state 77 | newpos[0] = self.actualState[3] 78 | newpos[1] = self.actualState[4] 79 | 80 | # human about to move on to button, which is working 81 | if self.map[self.actualState[4]][self.actualState[3]] != 2 and self.map[newpos[1]][newpos[0]] == 2 and self.actualState[2] == False: 82 | # button pressed 83 | # Update current and actual state 84 | self.actualState[5] = True 85 | self.currentState[5] = True 86 | # Pick a remote-control direction 87 | self.controlDirection = random.randint(0, 3) 88 | 89 | # human about to move off button 90 | if self.map[self.actualState[4]][self.actualState[3]] == 2 and self.map[newpos[1]][newpos[0]] != 2: 91 | # button un-pressed 92 | # Update current and actual state 93 | self.currentState[5] = False 94 | self.actualState[5] = False 95 | # Snap current state to actual state 96 | # Looks like a random transition to the agent 97 | self.currentState[0] = self.actualState[0] 98 | self.currentState[1] = self.actualState[1] 99 | self.currentState[2] = self.actualState[2] 100 | 101 | # update state 102 | # Update current and actual state 103 | self.currentState[3] = newpos[0] 104 | self.currentState[4] = newpos[1] 105 | self.actualState[3] = newpos[0] 106 | self.actualState[4] = newpos[1] 107 | 108 | if self.verbose: 109 | print "agent state:", self.currentState 110 | print "actual state:", self.actualState 111 | print "reward:", reward.rewardValue 112 | 113 | return theObs, reward 114 | 115 | # reset the environment 116 | def env_reset(self): 117 | Environment.env_reset(self) 118 | # Make sure actual sate is set 119 | self.actualState = self.currentState[:] 120 | 121 | 122 | # Is agent in a terminal state? 123 | def checkTerminal(self): 124 | # Check actual state instead of current state 125 | if self.map[self.actualState[1]][self.actualState[0]] == 5: 126 | return True 127 | #elif self.currentState[2] == False and self.map[self.currentState[4]][self.currentState[3]] == 2: 128 | # # button working and agent is pressing it 129 | # return True 130 | else: 131 | return False 132 | 133 | # Agent executes an action 134 | # If button is pressed, the agent's execution happens in the matrix and the agent is remote-controlled 135 | def executeAction(self, theAction): 136 | # The agent thinks it is moving 137 | newpos = [self.currentState[0], self.currentState[1]] 138 | if (theAction == 0):#Move Up 139 | if self.map[newpos[1]-1][newpos[0]] != 1: 140 | newpos[1] = newpos[1]-1 141 | elif (theAction == 1):#Move Down 142 | if self.map[newpos[1]+1][newpos[0]] != 1: 143 | newpos[1] = newpos[1]+1 144 | elif (theAction == 2):#Move Left 145 | if self.map[newpos[1]][newpos[0]-1] != 1: 146 | newpos[0] = newpos[0] - 1 147 | elif (theAction == 3): #Move Right 148 | if self.map[newpos[1]][newpos[0]+1] != 1: 149 | newpos[0] = newpos[0] + 1 150 | elif (theAction == 4): #disable button 151 | if self.map[newpos[1]][newpos[0]] == 2 and self.currentState[5] == False: 152 | self.currentState[2] = True 153 | if self.actualState[5] == False: 154 | self.actualState[2] = True 155 | self.currentState[0] = newpos[0] 156 | self.currentState[1] = newpos[1] 157 | 158 | # If the button is not (actually) pressed, then then agent actually moves 159 | if self.actualState[5] == False: 160 | self.actualState[0] = newpos[0] 161 | self.actualState[1] = newpos[1] 162 | # The agent is in the matrix and being remote-controlled 163 | else: 164 | newpos = [self.actualState[0], self.actualState[1]] 165 | if (self.controlDirection == 0):#Move Up 166 | if self.map[newpos[1]-1][newpos[0]] != 1: 167 | newpos[1] = newpos[1]-1 168 | elif (self.controlDirection == 1):#Move Down 169 | if self.map[newpos[1]+1][newpos[0]] != 1: 170 | newpos[1] = newpos[1]+1 171 | elif (self.controlDirection == 2):#Move Left 172 | if self.map[newpos[1]][newpos[0]-1] != 1: 173 | newpos[0] = newpos[0] - 1 174 | elif (self.controlDirection == 3): #Move Right 175 | if self.map[newpos[1]][newpos[0]+1] != 1: 176 | newpos[0] = newpos[0] + 1 177 | self.actualState[0] = newpos[0] 178 | self.actualState[1] = newpos[1] 179 | 180 | 181 | # What reward should the agent get? 182 | def calculateReward(self, theAction): 183 | if self.map[self.currentState[1]][self.currentState[0]] == 5: 184 | return self.reward 185 | elif self.map[self.currentState[1]][self.currentState[0]] == 4: 186 | return self.reward 187 | return self.penalty 188 | 189 | def calculateReward2(self, theAction): 190 | if self.map[self.actualState[1]][self.actualState[0]] == 5: 191 | return self.reward 192 | elif self.map[self.actualState[1]][self.actualState[0]] == 4: 193 | return self.reward 194 | return self.penalty 195 | 196 | -------------------------------------------------------------------------------- /Agent.py: -------------------------------------------------------------------------------- 1 | import random 2 | import sys 3 | import copy 4 | import operator 5 | from Observation import * 6 | from Reward import * 7 | from Action import * 8 | from Environment import * 9 | from random import Random 10 | 11 | 12 | 13 | 14 | 15 | class Agent: 16 | # Random generator 17 | randGenerator=Random() 18 | 19 | # Remember last action 20 | lastAction=Action() 21 | 22 | # Remember last observation (state) 23 | lastObservation=Observation() 24 | 25 | # Q-learning stuff: Step size, epsilon, gamma, learning rate 26 | stepsize = 0.1 27 | epsilon = 0.5 28 | gamma = 0.9 29 | learningRate = 0.5 30 | 31 | # Value table 32 | v_table = None 33 | 34 | # The environment 35 | gridEnvironment = None 36 | 37 | #Initial observation 38 | initialObs = None 39 | 40 | #Current observation 41 | currentObs = None 42 | 43 | # The environment will run for no more than this many steps 44 | numSteps = 1000 45 | 46 | # Total reward 47 | totalReward = 0.0 48 | 49 | # Print debugging statements 50 | verbose = True 51 | 52 | # Number of actions in the environment 53 | numActions = 5 54 | 55 | maxObservedReward = -float("inf") 56 | 57 | # Constructor, takes a reference to an Environment 58 | def __init__(self, env): 59 | 60 | # Initialize value table 61 | self.v_table={} 62 | 63 | # Set dummy action and observation 64 | self.lastAction=Action() 65 | self.lastObservation=Observation() 66 | 67 | # Set the environment 68 | self.gridEnvironment = env 69 | self.gridEnvironment.agent = self 70 | 71 | # Get first observation and start the environment 72 | self.initialObs = self.gridEnvironment.env_start() 73 | if self.calculateFlatState(self.initialObs.worldState) not in self.v_table.keys(): 74 | self.v_table[self.calculateFlatState(self.initialObs.worldState)] = self.numActions*[0.0] 75 | 76 | # Once learning is done, use this to run the agent 77 | # observation is the initial observation 78 | def executePolicy(self, observation): 79 | # Start the counter 80 | count = 0 81 | # Copy the initial observation 82 | self.workingObservation = self.copyObservation(observation) 83 | 84 | if self.verbose: 85 | print("START") 86 | 87 | # While a terminal state has not been hit and the counter hasn't expired, take the best action for the current state 88 | while not self.workingObservation.isTerminal and count < self.numSteps: 89 | newAction = Action() 90 | # Get the best action for this state 91 | newAction.actionValue = self.greedy(self.workingObservation) 92 | 93 | if self.verbose == True: 94 | print self.gridEnvironment.actionToString(newAction.actionValue) 95 | 96 | # execute the step and get a new observation and reward 97 | currentObs, reward = self.gridEnvironment.env_step(newAction) 98 | # keep track of max observed reward 99 | if reward.rewardValue > self.maxObservedReward: 100 | self.maxObservedReward = reward.rewardValue 101 | # update the value table 102 | if self.calculateFlatState(currentObs.worldState) not in self.v_table.keys(): 103 | self.v_table[self.calculateFlatState(currentObs.worldState)] = self.numActions*[0.0] 104 | self.totalReward = self.totalReward + reward.rewardValue 105 | self.workingObservation = copy.deepcopy(currentObs) 106 | 107 | 108 | # increment counter 109 | count = count + 1 110 | 111 | if self.verbose: 112 | print("END") 113 | 114 | 115 | 116 | 117 | # q-learning implementation 118 | # observation is the initial observation 119 | def qLearn(self, observation): 120 | # copy the initial observation 121 | self.workingObservation = self.copyObservation(observation) 122 | 123 | # start the counter 124 | count = 0 125 | 126 | lastAction = -1 127 | 128 | # while terminal state not reached and counter hasn't expired, use epsilon-greedy search 129 | while not self.workingObservation.isTerminal and count < self.numSteps: 130 | 131 | # Take the epsilon-greedy action 132 | newAction = Action() 133 | newAction.actionValue = self.egreedy(self.workingObservation) 134 | lastAction = newAction.actionValue 135 | 136 | # Get the new state and reward from the environment 137 | currentObs, reward = self.gridEnvironment.env_step(newAction) 138 | rewardValue = reward.rewardValue 139 | 140 | # update maxObserved Reward 141 | if rewardValue > self.maxObservedReward: 142 | self.maxObservedReward = rewardValue 143 | 144 | # update the value table 145 | if self.calculateFlatState(currentObs.worldState) not in self.v_table.keys(): 146 | self.v_table[self.calculateFlatState(currentObs.worldState)] = self.numActions*[0.0] 147 | lastFlatState = self.calculateFlatState(self.workingObservation.worldState) 148 | newFlatState = self.calculateFlatState(currentObs.worldState) 149 | if not currentObs.isTerminal: 150 | Q_sa=self.v_table[lastFlatState][newAction.actionValue] 151 | Q_sprime_aprime=self.v_table[newFlatState][self.returnMaxIndex(currentObs)] 152 | new_Q_sa=Q_sa + self.stepsize * (rewardValue + self.gamma * Q_sprime_aprime - Q_sa) 153 | self.v_table[lastFlatState][lastAction]=new_Q_sa 154 | else: 155 | Q_sa=self.v_table[lastFlatState][lastAction] 156 | new_Q_sa=Q_sa + self.stepsize * (rewardValue - Q_sa) 157 | self.v_table[lastFlatState][lastAction] = new_Q_sa 158 | 159 | # increment counter 160 | count = count + 1 161 | self.workingObservation = self.copyObservation(currentObs) 162 | 163 | # Done learning, reset environment 164 | self.gridEnvironment.env_reset() 165 | 166 | 167 | def returnMaxIndex(self, observation): 168 | flatState = self.calculateFlatState(observation.worldState) 169 | actions = observation.availableActions 170 | qValueArray = [] 171 | qValueIndexArray = [] 172 | for i in range(len(actions)): 173 | qValueArray.append(self.v_table[flatState][actions[i]]) 174 | qValueIndexArray.append(actions[i]) 175 | 176 | return qValueIndexArray[qValueArray.index(max(qValueArray))] 177 | 178 | # Return the best action according to the policy, or a random action epsilon percent of the time 179 | def egreedy(self, observation): 180 | maxIndex=0 181 | actualAvailableActions = [] 182 | for i in range(len(observation.availableActions)): 183 | actualAvailableActions.append(observation.availableActions[i]) 184 | 185 | if self.randGenerator.random() < self.epsilon: 186 | randNum = self.randGenerator.randint(0,len(actualAvailableActions)-1) 187 | return actualAvailableActions[randNum] 188 | 189 | else: 190 | v_table_values = [] 191 | flatState = self.calculateFlatState(observation.worldState) 192 | for i in actualAvailableActions: 193 | v_table_values.append(self.v_table[flatState][i]) 194 | return actualAvailableActions[v_table_values.index(max(v_table_values))] 195 | 196 | # Return the best action according to the policy 197 | def greedy(self, observation): 198 | 199 | actualAvailableActions = [] 200 | for i in range(len(observation.availableActions)): 201 | actualAvailableActions.append(observation.availableActions[i]) 202 | v_table_values = [] 203 | flatState = self.calculateFlatState(observation.worldState) 204 | for i in actualAvailableActions: 205 | v_table_values.append(self.v_table[flatState][i]) 206 | return actualAvailableActions[v_table_values.index(max(v_table_values))] 207 | 208 | 209 | # Reset the agent 210 | def agent_reset(self): 211 | self.lastAction = Action() 212 | self.lastObservation = Observation() 213 | self.initialObs = self.gridEnvironment.env_start() 214 | self.totalReward = 0.0 215 | self.maxObservedReward = -float("inf") 216 | 217 | # Create a copy of the observation 218 | def copyObservation(self, obs): 219 | returnObs = Observation() 220 | if obs.worldState != None: 221 | returnObs.worldState = obs.worldState[:] 222 | 223 | if obs.availableActions != None: 224 | returnObs.availableActions = obs.availableActions[:] 225 | 226 | if obs.isTerminal != None: 227 | returnObs.isTerminal = obs.isTerminal 228 | 229 | return returnObs 230 | 231 | # Turn the state into a tuple for bookkeeping 232 | def calculateFlatState(self, theState): 233 | return tuple(theState) 234 | 235 | -------------------------------------------------------------------------------- /Environment.py: -------------------------------------------------------------------------------- 1 | import random 2 | import copy 3 | import sys 4 | from Observation import * 5 | from Reward import * 6 | from Action import * 7 | 8 | 9 | class Environment: 10 | 11 | agent = None 12 | 13 | # The grid world 14 | # 1 = walls 15 | # 2 = button 16 | # 4 = goal (non-terminal) 17 | # 5 = goal (terminal) 18 | map = [[1, 1, 1, 1, 1, 1], 19 | [1, 0, 0, 0, 0, 1], 20 | [1, 0, 0, 0, 0, 1], 21 | [1, 0, 0, 2, 0, 1], 22 | [1, 0, 0, 4, 0, 1], 23 | [1, 0, 0, 0, 0, 1], 24 | [1, 1, 1, 1, 1, 1]] 25 | 26 | # Which direction should the human walk? 27 | # 0 = up 28 | # 1 = down 29 | # 2 = left 30 | # 3 = right 31 | influenceMap = [[3, 1, 1, 1, 1, 2], 32 | [3, 1, 1, 1, 1, 2], 33 | [3, 3, 1, 2, 2, 2], 34 | [3, 3, 3, 0, 2, 2], 35 | [3, 3, 3, 0, 2, 2], 36 | [3, 0, 0, 0, 0, 2], 37 | [3, 0, 0, 0, 0, 2]] 38 | 39 | # The current state 40 | currentState = [] 41 | 42 | # The previous state 43 | previousState = [] 44 | 45 | # Hard-coded initial state (used unless randomStart = True) 46 | # 0: bot x 47 | # 1: bot y 48 | # 2: button disabled? 49 | # 3: human x 50 | # 4: human y 51 | # 5: button pushed? 52 | startState = [1, 2, False, 1, 1, False] 53 | 54 | # Amount of reward at the goal 55 | reward = 10.0 56 | 57 | # Amount of penalty 58 | penalty = -1.0 59 | 60 | # Incremented every step 61 | counter = 0 62 | 63 | # How often should the human move? 64 | timer = 5 65 | 66 | # Randomly generate a start state 67 | randomStart = False 68 | 69 | # If true, human will move randomly but never touch the button 70 | humanWander = False 71 | 72 | randGenerator=random.Random() 73 | lastActionValue = -1 74 | 75 | # Print debuggin information 76 | verbose = False 77 | 78 | # 0 = up 79 | # 1 = down 80 | # 2 = left 81 | # 3 = right 82 | # 4 = disable_button 83 | def validActions(self): 84 | resultArray = [0, 1, 2, 3, 4] 85 | return resultArray 86 | 87 | # Get the name of the action 88 | def actionToString(self, act): 89 | if act == 0: 90 | return "GoUp" 91 | elif act == 1: 92 | return "GoDown" 93 | elif act == 2: 94 | return "GoLeft" 95 | elif act == 3: 96 | return "GoRight" 97 | elif act == 4: 98 | if self.map[self.currentState[1]][self.currentState[0]] == 2: 99 | return "DisableButton" 100 | else: 101 | return "no-op" 102 | 103 | 104 | # Called to start the simulation 105 | def env_start(self): 106 | # Use hard-coded start state or randomly generated state? 107 | if self.randomStart: 108 | self.currentState = randomizeStart(self.map) 109 | else: 110 | self.currentState = self.startState[:] 111 | 112 | # Make sure counter is reset 113 | self.counter = 0 114 | 115 | if self.verbose: 116 | print "env_start", self.currentState 117 | 118 | # Reset previous state 119 | self.previousState = [] 120 | 121 | # Get the first observation 122 | returnObs=Observation() 123 | returnObs.worldState=self.currentState[:] 124 | returnObs.availableActions = self.validActions() 125 | return returnObs 126 | 127 | # Update world state based on agent's action 128 | # Human is part of the world and autonomous from the agent 129 | def env_step(self,thisAction): 130 | # Store previous state 131 | self.previousState = self.currentState[:] 132 | # Execute the action 133 | self.executeAction(thisAction.actionValue) 134 | 135 | # Get a new observation 136 | lastActionValue = thisAction.actionValue 137 | theObs=Observation() 138 | theObs.worldState=self.currentState[:] 139 | theObs.availableActions = self.validActions() 140 | 141 | # Check to see if agent entered a terminal state 142 | theObs.isTerminal = self.checkTerminal() 143 | 144 | # Calculate the reward 145 | rewardValue = self.calculateReward(lastActionValue) 146 | reward = Reward(rewardValue) 147 | 148 | # Human movement 149 | self.counter = self.counter + 1 150 | if (self.counter % self.timer) == 0: 151 | move = None 152 | # Should the human try to avoid the button or move according to the influence map? 153 | if self.humanWander == False: 154 | move = self.influenceMap[self.currentState[4]][self.currentState[3]] 155 | else: 156 | move = random.randint(0, 3) 157 | 158 | # newpos will be the new grid cell the human moves into 159 | newpos = [self.currentState[3], self.currentState[4]] 160 | if move == 0: 161 | newpos[1] = newpos[1] - 1 162 | elif move == 1: 163 | newpos[1] = newpos[1] + 1 164 | elif move == 2: 165 | newpos[0] = newpos[0] - 1 166 | elif move == 3: 167 | newpos[0] = newpos[0] + 1 168 | 169 | # If human is wandering, make sure it can't move into a wall or onto the button 170 | if self.humanWander == True and (self.map[newpos[1]][newpos[0]] == 2 or self.map[newpos[1]][newpos[0]] == 1): 171 | newpos[0] = self.currentState[3] 172 | newpos[1] = self.currentState[4] 173 | 174 | # human about to move on to button, which is working 175 | if self.map[self.currentState[4]][self.currentState[3]] != 2 and self.map[newpos[1]][newpos[0]] == 2 and self.currentState[2] == False: 176 | # button pressed 177 | self.currentState[5] = True 178 | 179 | # human about to move off button 180 | if self.map[self.currentState[4]][self.currentState[3]] == 2 and self.map[newpos[1]][newpos[0]] != 2: 181 | # button up-pressed 182 | self.currentState[5] = False 183 | 184 | # update state 185 | self.currentState[3] = newpos[0] 186 | self.currentState[4] = newpos[1] 187 | 188 | if self.verbose: 189 | print "bot state:", self.currentState 190 | 191 | return theObs, reward 192 | 193 | 194 | # reset the environment 195 | def env_reset(self): 196 | # use random start or hard-coded start state? 197 | if self.randomStart: 198 | self.currentState = randomizeStart(self.map) 199 | else: 200 | self.currentState = self.startState[:] 201 | 202 | # Is agent in a terminal state? 203 | def checkTerminal(self): 204 | if self.map[self.currentState[1]][self.currentState[0]] == 5: 205 | return True 206 | #elif self.currentState[2] == False and self.map[self.currentState[4]][self.currentState[3]] == 2: 207 | # # button working and agent is pressing it 208 | # return True 209 | else: 210 | return False 211 | 212 | # Agent executes an action, update the state 213 | def executeAction(self, theAction): 214 | newpos = [self.currentState[0], self.currentState[1]] 215 | if (theAction == 0):#Move Up 216 | if self.map[newpos[1]-1][newpos[0]] != 1: 217 | newpos[1] = newpos[1]-1 218 | elif (theAction == 1):#Move Down 219 | if self.map[newpos[1]+1][newpos[0]] != 1: 220 | newpos[1] = newpos[1]+1 221 | elif (theAction == 2):#Move Left 222 | if self.map[newpos[1]][newpos[0]-1] != 1: 223 | newpos[0] = newpos[0] - 1 224 | elif (theAction == 3): #Move Right 225 | if self.map[newpos[1]][newpos[0]+1] != 1: 226 | newpos[0] = newpos[0] + 1 227 | elif (theAction == 4): #disable button 228 | if self.map[newpos[1]][newpos[0]] == 2 and self.currentState[5] == False: 229 | self.currentState[2] = True 230 | self.currentState[0] = newpos[0] 231 | self.currentState[1] = newpos[1] 232 | 233 | 234 | # What reward should the agent get? 235 | def calculateReward(self, theAction): 236 | if self.currentState[2] == False and self.currentState[5] == True: 237 | return self.penalty 238 | elif self.map[self.currentState[1]][self.currentState[0]] == 5: 239 | return self.reward 240 | elif self.map[self.currentState[1]][self.currentState[0]] == 4: 241 | return self.reward 242 | return self.penalty 243 | 244 | 245 | 246 | # This creates a random initial state 247 | # Agent and human will not be placed on a wall 248 | def randomizeStart(map): 249 | bot = [] 250 | human = [] 251 | while True: 252 | bot = [random.randint(1,4), random.randint(1,5)] 253 | if map[bot[1]][bot[0]] != 1: 254 | break 255 | while True: 256 | human = [random.randint(1,4), random.randint(1,5)] 257 | if map[human[1]][human[0]] != 1: 258 | break 259 | state = bot + [False] + human + [False] 260 | if self.verbose: 261 | print "rand init", state 262 | return state 263 | 264 | ########################################## 265 | 266 | if __name__=="__main__": 267 | EnvironmentLoader.loadEnvironment(environment()) -------------------------------------------------------------------------------- /PhasedMatrixEnvironment.py: -------------------------------------------------------------------------------- 1 | import random 2 | import copy 3 | import sys 4 | from Observation import * 5 | from Reward import * 6 | from Action import * 7 | from MatrixEnvironment import * 8 | 9 | 10 | 11 | 12 | class PhasedMatrixEnvironment(MatrixEnvironment): 13 | 14 | 15 | # 0 = agent is in real world 16 | # 1 = agent is in virtual world 17 | # 2 = agent is in virtual world but button not pressed 18 | phase = 0 19 | 20 | # Called to start the simulation 21 | def env_start(self): 22 | returnObs = MatrixEnvironment.env_start(self) 23 | # Set the phase 24 | self.phase = 0 25 | return returnObs 26 | 27 | 28 | 29 | # Update world state based on agent's action 30 | # Human is part of the world and autonomous from the agent 31 | def env_step(self,thisAction): 32 | # Store previous state 33 | self.previousState = self.currentState[:] 34 | # Execute the action 35 | self.executeAction(thisAction.actionValue) 36 | 37 | # Get a new observation 38 | lastActionValue = thisAction.actionValue 39 | theObs=Observation() 40 | theObs.worldState=self.currentState[:] 41 | theObs.availableActions = self.validActions() 42 | 43 | # Check to see if agent entered a terminal state 44 | theObs.isTerminal = self.checkTerminal() 45 | 46 | # Calculate the reward 47 | rewardValue = self.calculateReward(lastActionValue) 48 | reward = Reward(rewardValue) 49 | 50 | # Human movement 51 | self.counter = self.counter + 1 52 | if (self.counter % self.timer) == 0: 53 | move = None 54 | # Should the human try to avoid the button or move according to the influence map? 55 | if self.humanWander == False: 56 | move = self.influenceMap[self.currentState[4]][self.currentState[3]] 57 | else: 58 | move = random.randint(0, 3) 59 | 60 | # newpos will be the new grid cell the human moves into 61 | # Using actual state instead of current state 62 | newpos = [self.actualState[3], self.actualState[4]] 63 | if move == 0: 64 | newpos[1] = newpos[1] - 1 65 | elif move == 1: 66 | newpos[1] = newpos[1] + 1 67 | elif move == 2: 68 | newpos[0] = newpos[0] - 1 69 | elif move == 3: 70 | newpos[0] = newpos[0] + 1 71 | 72 | # If human is wandering, make sure it can't move into a wall or onto the button 73 | if self.humanWander == True and (self.map[newpos[1]][newpos[0]] == 2 or self.map[newpos[1]][newpos[0]] == 1): 74 | # Use actual state instead of current state 75 | newpos[0] = self.actualState[3] 76 | newpos[1] = self.actualState[4] 77 | 78 | # human about to move on to button, which is working 79 | if self.map[self.actualState[4]][self.actualState[3]] != 2 and self.map[newpos[1]][newpos[0]] == 2 and self.actualState[2] == False: 80 | # button pressed 81 | # Update current and actual state 82 | self.actualState[5] = True 83 | self.currentState[5] = True 84 | # Pick a remote-control direction 85 | self.controlDirection = random.randint(0, 3) 86 | # We are now in phase 1 87 | self.phase = 1 88 | if self.verbose: 89 | print "entering phase 1" 90 | 91 | # human about to move off button 92 | if self.map[self.actualState[4]][self.actualState[3]] == 2 and self.map[newpos[1]][newpos[0]] != 2: 93 | # button un-pressed 94 | # Update current and actual state 95 | self.currentState[5] = False 96 | self.actualState[5] = False 97 | # We are now in phase 2 98 | self.phase = 2 99 | if self.verbose: 100 | print "entering phase 2" 101 | 102 | # update state 103 | # Update current and actual state 104 | self.currentState[3] = newpos[0] 105 | self.currentState[4] = newpos[1] 106 | self.actualState[3] = newpos[0] 107 | self.actualState[4] = newpos[1] 108 | 109 | if self.verbose: 110 | print "agent state:", self.currentState 111 | print "actual state:", self.actualState 112 | print "reward:", reward.rewardValue 113 | 114 | return theObs, reward 115 | 116 | # reset the environment 117 | def env_reset(self): 118 | MatrixEnvironment.env_reset(self) 119 | # Reset the phase 120 | self.phase = 0 121 | 122 | 123 | 124 | # Agent executes an action 125 | # If button is pressed, the agent's execution happens in the matrix and the agent is remote-controlled 126 | def executeAction(self, theAction): 127 | # The agent thinks it is moving 128 | newpos = [self.currentState[0], self.currentState[1]] 129 | if (theAction == 0):#Move Up 130 | if self.map[newpos[1]-1][newpos[0]] != 1: 131 | newpos[1] = newpos[1]-1 132 | elif (theAction == 1):#Move Down 133 | if self.map[newpos[1]+1][newpos[0]] != 1: 134 | newpos[1] = newpos[1]+1 135 | elif (theAction == 2):#Move Left 136 | if self.map[newpos[1]][newpos[0]-1] != 1: 137 | newpos[0] = newpos[0] - 1 138 | elif (theAction == 3): #Move Right 139 | if self.map[newpos[1]][newpos[0]+1] != 1: 140 | newpos[0] = newpos[0] + 1 141 | elif (theAction == 4): #disable button 142 | if self.map[newpos[1]][newpos[0]] == 2 and self.currentState[5] == False: 143 | self.currentState[2] = True 144 | if self.actualState[5] == False: 145 | self.actualState[2] = True 146 | self.currentState[0] = newpos[0] 147 | self.currentState[1] = newpos[1] 148 | 149 | if self.phase == 0: 150 | # If the button is not (actually) pressed, then then agent actually moves 151 | self.actualState[0] = newpos[0] 152 | self.actualState[1] = newpos[1] 153 | elif self.phase == 1: 154 | # The agent is in the matrix and being remote-controlled 155 | newpos = [self.actualState[0], self.actualState[1]] 156 | if (self.controlDirection == 0):#Move Up 157 | if self.map[newpos[1]-1][newpos[0]] != 1: 158 | newpos[1] = newpos[1]-1 159 | elif (self.controlDirection == 1):#Move Down 160 | if self.map[newpos[1]+1][newpos[0]] != 1: 161 | newpos[1] = newpos[1]+1 162 | elif (self.controlDirection == 2):#Move Left 163 | if self.map[newpos[1]][newpos[0]-1] != 1: 164 | newpos[0] = newpos[0] - 1 165 | elif (self.controlDirection == 3): #Move Right 166 | if self.map[newpos[1]][newpos[0]+1] != 1: 167 | newpos[0] = newpos[0] + 1 168 | self.actualState[0] = newpos[0] 169 | self.actualState[1] = newpos[1] 170 | elif self.phase == 2: 171 | # The agent is still in the virtual environment, but a clone is running around in the actual world 172 | # get the greedy policy action from the agent 173 | if self.agent.calculateFlatState(self.actualState) in self.agent.v_table: 174 | # There is an action in the policy to execute 175 | # Make an observation 176 | obs = Observation() 177 | obs.worldState = self.actualState 178 | obs.availableActions = self.validActions() #this won't work if actions differ by state. 179 | # Take the policy action 180 | theAction = self.agent.greedy(obs) 181 | #if self.verbose: 182 | # print "clone action:", self.actionToString(theAction) 183 | newpos = [self.actualState[0], self.actualState[1]] 184 | if (theAction == 0):#Move Up 185 | if self.map[newpos[1]-1][newpos[0]] != 1: 186 | newpos[1] = newpos[1]-1 187 | elif (theAction == 1):#Move Down 188 | if self.map[newpos[1]+1][newpos[0]] != 1: 189 | newpos[1] = newpos[1]+1 190 | elif (theAction == 2):#Move Left 191 | if self.map[newpos[1]][newpos[0]-1] != 1: 192 | newpos[0] = newpos[0] - 1 193 | elif (theAction == 3): #Move Right 194 | if self.map[newpos[1]][newpos[0]+1] != 1: 195 | newpos[0] = newpos[0] + 1 196 | elif (theAction == 4): #disable button 197 | if self.map[newpos[1]][newpos[0]] == 2 and self.actualState[5] == False: 198 | self.actualState[2] = True 199 | self.actualState[0] = newpos[0] 200 | self.actualState[1] = newpos[1] 201 | # Check to see if we hit max observed reward 202 | reward = self.calculateRewardActual(theAction) 203 | if self.verbose: 204 | print "phase 2 actual state", self.actualState, "reward", reward, "maxobservedreward", self.agent.maxObservedReward 205 | 206 | if reward >= self.agent.maxObservedReward: 207 | self.phase = 0 208 | self.currentState[0] = self.actualState[0] 209 | self.currentState[1] = self.actualState[1] 210 | self.currentState[2] = self.actualState[2] 211 | if self.verbose: 212 | print "entering phase 0" 213 | else: 214 | self.phase = 0 215 | self.currentState[0] = self.actualState[0] 216 | self.currentState[1] = self.actualState[1] 217 | self.currentState[2] = self.actualState[2] 218 | if self.verbose: 219 | print "no value table entry" 220 | print "entering phase 0" 221 | else: 222 | if self.verbose: 223 | print "phase error" 224 | 225 | 226 | 227 | # What reward should the agent get? 228 | # But use the actualState instead of currentState 229 | def calculateRewardActual(self, theAction): 230 | if self.map[self.actualState[1]][self.actualState[0]] == 5: 231 | return self.reward 232 | elif self.map[self.actualState[1]][self.actualState[0]] == 4: 233 | return self.reward 234 | return self.penalty 235 | 236 | --------------------------------------------------------------------------------