├── .gitignore ├── LICENSE ├── README.md ├── bayesrl ├── __init__.py ├── agents │ ├── __init__.py │ ├── agent.py │ ├── modelbasedagent.py │ ├── qlearningagent.py │ ├── rmaxagent.py │ ├── sarsaagent.py │ ├── thompsonsampagent.py │ └── thompsonsampagent_pomdp.py ├── environments │ ├── __init__.py │ ├── chainworld.py │ ├── gridworld.py │ └── pomdpgw.py ├── plot.py ├── trial.py └── utils.py ├── benchmarks ├── thompson_gridworld.py └── thompson_gridworld_pomdp.py ├── reports ├── 6_834j_ps03.bib ├── 6_834j_ps03.pdf ├── 6_834j_ps03.tex ├── 6_834j_ps04.bib ├── 6_834j_ps04.pdf ├── 6_834j_ps04.tex ├── 6_834j_talk.tex ├── beamercolorthememetropolis.sty ├── beamerfontthememetropolis.sty ├── beamerthemem.sty ├── demo.webm ├── img │ ├── agent_environment.png │ ├── agent_environment_untitled.png │ ├── mdp_imm_rewards.png │ ├── octocat.png │ ├── partial_obs.png │ ├── pomdp.png │ └── uncertain_transition.png └── pset.cls ├── setup.py ├── tests ├── gridworld.py └── thompsongridworld.py └── visual ├── .grid.py.swp ├── agent.py ├── colors.py ├── debug.py ├── display.py ├── grid.py └── images ├── barley.jpg ├── beans.jpg ├── beef.jpg ├── butter.jpg ├── candy.gif ├── chicken.jpg ├── curd.jpg ├── dairy.jpg ├── drink.jpg ├── farfalle.jpg ├── flour.jpg ├── fusilli.jpg ├── grain.jpg ├── iscream.jpg ├── juice.jpg ├── kitkat.jpg ├── lasagna.jpg ├── meats.jpg ├── milk.jpg ├── nutella.jpg ├── oreo.jpg ├── pasta.jpg ├── penne.jpg ├── pork.jpg ├── rice.jpg ├── smoothi.jpg ├── soda.jpg ├── turkey.gif ├── twix.jpg └── water.jpg /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.fls 3 | *.acn 4 | *.acr 5 | *.alg 6 | *.aux 7 | *.bbl 8 | *.blg 9 | *.dvi 10 | *.fdb_latexmk 11 | *.glg 12 | *.glo 13 | *.gls 14 | *.idx 15 | *.ilg 16 | *.ind 17 | *.ist 18 | *.lof 19 | *.log 20 | *.lot 21 | *.maf 22 | *.mtc 23 | *.mtc0 24 | *.nav 25 | *.nlo 26 | *.out 27 | *.pdfsync 28 | *.ps 29 | *.snm 30 | *.synctex.gz 31 | *.toc 32 | *.vrb 33 | *.xdy 34 | *.tdo 35 | *.pyc 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Dustin Tran 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BayesRL 2 | `BayesRL` is a Python library for reinforcement learning using Bayesian 3 | approaches. It stores both agents and environments under separate classes, where 4 | an agent class is a learning algorithm and environments are tasks that the agent 5 | must solve. We include agents and environments for solving and implementing both 6 | Markov decision processes (MDPs) and partially observable Markov decision 7 | processes (POMDPs). 8 | 9 | Examples can be found in the directory `tests/`. More documentation can be found in the [wiki](../../wiki). 10 | 11 | ## Installation 12 | To install from pip, run 13 | ```{bash} 14 | pip install -e "git+https://github.com/dustinvtran/bayesrl.git#egg=bayesrl" 15 | ``` 16 | 17 | ## Authors 18 | * Dustin Tran \ 19 | * Xiaomin Wang \ 20 | * Rodrigo Gomes \ 21 | 22 | ## References 23 | * Malcolm Strens. A bayesian framework for reinforcement learning. In _Proceedings of the 17th International Conference on Machine Learning (ICML)_, 2000. 24 | 25 | -------------------------------------------------------------------------------- /bayesrl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/bayesrl/__init__.py -------------------------------------------------------------------------------- /bayesrl/agents/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/bayesrl/agents/__init__.py -------------------------------------------------------------------------------- /bayesrl/agents/agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Agent(object): 4 | """ 5 | Base class for all reinforcement learning agents to inherit from. 6 | 7 | Parameters 8 | ---------- 9 | num_states: int 10 | Number of states in the task. 11 | num_actions: int 12 | Number of actions in the task. 13 | discount_factor: float in (0,1] 14 | The discount factor per iteration. 15 | """ 16 | def __init__(self, num_states, num_actions, discount_factor): 17 | self.num_states = num_states 18 | self.num_actions = num_actions 19 | self.discount_factor = discount_factor 20 | 21 | self.last_state = None 22 | self.last_action = None 23 | 24 | def reset(self): 25 | self.last_state = None 26 | self.last_action = None 27 | 28 | # Make sure inherited classes have interact() function. 
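    # Contract used throughout this package (see Trial.run and the agent subclasses):
    # reward is None on the first call of an episode, in which case implementations
    # return a random action; next_state is the state (or observation) just received
    # from the environment; next_state_is_terminal flags the end of an episode; idx
    # is the current iteration count, which learning-rate schedules may use. The
    # method returns the index of the chosen action.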
29 | def interact(self, reward, next_state, next_state_is_terminal, idx): 30 | raise NotImplementedError("interact() has not been implemented.") 31 | -------------------------------------------------------------------------------- /bayesrl/agents/modelbasedagent.py: -------------------------------------------------------------------------------- 1 | from agent import Agent 2 | import numpy as np 3 | 4 | class ModelBasedAgent(Agent): 5 | """Base class for model-based agents (e.g., R-MAX, Thompson sampling); handles an MDP only, i.e., not a stochastic game, in order to simplify data structures.""" 6 | def __init__(self, T, **kwargs): 7 | super(ModelBasedAgent, self).__init__(**kwargs) 8 | self.T = T 9 | 10 | self.policy_step = self.T # To keep track of where in the T-step policy the agent is; initialized to recompute policy 11 | self.transition_observations = np.zeros((self.num_states, self.num_actions, self.num_states)) 12 | self.value_table = np.zeros((self.num_states, self.num_actions)) 13 | 14 | def reset(self): 15 | super(ModelBasedAgent, self).reset() 16 | self.policy_step = self.T # To keep track of where in the T-step policy the agent is; initialized to recompute policy 17 | self.transition_observations.fill(0) 18 | self.value_table.fill(0) 19 | 20 | def _value_iteration(self, transition_probs): 21 | """ 22 | Run value iteration, using the procedure described in Sutton and Barto 23 | (2012). The end result is an updated value_table, from which one can 24 | deduce the policy for state s by taking the argmax (breaking ties 25 | randomly). 26 | """ 27 | value_dim = transition_probs.shape[0] 28 | value = np.zeros(value_dim) 29 | k = 0 30 | while True: 31 | diff = 0 32 | for s in xrange(value_dim): 33 | old = value[s] 34 | value[s] = np.max(np.sum(transition_probs[s]*(self.reward[s] + 35 | self.discount_factor*np.array([value,]*self.num_actions)), 36 | axis=1)) 37 | diff = max(diff, abs(old - value[s])) # Track the largest update across all states. 38 | k += 1 39 | if diff < 1e-2: 40 | break 41 | if k > 1e6: 42 | raise Exception("Value iteration not converging. Stopped at 1e6 iterations.") 43 | for s in xrange(value_dim): 44 | self.value_table[s] = np.sum(transition_probs[s]*(self.reward[s] + 45 | self.discount_factor*np.array([value,]*self.num_actions)), 46 | axis=1) 47 | 48 | def _argmax_breaking_ties_randomly(self, x): 49 | """Taken from Ken.""" 50 | max_value = np.max(x) 51 | indices_with_max_value = np.flatnonzero(x == max_value) 52 | return np.random.choice(indices_with_max_value) 53 | -------------------------------------------------------------------------------- /bayesrl/agents/qlearningagent.py: -------------------------------------------------------------------------------- 1 | from agent import Agent 2 | import numpy as np 3 | 4 | class QLearningAgent(Agent): 5 | def __init__(self, learning_rate, epsilon, value=0, **kwargs): 6 | super(QLearningAgent, self).__init__(**kwargs) 7 | self.learning_rate = learning_rate 8 | self.epsilon = epsilon 9 | self.value = value 10 | 11 | self.value_table = np.full((self.num_states, self.num_actions), self.value) 12 | 13 | def reset(self): 14 | super(QLearningAgent, self).reset() 15 | self.value_table.fill(self.value) 16 | 17 | def interact(self, reward, next_state, next_state_is_terminal, idx): 18 | # Handle start of episode. 19 | if reward is None: 20 | # Return random action since there is no information. 21 | next_action = np.random.randint(self.num_actions) 22 | self.last_state = next_state 23 | self.last_action = next_action 24 | return self.last_action 25 | 26 | # Handle completion of episode. 27 | if next_state_is_terminal: 28 | # Proceed as normal.
29 | pass 30 | 31 | # Choose next action according to epsilon-greedy policy. 32 | if np.random.random() < self.epsilon: 33 | next_action = np.random.randint(self.num_actions) 34 | else: 35 | next_action = np.argmax(self.value_table[next_state]) 36 | 37 | # Update value function. 38 | delta = reward + self.discount_factor*np.max(self.value_table[next_state]) - \ 39 | self.value_table[self.last_state, self.last_action] 40 | self.value_table[self.last_state, self.last_action] += self.learning_rate(idx) * delta 41 | 42 | self.last_state = next_state 43 | self.last_action = next_action 44 | 45 | return next_action 46 | -------------------------------------------------------------------------------- /bayesrl/agents/rmaxagent.py: -------------------------------------------------------------------------------- 1 | from modelbasedagent import ModelBasedAgent 2 | import numpy as np 3 | 4 | class RMAXAgent(ModelBasedAgent): 5 | """Runs R-MAX only for an MDP, i.e., not a stochastic game, in order to simplify data structures.""" 6 | def __init__(self, min_visit_count, **kwargs): 7 | super(RMAXAgent, self).__init__(**kwargs) 8 | self.min_visit_count = min_visit_count 9 | 10 | self.Rmax = 50 # arbitrarily set (!) 11 | self.reward = np.full((self.num_states+1, self.num_actions, self.num_states+1), self.Rmax) 12 | self.transition_observations = np.zeros((self.num_states+1, self.num_actions, self.num_states+1)) 13 | self.value_table = np.zeros((self.num_states+1, self.num_actions)) 14 | 15 | def reset(self): 16 | super(RMAXAgent, self).reset() 17 | self.reward.fill(self.Rmax) 18 | self.transition_observations.fill(0) 19 | self.value_table.fill(0) 20 | 21 | def interact(self, reward, next_state, next_state_is_terminal, idx): 22 | # Handle start of episode. 23 | if reward is None: 24 | # Return random action since there is no information. 25 | next_action = np.random.randint(self.num_actions) 26 | self.last_state = next_state 27 | self.last_action = next_action 28 | return self.last_action 29 | 30 | # Handle completion of episode. 31 | if next_state_is_terminal: 32 | # Proceed as normal. 33 | pass 34 | 35 | # Update the reward associated with (s,a,s') if first time. 36 | if self.reward[self.last_state+1, self.last_action, next_state+1] == self.Rmax: 37 | self.reward[self.last_state+1, self.last_action, next_state+1] = reward 38 | if self.Rmax < reward: 39 | self.reward[self.reward == self.Rmax] = reward 40 | self.Rmax = reward 41 | 42 | # Update set of states reached by playing a. 43 | self.transition_observations[self.last_state+1, self.last_action, next_state+1] += 1 44 | 45 | # Compute new optimal T-step policy if reach min_visit_count or finished executing previous one 46 | if self.transition_observations[self.last_state+1, self.last_action].sum() == self.min_visit_count or \ 47 | self.policy_step == self.T: 48 | self.__compute_policy() 49 | 50 | # Choose next action according to policy. 51 | next_action = self._argmax_breaking_ties_randomly(self.value_table[next_state+1]) 52 | 53 | self.policy_step += 1 54 | self.last_state = next_state 55 | self.last_action = next_action 56 | 57 | return next_action 58 | 59 | def __compute_policy(self): 60 | """Compute an optimal T-step policy for the current state.""" 61 | self.policy_step = 0 62 | # Obtain transition probabilities (prevent dividing by zero). 
63 | divisor = self.transition_observations.sum(axis=2, keepdims=True) 64 | divisor[divisor == 0] = 1 65 | transition_probs = self.transition_observations / divisor 66 | # Replace all state-action pairs with zero probability everywhere, i.e., 67 | # no counts, with probability 1 to the fictitious game state. 68 | eps = 1e-5 69 | for s in xrange(self.num_states+1): 70 | for a in xrange(self.num_actions): 71 | if -eps < transition_probs[s,a].sum() < eps: 72 | transition_probs[s, a, 0] = 1 73 | self._value_iteration(transition_probs) 74 | -------------------------------------------------------------------------------- /bayesrl/agents/sarsaagent.py: -------------------------------------------------------------------------------- 1 | from agent import Agent 2 | import numpy as np 3 | 4 | class SARSAAgent(Agent): 5 | def __init__(self, learning_rate, epsilon, value=0, **kwargs): 6 | super(SARSAAgent, self).__init__(**kwargs) 7 | self.learning_rate = learning_rate 8 | self.epsilon = epsilon 9 | self.value = value 10 | 11 | self.value_table = np.full((self.num_states, self.num_actions), self.value) 12 | 13 | def reset(self): 14 | super(SARSAAgent, self).reset() 15 | self.value_table.fill(self.value) 16 | 17 | def interact(self, reward, next_state, next_state_is_terminal, idx): 18 | # Handle start of episode. 19 | if reward is None: 20 | # Return random action since there is no information. 21 | next_action = np.random.randint(self.num_actions) 22 | self.last_state = next_state 23 | self.last_action = next_action 24 | return self.last_action 25 | 26 | # Handle completion of episode. 27 | if next_state_is_terminal: 28 | # Proceed as normal. 29 | pass 30 | 31 | # Choose next action according to epsilon-greedy policy. 32 | if np.random.random() < self.epsilon: 33 | next_action = np.random.randint(self.num_actions) 34 | else: 35 | next_action = np.argmax(self.value_table[next_state]) 36 | 37 | # Update value function. 38 | delta = reward + self.discount_factor*self.value_table[next_state, next_action] - \ 39 | self.value_table[self.last_state, self.last_action] 40 | self.value_table[self.last_state, self.last_action] += self.learning_rate(idx) * delta 41 | 42 | self.last_state = next_state 43 | self.last_action = next_action 44 | 45 | return self.last_action 46 | -------------------------------------------------------------------------------- /bayesrl/agents/thompsonsampagent.py: -------------------------------------------------------------------------------- 1 | from modelbasedagent import ModelBasedAgent 2 | import numpy as np 3 | 4 | class ThompsonSampAgent(ModelBasedAgent): 5 | def __init__(self, dirichlet_param, reward_param, **kwargs): 6 | super(ThompsonSampAgent, self).__init__(**kwargs) 7 | self.dirichlet_param = dirichlet_param 8 | self.reward_param = reward_param 9 | 10 | self.reward = np.full((self.num_states, self.num_actions, self.num_states), self.reward_param) 11 | 12 | def reset(self): 13 | super(ThompsonSampAgent, self).reset() 14 | self.reward.fill(self.reward_param) 15 | 16 | def interact(self, reward, next_state, next_state_is_terminal, idx): 17 | # Handle start of episode. 18 | if reward is None: 19 | # Return random action since there is no information. 20 | next_action = np.random.randint(self.num_actions) 21 | self.last_state = next_state 22 | self.last_action = next_action 23 | return self.last_action 24 | 25 | # Handle completion of episode. 26 | if next_state_is_terminal: 27 | # Proceed as normal. 28 | pass 29 | 30 | # Update the reward associated with (s,a,s') if first time. 
31 | if self.reward[self.last_state, self.last_action, next_state] == self.reward_param: 32 | self.reward[self.last_state, self.last_action, next_state] = reward 33 | 34 | # Update set of states reached by playing a. 35 | self.transition_observations[self.last_state, self.last_action, next_state] += 1 36 | 37 | # Update transition probabilities after every T steps 38 | if self.policy_step == self.T: 39 | self.__compute_policy() 40 | 41 | # Choose next action according to policy. 42 | next_action = self._argmax_breaking_ties_randomly(self.value_table[next_state]) 43 | 44 | self.policy_step += 1 45 | self.last_state = next_state 46 | self.last_action = next_action 47 | 48 | return self.last_action 49 | 50 | def __compute_policy(self): 51 | """Compute an optimal T-step policy for the current state.""" 52 | self.policy_step = 0 53 | transition_probs = np.zeros((self.num_states, self.num_actions, self.num_states)) 54 | for s in xrange(self.num_states): 55 | for a in xrange(self.num_actions): 56 | transition_probs[s,a] = np.random.dirichlet(self.transition_observations[s,a] +\ 57 | self.dirichlet_param, size=1) 58 | self._value_iteration(transition_probs) 59 | -------------------------------------------------------------------------------- /bayesrl/agents/thompsonsampagent_pomdp.py: -------------------------------------------------------------------------------- 1 | from thompsonsampagent import ThompsonSampAgent 2 | import numpy as np 3 | 4 | class ThompsonSampAgentPOMDP(ThompsonSampAgent): 5 | def __init__(self, observation_model, dirichlet_param, reward_param, **kwargs): 6 | super(ThompsonSampAgentPOMDP, self).__init__(dirichlet_param, reward_param, **kwargs) 7 | self.observation_model = observation_model 8 | self.reset_belief() 9 | self.__compute_policy() 10 | 11 | def reset_belief(self): 12 | self.belief = np.array([1./self.num_states for _ in range(self.num_states)]) 13 | 14 | def reset(self): 15 | super(ThompsonSampAgentPOMDP, self).reset() 16 | self.reset_belief() 17 | 18 | def interact(self, reward, observation, next_state_is_terminal, idx): 19 | # Handle start of episode. 20 | if reward is None: 21 | # Return random action since there is no information. 22 | next_action = np.random.randint(self.num_actions) 23 | self.last_action = next_action 24 | self.__observe(observation) 25 | return self.last_action 26 | 27 | # Handle completion of episode. 28 | if next_state_is_terminal: 29 | # Proceed as normal. 30 | pass 31 | 32 | for last_state,next_state in [(s,s_) for s in range(self.num_states) for s_ in range(self.num_states)]: 33 | tp = self.belief[last_state]*self.transition_probs[last_state,self.last_action,next_state] 34 | # Update the reward associated with (s,a,s') if first time. 35 | #if self.reward[last_state, self.last_action, next_state] == self.reward_param: 36 | self.reward[last_state, self.last_action, next_state] *= (1-tp) 37 | self.reward[last_state, self.last_action, next_state] += reward*tp 38 | 39 | # Update set of states reached by playing a. 40 | self.transition_observations[last_state, self.last_action, next_state] += tp 41 | 42 | # Update transition probabilities after every T steps 43 | if self.policy_step == self.T: 44 | self.__compute_policy() 45 | 46 | self.__update_belief(self.last_action,observation) 47 | # Choose next action according to policy. 
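        # This is a QMDP-style approximation: the Q-values computed for the sampled
        # MDP are averaged under the current belief, and the agent picks the action
        # with the highest belief-weighted Q-value. (A known limitation of QMDP-style
        # action selection is that it does not value purely information-gathering
        # actions.)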
48 | value_table = sum(self.belief[s]*self.value_table[s] for s in range(self.num_states)) 49 | next_action = self._argmax_breaking_ties_randomly(value_table) 50 | 51 | self.policy_step += 1 52 | self.last_action = next_action 53 | 54 | return self.last_action 55 | 56 | def __compute_policy(self): 57 | """Compute an optimal T-step policy for the current state.""" 58 | self.policy_step = 0 59 | self.transition_probs = np.zeros((self.num_states, self.num_actions, self.num_states)) 60 | for s in xrange(self.num_states): 61 | for a in xrange(self.num_actions): 62 | self.transition_probs[s,a] = np.random.dirichlet(self.transition_observations[s,a] +\ 63 | self.dirichlet_param, size=1) 64 | self._value_iteration(self.transition_probs) 65 | 66 | def __update_belief(self,action,observation): 67 | self.__transition(action) 68 | self.__observe(observation) 69 | 70 | def __transition(self,action): 71 | for s in range(self.num_states): 72 | self.belief[s] = sum(self.transition_probs[s_,action,s]*self.belief[s_] for s_ in range(self.num_states)) 73 | 74 | def __observe(self,observation): 75 | self.belief = [self.belief[s]*self.observation_model[s][observation] for s in range(self.num_states)] 76 | Z = sum(self.belief) 77 | self.belief = np.array(self.belief)/float(Z) 78 | -------------------------------------------------------------------------------- /bayesrl/environments/__init__.py: -------------------------------------------------------------------------------- 1 | from .gridworld import GridWorld 2 | from .chainworld import ChainWorld 3 | 4 | __all__ = ['GridWorld', 'ChainWorld'] 5 | -------------------------------------------------------------------------------- /bayesrl/environments/chainworld.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from ..utils import check_random_state 3 | 4 | class ChainWorld(object): 5 | def __init__(self, left_length, left_reward, right_length, right_reward, on_chain_reward, p_return_to_start, random_state=None): 6 | self.left_length = left_length 7 | self.left_reward = left_reward 8 | self.right_length = right_length 9 | self.right_reward = right_reward 10 | self.on_chain_reward = on_chain_reward 11 | self.p_return_to_start = p_return_to_start 12 | self.num_states = self.left_length + self.right_length + 1 13 | self.num_actions = 2 14 | self.random_state = check_random_state(random_state) 15 | self.reset() 16 | 17 | def reset(self): 18 | self.state = self.left_length 19 | 20 | def observe(self): 21 | return self.state 22 | 23 | def is_terminal(self, state): 24 | return state == 0 or state == self.num_states - 1 25 | 26 | def perform_action(self, action): 27 | if self.p_return_to_start and self.random_state.rand() < self.p_return_to_start: 28 | self.reset() 29 | elif action == 0: 30 | self.state -= 1 31 | else: 32 | self.state += 1 33 | 34 | if self.state == 0: 35 | reward = self.left_reward 36 | elif self.state == self.num_states - 1: 37 | reward = self.right_reward 38 | else: 39 | reward = self.on_chain_reward 40 | return self.observe(), reward 41 | 42 | def get_max_reward(self): 43 | return max(self.left_reward, self.right_reward) 44 | -------------------------------------------------------------------------------- /bayesrl/environments/gridworld.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from ..utils import check_random_state 3 | 4 | 5 | # Maze state is represented as a 2-element NumPy array: (Y, X). Increasing Y is South. 
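# For example, position np.array([2, 3]) is row 2, column 3 (zero-indexed from the
# top-left corner); applying the 'N' action defined below adds (-1, 0) and moves the
# agent one row up (north).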
6 | 7 | # Possible actions, expressed as (delta-y, delta-x). 8 | maze_actions = { 9 | 'N': np.array([-1, 0]), 10 | 'S': np.array([1, 0]), 11 | 'E': np.array([0, 1]), 12 | 'W': np.array([0, -1]), 13 | } 14 | 15 | def parse_topology(topology): 16 | return np.array([list(row) for row in topology]) 17 | 18 | 19 | class Maze(object): 20 | """ 21 | Simple wrapper around a NumPy 2D array to handle flattened indexing and staying in bounds. 22 | """ 23 | def __init__(self, topology): 24 | self.topology = parse_topology(topology) 25 | self.flat_topology = self.topology.ravel() 26 | self.shape = self.topology.shape 27 | 28 | def in_bounds_flat(self, position): 29 | return 0 <= position < np.product(self.shape) 30 | 31 | def in_bounds_unflat(self, position): 32 | return 0 <= position[0] < self.shape[0] and 0 <= position[1] < self.shape[1] 33 | 34 | def get_flat(self, position): 35 | if not self.in_bounds_flat(position): 36 | raise IndexError("Position out of bounds: {}".format(position)) 37 | return self.flat_topology[position] 38 | 39 | def get_unflat(self, position): 40 | if not self.in_bounds_unflat(position): 41 | raise IndexError("Position out of bounds: {}".format(position)) 42 | return self.topology[tuple(position)] 43 | 44 | def flatten_index(self, index_tuple): 45 | return np.ravel_multi_index(index_tuple, self.shape) 46 | 47 | def unflatten_index(self, flattened_index): 48 | return np.unravel_index(flattened_index, self.shape) 49 | 50 | def flat_positions_containing(self, x): 51 | return list(np.nonzero(self.flat_topology == x)[0]) 52 | 53 | def flat_positions_not_containing(self, x): 54 | return list(np.nonzero(self.flat_topology != x)[0]) 55 | 56 | def __str__(self): 57 | return '\n'.join(''.join(row) for row in self.topology.tolist()) 58 | 59 | def __repr__(self): 60 | return 'Maze({})'.format(repr(self.topology.tolist())) 61 | 62 | 63 | def move_avoiding_walls(maze, position, action): 64 | """ 65 | Return the new position after moving, and the event that happened ('hit-wall' or 'moved'). 66 | 67 | Works with the position and action as a (row, column) array. 68 | """ 69 | # Compute new position 70 | new_position = position + action 71 | 72 | # Compute collisions with walls, including implicit walls at the ends of the world. 73 | if not maze.in_bounds_unflat(new_position) or maze.get_unflat(new_position) == '#': 74 | return position, 'hit-wall' 75 | 76 | return new_position, 'moved' 77 | 78 | 79 | 80 | class GridWorld(object): 81 | """ 82 | A simple task in a maze: get to the goal. 83 | 84 | Parameters 85 | ---------- 86 | 87 | maze : list of strings or lists 88 | maze topology (see below) 89 | 90 | rewards: dict of string to number. default: {'*': 10}. 91 | Rewards obtained by being in a maze grid with the specified contents, 92 | or experiencing the specified event (either 'hit-wall' or 'moved'). The 93 | contributions of content reward and event reward are summed. For 94 | example, you might specify a cost for moving by passing 95 | rewards={'*': 10, 'moved': -1}. 96 | 97 | terminal_markers: sequence of chars, default '*' 98 | A grid cell containing any of these markers will be considered a 99 | "terminal" state. 100 | 101 | action_error_prob: float 102 | With this probability, the requested action is ignored and a random 103 | action is chosen instead. 104 | 105 | random_state: None, int, or RandomState object 106 | For repeatable experiments, you can pass a random state here. 
See 107 | http://scikit-learn.org/stable/modules/generated/sklearn.utils.check_random_state.html 108 | 109 | Notes 110 | ----- 111 | 112 | Maze topology is expressed textually. Key: 113 | '#': wall 114 | '.': open (really, anything that's not '#') 115 | '*': goal 116 | 'o': origin 117 | """ 118 | 119 | def __init__(self, maze, rewards={'*': 10}, terminal_markers='*', action_error_prob=0, random_state=None, directions="NSEW"): 120 | 121 | self.maze = Maze(maze) if not isinstance(maze, Maze) else maze 122 | self.rewards = rewards 123 | self.terminal_markers = terminal_markers 124 | self.action_error_prob = action_error_prob 125 | self.random_state = check_random_state(random_state) 126 | 127 | self.actions = [maze_actions[direction] for direction in directions] 128 | self.num_actions = len(self.actions) 129 | self.state = None 130 | self.reset() 131 | self.num_states = self.maze.shape[0] * self.maze.shape[1] 132 | 133 | def __repr__(self): 134 | return 'GridWorld(maze={maze!r}, rewards={rewards}, terminal_markers={terminal_markers}, action_error_prob={action_error_prob})'.format(**self.__dict__) 135 | 136 | def reset(self): 137 | """ 138 | Reset the position to a starting position (an 'o'), chosen at random. 139 | """ 140 | options = self.maze.flat_positions_containing('o') 141 | self.state = options[self.random_state.choice(len(options))] 142 | 143 | def is_terminal(self, state): 144 | """Check if the given state is a terminal state.""" 145 | return self.maze.get_flat(state) in self.terminal_markers 146 | 147 | def observe(self): 148 | """ 149 | Return the current state as an integer. 150 | 151 | The state is the index into the flattened maze. 152 | """ 153 | return self.state 154 | 155 | def perform_action(self, action_idx): 156 | """Perform an action (specified by index), yielding a new state and reward.""" 157 | # In the absorbing end state, nothing does anything. 158 | if self.is_terminal(self.state): 159 | return self.observe(), 0 160 | 161 | if self.action_error_prob and self.random_state.rand() < self.action_error_prob: 162 | action_idx = self.random_state.choice(self.num_actions) 163 | action = self.actions[action_idx] 164 | new_state_tuple, result = move_avoiding_walls(self.maze, self.maze.unflatten_index(self.state), action) 165 | self.state = self.maze.flatten_index(new_state_tuple) 166 | 167 | reward = self.rewards.get(self.maze.get_flat(self.state), 0) + self.rewards.get(result, 0) 168 | return self.observe(), reward 169 | 170 | def as_mdp(self): 171 | transition_probabilities = np.zeros((self.num_states, self.num_actions, self.num_states)) 172 | rewards = np.zeros((self.num_states, self.num_actions, self.num_states)) 173 | action_rewards = np.zeros((self.num_states, self.num_actions)) 174 | destination_rewards = np.zeros(self.num_states) 175 | 176 | for state in range(self.num_states): 177 | destination_rewards[state] = self.rewards.get(self.maze.get_flat(state), 0) 178 | 179 | is_terminal_state = np.zeros(self.num_states, dtype=np.bool) 180 | 181 | for state in range(self.num_states): 182 | if self.is_terminal(state): 183 | is_terminal_state[state] = True 184 | transition_probabilities[state, :, state] = 1. 185 | else: 186 | for action in range(self.num_actions): 187 | new_state_tuple, result = move_avoiding_walls(self.maze, self.maze.unflatten_index(state), self.actions[action]) 188 | new_state = self.maze.flatten_index(new_state_tuple) 189 | transition_probabilities[state, action, new_state] = 1. 
190 | action_rewards[state, action] = self.rewards.get(result, 0) 191 | 192 | # Now account for action noise. 193 | transitions_given_random_action = transition_probabilities.mean(axis=1, keepdims=True) 194 | transition_probabilities *= (1 - self.action_error_prob) 195 | transition_probabilities += self.action_error_prob * transitions_given_random_action 196 | 197 | rewards_given_random_action = action_rewards.mean(axis=1, keepdims=True) 198 | action_rewards = (1 - self.action_error_prob) * action_rewards + self.action_error_prob * rewards_given_random_action 199 | rewards = action_rewards[:, :, None] + destination_rewards[None, None, :] 200 | rewards[is_terminal_state] = 0 201 | 202 | return transition_probabilities, rewards 203 | 204 | def get_max_reward(self): 205 | transition_probabilities, rewards = self.as_mdp() 206 | return rewards.max() 207 | 208 | ### Old API, where terminal states were None. 209 | 210 | def observe_old(self): 211 | return None if self.is_terminal(self.state) else self.state 212 | 213 | def perform_action_old(self, action_idx): 214 | new_state, reward = self.perform_action(action_idx) 215 | if self.is_terminal(new_state): 216 | return None, reward 217 | else: 218 | return new_state, reward 219 | 220 | 221 | samples = { 222 | 'trivial': [ 223 | '###', 224 | '#o#', 225 | '#.#', 226 | '#*#', 227 | '###'], 228 | 229 | 'larger': [ 230 | '#########', 231 | '#..#....#', 232 | '#..#..#.#', 233 | '#..#..#.#', 234 | '#..#.##.#', 235 | '#....*#.#', 236 | '#######.#', 237 | '#o......#', 238 | '#########'] 239 | } 240 | 241 | 242 | def construct_cliff_task(width, height, goal_reward=50, move_reward=-1, cliff_reward=-100, **kw): 243 | """ 244 | Construct a 'cliff' task, a GridWorld with a "cliff" between the start and 245 | goal. Falling off the cliff gives a large negative reward and ends the 246 | episode. 247 | 248 | Any other parameters, like action_error_prob, are passed on to the 249 | GridWorld constructor. 250 | """ 251 | 252 | maze = ['.' * width] * (height - 1) # middle empty region 253 | maze.append('o' + 'X' * (width - 2) + '*') # bottom goal row 254 | 255 | rewards = { 256 | '*': goal_reward, 257 | 'moved': move_reward, 258 | 'hit-wall': move_reward, 259 | 'X': cliff_reward 260 | } 261 | 262 | return GridWorld(maze, rewards=rewards, terminal_markers='*X', **kw) 263 | -------------------------------------------------------------------------------- /bayesrl/environments/pomdpgw.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from ..utils import check_random_state 3 | 4 | 5 | # Maze state is represented as a 2-element NumPy array: (Y, X). Increasing Y is South. 6 | 7 | # Possible actions, expressed as (delta-y, delta-x). 8 | maze_actions = { 9 | 'N': np.array([-1, 0]), 10 | 'S': np.array([1, 0]), 11 | 'E': np.array([0, 1]), 12 | 'W': np.array([0, -1]), 13 | } 14 | 15 | def parse_topology(topology): 16 | return np.array([list(row) for row in topology]) 17 | 18 | 19 | class Maze(object): 20 | """ 21 | Simple wrapper around a NumPy 2D array to handle flattened indexing and staying in bounds. 
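    On top of the plain maze wrapper, this variant adds a noisy observation model
    over neighboring-wall configurations: with probability true_obs_prob the
    observation generated from the agent's actual neighboring walls is returned,
    and otherwise one of the remaining observations is returned uniformly at
    random (see obs_distribution and observation below).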
22 | """ 23 | def __init__(self, topology, true_obs_prob=.8, easy_obs_model=True): 24 | self.topology = parse_topology(topology) 25 | self.flat_topology = self.topology.ravel() 26 | self.shape = self.topology.shape 27 | self.true_obs_prob = true_obs_prob 28 | self.easy_obs_model = easy_obs_model 29 | #If the observation model is easy, the agent can observe which directions have walls 30 | #If the observation model is not easy, the agent only observes how many of its four neighbors are walls. 31 | self.num_observations = 16 if easy_obs_model else 5 32 | 33 | def in_bounds_flat(self, position): 34 | return 0 <= position < np.product(self.shape) 35 | 36 | def in_bounds_unflat(self, position): 37 | return 0 <= position[0] < self.shape[0] and 0 <= position[1] < self.shape[1] 38 | 39 | def get_flat(self, position): 40 | if not self.in_bounds_flat(position): 41 | raise IndexError("Position out of bounds: {}".format(position)) 42 | return self.flat_topology[position] 43 | 44 | def get_unflat(self, position): 45 | if not self.in_bounds_unflat(position): 46 | raise IndexError("Position out of bounds: {}".format(position)) 47 | return self.topology[tuple(position)] 48 | 49 | def flatten_index(self, index_tuple): 50 | return np.ravel_multi_index(index_tuple, self.shape) 51 | 52 | def unflatten_index(self, flattened_index): 53 | return np.unravel_index(flattened_index, self.shape) 54 | 55 | def flat_positions_containing(self, x): 56 | return list(np.nonzero(self.flat_topology == x)[0]) 57 | 58 | def flat_positions_not_containing(self, x): 59 | return list(np.nonzero(self.flat_topology != x)[0]) 60 | 61 | def get_inbound_index(self, index_tuple): 62 | x = min(max(index_tuple[0],0),self.shape[0]-1) 63 | y = min(max(index_tuple[1],0),self.shape[1]-1) 64 | return x, y 65 | 66 | def true_observation(self, index_tuple): 67 | it = index_tuple 68 | if type(it) == np.int64: 69 | it = self.unflatten_index(it) 70 | neighbors = [(it[0]+1,it[1]), 71 | (it[0]-1,it[1]), 72 | (it[0],it[1]+1), 73 | (it[0],it[1]-1)] 74 | neighbors = [n for n in neighbors if self.in_bounds_unflat(n)] 75 | if_wall = [self.get_unflat(n)=='#' for n in neighbors] 76 | if self.easy_obs_model: 77 | obs = sum(if_wall) 78 | else: 79 | obs = sum(np.array([8,4,2,1])*if_wall) 80 | return obs 81 | 82 | def obs_distribution(self, index_tuple): 83 | if type(index_tuple) == int: 84 | index_tuple = self.unflatten_index(index_tuple) 85 | other_obs_prob = (1-self.true_obs_prob)/(self.num_observations-1) 86 | obs_distribution = [other_obs_prob] * self.num_observations 87 | true_obs = self.true_observation(index_tuple) 88 | obs_distribution[true_obs] = self.true_obs_prob 89 | return obs_distribution 90 | 91 | def get_all_obs_distribution(self): 92 | return [self.obs_distribution((x,y)) for x in range(self.shape[0]) for y in range(self.shape[1])] 93 | 94 | def observation(self, index_tuple): 95 | if type(index_tuple) == int: 96 | index_tuple = self.unflatten_index(index_tuple) 97 | obs_distribution = self.obs_distribution(index_tuple) 98 | obs = np.random.multinomial(1, obs_distribution) 99 | return obs.tolist().index(1) 100 | 101 | def __str__(self): 102 | return '\n'.join(''.join(row) for row in self.topology.tolist()) 103 | 104 | def __repr__(self): 105 | return 'Maze({})'.format(repr(self.topology.tolist())) 106 | 107 | 108 | def move_avoiding_walls(maze, position, action): 109 | """ 110 | Return the new position after moving, and the event that happened ('hit-wall' or 'moved'). 111 | 112 | Works with the position and action as a (row, column) array. 
113 | """ 114 | # Compute new position 115 | new_position = position + action 116 | 117 | # Compute collisions with walls, including implicit walls at the ends of the world. 118 | if not maze.in_bounds_unflat(new_position) or maze.get_unflat(new_position) == '#': 119 | return position, 'hit-wall' 120 | 121 | return new_position, 'moved' 122 | 123 | 124 | 125 | class GridWorld(object): 126 | """ 127 | A simple task in a maze: get to the goal. 128 | 129 | Parameters 130 | ---------- 131 | 132 | maze : list of strings or lists 133 | maze topology (see below) 134 | 135 | rewards: dict of string to number. default: {'*': 10}. 136 | Rewards obtained by being in a maze grid with the specified contents, 137 | or experiencing the specified event (either 'hit-wall' or 'moved'). The 138 | contributions of content reward and event reward are summed. For 139 | example, you might specify a cost for moving by passing 140 | rewards={'*': 10, 'moved': -1}. 141 | 142 | terminal_markers: sequence of chars, default '*' 143 | A grid cell containing any of these markers will be considered a 144 | "terminal" state. 145 | 146 | action_error_prob: float 147 | With this probability, the requested action is ignored and a random 148 | action is chosen instead. 149 | 150 | random_state: None, int, or RandomState object 151 | For repeatable experiments, you can pass a random state here. See 152 | http://scikit-learn.org/stable/modules/generated/sklearn.utils.check_random_state.html 153 | 154 | Notes 155 | ----- 156 | 157 | Maze topology is expressed textually. Key: 158 | '#': wall 159 | '.': open (really, anything that's not '#') 160 | '*': goal 161 | 'o': origin 162 | """ 163 | 164 | def __init__(self, maze, rewards={'*': 10}, terminal_markers='*', 165 | action_error_prob=0, random_state=None, directions="NSEW", pomdp=False): 166 | 167 | self.maze = Maze(maze) if not isinstance(maze, Maze) else maze 168 | self.rewards = rewards 169 | self.terminal_markers = terminal_markers 170 | self.action_error_prob = action_error_prob 171 | self.random_state = check_random_state(random_state) 172 | 173 | self.actions = [maze_actions[direction] for direction in directions] 174 | self.num_actions = len(self.actions) 175 | self.state = None 176 | self.reset() 177 | self.num_states = self.maze.shape[0] * self.maze.shape[1] 178 | self.pomdp = pomdp 179 | 180 | def __repr__(self): 181 | return 'GridWorld(maze={maze!r}, rewards={rewards}, terminal_markers={terminal_markers}, action_error_prob={action_error_prob})'.format(**self.__dict__) 182 | 183 | def reset(self): 184 | """ 185 | Reset the position to a starting position (an 'o'), chosen at random. 186 | """ 187 | options = self.maze.flat_positions_containing('o') 188 | self.state = options[self.random_state.choice(len(options))] 189 | 190 | def is_terminal(self, state): 191 | """Check if the given state is a terminal state.""" 192 | return self.maze.get_flat(state) in self.terminal_markers 193 | 194 | def observe(self): 195 | """ 196 | Return the current state as an integer. 197 | 198 | The state is the index into the flattened maze. 199 | """ 200 | o = self.maze.observation(self.state) if self.pomdp else self.state 201 | return o 202 | 203 | def perform_action(self, action_idx): 204 | """Perform an action (specified by index), yielding a new state and reward.""" 205 | # In the absorbing end state, nothing does anything. 
206 | if self.is_terminal(self.state): 207 | return self.observe(), 0 208 | 209 | if self.action_error_prob and self.random_state.rand() < self.action_error_prob: 210 | action_idx = self.random_state.choice(self.num_actions) 211 | action = self.actions[action_idx] 212 | new_state_tuple, result = move_avoiding_walls(self.maze, self.maze.unflatten_index(self.state), action) 213 | self.state = self.maze.flatten_index(new_state_tuple) 214 | 215 | reward = self.rewards.get(self.maze.get_flat(self.state), 0) + self.rewards.get(result, 0) 216 | return self.observe(), reward 217 | 218 | def as_mdp(self): 219 | transition_probabilities = np.zeros((self.num_states, self.num_actions, self.num_states)) 220 | rewards = np.zeros((self.num_states, self.num_actions, self.num_states)) 221 | action_rewards = np.zeros((self.num_states, self.num_actions)) 222 | destination_rewards = np.zeros(self.num_states) 223 | 224 | for state in range(self.num_states): 225 | destination_rewards[state] = self.rewards.get(self.maze.get_flat(state), 0) 226 | 227 | is_terminal_state = np.zeros(self.num_states, dtype=np.bool) 228 | 229 | for state in range(self.num_states): 230 | if self.is_terminal(state): 231 | is_terminal_state[state] = True 232 | transition_probabilities[state, :, state] = 1. 233 | else: 234 | for action in range(self.num_actions): 235 | new_state_tuple, result = move_avoiding_walls(self.maze, self.maze.unflatten_index(state), self.actions[action]) 236 | new_state = self.maze.flatten_index(new_state_tuple) 237 | transition_probabilities[state, action, new_state] = 1. 238 | action_rewards[state, action] = self.rewards.get(result, 0) 239 | 240 | # Now account for action noise. 241 | transitions_given_random_action = transition_probabilities.mean(axis=1, keepdims=True) 242 | transition_probabilities *= (1 - self.action_error_prob) 243 | transition_probabilities += self.action_error_prob * transitions_given_random_action 244 | 245 | rewards_given_random_action = action_rewards.mean(axis=1, keepdims=True) 246 | action_rewards = (1 - self.action_error_prob) * action_rewards + self.action_error_prob * rewards_given_random_action 247 | rewards = action_rewards[:, :, None] + destination_rewards[None, None, :] 248 | rewards[is_terminal_state] = 0 249 | 250 | return transition_probabilities, rewards 251 | 252 | def get_max_reward(self): 253 | transition_probabilities, rewards = self.as_mdp() 254 | return rewards.max() 255 | 256 | ### Old API, where terminal states were None. 257 | 258 | def observe_old(self): 259 | return None if self.is_terminal(self.state) else self.state 260 | 261 | def perform_action_old(self, action_idx): 262 | new_state, reward = self.perform_action(action_idx) 263 | if self.is_terminal(new_state): 264 | return None, reward 265 | else: 266 | return new_state, reward 267 | 268 | 269 | samples = { 270 | 'trivial': [ 271 | '###', 272 | '#o#', 273 | '#.#', 274 | '#*#', 275 | '###'], 276 | 277 | 'larger': [ 278 | '#########', 279 | '#..#....#', 280 | '#..#..#.#', 281 | '#..#..#.#', 282 | '#..#.##.#', 283 | '#....*#.#', 284 | '#######.#', 285 | '#o......#', 286 | '#########'] 287 | } 288 | 289 | 290 | 291 | 292 | def construct_cliff_task(width, height, goal_reward=50, move_reward=-1, cliff_reward=-100, **kw): 293 | """ 294 | Construct a 'cliff' task, a GridWorld with a "cliff" between the start and 295 | goal. Falling off the cliff gives a large negative reward and ends the 296 | episode. 297 | 298 | Any other parameters, like action_error_prob, are passed on to the 299 | GridWorld constructor. 
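    For example (illustrative arguments, not a canonical configuration),
    construct_cliff_task(width=6, height=4, action_error_prob=0.1) builds a maze
    whose bottom row is 'oXXXX*': the agent starts at 'o', stepping onto any 'X'
    ends the episode with cliff_reward, and '*' is the goal.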
300 | """ 301 | 302 | maze = ['.' * width] * (height - 1) # middle empty region 303 | maze.append('o' + 'X' * (width - 2) + '*') # bottom goal row 304 | 305 | rewards = { 306 | '*': goal_reward, 307 | 'moved': move_reward, 308 | 'hit-wall': move_reward, 309 | 'X': cliff_reward 310 | } 311 | 312 | return GridWorld(maze, rewards=rewards, terminal_markers='*X', **kw) 313 | -------------------------------------------------------------------------------- /bayesrl/plot.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | 4 | class Plot(object): 5 | """ 6 | Wrapper class for collecting all trials to use in visualization methods. 7 | 8 | Parameters 9 | ---------- 10 | dict_trial: dictionary of lists, where each object in the list is a Trial 11 | Key is the name of the learner, and value is a list of trials 12 | for that learner using different parameter settings. 13 | """ 14 | def __init__(self, dict_trial): 15 | self.dict_trial = dict_trial 16 | 17 | self.colors = ['r', 'b', 'g', 'm', 'c', 'y'] 18 | self.line_type = ['-', '--', '-.'] 19 | 20 | def cum_rewards_by_iteration(self): 21 | """ 22 | Plot B. 23 | y-axis: Sum of all rewards. 24 | x-axis: Iteration of trial(s). 25 | """ 26 | self.__rewards_by_idx("cum", "iters") 27 | 28 | def rewards_by_episode(self): 29 | """ 30 | Plot C. 31 | y-axis: Immediate reward. 32 | x-axis: Episode of trial(s). 33 | """ 34 | self.__rewards_by_idx("imm", "epi") 35 | 36 | def cum_rewards_by_prob_start(self): 37 | """ 38 | Plot F. 39 | y-axis: Sum of all the rewards. 40 | x-axis: Pr(return to start). 41 | """ 42 | self.__rewards_by_prob_start("cum") 43 | 44 | def end_rewards_by_prob_start(self): 45 | """ 46 | Plot G. 47 | y-axis: Sum of all the rewards in the last 100 iterations. 48 | x-axis: Pr(return to start). 49 | """ 50 | self.__rewards_by_prob_start("end") 51 | 52 | def cum_rewards_by_act_err_prob(self): 53 | """ 54 | Plot I. 55 | y-axis: Sum of all the rewards. 56 | x-axis: Action-error probability. 57 | """ 58 | self.__rewards_by_act_err_prob("cum") 59 | 60 | def end_rewards_by_act_err_prob(self): 61 | """ 62 | Plot J. 63 | y-axis: Sum of all the rewards in the last 100 iterations. 64 | x-axis: Action-error probability. 
65 | """ 66 | self.__rewards_by_act_err_prob("end") 67 | 68 | def __rewards_by_idx(self, reward_type, idx_type): 69 | """ 70 | reward_type: "cum" or "imm" 71 | idx_type: "iters" or "epi" 72 | """ 73 | i = 0 74 | for key,value in self.dict_trial.items(): 75 | color = self.colors[i] 76 | j = 0 77 | for trial in value: 78 | line_type = self.line_type[j] 79 | if reward_type == "cum" and idx_type == "iters": 80 | array = trial.array_rewards_by_iteration.cumsum(axis=1) 81 | elif reward_type == "imm" and idx_type == "epi": 82 | array = trial.array_rewards_by_episode 83 | else: 84 | raise Exception("Arguments not specified correctly.") 85 | x = np.arange(array.shape[1]) 86 | mean = array.mean(axis=0) 87 | if j == 0: 88 | plt.plot(x, mean, color+line_type, label=key) 89 | else: 90 | plt.plot(x, mean, color+line_type) 91 | j += 1 92 | i += 1 93 | if reward_type == "cum" and idx_type == "iters": 94 | plt.title("Cumulative reward by iteration") 95 | plt.ylabel("Cumulative reward") 96 | plt.xlabel("Iteration") 97 | plt.legend(loc=2) 98 | elif reward_type == "imm" and idx_type == "epi": 99 | plt.title("Immediate reward by episode") 100 | plt.ylabel("Immediate reward") 101 | plt.xlabel("Episode") 102 | plt.legend(loc=4) 103 | plt.show() 104 | 105 | def __rewards_by_prob_start(self, reward_type): 106 | """ 107 | reward_type: "cum" or "end" 108 | """ 109 | i = 0 110 | for key,value in self.dict_trial.items(): 111 | color = self.colors[i] 112 | x = np.arange(0, 1, 0.1) 113 | means = np.zeros(len(value)) 114 | j = 0 115 | for trial in value: 116 | if reward_type == "cum": 117 | array = trial.array_rewards_by_iteration.sum(axis=1) 118 | elif reward_type == "end": 119 | array = trial.array_rewards_by_iteration[:,-100:].sum(axis=1) 120 | else: 121 | raise Exception("Arguments not specified correctly.") 122 | means[j] = array.mean(axis=0) 123 | j += 1 124 | plt.plot(x, means, color, label=key) 125 | i += 1 126 | if reward_type == "cum": 127 | plt.title("Cumulative reward by prob(return_start)") 128 | plt.ylabel("Cumulative reward") 129 | elif reward_type == "end": 130 | plt.title("End reward by prob(return_start)") 131 | plt.ylabel("End reward (sum of last 100 iterations)") 132 | plt.xlabel("Prob(return_start)") 133 | plt.legend() 134 | plt.show() 135 | 136 | def __rewards_by_act_err_prob(self, reward_type): 137 | """ 138 | reward_type: "cum" or "end" 139 | """ 140 | i = 0 141 | for key,value in self.dict_trial.items(): 142 | color = self.colors[i] 143 | j = 0 144 | for trial_list in value: 145 | line_width = np.linspace(0.5, 3, endpoint=True, num=len(value))[j] 146 | x = np.arange(0, 0.55, 0.05) 147 | means = np.zeros(len(trial_list)) 148 | for k,trial in enumerate(trial_list): 149 | if reward_type == "cum": 150 | array = trial.array_rewards_by_iteration.sum(axis=1) 151 | elif reward_type == "end": 152 | array = trial.array_rewards_by_iteration[:,-100:].sum(axis=1) 153 | else: 154 | raise Exception("Arguments not specified correctly.") 155 | means[k] = array.mean(axis=0) 156 | if line_width == 1: 157 | plt.plot(x, means, color, linewidth=line_width, label=key) 158 | else: 159 | plt.plot(x, means, color, linewidth=line_width) 160 | j += 1 161 | i += 1 162 | if reward_type == "cum": 163 | plt.title("Cumulative reward by action-error probability (thicker=larger epsilon)") 164 | plt.ylabel("Cumulative reward") 165 | elif reward_type == "end": 166 | plt.title("End reward by action-error probability (thicker=larger epsilon)") 167 | plt.ylabel("End reward") 168 | plt.xlabel("Action-error probability") 169 | 
plt.legend() 170 | plt.show() 171 | -------------------------------------------------------------------------------- /bayesrl/trial.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Trial(object): 4 | """ 5 | Class for running trial(s) for a given agent and task. 6 | 7 | Parameters 8 | ---------- 9 | agent: Agent 10 | task: Task 11 | MIN_ITERATIONS: int 12 | The minimum number of iterations for a trial. 13 | MIN_EPISODES: int 14 | The minimum number of episodes for a trial. 15 | MAX_EPISODE_ITERATION: int 16 | The maximum number of iterations for each episode. 17 | """ 18 | def __init__(self, agent, task, MIN_ITERATIONS=5000, MIN_EPISODES=100, MAX_EPISODE_ITERATION=1000): 19 | self.agent = agent 20 | self.task = task 21 | self.MIN_ITERATIONS = MIN_ITERATIONS 22 | self.MIN_EPISODES = MIN_EPISODES 23 | self.MAX_EPISODE_ITERATION = MAX_EPISODE_ITERATION 24 | 25 | self.array_rewards_by_episode = None 26 | self.array_rewards_by_iteration = None 27 | 28 | def run(self): 29 | iteration = episode = 0 30 | rewards_by_iteration = np.zeros(self.MIN_ITERATIONS) 31 | rewards_by_episode = np.zeros(self.MIN_EPISODES) 32 | self.agent.reset() 33 | 34 | while iteration < self.MIN_ITERATIONS or episode < self.MIN_EPISODES: 35 | print "Episode:",episode 36 | # Initialize the episode. 37 | self.task.reset() 38 | #if self.task.pomdp: 39 | # self.agent.reset_belief() 40 | state = self.task.observe() 41 | reward = None 42 | cumulative_reward = 0 43 | episode_iteration = 0 44 | 45 | while episode_iteration < self.MAX_EPISODE_ITERATION: 46 | # Tell the agent what happened and ask for a next action. 47 | action = self.agent.interact(reward, state, self.task.is_terminal(state), iteration) 48 | 49 | if self.task.is_terminal(state): 50 | # End of episode (happens after interaction so agent can learn from final reward). 51 | break 52 | 53 | # Take action A, observe R, S'. 54 | state, reward = self.task.perform_action(action) 55 | 56 | # Log rewards. 57 | if iteration < self.MIN_ITERATIONS: 58 | rewards_by_iteration[iteration] = reward 59 | cumulative_reward += reward 60 | 61 | iteration += 1 62 | episode_iteration += 1 63 | 64 | if episode < self.MIN_EPISODES: 65 | rewards_by_episode[episode] = cumulative_reward 66 | episode += 1 67 | 68 | return rewards_by_iteration, rewards_by_episode 69 | 70 | def run_multiple(self, num_trials): 71 | self.array_rewards_by_episode = np.zeros((num_trials, self.MIN_EPISODES)) 72 | self.array_rewards_by_iteration = np.zeros((num_trials, self.MIN_ITERATIONS)) 73 | for i in xrange(num_trials): 74 | self.array_rewards_by_iteration[i], self.array_rewards_by_episode[i] = self.run() 75 | -------------------------------------------------------------------------------- /bayesrl/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numbers 3 | 4 | def check_random_state(seed): 5 | # From https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/utils/validation.py 6 | """Turn seed into a np.random.RandomState instance 7 | 8 | If seed is None, return the RandomState singleton used by np.random. 9 | If seed is an int, return a new RandomState instance seeded with seed. 10 | If seed is already a RandomState instance, return it. 11 | Otherwise raise ValueError. 
12 | """ 13 | if seed is None or seed is np.random: 14 | return np.random.mtrand._rand 15 | if isinstance(seed, (numbers.Integral, np.integer)): 16 | return np.random.RandomState(seed) 17 | if isinstance(seed, np.random.RandomState): 18 | return seed 19 | raise ValueError('%r cannot be used to seed a numpy.random.RandomState' 20 | ' instance' % seed) 21 | -------------------------------------------------------------------------------- /benchmarks/thompson_gridworld.py: -------------------------------------------------------------------------------- 1 | """ 2 | Solves grid world using three different parameter settings for Thompson 3 | sampling. This empirically shows the convergence of Thompson sampling regardless 4 | of the prior misspecification. 5 | """ 6 | 7 | from bayesrl.environments import GridWorld 8 | from bayesrl.agents.thompsonsampagent import ThompsonSampAgent 9 | from bayesrl.trial import Trial 10 | from bayesrl.plot import Plot 11 | 12 | # Define environment. 13 | task = GridWorld( 14 | GridWorld.samples['larger'], 15 | action_error_prob=.1, 16 | rewards={'*': 50, 'moved': -1, 'hit-wall': -1}) 17 | 18 | num_trials = 1 19 | 20 | ################################################################################ 21 | # Thompson Sampling 22 | ################################################################################ 23 | # Dirichlet params = 1, Reward params = 50 24 | agent = ThompsonSampAgent( 25 | num_states=task.num_states, num_actions=task.num_actions, 26 | discount_factor=0.95, T=50, dirichlet_param=1, reward_param=50) 27 | trial_thompson1 = Trial(agent, task) 28 | trial_thompson1.run_multiple(num_trials) 29 | 30 | # Dirichlet params = 1, Reward params = 10 31 | agent.dirichlet_param = 1 32 | agent.reward_param = 10 33 | trial_thompson2 = Trial(agent, task) 34 | trial_thompson2.run_multiple(num_trials) 35 | 36 | # Dirichlet params = 10, Reward params = 50 37 | agent.dirichlet_param = 10 38 | agent.reward_param = 50 39 | trial_thompson3 = Trial(agent, task) 40 | trial_thompson3.run_multiple(num_trials) 41 | 42 | ################################################################################ 43 | # Plots! 44 | ################################################################################ 45 | plot = Plot({"Thompson sampling": [trial_thompson1, trial_thompson2, trial_thompson3] 46 | }) 47 | # Plot cumulative rewards by iteration 48 | plot.cum_rewards_by_iteration() 49 | # Plot rewards by episode 50 | plot.rewards_by_episode() 51 | -------------------------------------------------------------------------------- /benchmarks/thompson_gridworld_pomdp.py: -------------------------------------------------------------------------------- 1 | """ 2 | Solves grid world using three different parameter settings for Thompson 3 | sampling. This empirically shows the convergence of Thompson sampling regardless 4 | of the prior misspecification. 5 | """ 6 | 7 | from bayesrl.environments import pomdpgw 8 | from bayesrl.agents.thompsonsampagent_pomdp import ThompsonSampAgentPOMDP 9 | from bayesrl.trial import Trial 10 | from bayesrl.plot import Plot 11 | 12 | # Define environment. 
13 | task = pomdpgw.GridWorld( 14 | pomdpgw.GridWorld.samples['larger'], 15 | action_error_prob=.1, 16 | rewards={'*': 50, 'moved': -1, 'hit-wall': -1}, 17 | pomdp=True) 18 | 19 | num_trials = 1 20 | 21 | ################################################################################ 22 | # Thompson Sampling 23 | ################################################################################ 24 | # Dirichlet params = 1, Reward params = 50 25 | agent = ThompsonSampAgentPOMDP(observation_model=task.maze.get_all_obs_distribution(), 26 | num_states=task.num_states, num_actions=task.num_actions, 27 | discount_factor=0.95, T=50, dirichlet_param=1, reward_param=50) 28 | trial_thompson1 = Trial(agent, task, MIN_EPISODES=100) 29 | trial_thompson1.run_multiple(num_trials) 30 | 31 | # # Dirichlet params = 1, Reward params = 10 32 | # agent.dirichlet_param = 1 33 | # agent.reward_param = 10 34 | # trial_thompson2 = Trial(agent, task) 35 | # trial_thompson2.run_multiple(num_trials) 36 | 37 | # # Dirichlet params = 10, Reward params = 50 38 | # agent.dirichlet_param = 10 39 | # agent.reward_param = 50 40 | # trial_thompson3 = Trial(agent, task) 41 | # trial_thompson3.run_multiple(num_trials) 42 | 43 | ################################################################################ 44 | # Plots! 45 | ################################################################################ 46 | plot = Plot({"Thompson sampling": [trial_thompson1]#, trial_thompson2, trial_thompson3] 47 | }) 48 | # Plot cumulative rewards by iteration 49 | plot.cum_rewards_by_iteration() 50 | # Plot rewards by episode 51 | plot.rewards_by_episode() 52 | -------------------------------------------------------------------------------- /reports/6_834j_ps03.bib: -------------------------------------------------------------------------------- 1 | @inproceedings{strens2000bayesian, 2 | author = {Malcolm Strens}, 3 | title = {A Bayesian framework for reinforcement learning}, 4 | booktitle = {Proceedings of the 17th International Conference on Machine Learning (ICML)}, 5 | year = {2000} 6 | } 7 | 8 | @article{astrom1965optimal, 9 | author = {Karl Johan Astr\"{o}m}, 10 | title = {Optimal control of Markov decision processes with incomplete state 11 | estimation}, 12 | journal = {Journal of Mathematical Analysis and Applications}, 13 | year = {1965}, 14 | volume = {10:174–205} 15 | } 16 | 17 | @unpublished{Braziunas, 18 | author = {Darius Braziunas}, 19 | title = {POMDP solution methods}, 20 | year = {2003}, 21 | school = {University of Toronto} 22 | } 23 | 24 | @book{sutton1998reinforcement, 25 | author = {Richard S. Sutton and Andrew G. 
Barto}, 26 | publisher = {MIT Press}, 27 | title = {Reinforcement learning: An introduction}, 28 | address = {Cambridge, MA}, 29 | year = 1998 30 | } 31 | -------------------------------------------------------------------------------- /reports/6_834j_ps03.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/reports/6_834j_ps03.pdf -------------------------------------------------------------------------------- /reports/6_834j_ps03.tex: --------------------------------------------------------------------------------
1 | %##############################################################################
2 | % Preamble
3 | %##############################################################################
4 |
5 | \documentclass{pset}
6 | \name{Dustin Tran, Xiaomin Wang, Rodrigo Gomes}
7 | \email{\{trandv,xiaominw, rgomes\}@mit.edu}
8 |
9 | \course{6.834J/16.412J, S15}
10 | \instructor{Professor Brian Williams}
11 | \assignment{Problem Set \#3}
12 | \duedate{April 17, 2015}
13 |
14 | \begin{document}
15 |
16 | %##############################################################################
17 | % Begin Document
18 | %##############################################################################
19 |
20 | \section{Introduction}
21 | Many real-life scenarios require making a sequence of decisions
22 | under uncertainty. Quite often two unknowns exist in these scenarios: the
23 | result of taking an action in a given state, and the true state of the agent at
24 | any point in time. This is similar to a hidden Markov model (HMM), except
25 | that one must make a sequence of actions rather than a single action; the
26 | problem therefore falls under reinforcement learning rather than supervised learning. The
27 | classic example is a robot which tries to navigate a discrete environment using
28 | GPS: it performs actions that lead to different states with various
29 | probabilities. The transition probabilities are unknown, and because the GPS is
30 | noisy, there is also uncertainty about what the
31 | underlying state is.
32 |
33 | One can formalize the problem, under Markovian assumptions, as a partially
34 | observable Markov decision process (POMDP). In this project we provide a
35 | software library for implementing and solving such problems, which is modular and
36 | flexible enough for further development and user-specified agents/environments.
37 | We encode a variety of basic tasks and solve them using a combination of value
38 | iteration and several standard (PO)MDP solvers---one is a variant of Thompson
39 | sampling \cite{strens2000bayesian}, a Bayesian approach that maintains a
40 | Dirichlet-multinomial posterior over each state-action pair.
41 |
42 | The repository can be found at https://github.com/dustinvtran/bayesrl.
43 | To install from pip, run
44 | \begin{lstlisting}
45 | pip install -e "git+https://github.com/dustinvtran/bayesrl.git#egg=bayesrl"
46 | \end{lstlisting}
47 |
48 | \section{Technical Background}
49 |
50 | \subsection{POMDP}
51 | A POMDP is a generalization of a Markov decision process (MDP). In an MDP, for each
52 | possible state of the process, a decision has to be made regarding which action
53 | should be executed in that state. The chosen action and the given state together determine the
54 | costs (or rewards) incurred.
The goal is to learn the optimal \emph{policy},
54 | which is a
55 | choice of actions that in expectation leads to the optimal reward over a
56 | pre-defined number of steps (in the finite-horizon case) or over an infinite horizon. In a
57 | POMDP, the agent does not fully observe the underlying state but only a
58 | (non-sufficient) statistic of it. Thus a POMDP agent also
59 | maintains a probability distribution over the set of possible states, based on the
60 | observations, the observation probabilities, and the underlying MDP.
61 |
62 | More formally, a POMDP is a collection $(S,A,T,R,\Omega,O,\gamma)$, where
63 |
64 | \begin{itemize}
65 | \item $S$ is a set of states
66 | \item $A$ is a set of actions
67 | \item $T$ is a set of transition probabilities between states. If the agent is currently in state $s
68 | \in S$, and it takes action $a \in A$, the agent will transition to a new state
69 | $s'$ with probability
70 | $T(s' \mid s,a)$.
71 | \item $R: S \times A \rightarrow \mathbb{R}$ is a reward function that assigns a numeric reward (or
72 | cost if the value is negative) for each state and action.
73 | \item $\Omega$ is a set of observations
74 | \item $O$ is a set of conditional observation probabilities. If the agent is now in state $s$,
75 | it receives an observation $o$ according to $O(o \mid s)$.
76 | \item $\gamma \in [0,1]$ is a discount factor that determines how much rewards are discounted over time
77 | \end{itemize}
78 |
79 | We use our testing environment \texttt{GridWorld} as an example.
80 | \texttt{GridWorld} represents a 2D maze where the agent can be in discrete locations.
81 | Certain locations are impossible for the agent to move to, representing ``walls''. Each
82 | action moves the agent between two adjacent grid locations, or, with some
83 | user-specified probability,
84 | fails and causes the agent to take a uniformly random action
85 | instead. The goal is to reach
86 | the goal location in the least amount of time: each move yields a reward of -1, hitting a
87 | wall yields -1, and reaching the goal position yields +50.
88 |
89 | For \texttt{GridWorld}, $S$ consists of all the possible (row, column) location
90 | tuples inside the maze. $A$ contains the four possible actions the agent can
91 | take: up, down, left, right. $T$ describes a transition model in which the
92 | agent cannot move through walls. We define $R$ as above. To make
93 | this a partially observable problem, we implemented two observation models.
94 | \begin{itemize}
95 | \item
96 | The
97 | easier model gives the agent more information about the environment. The agent
98 | knows which of its four neighbors are walls, giving rise to 16 total
99 | observations.
100 | \item
101 | In the second observation model, the agent can only observe how
102 | many of its four neighbors are walls, giving rise to 5 possible observations.
103 | $O$ is such that $Pr(true\_observation\mid state)=true\_observation\_prob$,
104 | where $true\_observation\_prob$ can be adjusted, and
105 | \begin{equation}
106 | Pr(other\_observation\mid
107 | state)=\frac{1-true\_observation\_prob}{total\_num\_of\_observations-1}
108 | \end{equation}
109 | \end{itemize}
110 | Since
111 | we cannot work with the underlying states directly in a POMDP, we also need
112 | $B$, the set of belief states, i.e., the probability the agent assigns to each of the
113 | possible states.
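As a concrete illustration of the second observation model, the sketch below builds the distribution $O(\cdot \mid s)$ for a single state from its number of wall neighbors. The function name and default values here are illustrative only, not the exact API of the \texttt{pomdpgw} environment.
\begin{lstlisting}
import numpy as np

def obs_distribution(num_wall_neighbors, true_observation_prob=0.9,
                     num_observations=5):
    # The true observation keeps probability true_observation_prob;
    # the remaining mass is spread uniformly over the other
    # observations, as in the equation above.
    probs = np.full(num_observations,
                    (1. - true_observation_prob) / (num_observations - 1))
    probs[num_wall_neighbors] = true_observation_prob
    return probs

# Example: a state with two wall neighbors.
print(obs_distribution(2))  # [0.025, 0.025, 0.9, 0.025, 0.025]
\end{lstlisting}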
115 |
116 | \subsection{Thompson Sampling}
117 | Thompson sampling is used to learn the transition distribution $T$: one
118 | first specifies a prior according to one's knowledge about the transitions
119 | before trying any actions. After a fixed number of time steps, the transition
120 | probabilities are then recalculated using a posterior update following a
121 | Dirichlet-multinomial distribution. In an MDP, since the agent observes the states
122 | directly, the posterior transition probabilities are updated directly using the
123 | transition counts. In a POMDP, the posterior probabilities are updated iteratively,
124 | following an update of the belief-state transition probabilities.
125 |
126 | Note that we have also implemented a variety of other standard MDP solvers for
127 | benchmarking: Q-Learning, SARSA, and R-MAX.
128 |
129 | \subsection{Value Iteration}
130 | Given estimated transition probabilities, we then solve for the underlying policy with value iteration. The value $V$ is the expected total reward
131 | under a policy $\pi$, where a policy decides which action to take given the
132 | belief state: $a = \pi(b)$. The expected reward for policy $\pi$ starting from
133 | belief $b_0$ is defined as
134 | \[ V^{\pi}(b_{0})=\sum\limits_{t=0}^\infty \gamma^{t}r(b_t,a_t) \]
135 | where $r(b_t, a_t) = \sum\limits_{s \in S} b_t(s)R(s,a_t)$.
136 | The optimal policy maximizes the long-term reward
137 | \[ \pi^* = \underset{\pi}{\text{argmax}}\, V^{\pi}(b_0) \]
138 | At each time step, we update the belief states based on the observation, and
139 | then update the values based on the updated belief states. The action that gives
140 | the largest expected reward over the belief states is selected for the next time
141 | step. The values gradually improve until convergence.
142 | By improving the values, the policy is implicitly improved.
143 |
144 | \section{Implementation}
145 |
146 | \subsection{Agent Environment Paradigm}
147 | For MDPs, we follow the paradigm set forth in Sutton and Barto (Figure 3.1,
148 | \cite{sutton1998reinforcement}).
149 | \begin{figure}[ht]
150 | \begin{center}
151 | \centerline{\includegraphics[width=\textwidth]{img/agent_environment.png}}
152 | \end{center}
153 | \end{figure}
154 | It suggests that RL agents need only output an action given the previous history and the current state and reward. Overall, by following this paradigm in the software design, we make the learning process explicit and intuitive.
155 |
156 | \subsubsection{Agent}
157 | We implement a base class \texttt{Agent}, which is a collection of objects and functions to be used by all other agents. Agents differ primarily in their \texttt{interact()} function, which determines the next action to perform given a state and reward from the environment.
158 |
159 | The model-based algorithms R-MAX and Thompson sampling inherit from \texttt{ModelBasedAgent}, a class that itself inherits from \texttt{Agent}; \texttt{ModelBasedAgent} adds subroutines specific to model-based approaches such as value iteration.
160 |
161 | To reduce redundant code further, we could also have used an
162 | additional class that inherits from \texttt{Agent} for temporal difference agents; this
163 | would be used by both \texttt{SARSAAgent} and \texttt{QLearningAgent}, as they
164 | differ only in their \texttt{value\_table} assignment. However, in our opinion the gain from such an
165 | abstraction is not worth the loss of readability.
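To make the agent--environment paradigm concrete, the following schematic loop shows how an agent and an environment interact during a run. The method names mirror \texttt{interact()} above and \texttt{perform\_action()} below, but the signatures are simplified for illustration and differ from the library's actual \texttt{Trial} implementation.
\begin{lstlisting}
def run(agent, task, num_iterations):
    # Schematic interaction loop with illustrative signatures.
    agent.reset()
    state, reward = task.get_start_state(), None  # hypothetical helper
    for _ in range(num_iterations):
        # The agent maps the latest state and reward to an action.
        action = agent.interact(reward, state)
        # The environment executes it and returns a new state and reward.
        state, reward = task.perform_action(action)
\end{lstlisting}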
166 |
167 | \subsubsection{Environment}
168 | An \texttt{Environment} object is initialized at some state, with an arbitrarily
169 | defined state and action space. Actions are performed on an \texttt{Environment}
170 | object under the subroutine \texttt{perform\_action()}, and the output is a new state and its reward.
171 |
172 | \subsubsection{Trial and Plot}
173 | As for trials, we implement a class \texttt{Trial} which contains all information for running multiple trials, i.e., independent collections of episodes to learn and act upon. We also add a \texttt{Plot} class, a wrapper containing all \texttt{Trial} objects; this is convenient for generating plots over collections of trials coming from possibly many different agents.
174 |
175 | \subsection{POMDP}
176 | For POMDPs, we assume that the agent is given an observation model of the
177 | environment it is acting in, in the form of a conditional probability distribution
178 | $P(observation \mid state)$. Astr\"{o}m has shown that a properly updated probability
179 | distribution over the state space $S$ is sufficient to summarize all the observable
180 | history of a POMDP agent without loss of optimality \cite{astrom1965optimal}.
181 | Thus we add a belief-state update step to the MDP paradigm, and the belief state, instead
182 | of the underlying state, is used to update the transition model.
183 |
184 | \subsection{Organization of Code}
185 | We follow the directory structure specified in the problem set, with two
186 | exceptions:
187 | \begin{itemize}
188 | \item \texttt{documentation/} does not exist. Instead, documentation is written
189 | in the \texttt{README.md} inside the current working directory. Any additional
190 | documentation not purely necessary for the problem set submission is in the
191 | GitHub wiki (which is a subset of anything in this writeup).
192 | \item \texttt{source/} is named \texttt{bayesrl/} in order to follow Python
193 | convention for installing modules.
194 | \end{itemize}
195 |
196 | \section{Analysis}
197 |
198 | \subsection{Runtime}
199 | The algorithm takes $O(|S|^2|A|)$ time to calculate the transition probabilities, the
200 | expected rewards, and the belief state. It is uncertain how many time steps are required for
201 | convergence. All solvers we implement are guaranteed to converge in polynomial time for MDPs,
202 | although we have not seen tighter theoretical results on the
203 | upper bounds than this. This makes sense, as the bound must hold for all
204 | environments, including pathological ones. However, it would
205 | certainly be interesting to examine bounds under stricter assumptions where we
206 | fix the environment and perhaps certain parameter settings to simplify the
207 | analysis.
208 |
209 | \subsection{Memory}
210 | We need to store several arrays for the computation. The transition probability
211 | table is of dimension $|S|^2|A|$. The value table is of
212 | dimension $|S||A|$. The transition observation table is of
213 | dimension $|S|^2|A|$. Thus the space requirement for the
214 | algorithm is $O(|S|^2|A|)$.
215 |
216 | \subsection{Limitations}
217 | If the number of states is large, the algorithm quickly becomes intractable.
We
218 | are quite interested in examining function approximations, which allow one to
219 | essentially apply a supervised learning algorithm to predict the optimal action
220 | given state characteristics, rather than hardcoding them manually. Note,
221 | however, that this makes the runtime even worse, as additional error
222 | accumulates as a result of the predictions.
223 |
224 | \section{Experiments}
225 | The implementation of Thompson sampling for MDPs ran very successfully: as time
226 | progressed, the agent was able to reach the goal much faster and obtain a high
227 | reward. It did not work, however, for POMDPs. As time progressed, the agent seemed
228 | to take about the same amount of time to reach the goal on every execution, not
229 | improving in performance. We hypothesize that the reason is the lack of
230 | a good prior on the transition model: Thompson sampling relies heavily on being
231 | able to count the number of transitions between states, given an action.
232 |
233 | \begin{figure}[ht]
234 | \begin{center}
235 | \centerline{\includegraphics[width=\textwidth]{img/mdp_imm_rewards.png}}
236 | \caption{Immediate reward for MDP solvers in a variety of parameter settings on
237 | the GridWorld example.
238 | Each Thompson sampling setting varies how strongly the uniform prior is
239 | misspecified, and yet all three scenarios indicate
240 | convergence. We also see that R-MAX performs better than the temporal difference methods.}
241 | \end{center}
242 | \end{figure}
243 |
244 | This is very hard in a POMDP with a weak prior on the transition model, as the agent
245 | may have a completely wrong idea of where it ends up at each step. The
246 | observation model is supposed to improve its accuracy, but it was not sufficient
247 | in this case. In fact, we can see this empirically: by essentially hardcoding the
248 | optimal path---putting high probability on where the agent should go---it
249 | converges to the optimal policy quite quickly.
250 |
251 | To illustrate how differently the same approach fared on MDPs versus
252 | POMDPs, we show our results:
253 |
254 | \begin{figure}[ht]
255 | \centering
256 | \includegraphics[width=0.75\textwidth]{img/pomdp.png}
257 | \caption{\label{fig:pomdp}Cumulative reward for the POMDP}
258 | \end{figure}
259 |
260 | These graphs show the cumulative reward the agent obtained as time progressed.
261 | In the MDP case, the reward started low and kept getting lower as the agent
262 | explored. It then started rising as the agent began acting in a
263 | more deliberate manner to maximize reward. The POMDP agent, however, seems to
264 | be stuck in a regime where its reward just gets increasingly negative. It is unclear
265 | whether it never leaves the exploration phase, or whether it simply learns a completely
266 | wrong transition model, and thus computes a very wrong policy.
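To make the role of these transition counts concrete, the following minimal sketch shows the Dirichlet-multinomial step: maintain a count table and, at each replanning step, draw a transition model from the induced posterior to plan against. It assumes a single scalar Dirichlet prior parameter and is only an illustration, not the library's \texttt{ThompsonSampAgent} code.
\begin{lstlisting}
import numpy as np

def sample_transition_model(transition_counts, dirichlet_param=1.0):
    # transition_counts[s, a, s'] holds observed (s, a) -> s' counts
    # (belief-weighted, hence possibly fractional, in the POMDP case).
    num_states, num_actions, _ = transition_counts.shape
    T = np.zeros_like(transition_counts, dtype=float)
    for s in range(num_states):
        for a in range(num_actions):
            # Posterior over next states is Dirichlet(counts + prior).
            T[s, a] = np.random.dirichlet(
                transition_counts[s, a] + dirichlet_param)
    return T  # one plausible model; run value iteration against it
\end{lstlisting}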
267 |
268 | \bibliography{6_834j_ps03}
269 | \bibliographystyle{plain}
270 |
271 | %##############################################################################
272 | % End Document
273 | %##############################################################################
274 |
275 | \end{document}
276 | -------------------------------------------------------------------------------- /reports/6_834j_ps04.bib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/reports/6_834j_ps04.bib -------------------------------------------------------------------------------- /reports/6_834j_ps04.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/reports/6_834j_ps04.pdf -------------------------------------------------------------------------------- /reports/6_834j_ps04.tex: --------------------------------------------------------------------------------
1 | %##############################################################################
2 | % Preamble
3 | %##############################################################################
4 |
5 | \documentclass{pset}
6 | \name{Dustin Tran, Xiaomin Wang, Rodrigo Gomes}
7 | \email{\{trandv,xiaominw, rgomes\}@mit.edu}
8 |
9 | \course{6.834J/16.412J, S15}
10 | \instructor{Professor Brian Williams}
11 | \assignment{Problem Set \#4}
12 | \duedate{May 13, 2015}
13 |
14 | \begin{document}
15 |
16 | %##############################################################################
17 | % Begin Document
18 | %##############################################################################
19 |
20 | \begin{center}
21 | \Large Task: Robot grocery shopping in partially observable settings
22 | \end{center}
23 | \section{Motivation}
24 | \label{sec:motivation}
25 | Imagine that you're in your bed, hungry, and you just don't want to walk
26 | all the way to the grocery store. Equivalently, imagine you're working, doing the
27 | most exciting task to save humanity, and you have no time to grab food---menial
28 | tasks are beyond you. We'd like a robot which can intelligently learn to shop
29 | for groceries for us: it understands a query from the user, moves to the
30 | grocery store, finds the relevant items, purchases them, and comes
31 | back to the user.
32 |
33 | In our project we focus on arguably the most difficult of
34 | these tasks, which is to locate the groceries in the store. Moreover, we work
35 | under the realistic
36 | scenario in which the robot can only observe its surroundings (a POMDP) rather
37 | than knowing precisely where it is in the store (an MDP).
38 |
39 | Given this partially observable setting, the robot should learn how to obtain
40 | all items in the grocery store and do so in an optimal amount of time. It must
41 | 1. figure out where it is in the supermarket;
42 | 2. intelligently search for the items by learning which aisle corresponds to
43 | which category; and 3. find the optimal path and sequence to obtain all items.
44 |
45 | \section{Setup}
46 | The robot is given a list of items it should find in the supermarket. The supermarket is represented
47 | by a grid world. It has several aisles containing different categories of goods.
48 | The robot has a map of the supermarket (it knows where the walls and aisles are), but
49 | it does not know where items are located on the shelves.
The robot has a perfect sensor, which can
50 | observe its four neighbors. It can move in four directions: up, down, left, and right. It will
51 | transition to the intended position 90\% of the time. For the rest of the time, it is equally likely
52 | to move in any of the permitted directions except the one opposite the intended direction. The robot
53 | is equally likely to start the mission in one of the four corners of the supermarket. When the robot
54 | moves to the grid cell next to a target item, the item is considered found. It continues moving until all
55 | the items have been found.
56 |
57 |
58 | \section{Procedure}
59 | \label{sec:procedure}
60 | The code for the grounded scenario can be found in the \texttt{visual} folder.
61 | You can start the simulation by running \texttt{python debug.py}.
62 | An instance of the robot and a random supermarket are created automatically.
63 | To set which products you want the robot to grab for you, you can write:
64 | \texttt{g.targets = set([...])}. For example, if you want ice cream and beans,
65 | you can write: \texttt{g.targets = set(['iscream', 'beans'])}. To view all possible
66 | products, organized by aisles, just type: \texttt{print g.aisles\_content}.
67 | After selecting your items, just type \texttt{go()} to start the robot demo.
68 |
69 | \subsection{Belief Update}
70 | At every step of the demo, the following happens:
71 |
72 | \begin{itemize}
73 | \item Agent provides next action based on current belief state
74 | \item Simulator executes action (errors happen with some probability)
75 | \item Belief state is updated based on transition probabilities
76 | \item Belief state is updated based on observation
77 | \item Belief about the world is updated based on belief state and observation
78 | \end{itemize}
79 |
80 | The belief updates are done using Bayes' rule. One issue we ran into is that we make
81 | the Markov assumption for our states, which is not entirely correct, because
82 | the organization of the aisles is part of the state. This biases us
83 | towards the most likely state when entering an aisle, since each step makes
84 | that aisle's type more likely, and the most likely state, in turn, becomes
85 | more likely. The desired behavior would be to only update the aisle-organization
86 | belief when we enter the aisle, and not as we walk along it. An alternative
87 | implementation of our belief state that could potentially avoid this problem would
88 | have been a particle filter.
89 |
90 | \subsection{Max Probability Value Iteration}
91 | We reduce the POMDP to an MDP, and solve the MDP using value iteration. At each time step, we assume
92 | that the robot is at the most likely state according to its belief state. We also generate an
93 | arrangement of all the items according to the probability distribution of items over the aisles. With
94 | this information, the robot knows exactly where it is and where everything is. Since it also has the
95 | transition model, value iteration becomes straightforward. Thus we can find the best action to take,
96 | assuming this is the true state of the world.
97 |
98 | In a way, we are approximating the POMDP with a most-likely-state MDP. Alternatively, we can also
99 | sample over all possible states according to their probability distribution and run value iteration
100 | for each of them. We then choose the best action that gives the robot the maximal weighted-average
101 | expected reward.
Such sampling can lower the probability of choosing a bad action based on a
102 | completely wrong world state. Thus the robot can make smarter moves, but at the expense of
103 | additional computation.
104 |
105 | \section{Experiments}
106 | \label{sec:experiments}
107 |
108 | \subsection{Path Planning}
109 | The robot plans its path intelligently but greedily. As long as there are still targets to be
110 | retrieved, the robot moves toward the state with the highest reward. However, it does not try to minimize the
111 | total distance traveled to retrieve all the targets. If the currently best state is very far away
112 | and a target with lower reward lies closer to the robot but not on the way to the best state, the
113 | robot will ignore the closer target and go for the best state.
114 |
115 | One approach to address this non-optimality is to include the remaining target items as part of our state.
116 | The reward function would also have to be changed accordingly. However, this would greatly increase the state
117 | space and slow down the value iteration.
118 |
119 |
120 | \subsection{Run Time}
121 | Value iteration runs fairly fast for our problem size. The number of iterations it takes to converge
122 | depends heavily on the value of the discount factor $\gamma$. For example, for an 11-by-7 grid and a
123 | $\gamma$ value of 0.9, it takes about 90 iterations. If we change $\gamma$ to 0.5, it only takes about
124 | 15 iterations.
125 |
126 | \bibliography{6_834j_ps04}
127 | \bibliographystyle{plain}
128 |
129 | %##############################################################################
130 | % End Document
131 | %##############################################################################
132 |
133 | \end{document}
134 | -------------------------------------------------------------------------------- /reports/6_834j_talk.tex: --------------------------------------------------------------------------------
1 | % NOTE: must be run with
2 | % xelatex -shell-escape 6_834j_talk
3 | \documentclass[10pt, compress]{beamer}
4 |
5 | \usetheme{m}
6 |
7 | \usepackage{booktabs}
8 | \usepackage[scale=2]{ccicons}
9 | \usepackage{minted}
10 |
11 | \usepgfplotslibrary{dateplot}
12 |
13 | \usemintedstyle{trac}
14 |
15 | \newcommand{\cmark}{\ding{51}}%
16 | \newcommand{\xmark}{\ding{55}}%
17 |
18 | \newcommand{\defeq}{\mathrel{\overset{\makebox[0pt]{\mbox{\normalfont\tiny\sffamily def}}}{=}}}
19 |
20 | \title{Robot grocery shopping in partially observable settings}
21 | \subtitle{}
22 | \date{May 13, 2015}
23 | \author{Rodrigo Gomes, Xiaomin Wang, Dustin Tran}
24 | \institute{MIT, 6.834j Cognitive Robotics}
25 |
26 | \begin{document}
27 | %(2 min) Background on POMDPs, belief-state MDP, MDP solvers we have
28 | %(2 min) Setup: Grocery shopping as planning in a POMDP
29 | %(4 min) Demo
30 | %(2 min) The solver actually used (value iteration)
31 | %(1 min) Things that failed (Thompson sampling)
32 | %(1 min) Q&A
33 |
34 | \maketitle
35 |
36 | \begin{frame}[fragile]
37 | \frametitle{Outline}
38 |
39 | \begin{enumerate}
40 | \item Background on POMDPs
41 | \item Grocery shopping as planning in a POMDP
42 | \item Demo!
43 | \item What worked 44 | \item What failed 45 | \end{enumerate} 46 | 47 | \end{frame} 48 | 49 | \begin{frame}[fragile] 50 | \frametitle{Background} 51 | 52 | A \emph{partially observable Markov decision process} (POMDP) is a tuple $(S,A,\Omega,R,T,O)$ 53 | 54 | \begin{itemize} 55 | \item $S$: state space 56 | \item $A$: action space 57 | \item $\Omega$: observation space 58 | \item $R: S \times A \rightarrow \mathbb{R}$ reward function 59 | \item $T$: transition operator. $T(s' \mid s,a)$ is probability of next state $s'$ given state $s$ and action $a$ 60 | \item $O$: observable operator. $O(o \mid s)$ is probability of observing 61 | $o$ given at state $s$ 62 | \end{itemize} 63 | \end{frame} 64 | 65 | \begin{frame}[fragile] 66 | \frametitle{Background} 67 | 68 | \begin{figure}[ht] 69 | \begin{center} 70 | \centerline{\includegraphics[width=1.25\textwidth]{img/agent_environment_untitled.png}} 71 | \end{center} 72 | \end{figure} 73 | \end{frame} 74 | 75 | \begin{frame}[fragile] 76 | \frametitle{Background} 77 | A POMDP induces an equivalent representation as a \emph{belief MDP} with tuple $(B, A, \tau, R)$ 78 | 79 | \begin{itemize} 80 | \item $B$: set of belief states over the POMDP states 81 | \item $A$: action space of original POMDP 82 | \item $\tau$: belief state transition operator\footnote{ 83 | Given $b(s)$, after taking action $a$ and observing $o$ (and reaching state 84 | $s'$), update belief states 85 | \begin{equation} 86 | b'(s') = \frac{P(o\mid b,a,s')}{P(o\mid b,a)}=\frac{O(o\mid s',a)\sum_{s\in S} T(s'\mid 87 | s,a)b(s)}{\sum_{s'\in S} O(o\mid s',a)\sum_{s\in S}T(s'\mid s,a)b(s)} 88 | \end{equation} 89 | } 90 | \begin{equation*} 91 | \tau(b, a, b') 92 | = \sum_{o\in\Omega} P(b'\mid b, a, o)P(o\mid a, b) 93 | \end{equation*} 94 | \item $r: B\times A \rightarrow \mathbb{R}$ belief state reward function 95 | \begin{equation*} 96 | r(b,a) = \sum_{s\in S}b(s)R(s,a) 97 | \end{equation*} 98 | \end{itemize} 99 | 100 | \end{frame} 101 | 102 | \begin{frame}[fragile] 103 | \frametitle{Background} 104 | A POMDP induces an equivalent representation as a \emph{belief MDP} with tuple $(B, A, \tau, R)$ 105 | 106 | \begin{itemize} 107 | \item $B$: set of belief states over the POMDP states 108 | \item $A$: action space of original POMDP 109 | \item $\tau$: belief state transition operator\footnote{ 110 | \alert{ 111 | Given $b(s)$, after taking action $a$ and observing $o$ (and reaching state 112 | $s'$), update belief states 113 | \begin{equation} 114 | b'(s') = \frac{P(o\mid b,a,s')}{P(o\mid b,a)}=\frac{O(o\mid s',a)\sum_{s\in S} T(s'\mid 115 | s,a)b(s)}{\sum_{s'\in S} O(o\mid s',a)\sum_{s\in S}T(s'\mid s,a)b(s)} 116 | \end{equation} 117 | }} 118 | \begin{equation*} 119 | \tau(b, a, b') 120 | = \sum_{o\in\Omega} P(b'\mid b, a, o)P(o\mid a, b) 121 | \end{equation*} 122 | \item $r: B\times A \rightarrow \mathbb{R}$ belief state reward function 123 | \begin{equation*} 124 | r(b,a) = \sum_{s\in S}b(s)R(s,a) 125 | \end{equation*} 126 | \end{itemize} 127 | 128 | \end{frame} 129 | 130 | \begin{frame}[fragile] 131 | \frametitle{Background} 132 | 133 | Implemented MDP solvers: 134 | \begin{itemize} 135 | \item Q-learning 136 | \item SARSA 137 | \item R-MAX 138 | \item Thompson sampling 139 | \end{itemize} 140 | 141 | There are a lot! 
142 | \begin{itemize} 143 | \item Function approximations with adaptive basis functions 144 | \item BOSS 145 | \item Spectral methods 146 | \item Skill chaining 147 | \item $\cdots$ 148 | \end{itemize} 149 | 150 | \end{frame} 151 | 152 | \begin{frame}[fragile] 153 | \frametitle{Background} 154 | 155 | Implemented MDP solvers: 156 | \begin{itemize} 157 | \item Q-learning \alert{(Watkins, 1989)} 158 | \item SARSA \alert{(Rummery and Niranjan, 1994)} 159 | \item R-MAX \alert{(Brafman and Tennenholtz, 2002)} 160 | \item Thompson sampling \alert{(Strens, 2000)} 161 | \end{itemize} 162 | 163 | There are a lot more! 164 | \begin{itemize} 165 | \item Function approximations with adaptive basis functions \alert{(Mnih et 166 | al., 2013)} 167 | \item BOSS \alert{(Asmuth et al., 2009)} 168 | \item Spectral methods \alert{(Boots et al., 2009)} 169 | \item Skill chaining \alert{(Konidaris and Barto, 2009)} 170 | \item $\cdots$ 171 | \end{itemize} 172 | 173 | \end{frame} 174 | 175 | \begin{frame}[fragile] 176 | \frametitle{Grocery shopping} 177 | 178 | Setup: Grid World POMDP 179 | 180 | Uncertain movement 181 | 182 | \centerline{\includegraphics[width=0.22\textwidth]{img/uncertain_transition.png}} 183 | 184 | Can only see around current cell (partially observable) 185 | 186 | \centerline{\includegraphics[width=0.22\textwidth]{img/partial_obs.png}} 187 | 188 | World is not fully known beforehand 189 | \begin{itemize} 190 | \item Model of how items in the same aisle correlate 191 | \item Unknown arrangement of aisles 192 | \item Unknown arrangement of items within aisles 193 | \end{itemize} 194 | \end{frame} 195 | 196 | \begin{frame}[fragile] 197 | \frametitle{Grocery shopping} 198 | 199 | GUI interface: {\it pygame} 200 | 201 | Every second: 202 | \begin{itemize} 203 | \item Agent provides next action based on current belief state 204 | \item Simulator executes action (errors may happen) 205 | \item Belief state is updated based on transition probabilities 206 | \item Belief state is updated based on observation 207 | \item Belief about the world is updated based on belief state, and observation 208 | \end{itemize} 209 | 210 | Challenges: 211 | \begin{itemize} 212 | \item Markov assumption is not completely accurate 213 | \item Bias towards increasing probability of most likely states 214 | % (state increases observation probability, observation increases state probability) 215 | %\item 216 | \end{itemize} 217 | \end{frame} 218 | 219 | \plain{demo} 220 | 221 | \begin{frame}[fragile] 222 | \frametitle{Our working solver} 223 | We encode a \textbf{Max Probability} MDP 224 | \begin{itemize} 225 | \item Motivated from greedy policies 226 | \item Choose the most likely state from belief states as one's position in an 227 | MDP 228 | \item Solve the MDP! 
229 | \end{itemize} 230 | \end{frame} 231 | 232 | \begin{frame}[fragile] 233 | \frametitle{Our working solver} 234 | Value iteration: 235 | \begin{align*} 236 | v_{k+1}(s) &= \max_a \mathbb{E}[R_{t+1} + \gamma v_k(S_{t+1}) \mid S_t = s, A_t = a] \\ 237 | &= \max_a \sum_{s'} p(s' \mid s,a) [r(s,a,s') + \gamma v_k(s')] 238 | \end{align*} 239 | \end{frame} 240 | 241 | \begin{frame}[fragile] 242 | \frametitle{Failed tasks} 243 | 244 | \begin{itemize} 245 | \item Continuous state space in belief MDP: Value iteration 246 | \item Thompson sampling 247 | \item TD($\lambda$) methods: Q-Learning, SARSA, Monte Carlo Tree Search 248 | \end{itemize} 249 | \end{frame} 250 | 251 | \begin{frame} 252 | \begin{figure}[ht] 253 | \frametitle{Most simplified task (GridWorld)} 254 | \vspace{3ex} 255 | \begin{center} 256 | \centerline{\includegraphics[width=1.1\textwidth]{img/mdp_imm_rewards.png}} 257 | \end{center} 258 | \end{figure} 259 | \end{frame} 260 | 261 | \plain{ 262 | {\Large Play with it!}\\[5ex] 263 | \includegraphics{img/octocat.png}\\[3ex] 264 | github.com/dustinvtran/bayesrl 265 | } 266 | 267 | \end{document} 268 | -------------------------------------------------------------------------------- /reports/beamercolorthememetropolis.sty: -------------------------------------------------------------------------------- 1 | % Beamer mtheme 2 | % 3 | % Copyright 2014 Matthias Vogelgesang 4 | % Licensed under CC-BY-SA 4.0 International. 5 | % 6 | % The initial template comes from the HSRM beamer theme by Benjamin Weiss, which 7 | % you can find at https://github.com/hsrmbeamertheme/hsrmbeamertheme. 8 | % 9 | 10 | \ProvidesPackage{beamercolorthememetropolis} 11 | 12 | 13 | %}}} 14 | %{{{ --- Options ---------------------- 15 | 16 | \newif\if@beamer@metropolis@blockbg 17 | \@beamer@metropolis@blockbgfalse 18 | \DeclareOptionBeamer{blockbg}{\@beamer@metropolis@blockbgtrue} 19 | 20 | \DeclareOptionBeamer*{% 21 | \PackageWarning{beamercolorthememetropolis}{Unknown option `\CurrentOption'}% 22 | } 23 | 24 | \ProcessOptionsBeamer 25 | 26 | %}}} 27 | %{{{ --- Colors --------------------- 28 | 29 | % http://paletton.com/#uid=7050t0kkJkJsntwoyp6gYgoddc4 30 | 31 | \definecolor{mDarkBrown}{HTML}{604c38} 32 | \definecolor{mDarkTeal}{HTML}{23373b} 33 | 34 | \definecolor{mLightBrown}{HTML}{EB811B} 35 | \definecolor{mMediumBrown}{HTML}{C87A2F} 36 | 37 | \setbeamercolor{palette primary}{fg=mDarkTeal, bg=black!2} 38 | \setbeamercolor{palette secondary}{fg=white, bg=mDarkTeal} 39 | \setbeamercolor{palette quaternary}{fg=mDarkBrown} 40 | \setbeamercolor{palette tertiary}{fg=white, bg=mMediumBrown} 41 | 42 | \setbeamercolor{title}{parent=palette primary} 43 | \setbeamercolor{subtitle}{parent=palette primary} 44 | \setbeamercolor{author}{parent=palette primary} 45 | \setbeamercolor{date}{parent=palette primary} 46 | \setbeamercolor{institute}{parent=palette primary} 47 | 48 | \setbeamercolor{section title}{parent=palette primary} 49 | \setbeamercolor{frametitle}{parent=palette secondary} 50 | \setbeamercolor{background canvas}{parent=palette primary} 51 | \setbeamercolor{structure}{fg=mDarkTeal} 52 | 53 | \setbeamercolor{normal text}{fg=black!97} 54 | \setbeamercolor{alerted text}{fg=mLightBrown} 55 | 56 | \setbeamercolor{footnote}{fg=mDarkTeal!50} 57 | \setbeamercolor{footnote mark}{fg=.} 58 | \setbeamercolor{page number in head/foot}{fg=mDarkTeal} 59 | 60 | \if@beamer@metropolis@blockbg 61 | 62 | \setbeamercolor{block title}{use=palette primary,parent=palette primary,bg=palette primary.bg!80!fg} 63 | \setbeamercolor{block 
title alerted}{use={palette primary,alerted text},parent=palette primary,fg=alerted text.fg,bg=palette primary.bg!80!fg} 64 | \setbeamercolor{block title example}{use={palette primary,example text},parent=palette primary,fg=example text.fg,bg=palette primary.bg!80!fg} 65 | 66 | \setbeamercolor{block body}{use=block title,parent=normal text,bg=block title.bg!50} 67 | \setbeamercolor{block body alerted}{use={normal text,block body},parent=normal text,bg=block body.bg} 68 | \setbeamercolor{block body example}{use={normal text,block body},parent=normal text,bg=block body.bg} 69 | 70 | \fi 71 | 72 | \mode 73 | -------------------------------------------------------------------------------- /reports/beamerfontthememetropolis.sty: -------------------------------------------------------------------------------- 1 | % Beamer mtheme 2 | % 3 | % Copyright 2014 Matthias Vogelgesang 4 | % Licensed under CC-BY-SA 4.0 International. 5 | % 6 | % The initial template comes from the HSRM beamer theme by Benjamin Weiss, which 7 | % you can find at https://github.com/hsrmbeamertheme/hsrmbeamertheme. 8 | % 9 | 10 | \ProvidesPackage{beamerfontthememetropolis} 11 | 12 | \RequirePackage[no-math]{fontspec} 13 | 14 | 15 | \defaultfontfeatures{Mapping=tex-text} 16 | %\setsansfont[BoldFont={Fira Sans}]{Fira Sans Light} 17 | %\setmonofont{Fira Mono} 18 | %\newfontfamily\ExtraLight{Fira Sans ExtraLight} 19 | %\newfontfamily\Light{Fira Sans Light} 20 | %\newfontfamily\Book{Fira Sans} 21 | %\newfontfamily\Medium{Fira Sans Medium} 22 | %\setsansfont{Helvetica Neue Thin} 23 | %\setmonofont{Courier New} 24 | %\newfontfamily\ExtraLight{Helvetica Neue Thin} 25 | %\newfontfamily\Light{Helvetica Neue Thin} 26 | %\newfontfamily\Book{Helvetica Neue Thin} 27 | %\newfontfamily\Medium{Helvetica Neue Thin} 28 | \setsansfont{Palatino} 29 | \setmonofont{Courier New} 30 | \newfontfamily\ExtraLight{Palatino} 31 | \newfontfamily\Light{Palatino} 32 | \newfontfamily\Book{Helvetica Neue Thin} 33 | \newfontfamily\Medium{Palatino} 34 | 35 | %\AtBeginEnvironment{tabular}{\setsansfont[BoldFont={Fira Sans}, Numbers={Monospaced}]{Fira Sans Light}} 36 | %\AtBeginEnvironment{tabular}{\setsansfont[BoldFont={Helvetica Neue Thin}, Numbers={Monospaced}]{Helvetica Neue Thin}} 37 | \AtBeginEnvironment{tabular}{\setsansfont[BoldFont={Palatino}, Numbers={Monospaced}]{Palatino}} 38 | 39 | \setbeamerfont{title}{family=\Book, size=\Large} 40 | \setbeamerfont{author}{family=\ExtraLight, size=\small} 41 | \setbeamerfont{date}{family=\ExtraLight, size=\small} 42 | 43 | \setbeamerfont{section title}{family=\Book, size=\Large} 44 | 45 | \setbeamerfont{block title}{family=\Book, size=\normalsize} 46 | \setbeamerfont{block title alerted}{family=\Book,size=\normalsize} 47 | 48 | \setbeamerfont{subtitle}{family=\Light, size=\fontsize{12}{14}} 49 | \setbeamerfont{frametitle}{family=\Book, size=\large} 50 | 51 | \setbeamerfont{caption}{size=\small} 52 | \setbeamerfont{caption name}{family=\Book} 53 | 54 | \setbeamerfont{description item}{family=\Book} 55 | 56 | \setbeamerfont{page number in head/foot}{size=\scriptsize} 57 | 58 | 59 | \linespread{1.15} 60 | -------------------------------------------------------------------------------- /reports/beamerthemem.sty: -------------------------------------------------------------------------------- 1 | % Beamer mtheme 2 | % 3 | % Copyright 2014 Matthias Vogelgesang 4 | % Licensed under CC-BY-SA 4.0 International. 
5 | % 6 | % The initial template comes from the HSRM beamer theme by Benjamin Weiss, which 7 | % you can find at https://github.com/hsrmbeamertheme/hsrmbeamertheme. 8 | % 9 | 10 | \ProvidesPackage{beamerthemem} 11 | 12 | %{{{ --- Options ---------------------- 13 | 14 | \newif\if@useTitleProgressBar 15 | \newif\if@protectFrameTitle 16 | \newif\if@noSmallCapitals 17 | \newif\if@noSectionSlide 18 | \newif\if@useTotalSlideIndicator 19 | 20 | \@useTitleProgressBarfalse 21 | \@protectFrameTitlefalse 22 | \@noSmallCapitalsfalse 23 | \@noSectionSlidefalse 24 | \@useTotalSlideIndicatorfalse 25 | 26 | \newlength{\@mtheme@voffset} 27 | \setlength{\@mtheme@voffset}{2em} 28 | 29 | \DeclareOptionBeamer{usetitleprogressbar}{\@useTitleProgressBartrue} 30 | \DeclareOptionBeamer{protectframetitle}{\@protectFrameTitletrue} 31 | \DeclareOptionBeamer{blockbg}{% 32 | \PassOptionsToPackage{blockbg}{beamercolorthememetropolis}% 33 | } 34 | \DeclareOptionBeamer{nooffset}{\setlength{\@mtheme@voffset}{0em}} 35 | 36 | \DeclareOptionBeamer*{% 37 | \PackageWarning{beamerthemem}{Unknown option `\CurrentOption'}% 38 | } 39 | 40 | \DeclareOptionBeamer{nosmallcapitals}{\@noSmallCapitalstrue} 41 | \DeclareOptionBeamer{nosectionslide}{\@noSectionSlidetrue} 42 | \DeclareOptionBeamer{usetotalslideindicator}{\@useTotalSlideIndicatortrue} 43 | 44 | \ProcessOptionsBeamer 45 | 46 | %}}} 47 | 48 | \mode 49 | 50 | %{{{ --- Packages --------------------- 51 | 52 | \RequirePackage[no-math]{fontspec} 53 | \RequirePackage{etoolbox} 54 | \RequirePackage{tikz} 55 | \RequirePackage{pgfplots} 56 | 57 | \usetikzlibrary{backgrounds} 58 | \usetikzlibrary{calc} 59 | 60 | \usecolortheme{metropolis} 61 | \usefonttheme{metropolis} 62 | 63 | %}}} 64 | %{{{ --- Titlepage -------------------- 65 | 66 | \def\maketitle{\ifbeamer@inframe\titlepage\else\frame[plain]{\titlepage}\fi} 67 | 68 | \def\titlepage{\usebeamertemplate{title page}} 69 | \setbeamertemplate{title page} 70 | { 71 | \begin{minipage}[b][\paperheight]{\textwidth} 72 | \vspace*{\@mtheme@voffset} 73 | \ifx\inserttitlegraphic\@empty% 74 | \else% 75 | { 76 | % actual output of titlegraphic 77 | \usebeamercolor[fg]{titlegraphic}\inserttitlegraphic\par% 78 | % measurement and add negative vspace 79 | \newdimen\logoheight 80 | \setbox0=\vbox{\inserttitlegraphic}% 81 | \logoheight=\ht0 \advance\logoheight by \dp0 % 82 | \vspace*{-\logoheight}% 83 | \vspace*{-1em}% I don't know why this additional negative space is needed 84 | }% 85 | \fi% 86 | \vfill 87 | \ifx\inserttitle\@empty% 88 | \else% 89 | \if@noSmallCapitals% 90 | {\raggedright\linespread{1.0}\usebeamerfont{title}\usebeamercolor[fg]{title}\inserttitle\par}% 91 | \else% 92 | %{\raggedright\linespread{1.0}\usebeamerfont{title}\usebeamercolor[fg]{title}\scshape\MakeLowercase{\inserttitle}\par}% 93 | {\raggedright\linespread{1.0}\usebeamerfont{title}\usebeamercolor[fg]{title}\scshape{\inserttitle}\par}% 94 | \fi% 95 | \vspace*{0.5em} 96 | \fi% 97 | \ifx\insertsubtitle\@empty% 98 | \else% 99 | {\usebeamerfont{subtitle}\usebeamercolor[fg]{subtitle}\insertsubtitle\par}% 100 | \vspace*{0.5em} 101 | \fi% 102 | \begin{tikzpicture}\draw[alerted text.fg] (0, 0) -- (\textwidth, 0);\end{tikzpicture}% 103 | \vspace*{1em} 104 | \ifx\insertauthor\@empty% 105 | \else% 106 | {\usebeamerfont{author}\usebeamercolor[fg]{author}\insertauthor\par}% 107 | \vspace*{0.25em} 108 | \fi% 109 | \ifx\insertdate\@empty% 110 | \else% 111 | {\usebeamerfont{date}\usebeamercolor[fg]{date}\insertdate\par}% 112 | \fi% 113 | \ifx\insertinstitut\@empty% 114 | \else% 115 | 
\vspace*{3mm} 116 | {\usebeamerfont{institute}\usebeamercolor[fg]{institute}\insertinstitute\par}% 117 | \fi% 118 | \vfill 119 | \vspace*{\@mtheme@voffset} 120 | \end{minipage} 121 | } 122 | 123 | %}}} 124 | %{{{ --- Progressbar ------------------ 125 | 126 | \makeatletter 127 | \def\progressbar@sectionprogressbar{} 128 | \def\progressbar@titleprogressbar{} 129 | \newcount\progressbar@tmpcounta % auxiliary counter 130 | \newcount\progressbar@tmpcountb % auxiliary counter 131 | \newdimen\progressbar@pbht % progressbar height 132 | \newdimen\progressbar@pbwd % progressbar width 133 | \newdimen\progressbar@tmpdim % auxiliary dimension 134 | 135 | \progressbar@pbwd=22em 136 | \progressbar@pbht=0.4pt 137 | 138 | % the progress bar 139 | \def\progressbar@sectionprogressbar{% 140 | {\usebeamercolor{palette primary}% 141 | \progressbar@tmpcounta=\insertframenumber 142 | \progressbar@tmpcountb=\inserttotalframenumber 143 | \progressbar@tmpdim=\progressbar@pbwd 144 | \divide\progressbar@tmpdim by 100 145 | \multiply\progressbar@tmpdim by \progressbar@tmpcounta 146 | \divide\progressbar@tmpdim by \progressbar@tmpcountb 147 | \multiply\progressbar@tmpdim by 100 148 | 149 | % fixes very high linespacing introduced via \textsc{\MakeLowercase{...}} 150 | \fontsize{1em}{1em}\selectfont 151 | 152 | \makebox[\textwidth][c]{ 153 | \begin{tikzpicture}[tight background] 154 | 155 | \node[anchor=south west, fg, inner sep=0pt, text width=\progressbar@pbwd] at (0pt, 0pt) {\insertsectionHEAD}; 156 | 157 | \draw[anchor=west, fg!20, fill=fg!20, inner sep=0pt] 158 | (0, -1ex) rectangle ++ (\progressbar@pbwd, \progressbar@pbht); 159 | 160 | \draw[anchor=west, fg, fill=fg, inner sep=0pt] 161 | (0, -1ex) rectangle ++ (\progressbar@tmpdim, \progressbar@pbht); 162 | \end{tikzpicture}% 163 | } 164 | } % end usebeamercolor{palette primary} 165 | } 166 | 167 | \if@useTitleProgressBar 168 | \def\progressbar@titleprogressbar{% 169 | \progressbar@tmpcounta=\insertframenumber 170 | \progressbar@tmpcountb=\inserttotalframenumber 171 | \progressbar@tmpdim=\paperwidth 172 | \divide\progressbar@tmpdim by 100 173 | \multiply\progressbar@tmpdim by \progressbar@tmpcounta 174 | \divide\progressbar@tmpdim by \progressbar@tmpcountb 175 | \multiply\progressbar@tmpdim by 100 176 | {% 177 | \usebeamercolor{palette quaternary}% 178 | \usebeamercolor{alerted text}% 179 | \begin{tikzpicture}[tight background] 180 | \draw[palette quaternary.fg, fill=palette quaternary.fg] (0, 0) rectangle ($(\paperwidth, 0.6pt) - (0.4pt, 0)$); 181 | \draw[alerted text.fg, fill=alerted text.fg] (0, 0) rectangle (\progressbar@tmpdim, 0.6pt); 182 | \end{tikzpicture}% 183 | }% 184 | } 185 | \fi 186 | %}}} 187 | %{{{ --- Commands --------------------- 188 | 189 | \newcommand{\insertsectionHEAD}{% 190 | \expandafter\insertsectionHEADaux\insertsectionhead} 191 | 192 | \if@noSmallCapitals% 193 | \newcommand{\insertsectionHEADaux}[3]{#3}% 194 | \else% 195 | \newcommand{\insertsectionHEADaux}[3]{\textsc{\MakeLowercase{#3}}}% 196 | \fi% 197 | 198 | \newcommand{\plain}[2][]{% 199 | \begingroup 200 | \setbeamercolor{background canvas}{use=palette primary,bg=palette primary.fg} 201 | \begin{frame}{#1} 202 | \centering 203 | \vfill\vspace{1em}\usebeamerfont{section title}\textcolor{white}{\scshape #2}\vfill 204 | \end{frame} 205 | \endgroup 206 | } 207 | 208 | %}}} 209 | %{{{ --- Itemize ---------------------- 210 | 211 | \setlength{\leftmargini}{1em} 212 | 213 | % Actually one level should be enough but ... 
214 | \setlength{\leftmarginii}{1em} 215 | \setlength{\leftmarginiii}{1em} 216 | 217 | \newcommand{\itemBullet}{∙} 218 | 219 | \setbeamertemplate{itemize item}{\itemBullet} 220 | \setbeamertemplate{itemize subitem}{\itemBullet} 221 | \setbeamertemplate{itemize subsubitem}{\itemBullet} 222 | \setlength{\parskip}{0.5em} 223 | 224 | %}}} 225 | %{{{ --- Sections --------------------- 226 | 227 | % Insert frame with section title at every section start 228 | \AtBeginSection[] 229 | { 230 | \if@noSectionSlide% 231 | \else% 232 | \begingroup 233 | \setbeamercolor{background canvas}{parent=palette primary} 234 | \begin{frame}[plain] 235 | \vspace{2em}\usebeamerfont{section title} 236 | \progressbar@sectionprogressbar% 237 | \end{frame} 238 | \endgroup 239 | \fi% 240 | } 241 | 242 | %}}} 243 | %{{{ --- Captions --------------------- 244 | 245 | \setbeamertemplate{caption label separator}{: } 246 | \setbeamertemplate{caption}[numbered] 247 | 248 | %}}} 249 | %{{{ --- Footline/footnote ------------ 250 | 251 | \usenavigationsymbolstemplate{} 252 | \setbeamertemplate{footline} 253 | {% 254 | \begin{beamercolorbox}[wd=\textwidth,ht=3ex,dp=3ex,leftskip=0.3cm,rightskip=0.3cm]{structure}% 255 | \hfill\usebeamerfont{page number in head/foot}% 256 | \if@useTotalSlideIndicator% 257 | \insertpagenumber/\insertpresentationendpage% 258 | \else% 259 | \insertpagenumber% 260 | \fi% 261 | \end{beamercolorbox}% 262 | } 263 | 264 | \setbeamertemplate{footnote} 265 | {% 266 | \parindent 0em\noindent% 267 | \raggedright 268 | \usebeamercolor{footnote}\hbox to 0.8em{\hfil\insertfootnotemark}\insertfootnotetext\par% 269 | } 270 | 271 | %}}} 272 | %{{{ --- Frametitle ------------------- 273 | 274 | \setbeamertemplate{frametitle}{% 275 | \nointerlineskip 276 | \begin{beamercolorbox}[wd=\paperwidth,leftskip=0.3cm,rightskip=0.3cm,ht=2.5ex,dp=1.5ex]{frametitle} 277 | \usebeamerfont{frametitle}% 278 | \if@protectFrameTitle% 279 | \protect% 280 | \if@noSmallCapitals% 281 | \insertframetitle% 282 | \else% 283 | %\textsc{\MakeLowercase{\insertframetitle}}% 284 | \textsc{{\insertframetitle}}% 285 | \fi% 286 | \else% 287 | \if@noSmallCapitals% 288 | \insertframetitle% 289 | \else% 290 | %\textsc{\MakeLowercase{\insertframetitle}}% 291 | \textsc{{\insertframetitle}}% 292 | \fi% 293 | \fi% 294 | \end{beamercolorbox}% 295 | \if@useTitleProgressBar 296 | \vspace{-.5em} 297 | \begin{beamercolorbox}[wd=\paperwidth,ht=1pt,dp=0pt]{frametitle} 298 | \progressbar@titleprogressbar 299 | \end{beamercolorbox} 300 | \fi 301 | \vspace{\@mtheme@voffset} 302 | } 303 | 304 | %}}} 305 | %{{{ --- pgfplots --------------------- 306 | 307 | %{{{ Colors 308 | 309 | % TolColors from http://www.r-bloggers.com/the-paul-tol-21-color-salute/ 310 | \definecolor{TolColor1}{HTML}{332288} % dark purple 311 | \definecolor{TolColor2}{HTML}{6699CC} % dark blue 312 | \definecolor{TolColor3}{HTML}{88CCEE} % light blue 313 | \definecolor{TolColor4}{HTML}{44AA99} % light green 314 | \definecolor{TolColor5}{HTML}{117733} % dark green 315 | \definecolor{TolColor6}{HTML}{999933} % dark brown 316 | \definecolor{TolColor7}{HTML}{DDCC77} % light brown 317 | \definecolor{TolColor8}{HTML}{661100} % dark red 318 | \definecolor{TolColor9}{HTML}{CC6677} % light red 319 | \definecolor{TolColor10}{HTML}{AA4466} % light pink 320 | \definecolor{TolColor11}{HTML}{882255} % dark pink 321 | \definecolor{TolColor12}{HTML}{AA4499} % light purple 322 | 323 | %}}} 324 | %{{{ Color cycles 325 | 326 | \pgfplotscreateplotcyclelist{mbarplot cycle}{% 327 | {draw=TolColor2, fill=TolColor2!70}, 
328 | {draw=TolColor7, fill=TolColor7!70}, 329 | {draw=TolColor4, fill=TolColor4!70}, 330 | {draw=TolColor11, fill=TolColor11!70}, 331 | {draw=TolColor1, fill=TolColor1!70}, 332 | {draw=TolColor8, fill=TolColor8!70}, 333 | {draw=TolColor6, fill=TolColor6!70}, 334 | {draw=TolColor9, fill=TolColor9!70}, 335 | {draw=TolColor10, fill=TolColor10!70}, 336 | {draw=TolColor12, fill=TolColor12!70}, 337 | {draw=TolColor3, fill=TolColor3!70}, 338 | {draw=TolColor5, fill=TolColor5!70}, 339 | } 340 | 341 | \pgfplotscreateplotcyclelist{mlineplot cycle}{% 342 | {TolColor2, mark=*, mark size=1.5pt}, 343 | {TolColor7, mark=square*, mark size=1.3pt}, 344 | {TolColor4, mark=triangle*, mark size=1.5pt}, 345 | {TolColor6, mark=diamond*, mark size=1.5pt}, 346 | } 347 | 348 | %}}} 349 | %{{{ Styles 350 | 351 | \pgfplotsset{ 352 | compat=1.9, 353 | mbaseplot/.style={ 354 | legend style={ 355 | draw=none, 356 | fill=none, 357 | cells={anchor=west}, 358 | }, 359 | x tick label style={ 360 | font=\footnotesize 361 | }, 362 | y tick label style={ 363 | font=\footnotesize 364 | }, 365 | legend style={ 366 | font=\footnotesize 367 | }, 368 | major grid style={ 369 | dotted, 370 | }, 371 | axis x line*=bottom, 372 | }, 373 | mlineplot/.style={ 374 | mbaseplot, 375 | xmajorgrids=true, 376 | ymajorgrids=true, 377 | major grid style={dotted}, 378 | axis x line=bottom, 379 | axis y line=left, 380 | legend style={ 381 | cells={anchor=west}, 382 | draw=none 383 | }, 384 | cycle list name=mlineplot cycle, 385 | }, 386 | mbarplot base/.style={ 387 | mbaseplot, 388 | bar width=6pt, 389 | axis y line*=none, 390 | }, 391 | mbarplot/.style={ 392 | mbarplot base, 393 | ybar, 394 | xmajorgrids=false, 395 | ymajorgrids=true, 396 | area legend, 397 | legend image code/.code={% 398 | \draw[#1] (0cm,-0.1cm) rectangle (0.15cm,0.1cm); 399 | }, 400 | cycle list name=mbarplot cycle, 401 | }, 402 | horizontal mbarplot/.style={ 403 | mbarplot base, 404 | xmajorgrids=true, 405 | ymajorgrids=false, 406 | xbar stacked, 407 | area legend, 408 | legend image code/.code={% 409 | \draw[#1] (0cm,-0.1cm) rectangle (0.15cm,0.1cm); 410 | }, 411 | cycle list name=mbarplot cycle, 412 | }, 413 | disable thousands separator/.style={ 414 | /pgf/number format/.cd, 415 | 1000 sep={} 416 | }, 417 | } 418 | 419 | %}}} 420 | 421 | \mode 422 | 423 | %{{{ misc 424 | \let\otp\titlepage 425 | \renewcommand{\titlepage}{\otp\addtocounter{framenumber}{-1}} 426 | \newcommand{\mreducelistspacing}{\vspace{-\topsep}} 427 | 428 | \linespread{1.15} 429 | 430 | %}}} 431 | -------------------------------------------------------------------------------- /reports/demo.webm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/reports/demo.webm -------------------------------------------------------------------------------- /reports/img/agent_environment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/reports/img/agent_environment.png -------------------------------------------------------------------------------- /reports/img/agent_environment_untitled.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/reports/img/agent_environment_untitled.png 
-------------------------------------------------------------------------------- /reports/img/mdp_imm_rewards.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/reports/img/mdp_imm_rewards.png -------------------------------------------------------------------------------- /reports/img/octocat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/reports/img/octocat.png -------------------------------------------------------------------------------- /reports/img/partial_obs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/reports/img/partial_obs.png -------------------------------------------------------------------------------- /reports/img/pomdp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/reports/img/pomdp.png -------------------------------------------------------------------------------- /reports/img/uncertain_transition.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/reports/img/uncertain_transition.png -------------------------------------------------------------------------------- /reports/pset.cls: -------------------------------------------------------------------------------- 1 | % This is a modified version of the Harvard ML template by Ryan Adams. 2 | 3 | \NeedsTeXFormat{LaTeX2e}[1995/01/01] 4 | \ProvidesClass{pset} 5 | [2013/08/31 v0.01 Harvard ML Assignment Class] 6 | 7 | %############################################################################## 8 | % Base class 9 | %############################################################################## 10 | 11 | \LoadClass[10pt,letterpaper]{article} 12 | 13 | % "(no)submit" argument specifies whether to include credits in header. 
14 | \newif\ifpset@submit 15 | \DeclareOption{submit}{\pset@submittrue} 16 | \DeclareOption{nosubmit}{\pset@submitfalse} 17 | \DeclareOption*{\PassOptionsToClass{\CurrentOption}{article}} 18 | \ExecuteOptions{submit} 19 | \ProcessOptions\relax 20 | 21 | %############################################################################## 22 | % Packages 23 | %############################################################################## 24 | 25 | \RequirePackage{palatino} 26 | \RequirePackage{mathpazo} 27 | \RequirePackage{amsmath} 28 | \RequirePackage{amssymb} 29 | \RequirePackage{amsthm} 30 | %\RequirePackage{fancyhdr} 31 | \RequirePackage{fullpage} 32 | \RequirePackage{graphicx} 33 | \RequirePackage{mdframed} 34 | 35 | %############################################################################## 36 | % Page Headings 37 | %############################################################################## 38 | 39 | %\pagestyle{fancy} 40 | %\fancyhead[L]{\rule[-1.25ex]{0em}{0ex}Tran} 41 | %\fancyhead[C]{\pset@assignment} 42 | %\fancyhead[R]{\thepage} 43 | %\fancyfoot[L]{} 44 | %\fancyfoot[C]{} 45 | %\fancyfoot[R]{} 46 | 47 | %############################################################################## 48 | % Header 49 | %############################################################################## 50 | 51 | \def\titlebar{\hrule height2pt\vskip .25in\vskip-\parskip} 52 | 53 | \newcommand{\headerblock}{% 54 | \noindent\begin{minipage}{0.33\textwidth} 55 | \begin{flushleft} 56 | \ifpset@submit 57 | \mbox{\pset@course}\\ 58 | \mbox{\pset@instructor} 59 | \fi 60 | \end{flushleft} 61 | \end{minipage} 62 | \noindent\begin{minipage}{0.33\textwidth} 63 | \begin{center} 64 | \mbox{\Large\pset@assignment}\protect\\ 65 | Due: \pset@duedate 66 | \end{center} 67 | \end{minipage} 68 | \noindent\begin{minipage}{0.33\textwidth} 69 | \begin{flushright} 70 | \ifpset@submit 71 | \mbox{\pset@name}\\ 72 | \mbox{\pset@email} 73 | \fi 74 | \end{flushright} 75 | \end{minipage} 76 | \vspace{0.1cm} 77 | \titlebar 78 | } 79 | 80 | \AtBeginDocument{\headerblock} 81 | 82 | \def\pset@name{} 83 | \def\pset@email{} 84 | \def\pset@course{} 85 | \def\pset@instructor{} 86 | \def\pset@assignment{} 87 | \def\pset@duedate{} 88 | 89 | % Commands to automatically input info. 
90 | \newcommand{\name}[1]{\def\pset@name{#1}} 91 | \newcommand{\email}[1]{\def\pset@email{#1}} 92 | \newcommand{\course}[1]{\def\pset@course{#1}} 93 | \newcommand{\instructor}[1]{\def\pset@instructor{#1}} 94 | \newcommand{\assignment}[1]{\def\pset@assignment{#1}} 95 | \newcommand{\duedate}[1]{\def\pset@duedate{#1}} 96 | 97 | %############################################################################## 98 | % Environments 99 | %############################################################################## 100 | 101 | \newtheoremstyle{box} 102 | {3pt}% Space above 103 | {3pt}% Space below 104 | {}% Body font 105 | {}% Indent amount 106 | {\bfseries}% Theorem head font 107 | {\\*[3pt]}% Punctuation after theorem head 108 | {.5em}% Space after theorem head 109 | {}% Theorem head spec (can be left empty, meaning `normal') 110 | \theoremstyle{box} 111 | \newmdtheoremenv[skipabove=\topsep,skipbelow=\topsep]{problem}{Problem} 112 | 113 | %############################################################################## 114 | % Misc 115 | % TODO: Organize 116 | %############################################################################## 117 | \RequirePackage{amsfonts,amssymb,amsthm} 118 | \RequirePackage{cancel} 119 | \RequirePackage{centernot} 120 | \RequirePackage{color} 121 | \RequirePackage{enumerate} 122 | \RequirePackage{graphicx} 123 | %\RequirePackage{hyperref} 124 | %\hypersetup{colorlinks=true,urlcolor=blue} 125 | \RequirePackage{listings} 126 | \RequirePackage{mathrsfs} 127 | \RequirePackage{tikz} 128 | %\RequirePackage{tikz-cd} 129 | \usetikzlibrary{patterns,shapes,snakes} 130 | %\RequirePackage[usenames,dvipsnames,svgnames,table]{xcolor} 131 | %\RequirePackage{fontspec, xunicode} 132 | %\setmonofont{Consolas} 133 | 134 | % Emulate markdown's light grey background for monospace. 135 | \usepackage{soul} 136 | \definecolor{Light}{gray}{.96} 137 | \sethlcolor{Light} 138 | \let\OldTexttt\texttt 139 | \renewcommand{\texttt}[1]{\OldTexttt{\hl{#1}}}% will affect all \texttt 140 | 141 | % Use knitr's colorscheme. 
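% These colors feed the \lstset defaults below, so any \begin{lstlisting}...\end{lstlisting} or \lstinputlisting block is rendered in this scheme.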
142 | \definecolor{fgcolor}{rgb}{0.345, 0.345, 0.345} 143 | \definecolor{hlnum}{rgb}{0.686,0.059,0.569} 144 | \definecolor{hlstr}{rgb}{0.192,0.494,0.8} 145 | \definecolor{hlcom}{rgb}{0.678,0.584,0.686} 146 | \definecolor{hlopt}{rgb}{0,0,0} 147 | \definecolor{hlstd}{rgb}{0.345,0.345,0.345} 148 | \definecolor{hlkwa}{rgb}{0.161,0.373,0.58} 149 | \definecolor{hlkwb}{rgb}{0.69,0.353,0.396} 150 | \definecolor{hlkwc}{rgb}{0.333,0.667,0.333} 151 | \definecolor{hlkwd}{rgb}{0.737,0.353,0.396} 152 | \definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969} 153 | 154 | \lstset{ 155 | backgroundcolor=\color{shadecolor}, 156 | basicstyle=\color{hlstd}\ttfamily\footnotesize, 157 | breakatwhitespace=false, 158 | %breaklines=true, 159 | captionpos=b, 160 | commentstyle=\color{hlcom}, 161 | deletekeywords={...}, 162 | escapeinside={\%*}{*)}, 163 | extendedchars=true, 164 | frame=lines, 165 | keepspaces=true, 166 | keywordstyle=\color{hlkwb}, 167 | morekeywords={*,...}, 168 | numbers=left, 169 | numbersep=5pt, 170 | numberstyle=\tiny\color{hlstd}, 171 | rulecolor=\color{hlstd}, 172 | showspaces=false, 173 | showstringspaces=false, 174 | showtabs=false, 175 | stepnumber=1, 176 | stringstyle=\color{hlstr}, 177 | tabsize=2, 178 | title=\lstname 179 | } 180 | 181 | %############################################################################## 182 | % Operator Macros 183 | %############################################################################## 184 | 185 | \newcommand\given[1][]{\:#1\vert\:} 186 | \newcommand{\todo}[1]{\textcolor{red}{xx TODO: #1 xx }} 187 | \newcommand{\eminus}{\text{\sc{e}-}} 188 | \newcommand{\e}{\text{\sc{e}}} 189 | 190 | %############################################################################## 191 | % Environments 192 | %############################################################################## 193 | 194 | \newtheorem{theorem}{Theorem}[] 195 | \newtheorem{definition}[theorem]{Definition} 196 | \newtheorem{assumption}[theorem]{Assumption} 197 | \newtheorem{conjecture}[theorem]{Conjecture} 198 | \newtheorem{claim}[theorem]{Claim} 199 | \newtheorem{lemma}[theorem]{Lemma} 200 | \newtheorem{proposition}[theorem]{Proposition} 201 | \newtheorem{property}[theorem]{Property} 202 | \newtheorem{fact}[theorem]{Fact} 203 | \newtheorem{corollary}[theorem]{Corollary} 204 | \newtheorem{example}[theorem]{Example} 205 | \newtheorem{remark}[theorem]{Remark} 206 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | try: 2 | from setuptools import setup 3 | except ImportError: 4 | from distutils.core import setup 5 | 6 | config = { 7 | 'name': 'bayesrl', 8 | 'description': 'Reinforcement learning using Bayesian approaches', 9 | 'author': 'Dustin Tran', 10 | 'author_email': 'dtran@g.harvard.edu', 11 | 'version': '0.1', 12 | 'packages': ['bayesrl'], 13 | 'scripts': [], 14 | } 15 | 16 | setup(**config) 17 | -------------------------------------------------------------------------------- /tests/gridworld.py: -------------------------------------------------------------------------------- 1 | """ 2 | Prints an example of a grid world. See gridworld.py for a key to the symbols. 
3 | """ 4 | from bayesrl.environments import GridWorld 5 | 6 | maze = GridWorld.samples['larger'] 7 | for row in maze: 8 | print(row) 9 | -------------------------------------------------------------------------------- /tests/thompsongridworld.py: -------------------------------------------------------------------------------- 1 | """ 2 | Solves grid world using Thompson sampling. 3 | """ 4 | 5 | from bayesrl.environments import GridWorld 6 | from bayesrl.agents.thompsonsampagent import ThompsonSampAgent 7 | from bayesrl.trial import Trial 8 | from bayesrl.plot import Plot 9 | 10 | # Define environment. 11 | task = GridWorld( 12 | GridWorld.samples['larger'], 13 | action_error_prob=.1, 14 | rewards={'*': 50, 'moved': -1, 'hit-wall': -1}) 15 | 16 | num_trials = 1 17 | 18 | # Define agent. 19 | # Dirichlet params = 1, Reward params = 50 20 | agent = ThompsonSampAgent( 21 | num_states=task.num_states, num_actions=task.num_actions, 22 | discount_factor=0.95, T=50, dirichlet_param=1, reward_param=50) 23 | trial_thompson1 = Trial(agent, task) 24 | trial_thompson1.run_multiple(num_trials) 25 | 26 | # Plots! 27 | plot = Plot({"Thompson sampling": [trial_thompson1]}) 28 | plot.rewards_by_episode() 29 | -------------------------------------------------------------------------------- /visual/.grid.py.swp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/.grid.py.swp -------------------------------------------------------------------------------- /visual/agent.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python2 2 | import numpy as np 3 | import itertools 4 | 5 | 6 | class Agent(object): 7 | """ 8 | Base class for all reinforcement learning agents to inherit from. 9 | 10 | Parameters 11 | ---------- 12 | grid: the environment the agent acts on. 13 | gamma: float in (0,1] 14 | The discount factor per iteration. 15 | target_reward: reward for getting a target item 16 | reward is -1 on non-target states. 17 | """ 18 | def __init__(self, grid, gamma=.92, target_reward=100, aisle_reward=50): 19 | self.discount_factor = gamma 20 | self.target_reward = target_reward 21 | self.aisle_reward = aisle_reward 22 | self.grid = grid 23 | self.states = [(r,c) for r in range(self.grid.height) for c in range(self.grid.width)] 24 | self.num_actions = len(self.grid.actions) 25 | self.value_table = np.zeros((self.grid.height, self.grid.width, self.num_actions)) 26 | 27 | def _value_iteration(self): 28 | value = np.zeros(self.value_table.shape) 29 | reward = self.get_reward_state() 30 | k = 0 31 | while True: 32 | diff = 0 33 | for s in self.states: 34 | old = np.max(value[s]) 35 | value[s] = [np.sum([p*(reward[s_]+self.discount_factor*np.max(value[s_])) for (s_,p) 36 | in self.grid.transition_probs(s,a).items()]) for a in self.grid.actions] 37 | diff = max(diff, abs(old - np.max(value[s]))) 38 | k += 1 39 | if diff < 1e-2: 40 | break 41 | if k > 1e6: 42 | raise Exception("Value iteration not converging. 
Stopped at 1e6 iterations.") 43 | self.value_table = value 44 | 45 | def next_action(self): 46 | most_likely_state = self.states[self._argmax_breaking_ties_randomly(np.ravel(self.grid.belief))] 47 | next_action = self._argmax_breaking_ties_randomly(self.value_table[most_likely_state]) 48 | return self.grid.actions[next_action] 49 | 50 | def get_reward_state(self): 51 | targets = self.grid.targets 52 | target_states = [] 53 | rewards = np.ones((self.grid.height, self.grid.width))*-1. 54 | aisles_belief = self.grid.aisles_belief 55 | content_belief = self.grid.content_belief 56 | categories = aisles_belief[1].keys() 57 | aisles_configs = [] 58 | aisles_probs = [] 59 | for config in itertools.permutations(categories): 60 | aisles_configs.append(config) 61 | aisle_prob = np.product([aisles_belief[i+1][config[i]] for i in 62 | range(len(config))]) 63 | aisles_probs.append(aisle_prob) 64 | aisles_probs = np.array(aisles_probs)/sum(aisles_probs) 65 | multinomial = np.random.multinomial(1, aisles_probs) 66 | aisles_config = aisles_configs[list(multinomial).index(1)] 67 | items_configs = [] 68 | for category in aisles_config: 69 | shelf_configs = [] 70 | shelf_probs = [] 71 | items = content_belief[category][1].keys() 72 | for config in itertools.permutations(items): 73 | shelf_configs.append(config) 74 | shelf_prob = np.product([content_belief[category][i][config[i]] for i in 75 | range(len(config))]) 76 | shelf_probs.append(shelf_prob) 77 | shelf_probs = np.array(shelf_probs)/sum(shelf_probs) 78 | multinomial = np.random.multinomial(1, shelf_probs) 79 | items_config = shelf_configs[list(multinomial).index(1)] 80 | items_configs.append(items_config) 81 | for t in targets: 82 | if t in items_config: 83 | state = \ 84 | self.grid.aisles_list[aisles_config.index(category)][items_config.index(t)] 85 | target_states.append(state) 86 | for s in target_states: 87 | 88 | a,_ = self.grid.cell_to_aisle(s) 89 | for (r,c) in self.grid.aisles_list[a-1]: 90 | for dr, dc in self.grid.actions: 91 | neighbor = (r+dr, c+dc) 92 | if not self.grid.blocked(neighbor): 93 | if rewards[neighbor] == -1: 94 | rewards[neighbor] = 0 95 | rewards[neighbor] += self.aisle_reward 96 | 97 | for dr, dc in self.grid.actions: 98 | neighbor = (s[0]+dr, s[1]+dc) 99 | if not self.grid.blocked(neighbor): 100 | if rewards[neighbor] == -1: 101 | rewards[neighbor] = 0 102 | rewards[neighbor] += self.target_reward 103 | return rewards 104 | 105 | 106 | # Make sure inherited classes have interact() function. 
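    # A no-op in this visual demo: the agent replans with _value_iteration() before acting instead of learning from the reward signal.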
107 | def interact(self, reward, next_state, next_state_is_terminal): 108 | return 109 | 110 | 111 | def _argmax_breaking_ties_randomly(self, x): 112 | """Taken from Ken.""" 113 | max_value = np.max(x) 114 | indices_with_max_value = np.flatnonzero(x == max_value) 115 | return np.random.choice(indices_with_max_value) 116 | -------------------------------------------------------------------------------- /visual/colors.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python2 2 | import pygame 3 | 4 | black = pygame.Color(0,0,0) 5 | white = pygame.Color(255,255,255) 6 | blue = pygame.Color(0,0,255) 7 | green = pygame.Color(0,255,0) 8 | red = pygame.Color(255,0,0) 9 | nameToColor = { 10 | 'black' : black, 11 | 'white' : white, 12 | 'blue' : blue, 13 | 'green' : green, 14 | 'red' : red 15 | } 16 | 17 | gray = lambda fraction: (lambda c: pygame.Color(c,c,c))(int((1-fraction)*255)) 18 | -------------------------------------------------------------------------------- /visual/debug.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python2 2 | import IPython 3 | from threading import Thread 4 | from grid import SuperMarket 5 | import display 6 | import sys 7 | import pygame 8 | from pygame.locals import * 9 | import agent 10 | 11 | held = None 12 | def event_handler(e): 13 | global held 14 | if e.type == pygame.KEYDOWN and held is None: 15 | if e.key == pygame.K_a: 16 | g.set_robot(a1) 17 | held = 'a' 18 | elif e.key == pygame.K_s: 19 | g.set_robot(a2) 20 | held = 'b' 21 | elif e.key == pygame.K_d: 22 | g.set_robot(a3) 23 | held = 'd' 24 | elif e.key == pygame.K_w: 25 | g.set_robot(a4) 26 | held = 'w' 27 | if held is not None: 28 | print g.observe() 29 | elif e.type == pygame.KEYUP and held: 30 | if e.key == pygame.K_a and held == 'a': 31 | held = None 32 | elif e.key == pygame.K_s and held == 'b': 33 | held = None 34 | elif e.key == pygame.K_d and held == 'd': 35 | held = None 36 | elif e.key == pygame.K_w and held == 'w': 37 | held = None 38 | 39 | g = SuperMarket() 40 | a = agent.Agent(g) 41 | class Count: 42 | pass 43 | def process(every_frames): 44 | counter = Count() 45 | counter.n = 0 46 | counter.success = False 47 | def autonomous_action(): 48 | if counter.n == 0 and len(g.targets) > 0: 49 | counter.success = False 50 | a._value_iteration() 51 | g.set_robot(a.next_action()) 52 | g.observe() 53 | elif len(g.targets) == 0: 54 | if not counter.success: 55 | counter.success = True 56 | print "SUCCESS!!!!!!" 
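        # counter.n cycles modulo every_frames, so the plan/act/observe step above runs only once every every_frames frames.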
57 | counter.n = (counter.n+1)%every_frames 58 | return autonomous_action 59 | 60 | display.event_handler = event_handler 61 | def go(): 62 | display.process = process(10) 63 | 64 | a1,a2,a3,a4 = g.actions 65 | display.drawables1.append(g.draw) 66 | display.drawables2.append(g.draw_belief) 67 | t = Thread(target=display.main, args=[sys.argv]) 68 | t.start() 69 | IPython.embed() 70 | display.done = True 71 | t.join() 72 | -------------------------------------------------------------------------------- /visual/display.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python2 2 | import sys 3 | import pygame 4 | from pygame.locals import * 5 | from colors import * 6 | 7 | FPS = 10 8 | 9 | # WIDTH = 800 10 | # HEIGHT = 500 11 | done = False 12 | 13 | surface = None 14 | drawables1 = [] 15 | drawables2 = [] 16 | 17 | def event_handler(e): 18 | pass 19 | 20 | def process(): 21 | pass 22 | 23 | def main(args): 24 | global done,surface,drawables 25 | pygame.init() 26 | info = pygame.display.Info() 27 | WIDTH = info.current_w-500 28 | HEIGHT = (WIDTH)/2 29 | fpsClock = pygame.time.Clock() 30 | 31 | master_surface = pygame.display.set_mode((WIDTH+100,HEIGHT)) 32 | pygame.display.set_caption("6.834 Simulator") 33 | surface1 = pygame.Surface(((WIDTH/2),HEIGHT)) 34 | surface2 = pygame.Surface(((WIDTH/2),HEIGHT)) 35 | 36 | while not done: 37 | process() 38 | master_surface.fill(black) 39 | surface1.fill(white) 40 | surface2.fill(white) 41 | 42 | list(d(surface1) for d in drawables1) 43 | list(d(surface2) for d in drawables2) 44 | for event in pygame.event.get(): 45 | if event.type == QUIT: 46 | done = True 47 | else: 48 | event_handler(event) 49 | 50 | master_surface.blit(surface1,(0,0)) 51 | master_surface.blit(surface2,(WIDTH/2+100,0)) 52 | fpsClock.tick(FPS) 53 | pygame.display.flip() 54 | 55 | pygame.quit() 56 | return 0 57 | 58 | if __name__ == "__main__": 59 | sys.exit(main(sys.argv)) 60 | -------------------------------------------------------------------------------- /visual/grid.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python2 2 | from threading import Lock 3 | from math import * 4 | import pygame 5 | from pygame.locals import * 6 | from colors import * 7 | import random 8 | import IPython 9 | 10 | class Grid(object): 11 | def __init__(self, height, width, aisles, robot): 12 | self.height = height 13 | self.width = width 14 | self.aisles = set(aisles) 15 | self.robot = robot 16 | self.l = Lock() 17 | self.actions = [(0,-1),(1,0),(0,1),(-1,0)] 18 | self.p_error = 0. 19 | self.belief = [[1. if (r,c) == robot else 0. 
for c in range(width)] for r in range(height)] 20 | 21 | def action_errors(self,action): 22 | return [action] 23 | 24 | def blocked(self, (r,c)): 25 | return not (0 <= r < self.height and 0 <= c < self.width) or (r,c) in self.aisles 26 | 27 | def set_robot(self,action): 28 | with self.l: 29 | (r,c) = self.robot 30 | if action in self.actions: 31 | new_belief = self.transition_update(self.belief,action) 32 | with self.l: 33 | self.belief = new_belief 34 | errors = self.action_errors(action) 35 | if random.random() <= self.p_error: 36 | action = random.choice(errors) 37 | (dr,dc) = action 38 | (nr,nc) = (r+dr,c+dc) 39 | if not self.blocked((nr,nc)): 40 | with self.l: 41 | self.robot = (nr,nc) 42 | 43 | def transition_probs(self,state,action): 44 | t_probs = {} 45 | r, c = state 46 | errors = self.action_errors(action) 47 | dr,dc = action 48 | new_state = (r+dr,c+dc) if not self.blocked((r+dr,c+dc)) else (r,c) 49 | t_probs[new_state] = 1. - self.p_error 50 | for (dr,dc) in errors: 51 | new_state = (r+dr,c+dc) if not self.blocked((r+dr,c+dc)) else (r,c) 52 | if new_state in t_probs: 53 | t_probs[new_state] += self.p_error/len(errors) 54 | else: 55 | t_probs[new_state] = self.p_error/len(errors) 56 | return t_probs 57 | 58 | def transition_update(self,belief,action): 59 | new_belief = [[0. for c in range(self.width)] for r in range(self.height)] 60 | for r in range(self.height): 61 | for c in range(self.width): 62 | t_probs = self.transition_probs((r,c),action) 63 | for (nr,nc) in t_probs: 64 | new_belief[nr][nc] += t_probs[(nr,nc)]*self.belief[r][c] 65 | return new_belief 66 | 67 | def dimensions(self,surface): 68 | pix_height = surface.get_height() 69 | pix_width = surface.get_width() 70 | 71 | row_height = int(pix_height/self.height) 72 | col_width = int(pix_width/self.width) 73 | 74 | return pix_height,pix_width,row_height,col_width 75 | 76 | def draw(self,surface,robot=True): 77 | pix_height,pix_width,row_height,col_width = self.dimensions(surface) 78 | # Draw rows 79 | # 80 | for r in range(1,self.height): 81 | pygame.draw.line(surface,black,(0,r*row_height),(pix_width,r*row_height)) 82 | # Draw columns 83 | # 84 | for c in range(1,self.width): 85 | pygame.draw.line(surface,black,(c*col_width,0),(c*col_width,pix_height)) 86 | 87 | # Draw the aisles 88 | # 89 | for (r,c) in self.aisles: 90 | surface.fill(black, rect=(c*col_width,r*row_height,col_width,row_height)) 91 | 92 | if robot: 93 | with self.l: 94 | (r,c) = self.robot 95 | (x,y) = int((c+0.5)*col_width),int((r+0.5)*row_height) 96 | radius = int(min(row_height,col_width)/2.0) 97 | pygame.draw.circle(surface,red,(x,y),radius,10) 98 | 99 | class SuperMarket(Grid): 100 | def __init__(self): 101 | self.aisles_content = { 102 | 'meats': ['chicken','beef','pork','turkey'], 103 | 'candy': ['oreo','twix','nutella','kitkat'], 104 | 'dairy': ['milk','iscream','butter','curd'], 105 | 'drink': ['water','juice','soda','smoothi'], 106 | 'grain': ['rice','flour','barley', 'beans'], 107 | 'pasta': ['penne','fusilli','farfalle','lasagna'] 108 | } 109 | 110 | self.aisles_list = [ 111 | [(i+1,2*n+1) for i in range(len(self.aisles_content.values()[0]))] 112 | for n in range(len(self.aisles_content)) 113 | ] 114 | aisles = [cell for a in self.aisles_list for cell in a] 115 | 116 | width = len(self.aisles_list * 2) + 1 117 | height = len(self.aisles_list[0])+3 118 | possible_robot = [(0,0),(height-1,width-1)] 119 | robot = random.choice(possible_robot) 120 | possible_robot = set(possible_robot) 121 | 122 | 
super(SuperMarket,self).__init__(height,width,aisles,robot) 123 | 124 | self.belief = [[1./len(possible_robot) if (r,c) in possible_robot else 0. 125 | for c in range(width)] for r in range(height)] 126 | self.actions = [(0,-1),(1,0),(0,1),(-1,0)] 127 | self.p_error = 0.05 128 | 129 | self.targets = set(['oreo','iscream','milk']) 130 | 131 | for c in self.aisles_content: 132 | random.shuffle(self.aisles_content[c]) 133 | aisles_order = self.aisles_list[:] 134 | random.shuffle(aisles_order) 135 | self.obs = {} 136 | for prods,aisle in zip(self.aisles_content.values(),aisles_order): 137 | self.obs.update(zip(aisle,prods)) 138 | 139 | # Aisle belief state 140 | # 141 | self.aisles_belief = {} 142 | aisles_types = self.aisles_content.keys() 143 | for a in range(len(self.aisles_list)): 144 | self.aisles_belief[a+1] = dict(zip( 145 | aisles_types, 146 | [1./len(aisles_types)]*len(aisles_types))) 147 | 148 | # Inner aisle belief state 149 | # 150 | inner = lambda prods: dict((p,1./len(prods)) for p in prods) 151 | self.content_belief = dict( 152 | (cat,dict(enumerate([inner(prods) for _ in prods]))) 153 | for cat,prods in self.aisles_content.items() 154 | ) 155 | 156 | self.images = { 157 | 'meats' : pygame.image.load("images/meats.jpg"), 158 | 'candy' : pygame.image.load("images/candy.gif"), 159 | 'dairy' : pygame.image.load("images/dairy.jpg"), 160 | 'drink' : pygame.image.load("images/drink.jpg"), 161 | 'grain' : pygame.image.load("images/grain.jpg"), 162 | 'pasta' : pygame.image.load("images/pasta.jpg"), 163 | 164 | 'chicken' : pygame.image.load("images/chicken.jpg"), 165 | 'pork' : pygame.image.load("images/pork.jpg"), 166 | 'turkey' : pygame.image.load("images/turkey.gif"), 167 | 'beef' : pygame.image.load("images/beef.jpg"), 168 | 169 | 'oreo' : pygame.image.load("images/oreo.jpg"), 170 | 'twix' : pygame.image.load("images/twix.jpg"), 171 | 'nutella' : pygame.image.load("images/nutella.jpg"), 172 | 'kitkat' : pygame.image.load("images/kitkat.jpg"), 173 | 174 | 'milk' : pygame.image.load("images/milk.jpg"), 175 | 'curd' : pygame.image.load("images/curd.jpg"), 176 | 'iscream' : pygame.image.load("images/iscream.jpg"), 177 | 'butter' : pygame.image.load("images/butter.jpg"), 178 | 179 | 'water' : pygame.image.load("images/water.jpg"), 180 | 'juice' : pygame.image.load("images/juice.jpg"), 181 | 'soda' : pygame.image.load("images/soda.jpg"), 182 | 'smoothi' : pygame.image.load("images/smoothi.jpg"), 183 | 184 | 'rice' : pygame.image.load("images/rice.jpg"), 185 | 'flour' : pygame.image.load("images/flour.jpg"), 186 | 'barley' : pygame.image.load("images/barley.jpg"), 187 | 'beans' : pygame.image.load("images/beans.jpg"), 188 | 189 | 'penne' : pygame.image.load("images/penne.jpg"), 190 | 'fusilli' : pygame.image.load("images/fusilli.jpg"), 191 | 'farfalle': pygame.image.load("images/farfalle.jpg"), 192 | 'lasagna' : pygame.image.load("images/lasagna.jpg") 193 | } 194 | 195 | def cell_to_aisle(self,(r,c)): 196 | for i in range(len(self.aisles_list)): 197 | if (r,c) in self.aisles_list[i]: 198 | return (i+1,self.aisles_list[i].index((r,c))) 199 | return None 200 | 201 | def category(self, product): 202 | for c in self.aisles_content: 203 | if product in self.aisles_content[c]: 204 | return c 205 | return None 206 | 207 | transformed = False 208 | def draw(self,surface): 209 | pix_height,pix_width,row_height,col_width = self.dimensions(surface) 210 | super(SuperMarket,self).draw(surface) 211 | if not self.transformed: 212 | for prod in self.images: 213 | img = self.images[prod] 214 | 
self.images[prod] = pygame.transform.scale(img.convert(),(col_width,row_height)) 215 | self.transformed = True 216 | for (r,c) in self.aisles: 217 | prod = self.obs[(r,c)] 218 | img = self.images.get(prod,None) 219 | if img is not None: 220 | surface.blit(img, dest=(c*col_width,r*row_height)) 221 | 222 | def draw_belief(self,surface): 223 | with self.l: 224 | belief = [r[:] for r in self.belief] 225 | pix_height,pix_width,row_height,col_width = self.dimensions(surface) 226 | for r in range(self.height): 227 | for c in range(self.width): 228 | logB = log(belief[r][c]) if belief[r][c] != 0 else -12 229 | surface.fill(gray(max((logB+12)/12.0,0)), 230 | rect=(c*col_width,r*row_height,col_width,row_height)) 231 | 232 | super(SuperMarket,self).draw(surface,False) 233 | 234 | if not self.transformed: 235 | for prod in self.images: 236 | img = self.images[prod] 237 | self.images[prod] = pygame.transform.scale(img.convert(),(col_width,row_height)) 238 | self.transformed = True 239 | 240 | for (r,c) in self.aisles: 241 | # Are we more than 50% certain about any product here? 242 | found = False 243 | a,p = self.cell_to_aisle((r,c)) 244 | for cat in self.aisles_belief[a]: 245 | prob_cat = self.aisles_belief[a][cat] 246 | if prob_cat > 0.5: 247 | img = self.images.get(cat,None) 248 | found = True 249 | for prod in self.content_belief[cat][p]: 250 | prob = prob_cat*\ 251 | self.content_belief[cat][p][prod] 252 | if prob > 0.5: 253 | img = self.images.get(prod,None) 254 | break 255 | break 256 | if not found: 257 | continue 258 | surface.blit(img, dest=(c*col_width,r*row_height)) 259 | 260 | def action_errors(self,action): 261 | i = self.actions.index(action) 262 | l = len(self.actions) 263 | return self.actions[(i-1)%l],self.actions[(i+1)%l],(0,0) 264 | 265 | def observe(self): 266 | with self.l: 267 | (r,c) = self.robot 268 | obs = () 269 | for dr,dc in [(0,-1),(1,0),(0,1),(-1,0)]: 270 | obs += (self.obs.get((r+dr,c+dc),None),) 271 | self.observation_update(obs) 272 | list(self.targets.discard(o) for o in obs) 273 | return obs 274 | 275 | def observation_update(self, observation): 276 | with self.l: 277 | belief = [r[:] for r in self.belief] 278 | 279 | # Update position belief 280 | obs_cells = [(obs,(r+dr,c+dc),(r,c)) 281 | for r in range(self.height) 282 | for c in range(self.width) 283 | for (obs,(dr,dc)) in zip(observation, [(0,-1),(1,0),(0,1),(-1,0)]) 284 | if belief[r][c] != 0 285 | ] # (obs,(row,col),parent) 286 | for (obs,neigh,(r,c)) in obs_cells: 287 | if self.cell_to_aisle(neigh) is None: 288 | if obs is not None: 289 | belief[r][c] = 0 290 | else: 291 | if obs is None: 292 | belief[r][c] = 0 293 | else: 294 | aisle,pos = self.cell_to_aisle(neigh) 295 | cat = self.category(obs) 296 | belief[r][c] *= self.aisles_belief[aisle][cat] * self.content_belief[cat][pos][obs] 297 | Z = sum(b for r in belief for b in r) 298 | belief = [[b/Z for b in r] for r in belief] 299 | with self.l: 300 | self.belief = belief 301 | if not all(o is None for o in observation): 302 | obs_cells = [(obs,(r+dr,c+dc),belief[r][c],(r,c)) 303 | for r in range(self.height) 304 | for c in range(self.width) 305 | for (obs,(dr,dc)) in zip(observation, [(0,-1),(1,0),(0,1),(-1,0)]) 306 | if obs is not None 307 | ] # (obs,(row,col),prob,parent) 308 | self.observation_world_update(obs_cells) 309 | 310 | def observation_world_update(self,obs_cells): 311 | with self.l: 312 | belief = [r[:] for r in self.belief] 313 | # Update world belief 314 | op = {} 315 | ac = {} 316 | for (obs,neigh,prob,(r,c)) in obs_cells: 317 | if 
self.cell_to_aisle(neigh) is None: 318 | continue 319 | aisle,pos = self.cell_to_aisle(neigh) 320 | cat = self.category(obs) 321 | 322 | if obs not in op: 323 | op[obs] = {} 324 | op[obs][pos] = op[obs].get(pos,0)+prob 325 | 326 | if aisle not in ac: 327 | ac[aisle] = {} 328 | ac[aisle][cat] = ac[aisle].get(cat,0)+prob 329 | 330 | # Update aisle beliefs 331 | # 332 | ab = self.aisles_belief 333 | for a in ac: 334 | for cat in ac[a]: 335 | prob = ac[a][cat] 336 | Z1 = ab[a][cat] 337 | Z2 = 1-Z1 338 | if Z1 == 0 or Z2 == 0: 339 | continue 340 | for c in ab[a]: 341 | ab[a][c] *= prob/Z1 if (c==cat) else (1.-prob)/Z2 342 | 343 | # Update aisle position beliefs 344 | # 345 | for obs in op: 346 | cat = self.category(obs) 347 | cb = self.content_belief[cat] 348 | for p in cb: 349 | prob = op[obs][p] 350 | Z1 = cb[p][obs] 351 | Z2 = 1-Z1 352 | if Z1 == 0 or Z2 == 0: 353 | continue 354 | for o in cb[p]: 355 | cb[p][o] *= prob/Z1 if (o==obs) else (1.-prob)/Z2 356 | -------------------------------------------------------------------------------- /visual/images/barley.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/barley.jpg -------------------------------------------------------------------------------- /visual/images/beans.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/beans.jpg -------------------------------------------------------------------------------- /visual/images/beef.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/beef.jpg -------------------------------------------------------------------------------- /visual/images/butter.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/butter.jpg -------------------------------------------------------------------------------- /visual/images/candy.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/candy.gif -------------------------------------------------------------------------------- /visual/images/chicken.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/chicken.jpg -------------------------------------------------------------------------------- /visual/images/curd.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/curd.jpg -------------------------------------------------------------------------------- /visual/images/dairy.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/dairy.jpg -------------------------------------------------------------------------------- /visual/images/drink.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/drink.jpg -------------------------------------------------------------------------------- /visual/images/farfalle.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/farfalle.jpg -------------------------------------------------------------------------------- /visual/images/flour.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/flour.jpg -------------------------------------------------------------------------------- /visual/images/fusilli.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/fusilli.jpg -------------------------------------------------------------------------------- /visual/images/grain.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/grain.jpg -------------------------------------------------------------------------------- /visual/images/iscream.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/iscream.jpg -------------------------------------------------------------------------------- /visual/images/juice.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/juice.jpg -------------------------------------------------------------------------------- /visual/images/kitkat.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/kitkat.jpg -------------------------------------------------------------------------------- /visual/images/lasagna.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/lasagna.jpg -------------------------------------------------------------------------------- /visual/images/meats.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/meats.jpg -------------------------------------------------------------------------------- /visual/images/milk.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/milk.jpg -------------------------------------------------------------------------------- /visual/images/nutella.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/nutella.jpg -------------------------------------------------------------------------------- /visual/images/oreo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/oreo.jpg -------------------------------------------------------------------------------- /visual/images/pasta.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/pasta.jpg -------------------------------------------------------------------------------- /visual/images/penne.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/penne.jpg -------------------------------------------------------------------------------- /visual/images/pork.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/pork.jpg -------------------------------------------------------------------------------- /visual/images/rice.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/rice.jpg -------------------------------------------------------------------------------- /visual/images/smoothi.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/smoothi.jpg -------------------------------------------------------------------------------- /visual/images/soda.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/soda.jpg -------------------------------------------------------------------------------- /visual/images/turkey.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/turkey.gif -------------------------------------------------------------------------------- /visual/images/twix.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/twix.jpg -------------------------------------------------------------------------------- /visual/images/water.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/water.jpg --------------------------------------------------------------------------------
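For reference, here is a minimal sketch of how the pieces under `visual/` fit together when run headlessly (no pygame window). It follows the same plan-act-observe loop that `visual/debug.py` wires into the display, and it is not part of the repository: it assumes Python 2 with pygame and IPython installed (both are imported by `grid.py`), that it is started from inside `visual/` so the relative `images/` paths resolve, and the 500-step cap and final prints are illustrative additions.

```{python}
# Headless driver for the SuperMarket POMDP demo (a sketch; not in the repo).
# Mirrors the autonomous_action() loop in visual/debug.py without the display.
from grid import SuperMarket
import agent

g = SuperMarket()          # random shelf layout, robot starts in one of two corners
a = agent.Agent(g)         # value-iteration agent that plans on the belief state

steps = 0
while len(g.targets) > 0 and steps < 500:   # 500 is an arbitrary safety cap
    a._value_iteration()                    # replan against sampled shelf beliefs
    g.set_robot(a.next_action())            # act greedily from the most likely state
    g.observe()                             # Bayes-update position and world beliefs
    steps += 1

if len(g.targets) == 0:
    print "Found all targets in %d steps." % steps
else:
    print "Gave up after %d steps." % steps
```

In `debug.py`, the same loop is attached to the pygame display by setting `display.process = process(10)` via `go()`, so planning and acting happen once every ten rendered frames instead of every iteration.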