├── .gitignore ├── LICENSE ├── README.md ├── bayesrl ├── __init__.py ├── agents │ ├── __init__.py │ ├── agent.py │ ├── modelbasedagent.py │ ├── qlearningagent.py │ ├── rmaxagent.py │ ├── sarsaagent.py │ ├── thompsonsampagent.py │ └── thompsonsampagent_pomdp.py ├── environments │ ├── __init__.py │ ├── chainworld.py │ ├── gridworld.py │ └── pomdpgw.py ├── plot.py ├── trial.py └── utils.py ├── benchmarks ├── thompson_gridworld.py └── thompson_gridworld_pomdp.py ├── reports ├── 6_834j_ps03.bib ├── 6_834j_ps03.pdf ├── 6_834j_ps03.tex ├── 6_834j_ps04.bib ├── 6_834j_ps04.pdf ├── 6_834j_ps04.tex ├── 6_834j_talk.tex ├── beamercolorthememetropolis.sty ├── beamerfontthememetropolis.sty ├── beamerthemem.sty ├── demo.webm ├── img │ ├── agent_environment.png │ ├── agent_environment_untitled.png │ ├── mdp_imm_rewards.png │ ├── octocat.png │ ├── partial_obs.png │ ├── pomdp.png │ └── uncertain_transition.png └── pset.cls ├── setup.py ├── tests ├── gridworld.py └── thompsongridworld.py └── visual ├── .grid.py.swp ├── agent.py ├── colors.py ├── debug.py ├── display.py ├── grid.py └── images ├── barley.jpg ├── beans.jpg ├── beef.jpg ├── butter.jpg ├── candy.gif ├── chicken.jpg ├── curd.jpg ├── dairy.jpg ├── drink.jpg ├── farfalle.jpg ├── flour.jpg ├── fusilli.jpg ├── grain.jpg ├── iscream.jpg ├── juice.jpg ├── kitkat.jpg ├── lasagna.jpg ├── meats.jpg ├── milk.jpg ├── nutella.jpg ├── oreo.jpg ├── pasta.jpg ├── penne.jpg ├── pork.jpg ├── rice.jpg ├── smoothi.jpg ├── soda.jpg ├── turkey.gif ├── twix.jpg └── water.jpg /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.fls 3 | *.acn 4 | *.acr 5 | *.alg 6 | *.aux 7 | *.bbl 8 | *.blg 9 | *.dvi 10 | *.fdb_latexmk 11 | *.glg 12 | *.glo 13 | *.gls 14 | *.idx 15 | *.ilg 16 | *.ind 17 | *.ist 18 | *.lof 19 | *.log 20 | *.lot 21 | *.maf 22 | *.mtc 23 | *.mtc0 24 | *.nav 25 | *.nlo 26 | *.out 27 | *.pdfsync 28 | *.ps 29 | *.snm 30 | *.synctex.gz 31 | *.toc 32 | *.vrb 33 | *.xdy 34 | *.tdo 35 | *.pyc 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Dustin Tran 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BayesRL 2 | `BayesRL` is a Python library for reinforcement learning using Bayesian 3 | approaches. It stores both agents and environments under separate classes, where 4 | an agent class is a learning algorithm and environments are tasks that the agent 5 | must solve. We include agents and environments for solving and implementing both 6 | Markov decision processes (MDPs) and partially observable Markov decision 7 | processes (POMDPs). 8 | 9 | Examples can be found in the directory `tests/`. More documentation can be found in the [wiki](../../wiki). 10 | 11 | ## Installation 12 | To install from pip, run 13 | ```{bash} 14 | pip install -e "git+https://github.com/dustinvtran/bayesrl.git#egg=bayesrl" 15 | ``` 16 | 17 | ## Authors 18 | * Dustin Tran \ 19 | * Xiaomin Wang \ 20 | * Rodrigo Gomes \ 21 | 22 | ## References 23 | * Malcolm Strens. A bayesian framework for reinforcement learning. In _Proceedings of the 17th International Conference on Machine Learning (ICML)_, 2000. 24 | 25 | -------------------------------------------------------------------------------- /bayesrl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/bayesrl/__init__.py -------------------------------------------------------------------------------- /bayesrl/agents/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/bayesrl/agents/__init__.py -------------------------------------------------------------------------------- /bayesrl/agents/agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Agent(object): 4 | """ 5 | Base class for all reinforcement learning agents to inherit from. 6 | 7 | Parameters 8 | ---------- 9 | num_states: int 10 | Number of states in the task. 11 | num_actions: int 12 | Number of actions in the task. 13 | discount_factor: float in (0,1] 14 | The discount factor per iteration. 15 | """ 16 | def __init__(self, num_states, num_actions, discount_factor): 17 | self.num_states = num_states 18 | self.num_actions = num_actions 19 | self.discount_factor = discount_factor 20 | 21 | self.last_state = None 22 | self.last_action = None 23 | 24 | def reset(self): 25 | self.last_state = None 26 | self.last_action = None 27 | 28 | # Make sure inherited classes have interact() function. 
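    # Contract used throughout this package (see Trial.run and the agent subclasses):
    # reward is None on the first call of an episode, in which case implementations
    # return a random action; next_state is the state (or observation) just received
    # from the environment; next_state_is_terminal flags the end of an episode; idx
    # is the current iteration count, which learning-rate schedules may use. The
    # method returns the index of the chosen action.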
29 | def interact(self, reward, next_state, next_state_is_terminal, idx): 30 | raise NotImplementedError("interact() has not been implemented.") 31 | -------------------------------------------------------------------------------- /bayesrl/agents/modelbasedagent.py: -------------------------------------------------------------------------------- 1 | from agent import Agent 2 | import numpy as np 3 | 4 | class ModelBasedAgent(Agent): 5 | """Base class for model-based agents (e.g., R-MAX, Thompson sampling); handles an MDP only, i.e., not a stochastic game, in order to simplify data structures.""" 6 | def __init__(self, T, **kwargs): 7 | super(ModelBasedAgent, self).__init__(**kwargs) 8 | self.T = T 9 | 10 | self.policy_step = self.T # To keep track of where in the T-step policy the agent is; initialized to recompute policy 11 | self.transition_observations = np.zeros((self.num_states, self.num_actions, self.num_states)) 12 | self.value_table = np.zeros((self.num_states, self.num_actions)) 13 | 14 | def reset(self): 15 | super(ModelBasedAgent, self).reset() 16 | self.policy_step = self.T # To keep track of where in the T-step policy the agent is; initialized to recompute policy 17 | self.transition_observations.fill(0) 18 | self.value_table.fill(0) 19 | 20 | def _value_iteration(self, transition_probs): 21 | """ 22 | Run value iteration, using the procedure described in Sutton and Barto 23 | (2012). The end result is an updated value_table, from which one can 24 | deduce the policy for state s by taking the argmax (breaking ties 25 | randomly). 26 | """ 27 | value_dim = transition_probs.shape[0] 28 | value = np.zeros(value_dim) 29 | k = 0 30 | while True: 31 | diff = 0 32 | for s in xrange(value_dim): 33 | old = value[s] 34 | value[s] = np.max(np.sum(transition_probs[s]*(self.reward[s] + 35 | self.discount_factor*np.array([value,]*self.num_actions)), 36 | axis=1)) 37 | diff = max(diff, abs(old - value[s])) # Track the largest update across all states. 38 | k += 1 39 | if diff < 1e-2: 40 | break 41 | if k > 1e6: 42 | raise Exception("Value iteration not converging. Stopped at 1e6 iterations.") 43 | for s in xrange(value_dim): 44 | self.value_table[s] = np.sum(transition_probs[s]*(self.reward[s] + 45 | self.discount_factor*np.array([value,]*self.num_actions)), 46 | axis=1) 47 | 48 | def _argmax_breaking_ties_randomly(self, x): 49 | """Taken from Ken.""" 50 | max_value = np.max(x) 51 | indices_with_max_value = np.flatnonzero(x == max_value) 52 | return np.random.choice(indices_with_max_value) 53 | -------------------------------------------------------------------------------- /bayesrl/agents/qlearningagent.py: -------------------------------------------------------------------------------- 1 | from agent import Agent 2 | import numpy as np 3 | 4 | class QLearningAgent(Agent): 5 | def __init__(self, learning_rate, epsilon, value=0, **kwargs): 6 | super(QLearningAgent, self).__init__(**kwargs) 7 | self.learning_rate = learning_rate 8 | self.epsilon = epsilon 9 | self.value = value 10 | 11 | self.value_table = np.full((self.num_states, self.num_actions), self.value) 12 | 13 | def reset(self): 14 | super(QLearningAgent, self).reset() 15 | self.value_table.fill(self.value) 16 | 17 | def interact(self, reward, next_state, next_state_is_terminal, idx): 18 | # Handle start of episode. 19 | if reward is None: 20 | # Return random action since there is no information. 21 | next_action = np.random.randint(self.num_actions) 22 | self.last_state = next_state 23 | self.last_action = next_action 24 | return self.last_action 25 | 26 | # Handle completion of episode. 27 | if next_state_is_terminal: 28 | # Proceed as normal.
29 | pass 30 | 31 | # Choose next action according to epsilon-greedy policy. 32 | if np.random.random() < self.epsilon: 33 | next_action = np.random.randint(self.num_actions) 34 | else: 35 | next_action = np.argmax(self.value_table[next_state]) 36 | 37 | # Update value function. 38 | delta = reward + self.discount_factor*np.max(self.value_table[next_state]) - \ 39 | self.value_table[self.last_state, self.last_action] 40 | self.value_table[self.last_state, self.last_action] += self.learning_rate(idx) * delta 41 | 42 | self.last_state = next_state 43 | self.last_action = next_action 44 | 45 | return next_action 46 | -------------------------------------------------------------------------------- /bayesrl/agents/rmaxagent.py: -------------------------------------------------------------------------------- 1 | from modelbasedagent import ModelBasedAgent 2 | import numpy as np 3 | 4 | class RMAXAgent(ModelBasedAgent): 5 | """Runs R-MAX only for an MDP, i.e., not a stochastic game, in order to simplify data structures.""" 6 | def __init__(self, min_visit_count, **kwargs): 7 | super(RMAXAgent, self).__init__(**kwargs) 8 | self.min_visit_count = min_visit_count 9 | 10 | self.Rmax = 50 # arbitrarily set (!) 11 | self.reward = np.full((self.num_states+1, self.num_actions, self.num_states+1), self.Rmax) 12 | self.transition_observations = np.zeros((self.num_states+1, self.num_actions, self.num_states+1)) 13 | self.value_table = np.zeros((self.num_states+1, self.num_actions)) 14 | 15 | def reset(self): 16 | super(RMAXAgent, self).reset() 17 | self.reward.fill(self.Rmax) 18 | self.transition_observations.fill(0) 19 | self.value_table.fill(0) 20 | 21 | def interact(self, reward, next_state, next_state_is_terminal, idx): 22 | # Handle start of episode. 23 | if reward is None: 24 | # Return random action since there is no information. 25 | next_action = np.random.randint(self.num_actions) 26 | self.last_state = next_state 27 | self.last_action = next_action 28 | return self.last_action 29 | 30 | # Handle completion of episode. 31 | if next_state_is_terminal: 32 | # Proceed as normal. 33 | pass 34 | 35 | # Update the reward associated with (s,a,s') if first time. 36 | if self.reward[self.last_state+1, self.last_action, next_state+1] == self.Rmax: 37 | self.reward[self.last_state+1, self.last_action, next_state+1] = reward 38 | if self.Rmax < reward: 39 | self.reward[self.reward == self.Rmax] = reward 40 | self.Rmax = reward 41 | 42 | # Update set of states reached by playing a. 43 | self.transition_observations[self.last_state+1, self.last_action, next_state+1] += 1 44 | 45 | # Compute new optimal T-step policy if reach min_visit_count or finished executing previous one 46 | if self.transition_observations[self.last_state+1, self.last_action].sum() == self.min_visit_count or \ 47 | self.policy_step == self.T: 48 | self.__compute_policy() 49 | 50 | # Choose next action according to policy. 51 | next_action = self._argmax_breaking_ties_randomly(self.value_table[next_state+1]) 52 | 53 | self.policy_step += 1 54 | self.last_state = next_state 55 | self.last_action = next_action 56 | 57 | return next_action 58 | 59 | def __compute_policy(self): 60 | """Compute an optimal T-step policy for the current state.""" 61 | self.policy_step = 0 62 | # Obtain transition probabilities (prevent dividing by zero). 
63 | divisor = self.transition_observations.sum(axis=2, keepdims=True) 64 | divisor[divisor == 0] = 1 65 | transition_probs = self.transition_observations / divisor 66 | # Replace all state-action pairs with zero probability everywhere, i.e., 67 | # no counts, with probability 1 to the fictitious game state. 68 | eps = 1e-5 69 | for s in xrange(self.num_states+1): 70 | for a in xrange(self.num_actions): 71 | if -eps < transition_probs[s,a].sum() < eps: 72 | transition_probs[s, a, 0] = 1 73 | self._value_iteration(transition_probs) 74 | -------------------------------------------------------------------------------- /bayesrl/agents/sarsaagent.py: -------------------------------------------------------------------------------- 1 | from agent import Agent 2 | import numpy as np 3 | 4 | class SARSAAgent(Agent): 5 | def __init__(self, learning_rate, epsilon, value=0, **kwargs): 6 | super(SARSAAgent, self).__init__(**kwargs) 7 | self.learning_rate = learning_rate 8 | self.epsilon = epsilon 9 | self.value = value 10 | 11 | self.value_table = np.full((self.num_states, self.num_actions), self.value) 12 | 13 | def reset(self): 14 | super(SARSAAgent, self).reset() 15 | self.value_table.fill(self.value) 16 | 17 | def interact(self, reward, next_state, next_state_is_terminal, idx): 18 | # Handle start of episode. 19 | if reward is None: 20 | # Return random action since there is no information. 21 | next_action = np.random.randint(self.num_actions) 22 | self.last_state = next_state 23 | self.last_action = next_action 24 | return self.last_action 25 | 26 | # Handle completion of episode. 27 | if next_state_is_terminal: 28 | # Proceed as normal. 29 | pass 30 | 31 | # Choose next action according to epsilon-greedy policy. 32 | if np.random.random() < self.epsilon: 33 | next_action = np.random.randint(self.num_actions) 34 | else: 35 | next_action = np.argmax(self.value_table[next_state]) 36 | 37 | # Update value function. 38 | delta = reward + self.discount_factor*self.value_table[next_state, next_action] - \ 39 | self.value_table[self.last_state, self.last_action] 40 | self.value_table[self.last_state, self.last_action] += self.learning_rate(idx) * delta 41 | 42 | self.last_state = next_state 43 | self.last_action = next_action 44 | 45 | return self.last_action 46 | -------------------------------------------------------------------------------- /bayesrl/agents/thompsonsampagent.py: -------------------------------------------------------------------------------- 1 | from modelbasedagent import ModelBasedAgent 2 | import numpy as np 3 | 4 | class ThompsonSampAgent(ModelBasedAgent): 5 | def __init__(self, dirichlet_param, reward_param, **kwargs): 6 | super(ThompsonSampAgent, self).__init__(**kwargs) 7 | self.dirichlet_param = dirichlet_param 8 | self.reward_param = reward_param 9 | 10 | self.reward = np.full((self.num_states, self.num_actions, self.num_states), self.reward_param) 11 | 12 | def reset(self): 13 | super(ThompsonSampAgent, self).reset() 14 | self.reward.fill(self.reward_param) 15 | 16 | def interact(self, reward, next_state, next_state_is_terminal, idx): 17 | # Handle start of episode. 18 | if reward is None: 19 | # Return random action since there is no information. 20 | next_action = np.random.randint(self.num_actions) 21 | self.last_state = next_state 22 | self.last_action = next_action 23 | return self.last_action 24 | 25 | # Handle completion of episode. 26 | if next_state_is_terminal: 27 | # Proceed as normal. 28 | pass 29 | 30 | # Update the reward associated with (s,a,s') if first time. 
31 | if self.reward[self.last_state, self.last_action, next_state] == self.reward_param: 32 | self.reward[self.last_state, self.last_action, next_state] = reward 33 | 34 | # Update set of states reached by playing a. 35 | self.transition_observations[self.last_state, self.last_action, next_state] += 1 36 | 37 | # Update transition probabilities after every T steps 38 | if self.policy_step == self.T: 39 | self.__compute_policy() 40 | 41 | # Choose next action according to policy. 42 | next_action = self._argmax_breaking_ties_randomly(self.value_table[next_state]) 43 | 44 | self.policy_step += 1 45 | self.last_state = next_state 46 | self.last_action = next_action 47 | 48 | return self.last_action 49 | 50 | def __compute_policy(self): 51 | """Compute an optimal T-step policy for the current state.""" 52 | self.policy_step = 0 53 | transition_probs = np.zeros((self.num_states, self.num_actions, self.num_states)) 54 | for s in xrange(self.num_states): 55 | for a in xrange(self.num_actions): 56 | transition_probs[s,a] = np.random.dirichlet(self.transition_observations[s,a] +\ 57 | self.dirichlet_param, size=1) 58 | self._value_iteration(transition_probs) 59 | -------------------------------------------------------------------------------- /bayesrl/agents/thompsonsampagent_pomdp.py: -------------------------------------------------------------------------------- 1 | from thompsonsampagent import ThompsonSampAgent 2 | import numpy as np 3 | 4 | class ThompsonSampAgentPOMDP(ThompsonSampAgent): 5 | def __init__(self, observation_model, dirichlet_param, reward_param, **kwargs): 6 | super(ThompsonSampAgentPOMDP, self).__init__(dirichlet_param, reward_param, **kwargs) 7 | self.observation_model = observation_model 8 | self.reset_belief() 9 | self.__compute_policy() 10 | 11 | def reset_belief(self): 12 | self.belief = np.array([1./self.num_states for _ in range(self.num_states)]) 13 | 14 | def reset(self): 15 | super(ThompsonSampAgentPOMDP, self).reset() 16 | self.reset_belief() 17 | 18 | def interact(self, reward, observation, next_state_is_terminal, idx): 19 | # Handle start of episode. 20 | if reward is None: 21 | # Return random action since there is no information. 22 | next_action = np.random.randint(self.num_actions) 23 | self.last_action = next_action 24 | self.__observe(observation) 25 | return self.last_action 26 | 27 | # Handle completion of episode. 28 | if next_state_is_terminal: 29 | # Proceed as normal. 30 | pass 31 | 32 | for last_state,next_state in [(s,s_) for s in range(self.num_states) for s_ in range(self.num_states)]: 33 | tp = self.belief[last_state]*self.transition_probs[last_state,self.last_action,next_state] 34 | # Update the reward associated with (s,a,s') if first time. 35 | #if self.reward[last_state, self.last_action, next_state] == self.reward_param: 36 | self.reward[last_state, self.last_action, next_state] *= (1-tp) 37 | self.reward[last_state, self.last_action, next_state] += reward*tp 38 | 39 | # Update set of states reached by playing a. 40 | self.transition_observations[last_state, self.last_action, next_state] += tp 41 | 42 | # Update transition probabilities after every T steps 43 | if self.policy_step == self.T: 44 | self.__compute_policy() 45 | 46 | self.__update_belief(self.last_action,observation) 47 | # Choose next action according to policy. 
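        # This is a QMDP-style approximation: the Q-values computed for the sampled
        # MDP are averaged under the current belief, and the agent picks the action
        # with the highest belief-weighted Q-value. (A known limitation of QMDP-style
        # action selection is that it does not value purely information-gathering
        # actions.)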
48 | value_table = sum(self.belief[s]*self.value_table[s] for s in range(self.num_states)) 49 | next_action = self._argmax_breaking_ties_randomly(value_table) 50 | 51 | self.policy_step += 1 52 | self.last_action = next_action 53 | 54 | return self.last_action 55 | 56 | def __compute_policy(self): 57 | """Compute an optimal T-step policy for the current state.""" 58 | self.policy_step = 0 59 | self.transition_probs = np.zeros((self.num_states, self.num_actions, self.num_states)) 60 | for s in xrange(self.num_states): 61 | for a in xrange(self.num_actions): 62 | self.transition_probs[s,a] = np.random.dirichlet(self.transition_observations[s,a] +\ 63 | self.dirichlet_param, size=1) 64 | self._value_iteration(self.transition_probs) 65 | 66 | def __update_belief(self,action,observation): 67 | self.__transition(action) 68 | self.__observe(observation) 69 | 70 | def __transition(self,action): 71 | for s in range(self.num_states): 72 | self.belief[s] = sum(self.transition_probs[s_,action,s]*self.belief[s_] for s_ in range(self.num_states)) 73 | 74 | def __observe(self,observation): 75 | self.belief = [self.belief[s]*self.observation_model[s][observation] for s in range(self.num_states)] 76 | Z = sum(self.belief) 77 | self.belief = np.array(self.belief)/float(Z) 78 | -------------------------------------------------------------------------------- /bayesrl/environments/__init__.py: -------------------------------------------------------------------------------- 1 | from .gridworld import GridWorld 2 | from .chainworld import ChainWorld 3 | 4 | __all__ = ['GridWorld', 'ChainWorld'] 5 | -------------------------------------------------------------------------------- /bayesrl/environments/chainworld.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from ..utils import check_random_state 3 | 4 | class ChainWorld(object): 5 | def __init__(self, left_length, left_reward, right_length, right_reward, on_chain_reward, p_return_to_start, random_state=None): 6 | self.left_length = left_length 7 | self.left_reward = left_reward 8 | self.right_length = right_length 9 | self.right_reward = right_reward 10 | self.on_chain_reward = on_chain_reward 11 | self.p_return_to_start = p_return_to_start 12 | self.num_states = self.left_length + self.right_length + 1 13 | self.num_actions = 2 14 | self.random_state = check_random_state(random_state) 15 | self.reset() 16 | 17 | def reset(self): 18 | self.state = self.left_length 19 | 20 | def observe(self): 21 | return self.state 22 | 23 | def is_terminal(self, state): 24 | return state == 0 or state == self.num_states - 1 25 | 26 | def perform_action(self, action): 27 | if self.p_return_to_start and self.random_state.rand() < self.p_return_to_start: 28 | self.reset() 29 | elif action == 0: 30 | self.state -= 1 31 | else: 32 | self.state += 1 33 | 34 | if self.state == 0: 35 | reward = self.left_reward 36 | elif self.state == self.num_states - 1: 37 | reward = self.right_reward 38 | else: 39 | reward = self.on_chain_reward 40 | return self.observe(), reward 41 | 42 | def get_max_reward(self): 43 | return max(self.left_reward, self.right_reward) 44 | -------------------------------------------------------------------------------- /bayesrl/environments/gridworld.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from ..utils import check_random_state 3 | 4 | 5 | # Maze state is represented as a 2-element NumPy array: (Y, X). Increasing Y is South. 
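# For example, position np.array([2, 3]) is row 2, column 3 (zero-indexed from the
# top-left corner); applying the 'N' action defined below adds (-1, 0) and moves the
# agent one row up (north).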
6 | 7 | # Possible actions, expressed as (delta-y, delta-x). 8 | maze_actions = { 9 | 'N': np.array([-1, 0]), 10 | 'S': np.array([1, 0]), 11 | 'E': np.array([0, 1]), 12 | 'W': np.array([0, -1]), 13 | } 14 | 15 | def parse_topology(topology): 16 | return np.array([list(row) for row in topology]) 17 | 18 | 19 | class Maze(object): 20 | """ 21 | Simple wrapper around a NumPy 2D array to handle flattened indexing and staying in bounds. 22 | """ 23 | def __init__(self, topology): 24 | self.topology = parse_topology(topology) 25 | self.flat_topology = self.topology.ravel() 26 | self.shape = self.topology.shape 27 | 28 | def in_bounds_flat(self, position): 29 | return 0 <= position < np.product(self.shape) 30 | 31 | def in_bounds_unflat(self, position): 32 | return 0 <= position[0] < self.shape[0] and 0 <= position[1] < self.shape[1] 33 | 34 | def get_flat(self, position): 35 | if not self.in_bounds_flat(position): 36 | raise IndexError("Position out of bounds: {}".format(position)) 37 | return self.flat_topology[position] 38 | 39 | def get_unflat(self, position): 40 | if not self.in_bounds_unflat(position): 41 | raise IndexError("Position out of bounds: {}".format(position)) 42 | return self.topology[tuple(position)] 43 | 44 | def flatten_index(self, index_tuple): 45 | return np.ravel_multi_index(index_tuple, self.shape) 46 | 47 | def unflatten_index(self, flattened_index): 48 | return np.unravel_index(flattened_index, self.shape) 49 | 50 | def flat_positions_containing(self, x): 51 | return list(np.nonzero(self.flat_topology == x)[0]) 52 | 53 | def flat_positions_not_containing(self, x): 54 | return list(np.nonzero(self.flat_topology != x)[0]) 55 | 56 | def __str__(self): 57 | return '\n'.join(''.join(row) for row in self.topology.tolist()) 58 | 59 | def __repr__(self): 60 | return 'Maze({})'.format(repr(self.topology.tolist())) 61 | 62 | 63 | def move_avoiding_walls(maze, position, action): 64 | """ 65 | Return the new position after moving, and the event that happened ('hit-wall' or 'moved'). 66 | 67 | Works with the position and action as a (row, column) array. 68 | """ 69 | # Compute new position 70 | new_position = position + action 71 | 72 | # Compute collisions with walls, including implicit walls at the ends of the world. 73 | if not maze.in_bounds_unflat(new_position) or maze.get_unflat(new_position) == '#': 74 | return position, 'hit-wall' 75 | 76 | return new_position, 'moved' 77 | 78 | 79 | 80 | class GridWorld(object): 81 | """ 82 | A simple task in a maze: get to the goal. 83 | 84 | Parameters 85 | ---------- 86 | 87 | maze : list of strings or lists 88 | maze topology (see below) 89 | 90 | rewards: dict of string to number. default: {'*': 10}. 91 | Rewards obtained by being in a maze grid with the specified contents, 92 | or experiencing the specified event (either 'hit-wall' or 'moved'). The 93 | contributions of content reward and event reward are summed. For 94 | example, you might specify a cost for moving by passing 95 | rewards={'*': 10, 'moved': -1}. 96 | 97 | terminal_markers: sequence of chars, default '*' 98 | A grid cell containing any of these markers will be considered a 99 | "terminal" state. 100 | 101 | action_error_prob: float 102 | With this probability, the requested action is ignored and a random 103 | action is chosen instead. 104 | 105 | random_state: None, int, or RandomState object 106 | For repeatable experiments, you can pass a random state here. 
See 107 | http://scikit-learn.org/stable/modules/generated/sklearn.utils.check_random_state.html 108 | 109 | Notes 110 | ----- 111 | 112 | Maze topology is expressed textually. Key: 113 | '#': wall 114 | '.': open (really, anything that's not '#') 115 | '*': goal 116 | 'o': origin 117 | """ 118 | 119 | def __init__(self, maze, rewards={'*': 10}, terminal_markers='*', action_error_prob=0, random_state=None, directions="NSEW"): 120 | 121 | self.maze = Maze(maze) if not isinstance(maze, Maze) else maze 122 | self.rewards = rewards 123 | self.terminal_markers = terminal_markers 124 | self.action_error_prob = action_error_prob 125 | self.random_state = check_random_state(random_state) 126 | 127 | self.actions = [maze_actions[direction] for direction in directions] 128 | self.num_actions = len(self.actions) 129 | self.state = None 130 | self.reset() 131 | self.num_states = self.maze.shape[0] * self.maze.shape[1] 132 | 133 | def __repr__(self): 134 | return 'GridWorld(maze={maze!r}, rewards={rewards}, terminal_markers={terminal_markers}, action_error_prob={action_error_prob})'.format(**self.__dict__) 135 | 136 | def reset(self): 137 | """ 138 | Reset the position to a starting position (an 'o'), chosen at random. 139 | """ 140 | options = self.maze.flat_positions_containing('o') 141 | self.state = options[self.random_state.choice(len(options))] 142 | 143 | def is_terminal(self, state): 144 | """Check if the given state is a terminal state.""" 145 | return self.maze.get_flat(state) in self.terminal_markers 146 | 147 | def observe(self): 148 | """ 149 | Return the current state as an integer. 150 | 151 | The state is the index into the flattened maze. 152 | """ 153 | return self.state 154 | 155 | def perform_action(self, action_idx): 156 | """Perform an action (specified by index), yielding a new state and reward.""" 157 | # In the absorbing end state, nothing does anything. 158 | if self.is_terminal(self.state): 159 | return self.observe(), 0 160 | 161 | if self.action_error_prob and self.random_state.rand() < self.action_error_prob: 162 | action_idx = self.random_state.choice(self.num_actions) 163 | action = self.actions[action_idx] 164 | new_state_tuple, result = move_avoiding_walls(self.maze, self.maze.unflatten_index(self.state), action) 165 | self.state = self.maze.flatten_index(new_state_tuple) 166 | 167 | reward = self.rewards.get(self.maze.get_flat(self.state), 0) + self.rewards.get(result, 0) 168 | return self.observe(), reward 169 | 170 | def as_mdp(self): 171 | transition_probabilities = np.zeros((self.num_states, self.num_actions, self.num_states)) 172 | rewards = np.zeros((self.num_states, self.num_actions, self.num_states)) 173 | action_rewards = np.zeros((self.num_states, self.num_actions)) 174 | destination_rewards = np.zeros(self.num_states) 175 | 176 | for state in range(self.num_states): 177 | destination_rewards[state] = self.rewards.get(self.maze.get_flat(state), 0) 178 | 179 | is_terminal_state = np.zeros(self.num_states, dtype=np.bool) 180 | 181 | for state in range(self.num_states): 182 | if self.is_terminal(state): 183 | is_terminal_state[state] = True 184 | transition_probabilities[state, :, state] = 1. 185 | else: 186 | for action in range(self.num_actions): 187 | new_state_tuple, result = move_avoiding_walls(self.maze, self.maze.unflatten_index(state), self.actions[action]) 188 | new_state = self.maze.flatten_index(new_state_tuple) 189 | transition_probabilities[state, action, new_state] = 1. 
190 | action_rewards[state, action] = self.rewards.get(result, 0) 191 | 192 | # Now account for action noise. 193 | transitions_given_random_action = transition_probabilities.mean(axis=1, keepdims=True) 194 | transition_probabilities *= (1 - self.action_error_prob) 195 | transition_probabilities += self.action_error_prob * transitions_given_random_action 196 | 197 | rewards_given_random_action = action_rewards.mean(axis=1, keepdims=True) 198 | action_rewards = (1 - self.action_error_prob) * action_rewards + self.action_error_prob * rewards_given_random_action 199 | rewards = action_rewards[:, :, None] + destination_rewards[None, None, :] 200 | rewards[is_terminal_state] = 0 201 | 202 | return transition_probabilities, rewards 203 | 204 | def get_max_reward(self): 205 | transition_probabilities, rewards = self.as_mdp() 206 | return rewards.max() 207 | 208 | ### Old API, where terminal states were None. 209 | 210 | def observe_old(self): 211 | return None if self.is_terminal(self.state) else self.state 212 | 213 | def perform_action_old(self, action_idx): 214 | new_state, reward = self.perform_action(action_idx) 215 | if self.is_terminal(new_state): 216 | return None, reward 217 | else: 218 | return new_state, reward 219 | 220 | 221 | samples = { 222 | 'trivial': [ 223 | '###', 224 | '#o#', 225 | '#.#', 226 | '#*#', 227 | '###'], 228 | 229 | 'larger': [ 230 | '#########', 231 | '#..#....#', 232 | '#..#..#.#', 233 | '#..#..#.#', 234 | '#..#.##.#', 235 | '#....*#.#', 236 | '#######.#', 237 | '#o......#', 238 | '#########'] 239 | } 240 | 241 | 242 | def construct_cliff_task(width, height, goal_reward=50, move_reward=-1, cliff_reward=-100, **kw): 243 | """ 244 | Construct a 'cliff' task, a GridWorld with a "cliff" between the start and 245 | goal. Falling off the cliff gives a large negative reward and ends the 246 | episode. 247 | 248 | Any other parameters, like action_error_prob, are passed on to the 249 | GridWorld constructor. 250 | """ 251 | 252 | maze = ['.' * width] * (height - 1) # middle empty region 253 | maze.append('o' + 'X' * (width - 2) + '*') # bottom goal row 254 | 255 | rewards = { 256 | '*': goal_reward, 257 | 'moved': move_reward, 258 | 'hit-wall': move_reward, 259 | 'X': cliff_reward 260 | } 261 | 262 | return GridWorld(maze, rewards=rewards, terminal_markers='*X', **kw) 263 | -------------------------------------------------------------------------------- /bayesrl/environments/pomdpgw.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from ..utils import check_random_state 3 | 4 | 5 | # Maze state is represented as a 2-element NumPy array: (Y, X). Increasing Y is South. 6 | 7 | # Possible actions, expressed as (delta-y, delta-x). 8 | maze_actions = { 9 | 'N': np.array([-1, 0]), 10 | 'S': np.array([1, 0]), 11 | 'E': np.array([0, 1]), 12 | 'W': np.array([0, -1]), 13 | } 14 | 15 | def parse_topology(topology): 16 | return np.array([list(row) for row in topology]) 17 | 18 | 19 | class Maze(object): 20 | """ 21 | Simple wrapper around a NumPy 2D array to handle flattened indexing and staying in bounds. 
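    On top of the plain maze wrapper, this variant adds a noisy observation model
    over neighboring-wall configurations: with probability true_obs_prob the
    observation generated from the agent's actual neighboring walls is returned,
    and otherwise one of the remaining observations is returned uniformly at
    random (see obs_distribution and observation below).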
22 | """ 23 | def __init__(self, topology, true_obs_prob=.8, easy_obs_model=True): 24 | self.topology = parse_topology(topology) 25 | self.flat_topology = self.topology.ravel() 26 | self.shape = self.topology.shape 27 | self.true_obs_prob = true_obs_prob 28 | self.easy_obs_model = easy_obs_model 29 | #If the observation model is easy, the agent can observe which directions have walls 30 | #If the observation model is not easy, the agent only observes how many of its four neighbors are walls. 31 | self.num_observations = 16 if easy_obs_model else 5 32 | 33 | def in_bounds_flat(self, position): 34 | return 0 <= position < np.product(self.shape) 35 | 36 | def in_bounds_unflat(self, position): 37 | return 0 <= position[0] < self.shape[0] and 0 <= position[1] < self.shape[1] 38 | 39 | def get_flat(self, position): 40 | if not self.in_bounds_flat(position): 41 | raise IndexError("Position out of bounds: {}".format(position)) 42 | return self.flat_topology[position] 43 | 44 | def get_unflat(self, position): 45 | if not self.in_bounds_unflat(position): 46 | raise IndexError("Position out of bounds: {}".format(position)) 47 | return self.topology[tuple(position)] 48 | 49 | def flatten_index(self, index_tuple): 50 | return np.ravel_multi_index(index_tuple, self.shape) 51 | 52 | def unflatten_index(self, flattened_index): 53 | return np.unravel_index(flattened_index, self.shape) 54 | 55 | def flat_positions_containing(self, x): 56 | return list(np.nonzero(self.flat_topology == x)[0]) 57 | 58 | def flat_positions_not_containing(self, x): 59 | return list(np.nonzero(self.flat_topology != x)[0]) 60 | 61 | def get_inbound_index(self, index_tuple): 62 | x = min(max(index_tuple[0],0),self.shape[0]-1) 63 | y = min(max(index_tuple[1],0),self.shape[1]-1) 64 | return x, y 65 | 66 | def true_observation(self, index_tuple): 67 | it = index_tuple 68 | if type(it) == np.int64: 69 | it = self.unflatten_index(it) 70 | neighbors = [(it[0]+1,it[1]), 71 | (it[0]-1,it[1]), 72 | (it[0],it[1]+1), 73 | (it[0],it[1]-1)] 74 | neighbors = [n for n in neighbors if self.in_bounds_unflat(n)] 75 | if_wall = [self.get_unflat(n)=='#' for n in neighbors] 76 | if self.easy_obs_model: 77 | obs = sum(if_wall) 78 | else: 79 | obs = sum(np.array([8,4,2,1])*if_wall) 80 | return obs 81 | 82 | def obs_distribution(self, index_tuple): 83 | if type(index_tuple) == int: 84 | index_tuple = self.unflatten_index(index_tuple) 85 | other_obs_prob = (1-self.true_obs_prob)/(self.num_observations-1) 86 | obs_distribution = [other_obs_prob] * self.num_observations 87 | true_obs = self.true_observation(index_tuple) 88 | obs_distribution[true_obs] = self.true_obs_prob 89 | return obs_distribution 90 | 91 | def get_all_obs_distribution(self): 92 | return [self.obs_distribution((x,y)) for x in range(self.shape[0]) for y in range(self.shape[1])] 93 | 94 | def observation(self, index_tuple): 95 | if type(index_tuple) == int: 96 | index_tuple = self.unflatten_index(index_tuple) 97 | obs_distribution = self.obs_distribution(index_tuple) 98 | obs = np.random.multinomial(1, obs_distribution) 99 | return obs.tolist().index(1) 100 | 101 | def __str__(self): 102 | return '\n'.join(''.join(row) for row in self.topology.tolist()) 103 | 104 | def __repr__(self): 105 | return 'Maze({})'.format(repr(self.topology.tolist())) 106 | 107 | 108 | def move_avoiding_walls(maze, position, action): 109 | """ 110 | Return the new position after moving, and the event that happened ('hit-wall' or 'moved'). 111 | 112 | Works with the position and action as a (row, column) array. 
113 | """ 114 | # Compute new position 115 | new_position = position + action 116 | 117 | # Compute collisions with walls, including implicit walls at the ends of the world. 118 | if not maze.in_bounds_unflat(new_position) or maze.get_unflat(new_position) == '#': 119 | return position, 'hit-wall' 120 | 121 | return new_position, 'moved' 122 | 123 | 124 | 125 | class GridWorld(object): 126 | """ 127 | A simple task in a maze: get to the goal. 128 | 129 | Parameters 130 | ---------- 131 | 132 | maze : list of strings or lists 133 | maze topology (see below) 134 | 135 | rewards: dict of string to number. default: {'*': 10}. 136 | Rewards obtained by being in a maze grid with the specified contents, 137 | or experiencing the specified event (either 'hit-wall' or 'moved'). The 138 | contributions of content reward and event reward are summed. For 139 | example, you might specify a cost for moving by passing 140 | rewards={'*': 10, 'moved': -1}. 141 | 142 | terminal_markers: sequence of chars, default '*' 143 | A grid cell containing any of these markers will be considered a 144 | "terminal" state. 145 | 146 | action_error_prob: float 147 | With this probability, the requested action is ignored and a random 148 | action is chosen instead. 149 | 150 | random_state: None, int, or RandomState object 151 | For repeatable experiments, you can pass a random state here. See 152 | http://scikit-learn.org/stable/modules/generated/sklearn.utils.check_random_state.html 153 | 154 | Notes 155 | ----- 156 | 157 | Maze topology is expressed textually. Key: 158 | '#': wall 159 | '.': open (really, anything that's not '#') 160 | '*': goal 161 | 'o': origin 162 | """ 163 | 164 | def __init__(self, maze, rewards={'*': 10}, terminal_markers='*', 165 | action_error_prob=0, random_state=None, directions="NSEW", pomdp=False): 166 | 167 | self.maze = Maze(maze) if not isinstance(maze, Maze) else maze 168 | self.rewards = rewards 169 | self.terminal_markers = terminal_markers 170 | self.action_error_prob = action_error_prob 171 | self.random_state = check_random_state(random_state) 172 | 173 | self.actions = [maze_actions[direction] for direction in directions] 174 | self.num_actions = len(self.actions) 175 | self.state = None 176 | self.reset() 177 | self.num_states = self.maze.shape[0] * self.maze.shape[1] 178 | self.pomdp = pomdp 179 | 180 | def __repr__(self): 181 | return 'GridWorld(maze={maze!r}, rewards={rewards}, terminal_markers={terminal_markers}, action_error_prob={action_error_prob})'.format(**self.__dict__) 182 | 183 | def reset(self): 184 | """ 185 | Reset the position to a starting position (an 'o'), chosen at random. 186 | """ 187 | options = self.maze.flat_positions_containing('o') 188 | self.state = options[self.random_state.choice(len(options))] 189 | 190 | def is_terminal(self, state): 191 | """Check if the given state is a terminal state.""" 192 | return self.maze.get_flat(state) in self.terminal_markers 193 | 194 | def observe(self): 195 | """ 196 | Return the current state as an integer. 197 | 198 | The state is the index into the flattened maze. 199 | """ 200 | o = self.maze.observation(self.state) if self.pomdp else self.state 201 | return o 202 | 203 | def perform_action(self, action_idx): 204 | """Perform an action (specified by index), yielding a new state and reward.""" 205 | # In the absorbing end state, nothing does anything. 
206 | if self.is_terminal(self.state): 207 | return self.observe(), 0 208 | 209 | if self.action_error_prob and self.random_state.rand() < self.action_error_prob: 210 | action_idx = self.random_state.choice(self.num_actions) 211 | action = self.actions[action_idx] 212 | new_state_tuple, result = move_avoiding_walls(self.maze, self.maze.unflatten_index(self.state), action) 213 | self.state = self.maze.flatten_index(new_state_tuple) 214 | 215 | reward = self.rewards.get(self.maze.get_flat(self.state), 0) + self.rewards.get(result, 0) 216 | return self.observe(), reward 217 | 218 | def as_mdp(self): 219 | transition_probabilities = np.zeros((self.num_states, self.num_actions, self.num_states)) 220 | rewards = np.zeros((self.num_states, self.num_actions, self.num_states)) 221 | action_rewards = np.zeros((self.num_states, self.num_actions)) 222 | destination_rewards = np.zeros(self.num_states) 223 | 224 | for state in range(self.num_states): 225 | destination_rewards[state] = self.rewards.get(self.maze.get_flat(state), 0) 226 | 227 | is_terminal_state = np.zeros(self.num_states, dtype=np.bool) 228 | 229 | for state in range(self.num_states): 230 | if self.is_terminal(state): 231 | is_terminal_state[state] = True 232 | transition_probabilities[state, :, state] = 1. 233 | else: 234 | for action in range(self.num_actions): 235 | new_state_tuple, result = move_avoiding_walls(self.maze, self.maze.unflatten_index(state), self.actions[action]) 236 | new_state = self.maze.flatten_index(new_state_tuple) 237 | transition_probabilities[state, action, new_state] = 1. 238 | action_rewards[state, action] = self.rewards.get(result, 0) 239 | 240 | # Now account for action noise. 241 | transitions_given_random_action = transition_probabilities.mean(axis=1, keepdims=True) 242 | transition_probabilities *= (1 - self.action_error_prob) 243 | transition_probabilities += self.action_error_prob * transitions_given_random_action 244 | 245 | rewards_given_random_action = action_rewards.mean(axis=1, keepdims=True) 246 | action_rewards = (1 - self.action_error_prob) * action_rewards + self.action_error_prob * rewards_given_random_action 247 | rewards = action_rewards[:, :, None] + destination_rewards[None, None, :] 248 | rewards[is_terminal_state] = 0 249 | 250 | return transition_probabilities, rewards 251 | 252 | def get_max_reward(self): 253 | transition_probabilities, rewards = self.as_mdp() 254 | return rewards.max() 255 | 256 | ### Old API, where terminal states were None. 257 | 258 | def observe_old(self): 259 | return None if self.is_terminal(self.state) else self.state 260 | 261 | def perform_action_old(self, action_idx): 262 | new_state, reward = self.perform_action(action_idx) 263 | if self.is_terminal(new_state): 264 | return None, reward 265 | else: 266 | return new_state, reward 267 | 268 | 269 | samples = { 270 | 'trivial': [ 271 | '###', 272 | '#o#', 273 | '#.#', 274 | '#*#', 275 | '###'], 276 | 277 | 'larger': [ 278 | '#########', 279 | '#..#....#', 280 | '#..#..#.#', 281 | '#..#..#.#', 282 | '#..#.##.#', 283 | '#....*#.#', 284 | '#######.#', 285 | '#o......#', 286 | '#########'] 287 | } 288 | 289 | 290 | 291 | 292 | def construct_cliff_task(width, height, goal_reward=50, move_reward=-1, cliff_reward=-100, **kw): 293 | """ 294 | Construct a 'cliff' task, a GridWorld with a "cliff" between the start and 295 | goal. Falling off the cliff gives a large negative reward and ends the 296 | episode. 297 | 298 | Any other parameters, like action_error_prob, are passed on to the 299 | GridWorld constructor. 
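    For example (illustrative arguments, not a canonical configuration),
    construct_cliff_task(width=6, height=4, action_error_prob=0.1) builds a maze
    whose bottom row is 'oXXXX*': the agent starts at 'o', stepping onto any 'X'
    ends the episode with cliff_reward, and '*' is the goal.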
300 | """ 301 | 302 | maze = ['.' * width] * (height - 1) # middle empty region 303 | maze.append('o' + 'X' * (width - 2) + '*') # bottom goal row 304 | 305 | rewards = { 306 | '*': goal_reward, 307 | 'moved': move_reward, 308 | 'hit-wall': move_reward, 309 | 'X': cliff_reward 310 | } 311 | 312 | return GridWorld(maze, rewards=rewards, terminal_markers='*X', **kw) 313 | -------------------------------------------------------------------------------- /bayesrl/plot.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | 4 | class Plot(object): 5 | """ 6 | Wrapper class for collecting all trials to use in visualization methods. 7 | 8 | Parameters 9 | ---------- 10 | dict_trial: dictionary of lists, where each object in the list is a Trial 11 | Key is the name of the learner, and value is a list of trials 12 | for that learner using different parameter settings. 13 | """ 14 | def __init__(self, dict_trial): 15 | self.dict_trial = dict_trial 16 | 17 | self.colors = ['r', 'b', 'g', 'm', 'c', 'y'] 18 | self.line_type = ['-', '--', '-.'] 19 | 20 | def cum_rewards_by_iteration(self): 21 | """ 22 | Plot B. 23 | y-axis: Sum of all rewards. 24 | x-axis: Iteration of trial(s). 25 | """ 26 | self.__rewards_by_idx("cum", "iters") 27 | 28 | def rewards_by_episode(self): 29 | """ 30 | Plot C. 31 | y-axis: Immediate reward. 32 | x-axis: Episode of trial(s). 33 | """ 34 | self.__rewards_by_idx("imm", "epi") 35 | 36 | def cum_rewards_by_prob_start(self): 37 | """ 38 | Plot F. 39 | y-axis: Sum of all the rewards. 40 | x-axis: Pr(return to start). 41 | """ 42 | self.__rewards_by_prob_start("cum") 43 | 44 | def end_rewards_by_prob_start(self): 45 | """ 46 | Plot G. 47 | y-axis: Sum of all the rewards in the last 100 iterations. 48 | x-axis: Pr(return to start). 49 | """ 50 | self.__rewards_by_prob_start("end") 51 | 52 | def cum_rewards_by_act_err_prob(self): 53 | """ 54 | Plot I. 55 | y-axis: Sum of all the rewards. 56 | x-axis: Action-error probability. 57 | """ 58 | self.__rewards_by_act_err_prob("cum") 59 | 60 | def end_rewards_by_act_err_prob(self): 61 | """ 62 | Plot J. 63 | y-axis: Sum of all the rewards in the last 100 iterations. 64 | x-axis: Action-error probability. 
65 | """ 66 | self.__rewards_by_act_err_prob("end") 67 | 68 | def __rewards_by_idx(self, reward_type, idx_type): 69 | """ 70 | reward_type: "cum" or "imm" 71 | idx_type: "iters" or "epi" 72 | """ 73 | i = 0 74 | for key,value in self.dict_trial.items(): 75 | color = self.colors[i] 76 | j = 0 77 | for trial in value: 78 | line_type = self.line_type[j] 79 | if reward_type == "cum" and idx_type == "iters": 80 | array = trial.array_rewards_by_iteration.cumsum(axis=1) 81 | elif reward_type == "imm" and idx_type == "epi": 82 | array = trial.array_rewards_by_episode 83 | else: 84 | raise Exception("Arguments not specified correctly.") 85 | x = np.arange(array.shape[1]) 86 | mean = array.mean(axis=0) 87 | if j == 0: 88 | plt.plot(x, mean, color+line_type, label=key) 89 | else: 90 | plt.plot(x, mean, color+line_type) 91 | j += 1 92 | i += 1 93 | if reward_type == "cum" and idx_type == "iters": 94 | plt.title("Cumulative reward by iteration") 95 | plt.ylabel("Cumulative reward") 96 | plt.xlabel("Iteration") 97 | plt.legend(loc=2) 98 | elif reward_type == "imm" and idx_type == "epi": 99 | plt.title("Immediate reward by episode") 100 | plt.ylabel("Immediate reward") 101 | plt.xlabel("Episode") 102 | plt.legend(loc=4) 103 | plt.show() 104 | 105 | def __rewards_by_prob_start(self, reward_type): 106 | """ 107 | reward_type: "cum" or "end" 108 | """ 109 | i = 0 110 | for key,value in self.dict_trial.items(): 111 | color = self.colors[i] 112 | x = np.arange(0, 1, 0.1) 113 | means = np.zeros(len(value)) 114 | j = 0 115 | for trial in value: 116 | if reward_type == "cum": 117 | array = trial.array_rewards_by_iteration.sum(axis=1) 118 | elif reward_type == "end": 119 | array = trial.array_rewards_by_iteration[:,-100:].sum(axis=1) 120 | else: 121 | raise Exception("Arguments not specified correctly.") 122 | means[j] = array.mean(axis=0) 123 | j += 1 124 | plt.plot(x, means, color, label=key) 125 | i += 1 126 | if reward_type == "cum": 127 | plt.title("Cumulative reward by prob(return_start)") 128 | plt.ylabel("Cumulative reward") 129 | elif reward_type == "end": 130 | plt.title("End reward by prob(return_start)") 131 | plt.ylabel("End reward (sum of last 100 iterations)") 132 | plt.xlabel("Prob(return_start)") 133 | plt.legend() 134 | plt.show() 135 | 136 | def __rewards_by_act_err_prob(self, reward_type): 137 | """ 138 | reward_type: "cum" or "end" 139 | """ 140 | i = 0 141 | for key,value in self.dict_trial.items(): 142 | color = self.colors[i] 143 | j = 0 144 | for trial_list in value: 145 | line_width = np.linspace(0.5, 3, endpoint=True, num=len(value))[j] 146 | x = np.arange(0, 0.55, 0.05) 147 | means = np.zeros(len(trial_list)) 148 | for k,trial in enumerate(trial_list): 149 | if reward_type == "cum": 150 | array = trial.array_rewards_by_iteration.sum(axis=1) 151 | elif reward_type == "end": 152 | array = trial.array_rewards_by_iteration[:,-100:].sum(axis=1) 153 | else: 154 | raise Exception("Arguments not specified correctly.") 155 | means[k] = array.mean(axis=0) 156 | if line_width == 1: 157 | plt.plot(x, means, color, linewidth=line_width, label=key) 158 | else: 159 | plt.plot(x, means, color, linewidth=line_width) 160 | j += 1 161 | i += 1 162 | if reward_type == "cum": 163 | plt.title("Cumulative reward by action-error probability (thicker=larger epsilon)") 164 | plt.ylabel("Cumulative reward") 165 | elif reward_type == "end": 166 | plt.title("End reward by action-error probability (thicker=larger epsilon)") 167 | plt.ylabel("End reward") 168 | plt.xlabel("Action-error probability") 169 | 
plt.legend() 170 | plt.show() 171 | -------------------------------------------------------------------------------- /bayesrl/trial.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Trial(object): 4 | """ 5 | Class for running trial(s) for a given agent and task. 6 | 7 | Parameters 8 | ---------- 9 | agent: Agent 10 | task: Task 11 | MIN_ITERATIONS: int 12 | The minimum number of iterations for a trial. 13 | MIN_EPISODES: int 14 | The minimum number of episodes for a trial. 15 | MAX_EPISODE_ITERATION: int 16 | The maximum number of iterations for each episode. 17 | """ 18 | def __init__(self, agent, task, MIN_ITERATIONS=5000, MIN_EPISODES=100, MAX_EPISODE_ITERATION=1000): 19 | self.agent = agent 20 | self.task = task 21 | self.MIN_ITERATIONS = MIN_ITERATIONS 22 | self.MIN_EPISODES = MIN_EPISODES 23 | self.MAX_EPISODE_ITERATION = MAX_EPISODE_ITERATION 24 | 25 | self.array_rewards_by_episode = None 26 | self.array_rewards_by_iteration = None 27 | 28 | def run(self): 29 | iteration = episode = 0 30 | rewards_by_iteration = np.zeros(self.MIN_ITERATIONS) 31 | rewards_by_episode = np.zeros(self.MIN_EPISODES) 32 | self.agent.reset() 33 | 34 | while iteration < self.MIN_ITERATIONS or episode < self.MIN_EPISODES: 35 | print "Episode:",episode 36 | # Initialize the episode. 37 | self.task.reset() 38 | #if self.task.pomdp: 39 | # self.agent.reset_belief() 40 | state = self.task.observe() 41 | reward = None 42 | cumulative_reward = 0 43 | episode_iteration = 0 44 | 45 | while episode_iteration < self.MAX_EPISODE_ITERATION: 46 | # Tell the agent what happened and ask for a next action. 47 | action = self.agent.interact(reward, state, self.task.is_terminal(state), iteration) 48 | 49 | if self.task.is_terminal(state): 50 | # End of episode (happens after interaction so agent can learn from final reward). 51 | break 52 | 53 | # Take action A, observe R, S'. 54 | state, reward = self.task.perform_action(action) 55 | 56 | # Log rewards. 57 | if iteration < self.MIN_ITERATIONS: 58 | rewards_by_iteration[iteration] = reward 59 | cumulative_reward += reward 60 | 61 | iteration += 1 62 | episode_iteration += 1 63 | 64 | if episode < self.MIN_EPISODES: 65 | rewards_by_episode[episode] = cumulative_reward 66 | episode += 1 67 | 68 | return rewards_by_iteration, rewards_by_episode 69 | 70 | def run_multiple(self, num_trials): 71 | self.array_rewards_by_episode = np.zeros((num_trials, self.MIN_EPISODES)) 72 | self.array_rewards_by_iteration = np.zeros((num_trials, self.MIN_ITERATIONS)) 73 | for i in xrange(num_trials): 74 | self.array_rewards_by_iteration[i], self.array_rewards_by_episode[i] = self.run() 75 | -------------------------------------------------------------------------------- /bayesrl/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numbers 3 | 4 | def check_random_state(seed): 5 | # From https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/utils/validation.py 6 | """Turn seed into a np.random.RandomState instance 7 | 8 | If seed is None, return the RandomState singleton used by np.random. 9 | If seed is an int, return a new RandomState instance seeded with seed. 10 | If seed is already a RandomState instance, return it. 11 | Otherwise raise ValueError. 
12 | """ 13 | if seed is None or seed is np.random: 14 | return np.random.mtrand._rand 15 | if isinstance(seed, (numbers.Integral, np.integer)): 16 | return np.random.RandomState(seed) 17 | if isinstance(seed, np.random.RandomState): 18 | return seed 19 | raise ValueError('%r cannot be used to seed a numpy.random.RandomState' 20 | ' instance' % seed) 21 | -------------------------------------------------------------------------------- /benchmarks/thompson_gridworld.py: -------------------------------------------------------------------------------- 1 | """ 2 | Solves grid world using three different parameter settings for Thompson 3 | sampling. This empirically shows the convergence of Thompson sampling regardless 4 | of the prior misspecification. 5 | """ 6 | 7 | from bayesrl.environments import GridWorld 8 | from bayesrl.agents.thompsonsampagent import ThompsonSampAgent 9 | from bayesrl.trial import Trial 10 | from bayesrl.plot import Plot 11 | 12 | # Define environment. 13 | task = GridWorld( 14 | GridWorld.samples['larger'], 15 | action_error_prob=.1, 16 | rewards={'*': 50, 'moved': -1, 'hit-wall': -1}) 17 | 18 | num_trials = 1 19 | 20 | ################################################################################ 21 | # Thompson Sampling 22 | ################################################################################ 23 | # Dirichlet params = 1, Reward params = 50 24 | agent = ThompsonSampAgent( 25 | num_states=task.num_states, num_actions=task.num_actions, 26 | discount_factor=0.95, T=50, dirichlet_param=1, reward_param=50) 27 | trial_thompson1 = Trial(agent, task) 28 | trial_thompson1.run_multiple(num_trials) 29 | 30 | # Dirichlet params = 1, Reward params = 10 31 | agent.dirichlet_param = 1 32 | agent.reward_param = 10 33 | trial_thompson2 = Trial(agent, task) 34 | trial_thompson2.run_multiple(num_trials) 35 | 36 | # Dirichlet params = 10, Reward params = 50 37 | agent.dirichlet_param = 10 38 | agent.reward_param = 50 39 | trial_thompson3 = Trial(agent, task) 40 | trial_thompson3.run_multiple(num_trials) 41 | 42 | ################################################################################ 43 | # Plots! 44 | ################################################################################ 45 | plot = Plot({"Thompson sampling": [trial_thompson1, trial_thompson2, trial_thompson3] 46 | }) 47 | # Plot cumulative rewards by iteration 48 | plot.cum_rewards_by_iteration() 49 | # Plot rewards by episode 50 | plot.rewards_by_episode() 51 | -------------------------------------------------------------------------------- /benchmarks/thompson_gridworld_pomdp.py: -------------------------------------------------------------------------------- 1 | """ 2 | Solves grid world using three different parameter settings for Thompson 3 | sampling. This empirically shows the convergence of Thompson sampling regardless 4 | of the prior misspecification. 5 | """ 6 | 7 | from bayesrl.environments import pomdpgw 8 | from bayesrl.agents.thompsonsampagent_pomdp import ThompsonSampAgentPOMDP 9 | from bayesrl.trial import Trial 10 | from bayesrl.plot import Plot 11 | 12 | # Define environment. 
13 | task = pomdpgw.GridWorld( 14 | pomdpgw.GridWorld.samples['larger'], 15 | action_error_prob=.1, 16 | rewards={'*': 50, 'moved': -1, 'hit-wall': -1}, 17 | pomdp=True) 18 | 19 | num_trials = 1 20 | 21 | ################################################################################ 22 | # Thompson Sampling 23 | ################################################################################ 24 | # Dirichlet params = 1, Reward params = 50 25 | agent = ThompsonSampAgentPOMDP(observation_model=task.maze.get_all_obs_distribution(), 26 | num_states=task.num_states, num_actions=task.num_actions, 27 | discount_factor=0.95, T=50, dirichlet_param=1, reward_param=50) 28 | trial_thompson1 = Trial(agent, task, MIN_EPISODES=100) 29 | trial_thompson1.run_multiple(num_trials) 30 | 31 | # # Dirichlet params = 1, Reward params = 10 32 | # agent.dirichlet_param = 1 33 | # agent.reward_param = 10 34 | # trial_thompson2 = Trial(agent, task) 35 | # trial_thompson2.run_multiple(num_trials) 36 | 37 | # # Dirichlet params = 10, Reward params = 50 38 | # agent.dirichlet_param = 10 39 | # agent.reward_param = 50 40 | # trial_thompson3 = Trial(agent, task) 41 | # trial_thompson3.run_multiple(num_trials) 42 | 43 | ################################################################################ 44 | # Plots! 45 | ################################################################################ 46 | plot = Plot({"Thompson sampling": [trial_thompson1]#, trial_thompson2, trial_thompson3] 47 | }) 48 | # Plot cumulative rewards by iteration 49 | plot.cum_rewards_by_iteration() 50 | # Plot rewards by episode 51 | plot.rewards_by_episode() 52 | -------------------------------------------------------------------------------- /reports/6_834j_ps03.bib: -------------------------------------------------------------------------------- 1 | @inproceedings{strens2000bayesian, 2 | author = {Malcolm Strens}, 3 | title = {A Bayesian framework for reinforcement learning}, 4 | booktitle = {Proceedings of the 17th International Conference on Machine Learning (ICML)}, 5 | year = {2000} 6 | } 7 | 8 | @article{astrom1965optimal, 9 | author = {Karl Johan Astr\"{o}m}, 10 | title = {Optimal control of Markov decision processes with incomplete state 11 | estimation}, 12 | journal = {Journal of Mathematical Analysis and Applications}, 13 | year = {1965}, 14 | volume = {10:174–205} 15 | } 16 | 17 | @unpublished{Braziunas, 18 | author = {Darius Braziunas}, 19 | title = {POMDP solution methods}, 20 | year = {2003}, 21 | school = {University of Toronto} 22 | } 23 | 24 | @book{sutton1998reinforcement, 25 | author = {Richard S. Sutton and Andrew G. 
Barto}, 26 | publisher = {MIT Press}, 27 | title = {Reinforcement learning: An introduction}, 28 | address = {Cambridge, MA}, 29 | year = 1998 30 | } 31 | -------------------------------------------------------------------------------- /reports/6_834j_ps03.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/reports/6_834j_ps03.pdf -------------------------------------------------------------------------------- /reports/6_834j_ps03.tex: --------------------------------------------------------------------------------
1 | %##############################################################################
2 | % Preamble
3 | %##############################################################################
4 |
5 | \documentclass{pset}
6 | \name{Dustin Tran, Xiaomin Wang, Rodrigo Gomes}
7 | \email{\{trandv,xiaominw, rgomes\}@mit.edu}
8 |
9 | \course{6.834J/16.412J, S15}
10 | \instructor{Professor Brian Williams}
11 | \assignment{Problem Set \#3}
12 | \duedate{April 17, 2015}
13 |
14 | \begin{document}
15 |
16 | %##############################################################################
17 | % Begin Document
18 | %##############################################################################
19 |
20 | \section{Introduction}
21 | Many real-life scenarios require making a sequence of decisions
22 | under uncertainty. Quite often two unknowns exist in these scenarios: the
23 | result of taking an action in a given state, and the true state of the agent at
24 | any point in time. This is similar to a hidden Markov model (HMM), except
25 | that one must make a sequence of actions rather than a single action; the
26 | problem therefore falls under reinforcement learning rather than supervised learning. The
27 | classic example is a robot which tries to navigate a discrete environment using
28 | GPS: it performs actions that lead to different states with various
29 | probabilities. The transition probabilities are unknown, and because the GPS is
30 | noisy, there is also uncertainty about what the
31 | underlying state is.
32 |
33 | One can formalize the problem, under Markovian assumptions, as a partially
34 | observable Markov decision process (POMDP). In this project we provide a
35 | software library for implementing and solving such problems, which is modular and
36 | flexible enough for further development and user-specified agents/environments.
37 | We encode a variety of basic tasks and solve them using a combination of value
38 | iteration and several standard (PO)MDP solvers---one is a variant of Thompson
39 | sampling \cite{strens2000bayesian}, a Bayesian approach that maintains a
40 | Dirichlet-multinomial posterior over each state-action pair.
41 |
42 | The repository can be found at https://github.com/dustinvtran/bayesrl.
43 | To install from pip, run
44 | \begin{lstlisting}
45 | pip install -e "git+https://github.com/dustinvtran/bayesrl.git#egg=bayesrl"
46 | \end{lstlisting}
47 |
48 | \section{Technical Background}
49 |
50 | \subsection{POMDP}
51 | A POMDP is a generalization of a Markov decision process (MDP). In an MDP, for each
52 | possible state of the process, a decision has to be made regarding which action
53 | should be executed in that state. The chosen action and the given state together determine the
54 | costs (or rewards) incurred.
The goal is to learn the optimal \emph{policy},
54 | which is a
55 | choice of actions that in expectation leads to the optimal reward over a
56 | pre-defined number of steps (in the finite-horizon case) or over an infinite horizon. In a
57 | POMDP, the agent does not fully observe the underlying state but only a
58 | (non-sufficient) statistic of it. Thus a POMDP agent also
59 | maintains a probability distribution over the set of possible states, based on the
60 | observations, the observation probabilities, and the underlying MDP.
61 |
62 | More formally, a POMDP is a collection $(S,A,T,R,\Omega,O,\gamma)$, where
63 |
64 | \begin{itemize}
65 | \item $S$ is a set of states
66 | \item $A$ is a set of actions
67 | \item $T$ is a set of transition probabilities between states. If the agent is currently in state $s
68 | \in S$, and it takes action $a \in A$, the agent will transition to a new state
69 | $s'$ with probability
70 | $T(s' \mid s,a)$.
71 | \item $R: S \times A \rightarrow \mathbb{R}$ is a reward function that assigns a numeric reward (or
72 | cost if the value is negative) for each state and action.
73 | \item $\Omega$ is a set of observations
74 | \item $O$ is a set of conditional observation probabilities. If the agent is now in state $s$,
75 | it receives an observation $o$ according to $O(o \mid s)$.
76 | \item $\gamma \in [0,1]$ is a discount factor that determines how much rewards are discounted over time
77 | \end{itemize}
78 |
79 | We use our testing environment \texttt{GridWorld} as an example.
80 | \texttt{GridWorld} represents a 2D maze where the agent can be in discrete locations.
81 | Certain locations are impossible for the agent to move to, representing ``walls''. Each
82 | action moves the agent between two adjacent grid locations, or, with some
83 | user-specified probability,
84 | fails and causes the agent to take a uniformly random action
85 | instead. The goal is to reach
86 | the goal location in the least amount of time: each move yields a reward of -1, hitting a
87 | wall yields -1, and reaching the goal position yields +50.
88 |
89 | For \texttt{GridWorld}, $S$ consists of all the possible (row, column) location
90 | tuples inside the maze. $A$ contains the four possible actions the agent can
91 | take: up, down, left, right. $T$ describes a transition model in which the
92 | agent cannot move through walls. We define $R$ as above. To make
93 | this a partially observable problem, we implemented two observation models.
94 | \begin{itemize}
95 | \item
96 | The
97 | easier model gives the agent more information about the environment. The agent
98 | knows which of its four neighbors are walls, giving rise to 16 total
99 | observations.
100 | \item
101 | In the second observation model, the agent can only observe how
102 | many of its four neighbors are walls, giving rise to 5 possible observations.
103 | $O$ is such that $Pr(true\_observation\mid state)=true\_observation\_prob$,
104 | where $true\_observation\_prob$ can be adjusted, and
105 | \begin{equation}
106 | Pr(other\_observation\mid
107 | state)=\frac{1-true\_observation\_prob}{total\_num\_of\_observations-1}
108 | \end{equation}
109 | \end{itemize}
110 | Since
111 | we cannot work with the underlying states directly in a POMDP, we also need
112 | $B$, the set of belief states, i.e., the probability the agent assigns to each of the
113 | possible states.
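As a concrete illustration of the second observation model, the sketch below builds the distribution $O(\cdot \mid s)$ for a single state from its number of wall neighbors. The function name and default values here are illustrative only, not the exact API of the \texttt{pomdpgw} environment.
\begin{lstlisting}
import numpy as np

def obs_distribution(num_wall_neighbors, true_observation_prob=0.9,
                     num_observations=5):
    # The true observation keeps probability true_observation_prob;
    # the remaining mass is spread uniformly over the other
    # observations, as in the equation above.
    probs = np.full(num_observations,
                    (1. - true_observation_prob) / (num_observations - 1))
    probs[num_wall_neighbors] = true_observation_prob
    return probs

# Example: a state with two wall neighbors.
print(obs_distribution(2))  # [0.025, 0.025, 0.9, 0.025, 0.025]
\end{lstlisting}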
115 |
116 | \subsection{Thompson Sampling}
117 | Thompson sampling is used to learn the transition distribution $T$: one
118 | first specifies a prior according to one's knowledge about the transitions
119 | before trying any actions. After a fixed number of time steps, the transition
120 | probabilities are then recalculated using a posterior update following a
121 | Dirichlet-multinomial distribution. In an MDP, since the agent observes the states
122 | directly, the posterior transition probabilities are updated directly using the
123 | transition counts. In a POMDP, the posterior probabilities are updated iteratively,
124 | following an update of the belief-state transition probabilities.
125 |
126 | Note that we have also implemented a variety of other standard MDP solvers for
127 | benchmarking: Q-Learning, SARSA, and R-MAX.
128 |
129 | \subsection{Value Iteration}
130 | Given estimated transition probabilities, we then solve for the underlying policy with value iteration. The value $V$ is the expected total reward
131 | under a policy $\pi$, where a policy decides which action to take given the
132 | belief state: $a = \pi(b)$. The expected reward for policy $\pi$ starting from
133 | belief $b_0$ is defined as
134 | \[ V^{\pi}(b_{0})=\sum\limits_{t=0}^\infty \gamma^{t}r(b_t,a_t) \]
135 | where $r(b_t, a_t) = \sum\limits_{s \in S} b_t(s)R(s,a_t)$.
136 | The optimal policy maximizes the long-term reward
137 | \[ \pi^* = \underset{\pi}{\text{argmax}}\, V^{\pi}(b_0) \]
138 | At each time step, we update the belief states based on the observation, and
139 | then update the values based on the updated belief states. The action that gives
140 | the largest expected reward over the belief states is selected for the next time
141 | step. The values gradually improve until convergence.
142 | By improving the values, the policy is implicitly improved.
143 |
144 | \section{Implementation}
145 |
146 | \subsection{Agent Environment Paradigm}
147 | For MDPs, we follow the paradigm set forth in Sutton and Barto (Figure 3.1,
148 | \cite{sutton1998reinforcement}).
149 | \begin{figure}[ht]
150 | \begin{center}
151 | \centerline{\includegraphics[width=\textwidth]{img/agent_environment.png}}
152 | \end{center}
153 | \end{figure}
154 | It suggests that RL agents need only output an action given the previous history and the current state and reward. Overall, by following this paradigm in the software design, we make the learning process explicit and intuitive.
155 |
156 | \subsubsection{Agent}
157 | We implement a base class \texttt{Agent}, which is a collection of objects and functions to be used by all other agents. Agents differ primarily in their \texttt{interact()} function, which determines the next action to perform given a state and reward from the environment.
158 |
159 | The model-based algorithms R-MAX and Thompson sampling inherit from \texttt{ModelBasedAgent}, a class that itself inherits from \texttt{Agent}; \texttt{ModelBasedAgent} adds subroutines specific to model-based approaches such as value iteration.
160 |
161 | To reduce redundant code further, we could also have used an
162 | additional class that inherits from \texttt{Agent} for temporal difference agents; this
163 | would be used by both \texttt{SARSAAgent} and \texttt{QLearningAgent}, as they
164 | differ only in their \texttt{value\_table} assignment. However, in our opinion the gain from such an
165 | abstraction is not worth the loss of readability.
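To make the agent--environment paradigm concrete, the following schematic loop shows how an agent and an environment interact during a run. The method names mirror \texttt{interact()} above and \texttt{perform\_action()} below, but the signatures are simplified for illustration and differ from the library's actual \texttt{Trial} implementation.
\begin{lstlisting}
def run(agent, task, num_iterations):
    # Schematic interaction loop with illustrative signatures.
    agent.reset()
    state, reward = task.get_start_state(), None  # hypothetical helper
    for _ in range(num_iterations):
        # The agent maps the latest state and reward to an action.
        action = agent.interact(reward, state)
        # The environment executes it and returns a new state and reward.
        state, reward = task.perform_action(action)
\end{lstlisting}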
166 |
167 | \subsubsection{Environment}
168 | An \texttt{Environment} object is initialized at some state, with an arbitrarily
169 | defined state and action space. Actions are performed on an \texttt{Environment}
170 | object under the subroutine \texttt{perform\_action()}, and the output is a new state and its reward.
171 |
172 | \subsubsection{Trial and Plot}
173 | As for trials, we implement a class \texttt{Trial} which contains all information for running multiple trials, i.e., independent collections of episodes to learn and act upon. We also add a \texttt{Plot} class, a wrapper containing all \texttt{Trial} objects; this is convenient for generating plots over collections of trials coming from possibly many different agents.
174 |
175 | \subsection{POMDP}
176 | For POMDPs, we assume that the agent is given an observation model of the
177 | environment it is acting in, in the form of a conditional probability distribution
178 | $P(observation \mid state)$. Astr\"{o}m has shown that a properly updated probability
179 | distribution over the state space $S$ is sufficient to summarize all the observable
180 | history of a POMDP agent without loss of optimality \cite{astrom1965optimal}.
181 | Thus we add a belief-state update step to the MDP paradigm, and the belief state, instead
182 | of the underlying state, is used to update the transition model.
183 |
184 | \subsection{Organization of Code}
185 | We follow the directory structure specified in the problem set, with two
186 | exceptions:
187 | \begin{itemize}
188 | \item \texttt{documentation/} does not exist. Instead, documentation is written
189 | in the \texttt{README.md} inside the current working directory. Any additional
190 | documentation not purely necessary for the problem set submission is in the
191 | GitHub wiki (which is a subset of anything in this writeup).
192 | \item \texttt{source/} is named \texttt{bayesrl/} in order to follow Python
193 | convention for installing modules.
194 | \end{itemize}
195 |
196 | \section{Analysis}
197 |
198 | \subsection{Runtime}
199 | The algorithm takes $O(|S|^2|A|)$ time to calculate the transition probabilities, the
200 | expected rewards, and the belief state. It is uncertain how many time steps are required for
201 | convergence. All solvers we implement are guaranteed to converge in polynomial time for MDPs,
202 | although we have not seen tighter theoretical results on the
203 | upper bounds than this. This makes sense, as the bound must hold for all
204 | environments, including pathological ones. However, it would
205 | certainly be interesting to examine bounds under stricter assumptions where we
206 | fix the environment and perhaps certain parameter settings to simplify the
207 | analysis.
208 |
209 | \subsection{Memory}
210 | We need to store several arrays for the computation. The transition probability
211 | table is of dimension $|S|^2|A|$. The value table is of
212 | dimension $|S||A|$. The transition observation table is of
213 | dimension $|S|^2|A|$. Thus the space requirement for the
214 | algorithm is $O(|S|^2|A|)$.
215 |
216 | \subsection{Limitations}
217 | If the number of states is large, the algorithm quickly becomes intractable.
We
218 | are quite interested in examining function approximations, which allow one to
219 | essentially apply a supervised learning algorithm to predict the optimal action
220 | given state characteristics, rather than hardcoding them manually. Note,
221 | however, that this makes the runtime even worse, as additional error
222 | accumulates as a result of the predictions.
223 |
224 | \section{Experiments}
225 | The implementation of Thompson sampling for MDPs ran very successfully: as time
226 | progressed, the agent was able to reach the goal much faster and obtain a high
227 | reward. It did not work, however, for POMDPs. As time progressed, the agent seemed
228 | to take about the same amount of time to reach the goal on every execution, not
229 | improving in performance. We hypothesize that the reason is the lack of
230 | a good prior on the transition model: Thompson sampling relies heavily on being
231 | able to count the number of transitions between states, given an action.
232 |
233 | \begin{figure}[ht]
234 | \begin{center}
235 | \centerline{\includegraphics[width=\textwidth]{img/mdp_imm_rewards.png}}
236 | \caption{Immediate reward for MDP solvers in a variety of parameter settings on
237 | the GridWorld example.
238 | Each Thompson sampling setting varies how strongly the uniform prior is
239 | misspecified, and yet all three scenarios indicate
240 | convergence. We also see that R-MAX performs better than the temporal difference methods.}
241 | \end{center}
242 | \end{figure}
243 |
244 | This is very hard in a POMDP with a weak prior on the transition model, as the agent
245 | may have a completely wrong idea of where it ends up at each step. The
246 | observation model is supposed to improve its accuracy, but it was not sufficient
247 | in this case. In fact, we can see this empirically: by essentially hardcoding the
248 | optimal path---putting high probability on where the agent should go---it
249 | converges to the optimal policy quite quickly.
250 |
251 | To illustrate how differently the same approach fared on MDPs versus
252 | POMDPs, we show our results:
253 |
254 | \begin{figure}[ht]
255 | \centering
256 | \includegraphics[width=0.75\textwidth]{img/pomdp.png}
257 | \caption{\label{fig:pomdp}Cumulative reward for the POMDP}
258 | \end{figure}
259 |
260 | These graphs show the cumulative reward the agent obtained as time progressed.
261 | In the MDP case, the reward started low and kept getting lower as the agent
262 | explored. It then started rising as the agent began acting in a
263 | more deliberate manner to maximize reward. The POMDP agent, however, seems to
264 | be stuck in a regime where its reward just gets increasingly negative. It is unclear
265 | whether it never leaves the exploration phase, or whether it simply learns a completely
266 | wrong transition model, and thus computes a very wrong policy.
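To make the role of these transition counts concrete, the following minimal sketch shows the Dirichlet-multinomial step: maintain a count table and, at each replanning step, draw a transition model from the induced posterior to plan against. It assumes a single scalar Dirichlet prior parameter and is only an illustration, not the library's \texttt{ThompsonSampAgent} code.
\begin{lstlisting}
import numpy as np

def sample_transition_model(transition_counts, dirichlet_param=1.0):
    # transition_counts[s, a, s'] holds observed (s, a) -> s' counts
    # (belief-weighted, hence possibly fractional, in the POMDP case).
    num_states, num_actions, _ = transition_counts.shape
    T = np.zeros_like(transition_counts, dtype=float)
    for s in range(num_states):
        for a in range(num_actions):
            # Posterior over next states is Dirichlet(counts + prior).
            T[s, a] = np.random.dirichlet(
                transition_counts[s, a] + dirichlet_param)
    return T  # one plausible model; run value iteration against it
\end{lstlisting}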
267 |
268 | \bibliography{6_834j_ps03}
269 | \bibliographystyle{plain}
270 |
271 | %##############################################################################
272 | % End Document
273 | %##############################################################################
274 |
275 | \end{document}
276 | -------------------------------------------------------------------------------- /reports/6_834j_ps04.bib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/reports/6_834j_ps04.bib -------------------------------------------------------------------------------- /reports/6_834j_ps04.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/reports/6_834j_ps04.pdf -------------------------------------------------------------------------------- /reports/6_834j_ps04.tex: --------------------------------------------------------------------------------
1 | %##############################################################################
2 | % Preamble
3 | %##############################################################################
4 |
5 | \documentclass{pset}
6 | \name{Dustin Tran, Xiaomin Wang, Rodrigo Gomes}
7 | \email{\{trandv,xiaominw, rgomes\}@mit.edu}
8 |
9 | \course{6.834J/16.412J, S15}
10 | \instructor{Professor Brian Williams}
11 | \assignment{Problem Set \#4}
12 | \duedate{May 13, 2015}
13 |
14 | \begin{document}
15 |
16 | %##############################################################################
17 | % Begin Document
18 | %##############################################################################
19 |
20 | \begin{center}
21 | \Large Task: Robot grocery shopping in partially observable settings
22 | \end{center}
23 | \section{Motivation}
24 | \label{sec:motivation}
25 | Imagine that you're in your bed, hungry, and you just don't want to walk
26 | all the way to the grocery store. Equivalently, imagine you're working, doing the
27 | most exciting task to save humanity, and you have no time to grab food---menial
28 | tasks are beyond you. We'd like a robot which can intelligently learn to shop
29 | for groceries for us: it understands a query from the user, moves to the
30 | grocery store, finds the relevant items, purchases them, and comes
31 | back to the user.
32 |
33 | In our project we focus on arguably the most difficult of
34 | these tasks, which is to locate the groceries in the store. Moreover, we work
35 | under the realistic
36 | scenario in which the robot can only observe its surroundings (a POMDP) rather
37 | than knowing precisely where it is in the store (an MDP).
38 |
39 | Given this partially observable setting, the robot should learn how to obtain
40 | all items in the grocery store and do so in an optimal amount of time. It must
41 | 1. figure out where it is in the supermarket;
42 | 2. intelligently search for the items by learning which aisle corresponds to
43 | which category; and 3. find the optimal path and sequence to obtain all items.
44 |
45 | \section{Setup}
46 | The robot is given a list of items it should find in the supermarket. The supermarket is represented
47 | by a grid world. It has several aisles containing different categories of goods.
48 | The robot has a map of the supermarket (it knows where the walls and aisles are), but
49 | it does not know where items are located on the shelves.
The robot has a perfect sensor, which can
50 | observe its four neighbors. It can move in four directions: up, down, left, and right. It will
51 | transition to the intended position 90\% of the time. For the rest of the time, it is equally likely
52 | to move in any of the permitted directions except the one opposite the intended direction. The robot
53 | is equally likely to start the mission in one of the four corners of the supermarket. When the robot
54 | moves to the grid cell next to a target item, the item is considered found. It continues moving until all
55 | the items have been found.
56 |
57 |
58 | \section{Procedure}
59 | \label{sec:procedure}
60 | The code for the grounded scenario can be found in the \texttt{visual} folder.
61 | You can start the simulation by running \texttt{python debug.py}.
62 | An instance of the robot and a random supermarket are created automatically.
63 | To set which products you want the robot to grab for you, you can write:
64 | \texttt{g.targets = set([...])}. For example, if you want ice cream and beans,
65 | you can write: \texttt{g.targets = set(['iscream', 'beans'])}. To view all possible
66 | products, organized by aisles, just type: \texttt{print g.aisles\_content}.
67 | After selecting your items, just type \texttt{go()} to start the robot demo.
68 |
69 | \subsection{Belief Update}
70 | At every step of the demo, the following happens:
71 |
72 | \begin{itemize}
73 | \item Agent provides next action based on current belief state
74 | \item Simulator executes action (errors happen with some probability)
75 | \item Belief state is updated based on transition probabilities
76 | \item Belief state is updated based on observation
77 | \item Belief about the world is updated based on belief state and observation
78 | \end{itemize}
79 |
80 | The belief updates are done using Bayes' rule. One issue we ran into is that we make
81 | the Markov assumption for our states, which is not entirely correct, because
82 | the organization of the aisles is part of the state. This biases us
83 | towards the most likely state when entering an aisle, since each step makes
84 | that aisle's type more likely, and the most likely state, in turn, becomes
85 | more likely. The desired behavior would be to only update the aisle-organization
86 | belief when we enter the aisle, and not as we walk along it. An alternative
87 | implementation of our belief state that could potentially avoid this problem would
88 | have been a particle filter.
89 |
90 | \subsection{Max Probability Value Iteration}
91 | We reduce the POMDP to an MDP, and solve the MDP using value iteration. At each time step, we assume
92 | that the robot is at the most likely state according to its belief state. We also generate an
93 | arrangement of all the items according to the probability distribution of items over the aisles. With
94 | this information, the robot knows exactly where it is and where everything is. Since it also has the
95 | transition model, value iteration becomes straightforward. Thus we can find the best action to take,
96 | assuming this is the true state of the world.
97 |
98 | In a way, we are approximating the POMDP with a most-likely-state MDP. Alternatively, we can also
99 | sample over all possible states according to their probability distribution and run value iteration
100 | for each of them. We then choose the best action that gives the robot the maximal weighted-average
101 | expected reward.
Such sampling can lower the probability of choosing a bad action based on a
102 | completely wrong world state. Thus the robot can make smarter moves, but at the expense of
103 | additional computation.
104 |
105 | \section{Experiments}
106 | \label{sec:experiments}
107 |
108 | \subsection{Path Planning}
109 | The robot plans its path intelligently but greedily. As long as there are still targets to be
110 | retrieved, the robot moves toward the state with the highest reward. However, it does not try to minimize the
111 | total distance traveled to retrieve all the targets. If the currently best state is very far away
112 | and a target with lower reward lies closer to the robot but not on the way to the best state, the
113 | robot will ignore the closer target and go for the best state.
114 |
115 | One approach to address this non-optimality is to include the remaining target items as part of our state.
116 | The reward function would also have to be changed accordingly. However, this would greatly increase the state
117 | space and slow down the value iteration.
118 |
119 |
120 | \subsection{Run Time}
121 | Value iteration runs fairly fast for our problem size. The number of iterations it takes to converge
122 | depends heavily on the value of the discount factor $\gamma$. For example, for an 11-by-7 grid and a
123 | $\gamma$ value of 0.9, it takes about 90 iterations. If we change $\gamma$ to 0.5, it only takes about
124 | 15 iterations.
125 |
126 | \bibliography{6_834j_ps04}
127 | \bibliographystyle{plain}
128 |
129 | %##############################################################################
130 | % End Document
131 | %##############################################################################
132 |
133 | \end{document}
134 | -------------------------------------------------------------------------------- /reports/6_834j_talk.tex: --------------------------------------------------------------------------------
1 | % NOTE: must be run with
2 | % xelatex -shell-escape 6_834j_talk
3 | \documentclass[10pt, compress]{beamer}
4 |
5 | \usetheme{m}
6 |
7 | \usepackage{booktabs}
8 | \usepackage[scale=2]{ccicons}
9 | \usepackage{minted}
10 |
11 | \usepgfplotslibrary{dateplot}
12 |
13 | \usemintedstyle{trac}
14 |
15 | \newcommand{\cmark}{\ding{51}}%
16 | \newcommand{\xmark}{\ding{55}}%
17 |
18 | \newcommand{\defeq}{\mathrel{\overset{\makebox[0pt]{\mbox{\normalfont\tiny\sffamily def}}}{=}}}
19 |
20 | \title{Robot grocery shopping in partially observable settings}
21 | \subtitle{}
22 | \date{May 13, 2015}
23 | \author{Rodrigo Gomes, Xiaomin Wang, Dustin Tran}
24 | \institute{MIT, 6.834j Cognitive Robotics}
25 |
26 | \begin{document}
27 | %(2 min) Background on POMDPs, belief-state MDP, MDP solvers we have
28 | %(2 min) Setup: Grocery shopping as planning in a POMDP
29 | %(4 min) Demo
30 | %(2 min) The solver actually used (value iteration)
31 | %(1 min) Things that failed (Thompson sampling)
32 | %(1 min) Q&A
33 |
34 | \maketitle
35 |
36 | \begin{frame}[fragile]
37 | \frametitle{Outline}
38 |
39 | \begin{enumerate}
40 | \item Background on POMDPs
41 | \item Grocery shopping as planning in a POMDP
42 | \item Demo!
43 | \item What worked 44 | \item What failed 45 | \end{enumerate} 46 | 47 | \end{frame} 48 | 49 | \begin{frame}[fragile] 50 | \frametitle{Background} 51 | 52 | A \emph{partially observable Markov decision process} (POMDP) is a tuple $(S,A,\Omega,R,T,O)$ 53 | 54 | \begin{itemize} 55 | \item $S$: state space 56 | \item $A$: action space 57 | \item $\Omega$: observation space 58 | \item $R: S \times A \rightarrow \mathbb{R}$ reward function 59 | \item $T$: transition operator. $T(s' \mid s,a)$ is probability of next state $s'$ given state $s$ and action $a$ 60 | \item $O$: observable operator. $O(o \mid s)$ is probability of observing 61 | $o$ given at state $s$ 62 | \end{itemize} 63 | \end{frame} 64 | 65 | \begin{frame}[fragile] 66 | \frametitle{Background} 67 | 68 | \begin{figure}[ht] 69 | \begin{center} 70 | \centerline{\includegraphics[width=1.25\textwidth]{img/agent_environment_untitled.png}} 71 | \end{center} 72 | \end{figure} 73 | \end{frame} 74 | 75 | \begin{frame}[fragile] 76 | \frametitle{Background} 77 | A POMDP induces an equivalent representation as a \emph{belief MDP} with tuple $(B, A, \tau, R)$ 78 | 79 | \begin{itemize} 80 | \item $B$: set of belief states over the POMDP states 81 | \item $A$: action space of original POMDP 82 | \item $\tau$: belief state transition operator\footnote{ 83 | Given $b(s)$, after taking action $a$ and observing $o$ (and reaching state 84 | $s'$), update belief states 85 | \begin{equation} 86 | b'(s') = \frac{P(o\mid b,a,s')}{P(o\mid b,a)}=\frac{O(o\mid s',a)\sum_{s\in S} T(s'\mid 87 | s,a)b(s)}{\sum_{s'\in S} O(o\mid s',a)\sum_{s\in S}T(s'\mid s,a)b(s)} 88 | \end{equation} 89 | } 90 | \begin{equation*} 91 | \tau(b, a, b') 92 | = \sum_{o\in\Omega} P(b'\mid b, a, o)P(o\mid a, b) 93 | \end{equation*} 94 | \item $r: B\times A \rightarrow \mathbb{R}$ belief state reward function 95 | \begin{equation*} 96 | r(b,a) = \sum_{s\in S}b(s)R(s,a) 97 | \end{equation*} 98 | \end{itemize} 99 | 100 | \end{frame} 101 | 102 | \begin{frame}[fragile] 103 | \frametitle{Background} 104 | A POMDP induces an equivalent representation as a \emph{belief MDP} with tuple $(B, A, \tau, R)$ 105 | 106 | \begin{itemize} 107 | \item $B$: set of belief states over the POMDP states 108 | \item $A$: action space of original POMDP 109 | \item $\tau$: belief state transition operator\footnote{ 110 | \alert{ 111 | Given $b(s)$, after taking action $a$ and observing $o$ (and reaching state 112 | $s'$), update belief states 113 | \begin{equation} 114 | b'(s') = \frac{P(o\mid b,a,s')}{P(o\mid b,a)}=\frac{O(o\mid s',a)\sum_{s\in S} T(s'\mid 115 | s,a)b(s)}{\sum_{s'\in S} O(o\mid s',a)\sum_{s\in S}T(s'\mid s,a)b(s)} 116 | \end{equation} 117 | }} 118 | \begin{equation*} 119 | \tau(b, a, b') 120 | = \sum_{o\in\Omega} P(b'\mid b, a, o)P(o\mid a, b) 121 | \end{equation*} 122 | \item $r: B\times A \rightarrow \mathbb{R}$ belief state reward function 123 | \begin{equation*} 124 | r(b,a) = \sum_{s\in S}b(s)R(s,a) 125 | \end{equation*} 126 | \end{itemize} 127 | 128 | \end{frame} 129 | 130 | \begin{frame}[fragile] 131 | \frametitle{Background} 132 | 133 | Implemented MDP solvers: 134 | \begin{itemize} 135 | \item Q-learning 136 | \item SARSA 137 | \item R-MAX 138 | \item Thompson sampling 139 | \end{itemize} 140 | 141 | There are a lot! 
142 | \begin{itemize} 143 | \item Function approximations with adaptive basis functions 144 | \item BOSS 145 | \item Spectral methods 146 | \item Skill chaining 147 | \item $\cdots$ 148 | \end{itemize} 149 | 150 | \end{frame} 151 | 152 | \begin{frame}[fragile] 153 | \frametitle{Background} 154 | 155 | Implemented MDP solvers: 156 | \begin{itemize} 157 | \item Q-learning \alert{(Watkins, 1989)} 158 | \item SARSA \alert{(Rummery and Niranjan, 1994)} 159 | \item R-MAX \alert{(Brafman and Tennenholtz, 2002)} 160 | \item Thompson sampling \alert{(Strens, 2000)} 161 | \end{itemize} 162 | 163 | There are a lot more! 164 | \begin{itemize} 165 | \item Function approximations with adaptive basis functions \alert{(Mnih et 166 | al., 2013)} 167 | \item BOSS \alert{(Asmuth et al., 2009)} 168 | \item Spectral methods \alert{(Boots et al., 2009)} 169 | \item Skill chaining \alert{(Konidaris and Barto, 2009)} 170 | \item $\cdots$ 171 | \end{itemize} 172 | 173 | \end{frame} 174 | 175 | \begin{frame}[fragile] 176 | \frametitle{Grocery shopping} 177 | 178 | Setup: Grid World POMDP 179 | 180 | Uncertain movement 181 | 182 | \centerline{\includegraphics[width=0.22\textwidth]{img/uncertain_transition.png}} 183 | 184 | Can only see around current cell (partially observable) 185 | 186 | \centerline{\includegraphics[width=0.22\textwidth]{img/partial_obs.png}} 187 | 188 | World is not fully known beforehand 189 | \begin{itemize} 190 | \item Model of how items in the same aisle correlate 191 | \item Unknown arrangement of aisles 192 | \item Unknown arrangement of items within aisles 193 | \end{itemize} 194 | \end{frame} 195 | 196 | \begin{frame}[fragile] 197 | \frametitle{Grocery shopping} 198 | 199 | GUI interface: {\it pygame} 200 | 201 | Every second: 202 | \begin{itemize} 203 | \item Agent provides next action based on current belief state 204 | \item Simulator executes action (errors may happen) 205 | \item Belief state is updated based on transition probabilities 206 | \item Belief state is updated based on observation 207 | \item Belief about the world is updated based on belief state, and observation 208 | \end{itemize} 209 | 210 | Challenges: 211 | \begin{itemize} 212 | \item Markov assumption is not completely accurate 213 | \item Bias towards increasing probability of most likely states 214 | % (state increases observation probability, observation increases state probability) 215 | %\item 216 | \end{itemize} 217 | \end{frame} 218 | 219 | \plain{demo} 220 | 221 | \begin{frame}[fragile] 222 | \frametitle{Our working solver} 223 | We encode a \textbf{Max Probability} MDP 224 | \begin{itemize} 225 | \item Motivated from greedy policies 226 | \item Choose the most likely state from belief states as one's position in an 227 | MDP 228 | \item Solve the MDP! 
229 | \end{itemize} 230 | \end{frame} 231 | 232 | \begin{frame}[fragile] 233 | \frametitle{Our working solver} 234 | Value iteration: 235 | \begin{align*} 236 | v_{k+1}(s) &= \max_a \mathbb{E}[R_{t+1} + \gamma v_k(S_{t+1}) \mid S_t = s, A_t = a] \\ 237 | &= \max_a \sum_{s'} p(s' \mid s,a) [r(s,a,s') + \gamma v_k(s')] 238 | \end{align*} 239 | \end{frame} 240 | 241 | \begin{frame}[fragile] 242 | \frametitle{Failed tasks} 243 | 244 | \begin{itemize} 245 | \item Continuous state space in belief MDP: Value iteration 246 | \item Thompson sampling 247 | \item TD($\lambda$) methods: Q-Learning, SARSA, Monte Carlo Tree Search 248 | \end{itemize} 249 | \end{frame} 250 | 251 | \begin{frame} 252 | \begin{figure}[ht] 253 | \frametitle{Most simplified task (GridWorld)} 254 | \vspace{3ex} 255 | \begin{center} 256 | \centerline{\includegraphics[width=1.1\textwidth]{img/mdp_imm_rewards.png}} 257 | \end{center} 258 | \end{figure} 259 | \end{frame} 260 | 261 | \plain{ 262 | {\Large Play with it!}\\[5ex] 263 | \includegraphics{img/octocat.png}\\[3ex] 264 | github.com/dustinvtran/bayesrl 265 | } 266 | 267 | \end{document} 268 | -------------------------------------------------------------------------------- /reports/beamercolorthememetropolis.sty: -------------------------------------------------------------------------------- 1 | % Beamer mtheme 2 | % 3 | % Copyright 2014 Matthias Vogelgesang 4 | % Licensed under CC-BY-SA 4.0 International. 5 | % 6 | % The initial template comes from the HSRM beamer theme by Benjamin Weiss, which 7 | % you can find at https://github.com/hsrmbeamertheme/hsrmbeamertheme. 8 | % 9 | 10 | \ProvidesPackage{beamercolorthememetropolis} 11 | 12 | 13 | %}}} 14 | %{{{ --- Options ---------------------- 15 | 16 | \newif\if@beamer@metropolis@blockbg 17 | \@beamer@metropolis@blockbgfalse 18 | \DeclareOptionBeamer{blockbg}{\@beamer@metropolis@blockbgtrue} 19 | 20 | \DeclareOptionBeamer*{% 21 | \PackageWarning{beamercolorthememetropolis}{Unknown option `\CurrentOption'}% 22 | } 23 | 24 | \ProcessOptionsBeamer 25 | 26 | %}}} 27 | %{{{ --- Colors --------------------- 28 | 29 | % http://paletton.com/#uid=7050t0kkJkJsntwoyp6gYgoddc4 30 | 31 | \definecolor{mDarkBrown}{HTML}{604c38} 32 | \definecolor{mDarkTeal}{HTML}{23373b} 33 | 34 | \definecolor{mLightBrown}{HTML}{EB811B} 35 | \definecolor{mMediumBrown}{HTML}{C87A2F} 36 | 37 | \setbeamercolor{palette primary}{fg=mDarkTeal, bg=black!2} 38 | \setbeamercolor{palette secondary}{fg=white, bg=mDarkTeal} 39 | \setbeamercolor{palette quaternary}{fg=mDarkBrown} 40 | \setbeamercolor{palette tertiary}{fg=white, bg=mMediumBrown} 41 | 42 | \setbeamercolor{title}{parent=palette primary} 43 | \setbeamercolor{subtitle}{parent=palette primary} 44 | \setbeamercolor{author}{parent=palette primary} 45 | \setbeamercolor{date}{parent=palette primary} 46 | \setbeamercolor{institute}{parent=palette primary} 47 | 48 | \setbeamercolor{section title}{parent=palette primary} 49 | \setbeamercolor{frametitle}{parent=palette secondary} 50 | \setbeamercolor{background canvas}{parent=palette primary} 51 | \setbeamercolor{structure}{fg=mDarkTeal} 52 | 53 | \setbeamercolor{normal text}{fg=black!97} 54 | \setbeamercolor{alerted text}{fg=mLightBrown} 55 | 56 | \setbeamercolor{footnote}{fg=mDarkTeal!50} 57 | \setbeamercolor{footnote mark}{fg=.} 58 | \setbeamercolor{page number in head/foot}{fg=mDarkTeal} 59 | 60 | \if@beamer@metropolis@blockbg 61 | 62 | \setbeamercolor{block title}{use=palette primary,parent=palette primary,bg=palette primary.bg!80!fg} 63 | \setbeamercolor{block 
title alerted}{use={palette primary,alerted text},parent=palette primary,fg=alerted text.fg,bg=palette primary.bg!80!fg} 64 | \setbeamercolor{block title example}{use={palette primary,example text},parent=palette primary,fg=example text.fg,bg=palette primary.bg!80!fg} 65 | 66 | \setbeamercolor{block body}{use=block title,parent=normal text,bg=block title.bg!50} 67 | \setbeamercolor{block body alerted}{use={normal text,block body},parent=normal text,bg=block body.bg} 68 | \setbeamercolor{block body example}{use={normal text,block body},parent=normal text,bg=block body.bg} 69 | 70 | \fi 71 | 72 | \mode 73 | -------------------------------------------------------------------------------- /reports/beamerfontthememetropolis.sty: -------------------------------------------------------------------------------- 1 | % Beamer mtheme 2 | % 3 | % Copyright 2014 Matthias Vogelgesang 4 | % Licensed under CC-BY-SA 4.0 International. 5 | % 6 | % The initial template comes from the HSRM beamer theme by Benjamin Weiss, which 7 | % you can find at https://github.com/hsrmbeamertheme/hsrmbeamertheme. 8 | % 9 | 10 | \ProvidesPackage{beamerfontthememetropolis} 11 | 12 | \RequirePackage[no-math]{fontspec} 13 | 14 | 15 | \defaultfontfeatures{Mapping=tex-text} 16 | %\setsansfont[BoldFont={Fira Sans}]{Fira Sans Light} 17 | %\setmonofont{Fira Mono} 18 | %\newfontfamily\ExtraLight{Fira Sans ExtraLight} 19 | %\newfontfamily\Light{Fira Sans Light} 20 | %\newfontfamily\Book{Fira Sans} 21 | %\newfontfamily\Medium{Fira Sans Medium} 22 | %\setsansfont{Helvetica Neue Thin} 23 | %\setmonofont{Courier New} 24 | %\newfontfamily\ExtraLight{Helvetica Neue Thin} 25 | %\newfontfamily\Light{Helvetica Neue Thin} 26 | %\newfontfamily\Book{Helvetica Neue Thin} 27 | %\newfontfamily\Medium{Helvetica Neue Thin} 28 | \setsansfont{Palatino} 29 | \setmonofont{Courier New} 30 | \newfontfamily\ExtraLight{Palatino} 31 | \newfontfamily\Light{Palatino} 32 | \newfontfamily\Book{Helvetica Neue Thin} 33 | \newfontfamily\Medium{Palatino} 34 | 35 | %\AtBeginEnvironment{tabular}{\setsansfont[BoldFont={Fira Sans}, Numbers={Monospaced}]{Fira Sans Light}} 36 | %\AtBeginEnvironment{tabular}{\setsansfont[BoldFont={Helvetica Neue Thin}, Numbers={Monospaced}]{Helvetica Neue Thin}} 37 | \AtBeginEnvironment{tabular}{\setsansfont[BoldFont={Palatino}, Numbers={Monospaced}]{Palatino}} 38 | 39 | \setbeamerfont{title}{family=\Book, size=\Large} 40 | \setbeamerfont{author}{family=\ExtraLight, size=\small} 41 | \setbeamerfont{date}{family=\ExtraLight, size=\small} 42 | 43 | \setbeamerfont{section title}{family=\Book, size=\Large} 44 | 45 | \setbeamerfont{block title}{family=\Book, size=\normalsize} 46 | \setbeamerfont{block title alerted}{family=\Book,size=\normalsize} 47 | 48 | \setbeamerfont{subtitle}{family=\Light, size=\fontsize{12}{14}} 49 | \setbeamerfont{frametitle}{family=\Book, size=\large} 50 | 51 | \setbeamerfont{caption}{size=\small} 52 | \setbeamerfont{caption name}{family=\Book} 53 | 54 | \setbeamerfont{description item}{family=\Book} 55 | 56 | \setbeamerfont{page number in head/foot}{size=\scriptsize} 57 | 58 | 59 | \linespread{1.15} 60 | -------------------------------------------------------------------------------- /reports/beamerthemem.sty: -------------------------------------------------------------------------------- 1 | % Beamer mtheme 2 | % 3 | % Copyright 2014 Matthias Vogelgesang 4 | % Licensed under CC-BY-SA 4.0 International. 
5 | % 6 | % The initial template comes from the HSRM beamer theme by Benjamin Weiss, which 7 | % you can find at https://github.com/hsrmbeamertheme/hsrmbeamertheme. 8 | % 9 | 10 | \ProvidesPackage{beamerthemem} 11 | 12 | %{{{ --- Options ---------------------- 13 | 14 | \newif\if@useTitleProgressBar 15 | \newif\if@protectFrameTitle 16 | \newif\if@noSmallCapitals 17 | \newif\if@noSectionSlide 18 | \newif\if@useTotalSlideIndicator 19 | 20 | \@useTitleProgressBarfalse 21 | \@protectFrameTitlefalse 22 | \@noSmallCapitalsfalse 23 | \@noSectionSlidefalse 24 | \@useTotalSlideIndicatorfalse 25 | 26 | \newlength{\@mtheme@voffset} 27 | \setlength{\@mtheme@voffset}{2em} 28 | 29 | \DeclareOptionBeamer{usetitleprogressbar}{\@useTitleProgressBartrue} 30 | \DeclareOptionBeamer{protectframetitle}{\@protectFrameTitletrue} 31 | \DeclareOptionBeamer{blockbg}{% 32 | \PassOptionsToPackage{blockbg}{beamercolorthememetropolis}% 33 | } 34 | \DeclareOptionBeamer{nooffset}{\setlength{\@mtheme@voffset}{0em}} 35 | 36 | \DeclareOptionBeamer*{% 37 | \PackageWarning{beamerthemem}{Unknown option `\CurrentOption'}% 38 | } 39 | 40 | \DeclareOptionBeamer{nosmallcapitals}{\@noSmallCapitalstrue} 41 | \DeclareOptionBeamer{nosectionslide}{\@noSectionSlidetrue} 42 | \DeclareOptionBeamer{usetotalslideindicator}{\@useTotalSlideIndicatortrue} 43 | 44 | \ProcessOptionsBeamer 45 | 46 | %}}} 47 | 48 | \mode 49 | 50 | %{{{ --- Packages --------------------- 51 | 52 | \RequirePackage[no-math]{fontspec} 53 | \RequirePackage{etoolbox} 54 | \RequirePackage{tikz} 55 | \RequirePackage{pgfplots} 56 | 57 | \usetikzlibrary{backgrounds} 58 | \usetikzlibrary{calc} 59 | 60 | \usecolortheme{metropolis} 61 | \usefonttheme{metropolis} 62 | 63 | %}}} 64 | %{{{ --- Titlepage -------------------- 65 | 66 | \def\maketitle{\ifbeamer@inframe\titlepage\else\frame[plain]{\titlepage}\fi} 67 | 68 | \def\titlepage{\usebeamertemplate{title page}} 69 | \setbeamertemplate{title page} 70 | { 71 | \begin{minipage}[b][\paperheight]{\textwidth} 72 | \vspace*{\@mtheme@voffset} 73 | \ifx\inserttitlegraphic\@empty% 74 | \else% 75 | { 76 | % actual output of titlegraphic 77 | \usebeamercolor[fg]{titlegraphic}\inserttitlegraphic\par% 78 | % measurement and add negative vspace 79 | \newdimen\logoheight 80 | \setbox0=\vbox{\inserttitlegraphic}% 81 | \logoheight=\ht0 \advance\logoheight by \dp0 % 82 | \vspace*{-\logoheight}% 83 | \vspace*{-1em}% I don't know why this additional negative space is needed 84 | }% 85 | \fi% 86 | \vfill 87 | \ifx\inserttitle\@empty% 88 | \else% 89 | \if@noSmallCapitals% 90 | {\raggedright\linespread{1.0}\usebeamerfont{title}\usebeamercolor[fg]{title}\inserttitle\par}% 91 | \else% 92 | %{\raggedright\linespread{1.0}\usebeamerfont{title}\usebeamercolor[fg]{title}\scshape\MakeLowercase{\inserttitle}\par}% 93 | {\raggedright\linespread{1.0}\usebeamerfont{title}\usebeamercolor[fg]{title}\scshape{\inserttitle}\par}% 94 | \fi% 95 | \vspace*{0.5em} 96 | \fi% 97 | \ifx\insertsubtitle\@empty% 98 | \else% 99 | {\usebeamerfont{subtitle}\usebeamercolor[fg]{subtitle}\insertsubtitle\par}% 100 | \vspace*{0.5em} 101 | \fi% 102 | \begin{tikzpicture}\draw[alerted text.fg] (0, 0) -- (\textwidth, 0);\end{tikzpicture}% 103 | \vspace*{1em} 104 | \ifx\insertauthor\@empty% 105 | \else% 106 | {\usebeamerfont{author}\usebeamercolor[fg]{author}\insertauthor\par}% 107 | \vspace*{0.25em} 108 | \fi% 109 | \ifx\insertdate\@empty% 110 | \else% 111 | {\usebeamerfont{date}\usebeamercolor[fg]{date}\insertdate\par}% 112 | \fi% 113 | \ifx\insertinstitut\@empty% 114 | \else% 115 | 
\vspace*{3mm} 116 | {\usebeamerfont{institute}\usebeamercolor[fg]{institute}\insertinstitute\par}% 117 | \fi% 118 | \vfill 119 | \vspace*{\@mtheme@voffset} 120 | \end{minipage} 121 | } 122 | 123 | %}}} 124 | %{{{ --- Progressbar ------------------ 125 | 126 | \makeatletter 127 | \def\progressbar@sectionprogressbar{} 128 | \def\progressbar@titleprogressbar{} 129 | \newcount\progressbar@tmpcounta % auxiliary counter 130 | \newcount\progressbar@tmpcountb % auxiliary counter 131 | \newdimen\progressbar@pbht % progressbar height 132 | \newdimen\progressbar@pbwd % progressbar width 133 | \newdimen\progressbar@tmpdim % auxiliary dimension 134 | 135 | \progressbar@pbwd=22em 136 | \progressbar@pbht=0.4pt 137 | 138 | % the progress bar 139 | \def\progressbar@sectionprogressbar{% 140 | {\usebeamercolor{palette primary}% 141 | \progressbar@tmpcounta=\insertframenumber 142 | \progressbar@tmpcountb=\inserttotalframenumber 143 | \progressbar@tmpdim=\progressbar@pbwd 144 | \divide\progressbar@tmpdim by 100 145 | \multiply\progressbar@tmpdim by \progressbar@tmpcounta 146 | \divide\progressbar@tmpdim by \progressbar@tmpcountb 147 | \multiply\progressbar@tmpdim by 100 148 | 149 | % fixes very high linespacing introduced via \textsc{\MakeLowercase{...}} 150 | \fontsize{1em}{1em}\selectfont 151 | 152 | \makebox[\textwidth][c]{ 153 | \begin{tikzpicture}[tight background] 154 | 155 | \node[anchor=south west, fg, inner sep=0pt, text width=\progressbar@pbwd] at (0pt, 0pt) {\insertsectionHEAD}; 156 | 157 | \draw[anchor=west, fg!20, fill=fg!20, inner sep=0pt] 158 | (0, -1ex) rectangle ++ (\progressbar@pbwd, \progressbar@pbht); 159 | 160 | \draw[anchor=west, fg, fill=fg, inner sep=0pt] 161 | (0, -1ex) rectangle ++ (\progressbar@tmpdim, \progressbar@pbht); 162 | \end{tikzpicture}% 163 | } 164 | } % end usebeamercolor{palette primary} 165 | } 166 | 167 | \if@useTitleProgressBar 168 | \def\progressbar@titleprogressbar{% 169 | \progressbar@tmpcounta=\insertframenumber 170 | \progressbar@tmpcountb=\inserttotalframenumber 171 | \progressbar@tmpdim=\paperwidth 172 | \divide\progressbar@tmpdim by 100 173 | \multiply\progressbar@tmpdim by \progressbar@tmpcounta 174 | \divide\progressbar@tmpdim by \progressbar@tmpcountb 175 | \multiply\progressbar@tmpdim by 100 176 | {% 177 | \usebeamercolor{palette quaternary}% 178 | \usebeamercolor{alerted text}% 179 | \begin{tikzpicture}[tight background] 180 | \draw[palette quaternary.fg, fill=palette quaternary.fg] (0, 0) rectangle ($(\paperwidth, 0.6pt) - (0.4pt, 0)$); 181 | \draw[alerted text.fg, fill=alerted text.fg] (0, 0) rectangle (\progressbar@tmpdim, 0.6pt); 182 | \end{tikzpicture}% 183 | }% 184 | } 185 | \fi 186 | %}}} 187 | %{{{ --- Commands --------------------- 188 | 189 | \newcommand{\insertsectionHEAD}{% 190 | \expandafter\insertsectionHEADaux\insertsectionhead} 191 | 192 | \if@noSmallCapitals% 193 | \newcommand{\insertsectionHEADaux}[3]{#3}% 194 | \else% 195 | \newcommand{\insertsectionHEADaux}[3]{\textsc{\MakeLowercase{#3}}}% 196 | \fi% 197 | 198 | \newcommand{\plain}[2][]{% 199 | \begingroup 200 | \setbeamercolor{background canvas}{use=palette primary,bg=palette primary.fg} 201 | \begin{frame}{#1} 202 | \centering 203 | \vfill\vspace{1em}\usebeamerfont{section title}\textcolor{white}{\scshape #2}\vfill 204 | \end{frame} 205 | \endgroup 206 | } 207 | 208 | %}}} 209 | %{{{ --- Itemize ---------------------- 210 | 211 | \setlength{\leftmargini}{1em} 212 | 213 | % Actually one level should be enough but ... 
214 | \setlength{\leftmarginii}{1em} 215 | \setlength{\leftmarginiii}{1em} 216 | 217 | \newcommand{\itemBullet}{∙} 218 | 219 | \setbeamertemplate{itemize item}{\itemBullet} 220 | \setbeamertemplate{itemize subitem}{\itemBullet} 221 | \setbeamertemplate{itemize subsubitem}{\itemBullet} 222 | \setlength{\parskip}{0.5em} 223 | 224 | %}}} 225 | %{{{ --- Sections --------------------- 226 | 227 | % Insert frame with section title at every section start 228 | \AtBeginSection[] 229 | { 230 | \if@noSectionSlide% 231 | \else% 232 | \begingroup 233 | \setbeamercolor{background canvas}{parent=palette primary} 234 | \begin{frame}[plain] 235 | \vspace{2em}\usebeamerfont{section title} 236 | \progressbar@sectionprogressbar% 237 | \end{frame} 238 | \endgroup 239 | \fi% 240 | } 241 | 242 | %}}} 243 | %{{{ --- Captions --------------------- 244 | 245 | \setbeamertemplate{caption label separator}{: } 246 | \setbeamertemplate{caption}[numbered] 247 | 248 | %}}} 249 | %{{{ --- Footline/footnote ------------ 250 | 251 | \usenavigationsymbolstemplate{} 252 | \setbeamertemplate{footline} 253 | {% 254 | \begin{beamercolorbox}[wd=\textwidth,ht=3ex,dp=3ex,leftskip=0.3cm,rightskip=0.3cm]{structure}% 255 | \hfill\usebeamerfont{page number in head/foot}% 256 | \if@useTotalSlideIndicator% 257 | \insertpagenumber/\insertpresentationendpage% 258 | \else% 259 | \insertpagenumber% 260 | \fi% 261 | \end{beamercolorbox}% 262 | } 263 | 264 | \setbeamertemplate{footnote} 265 | {% 266 | \parindent 0em\noindent% 267 | \raggedright 268 | \usebeamercolor{footnote}\hbox to 0.8em{\hfil\insertfootnotemark}\insertfootnotetext\par% 269 | } 270 | 271 | %}}} 272 | %{{{ --- Frametitle ------------------- 273 | 274 | \setbeamertemplate{frametitle}{% 275 | \nointerlineskip 276 | \begin{beamercolorbox}[wd=\paperwidth,leftskip=0.3cm,rightskip=0.3cm,ht=2.5ex,dp=1.5ex]{frametitle} 277 | \usebeamerfont{frametitle}% 278 | \if@protectFrameTitle% 279 | \protect% 280 | \if@noSmallCapitals% 281 | \insertframetitle% 282 | \else% 283 | %\textsc{\MakeLowercase{\insertframetitle}}% 284 | \textsc{{\insertframetitle}}% 285 | \fi% 286 | \else% 287 | \if@noSmallCapitals% 288 | \insertframetitle% 289 | \else% 290 | %\textsc{\MakeLowercase{\insertframetitle}}% 291 | \textsc{{\insertframetitle}}% 292 | \fi% 293 | \fi% 294 | \end{beamercolorbox}% 295 | \if@useTitleProgressBar 296 | \vspace{-.5em} 297 | \begin{beamercolorbox}[wd=\paperwidth,ht=1pt,dp=0pt]{frametitle} 298 | \progressbar@titleprogressbar 299 | \end{beamercolorbox} 300 | \fi 301 | \vspace{\@mtheme@voffset} 302 | } 303 | 304 | %}}} 305 | %{{{ --- pgfplots --------------------- 306 | 307 | %{{{ Colors 308 | 309 | % TolColors from http://www.r-bloggers.com/the-paul-tol-21-color-salute/ 310 | \definecolor{TolColor1}{HTML}{332288} % dark purple 311 | \definecolor{TolColor2}{HTML}{6699CC} % dark blue 312 | \definecolor{TolColor3}{HTML}{88CCEE} % light blue 313 | \definecolor{TolColor4}{HTML}{44AA99} % light green 314 | \definecolor{TolColor5}{HTML}{117733} % dark green 315 | \definecolor{TolColor6}{HTML}{999933} % dark brown 316 | \definecolor{TolColor7}{HTML}{DDCC77} % light brown 317 | \definecolor{TolColor8}{HTML}{661100} % dark red 318 | \definecolor{TolColor9}{HTML}{CC6677} % light red 319 | \definecolor{TolColor10}{HTML}{AA4466} % light pink 320 | \definecolor{TolColor11}{HTML}{882255} % dark pink 321 | \definecolor{TolColor12}{HTML}{AA4499} % light purple 322 | 323 | %}}} 324 | %{{{ Color cycles 325 | 326 | \pgfplotscreateplotcyclelist{mbarplot cycle}{% 327 | {draw=TolColor2, fill=TolColor2!70}, 
328 | {draw=TolColor7, fill=TolColor7!70}, 329 | {draw=TolColor4, fill=TolColor4!70}, 330 | {draw=TolColor11, fill=TolColor11!70}, 331 | {draw=TolColor1, fill=TolColor1!70}, 332 | {draw=TolColor8, fill=TolColor8!70}, 333 | {draw=TolColor6, fill=TolColor6!70}, 334 | {draw=TolColor9, fill=TolColor9!70}, 335 | {draw=TolColor10, fill=TolColor10!70}, 336 | {draw=TolColor12, fill=TolColor12!70}, 337 | {draw=TolColor3, fill=TolColor3!70}, 338 | {draw=TolColor5, fill=TolColor5!70}, 339 | } 340 | 341 | \pgfplotscreateplotcyclelist{mlineplot cycle}{% 342 | {TolColor2, mark=*, mark size=1.5pt}, 343 | {TolColor7, mark=square*, mark size=1.3pt}, 344 | {TolColor4, mark=triangle*, mark size=1.5pt}, 345 | {TolColor6, mark=diamond*, mark size=1.5pt}, 346 | } 347 | 348 | %}}} 349 | %{{{ Styles 350 | 351 | \pgfplotsset{ 352 | compat=1.9, 353 | mbaseplot/.style={ 354 | legend style={ 355 | draw=none, 356 | fill=none, 357 | cells={anchor=west}, 358 | }, 359 | x tick label style={ 360 | font=\footnotesize 361 | }, 362 | y tick label style={ 363 | font=\footnotesize 364 | }, 365 | legend style={ 366 | font=\footnotesize 367 | }, 368 | major grid style={ 369 | dotted, 370 | }, 371 | axis x line*=bottom, 372 | }, 373 | mlineplot/.style={ 374 | mbaseplot, 375 | xmajorgrids=true, 376 | ymajorgrids=true, 377 | major grid style={dotted}, 378 | axis x line=bottom, 379 | axis y line=left, 380 | legend style={ 381 | cells={anchor=west}, 382 | draw=none 383 | }, 384 | cycle list name=mlineplot cycle, 385 | }, 386 | mbarplot base/.style={ 387 | mbaseplot, 388 | bar width=6pt, 389 | axis y line*=none, 390 | }, 391 | mbarplot/.style={ 392 | mbarplot base, 393 | ybar, 394 | xmajorgrids=false, 395 | ymajorgrids=true, 396 | area legend, 397 | legend image code/.code={% 398 | \draw[#1] (0cm,-0.1cm) rectangle (0.15cm,0.1cm); 399 | }, 400 | cycle list name=mbarplot cycle, 401 | }, 402 | horizontal mbarplot/.style={ 403 | mbarplot base, 404 | xmajorgrids=true, 405 | ymajorgrids=false, 406 | xbar stacked, 407 | area legend, 408 | legend image code/.code={% 409 | \draw[#1] (0cm,-0.1cm) rectangle (0.15cm,0.1cm); 410 | }, 411 | cycle list name=mbarplot cycle, 412 | }, 413 | disable thousands separator/.style={ 414 | /pgf/number format/.cd, 415 | 1000 sep={} 416 | }, 417 | } 418 | 419 | %}}} 420 | 421 | \mode 422 | 423 | %{{{ misc 424 | \let\otp\titlepage 425 | \renewcommand{\titlepage}{\otp\addtocounter{framenumber}{-1}} 426 | \newcommand{\mreducelistspacing}{\vspace{-\topsep}} 427 | 428 | \linespread{1.15} 429 | 430 | %}}} 431 | -------------------------------------------------------------------------------- /reports/demo.webm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/reports/demo.webm -------------------------------------------------------------------------------- /reports/img/agent_environment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/reports/img/agent_environment.png -------------------------------------------------------------------------------- /reports/img/agent_environment_untitled.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/reports/img/agent_environment_untitled.png 
-------------------------------------------------------------------------------- /reports/img/mdp_imm_rewards.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/reports/img/mdp_imm_rewards.png -------------------------------------------------------------------------------- /reports/img/octocat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/reports/img/octocat.png -------------------------------------------------------------------------------- /reports/img/partial_obs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/reports/img/partial_obs.png -------------------------------------------------------------------------------- /reports/img/pomdp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/reports/img/pomdp.png -------------------------------------------------------------------------------- /reports/img/uncertain_transition.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/reports/img/uncertain_transition.png -------------------------------------------------------------------------------- /reports/pset.cls: -------------------------------------------------------------------------------- 1 | % This is a modified version of the Harvard ML template by Ryan Adams. 2 | 3 | \NeedsTeXFormat{LaTeX2e}[1995/01/01] 4 | \ProvidesClass{pset} 5 | [2013/08/31 v0.01 Harvard ML Assignment Class] 6 | 7 | %############################################################################## 8 | % Base class 9 | %############################################################################## 10 | 11 | \LoadClass[10pt,letterpaper]{article} 12 | 13 | % "(no)submit" argument specifies whether to include credits in header. 
14 | \newif\ifpset@submit 15 | \DeclareOption{submit}{\pset@submittrue} 16 | \DeclareOption{nosubmit}{\pset@submitfalse} 17 | \DeclareOption*{\PassOptionsToClass{\CurrentOption}{article}} 18 | \ExecuteOptions{submit} 19 | \ProcessOptions\relax 20 | 21 | %############################################################################## 22 | % Packages 23 | %############################################################################## 24 | 25 | \RequirePackage{palatino} 26 | \RequirePackage{mathpazo} 27 | \RequirePackage{amsmath} 28 | \RequirePackage{amssymb} 29 | \RequirePackage{amsthm} 30 | %\RequirePackage{fancyhdr} 31 | \RequirePackage{fullpage} 32 | \RequirePackage{graphicx} 33 | \RequirePackage{mdframed} 34 | 35 | %############################################################################## 36 | % Page Headings 37 | %############################################################################## 38 | 39 | %\pagestyle{fancy} 40 | %\fancyhead[L]{\rule[-1.25ex]{0em}{0ex}Tran} 41 | %\fancyhead[C]{\pset@assignment} 42 | %\fancyhead[R]{\thepage} 43 | %\fancyfoot[L]{} 44 | %\fancyfoot[C]{} 45 | %\fancyfoot[R]{} 46 | 47 | %############################################################################## 48 | % Header 49 | %############################################################################## 50 | 51 | \def\titlebar{\hrule height2pt\vskip .25in\vskip-\parskip} 52 | 53 | \newcommand{\headerblock}{% 54 | \noindent\begin{minipage}{0.33\textwidth} 55 | \begin{flushleft} 56 | \ifpset@submit 57 | \mbox{\pset@course}\\ 58 | \mbox{\pset@instructor} 59 | \fi 60 | \end{flushleft} 61 | \end{minipage} 62 | \noindent\begin{minipage}{0.33\textwidth} 63 | \begin{center} 64 | \mbox{\Large\pset@assignment}\protect\\ 65 | Due: \pset@duedate 66 | \end{center} 67 | \end{minipage} 68 | \noindent\begin{minipage}{0.33\textwidth} 69 | \begin{flushright} 70 | \ifpset@submit 71 | \mbox{\pset@name}\\ 72 | \mbox{\pset@email} 73 | \fi 74 | \end{flushright} 75 | \end{minipage} 76 | \vspace{0.1cm} 77 | \titlebar 78 | } 79 | 80 | \AtBeginDocument{\headerblock} 81 | 82 | \def\pset@name{} 83 | \def\pset@email{} 84 | \def\pset@course{} 85 | \def\pset@instructor{} 86 | \def\pset@assignment{} 87 | \def\pset@duedate{} 88 | 89 | % Commands to automatically input info. 
90 | \newcommand{\name}[1]{\def\pset@name{#1}} 91 | \newcommand{\email}[1]{\def\pset@email{#1}} 92 | \newcommand{\course}[1]{\def\pset@course{#1}} 93 | \newcommand{\instructor}[1]{\def\pset@instructor{#1}} 94 | \newcommand{\assignment}[1]{\def\pset@assignment{#1}} 95 | \newcommand{\duedate}[1]{\def\pset@duedate{#1}} 96 | 97 | %############################################################################## 98 | % Environments 99 | %############################################################################## 100 | 101 | \newtheoremstyle{box} 102 | {3pt}% Space above 103 | {3pt}% Space below 104 | {}% Body font 105 | {}% Indent amount 106 | {\bfseries}% Theorem head font 107 | {\\*[3pt]}% Punctuation after theorem head 108 | {.5em}% Space after theorem head 109 | {}% Theorem head spec (can be left empty, meaning `normal') 110 | \theoremstyle{box} 111 | \newmdtheoremenv[skipabove=\topsep,skipbelow=\topsep]{problem}{Problem} 112 | 113 | %############################################################################## 114 | % Misc 115 | % TODO: Organize 116 | %############################################################################## 117 | \RequirePackage{amsfonts,amssymb,amsthm} 118 | \RequirePackage{cancel} 119 | \RequirePackage{centernot} 120 | \RequirePackage{color} 121 | \RequirePackage{enumerate} 122 | \RequirePackage{graphicx} 123 | %\RequirePackage{hyperref} 124 | %\hypersetup{colorlinks=true,urlcolor=blue} 125 | \RequirePackage{listings} 126 | \RequirePackage{mathrsfs} 127 | \RequirePackage{tikz} 128 | %\RequirePackage{tikz-cd} 129 | \usetikzlibrary{patterns,shapes,snakes} 130 | %\RequirePackage[usenames,dvipsnames,svgnames,table]{xcolor} 131 | %\RequirePackage{fontspec, xunicode} 132 | %\setmonofont{Consolas} 133 | 134 | % Emulate markdown's light grey background for monospace. 135 | \usepackage{soul} 136 | \definecolor{Light}{gray}{.96} 137 | \sethlcolor{Light} 138 | \let\OldTexttt\texttt 139 | \renewcommand{\texttt}[1]{\OldTexttt{\hl{#1}}}% will affect all \texttt 140 | 141 | % Use knitr's colorscheme. 
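% These colors feed the \lstset defaults below, so any \begin{lstlisting}...\end{lstlisting} or \lstinputlisting block is rendered in this scheme.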
142 | \definecolor{fgcolor}{rgb}{0.345, 0.345, 0.345} 143 | \definecolor{hlnum}{rgb}{0.686,0.059,0.569} 144 | \definecolor{hlstr}{rgb}{0.192,0.494,0.8} 145 | \definecolor{hlcom}{rgb}{0.678,0.584,0.686} 146 | \definecolor{hlopt}{rgb}{0,0,0} 147 | \definecolor{hlstd}{rgb}{0.345,0.345,0.345} 148 | \definecolor{hlkwa}{rgb}{0.161,0.373,0.58} 149 | \definecolor{hlkwb}{rgb}{0.69,0.353,0.396} 150 | \definecolor{hlkwc}{rgb}{0.333,0.667,0.333} 151 | \definecolor{hlkwd}{rgb}{0.737,0.353,0.396} 152 | \definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969} 153 | 154 | \lstset{ 155 | backgroundcolor=\color{shadecolor}, 156 | basicstyle=\color{hlstd}\ttfamily\footnotesize, 157 | breakatwhitespace=false, 158 | %breaklines=true, 159 | captionpos=b, 160 | commentstyle=\color{hlcom}, 161 | deletekeywords={...}, 162 | escapeinside={\%*}{*)}, 163 | extendedchars=true, 164 | frame=lines, 165 | keepspaces=true, 166 | keywordstyle=\color{hlkwb}, 167 | morekeywords={*,...}, 168 | numbers=left, 169 | numbersep=5pt, 170 | numberstyle=\tiny\color{hlstd}, 171 | rulecolor=\color{hlstd}, 172 | showspaces=false, 173 | showstringspaces=false, 174 | showtabs=false, 175 | stepnumber=1, 176 | stringstyle=\color{hlstr}, 177 | tabsize=2, 178 | title=\lstname 179 | } 180 | 181 | %############################################################################## 182 | % Operator Macros 183 | %############################################################################## 184 | 185 | \newcommand\given[1][]{\:#1\vert\:} 186 | \newcommand{\todo}[1]{\textcolor{red}{xx TODO: #1 xx }} 187 | \newcommand{\eminus}{\text{\sc{e}-}} 188 | \newcommand{\e}{\text{\sc{e}}} 189 | 190 | %############################################################################## 191 | % Environments 192 | %############################################################################## 193 | 194 | \newtheorem{theorem}{Theorem}[] 195 | \newtheorem{definition}[theorem]{Definition} 196 | \newtheorem{assumption}[theorem]{Assumption} 197 | \newtheorem{conjecture}[theorem]{Conjecture} 198 | \newtheorem{claim}[theorem]{Claim} 199 | \newtheorem{lemma}[theorem]{Lemma} 200 | \newtheorem{proposition}[theorem]{Proposition} 201 | \newtheorem{property}[theorem]{Property} 202 | \newtheorem{fact}[theorem]{Fact} 203 | \newtheorem{corollary}[theorem]{Corollary} 204 | \newtheorem{example}[theorem]{Example} 205 | \newtheorem{remark}[theorem]{Remark} 206 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | try: 2 | from setuptools import setup 3 | except ImportError: 4 | from distutils.core import setup 5 | 6 | config = { 7 | 'name': 'bayesrl', 8 | 'description': 'Reinforcement learning using Bayesian approaches', 9 | 'author': 'Dustin Tran', 10 | 'author_email': 'dtran@g.harvard.edu', 11 | 'version': '0.1', 12 | 'packages': ['bayesrl'], 13 | 'scripts': [], 14 | } 15 | 16 | setup(**config) 17 | -------------------------------------------------------------------------------- /tests/gridworld.py: -------------------------------------------------------------------------------- 1 | """ 2 | Prints an example of a grid world. See gridworld.py for a key to the symbols. 
3 | """ 4 | from bayesrl.environments import GridWorld 5 | 6 | maze = GridWorld.samples['larger'] 7 | for row in maze: 8 | print(row) 9 | -------------------------------------------------------------------------------- /tests/thompsongridworld.py: -------------------------------------------------------------------------------- 1 | """ 2 | Solves grid world using Thompson sampling. 3 | """ 4 | 5 | from bayesrl.environments import GridWorld 6 | from bayesrl.agents.thompsonsampagent import ThompsonSampAgent 7 | from bayesrl.trial import Trial 8 | from bayesrl.plot import Plot 9 | 10 | # Define environment. 11 | task = GridWorld( 12 | GridWorld.samples['larger'], 13 | action_error_prob=.1, 14 | rewards={'*': 50, 'moved': -1, 'hit-wall': -1}) 15 | 16 | num_trials = 1 17 | 18 | # Define agent. 19 | # Dirichlet params = 1, Reward params = 50 20 | agent = ThompsonSampAgent( 21 | num_states=task.num_states, num_actions=task.num_actions, 22 | discount_factor=0.95, T=50, dirichlet_param=1, reward_param=50) 23 | trial_thompson1 = Trial(agent, task) 24 | trial_thompson1.run_multiple(num_trials) 25 | 26 | # Plots! 27 | plot = Plot({"Thompson sampling": [trial_thompson1]}) 28 | plot.rewards_by_episode() 29 | -------------------------------------------------------------------------------- /visual/.grid.py.swp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/.grid.py.swp -------------------------------------------------------------------------------- /visual/agent.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python2 2 | import numpy as np 3 | import itertools 4 | 5 | 6 | class Agent(object): 7 | """ 8 | Base class for all reinforcement learning agents to inherit from. 9 | 10 | Parameters 11 | ---------- 12 | grid: the environment the agent acts on. 13 | gamma: float in (0,1] 14 | The discount factor per iteration. 15 | target_reward: reward for getting a target item 16 | reward is -1 on non-target states. 17 | """ 18 | def __init__(self, grid, gamma=.92, target_reward=100, aisle_reward=50): 19 | self.discount_factor = gamma 20 | self.target_reward = target_reward 21 | self.aisle_reward = aisle_reward 22 | self.grid = grid 23 | self.states = [(r,c) for r in range(self.grid.height) for c in range(self.grid.width)] 24 | self.num_actions = len(self.grid.actions) 25 | self.value_table = np.zeros((self.grid.height, self.grid.width, self.num_actions)) 26 | 27 | def _value_iteration(self): 28 | value = np.zeros(self.value_table.shape) 29 | reward = self.get_reward_state() 30 | k = 0 31 | while True: 32 | diff = 0 33 | for s in self.states: 34 | old = np.max(value[s]) 35 | value[s] = [np.sum([p*(reward[s_]+self.discount_factor*np.max(value[s_])) for (s_,p) 36 | in self.grid.transition_probs(s,a).items()]) for a in self.grid.actions] 37 | diff = max(diff, abs(old - np.max(value[s]))) 38 | k += 1 39 | if diff < 1e-2: 40 | break 41 | if k > 1e6: 42 | raise Exception("Value iteration not converging. 
Stopped at 1e6 iterations.") 43 | self.value_table = value 44 | 45 | def next_action(self): 46 | most_likely_state = self.states[self._argmax_breaking_ties_randomly(np.ravel(self.grid.belief))] 47 | next_action = self._argmax_breaking_ties_randomly(self.value_table[most_likely_state]) 48 | return self.grid.actions[next_action] 49 | 50 | def get_reward_state(self): 51 | targets = self.grid.targets 52 | target_states = [] 53 | rewards = np.ones((self.grid.height, self.grid.width))*-1. 54 | aisles_belief = self.grid.aisles_belief 55 | content_belief = self.grid.content_belief 56 | categories = aisles_belief[1].keys() 57 | aisles_configs = [] 58 | aisles_probs = [] 59 | for config in itertools.permutations(categories): 60 | aisles_configs.append(config) 61 | aisle_prob = np.product([aisles_belief[i+1][config[i]] for i in 62 | range(len(config))]) 63 | aisles_probs.append(aisle_prob) 64 | aisles_probs = np.array(aisles_probs)/sum(aisles_probs) 65 | multinomial = np.random.multinomial(1, aisles_probs) 66 | aisles_config = aisles_configs[list(multinomial).index(1)] 67 | items_configs = [] 68 | for category in aisles_config: 69 | shelf_configs = [] 70 | shelf_probs = [] 71 | items = content_belief[category][1].keys() 72 | for config in itertools.permutations(items): 73 | shelf_configs.append(config) 74 | shelf_prob = np.product([content_belief[category][i][config[i]] for i in 75 | range(len(config))]) 76 | shelf_probs.append(shelf_prob) 77 | shelf_probs = np.array(shelf_probs)/sum(shelf_probs) 78 | multinomial = np.random.multinomial(1, shelf_probs) 79 | items_config = shelf_configs[list(multinomial).index(1)] 80 | items_configs.append(items_config) 81 | for t in targets: 82 | if t in items_config: 83 | state = \ 84 | self.grid.aisles_list[aisles_config.index(category)][items_config.index(t)] 85 | target_states.append(state) 86 | for s in target_states: 87 | 88 | a,_ = self.grid.cell_to_aisle(s) 89 | for (r,c) in self.grid.aisles_list[a-1]: 90 | for dr, dc in self.grid.actions: 91 | neighbor = (r+dr, c+dc) 92 | if not self.grid.blocked(neighbor): 93 | if rewards[neighbor] == -1: 94 | rewards[neighbor] = 0 95 | rewards[neighbor] += self.aisle_reward 96 | 97 | for dr, dc in self.grid.actions: 98 | neighbor = (s[0]+dr, s[1]+dc) 99 | if not self.grid.blocked(neighbor): 100 | if rewards[neighbor] == -1: 101 | rewards[neighbor] = 0 102 | rewards[neighbor] += self.target_reward 103 | return rewards 104 | 105 | 106 | # Make sure inherited classes have interact() function. 
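    # A no-op in this visual demo: the agent replans with _value_iteration() before acting instead of learning from the reward signal.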
107 | def interact(self, reward, next_state, next_state_is_terminal): 108 | return 109 | 110 | 111 | def _argmax_breaking_ties_randomly(self, x): 112 | """Taken from Ken.""" 113 | max_value = np.max(x) 114 | indices_with_max_value = np.flatnonzero(x == max_value) 115 | return np.random.choice(indices_with_max_value) 116 | -------------------------------------------------------------------------------- /visual/colors.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python2 2 | import pygame 3 | 4 | black = pygame.Color(0,0,0) 5 | white = pygame.Color(255,255,255) 6 | blue = pygame.Color(0,0,255) 7 | green = pygame.Color(0,255,0) 8 | red = pygame.Color(255,0,0) 9 | nameToColor = { 10 | 'black' : black, 11 | 'white' : white, 12 | 'blue' : blue, 13 | 'green' : green, 14 | 'red' : red 15 | } 16 | 17 | gray = lambda fraction: (lambda c: pygame.Color(c,c,c))(int((1-fraction)*255)) 18 | -------------------------------------------------------------------------------- /visual/debug.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python2 2 | import IPython 3 | from threading import Thread 4 | from grid import SuperMarket 5 | import display 6 | import sys 7 | import pygame 8 | from pygame.locals import * 9 | import agent 10 | 11 | held = None 12 | def event_handler(e): 13 | global held 14 | if e.type == pygame.KEYDOWN and held is None: 15 | if e.key == pygame.K_a: 16 | g.set_robot(a1) 17 | held = 'a' 18 | elif e.key == pygame.K_s: 19 | g.set_robot(a2) 20 | held = 'b' 21 | elif e.key == pygame.K_d: 22 | g.set_robot(a3) 23 | held = 'd' 24 | elif e.key == pygame.K_w: 25 | g.set_robot(a4) 26 | held = 'w' 27 | if held is not None: 28 | print g.observe() 29 | elif e.type == pygame.KEYUP and held: 30 | if e.key == pygame.K_a and held == 'a': 31 | held = None 32 | elif e.key == pygame.K_s and held == 'b': 33 | held = None 34 | elif e.key == pygame.K_d and held == 'd': 35 | held = None 36 | elif e.key == pygame.K_w and held == 'w': 37 | held = None 38 | 39 | g = SuperMarket() 40 | a = agent.Agent(g) 41 | class Count: 42 | pass 43 | def process(every_frames): 44 | counter = Count() 45 | counter.n = 0 46 | counter.success = False 47 | def autonomous_action(): 48 | if counter.n == 0 and len(g.targets) > 0: 49 | counter.success = False 50 | a._value_iteration() 51 | g.set_robot(a.next_action()) 52 | g.observe() 53 | elif len(g.targets) == 0: 54 | if not counter.success: 55 | counter.success = True 56 | print "SUCCESS!!!!!!" 
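        # counter.n cycles modulo every_frames, so the plan/act/observe step above runs only once every every_frames frames.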
57 | counter.n = (counter.n+1)%every_frames 58 | return autonomous_action 59 | 60 | display.event_handler = event_handler 61 | def go(): 62 | display.process = process(10) 63 | 64 | a1,a2,a3,a4 = g.actions 65 | display.drawables1.append(g.draw) 66 | display.drawables2.append(g.draw_belief) 67 | t = Thread(target=display.main, args=[sys.argv]) 68 | t.start() 69 | IPython.embed() 70 | display.done = True 71 | t.join() 72 | -------------------------------------------------------------------------------- /visual/display.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python2 2 | import sys 3 | import pygame 4 | from pygame.locals import * 5 | from colors import * 6 | 7 | FPS = 10 8 | 9 | # WIDTH = 800 10 | # HEIGHT = 500 11 | done = False 12 | 13 | surface = None 14 | drawables1 = [] 15 | drawables2 = [] 16 | 17 | def event_handler(e): 18 | pass 19 | 20 | def process(): 21 | pass 22 | 23 | def main(args): 24 | global done,surface,drawables 25 | pygame.init() 26 | info = pygame.display.Info() 27 | WIDTH = info.current_w-500 28 | HEIGHT = (WIDTH)/2 29 | fpsClock = pygame.time.Clock() 30 | 31 | master_surface = pygame.display.set_mode((WIDTH+100,HEIGHT)) 32 | pygame.display.set_caption("6.834 Simulator") 33 | surface1 = pygame.Surface(((WIDTH/2),HEIGHT)) 34 | surface2 = pygame.Surface(((WIDTH/2),HEIGHT)) 35 | 36 | while not done: 37 | process() 38 | master_surface.fill(black) 39 | surface1.fill(white) 40 | surface2.fill(white) 41 | 42 | list(d(surface1) for d in drawables1) 43 | list(d(surface2) for d in drawables2) 44 | for event in pygame.event.get(): 45 | if event.type == QUIT: 46 | done = True 47 | else: 48 | event_handler(event) 49 | 50 | master_surface.blit(surface1,(0,0)) 51 | master_surface.blit(surface2,(WIDTH/2+100,0)) 52 | fpsClock.tick(FPS) 53 | pygame.display.flip() 54 | 55 | pygame.quit() 56 | return 0 57 | 58 | if __name__ == "__main__": 59 | sys.exit(main(sys.argv)) 60 | -------------------------------------------------------------------------------- /visual/grid.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python2 2 | from threading import Lock 3 | from math import * 4 | import pygame 5 | from pygame.locals import * 6 | from colors import * 7 | import random 8 | import IPython 9 | 10 | class Grid(object): 11 | def __init__(self, height, width, aisles, robot): 12 | self.height = height 13 | self.width = width 14 | self.aisles = set(aisles) 15 | self.robot = robot 16 | self.l = Lock() 17 | self.actions = [(0,-1),(1,0),(0,1),(-1,0)] 18 | self.p_error = 0. 19 | self.belief = [[1. if (r,c) == robot else 0. 
for c in range(width)] for r in range(height)] 20 | 21 | def action_errors(self,action): 22 | return [action] 23 | 24 | def blocked(self, (r,c)): 25 | return not (0 <= r < self.height and 0 <= c < self.width) or (r,c) in self.aisles 26 | 27 | def set_robot(self,action): 28 | with self.l: 29 | (r,c) = self.robot 30 | if action in self.actions: 31 | new_belief = self.transition_update(self.belief,action) 32 | with self.l: 33 | self.belief = new_belief 34 | errors = self.action_errors(action) 35 | if random.random() <= self.p_error: 36 | action = random.choice(errors) 37 | (dr,dc) = action 38 | (nr,nc) = (r+dr,c+dc) 39 | if not self.blocked((nr,nc)): 40 | with self.l: 41 | self.robot = (nr,nc) 42 | 43 | def transition_probs(self,state,action): 44 | t_probs = {} 45 | r, c = state 46 | errors = self.action_errors(action) 47 | dr,dc = action 48 | new_state = (r+dr,c+dc) if not self.blocked((r+dr,c+dc)) else (r,c) 49 | t_probs[new_state] = 1. - self.p_error 50 | for (dr,dc) in errors: 51 | new_state = (r+dr,c+dc) if not self.blocked((r+dr,c+dc)) else (r,c) 52 | if new_state in t_probs: 53 | t_probs[new_state] += self.p_error/len(errors) 54 | else: 55 | t_probs[new_state] = self.p_error/len(errors) 56 | return t_probs 57 | 58 | def transition_update(self,belief,action): 59 | new_belief = [[0. for c in range(self.width)] for r in range(self.height)] 60 | for r in range(self.height): 61 | for c in range(self.width): 62 | t_probs = self.transition_probs((r,c),action) 63 | for (nr,nc) in t_probs: 64 | new_belief[nr][nc] += t_probs[(nr,nc)]*self.belief[r][c] 65 | return new_belief 66 | 67 | def dimensions(self,surface): 68 | pix_height = surface.get_height() 69 | pix_width = surface.get_width() 70 | 71 | row_height = int(pix_height/self.height) 72 | col_width = int(pix_width/self.width) 73 | 74 | return pix_height,pix_width,row_height,col_width 75 | 76 | def draw(self,surface,robot=True): 77 | pix_height,pix_width,row_height,col_width = self.dimensions(surface) 78 | # Draw rows 79 | # 80 | for r in range(1,self.height): 81 | pygame.draw.line(surface,black,(0,r*row_height),(pix_width,r*row_height)) 82 | # Draw columns 83 | # 84 | for c in range(1,self.width): 85 | pygame.draw.line(surface,black,(c*col_width,0),(c*col_width,pix_height)) 86 | 87 | # Draw the aisles 88 | # 89 | for (r,c) in self.aisles: 90 | surface.fill(black, rect=(c*col_width,r*row_height,col_width,row_height)) 91 | 92 | if robot: 93 | with self.l: 94 | (r,c) = self.robot 95 | (x,y) = int((c+0.5)*col_width),int((r+0.5)*row_height) 96 | radius = int(min(row_height,col_width)/2.0) 97 | pygame.draw.circle(surface,red,(x,y),radius,10) 98 | 99 | class SuperMarket(Grid): 100 | def __init__(self): 101 | self.aisles_content = { 102 | 'meats': ['chicken','beef','pork','turkey'], 103 | 'candy': ['oreo','twix','nutella','kitkat'], 104 | 'dairy': ['milk','iscream','butter','curd'], 105 | 'drink': ['water','juice','soda','smoothi'], 106 | 'grain': ['rice','flour','barley', 'beans'], 107 | 'pasta': ['penne','fusilli','farfalle','lasagna'] 108 | } 109 | 110 | self.aisles_list = [ 111 | [(i+1,2*n+1) for i in range(len(self.aisles_content.values()[0]))] 112 | for n in range(len(self.aisles_content)) 113 | ] 114 | aisles = [cell for a in self.aisles_list for cell in a] 115 | 116 | width = len(self.aisles_list * 2) + 1 117 | height = len(self.aisles_list[0])+3 118 | possible_robot = [(0,0),(height-1,width-1)] 119 | robot = random.choice(possible_robot) 120 | possible_robot = set(possible_robot) 121 | 122 | 
super(SuperMarket,self).__init__(height,width,aisles,robot) 123 | 124 | self.belief = [[1./len(possible_robot) if (r,c) in possible_robot else 0. 125 | for c in range(width)] for r in range(height)] 126 | self.actions = [(0,-1),(1,0),(0,1),(-1,0)] 127 | self.p_error = 0.05 128 | 129 | self.targets = set(['oreo','iscream','milk']) 130 | 131 | for c in self.aisles_content: 132 | random.shuffle(self.aisles_content[c]) 133 | aisles_order = self.aisles_list[:] 134 | random.shuffle(aisles_order) 135 | self.obs = {} 136 | for prods,aisle in zip(self.aisles_content.values(),aisles_order): 137 | self.obs.update(zip(aisle,prods)) 138 | 139 | # Aisle belief state 140 | # 141 | self.aisles_belief = {} 142 | aisles_types = self.aisles_content.keys() 143 | for a in range(len(self.aisles_list)): 144 | self.aisles_belief[a+1] = dict(zip( 145 | aisles_types, 146 | [1./len(aisles_types)]*len(aisles_types))) 147 | 148 | # Inner aisle belief state 149 | # 150 | inner = lambda prods: dict((p,1./len(prods)) for p in prods) 151 | self.content_belief = dict( 152 | (cat,dict(enumerate([inner(prods) for _ in prods]))) 153 | for cat,prods in self.aisles_content.items() 154 | ) 155 | 156 | self.images = { 157 | 'meats' : pygame.image.load("images/meats.jpg"), 158 | 'candy' : pygame.image.load("images/candy.gif"), 159 | 'dairy' : pygame.image.load("images/dairy.jpg"), 160 | 'drink' : pygame.image.load("images/drink.jpg"), 161 | 'grain' : pygame.image.load("images/grain.jpg"), 162 | 'pasta' : pygame.image.load("images/pasta.jpg"), 163 | 164 | 'chicken' : pygame.image.load("images/chicken.jpg"), 165 | 'pork' : pygame.image.load("images/pork.jpg"), 166 | 'turkey' : pygame.image.load("images/turkey.gif"), 167 | 'beef' : pygame.image.load("images/beef.jpg"), 168 | 169 | 'oreo' : pygame.image.load("images/oreo.jpg"), 170 | 'twix' : pygame.image.load("images/twix.jpg"), 171 | 'nutella' : pygame.image.load("images/nutella.jpg"), 172 | 'kitkat' : pygame.image.load("images/kitkat.jpg"), 173 | 174 | 'milk' : pygame.image.load("images/milk.jpg"), 175 | 'curd' : pygame.image.load("images/curd.jpg"), 176 | 'iscream' : pygame.image.load("images/iscream.jpg"), 177 | 'butter' : pygame.image.load("images/butter.jpg"), 178 | 179 | 'water' : pygame.image.load("images/water.jpg"), 180 | 'juice' : pygame.image.load("images/juice.jpg"), 181 | 'soda' : pygame.image.load("images/soda.jpg"), 182 | 'smoothi' : pygame.image.load("images/smoothi.jpg"), 183 | 184 | 'rice' : pygame.image.load("images/rice.jpg"), 185 | 'flour' : pygame.image.load("images/flour.jpg"), 186 | 'barley' : pygame.image.load("images/barley.jpg"), 187 | 'beans' : pygame.image.load("images/beans.jpg"), 188 | 189 | 'penne' : pygame.image.load("images/penne.jpg"), 190 | 'fusilli' : pygame.image.load("images/fusilli.jpg"), 191 | 'farfalle': pygame.image.load("images/farfalle.jpg"), 192 | 'lasagna' : pygame.image.load("images/lasagna.jpg") 193 | } 194 | 195 | def cell_to_aisle(self,(r,c)): 196 | for i in range(len(self.aisles_list)): 197 | if (r,c) in self.aisles_list[i]: 198 | return (i+1,self.aisles_list[i].index((r,c))) 199 | return None 200 | 201 | def category(self, product): 202 | for c in self.aisles_content: 203 | if product in self.aisles_content[c]: 204 | return c 205 | return None 206 | 207 | transformed = False 208 | def draw(self,surface): 209 | pix_height,pix_width,row_height,col_width = self.dimensions(surface) 210 | super(SuperMarket,self).draw(surface) 211 | if not self.transformed: 212 | for prod in self.images: 213 | img = self.images[prod] 214 | 
self.images[prod] = pygame.transform.scale(img.convert(),(col_width,row_height)) 215 | self.transformed = True 216 | for (r,c) in self.aisles: 217 | prod = self.obs[(r,c)] 218 | img = self.images.get(prod,None) 219 | if img is not None: 220 | surface.blit(img, dest=(c*col_width,r*row_height)) 221 | 222 | def draw_belief(self,surface): 223 | with self.l: 224 | belief = [r[:] for r in self.belief] 225 | pix_height,pix_width,row_height,col_width = self.dimensions(surface) 226 | for r in range(self.height): 227 | for c in range(self.width): 228 | logB = log(belief[r][c]) if belief[r][c] != 0 else -12 229 | surface.fill(gray(max((logB+12)/12.0,0)), 230 | rect=(c*col_width,r*row_height,col_width,row_height)) 231 | 232 | super(SuperMarket,self).draw(surface,False) 233 | 234 | if not self.transformed: 235 | for prod in self.images: 236 | img = self.images[prod] 237 | self.images[prod] = pygame.transform.scale(img.convert(),(col_width,row_height)) 238 | self.transformed = True 239 | 240 | for (r,c) in self.aisles: 241 | # Are we more than 50% certain about any product here? 242 | found = False 243 | a,p = self.cell_to_aisle((r,c)) 244 | for cat in self.aisles_belief[a]: 245 | prob_cat = self.aisles_belief[a][cat] 246 | if prob_cat > 0.5: 247 | img = self.images.get(cat,None) 248 | found = True 249 | for prod in self.content_belief[cat][p]: 250 | prob = prob_cat*\ 251 | self.content_belief[cat][p][prod] 252 | if prob > 0.5: 253 | img = self.images.get(prod,None) 254 | break 255 | break 256 | if not found: 257 | continue 258 | surface.blit(img, dest=(c*col_width,r*row_height)) 259 | 260 | def action_errors(self,action): 261 | i = self.actions.index(action) 262 | l = len(self.actions) 263 | return self.actions[(i-1)%l],self.actions[(i+1)%l],(0,0) 264 | 265 | def observe(self): 266 | with self.l: 267 | (r,c) = self.robot 268 | obs = () 269 | for dr,dc in [(0,-1),(1,0),(0,1),(-1,0)]: 270 | obs += (self.obs.get((r+dr,c+dc),None),) 271 | self.observation_update(obs) 272 | list(self.targets.discard(o) for o in obs) 273 | return obs 274 | 275 | def observation_update(self, observation): 276 | with self.l: 277 | belief = [r[:] for r in self.belief] 278 | 279 | # Update position belief 280 | obs_cells = [(obs,(r+dr,c+dc),(r,c)) 281 | for r in range(self.height) 282 | for c in range(self.width) 283 | for (obs,(dr,dc)) in zip(observation, [(0,-1),(1,0),(0,1),(-1,0)]) 284 | if belief[r][c] != 0 285 | ] # (obs,(row,col),parent) 286 | for (obs,neigh,(r,c)) in obs_cells: 287 | if self.cell_to_aisle(neigh) is None: 288 | if obs is not None: 289 | belief[r][c] = 0 290 | else: 291 | if obs is None: 292 | belief[r][c] = 0 293 | else: 294 | aisle,pos = self.cell_to_aisle(neigh) 295 | cat = self.category(obs) 296 | belief[r][c] *= self.aisles_belief[aisle][cat] * self.content_belief[cat][pos][obs] 297 | Z = sum(b for r in belief for b in r) 298 | belief = [[b/Z for b in r] for r in belief] 299 | with self.l: 300 | self.belief = belief 301 | if not all(o is None for o in observation): 302 | obs_cells = [(obs,(r+dr,c+dc),belief[r][c],(r,c)) 303 | for r in range(self.height) 304 | for c in range(self.width) 305 | for (obs,(dr,dc)) in zip(observation, [(0,-1),(1,0),(0,1),(-1,0)]) 306 | if obs is not None 307 | ] # (obs,(row,col),prob,parent) 308 | self.observation_world_update(obs_cells) 309 | 310 | def observation_world_update(self,obs_cells): 311 | with self.l: 312 | belief = [r[:] for r in self.belief] 313 | # Update world belief 314 | op = {} 315 | ac = {} 316 | for (obs,neigh,prob,(r,c)) in obs_cells: 317 | if 
self.cell_to_aisle(neigh) is None: 318 | continue 319 | aisle,pos = self.cell_to_aisle(neigh) 320 | cat = self.category(obs) 321 | 322 | if obs not in op: 323 | op[obs] = {} 324 | op[obs][pos] = op[obs].get(pos,0)+prob 325 | 326 | if aisle not in ac: 327 | ac[aisle] = {} 328 | ac[aisle][cat] = ac[aisle].get(cat,0)+prob 329 | 330 | # Update aisle beliefs 331 | # 332 | ab = self.aisles_belief 333 | for a in ac: 334 | for cat in ac[a]: 335 | prob = ac[a][cat] 336 | Z1 = ab[a][cat] 337 | Z2 = 1-Z1 338 | if Z1 == 0 or Z2 == 0: 339 | continue 340 | for c in ab[a]: 341 | ab[a][c] *= prob/Z1 if (c==cat) else (1.-prob)/Z2 342 | 343 | # Update aisle position beliefs 344 | # 345 | for obs in op: 346 | cat = self.category(obs) 347 | cb = self.content_belief[cat] 348 | for p in cb: 349 | prob = op[obs][p] 350 | Z1 = cb[p][obs] 351 | Z2 = 1-Z1 352 | if Z1 == 0 or Z2 == 0: 353 | continue 354 | for o in cb[p]: 355 | cb[p][o] *= prob/Z1 if (o==obs) else (1.-prob)/Z2 356 | -------------------------------------------------------------------------------- /visual/images/barley.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/barley.jpg -------------------------------------------------------------------------------- /visual/images/beans.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/beans.jpg -------------------------------------------------------------------------------- /visual/images/beef.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/beef.jpg -------------------------------------------------------------------------------- /visual/images/butter.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/butter.jpg -------------------------------------------------------------------------------- /visual/images/candy.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/candy.gif -------------------------------------------------------------------------------- /visual/images/chicken.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/chicken.jpg -------------------------------------------------------------------------------- /visual/images/curd.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/curd.jpg -------------------------------------------------------------------------------- /visual/images/dairy.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/dairy.jpg -------------------------------------------------------------------------------- /visual/images/drink.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/drink.jpg -------------------------------------------------------------------------------- /visual/images/farfalle.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/farfalle.jpg -------------------------------------------------------------------------------- /visual/images/flour.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/flour.jpg -------------------------------------------------------------------------------- /visual/images/fusilli.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/fusilli.jpg -------------------------------------------------------------------------------- /visual/images/grain.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/grain.jpg -------------------------------------------------------------------------------- /visual/images/iscream.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/iscream.jpg -------------------------------------------------------------------------------- /visual/images/juice.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/juice.jpg -------------------------------------------------------------------------------- /visual/images/kitkat.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/kitkat.jpg -------------------------------------------------------------------------------- /visual/images/lasagna.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/lasagna.jpg -------------------------------------------------------------------------------- /visual/images/meats.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/meats.jpg -------------------------------------------------------------------------------- /visual/images/milk.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/milk.jpg -------------------------------------------------------------------------------- /visual/images/nutella.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/nutella.jpg -------------------------------------------------------------------------------- /visual/images/oreo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/oreo.jpg -------------------------------------------------------------------------------- /visual/images/pasta.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/pasta.jpg -------------------------------------------------------------------------------- /visual/images/penne.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/penne.jpg -------------------------------------------------------------------------------- /visual/images/pork.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/pork.jpg -------------------------------------------------------------------------------- /visual/images/rice.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/rice.jpg -------------------------------------------------------------------------------- /visual/images/smoothi.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/smoothi.jpg -------------------------------------------------------------------------------- /visual/images/soda.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/soda.jpg -------------------------------------------------------------------------------- /visual/images/turkey.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/turkey.gif -------------------------------------------------------------------------------- /visual/images/twix.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/twix.jpg -------------------------------------------------------------------------------- /visual/images/water.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dustinvtran/bayesrl/8995357d26fed5d38d7e061a1dc5d2b65b5c3c76/visual/images/water.jpg --------------------------------------------------------------------------------
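For reference, here is a minimal sketch of how the pieces under `visual/` fit together when run headlessly (no pygame window). It follows the same plan-act-observe loop that `visual/debug.py` wires into the display, and it is not part of the repository: it assumes Python 2 with pygame and IPython installed (both are imported by `grid.py`), that it is started from inside `visual/` so the relative `images/` paths resolve, and the 500-step cap and final prints are illustrative additions.

```{python}
# Headless driver for the SuperMarket POMDP demo (a sketch; not in the repo).
# Mirrors the autonomous_action() loop in visual/debug.py without the display.
from grid import SuperMarket
import agent

g = SuperMarket()          # random shelf layout, robot starts in one of two corners
a = agent.Agent(g)         # value-iteration agent that plans on the belief state

steps = 0
while len(g.targets) > 0 and steps < 500:   # 500 is an arbitrary safety cap
    a._value_iteration()                    # replan against sampled shelf beliefs
    g.set_robot(a.next_action())            # act greedily from the most likely state
    g.observe()                             # Bayes-update position and world beliefs
    steps += 1

if len(g.targets) == 0:
    print "Found all targets in %d steps." % steps
else:
    print "Gave up after %d steps." % steps
```

In `debug.py`, the same loop is attached to the pygame display by setting `display.process = process(10)` via `go()`, so planning and acting happen once every ten rendered frames instead of every iteration.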