├── .gitignore
├── MDP.png
├── README.md
├── reinforce
│   ├── __init__.py
│   ├── encoding.py
│   ├── learn.py
│   ├── policy.py
│   ├── rewards.py
│   └── transitions.py
└── setup.py

/.gitignore:
--------------------------------------------------------------------------------
*.pyc
*.egg-info
dist
.DS_Store
--------------------------------------------------------------------------------
/MDP.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NathanEpstein/reinforce/06a698c91da1b4acd59a4fac29e64c08f27fb12c/MDP.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# reinforce

![MDP](MDP.png)

A 'plug and play' reinforcement learning library in Python.

Infers a Markov Decision Process from data and solves for the optimal policy.

Implementation based on Andrew Ng's notes.

More information related to this project can be found here.

## Example Usage

```python
observations = [
  {
    'state_transitions': [
      { 'state': 'low', 'action': 'climb', 'state_': 'mid' },
      { 'state': 'mid', 'action': 'climb', 'state_': 'high' },
      { 'state': 'high', 'action': 'sink', 'state_': 'mid' },
      { 'state': 'mid', 'action': 'sink', 'state_': 'low' },
      { 'state': 'low', 'action': 'sink', 'state_': 'bottom' }
    ],
    'reward': 0
  },
  {
    'state_transitions': [
      { 'state': 'low', 'action': 'climb', 'state_': 'mid' },
      { 'state': 'mid', 'action': 'climb', 'state_': 'high' },
      { 'state': 'high', 'action': 'climb', 'state_': 'top' }
    ],
    'reward': 0
  }
]

trap_states = [
  {
    'state_transitions': [
      { 'state': 'bottom', 'action': 'sink', 'state_': 'bottom' },
      { 'state': 'bottom', 'action': 'climb', 'state_': 'bottom' }
    ],
    'reward': 0
  },
  {
    'state_transitions': [
      { 'state': 'top', 'action': 'sink', 'state_': 'top' },
      { 'state': 'top', 'action': 'climb', 'state_': 'top' }
    ],
    'reward': 1
  }
]

from reinforce import MarkovAgent

mark = MarkovAgent(observations + trap_states)
mark.learn()

print(mark.policy)
# {'high': 'climb', 'top': 'sink', 'bottom': 'sink', 'low': 'climb', 'mid': 'climb'}
# NOTE: the policy in the 'top' and 'bottom' trap states is arbitrary -- no action changes the state.
```
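
The learned policy is a plain dictionary mapping states to actions, so it can be dropped straight into a control loop. Below is a minimal sketch of that; the `TRANSITIONS` table and `rollout` helper are illustrative only and not part of the library:

```python
# Illustrative only: a hand-written transition table for the toy MDP above.
TRANSITIONS = {
    ('low', 'climb'): 'mid',       ('low', 'sink'): 'bottom',
    ('mid', 'climb'): 'high',      ('mid', 'sink'): 'low',
    ('high', 'climb'): 'top',      ('high', 'sink'): 'mid',
    ('top', 'climb'): 'top',       ('top', 'sink'): 'top',
    ('bottom', 'climb'): 'bottom', ('bottom', 'sink'): 'bottom',
}

def rollout(policy, start, steps=5):
    """Follow the learned policy for a few steps and return the visited states."""
    state, path = start, [start]
    for _ in range(steps):
        state = TRANSITIONS[(state, policy[state])]
        path.append(state)
    return path

print(rollout(mark.policy, 'low'))
# ['low', 'mid', 'high', 'top', 'top', 'top'] -- the agent climbs toward the rewarding 'top' state
```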
--------------------------------------------------------------------------------
/reinforce/__init__.py:
--------------------------------------------------------------------------------
from .learn import *
--------------------------------------------------------------------------------
/reinforce/encoding.py:
--------------------------------------------------------------------------------
class StateActionEncoder:
    def __init__(self, observations):
        self.observations = observations
        self._parse_states_and_actions()

    def parse_dimensions(self):
        return {
            'state_count': len(self.int_to_state),
            'action_count': len(self.int_to_action)
        }

    def observations_to_int(self):
        # replace state and action labels with their integer encodings, in place
        for observation in self.observations:
            for transition in observation['state_transitions']:
                transition['state'] = self.state_to_int[transition['state']]
                transition['state_'] = self.state_to_int[transition['state_']]
                transition['action'] = self.action_to_int[transition['action']]

    def parse_encoded_policy(self, encoded_policy):
        policy = {}
        for index, encoded_action in enumerate(encoded_policy):
            state = self.int_to_state[index]
            action = self.int_to_action[int(encoded_action)]
            policy[state] = action

        return policy

    def _parse_states_and_actions(self):
        state_dict, action_dict = {}, {}
        state_array, action_array = [], []
        state_index, action_index = 0, 0

        for observation in self.observations:
            for transition in observation['state_transitions']:
                state = transition['state']
                action = transition['action']

                if state not in state_dict:
                    state_dict[state] = state_index
                    state_array.append(state)
                    state_index += 1

                if action not in action_dict:
                    action_dict[action] = action_index
                    action_array.append(action)
                    action_index += 1

        self.state_to_int = state_dict
        self.action_to_int = action_dict
        self.int_to_state = state_array
        self.int_to_action = action_array
--------------------------------------------------------------------------------
/reinforce/learn.py:
--------------------------------------------------------------------------------
from .encoding import StateActionEncoder
from .rewards import RewardParser
from .transitions import TransitionParser
from .policy import PolicyParser

class MarkovAgent:
    def __init__(self, observations):
        # encode observation data as int values
        self.state_action_encoder = StateActionEncoder(observations)
        self.state_action_encoder.observations_to_int()
        dimensions = self.state_action_encoder.parse_dimensions()

        # create reward, transition, and policy parsers
        self.reward_parser = RewardParser(observations, dimensions)
        self.transition_parser = TransitionParser(observations, dimensions)
        self.policy_parser = PolicyParser(dimensions)

    def learn(self):
        R = self.reward_parser.rewards()
        P = self.transition_parser.transition_probabilities()

        # learn int-encoded policy and convert to readable dictionary
        encoded_policy = self.policy_parser.policy(P, R)
        self.policy = self.state_action_encoder.parse_encoded_policy(encoded_policy)
--------------------------------------------------------------------------------
/reinforce/policy.py:
--------------------------------------------------------------------------------
import numpy as np

class PolicyParser:
    def __init__(self, dimensions):
        self.state_count = dimensions['state_count']
        self.action_count = dimensions['action_count']

    def policy(self, P, rewards):
        print('COMPUTING POLICY')

        best_policy = np.zeros(self.state_count)
        state_values = np.zeros(self.state_count)

        GAMMA = 0.9
        ITERATIONS = 125

        # value iteration: repeatedly back up the best one-step lookahead value
        for i in range(ITERATIONS):
            print("iteration: {0} / {1}".format(i + 1, ITERATIONS))

            for state in range(self.state_count):
                state_value = -float('inf')

                for action in range(self.action_count):
                    # expected discounted value of taking `action` in `state`
                    action_value = 0
                    for state_ in range(self.state_count):
                        action_value += P[state][action][state_] * state_values[state_] * GAMMA

                    if action_value >= state_value:
                        state_value = action_value
                        best_policy[state] = action

                state_values[state] = rewards[state] + state_value

        return best_policy
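
# For reference -- an illustrative, standalone sketch (hypothetical example, not
# library code): the `policy` method above is standard value iteration. Each
# sweep applies the Bellman backup
#     V(s) <- R(s) + GAMMA * max_a sum_s' P(s, a, s') * V(s')
# The snippet below performs the same backup in vectorized form on a made-up
# two-state chain, purely to make the update explicit; the array shapes mirror
# the P and rewards arrays consumed by PolicyParser.
if __name__ == '__main__':
    import numpy as np

    # P[s, a, s']: from state 0, action 1 moves to state 1; state 1 is absorbing and rewarding.
    P = np.array([[[1.0, 0.0], [0.0, 1.0]],
                  [[0.0, 1.0], [0.0, 1.0]]])
    R = np.array([0.0, 1.0])
    GAMMA = 0.9

    V = np.zeros(2)
    for _ in range(200):
        # expected next-state value for every (state, action) pair, then max over actions
        V = R + GAMMA * np.einsum('sat,t->sa', P, V).max(axis=1)

    print(V)  # approximately [9.0, 10.0]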
--------------------------------------------------------------------------------
/reinforce/rewards.py:
--------------------------------------------------------------------------------
import numpy as np

class RewardParser:
    def __init__(self, observations, dimensions):
        self.observations = observations
        self.state_count = dimensions['state_count']

    def rewards(self):
        print('COMPUTING REWARDS')
        total_state_rewards = np.zeros(self.state_count)
        total_state_visits = np.zeros(self.state_count)

        for observation in self.observations:
            # spread each observation's reward evenly across the states it visited
            visits = float(len(observation['state_transitions']))
            reward_per_visit = observation['reward'] / visits

            for state_transition in observation['state_transitions']:
                state = state_transition['state']
                total_state_rewards[state] += reward_per_visit
                total_state_visits[state] += 1

        # unvisited states produce 0/0 = nan, which nan_to_num maps to a reward of 0
        average_state_rewards = total_state_rewards / total_state_visits
        average_state_rewards = np.nan_to_num(average_state_rewards)

        return average_state_rewards
--------------------------------------------------------------------------------
/reinforce/transitions.py:
--------------------------------------------------------------------------------
import numpy as np

class TransitionParser:
    def __init__(self, observations, dimensions):
        self.observations = observations
        self.state_count = dimensions['state_count']
        self.action_count = dimensions['action_count']

    def transition_probabilities(self):
        print('COMPUTING TRANSITIONS')
        transition_count = self._count_transitions()
        return self._parse_probabilities(transition_count)

    def _count_transitions(self):
        transition_count = np.zeros((self.state_count, self.action_count, self.state_count))

        for observation in self.observations:
            for state_transition in observation['state_transitions']:
                state = state_transition['state']
                action = state_transition['action']
                state_ = state_transition['state_']

                transition_count[state][action][state_] += 1

        return transition_count

    def _parse_probabilities(self, transition_count):
        P = np.zeros((self.state_count, self.action_count, self.state_count))

        for state in range(self.state_count):
            for action in range(self.action_count):
                total_transitions = float(sum(transition_count[state][action]))

                if total_transitions > 0:
                    P[state][action] = transition_count[state][action] / total_transitions
                else:
                    # unseen (state, action) pairs default to a uniform distribution over next states
                    P[state][action] = 1.0 / self.state_count

        return P
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup

setup(
    name='reinforce',
    version='0.2.0',
    description='plug and play reinforcement learning',
    url='http://github.com/nathanepstein/reinforce',
    author='Nathan Epstein',
    author_email='ne2210@columbia.edu',
    license='MIT',
    packages=['reinforce'],
)
--------------------------------------------------------------------------------