├── .gitignore
├── MDP.png
├── README.md
├── reinforce
│   ├── __init__.py
│   ├── encoding.py
│   ├── learn.py
│   ├── policy.py
│   ├── rewards.py
│   └── transitions.py
└── setup.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | *.egg-info
3 | dist
4 | .DS_Store
5 |
--------------------------------------------------------------------------------
/MDP.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NathanEpstein/reinforce/06a698c91da1b4acd59a4fac29e64c08f27fb12c/MDP.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # reinforce
2 |
3 | ![MDP](MDP.png)
4 | 
5 | A 'plug and play' reinforcement learning library in Python.
6 |
7 | Infers a Markov Decision Process (MDP) from observed state transitions and rewards, then solves for the optimal policy via value iteration (the update rule is sketched just after this file).
8 | 
9 | Implementation based on Andrew Ng's lecture notes on MDPs.
10 |
11 | More information related to this project can be found here.
12 |
13 | ## Example Usage
14 |
15 | ```python
16 |
17 | observations = [
18 |     { 'state_transitions': [
19 |         { 'state': 'low', 'action': 'climb', 'state_': 'mid' },
20 |         { 'state': 'mid', 'action': 'climb', 'state_': 'high' },
21 |         { 'state': 'high', 'action': 'sink', 'state_': 'mid' },
22 |         { 'state': 'mid', 'action': 'sink', 'state_': 'low' },
23 |         { 'state': 'low', 'action': 'sink', 'state_': 'bottom' }
24 |       ],
25 |       'reward': 0
26 |     },
27 |     { 'state_transitions': [
28 |         { 'state': 'low', 'action': 'climb', 'state_': 'mid' },
29 |         { 'state': 'mid', 'action': 'climb', 'state_': 'high' },
30 |         { 'state': 'high', 'action': 'climb', 'state_': 'top' }
31 |       ],
32 |       'reward': 0
33 |     }
34 | ]
35 | 
36 | trap_states = [
37 |     {
38 |       'state_transitions': [
39 |         { 'state': 'bottom', 'action': 'sink', 'state_': 'bottom' },
40 |         { 'state': 'bottom', 'action': 'climb', 'state_': 'bottom' }
41 |       ],
42 |       'reward': 0
43 |     },
44 |     {
45 |       'state_transitions': [
46 |         { 'state': 'top', 'action': 'sink', 'state_': 'top' },
47 |         { 'state': 'top', 'action': 'climb', 'state_': 'top' }
48 |       ],
49 |       'reward': 1
50 |     }
51 | ]
52 | 
53 | from reinforce import MarkovAgent
54 | mark = MarkovAgent(observations + trap_states)
55 | mark.learn()
56 |
57 | print(mark.policy)
58 | # {'high': 'climb', 'top': 'sink', 'bottom': 'sink', 'low': 'climb', 'mid': 'climb'}
59 | # NOTE: the policy in the top and bottom states is arbitrary (neither action changes the state)
60 |
61 | ```
62 |
--------------------------------------------------------------------------------
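The "optimal policy" above is computed by value iteration over the inferred MDP (see `reinforce/policy.py` below). In standard MDP notation, with per-state rewards R, estimated transition probabilities P, and discount factor γ = 0.9, each of the 125 sweeps applies the backup

```latex
V(s) \leftarrow R(s) + \gamma \max_{a} \sum_{s'} P(s' \mid s, a)\, V(s'),
\qquad
\pi(s) = \arg\max_{a} \sum_{s'} P(s' \mid s, a)\, V(s')
```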
/reinforce/__init__.py:
--------------------------------------------------------------------------------
1 | from .learn import *
--------------------------------------------------------------------------------
/reinforce/encoding.py:
--------------------------------------------------------------------------------
1 | class StateActionEncoder:
2 |     def __init__(self, observations):
3 |         self.observations = observations
4 |         self._parse_states_and_actions()
5 | 
6 |     def parse_dimensions(self):
7 |         return {
8 |             'state_count': len(self.int_to_state),
9 |             'action_count': len(self.int_to_action)
10 |         }
11 | 
12 |     def observations_to_int(self):
13 |         for observation in self.observations:
14 |             for transition in observation['state_transitions']:
15 |                 transition['state'] = self.state_to_int[transition['state']]
16 |                 transition['state_'] = self.state_to_int[transition['state_']]
17 |                 transition['action'] = self.action_to_int[transition['action']]
18 | 
19 |     def parse_encoded_policy(self, encoded_policy):
20 |         policy = {}
21 |         for index, encoded_action in enumerate(encoded_policy):
22 |             state = self.int_to_state[index]
23 |             action = self.int_to_action[int(encoded_action)]
24 |             policy[state] = action
25 | 
26 |         return policy
27 | 
28 |     def _parse_states_and_actions(self):
29 |         state_dict, action_dict = {}, {}
30 |         state_array, action_array = [], []
31 |         state_index, action_index = 0, 0
32 | 
33 |         for observation in self.observations:
34 |             for transition in observation['state_transitions']:
35 |                 state = transition['state']
36 |                 action = transition['action']
37 | 
38 |                 if state not in state_dict:
39 |                     state_dict[state] = state_index
40 |                     state_array.append(state)
41 |                     state_index += 1
42 | 
43 |                 if action not in action_dict:
44 |                     action_dict[action] = action_index
45 |                     action_array.append(action)
46 |                     action_index += 1
47 | 
48 |         self.state_to_int = state_dict
49 |         self.action_to_int = action_dict
50 |         self.int_to_state = state_array
51 |         self.int_to_action = action_array
52 |
53 |
--------------------------------------------------------------------------------
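A minimal sketch (hypothetical data, not from the repository) of how `StateActionEncoder` is used inside `MarkovAgent`: labels are mapped to integer indices in place, and an int-encoded policy is decoded back to labels. Note that only states appearing as a source `state` are registered, which is why the README supplies the terminal `top`/`bottom` states as explicit observations.

```python
from reinforce.encoding import StateActionEncoder

# Hypothetical two-state, two-action observation.
observations = [{
    'state_transitions': [
        {'state': 'low', 'action': 'climb', 'state_': 'high'},
        {'state': 'high', 'action': 'sink', 'state_': 'low'},
    ],
    'reward': 1
}]

encoder = StateActionEncoder(observations)
encoder.observations_to_int()                    # mutates `observations` in place
print(encoder.parse_dimensions())                # {'state_count': 2, 'action_count': 2}
print(observations[0]['state_transitions'][0])   # {'state': 0, 'action': 0, 'state_': 1}
print(encoder.parse_encoded_policy([0, 1]))      # {'low': 'climb', 'high': 'sink'}
```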
/reinforce/learn.py:
--------------------------------------------------------------------------------
1 | from .encoding import StateActionEncoder
2 | from .rewards import RewardParser
3 | from .transitions import TransitionParser
4 | from .policy import PolicyParser
5 |
6 | class MarkovAgent:
7 |     def __init__(self, observations):
8 |         # encode observation data as int values
9 |         self.state_action_encoder = StateActionEncoder(observations)
10 |         self.state_action_encoder.observations_to_int()
11 |         dimensions = self.state_action_encoder.parse_dimensions()
12 | 
13 |         # create reward, transition, and policy parsers
14 |         self.reward_parser = RewardParser(observations, dimensions)
15 |         self.transition_parser = TransitionParser(observations, dimensions)
16 |         self.policy_parser = PolicyParser(dimensions)
17 | 
18 |     def learn(self):
19 |         R = self.reward_parser.rewards()
20 |         P = self.transition_parser.transition_probabilities()
21 | 
22 |         # learn int-encoded policy and convert to readable dictionary
23 |         encoded_policy = self.policy_parser.policy(P, R)
24 |         self.policy = self.state_action_encoder.parse_encoded_policy(encoded_policy)
25 |
--------------------------------------------------------------------------------
/reinforce/policy.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | class PolicyParser:
4 |     def __init__(self, dimensions):
5 |         self.state_count = dimensions['state_count']
6 |         self.action_count = dimensions['action_count']
7 | 
8 |     def policy(self, P, rewards):
9 |         print('COMPUTING POLICY')
10 | 
11 |         best_policy = np.zeros(self.state_count)
12 |         state_values = np.zeros(self.state_count)
13 | 
14 |         GAMMA = 0.9
15 |         ITERATIONS = 125
16 |         for i in range(ITERATIONS):
17 |             print("iteration: {0} / {1}".format(i + 1, ITERATIONS))
18 | 
19 |             for state in range(0, self.state_count):
20 |                 state_value = -float('Inf')
21 | 
22 |                 for action in range(0, self.action_count):
23 |                     action_value = 0
24 | 
25 |                     for state_ in range(0, self.state_count):
26 |                         action_value += (P[state][action][state_] * state_values[state_] * GAMMA)
27 | 
28 |                     if (action_value >= state_value):
29 |                         state_value = action_value
30 |                         best_policy[state] = action
31 | 
32 |                 state_values[state] = rewards[state] + state_value
33 | 
34 |         return best_policy
--------------------------------------------------------------------------------
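`PolicyParser` runs the value-iteration loop above (GAMMA = 0.9, 125 sweeps, with ties going to the highest-numbered action because of the `>=` comparison). A minimal sketch on a hypothetical two-state, two-action MDP with made-up `P` and `R`:

```python
import numpy as np
from reinforce.policy import PolicyParser

# Hypothetical transition probabilities P[state][action][next_state] and rewards R[state].
P = np.array([
    [[0.2, 0.8], [1.0, 0.0]],   # state 0: action 0 usually reaches state 1, action 1 stays put
    [[0.0, 1.0], [0.9, 0.1]],   # state 1: action 0 stays, action 1 usually falls back to state 0
])
R = np.array([0.0, 1.0])        # only state 1 is rewarding

parser = PolicyParser({'state_count': 2, 'action_count': 2})
print(parser.policy(P, R))      # [0. 0.] -- both states pick action 0, heading toward state 1
```

The call also prints its iteration progress, as in the method above.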
/reinforce/rewards.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | class RewardParser:
4 |     def __init__(self, observations, dimensions):
5 |         self.observations = observations
6 |         self.state_count = dimensions['state_count']
7 | 
8 |     def rewards(self):
9 |         print('COMPUTING REWARDS')
10 |         total_state_rewards = np.zeros(self.state_count)
11 |         total_state_visits = np.zeros(self.state_count)
12 | 
13 |         for observation in self.observations:
14 |             visits = float(len(observation['state_transitions']))
15 |             reward_per_visit = observation['reward'] / visits
16 | 
17 |             for state_transition in observation['state_transitions']:
18 |                 state = state_transition['state']
19 |                 total_state_rewards[state] += reward_per_visit
20 |                 total_state_visits[state] += 1
21 | 
22 |         average_state_rewards = total_state_rewards / total_state_visits
23 |         average_state_rewards = np.nan_to_num(average_state_rewards)
24 | 
25 |         return average_state_rewards
--------------------------------------------------------------------------------
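`RewardParser` spreads each observation's scalar reward evenly over that observation's transitions and then averages per (already int-encoded) state; states with no visits end up at 0 via `nan_to_num`. A minimal sketch with hypothetical encoded data:

```python
from reinforce.rewards import RewardParser

# Hypothetical observations, already int-encoded as MarkovAgent would supply them.
observations = [
    {'state_transitions': [{'state': 0, 'action': 0, 'state_': 1},
                           {'state': 1, 'action': 0, 'state_': 1}],
     'reward': 2},   # 2 reward / 2 transitions = 1 credited to each visited state
    {'state_transitions': [{'state': 0, 'action': 1, 'state_': 0}],
     'reward': 0},
]

parser = RewardParser(observations, {'state_count': 2, 'action_count': 2})
print(parser.rewards())   # [0.5 1. ] -- state 0 averages (1 + 0) / 2 visits, state 1 averages 1 / 1
```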
/reinforce/transitions.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | class TransitionParser:
4 |     def __init__(self, observations, dimensions):
5 |         self.observations = observations
6 |         self.state_count = dimensions['state_count']
7 |         self.action_count = dimensions['action_count']
8 | 
9 |     def transition_probabilities(self):
10 |         print('COMPUTING TRANSITIONS')
11 |         transition_count = self._count_transitions()
12 |         return self._parse_probabilities(transition_count)
13 | 
14 |     def _count_transitions(self):
15 |         transition_count = np.zeros((self.state_count, self.action_count, self.state_count))
16 | 
17 |         for observation in self.observations:
18 |             for state_transition in observation['state_transitions']:
19 |                 state = state_transition['state']
20 |                 action = state_transition['action']
21 |                 state_ = state_transition['state_']
22 | 
23 |                 transition_count[state][action][state_] += 1
24 | 
25 |         return transition_count
26 | 
27 |     def _parse_probabilities(self, transition_count):
28 |         P = np.zeros((self.state_count, self.action_count, self.state_count))
29 | 
30 |         for state in range(0, self.state_count):
31 |             for action in range(0, self.action_count):
32 | 
33 |                 total_transitions = float(sum(transition_count[state][action]))
34 | 
35 |                 if (total_transitions > 0):
36 |                     P[state][action] = transition_count[state][action] / total_transitions
37 |                 else:
38 |                     P[state][action] = 1.0 / self.state_count
39 | 
40 |         return P
--------------------------------------------------------------------------------
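`TransitionParser` counts observed (state, action, next-state) triples and normalises each (state, action) row into a probability distribution; rows that were never observed fall back to a uniform distribution over all states. A minimal sketch with hypothetical encoded data:

```python
from reinforce.transitions import TransitionParser

# Hypothetical observations, already int-encoded as MarkovAgent would supply them.
observations = [
    {'state_transitions': [{'state': 0, 'action': 0, 'state_': 1},
                           {'state': 0, 'action': 0, 'state_': 1},
                           {'state': 0, 'action': 0, 'state_': 0},
                           {'state': 1, 'action': 1, 'state_': 0}],
     'reward': 0},
]

parser = TransitionParser(observations, {'state_count': 2, 'action_count': 2})
P = parser.transition_probabilities()
print(P[0][0])   # [0.33333333 0.66666667] -- action 0 in state 0: 1 of 3 stayed, 2 of 3 moved to state 1
print(P[1][0])   # [0.5 0.5] -- (state 1, action 0) was never observed, so uniform
```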
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 |
3 | setup(name='reinforce',
4 |       version='0.2.0',
5 |       description='plug and play reinforcement learning',
6 |       url='http://github.com/nathanepstein/reinforce',
7 |       author='Nathan Epstein',
8 |       author_email='ne2210@columbia.edu',
9 |       license='MIT',
10 |       packages=['reinforce'],
11 |       install_requires=['numpy'],
12 | )
--------------------------------------------------------------------------------