├── .gitignore
├── MDP.png
├── README.md
├── reinforce
│   ├── __init__.py
│   ├── encoding.py
│   ├── learn.py
│   ├── policy.py
│   ├── rewards.py
│   └── transitions.py
└── setup.py

/.gitignore:
--------------------------------------------------------------------------------
*.pyc
*.egg-info
dist
.DS_Store
--------------------------------------------------------------------------------
/MDP.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NathanEpstein/reinforce/06a698c91da1b4acd59a4fac29e64c08f27fb12c/MDP.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# reinforce

![MDP](MDP.png)

A 'plug and play' reinforcement learning library in Python.

Infers a Markov Decision Process from data and solves for the optimal policy.

Implementation based on Andrew Ng's notes.

More information related to this project can be found here.

## Example Usage

```python
observations = [
  {
    'state_transitions': [
      { 'state': 'low', 'action': 'climb', 'state_': 'mid' },
      { 'state': 'mid', 'action': 'climb', 'state_': 'high' },
      { 'state': 'high', 'action': 'sink', 'state_': 'mid' },
      { 'state': 'mid', 'action': 'sink', 'state_': 'low' },
      { 'state': 'low', 'action': 'sink', 'state_': 'bottom' }
    ],
    'reward': 0
  },
  {
    'state_transitions': [
      { 'state': 'low', 'action': 'climb', 'state_': 'mid' },
      { 'state': 'mid', 'action': 'climb', 'state_': 'high' },
      { 'state': 'high', 'action': 'climb', 'state_': 'top' }
    ],
    'reward': 0
  }
]

trap_states = [
  {
    'state_transitions': [
      { 'state': 'bottom', 'action': 'sink', 'state_': 'bottom' },
      { 'state': 'bottom', 'action': 'climb', 'state_': 'bottom' }
    ],
    'reward': 0
  },
  {
    'state_transitions': [
      { 'state': 'top', 'action': 'sink', 'state_': 'top' },
      { 'state': 'top', 'action': 'climb', 'state_': 'top' }
    ],
    'reward': 1
  }
]

from reinforce import MarkovAgent

mark = MarkovAgent(observations + trap_states)
mark.learn()

print(mark.policy)
# {'high': 'climb', 'top': 'sink', 'bottom': 'sink', 'low': 'climb', 'mid': 'climb'}
# NOTE: the policy in the 'top' and 'bottom' trap states is arbitrary -- no action changes the state.
```
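
The learned policy is a plain dictionary mapping states to actions, so it can be dropped straight into a control loop. Below is a minimal sketch of that; the `TRANSITIONS` table and `rollout` helper are illustrative only and not part of the library:

```python
# Illustrative only: a hand-written transition table for the toy MDP above.
TRANSITIONS = {
    ('low', 'climb'): 'mid',       ('low', 'sink'): 'bottom',
    ('mid', 'climb'): 'high',      ('mid', 'sink'): 'low',
    ('high', 'climb'): 'top',      ('high', 'sink'): 'mid',
    ('top', 'climb'): 'top',       ('top', 'sink'): 'top',
    ('bottom', 'climb'): 'bottom', ('bottom', 'sink'): 'bottom',
}

def rollout(policy, start, steps=5):
    """Follow the learned policy for a few steps and return the visited states."""
    state, path = start, [start]
    for _ in range(steps):
        state = TRANSITIONS[(state, policy[state])]
        path.append(state)
    return path

print(rollout(mark.policy, 'low'))
# ['low', 'mid', 'high', 'top', 'top', 'top'] -- the agent climbs toward the rewarding 'top' state
```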
--------------------------------------------------------------------------------
/reinforce/__init__.py:
--------------------------------------------------------------------------------
from .learn import *
--------------------------------------------------------------------------------
/reinforce/encoding.py:
--------------------------------------------------------------------------------
class StateActionEncoder:
    def __init__(self, observations):
        self.observations = observations
        self._parse_states_and_actions()

    def parse_dimensions(self):
        return {
            'state_count': len(self.int_to_state),
            'action_count': len(self.int_to_action)
        }

    def observations_to_int(self):
        # replace state and action labels with their integer encodings, in place
        for observation in self.observations:
            for transition in observation['state_transitions']:
                transition['state'] = self.state_to_int[transition['state']]
                transition['state_'] = self.state_to_int[transition['state_']]
                transition['action'] = self.action_to_int[transition['action']]

    def parse_encoded_policy(self, encoded_policy):
        policy = {}
        for index, encoded_action in enumerate(encoded_policy):
            state = self.int_to_state[index]
            action = self.int_to_action[int(encoded_action)]
            policy[state] = action

        return policy

    def _parse_states_and_actions(self):
        state_dict, action_dict = {}, {}
        state_array, action_array = [], []
        state_index, action_index = 0, 0

        for observation in self.observations:
            for transition in observation['state_transitions']:
                state = transition['state']
                action = transition['action']

                if state not in state_dict:
                    state_dict[state] = state_index
                    state_array.append(state)
                    state_index += 1

                if action not in action_dict:
                    action_dict[action] = action_index
                    action_array.append(action)
                    action_index += 1

        self.state_to_int = state_dict
        self.action_to_int = action_dict
        self.int_to_state = state_array
        self.int_to_action = action_array
--------------------------------------------------------------------------------
/reinforce/learn.py:
--------------------------------------------------------------------------------
from .encoding import StateActionEncoder
from .rewards import RewardParser
from .transitions import TransitionParser
from .policy import PolicyParser

class MarkovAgent:
    def __init__(self, observations):
        # encode observation data as int values
        self.state_action_encoder = StateActionEncoder(observations)
        self.state_action_encoder.observations_to_int()
        dimensions = self.state_action_encoder.parse_dimensions()

        # create reward, transition, and policy parsers
        self.reward_parser = RewardParser(observations, dimensions)
        self.transition_parser = TransitionParser(observations, dimensions)
        self.policy_parser = PolicyParser(dimensions)

    def learn(self):
        R = self.reward_parser.rewards()
        P = self.transition_parser.transition_probabilities()

        # learn int-encoded policy and convert to readable dictionary
        encoded_policy = self.policy_parser.policy(P, R)
        self.policy = self.state_action_encoder.parse_encoded_policy(encoded_policy)
--------------------------------------------------------------------------------
/reinforce/policy.py:
--------------------------------------------------------------------------------
import numpy as np

class PolicyParser:
    def __init__(self, dimensions):
        self.state_count = dimensions['state_count']
        self.action_count = dimensions['action_count']

    def policy(self, P, rewards):
        print('COMPUTING POLICY')

        best_policy = np.zeros(self.state_count)
        state_values = np.zeros(self.state_count)

        GAMMA = 0.9
        ITERATIONS = 125

        # value iteration: repeatedly back up the best one-step lookahead value
        for i in range(ITERATIONS):
            print("iteration: {0} / {1}".format(i + 1, ITERATIONS))

            for state in range(self.state_count):
                state_value = -float('inf')

                for action in range(self.action_count):
                    # expected discounted value of taking `action` in `state`
                    action_value = 0
                    for state_ in range(self.state_count):
                        action_value += P[state][action][state_] * state_values[state_] * GAMMA

                    if action_value >= state_value:
                        state_value = action_value
                        best_policy[state] = action

                state_values[state] = rewards[state] + state_value

        return best_policy
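
# For reference -- an illustrative, standalone sketch (hypothetical example, not
# library code): the `policy` method above is standard value iteration. Each
# sweep applies the Bellman backup
#     V(s) <- R(s) + GAMMA * max_a sum_s' P(s, a, s') * V(s')
# The snippet below performs the same backup in vectorized form on a made-up
# two-state chain, purely to make the update explicit; the array shapes mirror
# the P and rewards arrays consumed by PolicyParser.
if __name__ == '__main__':
    import numpy as np

    # P[s, a, s']: from state 0, action 1 moves to state 1; state 1 is absorbing and rewarding.
    P = np.array([[[1.0, 0.0], [0.0, 1.0]],
                  [[0.0, 1.0], [0.0, 1.0]]])
    R = np.array([0.0, 1.0])
    GAMMA = 0.9

    V = np.zeros(2)
    for _ in range(200):
        # expected next-state value for every (state, action) pair, then max over actions
        V = R + GAMMA * np.einsum('sat,t->sa', P, V).max(axis=1)

    print(V)  # approximately [9.0, 10.0]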
--------------------------------------------------------------------------------
/reinforce/rewards.py:
--------------------------------------------------------------------------------
import numpy as np

class RewardParser:
    def __init__(self, observations, dimensions):
        self.observations = observations
        self.state_count = dimensions['state_count']

    def rewards(self):
        print('COMPUTING REWARDS')
        total_state_rewards = np.zeros(self.state_count)
        total_state_visits = np.zeros(self.state_count)

        for observation in self.observations:
            # spread each observation's reward evenly across the states it visited
            visits = float(len(observation['state_transitions']))
            reward_per_visit = observation['reward'] / visits

            for state_transition in observation['state_transitions']:
                state = state_transition['state']
                total_state_rewards[state] += reward_per_visit
                total_state_visits[state] += 1

        # unvisited states produce 0/0 = nan, which nan_to_num maps to a reward of 0
        average_state_rewards = total_state_rewards / total_state_visits
        average_state_rewards = np.nan_to_num(average_state_rewards)

        return average_state_rewards
--------------------------------------------------------------------------------
/reinforce/transitions.py:
--------------------------------------------------------------------------------
import numpy as np

class TransitionParser:
    def __init__(self, observations, dimensions):
        self.observations = observations
        self.state_count = dimensions['state_count']
        self.action_count = dimensions['action_count']

    def transition_probabilities(self):
        print('COMPUTING TRANSITIONS')
        transition_count = self._count_transitions()
        return self._parse_probabilities(transition_count)

    def _count_transitions(self):
        transition_count = np.zeros((self.state_count, self.action_count, self.state_count))

        for observation in self.observations:
            for state_transition in observation['state_transitions']:
                state = state_transition['state']
                action = state_transition['action']
                state_ = state_transition['state_']

                transition_count[state][action][state_] += 1

        return transition_count

    def _parse_probabilities(self, transition_count):
        P = np.zeros((self.state_count, self.action_count, self.state_count))

        for state in range(self.state_count):
            for action in range(self.action_count):
                total_transitions = float(sum(transition_count[state][action]))

                if total_transitions > 0:
                    P[state][action] = transition_count[state][action] / total_transitions
                else:
                    # unseen (state, action) pairs default to a uniform distribution over next states
                    P[state][action] = 1.0 / self.state_count

        return P
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup

setup(
    name='reinforce',
    version='0.2.0',
    description='plug and play reinforcement learning',
    url='http://github.com/nathanepstein/reinforce',
    author='Nathan Epstein',
    author_email='ne2210@columbia.edu',
    license='MIT',
    packages=['reinforce'],
)
--------------------------------------------------------------------------------