├── .gitignore ├── DP ├── Procfile ├── README.md ├── __init__.py ├── application.py ├── bellman_equation.py ├── environment.py ├── environment_demo.py ├── planner.py ├── requirements.txt ├── run_server.py ├── static │ ├── css │ │ └── index.css │ ├── images │ │ └── agent.png │ └── js │ │ └── index.js ├── templates │ ├── base.html │ └── index.html └── tests │ ├── __init__.py │ ├── test_environment.py │ └── test_planner.py ├── EL ├── __init__.py ├── actor_critic.py ├── compare_q_s.py ├── el_agent.py ├── epsilon_greedy.py ├── frozen_lake_util.py ├── monte_carlo.py ├── notebooks │ ├── Actor&Critic.ipynb │ ├── Epsilon&Greedy.ipynb │ ├── Monte Carlo.ipynb │ ├── Q-learning.ipynb │ └── SARSA.ipynb ├── q_learning.py └── sarsa.py ├── EV └── evolution.py ├── FN ├── __init__.py ├── a2c_agent.py ├── dqn_agent.py ├── fn_framework.py ├── nn_tutorial │ ├── explanation_keras.py │ ├── explanation_keras_batch.py │ ├── explanation_keras_boston.py │ ├── explanation_keras_mnist.py │ ├── explanation_tf.py │ ├── explanation_tf_batch.py │ └── gradient.py ├── policy_gradient_agent.py ├── policy_gradient_continuous_agent.py └── value_function_agent.py ├── IM └── dagger.py ├── IRL ├── backups │ ├── environment.py │ ├── irl_from_traj.py │ ├── linear.py │ ├── planner.py │ └── visualizer.py ├── bayesian.py ├── environment.py ├── maxent.py └── planner.py ├── ISSUE_TEMPLATE.md ├── LICENSE ├── MM └── dyna.py ├── README.md ├── doc ├── application.PNG ├── be.PNG ├── colab_a2c.png ├── colab_dqn.png ├── frozen_lake.png ├── irl.png ├── mdp.PNG ├── rl_application.PNG ├── rl_ways.PNG ├── sample_improve.PNG ├── td.PNG ├── tradeoffs.png └── train_architecture.PNG ├── environment.yml ├── requirements-colab.txt ├── requirements.txt └── welcome.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | .vscode 103 | .DS_Store 104 | FN/logs 105 | *.h5 106 | *.pkl 107 | /src 108 | -------------------------------------------------------------------------------- /DP/Procfile: -------------------------------------------------------------------------------- 1 | web: python run_server.py 2 | -------------------------------------------------------------------------------- /DP/README.md: -------------------------------------------------------------------------------- 1 | # Plan before Action: Dynamic Programming 2 | 3 | There are 3 programs are available. 4 | 5 | * To understand MDP: `environment.py` 6 | * `python environment_demo.py` 7 | * To understand Bellman Equation: `bellman_equation.py` 8 | * `python bellman_equation.py` 9 | * To understand Dynamic Programming: `planner.py` 10 | * `python run_server.py` 11 | * You can simulate Dynamic Programming online! 
12 | -------------------------------------------------------------------------------- /DP/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/icoxfog417/baby-steps-of-rl-ja/1dadc208d6e9f50e010e6b5d2dcc5d9fc11eb51d/DP/__init__.py -------------------------------------------------------------------------------- /DP/application.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tornado.web 3 | import tornado.escape 4 | from environment import Environment 5 | from planner import ValueIterationPlanner, PolicyIterationPlanner 6 | 7 | 8 | class IndexHandler(tornado.web.RequestHandler): 9 | 10 | def get(self): 11 | self.render("index.html") 12 | 13 | 14 | class PlanningHandler(tornado.web.RequestHandler): 15 | 16 | def post(self): 17 | data = tornado.escape.json_decode(self.request.body) 18 | grid = data["grid"] 19 | plan_type = data["plan"] 20 | move_prob = 0.8 # default value 21 | 22 | try: 23 | move_prob = float(data["prob"]) 24 | except ValueError: 25 | pass 26 | 27 | env = Environment(grid, move_prob=move_prob) 28 | if plan_type == "value": 29 | planner = ValueIterationPlanner(env) 30 | elif plan_type == "policy": 31 | planner = PolicyIterationPlanner(env) 32 | 33 | result = planner.plan() 34 | planner.log.append(result) 35 | self.write({"log": planner.log}) 36 | 37 | 38 | class Application(tornado.web.Application): 39 | 40 | def __init__(self): 41 | handlers = [ 42 | (r"/", IndexHandler), 43 | (r"/plan", PlanningHandler), 44 | ] 45 | 46 | settings = dict( 47 | template_path=os.path.join(os.path.dirname(__file__), "templates"), 48 | static_path=os.path.join(os.path.dirname(__file__), "static"), 49 | cookie_secret=os.environ.get("SECRET_TOKEN", "__TODO:_GENERATE_YOUR_OWN_RANDOM_VALUE_HERE__"), 50 | debug=True, 51 | ) 52 | 53 | super(Application, self).__init__(handlers, **settings) 54 | -------------------------------------------------------------------------------- /DP/bellman_equation.py: -------------------------------------------------------------------------------- 1 | def V(s, gamma=0.99): 2 | V = R(s) + gamma * max_V_on_next_state(s) 3 | return V 4 | 5 | 6 | def R(s): 7 | if s == "happy_end": 8 | return 1 9 | elif s == "bad_end": 10 | return -1 11 | else: 12 | return 0 13 | 14 | 15 | def max_V_on_next_state(s): 16 | # If game end, expected value is 0. 17 | if s in ["happy_end", "bad_end"]: 18 | return 0 19 | 20 | actions = ["up", "down"] 21 | values = [] 22 | for a in actions: 23 | transition_probs = transit_func(s, a) 24 | v = 0 25 | for next_state in transition_probs: 26 | prob = transition_probs[next_state] 27 | v += prob * V(next_state) 28 | values.append(v) 29 | return max(values) 30 | 31 | 32 | def transit_func(s, a): 33 | """ 34 | Make next state by adding action str to state. 
35 | ex: (s = 'state', a = 'up') => 'state_up' 36 | (s = 'state_up', a = 'down') => 'state_up_down' 37 | """ 38 | 39 | actions = s.split("_")[1:] 40 | LIMIT_GAME_COUNT = 5 41 | HAPPY_END_BORDER = 4 42 | MOVE_PROB = 0.9 43 | 44 | def next_state(state, action): 45 | return "_".join([state, action]) 46 | 47 | if len(actions) == LIMIT_GAME_COUNT: 48 | up_count = sum([1 if a == "up" else 0 for a in actions]) 49 | state = "happy_end" if up_count >= HAPPY_END_BORDER else "bad_end" 50 | prob = 1.0 51 | return {state: prob} 52 | else: 53 | opposite = "up" if a == "down" else "down" 54 | return { 55 | next_state(s, a): MOVE_PROB, 56 | next_state(s, opposite): 1 - MOVE_PROB 57 | } 58 | 59 | 60 | if __name__ == "__main__": 61 | print(V("state")) 62 | print(V("state_up_up")) 63 | print(V("state_down_down")) 64 | -------------------------------------------------------------------------------- /DP/environment.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | import numpy as np 3 | 4 | 5 | class State(): 6 | 7 | def __init__(self, row=-1, column=-1): 8 | self.row = row 9 | self.column = column 10 | 11 | def __repr__(self): 12 | return "".format(self.row, self.column) 13 | 14 | def clone(self): 15 | return State(self.row, self.column) 16 | 17 | def __hash__(self): 18 | return hash((self.row, self.column)) 19 | 20 | def __eq__(self, other): 21 | return self.row == other.row and self.column == other.column 22 | 23 | 24 | class Action(Enum): 25 | UP = 1 26 | DOWN = -1 27 | LEFT = 2 28 | RIGHT = -2 29 | 30 | 31 | class Environment(): 32 | 33 | def __init__(self, grid, move_prob=0.8): 34 | # grid is 2d-array. Its values are treated as an attribute. 35 | # Kinds of attribute is following. 36 | # 0: ordinary cell 37 | # -1: damage cell (game end) 38 | # 1: reward cell (game end) 39 | # 9: block cell (can't locate agent) 40 | self.grid = grid 41 | self.agent_state = State() 42 | 43 | # Default reward is minus. Just like a poison swamp. 44 | # It means the agent has to reach the goal fast! 45 | self.default_reward = -0.04 46 | 47 | # Agent can move to a selected direction in move_prob. 48 | # It means the agent will move different direction 49 | # in (1 - move_prob). 50 | self.move_prob = move_prob 51 | self.reset() 52 | 53 | @property 54 | def row_length(self): 55 | return len(self.grid) 56 | 57 | @property 58 | def column_length(self): 59 | return len(self.grid[0]) 60 | 61 | @property 62 | def actions(self): 63 | return [Action.UP, Action.DOWN, 64 | Action.LEFT, Action.RIGHT] 65 | 66 | @property 67 | def states(self): 68 | states = [] 69 | for row in range(self.row_length): 70 | for column in range(self.column_length): 71 | # Block cells are not included to the state. 72 | if self.grid[row][column] != 9: 73 | states.append(State(row, column)) 74 | return states 75 | 76 | def transit_func(self, state, action): 77 | transition_probs = {} 78 | if not self.can_action_at(state): 79 | # Already on the terminal cell. 
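            # An empty transition dict is the terminal-state signal: transit() below
            # returns (None, None, True) for it, so step() reports done without
            # moving the agent any further.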
80 | return transition_probs 81 | 82 | opposite_direction = Action(action.value * -1) 83 | 84 | for a in self.actions: 85 | prob = 0 86 | if a == action: 87 | prob = self.move_prob 88 | elif a != opposite_direction: 89 | prob = (1 - self.move_prob) / 2 90 | 91 | next_state = self._move(state, a) 92 | if next_state not in transition_probs: 93 | transition_probs[next_state] = prob 94 | else: 95 | transition_probs[next_state] += prob 96 | 97 | return transition_probs 98 | 99 | def can_action_at(self, state): 100 | if self.grid[state.row][state.column] == 0: 101 | return True 102 | else: 103 | return False 104 | 105 | def _move(self, state, action): 106 | if not self.can_action_at(state): 107 | raise Exception("Can't move from here!") 108 | 109 | next_state = state.clone() 110 | 111 | # Execute an action (move). 112 | if action == Action.UP: 113 | next_state.row -= 1 114 | elif action == Action.DOWN: 115 | next_state.row += 1 116 | elif action == Action.LEFT: 117 | next_state.column -= 1 118 | elif action == Action.RIGHT: 119 | next_state.column += 1 120 | 121 | # Check whether a state is out of the grid. 122 | if not (0 <= next_state.row < self.row_length): 123 | next_state = state 124 | if not (0 <= next_state.column < self.column_length): 125 | next_state = state 126 | 127 | # Check whether the agent bumped a block cell. 128 | if self.grid[next_state.row][next_state.column] == 9: 129 | next_state = state 130 | 131 | return next_state 132 | 133 | def reward_func(self, state): 134 | reward = self.default_reward 135 | done = False 136 | 137 | # Check an attribute of next state. 138 | attribute = self.grid[state.row][state.column] 139 | if attribute == 1: 140 | # Get reward! and the game ends. 141 | reward = 1 142 | done = True 143 | elif attribute == -1: 144 | # Get damage! and the game ends. 145 | reward = -1 146 | done = True 147 | 148 | return reward, done 149 | 150 | def reset(self): 151 | # Locate the agent at lower left corner. 152 | self.agent_state = State(self.row_length - 1, 0) 153 | return self.agent_state 154 | 155 | def step(self, action): 156 | next_state, reward, done = self.transit(self.agent_state, action) 157 | if next_state is not None: 158 | self.agent_state = next_state 159 | 160 | return next_state, reward, done 161 | 162 | def transit(self, state, action): 163 | transition_probs = self.transit_func(state, action) 164 | if len(transition_probs) == 0: 165 | return None, None, True 166 | 167 | next_states = [] 168 | probs = [] 169 | for s in transition_probs: 170 | next_states.append(s) 171 | probs.append(transition_probs[s]) 172 | 173 | next_state = np.random.choice(next_states, p=probs) 174 | reward, done = self.reward_func(next_state) 175 | return next_state, reward, done 176 | -------------------------------------------------------------------------------- /DP/environment_demo.py: -------------------------------------------------------------------------------- 1 | import random 2 | from environment import Environment 3 | 4 | 5 | class Agent(): 6 | 7 | def __init__(self, env): 8 | self.actions = env.actions 9 | 10 | def policy(self, state): 11 | return random.choice(self.actions) 12 | 13 | 14 | def main(): 15 | # Make grid environment. 16 | grid = [ 17 | [0, 0, 0, 1], 18 | [0, 9, 0, -1], 19 | [0, 0, 0, 0] 20 | ] 21 | env = Environment(grid) 22 | agent = Agent(env) 23 | 24 | # Try 10 game. 25 | for i in range(10): 26 | # Initialize position of agent. 
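        # Every step also adds Environment.default_reward (-0.04), so a long random
        # episode can finish with a negative total even if the goal cell is reached.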
27 | state = env.reset() 28 | total_reward = 0 29 | done = False 30 | 31 | while not done: 32 | action = agent.policy(state) 33 | next_state, reward, done = env.step(action) 34 | total_reward += reward 35 | state = next_state 36 | 37 | print("Episode {}: Agent gets {} reward.".format(i, total_reward)) 38 | 39 | 40 | if __name__ == "__main__": 41 | main() 42 | -------------------------------------------------------------------------------- /DP/planner.py: -------------------------------------------------------------------------------- 1 | class Planner(): 2 | 3 | def __init__(self, env): 4 | self.env = env 5 | self.log = [] 6 | 7 | def initialize(self): 8 | self.env.reset() 9 | self.log = [] 10 | 11 | def plan(self, gamma=0.9, threshold=0.0001): 12 | raise Exception("Planner have to implements plan method.") 13 | 14 | def transitions_at(self, state, action): 15 | transition_probs = self.env.transit_func(state, action) 16 | for next_state in transition_probs: 17 | prob = transition_probs[next_state] 18 | reward, _ = self.env.reward_func(next_state) 19 | yield prob, next_state, reward 20 | 21 | def dict_to_grid(self, state_reward_dict): 22 | grid = [] 23 | for i in range(self.env.row_length): 24 | row = [0] * self.env.column_length 25 | grid.append(row) 26 | for s in state_reward_dict: 27 | grid[s.row][s.column] = state_reward_dict[s] 28 | 29 | return grid 30 | 31 | 32 | class ValueIterationPlanner(Planner): 33 | 34 | def __init__(self, env): 35 | super().__init__(env) 36 | 37 | def plan(self, gamma=0.9, threshold=0.0001): 38 | self.initialize() 39 | actions = self.env.actions 40 | V = {} 41 | for s in self.env.states: 42 | # Initialize each state's expected reward. 43 | V[s] = 0 44 | 45 | while True: 46 | delta = 0 47 | self.log.append(self.dict_to_grid(V)) 48 | for s in V: 49 | if not self.env.can_action_at(s): 50 | continue 51 | expected_rewards = [] 52 | for a in actions: 53 | r = 0 54 | for prob, next_state, reward in self.transitions_at(s, a): 55 | r += prob * (reward + gamma * V[next_state]) 56 | expected_rewards.append(r) 57 | max_reward = max(expected_rewards) 58 | delta = max(delta, abs(max_reward - V[s])) 59 | V[s] = max_reward 60 | 61 | if delta < threshold: 62 | break 63 | 64 | V_grid = self.dict_to_grid(V) 65 | return V_grid 66 | 67 | 68 | class PolicyIterationPlanner(Planner): 69 | 70 | def __init__(self, env): 71 | super().__init__(env) 72 | self.policy = {} 73 | 74 | def initialize(self): 75 | super().initialize() 76 | self.policy = {} 77 | actions = self.env.actions 78 | states = self.env.states 79 | for s in states: 80 | self.policy[s] = {} 81 | for a in actions: 82 | # Initialize policy. 83 | # At first, each action is taken uniformly. 84 | self.policy[s][a] = 1 / len(actions) 85 | 86 | def estimate_by_policy(self, gamma, threshold): 87 | V = {} 88 | for s in self.env.states: 89 | # Initialize each state's expected reward. 
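            # The while loop below performs iterative policy evaluation:
            #   V(s) <- sum_a pi(a|s) * sum_s' T(s'|s,a) * (R(s') + gamma * V(s'))
            # sweeping all states until the largest change (delta) drops below threshold.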
90 | V[s] = 0 91 | 92 | while True: 93 | delta = 0 94 | for s in V: 95 | expected_rewards = [] 96 | for a in self.policy[s]: 97 | action_prob = self.policy[s][a] 98 | r = 0 99 | for prob, next_state, reward in self.transitions_at(s, a): 100 | r += action_prob * prob * \ 101 | (reward + gamma * V[next_state]) 102 | expected_rewards.append(r) 103 | value = sum(expected_rewards) 104 | delta = max(delta, abs(value - V[s])) 105 | V[s] = value 106 | if delta < threshold: 107 | break 108 | 109 | return V 110 | 111 | def plan(self, gamma=0.9, threshold=0.0001): 112 | self.initialize() 113 | states = self.env.states 114 | actions = self.env.actions 115 | 116 | def take_max_action(action_value_dict): 117 | return max(action_value_dict, key=action_value_dict.get) 118 | 119 | while True: 120 | update_stable = True 121 | # Estimate expected rewards under current policy. 122 | V = self.estimate_by_policy(gamma, threshold) 123 | self.log.append(self.dict_to_grid(V)) 124 | 125 | for s in states: 126 | # Get an action following to the current policy. 127 | policy_action = take_max_action(self.policy[s]) 128 | 129 | # Compare with other actions. 130 | action_rewards = {} 131 | for a in actions: 132 | r = 0 133 | for prob, next_state, reward in self.transitions_at(s, a): 134 | r += prob * (reward + gamma * V[next_state]) 135 | action_rewards[a] = r 136 | best_action = take_max_action(action_rewards) 137 | if policy_action != best_action: 138 | update_stable = False 139 | 140 | # Update policy (set best_action prob=1, otherwise=0 (greedy)) 141 | for a in self.policy[s]: 142 | prob = 1 if a == best_action else 0 143 | self.policy[s][a] = prob 144 | 145 | if update_stable: 146 | # If policy isn't updated, stop iteration 147 | break 148 | 149 | # Turn dictionary to grid 150 | V_grid = self.dict_to_grid(V) 151 | return V_grid 152 | -------------------------------------------------------------------------------- /DP/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/icoxfog417/baby-steps-of-rl-ja/1dadc208d6e9f50e010e6b5d2dcc5d9fc11eb51d/DP/requirements.txt -------------------------------------------------------------------------------- /DP/run_server.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tornado.ioloop 3 | from tornado.options import define, options, parse_command_line 4 | from application import Application 5 | 6 | 7 | define("port", default=8888, help="run on the given port", type=int) 8 | 9 | 10 | def main(): 11 | parse_command_line() 12 | app = Application() 13 | port = int(os.environ.get("PORT", 8888)) 14 | app.listen(port) 15 | print("Run server on port: {}".format(port)) 16 | tornado.ioloop.IOLoop.current().start() 17 | 18 | 19 | if __name__ == "__main__": 20 | main() 21 | -------------------------------------------------------------------------------- /DP/static/css/index.css: -------------------------------------------------------------------------------- 1 | body { 2 | font-family:-apple-system, BlinkMacSystemFont, "Helvetica Neue", "Segoe UI","Noto Sans Japanese","ヒラギノ角ゴ ProN W3", Meiryo, sans-serif; 3 | } 4 | .title-top{ 5 | margin-top: 20px; 6 | } 7 | .cell{ 8 | width: 80px; 9 | height: 80px; 10 | border: 1px solid silver; 11 | float: left; 12 | } 13 | .cell-content{ 14 | width: 100%; 15 | height: 100%; 16 | text-align: center; 17 | line-height: 80px; 18 | } 19 | .cell-content.active{ 20 | background-color: whitesmoke; 21 | } 22 | 
.cell-content.treasure{ 23 | background-color: #00d1b2; 24 | } 25 | .cell-content.danger{ 26 | background-color: #ff3860; 27 | } 28 | .cell-content.block{ 29 | background-color: #363636; 30 | } 31 | .cell-content.agent{ 32 | width: 60px; 33 | height: 60px; 34 | margin: auto; 35 | margin-top: 10px; 36 | background: url(/static/images/agent.png); 37 | background-size: 60px; 38 | } 39 | .cell-content.v5{ 40 | background-color: rgba(0, 209, 178, 0.8); 41 | } 42 | .cell-content.v4{ 43 | background-color: rgba(0, 209, 178, 0.6); 44 | } 45 | .cell-content.v3{ 46 | background-color: rgba(0, 209, 178, 0.3); 47 | } 48 | .cell-content.v2{ 49 | background-color: rgba(0, 209, 178, 0.1); 50 | } 51 | .cell-content.v1{ 52 | background-color: rgba(0, 209, 178, 0); 53 | } 54 | -------------------------------------------------------------------------------- /DP/static/images/agent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/icoxfog417/baby-steps-of-rl-ja/1dadc208d6e9f50e010e6b5d2dcc5d9fc11eb51d/DP/static/images/agent.png -------------------------------------------------------------------------------- /DP/static/js/index.js: -------------------------------------------------------------------------------- 1 | Vue.config.debug = true; 2 | var app = new Vue({ 3 | el: "#app", 4 | delimiters: ["[[", "]]"], 5 | data: { 6 | row: 3, 7 | column: 4, 8 | moveProb: 0.8, 9 | grid: [], 10 | selectedIndex: null, 11 | simulation: false, 12 | log: [], 13 | logIndex: 0 14 | }, 15 | created: function(){ 16 | this.draw(); 17 | this.selectedIndex = [0, 3]; 18 | this.setTreasure(); 19 | this.selectedIndex = [1, 3]; 20 | this.setDanger(); 21 | this.selectedIndex = [1, 1]; 22 | this.setBlock(); 23 | }, 24 | computed: { 25 | targetGrid: function () { 26 | if(!this.simulation){ 27 | return this.grid; 28 | }else{ 29 | return this.log[this.logIndex]; 30 | } 31 | }, 32 | hasLog: function(){ 33 | if(this.log.length > 0){ 34 | return true; 35 | }else{ 36 | return false; 37 | } 38 | } 39 | }, 40 | methods: { 41 | init: function(){ 42 | this.selectedIndex = null; 43 | this.simulation = false; 44 | this.logIndex = 0; 45 | this.log = []; 46 | }, 47 | draw: function(){ 48 | this.init(); 49 | this.makeGrid(); 50 | }, 51 | makeGrid: function(){ 52 | this.grid = []; 53 | var size = this.row * this.column; 54 | for(var i = 0; i < size; i++){ 55 | var rowIndex = Math.floor(i / this.column); 56 | var columnIndex = i % this.column; 57 | if(columnIndex == 0){ 58 | this.grid.push([]); 59 | } 60 | var cellAttribute = 0; 61 | this.grid[rowIndex].push(cellAttribute); 62 | } 63 | }, 64 | getCellAttribute: function(row, column){ 65 | var attribute = this.grid[row][column]; 66 | switch(attribute){ 67 | case 1: 68 | return "treasure" 69 | case -1: 70 | return "danger" 71 | case 9: 72 | return "block" 73 | } 74 | if(this.selectedIndex != null && (this.selectedIndex[0] == row && this.selectedIndex[1] == column)){ 75 | return "active" 76 | } 77 | if(row == (this.grid.length - 1) && column == 0){ 78 | return "agent" 79 | } 80 | if(this.simulation){ 81 | var value = this.log[this.logIndex][row][column]; 82 | if(value >= 0.8){ 83 | return "v5" 84 | }else if(value >= 0.6){ 85 | return "v4" 86 | }else if(value >= 0.3){ 87 | return "v3" 88 | }else if(value >= 0.1){ 89 | return "v2" 90 | }else{ 91 | return "v1" 92 | } 93 | } 94 | }, 95 | plan: function(planType){ 96 | var data = { 97 | "plan": planType, 98 | "prob": this.moveProb, 99 | "grid": this.grid 100 | } 101 | var self = this; 102 | 
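            // POST the grid, plan type and move probability to /plan; the Tornado
            // handler replies with {"log": [...]}, one value grid per iteration,
            // which play() below animates one frame per second.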
fetch("/plan", { 103 | method: "POST", 104 | credentials: "same-origin", 105 | headers: { 106 | "Content-Type": "application/json" 107 | }, 108 | body: JSON.stringify(data) 109 | }).then(function(resp){ 110 | return resp.json() 111 | }).then(function(resp){ 112 | self.log = resp["log"]; 113 | self.play(); 114 | }) 115 | }, 116 | play: function(){ 117 | this.logIndex = 0; 118 | this.simulation = true; 119 | var self = this; 120 | var timer = setInterval(function(){ 121 | if(self.logIndex < self.log.length - 1){ 122 | self.logIndex += 1; 123 | }else{ 124 | clearInterval(timer); 125 | } 126 | }, 1000); 127 | }, 128 | stop: function(){ 129 | this.init(); 130 | }, 131 | value: function(row, column){ 132 | var attribute = this.grid[row][column]; 133 | if(attribute != 0 || (row == (this.grid.length -1) && column == 0)){ 134 | return ""; 135 | } 136 | var value = this.log[this.logIndex][row][column]; 137 | var value = Math.floor(value * 1000) / 1000; 138 | return value; 139 | }, 140 | selectCell: function(row, column){ 141 | // [row, 0] is Agent point 142 | if(!(row == (this.grid.length - 1) && column == 0)){ 143 | this.selectedIndex = [row, column]; 144 | } 145 | }, 146 | setTreasure: function(){ 147 | this.setAttribute(1); 148 | }, 149 | setDanger: function(){ 150 | this.setAttribute(-1); 151 | }, 152 | setBlock: function(){ 153 | this.setAttribute(9); 154 | }, 155 | clearAttribute: function(row, column){ 156 | if(this.simulation){ 157 | this.init(); 158 | } 159 | this.selectedIndex = [row, column]; 160 | this.setAttribute(0); 161 | }, 162 | setAttribute: function(attribute){ 163 | var index = this.selectedIndex; 164 | if(this.selectedIndex != null){ 165 | this.grid[index[0]][index[1]] = attribute; 166 | this.selectedIndex = null; 167 | } 168 | } 169 | } 170 | }) 171 | -------------------------------------------------------------------------------- /DP/templates/base.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | MDP by Dynamic Programming Demo 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | {% block head %}{% end %} 15 | 16 | 17 | {% block body %}{% end %} 18 | {% block bottom %}{% end %} 19 | 20 | 21 | -------------------------------------------------------------------------------- /DP/templates/index.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block head %} 4 | 5 | 6 | {% end %} 7 | 8 | {% block bottom %} 9 | 10 | {% end %} 11 | 12 | {% block body %} 13 |
14 | ... 102 | [body markup of index.html was stripped during extraction; only the text labels survive: the "Dynamic Programming Simulator" heading, grid cells bound to [[value(rowIndex, columnIndex)]], controls labelled "Area (Row x Column)", "Cell Setting", "Move Prob" and "Simulation", and a "Result" panel reporting "[[log.length]] iterations have done to converge."]
103 | {% end %} 104 | -------------------------------------------------------------------------------- /DP/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/icoxfog417/baby-steps-of-rl-ja/1dadc208d6e9f50e010e6b5d2dcc5d9fc11eb51d/DP/tests/__init__.py -------------------------------------------------------------------------------- /DP/tests/test_environment.py: -------------------------------------------------------------------------------- 1 | import random 2 | import unittest 3 | from DP.environment import Environment 4 | 5 | 6 | class TestEnvironment(unittest.TestCase): 7 | 8 | def test_run_environment(self): 9 | grid = self.get_sample_grid() 10 | env = Environment(grid) 11 | for i in range(100): 12 | state = env.reset() # initialize agent position 13 | self.assertEqual(state.row, len(env.grid) - 1) 14 | self.assertEqual(state.column, 0) 15 | goal = False 16 | for t in range(10): 17 | action = random.choice(env.actions) 18 | state, reward, done = env.step(action) 19 | self.assertTrue(0 <= state.row < len(env.grid)) 20 | self.assertTrue(0 <= state.column < len(env.grid[0])) 21 | if done: 22 | print("Episode {}: get reward {}, {} timesteps".format( 23 | i, reward, t + 1)) 24 | goal = True 25 | break 26 | if not goal: 27 | print("Episode {}: no reward".format(i)) 28 | 29 | def get_sample_grid(self): 30 | # 3 x 4 grid 31 | grid = [ 32 | [0, 0, 0, 1], 33 | [0, 9, 0, -1], 34 | [0, 0, 0, 0] 35 | ] 36 | return grid 37 | -------------------------------------------------------------------------------- /DP/tests/test_planner.py: -------------------------------------------------------------------------------- 1 | import random 2 | import unittest 3 | from DP.environment import Environment 4 | from DP.planner import ValueIterationPlanner, PolicyIterationPlanner 5 | 6 | 7 | class TestPlanner(unittest.TestCase): 8 | 9 | def test_value_iteration(self): 10 | grid = self.get_sample_grid() 11 | env = Environment(grid) 12 | planner = ValueIterationPlanner(env) 13 | result = planner.plan() 14 | print("Value Iteration") 15 | for r in result: 16 | print(r) 17 | 18 | def test_policy_iteration(self): 19 | grid = self.get_sample_grid() 20 | env = Environment(grid) 21 | planner = PolicyIterationPlanner(env) 22 | result = planner.plan() 23 | print("Policy Iteration") 24 | for r in result: 25 | print(r) 26 | 27 | def get_sample_grid(self): 28 | # 3 x 4 grid 29 | grid = [ 30 | [0, 0, 0, 1], 31 | [0, 9, 0, -1], 32 | [0, 0, 0, 0] 33 | ] 34 | return grid 35 | -------------------------------------------------------------------------------- /EL/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/icoxfog417/baby-steps-of-rl-ja/1dadc208d6e9f50e010e6b5d2dcc5d9fc11eb51d/EL/__init__.py -------------------------------------------------------------------------------- /EL/actor_critic.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from el_agent import ELAgent 4 | from frozen_lake_util import show_q_value 5 | 6 | 7 | class Actor(ELAgent): 8 | 9 | def __init__(self, env): 10 | super().__init__(epsilon=-1) 11 | nrow = env.observation_space.n 12 | ncol = env.action_space.n 13 | self.actions = list(range(env.action_space.n)) 14 | self.Q = np.random.uniform(0, 1, nrow * ncol).reshape((nrow, ncol)) 15 | 16 | def softmax(self, x): 17 | return np.exp(x) / np.sum(np.exp(x), axis=0) 18 | 19 | def 
policy(self, s): 20 | a = np.random.choice(self.actions, 1, 21 | p=self.softmax(self.Q[s])) 22 | return a[0] 23 | 24 | 25 | class Critic(): 26 | 27 | def __init__(self, env): 28 | states = env.observation_space.n 29 | self.V = np.zeros(states) 30 | 31 | 32 | class ActorCritic(): 33 | 34 | def __init__(self, actor_class, critic_class): 35 | self.actor_class = actor_class 36 | self.critic_class = critic_class 37 | 38 | def train(self, env, episode_count=1000, gamma=0.9, 39 | learning_rate=0.1, render=False, report_interval=50): 40 | actor = self.actor_class(env) 41 | critic = self.critic_class(env) 42 | 43 | actor.init_log() 44 | for e in range(episode_count): 45 | s = env.reset() 46 | done = False 47 | while not done: 48 | if render: 49 | env.render() 50 | a = actor.policy(s) 51 | n_state, reward, done, info = env.step(a) 52 | 53 | gain = reward + gamma * critic.V[n_state] 54 | estimated = critic.V[s] 55 | td = gain - estimated 56 | actor.Q[s][a] += learning_rate * td 57 | critic.V[s] += learning_rate * td 58 | s = n_state 59 | 60 | else: 61 | actor.log(reward) 62 | 63 | if e != 0 and e % report_interval == 0: 64 | actor.show_reward_log(episode=e) 65 | 66 | return actor, critic 67 | 68 | 69 | def train(): 70 | trainer = ActorCritic(Actor, Critic) 71 | env = gym.make("FrozenLakeEasy-v0") 72 | actor, critic = trainer.train(env, episode_count=3000) 73 | show_q_value(actor.Q) 74 | actor.show_reward_log() 75 | 76 | 77 | if __name__ == "__main__": 78 | train() 79 | -------------------------------------------------------------------------------- /EL/compare_q_s.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Pool 2 | from collections import defaultdict 3 | import gym 4 | from el_agent import ELAgent 5 | from frozen_lake_util import show_q_value 6 | 7 | 8 | class CompareAgent(ELAgent): 9 | 10 | def __init__(self, q_learning=True, epsilon=0.33): 11 | self.q_learning = q_learning 12 | super().__init__(epsilon) 13 | 14 | def learn(self, env, episode_count=1000, gamma=0.9, 15 | learning_rate=0.1, render=False, report_interval=50): 16 | self.init_log() 17 | self.Q = defaultdict(lambda: [0] * len(actions)) 18 | actions = list(range(env.action_space.n)) 19 | for e in range(episode_count): 20 | s = env.reset() 21 | done = False 22 | a = self.policy(s, actions) 23 | while not done: 24 | if render: 25 | env.render() 26 | 27 | n_state, reward, done, info = env.step(a) 28 | 29 | if done and reward == 0: 30 | reward = -0.5 # Reward as penalty 31 | 32 | n_action = self.policy(n_state, actions) 33 | 34 | if self.q_learning: 35 | gain = reward + gamma * max(self.Q[n_state]) 36 | else: 37 | gain = reward + gamma * self.Q[n_state][n_action] 38 | 39 | estimated = self.Q[s][a] 40 | self.Q[s][a] += learning_rate * (gain - estimated) 41 | s = n_state 42 | 43 | if self.q_learning: 44 | a = self.policy(s, actions) 45 | else: 46 | a = n_action 47 | else: 48 | self.log(reward) 49 | 50 | if e != 0 and e % report_interval == 0: 51 | self.show_reward_log(episode=e) 52 | 53 | 54 | def train(q_learning): 55 | env = gym.make("FrozenLakeEasy-v0") 56 | agent = CompareAgent(q_learning=q_learning) 57 | agent.learn(env, episode_count=3000) 58 | return dict(agent.Q) 59 | 60 | 61 | if __name__ == "__main__": 62 | with Pool() as pool: 63 | results = pool.map(train, ([True, False])) 64 | for r in results: 65 | show_q_value(r) 66 | -------------------------------------------------------------------------------- /EL/el_agent.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | 5 | class ELAgent(): 6 | 7 | def __init__(self, epsilon): 8 | self.Q = {} 9 | self.epsilon = epsilon 10 | self.reward_log = [] 11 | 12 | def policy(self, s, actions): 13 | if np.random.random() < self.epsilon: 14 | return np.random.randint(len(actions)) 15 | else: 16 | if s in self.Q and sum(self.Q[s]) != 0: 17 | return np.argmax(self.Q[s]) 18 | else: 19 | return np.random.randint(len(actions)) 20 | 21 | def init_log(self): 22 | self.reward_log = [] 23 | 24 | def log(self, reward): 25 | self.reward_log.append(reward) 26 | 27 | def show_reward_log(self, interval=50, episode=-1): 28 | if episode > 0: 29 | rewards = self.reward_log[-interval:] 30 | mean = np.round(np.mean(rewards), 3) 31 | std = np.round(np.std(rewards), 3) 32 | print("At Episode {} average reward is {} (+/-{}).".format( 33 | episode, mean, std)) 34 | else: 35 | indices = list(range(0, len(self.reward_log), interval)) 36 | means = [] 37 | stds = [] 38 | for i in indices: 39 | rewards = self.reward_log[i:(i + interval)] 40 | means.append(np.mean(rewards)) 41 | stds.append(np.std(rewards)) 42 | means = np.array(means) 43 | stds = np.array(stds) 44 | plt.figure() 45 | plt.title("Reward History") 46 | plt.grid() 47 | plt.fill_between(indices, means - stds, means + stds, 48 | alpha=0.1, color="g") 49 | plt.plot(indices, means, "o-", color="g", 50 | label="Rewards for each {} episode".format(interval)) 51 | plt.legend(loc="best") 52 | plt.show() 53 | -------------------------------------------------------------------------------- /EL/epsilon_greedy.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | 4 | 5 | class CoinToss(): 6 | 7 | def __init__(self, head_probs, max_episode_steps=30): 8 | self.head_probs = head_probs 9 | self.max_episode_steps = max_episode_steps 10 | self.toss_count = 0 11 | 12 | def __len__(self): 13 | return len(self.head_probs) 14 | 15 | def reset(self): 16 | self.toss_count = 0 17 | 18 | def step(self, action): 19 | final = self.max_episode_steps - 1 20 | if self.toss_count > final: 21 | raise Exception("The step count exceeded maximum. \ 22 | Please reset env.") 23 | else: 24 | done = True if self.toss_count == final else False 25 | 26 | if action >= len(self.head_probs): 27 | raise Exception("The No.{} coin doesn't exist.".format(action)) 28 | else: 29 | head_prob = self.head_probs[action] 30 | if random.random() < head_prob: 31 | reward = 1.0 32 | else: 33 | reward = 0.0 34 | self.toss_count += 1 35 | return reward, done 36 | 37 | 38 | class EpsilonGreedyAgent(): 39 | 40 | def __init__(self, epsilon): 41 | self.epsilon = epsilon 42 | self.V = [] 43 | 44 | def policy(self): 45 | coins = range(len(self.V)) 46 | if random.random() < self.epsilon: 47 | return random.choice(coins) 48 | else: 49 | return np.argmax(self.V) 50 | 51 | def play(self, env): 52 | # Initialize estimation. 
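        # N[i] counts how often coin i has been tossed and V[i] is its running
        # average reward. The update below is the incremental mean
        #   new_average = (old_average * n + reward) / (n + 1)
        # which is equivalent to V <- V + (reward - V) / (n + 1).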
53 | N = [0] * len(env) 54 | self.V = [0] * len(env) 55 | 56 | env.reset() 57 | done = False 58 | rewards = [] 59 | while not done: 60 | selected_coin = self.policy() 61 | reward, done = env.step(selected_coin) 62 | rewards.append(reward) 63 | 64 | n = N[selected_coin] 65 | coin_average = self.V[selected_coin] 66 | new_average = (coin_average * n + reward) / (n + 1) 67 | N[selected_coin] += 1 68 | self.V[selected_coin] = new_average 69 | 70 | return rewards 71 | 72 | 73 | if __name__ == "__main__": 74 | import pandas as pd 75 | import matplotlib.pyplot as plt 76 | 77 | def main(): 78 | env = CoinToss([0.1, 0.5, 0.1, 0.9, 0.1]) 79 | epsilons = [0.0, 0.1, 0.2, 0.5, 0.8] 80 | game_steps = list(range(10, 310, 10)) 81 | result = {} 82 | for e in epsilons: 83 | agent = EpsilonGreedyAgent(epsilon=e) 84 | means = [] 85 | for s in game_steps: 86 | env.max_episode_steps = s 87 | rewards = agent.play(env) 88 | means.append(np.mean(rewards)) 89 | result["epsilon={}".format(e)] = means 90 | result["coin toss count"] = game_steps 91 | result = pd.DataFrame(result) 92 | result.set_index("coin toss count", drop=True, inplace=True) 93 | result.plot.line(figsize=(10, 5)) 94 | plt.show() 95 | 96 | main() 97 | -------------------------------------------------------------------------------- /EL/frozen_lake_util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import matplotlib.cm as cm 4 | import gym 5 | from gym.envs.registration import register 6 | register(id="FrozenLakeEasy-v0", entry_point="gym.envs.toy_text:FrozenLakeEnv", 7 | kwargs={"is_slippery": False}) 8 | 9 | 10 | def show_q_value(Q): 11 | """ 12 | Show Q-values for FrozenLake-v0. 13 | To show each action's evaluation, 14 | a state is shown as 3 x 3 matrix like following. 15 | 16 | +---+---+---+ 17 | | | u | | u: up value 18 | | l | m | r | l: left value, r: right value, m: mean value 19 | | | d | | d: down value 20 | +---+---+---+ 21 | """ 22 | env = gym.make("FrozenLake-v0") 23 | nrow = env.unwrapped.nrow 24 | ncol = env.unwrapped.ncol 25 | state_size = 3 26 | q_nrow = nrow * state_size 27 | q_ncol = ncol * state_size 28 | reward_map = np.zeros((q_nrow, q_ncol)) 29 | 30 | for r in range(nrow): 31 | for c in range(ncol): 32 | s = r * ncol + c 33 | state_exist = False 34 | if isinstance(Q, dict) and s in Q: 35 | state_exist = True 36 | elif isinstance(Q, (np.ndarray, np.generic)) and s < Q.shape[0]: 37 | state_exist = True 38 | 39 | if state_exist: 40 | # At the display map, the vertical index is reversed. 
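                # Each state s = r * ncol + c becomes a 3 x 3 block centred at (_r, _c);
                # the surrounding cells receive the left/down/right/up action values and
                # the centre their mean, as sketched in the docstring. The row index is
                # flipped (nrow - 1 - r) to match the plot's y-axis direction.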
41 | _r = 1 + (nrow - 1 - r) * state_size 42 | _c = 1 + c * state_size 43 | reward_map[_r][_c - 1] = Q[s][0] # LEFT = 0 44 | reward_map[_r - 1][_c] = Q[s][1] # DOWN = 1 45 | reward_map[_r][_c + 1] = Q[s][2] # RIGHT = 2 46 | reward_map[_r + 1][_c] = Q[s][3] # UP = 3 47 | reward_map[_r][_c] = np.mean(Q[s]) # Center 48 | 49 | fig = plt.figure() 50 | ax = fig.add_subplot(1, 1, 1) 51 | plt.imshow(reward_map, cmap=cm.RdYlGn, interpolation="bilinear", 52 | vmax=abs(reward_map).max(), vmin=-abs(reward_map).max()) 53 | ax.set_xlim(-0.5, q_ncol - 0.5) 54 | ax.set_ylim(-0.5, q_nrow - 0.5) 55 | ax.set_xticks(np.arange(-0.5, q_ncol, state_size)) 56 | ax.set_yticks(np.arange(-0.5, q_nrow, state_size)) 57 | ax.set_xticklabels(range(ncol + 1)) 58 | ax.set_yticklabels(range(nrow + 1)) 59 | ax.grid(which="both") 60 | plt.show() 61 | -------------------------------------------------------------------------------- /EL/monte_carlo.py: -------------------------------------------------------------------------------- 1 | import math 2 | from collections import defaultdict 3 | import gym 4 | from el_agent import ELAgent 5 | from frozen_lake_util import show_q_value 6 | 7 | 8 | class MonteCarloAgent(ELAgent): 9 | 10 | def __init__(self, epsilon=0.1): 11 | super().__init__(epsilon) 12 | 13 | def learn(self, env, episode_count=1000, gamma=0.9, 14 | render=False, report_interval=50): 15 | self.init_log() 16 | actions = list(range(env.action_space.n)) 17 | self.Q = defaultdict(lambda: [0] * len(actions)) 18 | N = defaultdict(lambda: [0] * len(actions)) 19 | 20 | for e in range(episode_count): 21 | s = env.reset() 22 | done = False 23 | # Play until the end of episode. 24 | experience = [] 25 | while not done: 26 | if render: 27 | env.render() 28 | a = self.policy(s, actions) 29 | n_state, reward, done, info = env.step(a) 30 | experience.append({"state": s, "action": a, "reward": reward}) 31 | s = n_state 32 | else: 33 | self.log(reward) 34 | 35 | # Evaluate each state, action. 36 | for i, x in enumerate(experience): 37 | s, a = x["state"], x["action"] 38 | 39 | # Calculate discounted future reward of s. 
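                # G is the Monte Carlo return observed from step i to the episode end:
                #   G = r_i + gamma * r_{i+1} + gamma^2 * r_{i+2} + ...
                # Q[s][a] is then nudged toward G with step size alpha = 1 / N[s][a],
                # i.e. a running average over every visit to the (s, a) pair.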
40 | G, t = 0, 0 41 | for j in range(i, len(experience)): 42 | G += math.pow(gamma, t) * experience[j]["reward"] 43 | t += 1 44 | 45 | N[s][a] += 1 # count of s, a pair 46 | alpha = 1 / N[s][a] 47 | self.Q[s][a] += alpha * (G - self.Q[s][a]) 48 | 49 | if e != 0 and e % report_interval == 0: 50 | self.show_reward_log(episode=e) 51 | 52 | 53 | def train(): 54 | agent = MonteCarloAgent(epsilon=0.1) 55 | env = gym.make("FrozenLakeEasy-v0") 56 | agent.learn(env, episode_count=500) 57 | show_q_value(agent.Q) 58 | agent.show_reward_log() 59 | 60 | 61 | if __name__ == "__main__": 62 | train() 63 | -------------------------------------------------------------------------------- /EL/q_learning.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import gym 3 | from el_agent import ELAgent 4 | from frozen_lake_util import show_q_value 5 | 6 | 7 | class QLearningAgent(ELAgent): 8 | 9 | def __init__(self, epsilon=0.1): 10 | super().__init__(epsilon) 11 | 12 | def learn(self, env, episode_count=1000, gamma=0.9, 13 | learning_rate=0.1, render=False, report_interval=50): 14 | self.init_log() 15 | actions = list(range(env.action_space.n)) 16 | self.Q = defaultdict(lambda: [0] * len(actions)) 17 | for e in range(episode_count): 18 | s = env.reset() 19 | done = False 20 | while not done: 21 | if render: 22 | env.render() 23 | a = self.policy(s, actions) 24 | n_state, reward, done, info = env.step(a) 25 | 26 | gain = reward + gamma * max(self.Q[n_state]) 27 | estimated = self.Q[s][a] 28 | self.Q[s][a] += learning_rate * (gain - estimated) 29 | s = n_state 30 | 31 | else: 32 | self.log(reward) 33 | 34 | if e != 0 and e % report_interval == 0: 35 | self.show_reward_log(episode=e) 36 | 37 | 38 | def train(): 39 | agent = QLearningAgent() 40 | env = gym.make("FrozenLakeEasy-v0") 41 | agent.learn(env, episode_count=500) 42 | show_q_value(agent.Q) 43 | agent.show_reward_log() 44 | 45 | 46 | if __name__ == "__main__": 47 | train() 48 | -------------------------------------------------------------------------------- /EL/sarsa.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import gym 3 | from el_agent import ELAgent 4 | from frozen_lake_util import show_q_value 5 | 6 | 7 | class SARSAAgent(ELAgent): 8 | 9 | def __init__(self, epsilon=0.1): 10 | super().__init__(epsilon) 11 | 12 | def learn(self, env, episode_count=1000, gamma=0.9, 13 | learning_rate=0.1, render=False, report_interval=50): 14 | self.init_log() 15 | actions = list(range(env.action_space.n)) 16 | self.Q = defaultdict(lambda: [0] * len(actions)) 17 | for e in range(episode_count): 18 | s = env.reset() 19 | done = False 20 | a = self.policy(s, actions) 21 | while not done: 22 | if render: 23 | env.render() 24 | n_state, reward, done, info = env.step(a) 25 | 26 | n_action = self.policy(n_state, actions) # On-policy 27 | gain = reward + gamma * self.Q[n_state][n_action] 28 | estimated = self.Q[s][a] 29 | self.Q[s][a] += learning_rate * (gain - estimated) 30 | s = n_state 31 | a = n_action 32 | else: 33 | self.log(reward) 34 | 35 | if e != 0 and e % report_interval == 0: 36 | self.show_reward_log(episode=e) 37 | 38 | 39 | def train(): 40 | agent = SARSAAgent() 41 | env = gym.make("FrozenLakeEasy-v0") 42 | agent.learn(env, episode_count=500) 43 | show_q_value(agent.Q) 44 | agent.show_reward_log() 45 | 46 | 47 | if __name__ == "__main__": 48 | train() 49 | 
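# Note: the only difference from q_learning.py is the update target. SARSA uses the
# value of the action it will actually take next (self.Q[n_state][n_action], on-policy),
# whereas Q-learning uses max(self.Q[n_state]) (off-policy). compare_q_s.py trains both
# variants on the same environment for a side-by-side comparison.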
-------------------------------------------------------------------------------- /EV/evolution.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import numpy as np 4 | from sklearn.externals.joblib import Parallel, delayed 5 | from PIL import Image 6 | import matplotlib.pyplot as plt 7 | import gym 8 | 9 | # Disable TensorFlow GPU for parallel execution 10 | if os.name == "nt": 11 | os.environ["CUDA_VISIBLE_DEVICES"] = "-1" 12 | else: 13 | os.environ["CUDA_VISIBLE_DEVICES"] = "" 14 | os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" 15 | 16 | from tensorflow.python import keras as K 17 | 18 | 19 | class EvolutionalAgent(): 20 | 21 | def __init__(self, actions): 22 | self.actions = actions 23 | self.model = None 24 | 25 | def save(self, model_path): 26 | self.model.save(model_path, overwrite=True, include_optimizer=False) 27 | 28 | @classmethod 29 | def load(cls, env, model_path): 30 | actions = list(range(env.action_space.n)) 31 | agent = cls(actions) 32 | agent.model = K.models.load_model(model_path) 33 | return agent 34 | 35 | def initialize(self, state, weights=()): 36 | normal = K.initializers.glorot_normal() 37 | model = K.Sequential() 38 | model.add(K.layers.Conv2D( 39 | 3, kernel_size=5, strides=3, 40 | input_shape=state.shape, kernel_initializer=normal, 41 | activation="relu")) 42 | model.add(K.layers.Flatten()) 43 | model.add(K.layers.Dense(len(self.actions), activation="softmax")) 44 | self.model = model 45 | if len(weights) > 0: 46 | self.model.set_weights(weights) 47 | 48 | def policy(self, state): 49 | action_probs = self.model.predict(np.array([state]))[0] 50 | action = np.random.choice(self.actions, 51 | size=1, p=action_probs)[0] 52 | return action 53 | 54 | def play(self, env, episode_count=5, render=True): 55 | for e in range(episode_count): 56 | s = env.reset() 57 | done = False 58 | episode_reward = 0 59 | while not done: 60 | if render: 61 | env.render() 62 | a = self.policy(s) 63 | n_state, reward, done, info = env.step(a) 64 | episode_reward += reward 65 | s = n_state 66 | else: 67 | print("Get reward {}".format(episode_reward)) 68 | 69 | 70 | class CatcherObserver(): 71 | 72 | def __init__(self, width, height, frame_count): 73 | import gym_ple 74 | self._env = gym.make("Catcher-v0") 75 | self.width = width 76 | self.height = height 77 | 78 | @property 79 | def action_space(self): 80 | return self._env.action_space 81 | 82 | @property 83 | def observation_space(self): 84 | return self._env.observation_space 85 | 86 | def reset(self): 87 | return self.transform(self._env.reset()) 88 | 89 | def render(self): 90 | self._env.render(mode="human") 91 | 92 | def step(self, action): 93 | n_state, reward, done, info = self._env.step(action) 94 | return self.transform(n_state), reward, done, info 95 | 96 | def transform(self, state): 97 | grayed = Image.fromarray(state).convert("L") 98 | resized = grayed.resize((self.width, self.height)) 99 | resized = np.array(resized).astype("float") 100 | normalized = resized / 255.0 # scale to 0~1 101 | normalized = np.expand_dims(normalized, axis=2) # H x W => W x W x C 102 | return normalized 103 | 104 | 105 | class EvolutionalTrainer(): 106 | 107 | def __init__(self, population_size=20, sigma=0.5, learning_rate=0.1, 108 | report_interval=10): 109 | self.population_size = population_size 110 | self.sigma = sigma 111 | self.learning_rate = learning_rate 112 | self.weights = () 113 | self.reward_log = [] 114 | 115 | def train(self, epoch=100, episode_per_agent=1, 
render=False): 116 | env = self.make_env() 117 | actions = list(range(env.action_space.n)) 118 | s = env.reset() 119 | agent = EvolutionalAgent(actions) 120 | agent.initialize(s) 121 | self.weights = agent.model.get_weights() 122 | 123 | with Parallel(n_jobs=-1) as parallel: 124 | for e in range(epoch): 125 | experiment = delayed(EvolutionalTrainer.run_agent) 126 | results = parallel(experiment( 127 | episode_per_agent, self.weights, self.sigma) 128 | for p in range(self.population_size)) 129 | self.update(results) 130 | self.log() 131 | 132 | agent.model.set_weights(self.weights) 133 | return agent 134 | 135 | @classmethod 136 | def make_env(cls): 137 | return CatcherObserver(width=50, height=50, frame_count=5) 138 | 139 | @classmethod 140 | def run_agent(cls, episode_per_agent, base_weights, sigma, 141 | max_step=1000): 142 | env = cls.make_env() 143 | actions = list(range(env.action_space.n)) 144 | agent = EvolutionalAgent(actions) 145 | 146 | noises = [] 147 | new_weights = [] 148 | 149 | # Make weight 150 | for w in base_weights: 151 | noise = np.random.randn(*w.shape) 152 | new_weights.append(w + sigma * noise) 153 | noises.append(noise) 154 | 155 | # Test Play 156 | total_reward = 0 157 | for e in range(episode_per_agent): 158 | s = env.reset() 159 | if agent.model is None: 160 | agent.initialize(s, new_weights) 161 | done = False 162 | step = 0 163 | while not done and step < max_step: 164 | a = agent.policy(s) 165 | n_state, reward, done, info = env.step(a) 166 | total_reward += reward 167 | s = n_state 168 | step += 1 169 | 170 | reward = total_reward / episode_per_agent 171 | return reward, noises 172 | 173 | def update(self, agent_results): 174 | rewards = np.array([r[0] for r in agent_results]) 175 | noises = np.array([r[1] for r in agent_results]) 176 | normalized_rs = (rewards - rewards.mean()) / rewards.std() 177 | 178 | # Update base weights 179 | new_weights = [] 180 | for i, w in enumerate(self.weights): 181 | noise_at_i = np.array([n[i] for n in noises]) 182 | rate = self.learning_rate / (self.population_size * self.sigma) 183 | w = w + rate * np.dot(noise_at_i.T, normalized_rs).T 184 | new_weights.append(w) 185 | 186 | self.weights = new_weights 187 | self.reward_log.append(rewards) 188 | 189 | def log(self): 190 | rewards = self.reward_log[-1] 191 | print("Epoch {}: reward {:.3}(max:{}, min:{})".format( 192 | len(self.reward_log), rewards.mean(), 193 | rewards.max(), rewards.min())) 194 | 195 | def plot_rewards(self): 196 | indices = range(len(self.reward_log)) 197 | means = np.array([rs.mean() for rs in self.reward_log]) 198 | stds = np.array([rs.std() for rs in self.reward_log]) 199 | plt.figure() 200 | plt.title("Reward History") 201 | plt.grid() 202 | plt.fill_between(indices, means - stds, means + stds, 203 | alpha=0.1, color="g") 204 | plt.plot(indices, means, "o-", color="g", 205 | label="reward") 206 | plt.legend(loc="best") 207 | plt.show() 208 | 209 | 210 | def main(play): 211 | model_path = os.path.join(os.path.dirname(__file__), "ev_agent.h5") 212 | 213 | if play: 214 | env = EvolutionalTrainer.make_env() 215 | agent = EvolutionalAgent.load(env, model_path) 216 | agent.play(env, episode_count=5, render=True) 217 | else: 218 | trainer = EvolutionalTrainer() 219 | trained = trainer.train() 220 | trained.save(model_path) 221 | trainer.plot_rewards() 222 | 223 | 224 | if __name__ == "__main__": 225 | parser = argparse.ArgumentParser(description="Evolutional Agent") 226 | parser.add_argument("--play", action="store_true", 227 | help="play with trained model") 
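    # Usage: "python evolution.py" trains and saves ev_agent.h5 next to this file;
    # "python evolution.py --play" loads that file and renders the trained agent.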
228 | 229 | args = parser.parse_args() 230 | main(args.play) 231 | -------------------------------------------------------------------------------- /FN/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/icoxfog417/baby-steps-of-rl-ja/1dadc208d6e9f50e010e6b5d2dcc5d9fc11eb51d/FN/__init__.py -------------------------------------------------------------------------------- /FN/a2c_agent.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from collections import deque 3 | import numpy as np 4 | from sklearn.preprocessing import StandardScaler 5 | import tensorflow as tf 6 | from tensorflow.python import keras as K 7 | from PIL import Image 8 | import gym 9 | import gym_ple 10 | from fn_framework import FNAgent, Trainer, Observer 11 | tf.compat.v1.disable_eager_execution() 12 | 13 | 14 | class ActorCriticAgent(FNAgent): 15 | 16 | def __init__(self, actions): 17 | # ActorCriticAgent uses self policy (doesn't use epsilon). 18 | super().__init__(epsilon=0.0, actions=actions) 19 | self._updater = None 20 | 21 | @classmethod 22 | def load(cls, env, model_path): 23 | actions = list(range(env.action_space.n)) 24 | agent = cls(actions) 25 | agent.model = K.models.load_model(model_path, custom_objects={ 26 | "SampleLayer": SampleLayer}) 27 | agent.initialized = True 28 | return agent 29 | 30 | def initialize(self, experiences, optimizer): 31 | feature_shape = experiences[0].s.shape 32 | self.make_model(feature_shape) 33 | self.set_updater(optimizer) 34 | self.initialized = True 35 | print("Done initialization. From now, begin training!") 36 | 37 | def make_model(self, feature_shape): 38 | normal = K.initializers.glorot_normal() 39 | model = K.Sequential() 40 | model.add(K.layers.Conv2D( 41 | 32, kernel_size=8, strides=4, padding="same", 42 | input_shape=feature_shape, 43 | kernel_initializer=normal, activation="relu")) 44 | model.add(K.layers.Conv2D( 45 | 64, kernel_size=4, strides=2, padding="same", 46 | kernel_initializer=normal, activation="relu")) 47 | model.add(K.layers.Conv2D( 48 | 64, kernel_size=3, strides=1, padding="same", 49 | kernel_initializer=normal, activation="relu")) 50 | model.add(K.layers.Flatten()) 51 | model.add(K.layers.Dense(256, kernel_initializer=normal, 52 | activation="relu")) 53 | 54 | actor_layer = K.layers.Dense(len(self.actions), 55 | kernel_initializer=normal) 56 | action_evals = actor_layer(model.output) 57 | actions = SampleLayer()(action_evals) 58 | 59 | critic_layer = K.layers.Dense(1, kernel_initializer=normal) 60 | values = critic_layer(model.output) 61 | 62 | self.model = K.Model(inputs=model.input, 63 | outputs=[actions, action_evals, values]) 64 | 65 | def set_updater(self, optimizer, 66 | value_loss_weight=1.0, entropy_weight=0.1): 67 | actions = tf.compat.v1.placeholder(shape=(None), dtype="int32") 68 | values = tf.compat.v1.placeholder(shape=(None), dtype="float32") 69 | 70 | _, action_evals, estimateds = self.model.output 71 | 72 | neg_logs = tf.nn.sparse_softmax_cross_entropy_with_logits( 73 | logits=action_evals, labels=actions) 74 | # tf.stop_gradient: Prevent policy_loss influences critic_layer. 
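        # "values" receives the standardised n-step returns built in make_batch, while
        # "estimateds" is the critic's V(s); their difference is the advantage A(s, a).
        # policy_loss = mean(-log pi(a|s) * A(s, a)) therefore reinforces actions whose
        # return beat the critic's estimate, and the critic itself is trained only
        # through value_loss.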
75 | advantages = values - tf.stop_gradient(estimateds) 76 | 77 | policy_loss = tf.reduce_mean(neg_logs * advantages) 78 | value_loss = tf.keras.losses.MeanSquaredError()(values, estimateds) 79 | action_entropy = tf.reduce_mean(self.categorical_entropy(action_evals)) 80 | 81 | loss = policy_loss + value_loss_weight * value_loss 82 | loss -= entropy_weight * action_entropy 83 | 84 | updates = optimizer.get_updates(loss=loss, 85 | params=self.model.trainable_weights) 86 | 87 | self._updater = K.backend.function( 88 | inputs=[self.model.input, 89 | actions, values], 90 | outputs=[loss, 91 | policy_loss, 92 | value_loss, 93 | tf.reduce_mean(neg_logs), 94 | tf.reduce_mean(advantages), 95 | action_entropy], 96 | updates=updates) 97 | 98 | def categorical_entropy(self, logits): 99 | """ 100 | From OpenAI baseline implementation. 101 | https://github.com/openai/baselines/blob/master/baselines/common/distributions.py#L192 102 | """ 103 | a0 = logits - tf.reduce_max(logits, axis=-1, keepdims=True) 104 | ea0 = tf.exp(a0) 105 | z0 = tf.reduce_sum(ea0, axis=-1, keepdims=True) 106 | p0 = ea0 / z0 107 | return tf.reduce_sum(p0 * (tf.math.log(z0) - a0), axis=-1) 108 | 109 | def policy(self, s): 110 | if not self.initialized: 111 | return np.random.randint(len(self.actions)) 112 | else: 113 | action, action_evals, values = self.model.predict(np.array([s])) 114 | return action[0] 115 | 116 | def estimate(self, s): 117 | action, action_evals, values = self.model.predict(np.array([s])) 118 | return values[0][0] 119 | 120 | def update(self, states, actions, rewards): 121 | return self._updater([states, actions, rewards]) 122 | 123 | 124 | class SampleLayer(K.layers.Layer): 125 | 126 | def __init__(self, **kwargs): 127 | self.output_dim = 1 # sample one action from evaluations 128 | super(SampleLayer, self).__init__(**kwargs) 129 | 130 | def build(self, input_shape): 131 | super(SampleLayer, self).build(input_shape) 132 | 133 | def call(self, x): 134 | noise = tf.random.uniform(tf.shape(x)) 135 | return tf.argmax(x - tf.math.log(-tf.math.log(noise)), axis=1) 136 | 137 | def compute_output_shape(self, input_shape): 138 | return (input_shape[0], self.output_dim) 139 | 140 | 141 | class ActorCriticAgentTest(ActorCriticAgent): 142 | 143 | def make_model(self, feature_shape): 144 | normal = K.initializers.glorot_normal() 145 | model = K.Sequential() 146 | model.add(K.layers.Dense(10, input_shape=feature_shape, 147 | kernel_initializer=normal, activation="relu")) 148 | model.add(K.layers.Dense(10, kernel_initializer=normal, 149 | activation="relu")) 150 | 151 | actor_layer = K.layers.Dense(len(self.actions), 152 | kernel_initializer=normal) 153 | 154 | action_evals = actor_layer(model.output) 155 | actions = SampleLayer()(action_evals) 156 | 157 | critic_layer = K.layers.Dense(1, kernel_initializer=normal) 158 | values = critic_layer(model.output) 159 | 160 | self.model = K.Model(inputs=model.input, 161 | outputs=[actions, action_evals, values]) 162 | 163 | 164 | class CatcherObserver(Observer): 165 | 166 | def __init__(self, env, width, height, frame_count): 167 | super().__init__(env) 168 | self.width = width 169 | self.height = height 170 | self.frame_count = frame_count 171 | self._frames = deque(maxlen=frame_count) 172 | 173 | def transform(self, state): 174 | grayed = Image.fromarray(state).convert("L") 175 | resized = grayed.resize((self.width, self.height)) 176 | resized = np.array(resized).astype("float") 177 | normalized = resized / 255.0 # scale to 0~1 178 | if len(self._frames) == 0: 179 | for i in 
range(self.frame_count): 180 | self._frames.append(normalized) 181 | else: 182 | self._frames.append(normalized) 183 | feature = np.array(self._frames) 184 | # Convert the feature shape (f, w, h) => (h, w, f). 185 | feature = np.transpose(feature, (1, 2, 0)) 186 | return feature 187 | 188 | 189 | class ActorCriticTrainer(Trainer): 190 | 191 | def __init__(self, buffer_size=256, batch_size=32, 192 | gamma=0.99, learning_rate=1e-3, 193 | report_interval=10, log_dir="", file_name=""): 194 | super().__init__(buffer_size, batch_size, gamma, 195 | report_interval, log_dir) 196 | self.file_name = file_name if file_name else "a2c_agent.h5" 197 | self.learning_rate = learning_rate 198 | self.losses = {} 199 | self.rewards = [] 200 | self._max_reward = -10 201 | 202 | def train(self, env, episode_count=900, initial_count=10, 203 | test_mode=False, render=False, observe_interval=100): 204 | actions = list(range(env.action_space.n)) 205 | if not test_mode: 206 | agent = ActorCriticAgent(actions) 207 | else: 208 | agent = ActorCriticAgentTest(actions) 209 | observe_interval = 0 210 | self.training_episode = episode_count 211 | 212 | self.train_loop(env, agent, episode_count, initial_count, render, 213 | observe_interval) 214 | return agent 215 | 216 | def episode_begin(self, episode, agent): 217 | self.rewards = [] 218 | 219 | def step(self, episode, step_count, agent, experience): 220 | self.rewards.append(experience.r) 221 | if not agent.initialized: 222 | if len(self.experiences) < self.buffer_size: 223 | # Store experience until buffer_size (enough to initialize). 224 | return False 225 | 226 | optimizer = K.optimizers.Adam(lr=self.learning_rate, 227 | clipnorm=5.0) 228 | agent.initialize(self.experiences, optimizer) 229 | self.logger.set_model(agent.model) 230 | self.training = True 231 | self.experiences.clear() 232 | else: 233 | if len(self.experiences) < self.batch_size: 234 | # Store experience until batch_size (enough to update). 235 | return False 236 | 237 | batch = self.make_batch(agent) 238 | loss, lp, lv, p_ng, p_ad, p_en = agent.update(*batch) 239 | # Record latest metrics. 240 | self.losses["loss/total"] = loss 241 | self.losses["loss/policy"] = lp 242 | self.losses["loss/value"] = lv 243 | self.losses["policy/neg_logs"] = p_ng 244 | self.losses["policy/advantage"] = p_ad 245 | self.losses["policy/entropy"] = p_en 246 | self.experiences.clear() 247 | 248 | def make_batch(self, agent): 249 | states = [] 250 | actions = [] 251 | values = [] 252 | experiences = list(self.experiences) 253 | states = np.array([e.s for e in experiences]) 254 | actions = np.array([e.a for e in experiences]) 255 | 256 | # Calculate values. 257 | # If the last experience isn't terminal (done) then estimates value. 
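# Illustrative note (made-up numbers): the loop below walks the buffer backwards
# and bootstraps from the critic when the episode did not end. With gamma = 0.99
# and rewards [0, 0, 1] where the last step is terminal, the targets become
# [0.99**2, 0.99, 1.0] before StandardScaler normalization; if the last step were
# not terminal, the 1.0 would instead be r + 0.99 * V(next_state) from agent.estimate().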
258 | last = experiences[-1] 259 | future = last.r if last.d else agent.estimate(last.n_s) 260 | for e in reversed(experiences): 261 | value = e.r 262 | if not e.d: 263 | value += self.gamma * future 264 | values.append(value) 265 | future = value 266 | values = np.array(list(reversed(values))) 267 | 268 | scaler = StandardScaler() 269 | values = scaler.fit_transform(values.reshape((-1, 1))).flatten() 270 | 271 | return states, actions, values 272 | 273 | def episode_end(self, episode, step_count, agent): 274 | reward = sum(self.rewards) 275 | self.reward_log.append(reward) 276 | 277 | if agent.initialized: 278 | self.logger.write(self.training_count, "reward", reward) 279 | self.logger.write(self.training_count, "reward_max", 280 | max(self.rewards)) 281 | 282 | for k in self.losses: 283 | self.logger.write(self.training_count, k, self.losses[k]) 284 | 285 | if reward > self._max_reward: 286 | agent.save(self.logger.path_of(self.file_name)) 287 | self._max_reward = reward 288 | 289 | if self.is_event(episode, self.report_interval): 290 | recent_rewards = self.reward_log[-self.report_interval:] 291 | self.logger.describe("reward", recent_rewards, episode=episode) 292 | 293 | 294 | def main(play, is_test): 295 | file_name = "a2c_agent.h5" if not is_test else "a2c_agent_test.h5" 296 | trainer = ActorCriticTrainer(file_name=file_name) 297 | path = trainer.logger.path_of(trainer.file_name) 298 | agent_class = ActorCriticAgent 299 | 300 | if is_test: 301 | print("Train on test mode") 302 | obs = gym.make("CartPole-v0") 303 | agent_class = ActorCriticAgentTest 304 | else: 305 | env = gym.make("Catcher-v0") 306 | obs = CatcherObserver(env, 80, 80, 4) 307 | trainer.learning_rate = 7e-5 308 | 309 | if play: 310 | agent = agent_class.load(obs, path) 311 | agent.play(obs, episode_count=10, render=True) 312 | else: 313 | trainer.train(obs, test_mode=is_test) 314 | 315 | 316 | if __name__ == "__main__": 317 | parser = argparse.ArgumentParser(description="A2C Agent") 318 | parser.add_argument("--play", action="store_true", 319 | help="play with trained model") 320 | parser.add_argument("--test", action="store_true", 321 | help="train by test mode") 322 | 323 | args = parser.parse_args() 324 | main(args.play, args.test) 325 | -------------------------------------------------------------------------------- /FN/dqn_agent.py: -------------------------------------------------------------------------------- 1 | import random 2 | import argparse 3 | from collections import deque 4 | import numpy as np 5 | from tensorflow.python import keras as K 6 | from PIL import Image 7 | import gym 8 | import gym_ple 9 | from fn_framework import FNAgent, Trainer, Observer 10 | 11 | 12 | class DeepQNetworkAgent(FNAgent): 13 | 14 | def __init__(self, epsilon, actions): 15 | super().__init__(epsilon, actions) 16 | self._scaler = None 17 | self._teacher_model = None 18 | 19 | def initialize(self, experiences, optimizer): 20 | feature_shape = experiences[0].s.shape 21 | self.make_model(feature_shape) 22 | self.model.compile(optimizer, loss="mse") 23 | self.initialized = True 24 | print("Done initialization. 
From now, begin training!") 25 | 26 | def make_model(self, feature_shape): 27 | normal = K.initializers.glorot_normal() 28 | model = K.Sequential() 29 | model.add(K.layers.Conv2D( 30 | 32, kernel_size=8, strides=4, padding="same", 31 | input_shape=feature_shape, kernel_initializer=normal, 32 | activation="relu")) 33 | model.add(K.layers.Conv2D( 34 | 64, kernel_size=4, strides=2, padding="same", 35 | kernel_initializer=normal, 36 | activation="relu")) 37 | model.add(K.layers.Conv2D( 38 | 64, kernel_size=3, strides=1, padding="same", 39 | kernel_initializer=normal, 40 | activation="relu")) 41 | model.add(K.layers.Flatten()) 42 | model.add(K.layers.Dense(256, kernel_initializer=normal, 43 | activation="relu")) 44 | model.add(K.layers.Dense(len(self.actions), 45 | kernel_initializer=normal)) 46 | self.model = model 47 | self._teacher_model = K.models.clone_model(self.model) 48 | 49 | def estimate(self, state): 50 | return self.model.predict(np.array([state]))[0] 51 | 52 | def update(self, experiences, gamma): 53 | states = np.array([e.s for e in experiences]) 54 | n_states = np.array([e.n_s for e in experiences]) 55 | 56 | estimateds = self.model.predict(states) 57 | future = self._teacher_model.predict(n_states) 58 | 59 | for i, e in enumerate(experiences): 60 | reward = e.r 61 | if not e.d: 62 | reward += gamma * np.max(future[i]) 63 | estimateds[i][e.a] = reward 64 | 65 | loss = self.model.train_on_batch(states, estimateds) 66 | return loss 67 | 68 | def update_teacher(self): 69 | self._teacher_model.set_weights(self.model.get_weights()) 70 | 71 | 72 | class DeepQNetworkAgentTest(DeepQNetworkAgent): 73 | 74 | def __init__(self, epsilon, actions): 75 | super().__init__(epsilon, actions) 76 | 77 | def make_model(self, feature_shape): 78 | normal = K.initializers.glorot_normal() 79 | model = K.Sequential() 80 | model.add(K.layers.Dense(64, input_shape=feature_shape, 81 | kernel_initializer=normal, activation="relu")) 82 | model.add(K.layers.Dense(len(self.actions), kernel_initializer=normal, 83 | activation="relu")) 84 | self.model = model 85 | self._teacher_model = K.models.clone_model(self.model) 86 | 87 | 88 | class CatcherObserver(Observer): 89 | 90 | def __init__(self, env, width, height, frame_count): 91 | super().__init__(env) 92 | self.width = width 93 | self.height = height 94 | self.frame_count = frame_count 95 | self._frames = deque(maxlen=frame_count) 96 | 97 | def transform(self, state): 98 | grayed = Image.fromarray(state).convert("L") 99 | resized = grayed.resize((self.width, self.height)) 100 | resized = np.array(resized).astype("float") 101 | normalized = resized / 255.0 # scale to 0~1 102 | if len(self._frames) == 0: 103 | for i in range(self.frame_count): 104 | self._frames.append(normalized) 105 | else: 106 | self._frames.append(normalized) 107 | feature = np.array(self._frames) 108 | # Convert the feature shape (f, w, h) => (h, w, f). 
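# For example, with the settings used in main() below (width=height=80,
# frame_count=4), `feature` is a (4, 80, 80) stack of grayscale frames and the
# transpose turns it into the channels-last (80, 80, 4) array that the Conv2D
# layers in make_model expect as input_shape.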
109 | feature = np.transpose(feature, (1, 2, 0)) 110 | 111 | return feature 112 | 113 | 114 | class DeepQNetworkTrainer(Trainer): 115 | 116 | def __init__(self, buffer_size=50000, batch_size=32, 117 | gamma=0.99, initial_epsilon=0.5, final_epsilon=1e-3, 118 | learning_rate=1e-3, teacher_update_freq=3, report_interval=10, 119 | log_dir="", file_name=""): 120 | super().__init__(buffer_size, batch_size, gamma, 121 | report_interval, log_dir) 122 | self.file_name = file_name if file_name else "dqn_agent.h5" 123 | self.initial_epsilon = initial_epsilon 124 | self.final_epsilon = final_epsilon 125 | self.learning_rate = learning_rate 126 | self.teacher_update_freq = teacher_update_freq 127 | self.loss = 0 128 | self.training_episode = 0 129 | self._max_reward = -10 130 | 131 | def train(self, env, episode_count=1200, initial_count=200, 132 | test_mode=False, render=False, observe_interval=100): 133 | actions = list(range(env.action_space.n)) 134 | if not test_mode: 135 | agent = DeepQNetworkAgent(1.0, actions) 136 | else: 137 | agent = DeepQNetworkAgentTest(1.0, actions) 138 | observe_interval = 0 139 | self.training_episode = episode_count 140 | 141 | self.train_loop(env, agent, episode_count, initial_count, render, 142 | observe_interval) 143 | return agent 144 | 145 | def episode_begin(self, episode, agent): 146 | self.loss = 0 147 | 148 | def begin_train(self, episode, agent): 149 | optimizer = K.optimizers.Adam(lr=self.learning_rate, clipvalue=1.0) 150 | agent.initialize(self.experiences, optimizer) 151 | self.logger.set_model(agent.model) 152 | agent.epsilon = self.initial_epsilon 153 | self.training_episode -= episode 154 | 155 | def step(self, episode, step_count, agent, experience): 156 | if self.training: 157 | batch = random.sample(self.experiences, self.batch_size) 158 | self.loss += agent.update(batch, self.gamma) 159 | 160 | def episode_end(self, episode, step_count, agent): 161 | reward = sum([e.r for e in self.get_recent(step_count)]) 162 | self.loss = self.loss / step_count 163 | self.reward_log.append(reward) 164 | if self.training: 165 | self.logger.write(self.training_count, "loss", self.loss) 166 | self.logger.write(self.training_count, "reward", reward) 167 | self.logger.write(self.training_count, "epsilon", agent.epsilon) 168 | if reward > self._max_reward: 169 | agent.save(self.logger.path_of(self.file_name)) 170 | self._max_reward = reward 171 | if self.is_event(self.training_count, self.teacher_update_freq): 172 | agent.update_teacher() 173 | 174 | diff = (self.initial_epsilon - self.final_epsilon) 175 | decay = diff / self.training_episode 176 | agent.epsilon = max(agent.epsilon - decay, self.final_epsilon) 177 | 178 | if self.is_event(episode, self.report_interval): 179 | recent_rewards = self.reward_log[-self.report_interval:] 180 | self.logger.describe("reward", recent_rewards, episode=episode) 181 | 182 | 183 | def main(play, is_test): 184 | file_name = "dqn_agent.h5" if not is_test else "dqn_agent_test.h5" 185 | trainer = DeepQNetworkTrainer(file_name=file_name) 186 | path = trainer.logger.path_of(trainer.file_name) 187 | agent_class = DeepQNetworkAgent 188 | 189 | if is_test: 190 | print("Train on test mode") 191 | obs = gym.make("CartPole-v0") 192 | agent_class = DeepQNetworkAgentTest 193 | else: 194 | env = gym.make("Catcher-v0") 195 | obs = CatcherObserver(env, 80, 80, 4) 196 | trainer.learning_rate = 1e-4 197 | 198 | if play: 199 | agent = agent_class.load(obs, path) 200 | agent.play(obs, render=True) 201 | else: 202 | trainer.train(obs, test_mode=is_test) 
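# A hedged usage note, based on the argparse flags defined below (the Catcher-v0
# environment assumes gym_ple is installed, as imported at the top of the file):
#   python dqn_agent.py --test    # quick run on CartPole-v0 with the small test network
#   python dqn_agent.py           # full training on Catcher-v0; best model saved as dqn_agent.h5 under the log dir
#   python dqn_agent.py --play    # reload the saved model and watch it play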
203 | 204 | 205 | if __name__ == "__main__": 206 | parser = argparse.ArgumentParser(description="DQN Agent") 207 | parser.add_argument("--play", action="store_true", 208 | help="play with trained model") 209 | parser.add_argument("--test", action="store_true", 210 | help="train by test mode") 211 | 212 | args = parser.parse_args() 213 | main(args.play, args.test) 214 | -------------------------------------------------------------------------------- /FN/fn_framework.py: -------------------------------------------------------------------------------- 1 | import os 2 | import io 3 | import re 4 | from collections import namedtuple 5 | from collections import deque 6 | import numpy as np 7 | import tensorflow as tf 8 | from tensorflow.python import keras as K 9 | from PIL import Image 10 | import matplotlib.pyplot as plt 11 | 12 | 13 | Experience = namedtuple("Experience", 14 | ["s", "a", "r", "n_s", "d"]) 15 | 16 | 17 | class FNAgent(): 18 | 19 | def __init__(self, epsilon, actions): 20 | self.epsilon = epsilon 21 | self.actions = actions 22 | self.model = None 23 | self.estimate_probs = False 24 | self.initialized = False 25 | 26 | def save(self, model_path): 27 | self.model.save(model_path, overwrite=True, include_optimizer=False) 28 | 29 | @classmethod 30 | def load(cls, env, model_path, epsilon=0.0001): 31 | actions = list(range(env.action_space.n)) 32 | agent = cls(epsilon, actions) 33 | agent.model = K.models.load_model(model_path) 34 | agent.initialized = True 35 | return agent 36 | 37 | def initialize(self, experiences): 38 | raise NotImplementedError("You have to implement initialize method.") 39 | 40 | def estimate(self, s): 41 | raise NotImplementedError("You have to implement estimate method.") 42 | 43 | def update(self, experiences, gamma): 44 | raise NotImplementedError("You have to implement update method.") 45 | 46 | def policy(self, s): 47 | if np.random.random() < self.epsilon or not self.initialized: 48 | return np.random.randint(len(self.actions)) 49 | else: 50 | estimates = self.estimate(s) 51 | if self.estimate_probs: 52 | action = np.random.choice(self.actions, 53 | size=1, p=estimates)[0] 54 | return action 55 | else: 56 | return np.argmax(estimates) 57 | 58 | def play(self, env, episode_count=5, render=True): 59 | for e in range(episode_count): 60 | s = env.reset() 61 | done = False 62 | episode_reward = 0 63 | while not done: 64 | if render: 65 | env.render() 66 | a = self.policy(s) 67 | n_state, reward, done, info = env.step(a) 68 | episode_reward += reward 69 | s = n_state 70 | else: 71 | print("Get reward {}.".format(episode_reward)) 72 | 73 | 74 | class Trainer(): 75 | 76 | def __init__(self, buffer_size=1024, batch_size=32, 77 | gamma=0.9, report_interval=10, log_dir=""): 78 | self.buffer_size = buffer_size 79 | self.batch_size = batch_size 80 | self.gamma = gamma 81 | self.report_interval = report_interval 82 | self.logger = Logger(log_dir, self.trainer_name) 83 | self.experiences = deque(maxlen=buffer_size) 84 | self.training = False 85 | self.training_count = 0 86 | self.reward_log = [] 87 | 88 | @property 89 | def trainer_name(self): 90 | class_name = self.__class__.__name__ 91 | snaked = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", class_name) 92 | snaked = re.sub("([a-z0-9])([A-Z])", r"\1_\2", snaked).lower() 93 | snaked = snaked.replace("_trainer", "") 94 | return snaked 95 | 96 | def train_loop(self, env, agent, episode=200, initial_count=-1, 97 | render=False, observe_interval=0): 98 | self.experiences = deque(maxlen=self.buffer_size) 99 | self.training = False 
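# Rough outline of the loop that follows (all hooks are no-ops here and are
# overridden by the concrete trainers): each episode runs
#   episode_begin -> (policy, env.step, buffer Experience, step) ... -> episode_end
# and begin_train fires exactly once, either when the experience buffer first
# reaches buffer_size or once `initial_count` episodes have elapsed, after which
# self.training is set to True.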
100 | self.training_count = 0 101 | self.reward_log = [] 102 | frames = [] 103 | 104 | for i in range(episode): 105 | s = env.reset() 106 | done = False 107 | step_count = 0 108 | self.episode_begin(i, agent) 109 | while not done: 110 | if render: 111 | env.render() 112 | if self.training and observe_interval > 0 and\ 113 | (self.training_count == 1 or 114 | self.training_count % observe_interval == 0): 115 | frames.append(s) 116 | 117 | a = agent.policy(s) 118 | n_state, reward, done, info = env.step(a) 119 | e = Experience(s, a, reward, n_state, done) 120 | self.experiences.append(e) 121 | if not self.training and \ 122 | len(self.experiences) == self.buffer_size: 123 | self.begin_train(i, agent) 124 | self.training = True 125 | 126 | self.step(i, step_count, agent, e) 127 | 128 | s = n_state 129 | step_count += 1 130 | else: 131 | self.episode_end(i, step_count, agent) 132 | 133 | if not self.training and \ 134 | initial_count > 0 and i >= initial_count: 135 | self.begin_train(i, agent) 136 | self.training = True 137 | 138 | if self.training: 139 | if len(frames) > 0: 140 | self.logger.write_image(self.training_count, 141 | frames) 142 | frames = [] 143 | self.training_count += 1 144 | 145 | def episode_begin(self, episode, agent): 146 | pass 147 | 148 | def begin_train(self, episode, agent): 149 | pass 150 | 151 | def step(self, episode, step_count, agent, experience): 152 | pass 153 | 154 | def episode_end(self, episode, step_count, agent): 155 | pass 156 | 157 | def is_event(self, count, interval): 158 | return True if count != 0 and count % interval == 0 else False 159 | 160 | def get_recent(self, count): 161 | recent = range(len(self.experiences) - count, len(self.experiences)) 162 | return [self.experiences[i] for i in recent] 163 | 164 | 165 | class Observer(): 166 | 167 | def __init__(self, env): 168 | self._env = env 169 | 170 | @property 171 | def action_space(self): 172 | return self._env.action_space 173 | 174 | @property 175 | def observation_space(self): 176 | return self._env.observation_space 177 | 178 | def reset(self): 179 | return self.transform(self._env.reset()) 180 | 181 | def render(self): 182 | self._env.render(mode="human") 183 | 184 | def step(self, action): 185 | n_state, reward, done, info = self._env.step(action) 186 | return self.transform(n_state), reward, done, info 187 | 188 | def transform(self, state): 189 | raise NotImplementedError("You have to implement transform method.") 190 | 191 | 192 | class Logger(): 193 | 194 | def __init__(self, log_dir="", dir_name=""): 195 | self.log_dir = log_dir 196 | if not log_dir: 197 | self.log_dir = os.path.join(os.path.dirname(__file__), "logs") 198 | if not os.path.exists(self.log_dir): 199 | os.mkdir(self.log_dir) 200 | 201 | if dir_name: 202 | self.log_dir = os.path.join(self.log_dir, dir_name) 203 | if not os.path.exists(self.log_dir): 204 | os.mkdir(self.log_dir) 205 | 206 | self._callback = tf.compat.v1.keras.callbacks.TensorBoard( 207 | self.log_dir) 208 | 209 | @property 210 | def writer(self): 211 | return self._callback.writer 212 | 213 | def set_model(self, model): 214 | self._callback.set_model(model) 215 | 216 | def path_of(self, file_name): 217 | return os.path.join(self.log_dir, file_name) 218 | 219 | def describe(self, name, values, episode=-1, step=-1): 220 | mean = np.round(np.mean(values), 3) 221 | std = np.round(np.std(values), 3) 222 | desc = "{} is {} (+/-{})".format(name, mean, std) 223 | if episode > 0: 224 | print("At episode {}, {}".format(episode, desc)) 225 | elif step > 0: 226 | 
print("At step {}, {}".format(step, desc)) 227 | 228 | def plot(self, name, values, interval=10): 229 | indices = list(range(0, len(values), interval)) 230 | means = [] 231 | stds = [] 232 | for i in indices: 233 | _values = values[i:(i + interval)] 234 | means.append(np.mean(_values)) 235 | stds.append(np.std(_values)) 236 | means = np.array(means) 237 | stds = np.array(stds) 238 | plt.figure() 239 | plt.title("{} History".format(name)) 240 | plt.grid() 241 | plt.fill_between(indices, means - stds, means + stds, 242 | alpha=0.1, color="g") 243 | plt.plot(indices, means, "o-", color="g", 244 | label="{} per {} episode".format(name.lower(), interval)) 245 | plt.legend(loc="best") 246 | plt.show() 247 | 248 | def write(self, index, name, value): 249 | summary = tf.compat.v1.Summary() 250 | summary_value = summary.value.add() 251 | summary_value.tag = name 252 | summary_value.simple_value = value 253 | self.writer.add_summary(summary, index) 254 | self.writer.flush() 255 | 256 | def write_image(self, index, frames): 257 | # Deal with a 'frames' as a list of sequential gray scaled image. 258 | last_frames = [f[:, :, -1] for f in frames] 259 | if np.min(last_frames[-1]) < 0: 260 | scale = 127 / np.abs(last_frames[-1]).max() 261 | offset = 128 262 | else: 263 | scale = 255 / np.max(last_frames[-1]) 264 | offset = 0 265 | channel = 1 # gray scale 266 | tag = "frames_at_training_{}".format(index) 267 | values = [] 268 | 269 | for f in last_frames: 270 | height, width = f.shape 271 | array = np.asarray(f * scale + offset, dtype=np.uint8) 272 | image = Image.fromarray(array) 273 | output = io.BytesIO() 274 | image.save(output, format="PNG") 275 | image_string = output.getvalue() 276 | output.close() 277 | image = tf.compat.v1.Summary.Image( 278 | height=height, width=width, colorspace=channel, 279 | encoded_image_string=image_string) 280 | value = tf.compat.v1.Summary.Value(tag=tag, image=image) 281 | values.append(value) 282 | 283 | summary = tf.compat.v1.Summary(value=values) 284 | self.writer.add_summary(summary, index) 285 | self.writer.flush() 286 | -------------------------------------------------------------------------------- /FN/nn_tutorial/explanation_keras.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from tensorflow.python import keras as K 3 | 4 | model = K.Sequential([ 5 | K.layers.Dense(units=4, input_shape=((2, ))), 6 | ]) 7 | 8 | weight, bias = model.layers[0].get_weights() 9 | print("Weight shape is {}.".format(weight.shape)) 10 | print("Bias shape is {}.".format(bias.shape)) 11 | 12 | x = np.random.rand(1, 2) 13 | y = model.predict(x) 14 | print("x is ({}) and y is ({}).".format(x.shape, y.shape)) 15 | -------------------------------------------------------------------------------- /FN/nn_tutorial/explanation_keras_batch.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from tensorflow.python import keras as K 3 | 4 | # 2-layer neural network. 5 | model = K.Sequential([ 6 | K.layers.Dense(units=4, input_shape=((2, )), 7 | activation="sigmoid"), 8 | K.layers.Dense(units=4), 9 | ]) 10 | 11 | # Make batch size = 3 data (dimension of x is 2). 
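# (Illustrative note) Every row of the batch is treated as one sample: the
# (3, 2) input is multiplied by the first layer's (2, 4) weight matrix, and the
# second Dense layer maps (3, 4) -> (3, 4), so the shape printed below is (3, 4).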
12 | batch = np.random.rand(3, 2) 13 | 14 | y = model.predict(batch) 15 | print(y.shape) # Will be (3, 4) 16 | -------------------------------------------------------------------------------- /FN/nn_tutorial/explanation_keras_boston.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.model_selection import train_test_split 3 | from sklearn.datasets import load_boston 4 | import pandas as pd 5 | import matplotlib.pyplot as plt 6 | from tensorflow.python import keras as K 7 | 8 | 9 | dataset = load_boston() 10 | 11 | y = dataset.target 12 | X = dataset.data 13 | 14 | X_train, X_test, y_train, y_test = train_test_split( 15 | X, y, test_size=0.33) 16 | 17 | model = K.Sequential([ 18 | K.layers.BatchNormalization(input_shape=(13,)), 19 | K.layers.Dense(units=13, activation="softplus", kernel_regularizer="l1"), 20 | K.layers.Dense(units=1) 21 | ]) 22 | model.compile(loss="mean_squared_error", optimizer="sgd") 23 | model.fit(X_train, y_train, epochs=8) 24 | 25 | predicts = model.predict(X_test) 26 | 27 | result = pd.DataFrame({ 28 | "predict": np.reshape(predicts, (-1,)), 29 | "actual": y_test 30 | }) 31 | limit = np.max(y_test) 32 | 33 | result.plot.scatter(x="actual", y="predict", xlim=(0, limit), ylim=(0, limit)) 34 | plt.show() 35 | -------------------------------------------------------------------------------- /FN/nn_tutorial/explanation_keras_mnist.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.model_selection import train_test_split 3 | from sklearn.datasets import load_digits 4 | from sklearn.metrics import classification_report 5 | from tensorflow.python import keras as K 6 | 7 | 8 | dataset = load_digits() 9 | image_shape = (8, 8, 1) 10 | num_class = 10 11 | 12 | y = dataset.target 13 | y = K.utils.to_categorical(y, num_class) 14 | X = dataset.data 15 | X = np.array([data.reshape(image_shape) for data in X]) 16 | 17 | X_train, X_test, y_train, y_test = train_test_split( 18 | X, y, test_size=0.33) 19 | 20 | model = K.Sequential([ 21 | K.layers.Conv2D( 22 | 5, kernel_size=3, strides=1, padding="same", 23 | input_shape=image_shape, activation="relu"), 24 | K.layers.Conv2D( 25 | 3, kernel_size=2, strides=1, padding="same", 26 | activation="relu"), 27 | K.layers.Flatten(), 28 | K.layers.Dense(units=num_class, activation="softmax") 29 | ]) 30 | model.compile(loss="categorical_crossentropy", optimizer="sgd") 31 | model.fit(X_train, y_train, epochs=8) 32 | 33 | predicts = model.predict(X_test) 34 | predicts = np.argmax(predicts, axis=1) 35 | actual = np.argmax(y_test, axis=1) 36 | print(classification_report(actual, predicts)) 37 | -------------------------------------------------------------------------------- /FN/nn_tutorial/explanation_tf.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | # Weight (row=4 x col=2). 5 | a = tf.Variable(np.random.rand(4, 2)) 6 | 7 | # Bias (row=4 x col=1). 8 | b = tf.Variable(np.random.rand(4, 1)) 9 | 10 | # Input(x) (row=2 x col=1). 11 | x = tf.compat.v1.placeholder(tf.float64, shape=(2, 1)) 12 | 13 | # Output(y) (row=4 x col=1). 14 | y = tf.matmul(a, x) + b 15 | 16 | 17 | with tf.Session() as sess: 18 | # Initialize variable. 19 | init = tf.global_variables_initializer() 20 | sess.run(init) 21 | 22 | # Make input to x. 23 | x_value = np.random.rand(2, 1) 24 | 25 | # Execute culculation. 
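# (Illustrative note) sess.run evaluates the graph node `y`, feeding x_value into
# the placeholder x, so the result is (4, 2) x (2, 1) + (4, 1) = (4, 1).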
26 | y_output = sess.run(y, feed_dict={x: x_value}) 27 | print(y_output.shape) # Will be (4, 1) 28 | -------------------------------------------------------------------------------- /FN/nn_tutorial/explanation_tf_batch.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | # Weight (row=4 x col=2). 5 | a = tf.Variable(np.random.rand(4, 2)) 6 | 7 | # Bias (row=4 x col=1). 8 | b = tf.Variable(np.random.rand(4, 1)) 9 | 10 | # Input(x) (row=2 x col=1). 11 | x = tf.compat.v1.placeholder(tf.float64, shape=(2, 1)) 12 | 13 | # Output(y) (row=4 x col=1). 14 | y = tf.matmul(a, x) + b 15 | 16 | 17 | with tf.Session() as sess: 18 | # Initialize variable. 19 | init = tf.global_variables_initializer() 20 | sess.run(init) 21 | 22 | # Make batch. 23 | batch = [] 24 | for i in range(3): 25 | x_value = np.random.rand(2, 1) 26 | batch.append(x_value) 27 | 28 | # Execute culculation. 29 | y_outputs = [] 30 | for x_value in batch: 31 | y_output = sess.run(y, feed_dict={x: x_value}) 32 | y_outputs.append(y_output) 33 | 34 | y_output = np.array(y_outputs) 35 | print(y_output.shape) # Will be (3, 4, 1) 36 | -------------------------------------------------------------------------------- /FN/nn_tutorial/gradient.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | tf.enable_eager_execution() 5 | tfe = tf.contrib.eager 6 | 7 | 8 | def f(x, a, b): 9 | return tf.add(tf.multiply(x, a), b) 10 | 11 | 12 | def grad(f): 13 | return lambda x, a, b: tfe.gradients_function(f)(x, a, b) 14 | 15 | 16 | x = 2.0 17 | a = 3.0 18 | b = -1.0 19 | 20 | print(f(x, a, b)) 21 | print(grad(f)(x, a, b)) 22 | -------------------------------------------------------------------------------- /FN/policy_gradient_agent.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import argparse 4 | import numpy as np 5 | from sklearn.preprocessing import StandardScaler 6 | from sklearn.externals import joblib 7 | import tensorflow as tf 8 | from tensorflow.python import keras as K 9 | import gym 10 | from fn_framework import FNAgent, Trainer, Observer, Experience 11 | tf.compat.v1.disable_eager_execution() 12 | 13 | 14 | class PolicyGradientAgent(FNAgent): 15 | 16 | def __init__(self, actions): 17 | # PolicyGradientAgent uses self policy (doesn't use epsilon). 
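# ("self policy" here means the agent samples actions directly from its own
#  softmax output, via estimate_probs = True below and FNAgent.policy, so no
#  epsilon-greedy exploration is layered on top.)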
18 | super().__init__(epsilon=0.0, actions=actions) 19 | self.estimate_probs = True 20 | self.scaler = StandardScaler() 21 | self._updater = None 22 | 23 | def save(self, model_path): 24 | super().save(model_path) 25 | joblib.dump(self.scaler, self.scaler_path(model_path)) 26 | 27 | @classmethod 28 | def load(cls, env, model_path): 29 | actions = list(range(env.action_space.n)) 30 | agent = cls(actions) 31 | agent.model = K.models.load_model(model_path) 32 | agent.initialized = True 33 | agent.scaler = joblib.load(agent.scaler_path(model_path)) 34 | return agent 35 | 36 | def scaler_path(self, model_path): 37 | fname, _ = os.path.splitext(model_path) 38 | fname += "_scaler.pkl" 39 | return fname 40 | 41 | def initialize(self, experiences, optimizer): 42 | states = np.vstack([e.s for e in experiences]) 43 | feature_size = states.shape[1] 44 | self.model = K.models.Sequential([ 45 | K.layers.Dense(10, activation="relu", input_shape=(feature_size,)), 46 | K.layers.Dense(10, activation="relu"), 47 | K.layers.Dense(len(self.actions), activation="softmax") 48 | ]) 49 | self.set_updater(optimizer) 50 | self.scaler.fit(states) 51 | self.initialized = True 52 | print("Done initialization. From now, begin training!") 53 | 54 | def set_updater(self, optimizer): 55 | actions = tf.compat.v1.placeholder(shape=(None), dtype="int32") 56 | rewards = tf.compat.v1.placeholder(shape=(None), dtype="float32") 57 | one_hot_actions = tf.one_hot(actions, len(self.actions), axis=1) 58 | action_probs = self.model.output 59 | selected_action_probs = tf.reduce_sum(one_hot_actions * action_probs, 60 | axis=1) 61 | clipped = tf.clip_by_value(selected_action_probs, 1e-10, 1.0) 62 | loss = - tf.math.log(clipped) * rewards 63 | loss = tf.reduce_mean(loss) 64 | 65 | updates = optimizer.get_updates(loss=loss, 66 | params=self.model.trainable_weights) 67 | self._updater = K.backend.function( 68 | inputs=[self.model.input, 69 | actions, rewards], 70 | outputs=[loss], 71 | updates=updates) 72 | 73 | def estimate(self, s): 74 | normalized = self.scaler.transform(s) 75 | action_probs = self.model.predict(normalized)[0] 76 | return action_probs 77 | 78 | def update(self, states, actions, rewards): 79 | normalizeds = self.scaler.transform(states) 80 | actions = np.array(actions) 81 | rewards = np.array(rewards) 82 | self._updater([normalizeds, actions, rewards]) 83 | 84 | 85 | class CartPoleObserver(Observer): 86 | 87 | def transform(self, state): 88 | return np.array(state).reshape((1, -1)) 89 | 90 | 91 | class PolicyGradientTrainer(Trainer): 92 | 93 | def __init__(self, buffer_size=256, batch_size=32, gamma=0.9, 94 | report_interval=10, log_dir=""): 95 | super().__init__(buffer_size, batch_size, gamma, 96 | report_interval, log_dir) 97 | 98 | def train(self, env, episode_count=220, initial_count=-1, render=False): 99 | actions = list(range(env.action_space.n)) 100 | agent = PolicyGradientAgent(actions) 101 | self.train_loop(env, agent, episode_count, initial_count, render) 102 | return agent 103 | 104 | def episode_begin(self, episode, agent): 105 | if agent.initialized: 106 | self.experiences = [] 107 | 108 | def make_batch(self, policy_experiences): 109 | length = min(self.batch_size, len(policy_experiences)) 110 | batch = random.sample(policy_experiences, length) 111 | states = np.vstack([e.s for e in batch]) 112 | actions = [e.a for e in batch] 113 | rewards = [e.r for e in batch] 114 | scaler = StandardScaler() 115 | rewards = np.array(rewards).reshape((-1, 1)) 116 | rewards = scaler.fit_transform(rewards).flatten() 117 | 
return states, actions, rewards 118 | 119 | def episode_end(self, episode, step_count, agent): 120 | rewards = [e.r for e in self.get_recent(step_count)] 121 | self.reward_log.append(sum(rewards)) 122 | 123 | if not agent.initialized: 124 | if len(self.experiences) == self.buffer_size: 125 | optimizer = K.optimizers.Adam(lr=0.01) 126 | agent.initialize(self.experiences, optimizer) 127 | self.training = True 128 | else: 129 | policy_experiences = [] 130 | for t, e in enumerate(self.experiences): 131 | s, a, r, n_s, d = e 132 | d_r = [_r * (self.gamma ** i) for i, _r in 133 | enumerate(rewards[t:])] 134 | d_r = sum(d_r) 135 | d_e = Experience(s, a, d_r, n_s, d) 136 | policy_experiences.append(d_e) 137 | 138 | agent.update(*self.make_batch(policy_experiences)) 139 | 140 | if self.is_event(episode, self.report_interval): 141 | recent_rewards = self.reward_log[-self.report_interval:] 142 | self.logger.describe("reward", recent_rewards, episode=episode) 143 | 144 | 145 | def main(play): 146 | env = CartPoleObserver(gym.make("CartPole-v0")) 147 | trainer = PolicyGradientTrainer() 148 | path = trainer.logger.path_of("policy_gradient_agent.h5") 149 | 150 | if play: 151 | agent = PolicyGradientAgent.load(env, path) 152 | agent.play(env) 153 | else: 154 | trained = trainer.train(env) 155 | trainer.logger.plot("Rewards", trainer.reward_log, 156 | trainer.report_interval) 157 | trained.save(path) 158 | 159 | 160 | if __name__ == "__main__": 161 | parser = argparse.ArgumentParser(description="PG Agent") 162 | parser.add_argument("--play", action="store_true", 163 | help="play with trained model") 164 | 165 | args = parser.parse_args() 166 | main(args.play) 167 | -------------------------------------------------------------------------------- /FN/policy_gradient_continuous_agent.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import random 4 | import numpy as np 5 | from sklearn.preprocessing import StandardScaler 6 | from sklearn.externals import joblib 7 | import tensorflow as tf 8 | from tensorflow.python import keras as K 9 | import gym 10 | from fn_framework import FNAgent, Trainer, Observer 11 | 12 | 13 | class PolicyGradientContinuousAgent(FNAgent): 14 | 15 | def __init__(self, epsilon, low, high): 16 | super().__init__(epsilon, [low, high]) 17 | self.scaler = None 18 | self._updater = None 19 | 20 | def save(self, model_path): 21 | super().save(model_path) 22 | joblib.dump(self.scaler, self.scaler_path(model_path)) 23 | 24 | @classmethod 25 | def load(cls, env, model_path, epsilon=0.0001): 26 | low, high = [env.action_space.low[0], env.action_space.high[0]] 27 | agent = cls(epsilon, low, high) 28 | agent.model = K.models.load_model(model_path, custom_objects={ 29 | "SampleLayer": SampleLayer}) 30 | agent.scaler = joblib.load(agent.scaler_path(model_path)) 31 | return agent 32 | 33 | def scaler_path(self, model_path): 34 | fname, _ = os.path.splitext(model_path) 35 | fname += "_scaler.pkl" 36 | return fname 37 | 38 | def initialize(self, experiences, actor_optimizer, critic_optimizer): 39 | self.scaler = StandardScaler() 40 | states = np.vstack([e.s for e in experiences]) 41 | self.scaler.fit(states) 42 | feature_size = states.shape[1] 43 | 44 | base = K.models.Sequential() 45 | base.add(K.layers.Dense(16, activation="relu", 46 | input_shape=(feature_size,))) 47 | base.add(K.layers.Dense(16, activation="relu")) 48 | base.add(K.layers.Dense(16, activation="relu")) 49 | 50 | # Actor 51 | # define action distribution 52 | mu = 
K.layers.Dense(1, activation="tanh")(base.output) 53 | mu = K.layers.Lambda(lambda m: m * 2)(mu) 54 | #sigma = K.layers.Dense(1, activation="softplus")(base.output) 55 | #self.dist_model = K.Model(inputs=base.input, outputs=[mu, sigma]) 56 | self.dist_model = K.Model(inputs=base.input, outputs=[mu]) 57 | 58 | # sample action from distribution 59 | low, high = self.actions 60 | action = SampleLayer(low, high)((mu)) 61 | self.model = K.Model(inputs=base.input, outputs=[action]) 62 | 63 | # Critic 64 | self.critic = K.models.Sequential([ 65 | K.layers.Dense(32, activation="relu", input_shape=(feature_size + 1,)), 66 | K.layers.Dense(32, activation="relu"), 67 | K.layers.Dense(32, activation="relu"), 68 | K.layers.Dense(1, activation="linear") 69 | ]) 70 | self.set_updater(actor_optimizer) 71 | self.critic.compile(loss="mse", optimizer=critic_optimizer) 72 | self.initialized = True 73 | print("Done initialize. From now, begin training!") 74 | 75 | def set_updater(self, optimizer): 76 | actions = tf.compat.v1.placeholder(shape=(None), dtype="float32") 77 | td_error = tf.compat.v1.placeholder(shape=(None), dtype="float32") 78 | 79 | # Actor loss 80 | mu = self.dist_model.output 81 | action_dist = tf.distributions.Normal(loc=tf.squeeze(mu), 82 | scale=0.1) 83 | action_probs = action_dist.prob(tf.squeeze(actions)) 84 | clipped = tf.clip_by_value(action_probs, 1e-10, 1.0) 85 | loss = - tf.math.log(clipped) * td_error 86 | loss = tf.reduce_mean(loss) 87 | 88 | updates = optimizer.get_updates(loss=loss, 89 | params=self.model.trainable_weights) 90 | self._updater = K.backend.function( 91 | inputs=[self.model.input, 92 | actions, td_error], 93 | outputs=[loss, action_probs, mu], 94 | updates=updates) 95 | 96 | def policy(self, s): 97 | if np.random.random() < self.epsilon or not self.initialized: 98 | low, high = self.actions 99 | return np.random.uniform(low, high) 100 | else: 101 | normalized_s = self.scaler.transform(s) 102 | action = self.model.predict(normalized_s)[0] 103 | return action[0] 104 | 105 | def update(self, batch, gamma): 106 | states = np.vstack([e.s for e in batch]) 107 | normalized_s = self.scaler.transform(states) 108 | actions = np.vstack([e.a for e in batch]) 109 | 110 | # Calculate value 111 | next_states = np.vstack([e.n_s for e in batch]) 112 | normalized_n_s = self.scaler.transform(next_states) 113 | n_s_actions = self.model.predict(normalized_n_s) 114 | feature_n = np.concatenate([normalized_n_s, n_s_actions], axis=1) 115 | n_s_values = self.critic.predict(feature_n) 116 | values = [b.r + gamma * (0 if b.d else 1) * n_s_values 117 | for b, n_s_values in zip(batch, n_s_values)] 118 | values = np.array(values) 119 | 120 | feature = np.concatenate([normalized_s, actions], axis=1) 121 | td_error = values - self.critic.predict(feature) 122 | a_loss, probs, mu = self._updater([normalized_s, actions, td_error]) 123 | c_loss = self.critic.train_on_batch(feature, values) 124 | 125 | """ 126 | print([a_loss, c_loss]) 127 | for x in zip(actions, mu, probs): 128 | print("Took action {}. 
(mu={}, its prob={})".format(*x)) 129 | """ 130 | 131 | 132 | class SampleLayer(K.layers.Layer): 133 | 134 | def __init__(self, low, high, **kwargs): 135 | self.low = low 136 | self.high = high 137 | super(SampleLayer, self).__init__(**kwargs) 138 | 139 | def build(self, input_shape): 140 | super(SampleLayer, self).build(input_shape) 141 | 142 | def call(self, x): 143 | mu = x 144 | actions = tf.distributions.Normal(loc=tf.squeeze(mu), 145 | scale=0.1).sample([1]) 146 | actions = tf.clip_by_value(actions, self.low, self.high) 147 | return tf.reshape(actions, (-1, 1)) 148 | 149 | def compute_output_shape(self, input_shape): 150 | return (input_shape[0], 1) 151 | 152 | def get_config(self): 153 | config = super().get_config() 154 | config["low"] = self.low 155 | config["high"] = self.high 156 | return config 157 | 158 | 159 | class PendulumObserver(Observer): 160 | 161 | def step(self, action): 162 | n_state, reward, done, info = self._env.step([action]) 163 | return self.transform(n_state), reward, done, info 164 | 165 | def transform(self, state): 166 | return np.reshape(state, (1, -1)) 167 | 168 | 169 | class PolicyGradientContinuousTrainer(Trainer): 170 | 171 | def __init__(self, buffer_size=100000, batch_size=32, 172 | gamma=0.99, report_interval=10, log_dir=""): 173 | super().__init__(buffer_size, batch_size, gamma, 174 | report_interval, log_dir) 175 | 176 | def train(self, env, episode_count=220, epsilon=1.0, initial_count=-1, 177 | render=False): 178 | low, high = [env.action_space.low[0], env.action_space.high[0]] 179 | agent = PolicyGradientContinuousAgent(epsilon, low, high) 180 | 181 | self.train_loop(env, agent, episode_count, initial_count, render) 182 | return agent 183 | 184 | def begin_train(self, episode, agent): 185 | actor_optimizer = K.optimizers.Adam(lr=0.001, clipnorm=1.0) 186 | critic_optimizer = K.optimizers.Adam(lr=0.001, clipnorm=1.0) 187 | agent.initialize(self.experiences, actor_optimizer, critic_optimizer) 188 | agent.epsilon = 0.01 189 | 190 | def step(self, episode, step_count, agent, experience): 191 | if self.training: 192 | batch = random.sample(self.experiences, self.batch_size) 193 | agent.update(batch, self.gamma) 194 | 195 | def episode_end(self, episode, step_count, agent): 196 | reward = sum([e.r for e in self.get_recent(step_count)]) 197 | self.reward_log.append(reward) 198 | 199 | if self.is_event(episode, self.report_interval): 200 | recent_rewards = self.reward_log[-self.report_interval:] 201 | self.logger.describe("reward", recent_rewards, episode=episode) 202 | 203 | 204 | def main(play): 205 | env = PendulumObserver(gym.make("Pendulum-v0")) 206 | trainer = PolicyGradientContinuousTrainer() 207 | path = trainer.logger.path_of("policy_gradient_continuous_agent.h5") 208 | 209 | if play: 210 | agent = PolicyGradientContinuousAgent.load(env, path) 211 | agent.play(env) 212 | else: 213 | trained = trainer.train(env, episode_count=1500, render=True) 214 | trainer.logger.plot("Rewards", trainer.reward_log, 215 | trainer.report_interval) 216 | trained.save(path) 217 | 218 | 219 | if __name__ == "__main__": 220 | parser = argparse.ArgumentParser(description="PG Agent Pendulum-v0") 221 | parser.add_argument("--play", action="store_true", 222 | help="play with trained model") 223 | 224 | args = parser.parse_args() 225 | main(args.play) 226 | -------------------------------------------------------------------------------- /FN/value_function_agent.py: -------------------------------------------------------------------------------- 1 | import random 2 | 
import argparse 3 | import numpy as np 4 | from sklearn.neural_network import MLPRegressor 5 | from sklearn.preprocessing import StandardScaler 6 | from sklearn.pipeline import Pipeline 7 | from sklearn.externals import joblib 8 | import gym 9 | from fn_framework import FNAgent, Trainer, Observer 10 | 11 | 12 | class ValueFunctionAgent(FNAgent): 13 | 14 | def save(self, model_path): 15 | joblib.dump(self.model, model_path) 16 | 17 | @classmethod 18 | def load(cls, env, model_path, epsilon=0.0001): 19 | actions = list(range(env.action_space.n)) 20 | agent = cls(epsilon, actions) 21 | agent.model = joblib.load(model_path) 22 | agent.initialized = True 23 | return agent 24 | 25 | def initialize(self, experiences): 26 | scaler = StandardScaler() 27 | estimator = MLPRegressor(hidden_layer_sizes=(10, 10), max_iter=1) 28 | self.model = Pipeline([("scaler", scaler), ("estimator", estimator)]) 29 | 30 | states = np.vstack([e.s for e in experiences]) 31 | self.model.named_steps["scaler"].fit(states) 32 | 33 | # Avoid the predict before fit. 34 | self.update([experiences[0]], gamma=0) 35 | self.initialized = True 36 | print("Done initialization. From now, begin training!") 37 | 38 | def estimate(self, s): 39 | estimated = self.model.predict(s)[0] 40 | return estimated 41 | 42 | def _predict(self, states): 43 | if self.initialized: 44 | predicteds = self.model.predict(states) 45 | else: 46 | size = len(self.actions) * len(states) 47 | predicteds = np.random.uniform(size=size) 48 | predicteds = predicteds.reshape((-1, len(self.actions))) 49 | return predicteds 50 | 51 | def update(self, experiences, gamma): 52 | states = np.vstack([e.s for e in experiences]) 53 | n_states = np.vstack([e.n_s for e in experiences]) 54 | 55 | estimateds = self._predict(states) 56 | future = self._predict(n_states) 57 | 58 | for i, e in enumerate(experiences): 59 | reward = e.r 60 | if not e.d: 61 | reward += gamma * np.max(future[i]) 62 | estimateds[i][e.a] = reward 63 | 64 | estimateds = np.array(estimateds) 65 | states = self.model.named_steps["scaler"].transform(states) 66 | self.model.named_steps["estimator"].partial_fit(states, estimateds) 67 | 68 | 69 | class CartPoleObserver(Observer): 70 | 71 | def transform(self, state): 72 | return np.array(state).reshape((1, -1)) 73 | 74 | 75 | class ValueFunctionTrainer(Trainer): 76 | 77 | def train(self, env, episode_count=220, epsilon=0.1, initial_count=-1, 78 | render=False): 79 | actions = list(range(env.action_space.n)) 80 | agent = ValueFunctionAgent(epsilon, actions) 81 | self.train_loop(env, agent, episode_count, initial_count, render) 82 | return agent 83 | 84 | def begin_train(self, episode, agent): 85 | agent.initialize(self.experiences) 86 | 87 | def step(self, episode, step_count, agent, experience): 88 | if self.training: 89 | batch = random.sample(self.experiences, self.batch_size) 90 | agent.update(batch, self.gamma) 91 | 92 | def episode_end(self, episode, step_count, agent): 93 | rewards = [e.r for e in self.get_recent(step_count)] 94 | self.reward_log.append(sum(rewards)) 95 | 96 | if self.is_event(episode, self.report_interval): 97 | recent_rewards = self.reward_log[-self.report_interval:] 98 | self.logger.describe("reward", recent_rewards, episode=episode) 99 | 100 | 101 | def main(play): 102 | env = CartPoleObserver(gym.make("CartPole-v0")) 103 | trainer = ValueFunctionTrainer() 104 | path = trainer.logger.path_of("value_function_agent.pkl") 105 | 106 | if play: 107 | agent = ValueFunctionAgent.load(env, path) 108 | agent.play(env) 109 | else: 110 | 
trained = trainer.train(env) 111 | trainer.logger.plot("Rewards", trainer.reward_log, 112 | trainer.report_interval) 113 | trained.save(path) 114 | 115 | 116 | if __name__ == "__main__": 117 | parser = argparse.ArgumentParser(description="VF Agent") 118 | parser.add_argument("--play", action="store_true", 119 | help="play with trained model") 120 | 121 | args = parser.parse_args() 122 | main(args.play) 123 | -------------------------------------------------------------------------------- /IM/dagger.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import warnings 4 | import numpy as np 5 | from sklearn.externals import joblib 6 | from sklearn.neural_network import MLPRegressor, MLPClassifier 7 | import gym 8 | from gym.envs.registration import register 9 | register(id="FrozenLakeEasy-v0", entry_point="gym.envs.toy_text:FrozenLakeEnv", 10 | kwargs={"is_slippery": False}) 11 | 12 | 13 | class TeacherAgent(): 14 | 15 | def __init__(self, env, epsilon=0.1): 16 | self.actions = list(range(env.action_space.n)) 17 | self.epsilon = epsilon 18 | self.model = None 19 | 20 | def save(self, model_path): 21 | joblib.dump(self.model, model_path) 22 | 23 | @classmethod 24 | def load(cls, env, model_path, epsilon=0.1): 25 | agent = cls(env, epsilon) 26 | agent.model = joblib.load(model_path) 27 | return agent 28 | 29 | def initialize(self, state): 30 | # Only state => action projection is needed. 31 | self.model = MLPRegressor(hidden_layer_sizes=(), max_iter=1) 32 | # Warmup to use predict method. 33 | dummy_label = [np.random.uniform(size=len(self.actions))] 34 | self.model.partial_fit([state], dummy_label) 35 | return self 36 | 37 | def estimate(self, state): 38 | q = self.model.predict([state])[0] 39 | return q 40 | 41 | def policy(self, state): 42 | if np.random.random() < self.epsilon: 43 | return np.random.randint(len(self.actions)) 44 | else: 45 | return np.argmax(self.estimate(state)) 46 | 47 | @classmethod 48 | def train(cls, env, episode_count=3000, gamma=0.9, 49 | initial_epsilon=1.0, final_epsilon=0.1, report_interval=100): 50 | agent = cls(env, initial_epsilon).initialize(env.reset()) 51 | rewards = [] 52 | decay = (initial_epsilon - final_epsilon) / episode_count 53 | for e in range(episode_count): 54 | s = env.reset() 55 | done = False 56 | goal_reward = 0 57 | while not done: 58 | a = agent.policy(s) 59 | estimated = agent.estimate(s) 60 | 61 | n_state, reward, done, info = env.step(a) 62 | gain = reward + gamma * max(agent.estimate(n_state)) 63 | 64 | estimated[a] = gain 65 | agent.model.partial_fit([s], [estimated]) 66 | s = n_state 67 | else: 68 | goal_reward = reward 69 | 70 | rewards.append(goal_reward) 71 | if e != 0 and e % report_interval == 0: 72 | recent = np.array(rewards[-report_interval:]) 73 | print("At episode {}, reward is {}".format( 74 | e, recent.mean())) 75 | agent.epsilon -= decay 76 | 77 | return agent 78 | 79 | 80 | class FrozenLakeObserver(): 81 | 82 | def __init__(self): 83 | self._env = gym.make("FrozenLakeEasy-v0") 84 | 85 | @property 86 | def action_space(self): 87 | return self._env.action_space 88 | 89 | @property 90 | def observation_space(self): 91 | return self._env.observation_space 92 | 93 | def reset(self): 94 | return self.transform(self._env.reset()) 95 | 96 | def render(self): 97 | self._env.render() 98 | 99 | def step(self, action): 100 | n_state, reward, done, info = self._env.step(action) 101 | return self.transform(n_state), reward, done, info 102 | 103 | def transform(self, 
state): 104 | feature = np.zeros(self.observation_space.n) 105 | feature[state] = 1.0 106 | return feature 107 | 108 | 109 | class Student(): 110 | 111 | def __init__(self, env): 112 | self.actions = list(range(env.action_space.n)) 113 | self.model = None 114 | 115 | def initialize(self, state): 116 | self.model = MLPClassifier(hidden_layer_sizes=(), max_iter=1) 117 | dummy_action = 0 118 | self.model.partial_fit([state], [dummy_action], 119 | classes=self.actions) 120 | return self 121 | 122 | def policy(self, state): 123 | return self.model.predict([state])[0] 124 | 125 | def imitate(self, env, teacher, initial_step=100, train_step=200, 126 | report_interval=10): 127 | states = [] 128 | actions = [] 129 | 130 | # Collect teacher's demonstrations. 131 | for e in range(initial_step): 132 | s = env.reset() 133 | done = False 134 | while not done: 135 | a = teacher.policy(s) 136 | n_state, reward, done, info = env.step(a) 137 | states.append(s) 138 | actions.append(a) 139 | s = n_state 140 | 141 | self.initialize(states[0]) 142 | self.model.partial_fit(states, actions) 143 | 144 | print("Start imitation.") 145 | # Student tries to learn teacher's actions. 146 | step_limit = 20 147 | for e in range(train_step): 148 | s = env.reset() 149 | done = False 150 | rewards = [] 151 | step = 0 152 | while not done and step < step_limit: 153 | a = self.policy(s) 154 | n_state, reward, done, info = env.step(a) 155 | states.append(s) 156 | actions.append(teacher.policy(s)) 157 | s = n_state 158 | step += 1 159 | else: 160 | goal_reward = reward 161 | 162 | rewards.append(goal_reward) 163 | if e != 0 and e % report_interval == 0: 164 | recent = np.array(rewards[-report_interval:]) 165 | print("At episode {}, reward is {}".format( 166 | e, recent.mean())) 167 | 168 | with warnings.catch_warnings(): 169 | # It will be fixed in latest scikit-learn. 
170 | # https://github.com/scikit-learn/scikit-learn/issues/10449 171 | warnings.filterwarnings("ignore", category=DeprecationWarning) 172 | self.model.partial_fit(states, actions) 173 | 174 | 175 | def main(teacher): 176 | env = FrozenLakeObserver() 177 | path = os.path.join(os.path.dirname(__file__), "imitation_teacher.pkl") 178 | 179 | if teacher: 180 | agent = TeacherAgent.train(env) 181 | agent.save(path) 182 | else: 183 | teacher_agent = TeacherAgent.load(env, path) 184 | student = Student(env) 185 | student.imitate(env, teacher_agent) 186 | 187 | 188 | if __name__ == "__main__": 189 | parser = argparse.ArgumentParser(description="Imitation Learning") 190 | parser.add_argument("--teacher", action="store_true", 191 | help="train teacher model") 192 | 193 | args = parser.parse_args() 194 | main(args.teacher) 195 | -------------------------------------------------------------------------------- /IRL/backups/environment.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | import numpy as np 3 | 4 | 5 | class Direction(Enum): 6 | UP = 1 7 | DOWN = -1 8 | LEFT = 2 9 | RIGHT = -2 10 | 11 | 12 | class State(): 13 | 14 | def __init__(self, row=-1, column=-1): 15 | self.row = row 16 | self.column = column 17 | 18 | def index(self, n_row): 19 | return self.row * n_row + self.column 20 | 21 | def __repr__(self): 22 | return "".format(self.row, self.column) 23 | 24 | def clone(self): 25 | return State(self.row, self.column) 26 | 27 | def __hash__(self): 28 | return hash((self.row, self.column)) 29 | 30 | def __eq__(self, other): 31 | return self.row == other.row and self.column == other.column 32 | 33 | 34 | class Environment(): 35 | 36 | def __init__(self, grid, move_prob=0.8): 37 | # Grid is 2d-array, and each value treated as attribute. 38 | # attribute is 39 | # 0: ordinary cell 40 | # -1: damage cell (game end) 41 | # 1: reward cell (game end) 42 | # 9: block cell (can't locate agent) 43 | self.grid = grid 44 | self.agent_state = State() 45 | 46 | # Default reward is minus like poison swamp. 47 | # It means agent have to reach the goal fast! 48 | self.default_reward = -0.04 49 | 50 | # Agent can move to decided direction in move_prob. 51 | # It means agent will move different direction in (1 - move_prob). 
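# For example, with the default move_prob=0.8 the chosen direction is taken with
# probability 0.8, each perpendicular direction with 0.1, and the opposite
# direction never (see transit_func below).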
52 | self.move_prob = move_prob 53 | self.reset() 54 | 55 | @property 56 | def row_length(self): 57 | return len(self.grid) 58 | 59 | @property 60 | def column_length(self): 61 | return len(self.grid[0]) 62 | 63 | @property 64 | def action_space(self): 65 | return [Direction.UP, Direction.DOWN, 66 | Direction.LEFT, Direction.RIGHT] 67 | 68 | @property 69 | def states(self): 70 | states = [] 71 | for row in range(self.row_length): 72 | for column in range(self.column_length): 73 | # Avoid the Block Cell 74 | if self.grid[row][column] != 9: 75 | states.append(State(row, column)) 76 | return states 77 | 78 | def reset(self): 79 | # Locate agent at lower left corner 80 | self.agent_state = State(self.row_length - 1, 0) 81 | return self.agent_state 82 | 83 | def step(self, action): 84 | next_state, reward, done = self.transit(self.agent_state, action) 85 | if next_state is not None: 86 | self.agent_state = next_state 87 | 88 | return next_state, reward, done 89 | 90 | def transit(self, state, action): 91 | transition_probs = self.transit_func(state, action) 92 | if len(transition_probs) == 0: 93 | return None, None, True 94 | 95 | next_states = [] 96 | probs = [] 97 | for s in transition_probs: 98 | next_states.append(s) 99 | probs.append(transition_probs[s]) 100 | 101 | next_state = np.random.choice(next_states, p=probs) 102 | reward, done = self.reward_func(next_state) 103 | return next_state, reward, done 104 | 105 | def transit_func(self, state, action): 106 | transition_probs = {} 107 | if not self.can_action_at(state): 108 | # Already on the terminal cell 109 | return transition_probs 110 | 111 | actions = self.action_space 112 | opposite_direction = Direction(action.value * -1) 113 | 114 | for a in actions: 115 | prob = 0 116 | if a == action: 117 | prob = self.move_prob 118 | elif a != opposite_direction: 119 | prob = (1 - self.move_prob) / 2 120 | 121 | next_state = self._move(state, a) 122 | if next_state not in transition_probs: 123 | transition_probs[next_state] = prob 124 | else: 125 | transition_probs[next_state] += prob 126 | 127 | return transition_probs 128 | 129 | def can_action_at(self, state): 130 | if self.grid[state.row][state.column] == 0: 131 | return True 132 | else: 133 | return False 134 | 135 | def _move(self, state, action): 136 | if not self.can_action_at(state): 137 | raise Exception("Can't move from here!") 138 | 139 | next_state = state.clone() 140 | 141 | # Move state by action 142 | if action == Direction.UP: 143 | next_state.row -= 1 144 | elif action == Direction.DOWN: 145 | next_state.row += 1 146 | elif action == Direction.LEFT: 147 | next_state.column -= 1 148 | elif action == Direction.RIGHT: 149 | next_state.column += 1 150 | 151 | # Check the out of grid 152 | if not (0 <= next_state.row < self.row_length): 153 | next_state = state 154 | if not (0 <= next_state.column < self.column_length): 155 | next_state = state 156 | 157 | # Check the Agent bumped the block 158 | if self.grid[next_state.row][next_state.column] == 9: 159 | next_state = state 160 | 161 | return next_state 162 | 163 | def reward_func(self, state): 164 | reward = self.default_reward 165 | done = False 166 | 167 | # Check the attribute of next state 168 | attribute = self.grid[state.row][state.column] 169 | if attribute == 1: 170 | # Get treasure! and game ends. 171 | reward = 1 172 | done = True 173 | elif attribute == -1: 174 | # Go to hell! and the game ends. 
175 | reward = -1 176 | done = True 177 | 178 | return reward, done 179 | -------------------------------------------------------------------------------- /IRL/backups/irl_from_traj.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import numpy as np 4 | from collections import defaultdict 5 | from sklearn.externals import joblib 6 | from sklearn.neural_network import MLPRegressor 7 | import tensorflow as tf 8 | import tensorflow.contrib.eager as tfe 9 | from tensorflow.python import keras as K 10 | import gym 11 | from gym.envs.registration import register 12 | register(id="FrozenLakeEasy-v0", entry_point="gym.envs.toy_text:FrozenLakeEnv", 13 | kwargs={"is_slippery": False}) 14 | 15 | 16 | tfe.enable_eager_execution() 17 | 18 | 19 | class TeacherAgent(): 20 | 21 | def __init__(self, env, epsilon=0.1): 22 | self.actions = list(range(env.action_space.n)) 23 | self.num_states = env.observation_space.n 24 | self.epsilon = epsilon 25 | self.model = None 26 | 27 | def save(self, model_path): 28 | joblib.dump(self.model, model_path) 29 | 30 | @classmethod 31 | def load(cls, env, model_path, epsilon=0.1): 32 | agent = cls(env, epsilon) 33 | agent.model = joblib.load(model_path) 34 | return agent 35 | 36 | def initialize(self, state): 37 | # Only state => action projection is needed 38 | self.model = MLPRegressor(hidden_layer_sizes=(), max_iter=1) 39 | # Warmup to use predict method 40 | dummy_label = [np.random.uniform(size=len(self.actions))] 41 | self.model.partial_fit(np.array([self.transform(state)]), 42 | np.array(dummy_label)) 43 | return self 44 | 45 | def estimate(self, state): 46 | feature = self.transform(state) 47 | q = self.model.predict([feature])[0] 48 | return q 49 | 50 | def policy(self, state): 51 | if np.random.random() < self.epsilon: 52 | return np.random.randint(len(self.actions)) 53 | else: 54 | return np.argmax(self.estimate(state)) 55 | 56 | def transform(self, state): 57 | feature = np.zeros(self.num_states) 58 | feature[state] = 1.0 59 | return feature 60 | 61 | @classmethod 62 | def train(cls, env, episode_count=3000, gamma=0.9, 63 | initial_epsilon=1.0, final_epsilon=0.1, report_interval=100): 64 | agent = cls(env, initial_epsilon).initialize(env.reset()) 65 | rewards = [] 66 | decay = (initial_epsilon - final_epsilon) / episode_count 67 | for e in range(episode_count): 68 | s = env.reset() 69 | done = False 70 | goal_reward = 0 71 | while not done: 72 | a = agent.policy(s) 73 | estimated = agent.estimate(s) 74 | 75 | n_state, reward, done, info = env.step(a) 76 | 77 | gain = reward + gamma * max(agent.estimate(n_state)) 78 | estimated[a] = gain 79 | agent.model.partial_fit([agent.transform(s)], [estimated]) 80 | s = n_state 81 | else: 82 | goal_reward = reward 83 | 84 | rewards.append(goal_reward) 85 | if e != 0 and e % report_interval == 0: 86 | recent = np.array(rewards[-report_interval:]) 87 | print("At episode {}, reward is {}".format( 88 | e, recent.mean())) 89 | agent.epsilon -= decay 90 | 91 | return agent 92 | 93 | 94 | class IRL(): 95 | 96 | def __init__(self, env): 97 | self.actions = list(range(env.action_space.n)) 98 | self.num_states = env.observation_space.n 99 | self.rewards = tfe.Variable(tf.random_uniform( 100 | [env.observation_space.n]), 101 | name="rewards") 102 | """ 103 | self.rewards = tfe.Variable(initial_value=[0.0, 0.0, 0.0, 0.0, 104 | 0.0, 0.0, 0.0, 0.0, 105 | 0.0, 0.0, 0.0, 0.0, 106 | 0.0, 0.0, 0.0, 1.0,], 107 | name="rewards") 108 | """ 109 | self._updater = 
tfe.implicit_gradients(self.loss) 110 | 111 | """ 112 | def value_estimate(self, steps, gamma): 113 | values = {} 114 | counts = {} 115 | for i, t in enumerate(steps): 116 | rewards = [self.rewards[s] for s in t] 117 | for j, s in enumerate(t): 118 | discounteds = [r * (gamma ** k) 119 | for k, r in enumerate(rewards[j:])] 120 | discounted = tf.reduce_sum(discounteds) 121 | if s not in values: 122 | values[s] = discounted 123 | counts[s] = 0.0 124 | 125 | counts[s] += 1 126 | values[s] = tf.add(values[s], tf.divide( 127 | tf.subtract(discounted, values[s]), 128 | counts[s])) 129 | 130 | value_tensors = [] 131 | total_count = sum([counts[s] for s in counts]) 132 | for i in range(self.rewards.shape[0].value): 133 | if i in values: 134 | visit = counts[i] / total_count 135 | value = tf.multiply(values[i], visit) 136 | else: 137 | value = tf.constant(0.0) 138 | value_tensors.append(value) 139 | values = tf.stack(value_tensors) 140 | return values 141 | """ 142 | 143 | def value_estimate(self, trajectory, gamma): 144 | values = {} 145 | one_host_trajectory = tf.one_hot(trajectory, self.num_states) 146 | rewards = tf.reduce_sum(one_host_trajectory * self.rewards, axis=1) 147 | for i, r in enumerate(rewards): 148 | future = [_r * (gamma ** (k + 1)) 149 | for k, _r in enumerate(rewards[(i + 1):])] 150 | reward = r + tf.reduce_sum(future) 151 | s = trajectory[i] 152 | values[s] = reward 153 | 154 | value_tensors = [] 155 | for i in range(self.num_states): 156 | if i in values: 157 | value = values[i] 158 | else: 159 | value = tf.constant(0.0) 160 | value_tensors.append(value) 161 | values = tf.stack(value_tensors) 162 | return values 163 | 164 | def get_rewards(self): 165 | return self.rewards.numpy() 166 | 167 | def loss(self, teacher_steps, steps, gamma): 168 | teacher_values = tf.stack([self.value_estimate(t, gamma) for t in teacher_steps]) 169 | values = tf.stack([self.value_estimate(t, gamma) for t in steps]) 170 | best = tf.reduce_mean(teacher_values, axis=0) 171 | diff = tf.reduce_min(best - values, axis=0) 172 | #print(">>>>>>>>") 173 | #print(tf.reshape(best, (4, 4))) 174 | #print(tf.reshape(tf.reduce_mean(values, axis=0), (4, 4))) 175 | 176 | loss = tf.reduce_sum(tf.boolean_mask(diff, diff > 0)) 177 | penalty = -2 * tf.reduce_sum(tf.boolean_mask(diff, diff < 0)) 178 | loss += penalty 179 | 180 | #_loss = _loss + 1.5 * tf.reduce_sum(tf.abs(self.rewards)) 181 | return loss 182 | 183 | def update(self, optimizer, teacher_steps, steps, gamma): 184 | loss = self.loss(teacher_steps, steps, gamma) 185 | optimizer.apply_gradients(self._updater(teacher_steps, steps, gamma)) 186 | return loss, self.get_rewards() 187 | 188 | def take_action(self, Q, state, actions, epsilon=0.1): 189 | rand_action = np.random.randint(len(actions)) 190 | if np.random.random() < epsilon: 191 | return rand_action 192 | elif state in Q and sum(Q[state]) != 0: 193 | return np.argmax(Q[state]) 194 | else: 195 | return rand_action 196 | 197 | def estimate(self, env, teacher, episode_count=3000, 198 | teacher_demo_size=256, batch_size=32, 199 | learning_rate=1e-3, max_step=10, 200 | gamma=0.9, report_interval=10): 201 | 202 | # Accumulate teacher's demonstration 203 | demos = [] 204 | for e in range(teacher_demo_size): 205 | s = env.reset() 206 | done = False 207 | trajectory = [s] 208 | while not done: 209 | a = teacher.policy(s) 210 | n_state, reward, done, info = env.step(a) 211 | s = n_state 212 | trajectory.append(s) 213 | demos.append(trajectory) 214 | 215 | print("Start reward estimation.") 216 | actions = 
list(range(env.action_space.n)) 217 | rewards = np.zeros((env.observation_space.n)) 218 | Q = defaultdict(lambda: [0] * len(actions)) 219 | optimizer = tf.train.AdamOptimizer(learning_rate) 220 | 221 | for e in range(episode_count): 222 | batch = [] 223 | total_reward = 0 224 | for b in range(batch_size): 225 | s = env.reset() 226 | done = False 227 | trajectory = [s] 228 | step = 0 229 | epsilon = 1.0 230 | while not done and step < max_step: 231 | a = self.take_action(Q, s, actions, epsilon) 232 | n_state, reward, done, info = env.step(a) 233 | 234 | estimated = Q[s][a] 235 | gain = rewards[n_state] + gamma * max(Q[n_state]) 236 | Q[s][a] += learning_rate * (gain - estimated) 237 | s = n_state 238 | trajectory.append(s) 239 | step += 1 240 | epsilon = epsilon * ((batch_size - b) / batch_size) 241 | else: 242 | total_reward += reward 243 | batch.append(trajectory) 244 | 245 | teacher_batch = np.random.choice(demos, size=batch_size) 246 | loss, new_rewards = self.update(optimizer, 247 | teacher_batch, batch, gamma) 248 | 249 | rewards = new_rewards 250 | 251 | if e % 10 == 0: 252 | print("At episode {}, reward={}, loss={}".format( 253 | e, total_reward, loss)) 254 | print("Reward") 255 | print(new_rewards.reshape(4, 4)) 256 | 257 | 258 | def main(train): 259 | env = gym.make("FrozenLakeEasy-v0") 260 | path = os.path.join(os.path.dirname(__file__), "irl_teacher.pkl") 261 | 262 | if train: 263 | agent = TeacherAgent.train(env) 264 | agent.save(path) 265 | else: 266 | teacher = TeacherAgent.load(env, path) 267 | irl = IRL(env) 268 | irl.estimate(env, teacher) 269 | 270 | 271 | if __name__ == "__main__": 272 | parser = argparse.ArgumentParser(description="Imitation Learning") 273 | parser.add_argument("--train", action="store_true", 274 | help="train teacher model") 275 | 276 | args = parser.parse_args() 277 | main(args.train) 278 | -------------------------------------------------------------------------------- /IRL/backups/linear.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from tensorflow.python import keras as K 4 | import tensorflow as tf 5 | from environment import Environment 6 | from planner import PolicyIterationPlanner 7 | import visualizer as viz 8 | 9 | 10 | class LinerIRL(): 11 | 12 | def __init__(self): 13 | self._updater = None 14 | self.rewards = None 15 | 16 | def initialize(self, num_states, num_actions, optimizer, C=1.0, r_max=2): 17 | # Variables 18 | best_trans_probs = tf.compat.v1.placeholder( 19 | tf.float32, 20 | shape=(num_states, num_states)) 21 | other_trans_probss = tf.compat.v1.placeholder( 22 | tf.float32, 23 | shape=(num_states, 24 | num_actions - 1, 25 | num_states)) 26 | gamma = tf.compat.v1.placeholder(tf.float32, shape=()) 27 | rewards = tf.Variable(tf.random_normal([num_states], mean=r_max/2), 28 | name="rewards") 29 | 30 | _indices = tf.constant([0] * num_states) 31 | _min_losses = tf.constant([1e+10] * num_states) 32 | eye = tf.eye(num_states) 33 | 34 | condition = lambda s, i, loss: tf.less(i, other_trans_probss.shape[1]) # noqa 35 | 36 | def process(s, i, loss): 37 | best_trans_prob = best_trans_probs[s] 38 | other_trans_prob = other_trans_probss[s][i] 39 | 40 | f_left = tf.reshape((best_trans_prob - other_trans_prob), (1, -1)) 41 | f_right = tf.matrix_inverse(eye - gamma * best_trans_prob) 42 | 43 | # Limit the rewards of other actions smaller than best's one. 
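                # Note: the clip on the next line only bounds each reward estimate
                # to [-r_max, r_max]; the preference for the expert's (best) action
                # is enforced by the loss terms computed from `formula` below,
                # not by this clipping.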
44 | R = tf.reshape(tf.clip_by_value(rewards, -r_max, r_max), (-1, 1)) 45 | 46 | formula = K.backend.dot(K.backend.dot(f_left, f_right), R) 47 | 48 | # Formula should be positive 49 | _loss = tf.abs(tf.squeeze(tf.nn.leaky_relu(formula))) 50 | loss = tf.reduce_min([loss, _loss]) 51 | i = tf.add(i, 1) 52 | return s, i, loss 53 | 54 | total_loss = tf.constant(0.0) 55 | for s in range(num_states): 56 | _, _, min_loss = tf.while_loop(condition, process, 57 | [s, _indices[s], _min_losses[s]]) 58 | total_loss = tf.add(total_loss, min_loss) 59 | 60 | total_loss -= C * tf.reduce_sum(tf.abs(rewards)) # L1 regularization 61 | total_loss = -total_loss # Maximize to Minimize 62 | 63 | # Get gradients 64 | updates = optimizer.get_updates(loss=total_loss, params=[rewards]) 65 | self._updater = K.backend.function( 66 | inputs=[best_trans_probs, 67 | other_trans_probss, 68 | gamma], 69 | outputs=[total_loss, rewards], 70 | updates=updates) 71 | 72 | def to_trans_prob(self, env, probs): 73 | states = env.states 74 | mx = np.zeros(len(states)) 75 | for s in states: 76 | if s in probs: 77 | mx[s.index(env.row_length)] = probs[s] 78 | return mx 79 | 80 | def estimate(self, env, teacher, episode_count=6000, learning_rate=1e-3, 81 | gamma=0.9, report_interval=100): 82 | optimizer = K.optimizers.Adam(learning_rate) 83 | num_actions = len(env.action_space) 84 | num_states = len(env.states) 85 | self.initialize(num_states, num_actions, optimizer) 86 | loss_history = [] 87 | for e in range(episode_count): 88 | best_trans_probs = [] 89 | other_trans_probss = [] 90 | for s in env.states: 91 | actions = teacher.policy[s] 92 | best_action = max(actions, key=actions.get) 93 | best_trans_prob = np.zeros(num_states) 94 | other_trans_probs = [] 95 | for a in env.action_space: 96 | probs = env.transit_func(s, a) 97 | if len(probs) == 0: 98 | continue 99 | if a == best_action: 100 | best_trans_prob = self.to_trans_prob(env, probs) 101 | else: 102 | other_trans_probs.append( 103 | self.to_trans_prob(env, probs) 104 | ) 105 | if len(other_trans_probs) == 0: 106 | other_trans_probs = [np.zeros(num_states)] * (num_actions - 1) 107 | 108 | other_trans_probs = np.array(other_trans_probs) 109 | 110 | best_trans_probs.append(best_trans_prob) 111 | other_trans_probss.append(other_trans_probs) 112 | 113 | best_trans_probs = np.array(best_trans_probs) 114 | other_trans_probss = np.array(other_trans_probss) 115 | 116 | loss, self.rewards = self._updater([best_trans_probs, 117 | other_trans_probss, 118 | gamma]) 119 | loss_history.append(loss) 120 | if e != 0 and e % report_interval == 0: 121 | viz.describe(e, "loss", loss_history, report_interval) 122 | 123 | return loss_history 124 | 125 | 126 | def main(): 127 | grid = [ 128 | [0, 0, 0, 1], 129 | [0, 0, 0, 0], 130 | [0, 0, 0, 0], 131 | [0, 0, 0, 0] 132 | ] 133 | # Prepare Teacher 134 | env = Environment(grid) 135 | planner = PolicyIterationPlanner(env) 136 | planner.plan() 137 | 138 | # Execute IRL 139 | irl = LinerIRL() 140 | irl.estimate(env, planner) 141 | print(irl.rewards) 142 | 143 | # Plot Reward Map 144 | ncol = env.column_length 145 | nrow = env.row_length 146 | import matplotlib.pyplot as plt 147 | import matplotlib.cm as cm 148 | fig, ax = plt.subplots() 149 | reward_map = irl.rewards.reshape((nrow, ncol)) 150 | ax.imshow(reward_map, cmap=cm.RdYlGn) 151 | ax.set_xticks(np.arange(ncol)) 152 | ax.set_yticks(np.arange(nrow)) 153 | fig.tight_layout() 154 | plt.show() 155 | 156 | 157 | if __name__ == "__main__": 158 | main() 159 | 
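The `linear.py` backup above builds a TensorFlow graph for a variant of linear IRL. As a rough reference, here is a minimal NumPy sketch of the underlying objective (the Ng & Russell, 2000 linear-programming formulation). `P_best` and `P_others` are hypothetical inputs, and this is an illustration rather than code from the repository:

```python
import numpy as np

def linear_irl_objective(R, P_best, P_others, gamma=0.9, l1_penalty=1.0):
    """Score a candidate reward vector R (one entry per state).

    P_best:   (S, S) transition matrix when the expert's action is taken.
    P_others: (A-1, S, S) transition matrices for the non-expert actions.
    """
    S = len(R)
    inv = np.linalg.inv(np.eye(S) - gamma * P_best)
    # Per-state margin of the expert's action over each alternative action.
    margins = np.stack([(P_best - P_a) @ inv @ R for P_a in P_others])
    objective = margins.min(axis=0).sum()       # keep the expert's action optimal
    objective -= l1_penalty * np.abs(R).sum()   # L1 term favors sparse rewards
    return objective


# Hypothetical usage: score a reward vector on a 2-state, 2-action MDP.
P_best = np.array([[0.9, 0.1], [0.1, 0.9]])
P_others = np.array([[[0.5, 0.5], [0.5, 0.5]]])
print(linear_irl_objective(np.array([0.0, 1.0]), P_best, P_others))
```

The `min` over alternative actions favors reward vectors under which the expert's action stays optimal in every state, and the L1 term keeps the recovered rewards sparse.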
-------------------------------------------------------------------------------- /IRL/backups/planner.py: -------------------------------------------------------------------------------- 1 | class Planner(): 2 | 3 | def __init__(self, env): 4 | self.env = env 5 | self.log = [] 6 | 7 | def initialize(self): 8 | self.env.reset() 9 | self.log = [] 10 | 11 | def transitions_at(self, state, action): 12 | transition_probs = self.env.transit_func(state, action) 13 | for next_state in transition_probs: 14 | prob = transition_probs[next_state] 15 | reward, _ = self.env.reward_func(next_state) 16 | yield prob, next_state, reward 17 | 18 | def plan(self, gamma=0.9, threshold=0.0001): 19 | raise Exception("Planner have to implements plan method.") 20 | 21 | def dict_to_grid(self, state_reward_dict): 22 | grid = [] 23 | for i in range(self.env.row_length): 24 | row = [0] * self.env.column_length 25 | grid.append(row) 26 | for s in state_reward_dict: 27 | grid[s.row][s.column] = state_reward_dict[s] 28 | 29 | return grid 30 | 31 | 32 | class ValueIterationPlanner(Planner): 33 | 34 | def __init__(self, env): 35 | super().__init__(env) 36 | 37 | def plan(self, gamma=0.9, threshold=0.0001): 38 | self.initialize() 39 | actions = self.env.action_space 40 | V = {} 41 | for s in self.env.states: 42 | # Initialize each state's expected reward 43 | V[s] = 0 44 | 45 | while True: 46 | delta = 0 47 | self.log.append(self.dict_to_grid(V)) 48 | for s in V: 49 | if not self.env.can_action_at(s): 50 | continue 51 | expected_rewards = [] 52 | for a in actions: 53 | r = 0 54 | for prob, next_state, reward in self.transitions_at(s, a): 55 | r += prob * (reward + gamma * V[next_state]) 56 | expected_rewards.append(r) 57 | max_reward = max(expected_rewards) 58 | delta = max(delta, abs(max_reward - V[s])) 59 | V[s] = max_reward 60 | 61 | if delta < threshold: 62 | break 63 | 64 | # Turn dictionary to grid 65 | V_grid = self.dict_to_grid(V) 66 | return V_grid 67 | 68 | 69 | class PolicyIterationPlanner(Planner): 70 | 71 | def __init__(self, env): 72 | super().__init__(env) 73 | self.policy = {} 74 | 75 | def initialize(self): 76 | super().initialize() 77 | self.policy = {} 78 | actions = self.env.action_space 79 | states = self.env.states 80 | for s in states: 81 | self.policy[s] = {} 82 | for a in actions: 83 | # Initialize policy. First each action is taken uniformly. 
84 | self.policy[s][a] = 1 / len(actions) 85 | 86 | def estimate_by_policy(self, gamma, threshold): 87 | V = {} 88 | for s in self.env.states: 89 | # Initialize each state's expected reward 90 | V[s] = 0 91 | 92 | while True: 93 | delta = 0 94 | for s in V: 95 | expected_rewards = [] 96 | for a in self.policy[s]: 97 | action_prob = self.policy[s][a] 98 | r = 0 99 | for prob, next_state, reward in self.transitions_at(s, a): 100 | r += action_prob * prob * \ 101 | (reward + gamma * V[next_state]) 102 | expected_rewards.append(r) 103 | max_reward = max(expected_rewards) 104 | delta = max(delta, abs(max_reward - V[s])) 105 | V[s] = max_reward 106 | if delta < threshold: 107 | break 108 | 109 | return V 110 | 111 | def plan(self, gamma=0.9, threshold=0.0001): 112 | self.initialize() 113 | states = self.env.states 114 | actions = self.env.action_space 115 | 116 | def take_max_action(action_value_dict): 117 | return max(action_value_dict, key=action_value_dict.get) 118 | 119 | while True: 120 | update_stable = True 121 | # Estimate expected rewards under current policy 122 | V = self.estimate_by_policy(gamma, threshold) 123 | self.log.append(self.dict_to_grid(V)) 124 | 125 | for s in states: 126 | # Get action following to the policy (choose max prob's action) 127 | policy_action = take_max_action(self.policy[s]) 128 | 129 | # Compare with other actions 130 | action_rewards = {} 131 | for a in actions: 132 | r = 0 133 | for prob, next_state, reward in self.transitions_at(s, a): 134 | r += prob * (reward + gamma * V[next_state]) 135 | action_rewards[a] = r 136 | best_action = take_max_action(action_rewards) 137 | if policy_action != best_action: 138 | update_stable = False 139 | 140 | # Update policy (set best_action prob=1, otherwise=0 (greedy)) 141 | for a in self.policy[s]: 142 | prob = 1 if a == best_action else 0 143 | self.policy[s][a] = prob 144 | 145 | if update_stable: 146 | # If policy isn't updated, stop iteration 147 | break 148 | 149 | # Turn dictionary to grid 150 | V_grid = self.dict_to_grid(V) 151 | return V_grid 152 | -------------------------------------------------------------------------------- /IRL/backups/visualizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import matplotlib.cm as cm 4 | 5 | 6 | def describe(episode, name, values, interval=10, round_count=-1): 7 | mean = np.mean(values[-interval:]) 8 | std = np.std(values[-interval:]) 9 | if round_count > 0: 10 | mean = np.round(mean, round_count) 11 | std = np.round(std, round_count) 12 | desc = "{} is {} (+/-{})".format(name, mean, std) 13 | print("At episode {}, {}".format(episode, desc)) 14 | 15 | 16 | def plot_values(name, values, interval=10): 17 | indices = list(range(0, len(values), interval)) 18 | means = [] 19 | stds = [] 20 | for i in indices: 21 | _values = values[i:(i + interval)] 22 | means.append(np.mean(_values)) 23 | stds.append(np.std(_values)) 24 | means = np.array(means) 25 | stds = np.array(stds) 26 | plt.figure() 27 | plt.title("{} History".format(name)) 28 | plt.grid() 29 | plt.fill_between(indices, means - stds, means + stds, 30 | alpha=0.1, color="g") 31 | plt.plot(indices, means, "o-", color="g", 32 | label="{} per {} episode".format(name.lower(), interval)) 33 | plt.legend(loc="best") 34 | plt.show() 35 | 36 | 37 | def plot_grid_rewards(env, Q): 38 | """ 39 | Show Q-values for FrozenLake-v0. 40 | To show each action's evaluation, 41 | a state is shown as 3 x 3 matrix like following. 
42 | XoX Up, 43 | oco Left, Center(set mean value), Right 44 | XoX Down 45 | actions are located on 3 x 3 grid. 46 | """ 47 | nrow = env.unwrapped.nrow 48 | ncol = env.unwrapped.ncol 49 | state_size = 3 50 | q_nrow = nrow * state_size 51 | q_ncol = ncol * state_size 52 | reward_map = np.zeros((q_nrow, q_ncol)) 53 | 54 | for r in range(nrow): 55 | for c in range(ncol): 56 | s = r * nrow + c 57 | state_exist = False 58 | if isinstance(Q, dict) and s in Q: 59 | state_exist = True 60 | elif isinstance(Q, (np.ndarray, np.generic)) and s < Q.shape[0]: 61 | state_exist = True 62 | 63 | if state_exist: 64 | # In the display map, vertical index reverse. 65 | _r = 1 + (nrow - 1 - r) * state_size 66 | _c = 1 + c * state_size 67 | reward_map[_r][_c - 1] = Q[s][0] # LEFT = 0 68 | reward_map[_r - 1][_c] = Q[s][1] # DOWN = 1 69 | reward_map[_r][_c + 1] = Q[s][2] # RIGHT = 2 70 | reward_map[_r + 1][_c] = Q[s][3] # UP = 3 71 | # Center 72 | reward_map[_r][_c] = np.mean(Q[s]) 73 | 74 | fig = plt.figure() 75 | ax = fig.add_subplot(1, 1, 1) 76 | plt.imshow(reward_map, cmap=cm.RdYlGn, interpolation="bilinear", 77 | vmax=abs(reward_map).max(), vmin=-abs(reward_map).max()) 78 | ax.set_xlim(-0.5, q_ncol - 0.5) 79 | ax.set_ylim(-0.5, q_nrow - 0.5) 80 | ax.set_xticks(np.arange(-0.5, q_ncol, state_size)) 81 | ax.set_yticks(np.arange(-0.5, q_nrow, state_size)) 82 | ax.set_xticklabels(range(ncol + 1)) 83 | ax.set_yticklabels(range(nrow + 1)) 84 | ax.grid(which="both") 85 | plt.show() 86 | -------------------------------------------------------------------------------- /IRL/bayesian.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.stats 3 | from scipy.special import logsumexp 4 | from planner import PolicyIterationPlanner 5 | from tqdm import tqdm 6 | 7 | 8 | class BayesianIRL(): 9 | 10 | def __init__(self, env, eta=0.8, prior_mean=0.0, prior_scale=0.5): 11 | self.env = env 12 | self.planner = PolicyIterationPlanner(env) 13 | self.eta = eta 14 | self._mean = prior_mean 15 | self._scale = prior_scale 16 | self.prior_dist = scipy.stats.norm(loc=prior_mean, 17 | scale=prior_scale) 18 | 19 | def estimate(self, trajectories, epoch=50, gamma=0.3, 20 | learning_rate=0.1, sigma=0.05, sample_size=20): 21 | num_states = len(self.env.states) 22 | reward = np.random.normal(size=num_states, 23 | loc=self._mean, scale=self._scale) 24 | 25 | def get_q(r, g): 26 | self.planner.reward_func = lambda s: r[s] 27 | V = self.planner.plan(g) 28 | Q = self.planner.policy_to_q(V, gamma) 29 | return Q 30 | 31 | for i in range(epoch): 32 | noises = np.random.randn(sample_size, num_states) 33 | scores = [] 34 | for n in tqdm(noises): 35 | _reward = reward + sigma * n 36 | Q = get_q(_reward, gamma) 37 | 38 | # Calculate prior (sum of log prob). 39 | reward_prior = np.sum(self.prior_dist.logpdf(_r) 40 | for _r in _reward) 41 | 42 | # Calculate likelihood. 43 | likelihood = self.calculate_likelihood(trajectories, Q) 44 | # Calculate posterior. 
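                # Both terms are log values, so their sum on the next line is the
                # unnormalized log-posterior: log P(R|D) = log P(D|R) + log P(R) + const.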
45 | posterior = likelihood + reward_prior 46 | scores.append(posterior) 47 | 48 | rate = learning_rate / (sample_size * sigma) 49 | scores = np.array(scores) 50 | normalized_scores = (scores - scores.mean()) / scores.std() 51 | noise = np.mean(noises * normalized_scores.reshape((-1, 1)), 52 | axis=0) 53 | reward = reward + rate * noise 54 | print("At iteration {} posterior={}.".format(i, scores.mean())) 55 | 56 | reward = reward.reshape(self.env.shape) 57 | return reward 58 | 59 | def calculate_likelihood(self, trajectories, Q): 60 | mean_log_prob = 0.0 61 | for t in trajectories: 62 | t_log_prob = 0.0 63 | for s, a in t: 64 | expert_value = self.eta * Q[s][a] 65 | total = [self.eta * Q[s][_a] for _a in self.env.actions] 66 | t_log_prob += (expert_value - logsumexp(total)) 67 | mean_log_prob += t_log_prob 68 | mean_log_prob /= len(trajectories) 69 | return mean_log_prob 70 | 71 | 72 | if __name__ == "__main__": 73 | def test_estimate(): 74 | from environment import GridWorldEnv 75 | env = GridWorldEnv(grid=[ 76 | [0, 0, 0, 1], 77 | [0, 0, 0, 0], 78 | [0, -1, 0, 0], 79 | [0, 0, 0, 0], 80 | ]) 81 | # Train Teacher 82 | teacher = PolicyIterationPlanner(env) 83 | teacher.plan() 84 | trajectories = [] 85 | print("Gather demonstrations of teacher.") 86 | for i in range(20): 87 | s = env.reset() 88 | done = False 89 | steps = [] 90 | while not done: 91 | a = teacher.act(s) 92 | steps.append((s, a)) 93 | n_s, r, done, _ = env.step(a) 94 | s = n_s 95 | trajectories.append(steps) 96 | 97 | print("Estimate reward.") 98 | irl = BayesianIRL(env) 99 | rewards = irl.estimate(trajectories) 100 | print(rewards) 101 | env.plot_on_grid(rewards) 102 | 103 | test_estimate() 104 | -------------------------------------------------------------------------------- /IRL/environment.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym.envs.toy_text import discrete 3 | import matplotlib.pyplot as plt 4 | import matplotlib.cm as cm 5 | 6 | 7 | class GridWorldEnv(discrete.DiscreteEnv): 8 | 9 | metadata = {"render.modes": ["human", "ansi"]} 10 | 11 | def __init__(self, grid, move_prob=0.8, default_reward=0.0): 12 | # grid is 2d-array, and each value treated as attribute. 
13 | # attribute is 14 | # 0: ordinary cell 15 | # -1: damage cell (game end) 16 | # 1: reward cell (game end) 17 | self.grid = grid 18 | if isinstance(grid, (list, tuple)): 19 | self.grid = np.array(grid) 20 | self._actions = { 21 | "LEFT": 0, 22 | "DOWN": 1, 23 | "RIGHT": 2, 24 | "UP": 3, 25 | } 26 | self.default_reward = default_reward 27 | self.move_prob = move_prob 28 | 29 | num_states = self.nrow * self.ncol 30 | num_actions = len(self._actions) 31 | 32 | # start from left down 33 | initial_state_prob = np.zeros(num_states) 34 | initial_state_prob[self.coordinate_to_state(self.nrow - 1, 0)] = 1.0 35 | 36 | # Make transitions 37 | P = {} 38 | 39 | for s in range(num_states): 40 | if s not in P: 41 | P[s] = {} 42 | 43 | reward = self.reward_func(s) 44 | done = self.has_done(s) 45 | if done: 46 | # Terminal state 47 | for a in range(num_actions): 48 | P[s][a] = [] 49 | P[s][a].append([1.0, None, reward, done]) 50 | else: 51 | for a in range(num_actions): 52 | P[s][a] = [] 53 | transition_probs = self.transit_func(s, a) 54 | for n_s in transition_probs: 55 | reward = self.reward_func(n_s) 56 | done = self.has_done(s) 57 | P[s][a].append([transition_probs[n_s], n_s, 58 | reward, done]) 59 | self.P = P 60 | super().__init__(num_states, num_actions, P, initial_state_prob) 61 | 62 | @property 63 | def nrow(self): 64 | return self.grid.shape[0] 65 | 66 | @property 67 | def ncol(self): 68 | return self.grid.shape[1] 69 | 70 | @property 71 | def shape(self): 72 | return self.grid.shape 73 | 74 | @property 75 | def actions(self): 76 | return list(range(self.action_space.n)) 77 | 78 | @property 79 | def states(self): 80 | return list(range(self.observation_space.n)) 81 | 82 | def state_to_coordinate(self, s): 83 | row, col = divmod(s, self.nrow) 84 | return row, col 85 | 86 | def coordinate_to_state(self, row, col): 87 | index = row * self.nrow + col 88 | return index 89 | 90 | def state_to_feature(self, s): 91 | feature = np.zeros(self.observation_space.n) 92 | feature[s] = 1.0 93 | return feature 94 | 95 | def transit_func(self, state, action): 96 | transition_probs = {} 97 | opposite_direction = (action + 2) % 4 98 | candidates = [a for a in range(len(self._actions)) 99 | if a != opposite_direction] 100 | 101 | for a in candidates: 102 | prob = 0 103 | if a == action: 104 | prob = self.move_prob 105 | else: 106 | prob = (1 - self.move_prob) / 2 107 | 108 | next_state = self._move(state, a) 109 | if next_state not in transition_probs: 110 | transition_probs[next_state] = prob 111 | else: 112 | transition_probs[next_state] += prob 113 | 114 | return transition_probs 115 | 116 | def reward_func(self, state): 117 | row, col = self.state_to_coordinate(state) 118 | reward = self.grid[row][col] 119 | return reward 120 | 121 | def has_done(self, state): 122 | row, col = self.state_to_coordinate(state) 123 | reward = self.grid[row][col] 124 | if np.abs(reward) == 1: 125 | return True 126 | else: 127 | return False 128 | 129 | def _move(self, state, action): 130 | next_state = state 131 | row, col = self.state_to_coordinate(state) 132 | next_row, next_col = row, col 133 | 134 | # Move state by action 135 | if action == self._actions["LEFT"]: 136 | next_col -= 1 137 | elif action == self._actions["DOWN"]: 138 | next_row += 1 139 | elif action == self._actions["RIGHT"]: 140 | next_col += 1 141 | elif action == self._actions["UP"]: 142 | next_row -= 1 143 | 144 | # Check the out of grid 145 | if not (0 <= next_row < self.nrow): 146 | next_row, next_col = row, col 147 | if not (0 <= next_col < self.ncol): 
148 | next_row, next_col = row, col 149 | 150 | next_state = self.coordinate_to_state(next_row, next_col) 151 | 152 | return next_state 153 | 154 | def plot_on_grid(self, values): 155 | if len(values.shape) < 2: 156 | values = values.reshape(self.shape) 157 | fig, ax = plt.subplots() 158 | ax.imshow(values, cmap=cm.RdYlGn) 159 | ax.set_xticks(np.arange(self.ncol)) 160 | ax.set_yticks(np.arange(self.nrow)) 161 | fig.tight_layout() 162 | plt.show() 163 | 164 | 165 | if __name__ == "__main__": 166 | def test_grid(): 167 | env = GridWorldEnv(grid=[ 168 | [1, 0, 0, 0], 169 | [0, 0, 0, 0], 170 | [0, 0, 0, 0], 171 | [0, 0, 0, 0], 172 | ], move_prob=1.0) 173 | s = env.reset() 174 | assert s == 12, "Start position is not left down" 175 | s, r, d, _ = env.step(0) # Move to left wall 176 | assert s == 12, "Agent should be bumped to left wall" 177 | s, r, d, _ = env.step(1) # Move to bottom wall 178 | assert s == 12, "Agent should be bumped to bottom wall" 179 | s, r, d, _ = env.step(2) # Move to right 180 | assert s == 13, "Agent should go to right" 181 | s, r, d, _ = env.step(3) # Move to up 182 | assert s == 9, "Agent should go to up" 183 | env.step(3) # UP 184 | env.step(3) # UP 185 | s, r, d, _ = env.step(0) # LEFT 186 | assert s == 0, "Agent locate last state" 187 | s, r, d, _ = env.step(0) # MOVE 188 | assert s is None, "Next state does not exist" 189 | assert d, "Agent should reache the goal" 190 | assert r == 1, "Agent should get reward" 191 | 192 | test_grid() 193 | -------------------------------------------------------------------------------- /IRL/maxent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from planner import PolicyIterationPlanner 3 | from tqdm import tqdm 4 | 5 | 6 | class MaxEntIRL(): 7 | 8 | def __init__(self, env): 9 | self.env = env 10 | self.planner = PolicyIterationPlanner(env) 11 | 12 | def estimate(self, trajectories, epoch=20, learning_rate=0.01, gamma=0.9): 13 | state_features = np.vstack([self.env.state_to_feature(s) 14 | for s in self.env.states]) 15 | theta = np.random.uniform(size=state_features.shape[1]) 16 | teacher_features = self.calculate_expected_feature(trajectories) 17 | 18 | for e in tqdm(range(epoch)): 19 | # Estimate reward. 20 | rewards = state_features.dot(theta.T) 21 | 22 | # Optimize policy under estimated reward. 23 | self.planner.reward_func = lambda s: rewards[s] 24 | self.planner.plan(gamma=gamma) 25 | 26 | # Estimate feature under policy. 27 | features = self.expected_features_under_policy( 28 | self.planner.policy, trajectories) 29 | 30 | # Update to close to teacher. 
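            # The next line is the gradient of the maximum-entropy IRL objective
            # with respect to theta: the expert's empirical feature expectations
            # minus the feature expectations induced by the current policy.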
31 | update = teacher_features - features.dot(state_features) 32 | theta += learning_rate * update 33 | 34 | estimated = state_features.dot(theta.T) 35 | estimated = estimated.reshape(self.env.shape) 36 | return estimated 37 | 38 | def calculate_expected_feature(self, trajectories): 39 | features = np.zeros(self.env.observation_space.n) 40 | for t in trajectories: 41 | for s in t: 42 | features[s] += 1 43 | 44 | features /= len(trajectories) 45 | return features 46 | 47 | def expected_features_under_policy(self, policy, trajectories): 48 | t_size = len(trajectories) 49 | states = self.env.states 50 | transition_probs = np.zeros((t_size, len(states))) 51 | 52 | initial_state_probs = np.zeros(len(states)) 53 | for t in trajectories: 54 | initial_state_probs[t[0]] += 1 55 | initial_state_probs /= t_size 56 | transition_probs[0] = initial_state_probs 57 | 58 | for t in range(1, t_size): 59 | for prev_s in states: 60 | prev_prob = transition_probs[t - 1][prev_s] 61 | a = self.planner.act(prev_s) 62 | probs = self.env.transit_func(prev_s, a) 63 | for s in probs: 64 | transition_probs[t][s] += prev_prob * probs[s] 65 | 66 | total = np.mean(transition_probs, axis=0) 67 | return total 68 | 69 | 70 | if __name__ == "__main__": 71 | def test_estimate(): 72 | from environment import GridWorldEnv 73 | env = GridWorldEnv(grid=[ 74 | [0, 0, 0, 1], 75 | [0, 0, 0, 0], 76 | [0, -1, 0, 0], 77 | [0, 0, 0, 0], 78 | ]) 79 | # Train Teacher 80 | teacher = PolicyIterationPlanner(env) 81 | teacher.plan() 82 | trajectories = [] 83 | print("Gather demonstrations of teacher.") 84 | for i in range(20): 85 | s = env.reset() 86 | done = False 87 | steps = [s] 88 | while not done: 89 | a = teacher.act(s) 90 | n_s, r, done, _ = env.step(a) 91 | steps.append(n_s) 92 | s = n_s 93 | trajectories.append(steps) 94 | 95 | print("Estimate reward.") 96 | irl = MaxEntIRL(env) 97 | rewards = irl.estimate(trajectories, epoch=100) 98 | print(rewards) 99 | env.plot_on_grid(rewards) 100 | 101 | test_estimate() 102 | -------------------------------------------------------------------------------- /IRL/planner.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class Planner(): 5 | 6 | def __init__(self, env, reward_func=None): 7 | self.env = env 8 | self.reward_func = reward_func 9 | if self.reward_func is None: 10 | self.reward_func = self.env.reward_func 11 | 12 | def initialize(self): 13 | self.env.reset() 14 | 15 | def transitions_at(self, state, action): 16 | reward = self.reward_func(state) 17 | done = self.env.has_done(state) 18 | transition = [] 19 | if not done: 20 | transition_probs = self.env.transit_func(state, action) 21 | for next_state in transition_probs: 22 | prob = transition_probs[next_state] 23 | reward = self.reward_func(next_state) 24 | done = self.env.has_done(state) 25 | transition.append((prob, next_state, reward, done)) 26 | else: 27 | transition.append((1.0, None, reward, done)) 28 | for p, n_s, r, d in transition: 29 | yield p, n_s, r, d 30 | 31 | def plan(self, gamma=0.9, threshold=0.0001): 32 | raise Exception("Planner have to implements plan method.") 33 | 34 | 35 | class ValueIterationPlanner(Planner): 36 | 37 | def __init__(self, env): 38 | super().__init__(env) 39 | 40 | def plan(self, gamma=0.9, threshold=0.0001): 41 | self.initialize() 42 | V = np.zeros(len(self.env.states)) 43 | while True: 44 | delta = 0 45 | for s in self.env.states: 46 | expected_rewards = [] 47 | for a in self.env.actions: 48 | reward = 0 49 | for p, n_s, r, done 
in self.transitions_at(s, a): 50 | if n_s is None: 51 | reward = r 52 | continue 53 | reward += p * (r + gamma * V[n_s] * (not done)) 54 | expected_rewards.append(reward) 55 | max_reward = max(expected_rewards) 56 | delta = max(delta, abs(max_reward - V[s])) 57 | V[s] = max_reward 58 | 59 | if delta < threshold: 60 | break 61 | 62 | return V 63 | 64 | 65 | class PolicyIterationPlanner(Planner): 66 | 67 | def __init__(self, env): 68 | super().__init__(env) 69 | self.policy = None 70 | self._limit_count = 1000 71 | 72 | def initialize(self): 73 | super().initialize() 74 | self.policy = np.ones((self.env.observation_space.n, 75 | self.env.action_space.n)) 76 | # First, take each action uniformly. 77 | self.policy = self.policy / self.env.action_space.n 78 | 79 | def policy_to_q(self, V, gamma): 80 | Q = np.zeros((self.env.observation_space.n, 81 | self.env.action_space.n)) 82 | 83 | for s in self.env.states: 84 | for a in self.env.actions: 85 | a_p = self.policy[s][a] 86 | for p, n_s, r, done in self.transitions_at(s, a): 87 | if done: 88 | Q[s][a] += p * a_p * r 89 | else: 90 | Q[s][a] += p * a_p * (r + gamma * V[n_s]) 91 | return Q 92 | 93 | def estimate_by_policy(self, gamma, threshold): 94 | V = np.zeros(self.env.observation_space.n) 95 | 96 | count = 0 97 | while True: 98 | delta = 0 99 | for s in self.env.states: 100 | expected_rewards = [] 101 | for a in self.env.actions: 102 | action_prob = self.policy[s][a] 103 | reward = 0 104 | for p, n_s, r, done in self.transitions_at(s, a): 105 | if n_s is None: 106 | reward = r 107 | continue 108 | reward += action_prob * p * \ 109 | (r + gamma * V[n_s] * (not done)) 110 | expected_rewards.append(reward) 111 | value = sum(expected_rewards) 112 | delta = max(delta, abs(value - V[s])) 113 | V[s] = value 114 | 115 | if delta < threshold or count > self._limit_count: 116 | break 117 | count += 1 118 | 119 | return V 120 | 121 | def act(self, s): 122 | return np.argmax(self.policy[s]) 123 | 124 | def plan(self, gamma=0.9, threshold=0.0001, keep_policy=False): 125 | if not keep_policy: 126 | self.initialize() 127 | 128 | count = 0 129 | while True: 130 | update_stable = True 131 | # Estimate expected reward under current policy. 132 | V = self.estimate_by_policy(gamma, threshold) 133 | 134 | for s in self.env.states: 135 | # Get action following to the policy (choose max prob's action). 136 | policy_action = self.act(s) 137 | 138 | # Compare with other actions. 139 | action_rewards = np.zeros(len(self.env.actions)) 140 | for a in self.env.actions: 141 | reward = 0 142 | for p, n_s, r, done in self.transitions_at(s, a): 143 | if n_s is None: 144 | reward = r 145 | continue 146 | reward += p * (r + gamma * V[n_s] * (not done)) 147 | action_rewards[a] = reward 148 | best_action = np.argmax(action_rewards) 149 | if policy_action != best_action: 150 | update_stable = False 151 | 152 | # Update policy (set best_action prob=1, otherwise=0 (greedy)). 153 | self.policy[s] = np.zeros(len(self.env.actions)) 154 | self.policy[s][best_action] = 1.0 155 | 156 | if update_stable or count > self._limit_count: 157 | # If policy isn't updated, stop iteration. 
158 | break 159 | count += 1 160 | 161 | return V 162 | 163 | 164 | if __name__ == "__main__": 165 | def test_plan(): 166 | from environment import GridWorldEnv 167 | env = GridWorldEnv(grid=[ 168 | [0, 0, 0, 1], 169 | [0, 0, 0, 0], 170 | [0, -1, 0, 0], 171 | [0, 0, 0, 0], 172 | ]) 173 | print("Value Iteration.") 174 | vp = ValueIterationPlanner(env) 175 | v = vp.plan() 176 | print(v.reshape(env.shape)) 177 | 178 | print("Policy Iteration.") 179 | pp = PolicyIterationPlanner(env) 180 | v = pp.plan() 181 | print(v.reshape(env.shape)) 182 | q = pp.policy_to_q(v, 0.9) 183 | print(np.sum(q, axis=1).reshape(env.shape)) 184 | 185 | test_plan() 186 | -------------------------------------------------------------------------------- /ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ### 指摘事項 2 | 3 | 4 | ### 指摘箇所 5 | 6 | * [ ] Day1: 強化学習の位置づけを知る 7 | * [ ] Day2: 強化学習の解法(1): 環境から計画を立てる 8 | * [ ] Day3: 強化学習の解法(2): 経験から計画を立てる 9 | * [ ] Day4: 強化学習に対するニューラルネットワークの適用 10 | * [ ] Day5: 強化学習の弱点 11 | * [ ] Day6: 強化学習の弱点を克服するための手法 12 | * [ ] Day7: 強化学習の活用領域 13 | 14 | ページ番号: p 15 | 16 | ### 実行環境 17 | 18 | * OS: 19 | * Python version: 20 | * `pip freeze`の実行結果 (下に添付) 21 | 22 | ### エラー内容 23 | 24 | (例外のメッセージ、ログ、画面ショットなどを添付) 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /MM/dyna.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | from collections import defaultdict, Counter 4 | import gym 5 | from gym.envs.registration import register 6 | register(id="FrozenLakeEasy-v0", entry_point="gym.envs.toy_text:FrozenLakeEnv", 7 | kwargs={"is_slippery": False}) 8 | 9 | 10 | class DynaAgent(): 11 | 12 | def __init__(self, epsilon=0.1): 13 | self.epsilon = epsilon 14 | self.actions = [] 15 | self.value = None 16 | 17 | def policy(self, state): 18 | if np.random.random() < self.epsilon: 19 | return np.random.randint(len(self.actions)) 20 | else: 21 | if sum(self.value[state]) == 0: 22 | return np.random.randint(len(self.actions)) 23 | else: 24 | return np.argmax(self.value[state]) 25 | 26 | def learn(self, env, episode_count=3000, gamma=0.9, learning_rate=0.1, 27 | steps_in_model=-1, report_interval=100): 28 | self.actions = list(range(env.action_space.n)) 29 | self.value = defaultdict(lambda: [0] * len(self.actions)) 30 | model = Model(self.actions) 31 | 32 | rewards = [] 33 | for e in range(episode_count): 34 | s = env.reset() 35 | done = False 36 | goal_reward = 0 37 | while not done: 38 | a = self.policy(s) 39 | n_state, reward, done, info = env.step(a) 40 | 41 | # Update from experiments in the real environment. 42 | gain = reward + gamma * max(self.value[n_state]) 43 | estimated = self.value[s][a] 44 | self.value[s][a] += learning_rate * (gain - estimated) 45 | 46 | if steps_in_model > 0: 47 | model.update(s, a, reward, n_state) 48 | for s, a, r, n_s in model.simulate(steps_in_model): 49 | gain = r + gamma * max(self.value[n_s]) 50 | estimated = self.value[s][a] 51 | self.value[s][a] += learning_rate * (gain - estimated) 52 | 53 | s = n_state 54 | else: 55 | goal_reward = reward 56 | 57 | rewards.append(goal_reward) 58 | if e != 0 and e % report_interval == 0: 59 | recent = np.array(rewards[-report_interval:]) 60 | print("At episode {}, reward is {}".format( 61 | e, recent.mean())) 62 | 63 | 64 | class Model(): 65 | 66 | def __init__(self, actions): 67 | self.num_actions = len(actions) 68 | self.transit_count = defaultdict(lambda: [Counter() for a in actions]) 69 | self.total_reward = defaultdict(lambda: [0] * 70 | self.num_actions) 71 | self.history = defaultdict(Counter) 72 | 73 | def update(self, state, action, reward, next_state): 74 | self.transit_count[state][action][next_state] += 1 75 | self.total_reward[state][action] += reward 76 | self.history[state][action] += 1 77 | 78 | def transit(self, state, action): 79 | counter = self.transit_count[state][action] 80 | states = [] 81 | counts = [] 82 | for s, c in counter.most_common(): 83 | states.append(s) 84 | counts.append(c) 85 | probs = np.array(counts) / sum(counts) 86 | return np.random.choice(states, p=probs) 87 | 88 | def reward(self, state, action): 89 | total_reward = self.total_reward[state][action] 90 | total_count = self.history[state][action] 91 | return total_reward / total_count 92 | 93 | def simulate(self, count): 94 | states = list(self.transit_count.keys()) 95 | actions = lambda s: [a for a, c in self.history[s].most_common() 96 | if c > 0] 97 | 98 | for i in range(count): 99 | state = np.random.choice(states) 100 | action = np.random.choice(actions(state)) 101 | 102 | next_state = self.transit(state, action) 103 | reward = self.reward(state, action) 104 | 105 | yield state, action, reward, next_state 106 | 107 | 108 | def 
main(steps_in_model): 109 | env = gym.make("FrozenLakeEasy-v0") 110 | agent = DynaAgent() 111 | agent.learn(env, steps_in_model=steps_in_model) 112 | 113 | 114 | if __name__ == "__main__": 115 | parser = argparse.ArgumentParser(description="Dyna Agent") 116 | parser.add_argument("--modelstep", type=int, default=-1, 117 | help="step count in the model") 118 | 119 | args = parser.parse_args() 120 | main(args.modelstep) 121 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Pythonで学ぶ強化学習 -入門から実践まで- 2 | 3 | [Pythonで学ぶ強化学習 -入門から実践まで-](https://www.amazon.co.jp/dp/4065142989/)の実装コードリポジトリです。 4 | 5 | 誤記、またサンプルコードの実行エラーについてはIssueで管理しています。 6 | 7 | **[Issue List](https://github.com/icoxfog417/baby-steps-of-rl-ja/issues)** 8 | 9 | * [3刷(2/4発行)での修正点](https://github.com/icoxfog417/baby-steps-of-rl-ja/milestone/1?closed=1) 10 | * [ソースコードの修正点](https://github.com/icoxfog417/baby-steps-of-rl-ja/pull/17/files) 11 | * [改訂第2版での修正点](https://github.com/icoxfog417/baby-steps-of-rl-ja/milestone/2?closed=1) 12 | * [ソースコードの修正点](https://github.com/icoxfog417/baby-steps-of-rl-ja/pull/35/files) 13 | * [改訂第2版4刷での修正点](https://github.com/icoxfog417/baby-steps-of-rl-ja/milestone/3) 14 | * [ソースコードの修正点](https://github.com/icoxfog417/baby-steps-of-rl-ja/pull/59/files) 15 | 16 | 17 | 誤記/表記についての指摘、またサンプルコードの実行エラーについては[Issueにて](https://github.com/icoxfog417/baby-steps-of-rl-ja/issues/new)ご連絡をお願いします。 18 | 19 | * 既に同じIssueが上がっていないか、事前に確認をお願いします。 20 | * 実行エラーについては、テンプレートに沿い実行環境、発生エラーについて記載をお願いします。 21 | 22 | ## Index 23 | 24 | * [Setup](https://github.com/icoxfog417/baby-steps-of-rl-ja#setup) 25 | * [Setup with GPU](https://github.com/icoxfog417/baby-steps-of-rl-ja#setup-with-gpu) 26 | * [Day1: 強化学習の位置づけを知る](https://github.com/icoxfog417/baby-steps-of-rl-ja#day1-%E5%BC%B7%E5%8C%96%E5%AD%A6%E7%BF%92%E3%81%AE%E4%BD%8D%E7%BD%AE%E3%81%A5%E3%81%91%E3%82%92%E7%9F%A5%E3%82%8B) 27 | * [Day2: 強化学習の解法(1): 環境から計画を立てる](https://github.com/icoxfog417/baby-steps-of-rl-ja#day2-%E5%BC%B7%E5%8C%96%E5%AD%A6%E7%BF%92%E3%81%AE%E8%A7%A3%E6%B3%951-%E7%92%B0%E5%A2%83%E3%81%8B%E3%82%89%E8%A8%88%E7%94%BB%E3%82%92%E7%AB%8B%E3%81%A6%E3%82%8B) 28 | * [Day3: 強化学習の解法(2): 経験から計画を立てる](https://github.com/icoxfog417/baby-steps-of-rl-ja#day3-%E5%BC%B7%E5%8C%96%E5%AD%A6%E7%BF%92%E3%81%AE%E8%A7%A3%E6%B3%952-%E7%B5%8C%E9%A8%93%E3%81%8B%E3%82%89%E8%A8%88%E7%94%BB%E3%82%92%E7%AB%8B%E3%81%A6%E3%82%8B) 29 | * [Day4: 強化学習に対するニューラルネットワークの適用](https://github.com/icoxfog417/baby-steps-of-rl-ja#day4-%E5%BC%B7%E5%8C%96%E5%AD%A6%E7%BF%92%E3%81%AB%E5%AF%BE%E3%81%99%E3%82%8B%E3%83%8B%E3%83%A5%E3%83%BC%E3%83%A9%E3%83%AB%E3%83%8D%E3%83%83%E3%83%88%E3%83%AF%E3%83%BC%E3%82%AF%E3%81%AE%E9%81%A9%E7%94%A8) 30 | * [Day5: 強化学習の弱点](https://github.com/icoxfog417/baby-steps-of-rl-ja#day5-%E5%BC%B7%E5%8C%96%E5%AD%A6%E7%BF%92%E3%81%AE%E5%BC%B1%E7%82%B9) 31 | * [Day6: 強化学習の弱点を克服するための手法](https://github.com/icoxfog417/baby-steps-of-rl-ja#day6-%E5%BC%B7%E5%8C%96%E5%AD%A6%E7%BF%92%E3%81%AE%E5%BC%B1%E7%82%B9%E3%82%92%E5%85%8B%E6%9C%8D%E3%81%99%E3%82%8B%E3%81%9F%E3%82%81%E3%81%AE%E6%89%8B%E6%B3%95) 32 | * [Day7: 強化学習の活用領域](https://github.com/icoxfog417/baby-steps-of-rl-ja#day7-%E5%BC%B7%E5%8C%96%E5%AD%A6%E7%BF%92%E3%81%AE%E6%B4%BB%E7%94%A8%E9%A0%98%E5%9F%9F) 33 | 34 | [Support Content](https://github.com/icoxfog417/baby-steps-of-rl-ja#support-content) 35 | 36 | ## Setup 37 | 38 | 
You need Git to download the sample code and a Python environment to run it. Please download and install the following two pieces of software. In this book, Miniconda is used to create the Python environment.

1. [Git](https://git-scm.com/)
2. [Python (Miniconda)](https://conda.io/miniconda.html)
   * Download the Python 3 version.

Once installation is finished, download the source code. Open a terminal/command prompt and run the following command in your working directory.

```
git clone https://github.com/icoxfog417/baby-steps-of-rl-ja.git
```

After running the command, a directory named `baby-steps-of-rl-ja` should have been created, and the download is complete. Move into the downloaded folder.

```
cd baby-steps-of-rl-ja
```

Next, create the environment for running the source code. We use the `conda` command, which becomes available by installing Miniconda, to create the book's environment, named `rl-book`.

```
conda create -n rl-book python=3.6
conda activate rl-book
```

After running `conda activate`, `(rl-book)` should appear at the beginning of your terminal prompt. This is the sign that the environment is active. Whenever you run the book's source code, first check that the environment is active, i.e. that `(rl-book)` is shown at the prompt. To deactivate it, run `conda deactivate`.

Install the required libraries into the environment (check that `(rl-book)` is shown before running this).

```
pip install -r requirements.txt
```

Try running `welcome.py` as follows. If a game screen comes up, setup is complete.

```
python welcome.py
```

## Setup with GPU

Day4 implements reinforcement learning that uses deep learning (DQN/A2C), and a GPU is essential for that training. Without a GPU, training takes several days.

Training on a GPU naturally requires one (more specifically, an NVIDIA GPU). There are two ways to get access to a GPU:

1. Prepare a machine with a GPU
2. Use a GPU in the cloud
   * Use a GPU instance on a cloud platform
   * Use a GPU on Google Colaboratory

### Local GPU Machine Setup

If you have a machine with a GPU, set it up with the following steps.

1. Download and install the NVIDIA driver
   * [NVIDIA Driver Downloads](https://www.nvidia.co.jp/Download/index.aspx?lang=jp)
   * The page usually selects the driver you should download automatically. If it does not, choose and download it manually.
2. Install the CUDA Toolkit
   * From the [CUDA Toolkit Archive](https://developer.nvidia.com/cuda-toolkit-archive), download and install the version supported by the TensorFlow you use ([TensorFlow 1.13 and later use CUDA 10](https://www.tensorflow.org/install/gpu)).
3. Install cuDNN
   * Download [cuDNN](https://developer.nvidia.com/cudnn) and extract it into the Toolkit's folder. Note that downloading cuDNN requires user registration.
4. Install `tensorflow-gpu` instead of `tensorflow` (if `tensorflow` is already installed, uninstall it first).

If `import tensorflow as tf` runs without any error, setup is complete.

```
> python
>>> import tensorflow as tf
```

### Cloud GPU Machine Setup

AWS, Azure, and GCP offer GPU instances. If you use them, you do not need to prepare a GPU machine yourself. The setup procedure on a GPU instance is the same as in [Local GPU Machine Setup](https://github.com/icoxfog417/baby-steps-of-rl-ja#local-gpu-machine-setup). For pre-configured instances (such as SageMaker), no setup is needed at all.

Using GPU instances naturally costs money, so here are Google Colaboratory notebooks that let you run GPU computations for free:

* [Day4: Applying deep learning to value estimation: Deep Q-Network](https://colab.research.google.com/drive/1QZs38jqCaSIpoKmoIl8XxVJUwdG78Hb8)
* [Day4: Applying deep learning to the policy: Advantage Actor Critic (A2C)](https://colab.research.google.com/drive/1IzXGuNj4ZbsuWC7ei98ZzKrVk7mPS1t-)

Google Colaboratory is a service that lets you use Jupyter Notebook online, including GPU-backed computation. Execution time is limited, though: long runs are not possible, but it is quite sufficient for training within the allowed time and downloading the resulting model.


## Day1: Understanding where reinforcement learning fits

**Day1's Goals**

* Understand how reinforcement learning relates to keywords such as machine learning and artificial intelligence
* Understand the advantages and disadvantages of reinforcement learning compared with other learning methods
* Understand the basic mechanism of machine learning

**Summary**

* What is reinforcement learning?
  * Reinforcement learning ⊂ machine learning ⊂ artificial intelligence.
  * Machine learning = a method for making a "machine" (= a model) "learn".
  * Reinforcement learning = one kind of "learning" method.
  * Reinforcement learning aims to maximize the "total reward" obtained through a sequence of actions.
  * It learns how to evaluate actions and how to choose actions based on that evaluation (= the policy).
* Advantages and disadvantages of reinforcement learning
  * Advantage: it can handle tasks that are hard to evaluate directly (because it learns how to evaluate actions).
  * Disadvantage: you cannot control what behavior is learned (the model acquires it on its own).
* The basic mechanism of reinforcement learning
  * Reinforcement learning assumes that the given "environment" follows a fixed set of rules.
  * These rules are called a **Markov Decision Process (MDP)**.
  * The components of an MDP and their relationships can be diagrammed as below.
  * In an MDP, the reward depends on "the immediately preceding state and the transition destination".
  * This reward is called the **immediate reward**.
  * The total reward (= the sum of immediate rewards) naturally cannot be known in advance.
  * It is therefore estimated; the estimated value is called the **expected reward**, or the **value**.
  * When estimating, future immediate rewards are discounted.
  * The coefficient used for discounting is called the **discount factor** (a small numeric sketch follows the figure below).

![Components of an MDP and their relationships](doc/mdp.PNG)
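To make the discounted total reward concrete, here is a minimal sketch (not code from this repository) of how a sequence of immediate rewards is collapsed into a single value using a discount factor:

```
# Minimal illustrative sketch: discounted sum of immediate rewards.
def discounted_return(immediate_rewards, gamma=0.9):
    total = 0.0
    for t, reward in enumerate(immediate_rewards):
        total += (gamma ** t) * reward
    return total


print(discounted_return([0, 0, 1]))  # 0.9 ** 2 * 1 = 0.81
```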

**Exercises**

* [Implementing an MDP](https://github.com/icoxfog417/baby-steps-of-rl-ja/blob/master/DP/environment.py)

## Day2: Reinforcement learning methods (1): planning from the environment

**Day2's Goals**

* Understand the definition of "value", the measure used to evaluate actions
* Understand how to learn the "value" of states with dynamic programming, and how to implement it
* Understand how to learn the "policy" with dynamic programming, and how to implement it
* Understand the difference between model-based and model-free methods

**Summary**

* Defining "value"
  * Computing the "value" defined in Day1 requires immediate rewards at future time steps.
  * Future immediate rewards are, of course, unknown at the time of calculation.
  * The formula is therefore defined recursively, so that the part involving unknown values can be deferred.
  * There are several candidate immediate rewards that may occur, and which one occurs is probabilistic.
  * The reward is therefore expressed as an expected value (probability x value), i.e. action probability x immediate reward.
  * The formula that computes "value" recursively and as an expectation is called the **Bellman Equation**.
* Learning the "value" of states vs. learning the "policy"
  * The **Bellman Equation** uses the policy (action probabilities) to compute the expectation.
  * Once the expected value (the value) is computed, the policy is revised based on it (so that a higher value is obtained).
  * Computing values, updating the policy, recomputing values... this process is repeated.
  * In dynamic programming, the process of updating policy and value alternately is called **Policy Iteration**.
  * On the other hand, there is the simple idea that if values can be computed, you can just pick the state with the highest value.
  * In that case, value = policy.
  * In dynamic programming, the process that treats value = policy and updates only the value is called **Value Iteration**.
  * Whether to keep an explicit policy (policy-based) or to treat value = policy (value-based) is an important distinction in reinforcement learning.
* Model-based vs. model-free
  * With dynamic programming, the policy/value was learned without moving the agent at all.
  * This feat is possible because the transition function and reward function are known, so simulation is possible.
  * Methods that learn from such information about the environment are called **model-based** methods.
  * Since the transition and reward functions are rarely known, in practice they have to be estimated.
  * Methods that instead learn from experience gained by actually moving the agent are called **model-free** methods.
  * They are called model "free" because no model information (transition/reward function) is needed.
  * The more complex the environment, the harder it is to estimate a model, so model-free methods are generally used more often.
  * However, with the arrival of highly expressive DNNs, this is no longer always the case.
  * There are also many attempts to combine model-free and model-based methods.

**Exercises**

* [Defining value: implementing the Bellman Equation](https://github.com/icoxfog417/baby-steps-of-rl-ja/blob/master/DP/bellman_equation.py)
* [Implementing Value Iteration and Policy Iteration](https://github.com/icoxfog417/baby-steps-of-rl-ja/blob/master/DP/planner.py)

A simulator is provided so you can try out Value Iteration/Policy Iteration.
Run the following script and access the server it starts ([you can also try it here](https://baby-step-of-rl-ja-dp.herokuapp.com/)).

```
python DP/run_server.py
```

http://localhost:8888/

![application.PNG](doc/application.PNG)

* Specify rows and columns under Area and press the Draw button to create a maze of that size.
* Select cells in the maze and press the Cell Setting button to configure them.
* Treasure is a goal with a positive reward, Danger a goal with a negative reward. Block is a cell that cannot be entered.
* Once the maze is set up, press a button under Simulation to plan.
* Pressing either the Value Iteration or Policy Iteration button shows the result solved with that algorithm.

## Day3: Reinforcement learning methods (2): planning from experience

**Day3's Goals**

* Understand the three perspectives for making use of experience:
  1. The balance between accumulating and exploiting experience
  2. Whether to revise the plan from actual results or from predictions
  3. Whether to use experience to update the value or the policy
* Understand the opposing pair within each perspective
* Learn how to implement representative methods for each perspective

**Summary**

* What is "experience"?
  * It is the difference between the value estimated "before acting" and the actual value that becomes known "after acting" (a small numeric sketch follows the figure below).
  * The more you act, the more actual immediate rewards become known, and the less you depend on estimates.
  * This can also be seen as a difference between time steps: the point "before acting" vs. the point "after acting".
  * It is therefore called the **TD error (Temporal Difference error)**.

![Experience = TD error](doc/td.PNG)
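The numeric sketch promised above (illustrative only, not the repository's implementation): the value estimated before acting is compared against "immediate reward + discounted estimate after acting", and the gap, the TD error, is used to correct the estimate.

```
# Illustrative TD(0)-style update of a single state-value estimate.
gamma = 0.9
learning_rate = 0.1
V = {"before": 0.5, "after": 0.8}   # current value estimates

reward = 1.0                        # immediate reward observed by acting
td_error = reward + gamma * V["after"] - V["before"]
V["before"] += learning_rate * td_error
print(td_error, V["before"])        # 1.22 and 0.622
```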

* The balance between accumulating and exploiting experience
  * In the model-free setting, the transition/reward functions are unknown.
  * Raising the reliability of "experience" therefore requires multiple trials.
  * (Buying one lottery ticket and winning does not mean the lottery's winning probability is 100%.)
  * The number of actions is usually limited.
  * The available actions therefore have to be split between "raising the reliability of experience" (improving the estimates) and "acting on the experience you trust".
  * This is called the **exploration-exploitation trade-off**.
  * (Exploration = improving reliability, exploitation = acting on what you trust.)
  * The method that switches between exploration and exploitation with probability epsilon is called the **epsilon-greedy method**.
* Revising the plan from actual results or from predictions
  * "After acting" can be, at the shortest, after a single action and, at the longest, after the episode has finished.
  * The former is called the **TD method (TD(0))**, the latter the **Monte Carlo method**.
  * The longer you wait "after acting", the more the revision is grounded in actual results, but the later the revision happens.
  * Which to favor, actual results or timing, is a trade-off.
  * It is of course possible to take something in between TD(0) and Monte Carlo.
  * Using several steps "after acting" is called **multi-step learning**.
  * Combining experience with different numbers of steps is called the **TD(λ) method**.
* Using experience to update the value or the policy
  * Experience can be used to update either the value or the policy (value-based / policy-based).
  * The method that updates action values based on the TD method is called **Q-learning**.
  * ("Q" is commonly used to denote action value, whereas state value is usually denoted "V".)
  * The method that updates the policy based on the TD method is called **SARSA (State-Action-Reward-State-Action)**.
  * In SARSA, value estimates assume that the next action is decided by the policy. This assumption is called **On-policy**.
  * Assuming, as in value-based methods, that the next action is "the action with the maximum value" is called **Off-policy**.
  * (It is called this because there is no policy = "off".)
  * Q-learning is off-policy; SARSA is on-policy (a small sketch of the difference follows the table below).
  * SARSA uses the same "Q" for both policy evaluation and the policy itself.
  * Alternatively, evaluation and policy can be separated, as in Policy Iteration.
  * The method that separates them, with the policy side as the Actor and the evaluation side as the Critic, is called **Actor-Critic**.
  * Actor-Critic can be seen as a combination of policy-based (Actor) and value-based (Critic).

Summarizing the methods along the three axes of revision basis (actual results/prediction), revision target (value/policy), and estimation assumption (On-policy/Off-policy) gives the following.
| Method | Revision basis (prediction / actual results) | Revision target (value / policy) | Estimation assumption (Off-policy / On-policy) |
| --- | --- | --- | --- |
| Q-learning | Prediction (TD) | Value | Off-policy |
| Monte Carlo | Actual results | Value | |
| SARSA | Prediction (TD) | Policy | On-policy |
| Actor Critic | Prediction (TD) | Value & policy | On-policy |
| Off-policy Actor Critic | Prediction (TD) | Value & policy | Off-policy |
| On-policy Monte Carlo | Actual results | | On-policy |
| Off-policy Monte Carlo | Actual results | | Off-policy |
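To make the Off-policy/On-policy column concrete, here is a hedged sketch (not taken from EL/q_learning.py or EL/sarsa.py) of how Q-learning and SARSA form their one-step targets from the same experience; the next state's action values below are made up:

```
import numpy as np

# Illustrative one-step targets for Q-learning (Off-policy) and SARSA (On-policy).
gamma = 0.9
next_q = np.array([0.2, 0.7, 0.4])      # Q values at the next state

reward = 0.0
q_learning_target = reward + gamma * next_q.max()           # assumes the greedy action
actual_next_action = 2                                       # action the current policy really picked
sarsa_target = reward + gamma * next_q[actual_next_action]  # follows the current policy
print(q_learning_target, sarsa_target)  # Off-policy target 0.63, On-policy target ~0.36
```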
**Exercises**

* The balance between accumulating and exploiting experience
  * [Epsilon-Greedy](https://github.com/icoxfog417/baby-steps-of-rl-ja/blob/master/EL/notebooks/Epsilon%26Greedy.ipynb)
* Revising the plan from actual results or from predictions
  * [Monte Carlo](https://github.com/icoxfog417/baby-steps-of-rl-ja/blob/master/EL/notebooks/Monte%20Carlo.ipynb)
  * [Temporal Difference](https://github.com/icoxfog417/baby-steps-of-rl-ja/blob/master/EL/notebooks/Q-learning.ipynb)
* Using experience to update the value or the policy
  * [Value-based & off-policy: Q-learning](https://github.com/icoxfog417/baby-steps-of-rl-ja/blob/master/EL/notebooks/Q-learning.ipynb)
  * [Policy-based & on-policy: SARSA](https://github.com/icoxfog417/baby-steps-of-rl-ja/blob/master/EL/notebooks/SARSA.ipynb)
  * [Value-based & policy-based: Actor Critic](https://github.com/icoxfog417/baby-steps-of-rl-ja/blob/master/EL/notebooks/Actor%26Critic.ipynb)

## Day4: Applying neural networks to reinforcement learning

**Day4's Goals**

* The advantages of using a neural network as the function
* How to implement value estimation as a parameterized function
* How to implement the policy as a parameterized function

**Summary**

* Turning value estimation/policy into functions
  * Up to Day3, the value of each action in each state was kept in a table Q[s][a].
  * It is obvious that this breaks down once the number of states/actions grows.
  * Turning the table into a function is one way to cope with this combinatorial explosion.
  * Reinforcement learning that uses a (deep) neural network as that function is specifically called "deep reinforcement learning".
* Advantages and disadvantages of using a neural network as the function
  * Data close to the "state" that humans actually observe can be used for the agent's learning.
  * This is because DNNs are good at feature extraction (e.g. CNNs for images).
  * However, using neural networks also brings disadvantages such as longer training time (details in [Day5](https://github.com/icoxfog417/baby-steps-of-rl-ja#day5-%E5%BC%B7%E5%8C%96%E5%AD%A6%E7%BF%92%E3%81%AE%E5%BC%B1%E7%82%B9)).
* Implementing value estimation as a parameterized function
  * A function (Q-function) that takes a state and outputs action values (= Q) is implemented with a neural network (a minimal sketch appears after the figure at the end of this section).
  * The implementation that uses a CNN as the neural network is called **Deep Q-Network (DQN)**.
  * There was research using neural networks before DQN, but it suffered from unstable training.
  * DQN overcomes this with three techniques that stabilize training.
  * The three are **Experience Replay**, **Fixed Target Q-Network**, and **reward clipping**.
  * [Rainbow](https://arxiv.org/abs/1710.02298) adds six more techniques on top of DQN.
* Implementing the policy as a parameterized function
  * The policy outputs action probabilities, which cannot be evaluated by a before/after difference the way values can.
  * (When choosing between A and B, you can evaluate how different the chosen A turned out from what you expected: a difference evaluation.)
  * (But "what would have happened had I taken B?" cannot be known without rewinding time.)
  * So instead of shrinking a difference, as with values, the expected value obtained by following the policy is maximized directly.
  * An expected value is computed as probability x value.
  * The policy's expected value can be computed as "probability of reaching a state" x "action probability" x "value obtained by the action" (J(θ)).
  * This expected value is maximized with gradient methods; this approach is called **Policy Gradient**.
  * As learned in Day3, "the value obtained by the action" comes in several variants: estimated by prediction, computed from actual results, and so on.
  * The value of an action minus the value of the state, i.e. the action's pure contribution, is called the **Advantage**.
  * The Advantage can be computed with the action value taken from actual results (Monte Carlo) and the state value from prediction (TD).
  * The method that treats the state value as the Critic and the policy as the Actor, learning with the Advantage, is called **Advantage Actor Critic (A2C)**.
  * Because Policy Gradient updates from "experience under the current policy", Experience Replay, which reuses past experience, cannot be used.
  * Policy Gradient is very delicate about how gradient updates are applied.
  * TRPO and PPO are methods that therefore learn (gradually) so that overly large updates do not occur.
* Value estimation or policy?
  * Value estimation has two drawbacks:
  * 1. Even if two actions have nearly equal values, only the "maximum" (the slightly larger one) is ever taken.
  * 2. It is hard to handle a large number of actions.
  * A policy assigns probabilities according to the size of the values and can also handle many (even continuous) actions.
  * (A2C outputs a probability per action, which is effectively the same as value estimation; there are two methods that overcome this.)
  * 1. Output a single best action, as in value estimation (Deterministic Policy Gradient, **DPG**).
  * 2. Output the parameters of the action distribution (mean, variance, etc.).
  * However, learning a policy tends to be less stable than learning value estimates.
  * As of 2018, it is also not well understood whether Policy Gradient learning really works as intended.
  * Existing methods can be classified as follows.

![Classification of reinforcement learning methods](doc/rl_ways.PNG)
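As a minimal sketch of what "turning the Q table into a parameterized function" looks like (the real implementations are under FN/; the layer sizes and shapes below are arbitrary assumptions):

```
import numpy as np
from tensorflow import keras as K

# Illustrative only: a state vector goes in, one estimated value per action comes out.
state_size, num_actions = 4, 2
model = K.Sequential([
    K.layers.Dense(16, activation="relu", input_shape=(state_size,)),
    K.layers.Dense(num_actions, activation="linear"),
])
model.compile(optimizer="adam", loss="mse")  # in practice trained toward TD targets

state = np.random.random((1, state_size))
print(model.predict(state))  # estimated value of each action in this state
```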

**Exercises**

* [How neural networks work](https://github.com/icoxfog417/baby-steps-of-rl-ja/tree/master/FN/nn_tutorial)
* [Implementing the value function with a neural network](https://github.com/icoxfog417/baby-steps-of-rl-ja/blob/master/FN/value_function_agent.py)
* [Implementing the value function with a DNN: DQN](https://github.com/icoxfog417/baby-steps-of-rl-ja/blob/master/FN/dqn_agent.py)
* [Implementing the policy with a neural network: Policy Gradient](https://github.com/icoxfog417/baby-steps-of-rl-ja/blob/master/FN/policy_gradient_agent.py)
* [Implementing the policy with a DNN: A2C](https://github.com/icoxfog417/baby-steps-of-rl-ja/blob/master/FN/a2c_agent.py)

## Day5: Weaknesses of reinforcement learning

**Day5's goals**

This day explains the weaknesses of reinforcement learning, in particular of deep reinforcement learning using neural networks. The weaknesses are the following three:

* Poor sample efficiency
* Often falls into locally optimal behavior or overfits
* Low reproducibility

**Summary**

* Poor sample efficiency
  * The Rainbow paper reports how much training is needed to reach human-level scores on Atari games.
  * According to it, even Rainbow needs roughly 166 hours of play time (at 30 fps).
* Often falls into locally optimal behavior or overfits
  * Local optimum: in a competitive game, learning a way to win only against a particular opponent.
  * Overfitting: learning what gamers would call cheat plays.
  * ...such things can happen.
* Low reproducibility
  * Results can change not only with hyperparameter settings but even from run to run.
* Countermeasures
  * Fundamental countermeasures are introduced in [Day6](https://github.com/icoxfog417/baby-steps-of-rl-ja#day6-%E5%BC%B7%E5%8C%96%E5%AD%A6%E7%BF%92%E3%81%AE%E5%BC%B1%E7%82%B9%E3%82%92%E5%85%8B%E6%9C%8D%E3%81%99%E3%82%8B%E3%81%9F%E3%82%81%E3%81%AE%E6%89%8B%E6%B3%95); Day5 introduces countermeasures that take these weaknesses as given.
  * The basic principle is "don't let a single training run go to waste" (a minimal logging sketch follows the figure below).
  * Because of "low reproducibility", multiple experiments are needed.
  * But because of "poor sample efficiency", training takes a lot of time.
  * A single experiment therefore consists of long runs times multiple executions, which naturally takes time.
  * You need to avoid, as much as possible, having to redo a time-consuming experiment because of a silly mistake.
  * You also want to extract as much information as possible from each experiment.
  * To achieve this, the implementations from Day4 onward do two things: "module separation" and "logging".

![Implementation framework for reinforcement learning](doc/train_architecture.PNG)
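In the spirit of "don't let a run go to waste", the logging meant here can be as small as appending per-episode results to a file so that separate runs can be compared later; the file name and columns below are arbitrary:

```
import csv

# Illustrative sketch: persist per-episode rewards so long runs are never lost.
def log_episode(path, episode, reward):
    with open(path, "a", newline="") as f:
        csv.writer(f).writerow([episode, reward])


for episode, reward in enumerate([0.0, 0.0, 1.0]):
    log_episode("training_log.csv", episode, reward)
```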

## Day6: Methods for overcoming the weaknesses of reinforcement learning

**Day6's goals**

Day6 explains fundamental remedies (algorithmic improvements) for the weaknesses introduced in Day5.

* Dealing with "poor sample efficiency"
* Dealing with "low reproducibility"
* Dealing with "often falls into locally optimal behavior or overfits"

**Summary**

* Dealing with "poor sample efficiency"
  * Many methods have been proposed to improve sample efficiency in reinforcement learning (see the figure below).
  * This book covers one of them, "improving environment perception".
  * Deep reinforcement learning deals with near-raw data such as screens (the kind of input humans receive).
  * The model therefore has to learn two things at once: "feature extraction from the input (screen)" and "how to act".
  * This can be seen as a cause of the low learning efficiency.
  * "Improving environment perception" supports the agent in obtaining information from the environment. There are two approaches (a small representation-learning sketch follows the figure below):
    * Combining with model-based methods: build a simulator (model) of the environment, making learning in an abstracted environment possible.
    * Representation learning: transform the state obtained from the environment into a representation that is easier to recognize.
  * This book introduces **Dyna** as the model-based combination and **World Models** as representation learning.

![Methods for improving sample efficiency](doc/sample_improve.PNG)
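A rough sketch of the representation-learning idea mentioned above (this is not the World Models implementation; the observation and code sizes are arbitrary): compress raw observations into a small code and let the agent learn from that code instead of the raw input.

```
from tensorflow import keras as K

# Illustrative autoencoder: raw observation -> small code -> reconstruction.
obs_size, code_size = 64, 8
encoder = K.Sequential([
    K.layers.Dense(code_size, activation="relu", input_shape=(obs_size,))
])
decoder = K.Sequential([
    K.layers.Dense(obs_size, activation="sigmoid", input_shape=(code_size,))
])
autoencoder = K.Sequential([encoder, decoder])
autoencoder.compile(optimizer="adam", loss="mse")
# autoencoder.fit(observations, observations, ...)   # train to reconstruct observations
# The agent would then learn from encoder.predict(observation) instead of raw pixels.
```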

* Dealing with "low reproducibility"
  * One factor behind the low reproducibility is that "training is unstable".
  * On this point, optimization algorithms different from gradient methods have recently been drawing attention.
  * One of them is **evolution strategies**.
  * Gradient methods take the approach of "gradually improving from an initial state".
  * Evolution strategies, by contrast, take the approach of "narrowing down from many candidates".
* Dealing with "often falls into locally optimal behavior or overfits"
  * A simple remedy is to "have a human guide the agent to some extent". There are two ways to do this (a small imitation-learning sketch follows the figure below):
    * **Imitation learning**: a human demonstrates examples, and the agent learns to act along those lines.
    * **Inverse reinforcement learning**: the reward is inferred backward from the demonstrations, and behavior is learned based on it.
  * Imitation learning resembles supervised learning, but it is hard to prepare demonstrations for every case.
  * (A demonstration of avoiding an accident, for example.)
  * The agent therefore needs to build on the demonstrations while also handling cases that were not demonstrated.
  * Inverse reinforcement learning goes through the learning process below (see the figure).
  * Ordinary reinforcement learning only needs process 3 (Train Agent under Reward Function).
  * Inverse reinforcement learning, however, then updates the reward function, retrains with the updated reward, and repeats.
  * Estimation therefore takes time, though several methods have been proposed to mitigate this.

![The inverse reinforcement learning process](doc/irl.png)
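For the imitation-learning idea above, its simplest form (behavioral cloning, which the DAgger exercise in IM/dagger.py builds on) can be sketched as supervised learning on expert state-action pairs; the data and network below are stand-ins, not the book's implementation:

```
import numpy as np
from tensorflow import keras as K

# Illustrative behavioral cloning: fit a policy to (state, expert action) pairs.
state_size, num_actions = 4, 2
expert_states = np.random.random((100, state_size))        # stand-in for demonstration states
expert_actions = np.random.randint(num_actions, size=100)  # stand-in for demonstrated actions

policy = K.Sequential([
    K.layers.Dense(16, activation="relu", input_shape=(state_size,)),
    K.layers.Dense(num_actions, activation="softmax"),
])
policy.compile(optimizer="adam", loss="sparse_categorical_crossentropy")
policy.fit(expert_states, expert_actions, epochs=5, verbose=0)

action = np.argmax(policy.predict(expert_states[:1]))      # imitate the expert's choice
```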

**Exercises**

* Dealing with "poor sample efficiency"
  * [Combining with model-based methods: Dyna](https://github.com/icoxfog417/baby-steps-of-rl-ja/tree/master/MM)
* Dealing with "low reproducibility"
  * [A new way of learning: evolution strategies](https://github.com/icoxfog417/baby-steps-of-rl-ja/tree/master/EV)
* Dealing with "often falls into locally optimal behavior or overfits"
  * [Imitation learning: DAgger](https://github.com/icoxfog417/baby-steps-of-rl-ja/tree/master/IM)
  * [Inverse reinforcement learning: MaxEntropy/Bayesian](https://github.com/icoxfog417/baby-steps-of-rl-ja/tree/master/IRL)


## Day7: Application areas of reinforcement learning

**Day7's goals**

* Understand the two patterns for applying reinforcement learning
* Know the research and real-world cases for each of the two patterns
* Know the tools/services that realize the two patterns

**Summary**

* The two patterns for applying reinforcement learning
  * Applications of reinforcement learning can be broadly divided into "optimizing behavior" and "optimizing learning" (see the figure below).
  * Optimizing behavior uses the behavior acquired through reinforcement learning as is.
  * Optimizing learning uses reinforcement learning's training process of "maximizing reward".
  * Research, case studies, tools, and services are introduced along these two categories.

![Patterns for applying reinforcement learning](doc/rl_application.PNG)

## Support Content

Reference content is provided for readers who are completely new to programming. Books for learning programming are plentiful these days, so feel free to supplement with those as well.

[python_exercises](https://github.com/icoxfog417/python_exercises)
-------------------------------------------------------------------------------- /doc/application.PNG: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/icoxfog417/baby-steps-of-rl-ja/1dadc208d6e9f50e010e6b5d2dcc5d9fc11eb51d/doc/application.PNG
-------------------------------------------------------------------------------- /doc/be.PNG: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/icoxfog417/baby-steps-of-rl-ja/1dadc208d6e9f50e010e6b5d2dcc5d9fc11eb51d/doc/be.PNG
-------------------------------------------------------------------------------- /doc/colab_a2c.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/icoxfog417/baby-steps-of-rl-ja/1dadc208d6e9f50e010e6b5d2dcc5d9fc11eb51d/doc/colab_a2c.png
-------------------------------------------------------------------------------- /doc/colab_dqn.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/icoxfog417/baby-steps-of-rl-ja/1dadc208d6e9f50e010e6b5d2dcc5d9fc11eb51d/doc/colab_dqn.png
-------------------------------------------------------------------------------- /doc/frozen_lake.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/icoxfog417/baby-steps-of-rl-ja/1dadc208d6e9f50e010e6b5d2dcc5d9fc11eb51d/doc/frozen_lake.png
-------------------------------------------------------------------------------- /doc/irl.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/icoxfog417/baby-steps-of-rl-ja/1dadc208d6e9f50e010e6b5d2dcc5d9fc11eb51d/doc/irl.png
-------------------------------------------------------------------------------- /doc/mdp.PNG: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/icoxfog417/baby-steps-of-rl-ja/1dadc208d6e9f50e010e6b5d2dcc5d9fc11eb51d/doc/mdp.PNG
-------------------------------------------------------------------------------- /doc/rl_application.PNG: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/icoxfog417/baby-steps-of-rl-ja/1dadc208d6e9f50e010e6b5d2dcc5d9fc11eb51d/doc/rl_application.PNG
-------------------------------------------------------------------------------- /doc/rl_ways.PNG: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/icoxfog417/baby-steps-of-rl-ja/1dadc208d6e9f50e010e6b5d2dcc5d9fc11eb51d/doc/rl_ways.PNG
-------------------------------------------------------------------------------- /doc/sample_improve.PNG: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/icoxfog417/baby-steps-of-rl-ja/1dadc208d6e9f50e010e6b5d2dcc5d9fc11eb51d/doc/sample_improve.PNG
-------------------------------------------------------------------------------- /doc/td.PNG: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/icoxfog417/baby-steps-of-rl-ja/1dadc208d6e9f50e010e6b5d2dcc5d9fc11eb51d/doc/td.PNG
-------------------------------------------------------------------------------- /doc/tradeoffs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/icoxfog417/baby-steps-of-rl-ja/1dadc208d6e9f50e010e6b5d2dcc5d9fc11eb51d/doc/tradeoffs.png -------------------------------------------------------------------------------- /doc/train_architecture.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/icoxfog417/baby-steps-of-rl-ja/1dadc208d6e9f50e010e6b5d2dcc5d9fc11eb51d/doc/train_architecture.PNG -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/icoxfog417/baby-steps-of-rl-ja/1dadc208d6e9f50e010e6b5d2dcc5d9fc11eb51d/environment.yml -------------------------------------------------------------------------------- /requirements-colab.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/icoxfog417/baby-steps-of-rl-ja/1dadc208d6e9f50e010e6b5d2dcc5d9fc11eb51d/requirements-colab.txt -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/icoxfog417/baby-steps-of-rl-ja/1dadc208d6e9f50e010e6b5d2dcc5d9fc11eb51d/requirements.txt -------------------------------------------------------------------------------- /welcome.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from tensorflow.python import keras as K 3 | import gym 4 | import gym_ple 5 | 6 | 7 | def welcome(): 8 | """ 9 | Code to check installation of basic libraries 10 | """ 11 | 12 | env = gym.make("Catcher-v0") 13 | num_action = env.action_space.n 14 | episode_count = 10 15 | 16 | s = env.reset() 17 | brain = K.Sequential() 18 | brain.add(K.layers.Dense(num_action, input_shape=[np.prod(s.shape)], 19 | activation="softmax")) 20 | 21 | def policy(s): 22 | evaluation = brain.predict(np.array([s.flatten()])) 23 | return np.argmax(evaluation) 24 | 25 | for e in range(episode_count): 26 | s = env.reset() 27 | done = False 28 | while not done: 29 | env.render(mode="human") 30 | a = policy(s) 31 | n_state, reward, done, info = env.step(a) 32 | s = n_state 33 | 34 | 35 | if __name__ == "__main__": 36 | welcome() 37 | --------------------------------------------------------------------------------