├── .gitignore ├── DP ├── Procfile ├── README.md ├── __init__.py ├── application.py ├── bellman_equation.py ├── environment.py ├── environment_demo.py ├── planner.py ├── requirements.txt ├── run_server.py ├── static │ ├── css │ │ └── index.css │ ├── images │ │ └── agent.png │ └── js │ │ └── index.js ├── templates │ ├── base.html │ └── index.html └── tests │ ├── __init__.py │ ├── test_environment.py │ └── test_planner.py ├── EL ├── __init__.py ├── actor_critic.py ├── compare_q_s.py ├── el_agent.py ├── epsilon_greedy.py ├── frozen_lake_util.py ├── monte_carlo.py ├── notebooks │ ├── Actor&Critic.ipynb │ ├── Epsilon&Greedy.ipynb │ ├── Monte Carlo.ipynb │ ├── Q-learning.ipynb │ └── SARSA.ipynb ├── q_learning.py └── sarsa.py ├── EV └── evolution.py ├── FN ├── __init__.py ├── a2c_agent.py ├── dqn_agent.py ├── fn_framework.py ├── nn_tutorial │ ├── explanation_keras.py │ ├── explanation_keras_batch.py │ ├── explanation_keras_boston.py │ ├── explanation_keras_mnist.py │ ├── explanation_tf.py │ ├── explanation_tf_batch.py │ └── gradient.py ├── policy_gradient_agent.py ├── policy_gradient_continuous_agent.py └── value_function_agent.py ├── IM └── dagger.py ├── IRL ├── backups │ ├── environment.py │ ├── irl_from_traj.py │ ├── linear.py │ ├── planner.py │ └── visualizer.py ├── bayesian.py ├── environment.py ├── maxent.py └── planner.py ├── ISSUE_TEMPLATE.md ├── LICENSE ├── MM └── dyna.py ├── README.md ├── doc ├── application.PNG ├── be.PNG ├── colab_a2c.png ├── colab_dqn.png ├── frozen_lake.png ├── irl.png ├── mdp.PNG ├── rl_application.PNG ├── rl_ways.PNG ├── sample_improve.PNG ├── td.PNG ├── tradeoffs.png └── train_architecture.PNG ├── environment.yml ├── requirements-colab.txt ├── requirements.txt └── welcome.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | .vscode 103 | .DS_Store 104 | FN/logs 105 | *.h5 106 | *.pkl 107 | /src 108 | -------------------------------------------------------------------------------- /DP/Procfile: -------------------------------------------------------------------------------- 1 | web: python run_server.py 2 | -------------------------------------------------------------------------------- /DP/README.md: -------------------------------------------------------------------------------- 1 | # Plan before Action: Dynamic Programming 2 | 3 | There are 3 programs are available. 4 | 5 | * To understand MDP: `environment.py` 6 | * `python environment_demo.py` 7 | * To understand Bellman Equation: `bellman_equation.py` 8 | * `python bellman_equation.py` 9 | * To understand Dynamic Programming: `planner.py` 10 | * `python run_server.py` 11 | * You can simulate Dynamic Programming online! 
12 | -------------------------------------------------------------------------------- /DP/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/icoxfog417/baby-steps-of-rl-ja/1dadc208d6e9f50e010e6b5d2dcc5d9fc11eb51d/DP/__init__.py -------------------------------------------------------------------------------- /DP/application.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tornado.web 3 | import tornado.escape 4 | from environment import Environment 5 | from planner import ValueIterationPlanner, PolicyIterationPlanner 6 | 7 | 8 | class IndexHandler(tornado.web.RequestHandler): 9 | 10 | def get(self): 11 | self.render("index.html") 12 | 13 | 14 | class PlanningHandler(tornado.web.RequestHandler): 15 | 16 | def post(self): 17 | data = tornado.escape.json_decode(self.request.body) 18 | grid = data["grid"] 19 | plan_type = data["plan"] 20 | move_prob = 0.8 # default value 21 | 22 | try: 23 | move_prob = float(data["prob"]) 24 | except ValueError: 25 | pass 26 | 27 | env = Environment(grid, move_prob=move_prob) 28 | if plan_type == "value": 29 | planner = ValueIterationPlanner(env) 30 | elif plan_type == "policy": 31 | planner = PolicyIterationPlanner(env) 32 | 33 | result = planner.plan() 34 | planner.log.append(result) 35 | self.write({"log": planner.log}) 36 | 37 | 38 | class Application(tornado.web.Application): 39 | 40 | def __init__(self): 41 | handlers = [ 42 | (r"/", IndexHandler), 43 | (r"/plan", PlanningHandler), 44 | ] 45 | 46 | settings = dict( 47 | template_path=os.path.join(os.path.dirname(__file__), "templates"), 48 | static_path=os.path.join(os.path.dirname(__file__), "static"), 49 | cookie_secret=os.environ.get("SECRET_TOKEN", "__TODO:_GENERATE_YOUR_OWN_RANDOM_VALUE_HERE__"), 50 | debug=True, 51 | ) 52 | 53 | super(Application, self).__init__(handlers, **settings) 54 | -------------------------------------------------------------------------------- /DP/bellman_equation.py: -------------------------------------------------------------------------------- 1 | def V(s, gamma=0.99): 2 | V = R(s) + gamma * max_V_on_next_state(s) 3 | return V 4 | 5 | 6 | def R(s): 7 | if s == "happy_end": 8 | return 1 9 | elif s == "bad_end": 10 | return -1 11 | else: 12 | return 0 13 | 14 | 15 | def max_V_on_next_state(s): 16 | # If game end, expected value is 0. 17 | if s in ["happy_end", "bad_end"]: 18 | return 0 19 | 20 | actions = ["up", "down"] 21 | values = [] 22 | for a in actions: 23 | transition_probs = transit_func(s, a) 24 | v = 0 25 | for next_state in transition_probs: 26 | prob = transition_probs[next_state] 27 | v += prob * V(next_state) 28 | values.append(v) 29 | return max(values) 30 | 31 | 32 | def transit_func(s, a): 33 | """ 34 | Make next state by adding action str to state. 
35 | ex: (s = 'state', a = 'up') => 'state_up' 36 | (s = 'state_up', a = 'down') => 'state_up_down' 37 | """ 38 | 39 | actions = s.split("_")[1:] 40 | LIMIT_GAME_COUNT = 5 41 | HAPPY_END_BORDER = 4 42 | MOVE_PROB = 0.9 43 | 44 | def next_state(state, action): 45 | return "_".join([state, action]) 46 | 47 | if len(actions) == LIMIT_GAME_COUNT: 48 | up_count = sum([1 if a == "up" else 0 for a in actions]) 49 | state = "happy_end" if up_count >= HAPPY_END_BORDER else "bad_end" 50 | prob = 1.0 51 | return {state: prob} 52 | else: 53 | opposite = "up" if a == "down" else "down" 54 | return { 55 | next_state(s, a): MOVE_PROB, 56 | next_state(s, opposite): 1 - MOVE_PROB 57 | } 58 | 59 | 60 | if __name__ == "__main__": 61 | print(V("state")) 62 | print(V("state_up_up")) 63 | print(V("state_down_down")) 64 | -------------------------------------------------------------------------------- /DP/environment.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | import numpy as np 3 | 4 | 5 | class State(): 6 | 7 | def __init__(self, row=-1, column=-1): 8 | self.row = row 9 | self.column = column 10 | 11 | def __repr__(self): 12 | return "".format(self.row, self.column) 13 | 14 | def clone(self): 15 | return State(self.row, self.column) 16 | 17 | def __hash__(self): 18 | return hash((self.row, self.column)) 19 | 20 | def __eq__(self, other): 21 | return self.row == other.row and self.column == other.column 22 | 23 | 24 | class Action(Enum): 25 | UP = 1 26 | DOWN = -1 27 | LEFT = 2 28 | RIGHT = -2 29 | 30 | 31 | class Environment(): 32 | 33 | def __init__(self, grid, move_prob=0.8): 34 | # grid is 2d-array. Its values are treated as an attribute. 35 | # Kinds of attribute is following. 36 | # 0: ordinary cell 37 | # -1: damage cell (game end) 38 | # 1: reward cell (game end) 39 | # 9: block cell (can't locate agent) 40 | self.grid = grid 41 | self.agent_state = State() 42 | 43 | # Default reward is minus. Just like a poison swamp. 44 | # It means the agent has to reach the goal fast! 45 | self.default_reward = -0.04 46 | 47 | # Agent can move to a selected direction in move_prob. 48 | # It means the agent will move different direction 49 | # in (1 - move_prob). 50 | self.move_prob = move_prob 51 | self.reset() 52 | 53 | @property 54 | def row_length(self): 55 | return len(self.grid) 56 | 57 | @property 58 | def column_length(self): 59 | return len(self.grid[0]) 60 | 61 | @property 62 | def actions(self): 63 | return [Action.UP, Action.DOWN, 64 | Action.LEFT, Action.RIGHT] 65 | 66 | @property 67 | def states(self): 68 | states = [] 69 | for row in range(self.row_length): 70 | for column in range(self.column_length): 71 | # Block cells are not included to the state. 72 | if self.grid[row][column] != 9: 73 | states.append(State(row, column)) 74 | return states 75 | 76 | def transit_func(self, state, action): 77 | transition_probs = {} 78 | if not self.can_action_at(state): 79 | # Already on the terminal cell. 
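            # An empty transition dict is the terminal-state signal: transit() below
            # returns (None, None, True) for it, so step() reports done without
            # moving the agent any further.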
80 | return transition_probs 81 | 82 | opposite_direction = Action(action.value * -1) 83 | 84 | for a in self.actions: 85 | prob = 0 86 | if a == action: 87 | prob = self.move_prob 88 | elif a != opposite_direction: 89 | prob = (1 - self.move_prob) / 2 90 | 91 | next_state = self._move(state, a) 92 | if next_state not in transition_probs: 93 | transition_probs[next_state] = prob 94 | else: 95 | transition_probs[next_state] += prob 96 | 97 | return transition_probs 98 | 99 | def can_action_at(self, state): 100 | if self.grid[state.row][state.column] == 0: 101 | return True 102 | else: 103 | return False 104 | 105 | def _move(self, state, action): 106 | if not self.can_action_at(state): 107 | raise Exception("Can't move from here!") 108 | 109 | next_state = state.clone() 110 | 111 | # Execute an action (move). 112 | if action == Action.UP: 113 | next_state.row -= 1 114 | elif action == Action.DOWN: 115 | next_state.row += 1 116 | elif action == Action.LEFT: 117 | next_state.column -= 1 118 | elif action == Action.RIGHT: 119 | next_state.column += 1 120 | 121 | # Check whether a state is out of the grid. 122 | if not (0 <= next_state.row < self.row_length): 123 | next_state = state 124 | if not (0 <= next_state.column < self.column_length): 125 | next_state = state 126 | 127 | # Check whether the agent bumped a block cell. 128 | if self.grid[next_state.row][next_state.column] == 9: 129 | next_state = state 130 | 131 | return next_state 132 | 133 | def reward_func(self, state): 134 | reward = self.default_reward 135 | done = False 136 | 137 | # Check an attribute of next state. 138 | attribute = self.grid[state.row][state.column] 139 | if attribute == 1: 140 | # Get reward! and the game ends. 141 | reward = 1 142 | done = True 143 | elif attribute == -1: 144 | # Get damage! and the game ends. 145 | reward = -1 146 | done = True 147 | 148 | return reward, done 149 | 150 | def reset(self): 151 | # Locate the agent at lower left corner. 152 | self.agent_state = State(self.row_length - 1, 0) 153 | return self.agent_state 154 | 155 | def step(self, action): 156 | next_state, reward, done = self.transit(self.agent_state, action) 157 | if next_state is not None: 158 | self.agent_state = next_state 159 | 160 | return next_state, reward, done 161 | 162 | def transit(self, state, action): 163 | transition_probs = self.transit_func(state, action) 164 | if len(transition_probs) == 0: 165 | return None, None, True 166 | 167 | next_states = [] 168 | probs = [] 169 | for s in transition_probs: 170 | next_states.append(s) 171 | probs.append(transition_probs[s]) 172 | 173 | next_state = np.random.choice(next_states, p=probs) 174 | reward, done = self.reward_func(next_state) 175 | return next_state, reward, done 176 | -------------------------------------------------------------------------------- /DP/environment_demo.py: -------------------------------------------------------------------------------- 1 | import random 2 | from environment import Environment 3 | 4 | 5 | class Agent(): 6 | 7 | def __init__(self, env): 8 | self.actions = env.actions 9 | 10 | def policy(self, state): 11 | return random.choice(self.actions) 12 | 13 | 14 | def main(): 15 | # Make grid environment. 16 | grid = [ 17 | [0, 0, 0, 1], 18 | [0, 9, 0, -1], 19 | [0, 0, 0, 0] 20 | ] 21 | env = Environment(grid) 22 | agent = Agent(env) 23 | 24 | # Try 10 game. 25 | for i in range(10): 26 | # Initialize position of agent. 
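        # Every step also adds Environment.default_reward (-0.04), so a long random
        # episode can finish with a negative total even if the goal cell is reached.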
27 | state = env.reset() 28 | total_reward = 0 29 | done = False 30 | 31 | while not done: 32 | action = agent.policy(state) 33 | next_state, reward, done = env.step(action) 34 | total_reward += reward 35 | state = next_state 36 | 37 | print("Episode {}: Agent gets {} reward.".format(i, total_reward)) 38 | 39 | 40 | if __name__ == "__main__": 41 | main() 42 | -------------------------------------------------------------------------------- /DP/planner.py: -------------------------------------------------------------------------------- 1 | class Planner(): 2 | 3 | def __init__(self, env): 4 | self.env = env 5 | self.log = [] 6 | 7 | def initialize(self): 8 | self.env.reset() 9 | self.log = [] 10 | 11 | def plan(self, gamma=0.9, threshold=0.0001): 12 | raise Exception("Planner have to implements plan method.") 13 | 14 | def transitions_at(self, state, action): 15 | transition_probs = self.env.transit_func(state, action) 16 | for next_state in transition_probs: 17 | prob = transition_probs[next_state] 18 | reward, _ = self.env.reward_func(next_state) 19 | yield prob, next_state, reward 20 | 21 | def dict_to_grid(self, state_reward_dict): 22 | grid = [] 23 | for i in range(self.env.row_length): 24 | row = [0] * self.env.column_length 25 | grid.append(row) 26 | for s in state_reward_dict: 27 | grid[s.row][s.column] = state_reward_dict[s] 28 | 29 | return grid 30 | 31 | 32 | class ValueIterationPlanner(Planner): 33 | 34 | def __init__(self, env): 35 | super().__init__(env) 36 | 37 | def plan(self, gamma=0.9, threshold=0.0001): 38 | self.initialize() 39 | actions = self.env.actions 40 | V = {} 41 | for s in self.env.states: 42 | # Initialize each state's expected reward. 43 | V[s] = 0 44 | 45 | while True: 46 | delta = 0 47 | self.log.append(self.dict_to_grid(V)) 48 | for s in V: 49 | if not self.env.can_action_at(s): 50 | continue 51 | expected_rewards = [] 52 | for a in actions: 53 | r = 0 54 | for prob, next_state, reward in self.transitions_at(s, a): 55 | r += prob * (reward + gamma * V[next_state]) 56 | expected_rewards.append(r) 57 | max_reward = max(expected_rewards) 58 | delta = max(delta, abs(max_reward - V[s])) 59 | V[s] = max_reward 60 | 61 | if delta < threshold: 62 | break 63 | 64 | V_grid = self.dict_to_grid(V) 65 | return V_grid 66 | 67 | 68 | class PolicyIterationPlanner(Planner): 69 | 70 | def __init__(self, env): 71 | super().__init__(env) 72 | self.policy = {} 73 | 74 | def initialize(self): 75 | super().initialize() 76 | self.policy = {} 77 | actions = self.env.actions 78 | states = self.env.states 79 | for s in states: 80 | self.policy[s] = {} 81 | for a in actions: 82 | # Initialize policy. 83 | # At first, each action is taken uniformly. 84 | self.policy[s][a] = 1 / len(actions) 85 | 86 | def estimate_by_policy(self, gamma, threshold): 87 | V = {} 88 | for s in self.env.states: 89 | # Initialize each state's expected reward. 
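            # The while loop below performs iterative policy evaluation:
            #   V(s) <- sum_a pi(a|s) * sum_s' T(s'|s,a) * (R(s') + gamma * V(s'))
            # sweeping all states until the largest change (delta) drops below threshold.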
90 | V[s] = 0 91 | 92 | while True: 93 | delta = 0 94 | for s in V: 95 | expected_rewards = [] 96 | for a in self.policy[s]: 97 | action_prob = self.policy[s][a] 98 | r = 0 99 | for prob, next_state, reward in self.transitions_at(s, a): 100 | r += action_prob * prob * \ 101 | (reward + gamma * V[next_state]) 102 | expected_rewards.append(r) 103 | value = sum(expected_rewards) 104 | delta = max(delta, abs(value - V[s])) 105 | V[s] = value 106 | if delta < threshold: 107 | break 108 | 109 | return V 110 | 111 | def plan(self, gamma=0.9, threshold=0.0001): 112 | self.initialize() 113 | states = self.env.states 114 | actions = self.env.actions 115 | 116 | def take_max_action(action_value_dict): 117 | return max(action_value_dict, key=action_value_dict.get) 118 | 119 | while True: 120 | update_stable = True 121 | # Estimate expected rewards under current policy. 122 | V = self.estimate_by_policy(gamma, threshold) 123 | self.log.append(self.dict_to_grid(V)) 124 | 125 | for s in states: 126 | # Get an action following to the current policy. 127 | policy_action = take_max_action(self.policy[s]) 128 | 129 | # Compare with other actions. 130 | action_rewards = {} 131 | for a in actions: 132 | r = 0 133 | for prob, next_state, reward in self.transitions_at(s, a): 134 | r += prob * (reward + gamma * V[next_state]) 135 | action_rewards[a] = r 136 | best_action = take_max_action(action_rewards) 137 | if policy_action != best_action: 138 | update_stable = False 139 | 140 | # Update policy (set best_action prob=1, otherwise=0 (greedy)) 141 | for a in self.policy[s]: 142 | prob = 1 if a == best_action else 0 143 | self.policy[s][a] = prob 144 | 145 | if update_stable: 146 | # If policy isn't updated, stop iteration 147 | break 148 | 149 | # Turn dictionary to grid 150 | V_grid = self.dict_to_grid(V) 151 | return V_grid 152 | -------------------------------------------------------------------------------- /DP/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/icoxfog417/baby-steps-of-rl-ja/1dadc208d6e9f50e010e6b5d2dcc5d9fc11eb51d/DP/requirements.txt -------------------------------------------------------------------------------- /DP/run_server.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tornado.ioloop 3 | from tornado.options import define, options, parse_command_line 4 | from application import Application 5 | 6 | 7 | define("port", default=8888, help="run on the given port", type=int) 8 | 9 | 10 | def main(): 11 | parse_command_line() 12 | app = Application() 13 | port = int(os.environ.get("PORT", 8888)) 14 | app.listen(port) 15 | print("Run server on port: {}".format(port)) 16 | tornado.ioloop.IOLoop.current().start() 17 | 18 | 19 | if __name__ == "__main__": 20 | main() 21 | -------------------------------------------------------------------------------- /DP/static/css/index.css: -------------------------------------------------------------------------------- 1 | body { 2 | font-family:-apple-system, BlinkMacSystemFont, "Helvetica Neue", "Segoe UI","Noto Sans Japanese","ヒラギノ角ゴ ProN W3", Meiryo, sans-serif; 3 | } 4 | .title-top{ 5 | margin-top: 20px; 6 | } 7 | .cell{ 8 | width: 80px; 9 | height: 80px; 10 | border: 1px solid silver; 11 | float: left; 12 | } 13 | .cell-content{ 14 | width: 100%; 15 | height: 100%; 16 | text-align: center; 17 | line-height: 80px; 18 | } 19 | .cell-content.active{ 20 | background-color: whitesmoke; 21 | } 22 | 
.cell-content.treasure{ 23 | background-color: #00d1b2; 24 | } 25 | .cell-content.danger{ 26 | background-color: #ff3860; 27 | } 28 | .cell-content.block{ 29 | background-color: #363636; 30 | } 31 | .cell-content.agent{ 32 | width: 60px; 33 | height: 60px; 34 | margin: auto; 35 | margin-top: 10px; 36 | background: url(/static/images/agent.png); 37 | background-size: 60px; 38 | } 39 | .cell-content.v5{ 40 | background-color: rgba(0, 209, 178, 0.8); 41 | } 42 | .cell-content.v4{ 43 | background-color: rgba(0, 209, 178, 0.6); 44 | } 45 | .cell-content.v3{ 46 | background-color: rgba(0, 209, 178, 0.3); 47 | } 48 | .cell-content.v2{ 49 | background-color: rgba(0, 209, 178, 0.1); 50 | } 51 | .cell-content.v1{ 52 | background-color: rgba(0, 209, 178, 0); 53 | } 54 | -------------------------------------------------------------------------------- /DP/static/images/agent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/icoxfog417/baby-steps-of-rl-ja/1dadc208d6e9f50e010e6b5d2dcc5d9fc11eb51d/DP/static/images/agent.png -------------------------------------------------------------------------------- /DP/static/js/index.js: -------------------------------------------------------------------------------- 1 | Vue.config.debug = true; 2 | var app = new Vue({ 3 | el: "#app", 4 | delimiters: ["[[", "]]"], 5 | data: { 6 | row: 3, 7 | column: 4, 8 | moveProb: 0.8, 9 | grid: [], 10 | selectedIndex: null, 11 | simulation: false, 12 | log: [], 13 | logIndex: 0 14 | }, 15 | created: function(){ 16 | this.draw(); 17 | this.selectedIndex = [0, 3]; 18 | this.setTreasure(); 19 | this.selectedIndex = [1, 3]; 20 | this.setDanger(); 21 | this.selectedIndex = [1, 1]; 22 | this.setBlock(); 23 | }, 24 | computed: { 25 | targetGrid: function () { 26 | if(!this.simulation){ 27 | return this.grid; 28 | }else{ 29 | return this.log[this.logIndex]; 30 | } 31 | }, 32 | hasLog: function(){ 33 | if(this.log.length > 0){ 34 | return true; 35 | }else{ 36 | return false; 37 | } 38 | } 39 | }, 40 | methods: { 41 | init: function(){ 42 | this.selectedIndex = null; 43 | this.simulation = false; 44 | this.logIndex = 0; 45 | this.log = []; 46 | }, 47 | draw: function(){ 48 | this.init(); 49 | this.makeGrid(); 50 | }, 51 | makeGrid: function(){ 52 | this.grid = []; 53 | var size = this.row * this.column; 54 | for(var i = 0; i < size; i++){ 55 | var rowIndex = Math.floor(i / this.column); 56 | var columnIndex = i % this.column; 57 | if(columnIndex == 0){ 58 | this.grid.push([]); 59 | } 60 | var cellAttribute = 0; 61 | this.grid[rowIndex].push(cellAttribute); 62 | } 63 | }, 64 | getCellAttribute: function(row, column){ 65 | var attribute = this.grid[row][column]; 66 | switch(attribute){ 67 | case 1: 68 | return "treasure" 69 | case -1: 70 | return "danger" 71 | case 9: 72 | return "block" 73 | } 74 | if(this.selectedIndex != null && (this.selectedIndex[0] == row && this.selectedIndex[1] == column)){ 75 | return "active" 76 | } 77 | if(row == (this.grid.length - 1) && column == 0){ 78 | return "agent" 79 | } 80 | if(this.simulation){ 81 | var value = this.log[this.logIndex][row][column]; 82 | if(value >= 0.8){ 83 | return "v5" 84 | }else if(value >= 0.6){ 85 | return "v4" 86 | }else if(value >= 0.3){ 87 | return "v3" 88 | }else if(value >= 0.1){ 89 | return "v2" 90 | }else{ 91 | return "v1" 92 | } 93 | } 94 | }, 95 | plan: function(planType){ 96 | var data = { 97 | "plan": planType, 98 | "prob": this.moveProb, 99 | "grid": this.grid 100 | } 101 | var self = this; 102 | 
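            // POST the grid, plan type and move probability to /plan; the Tornado
            // handler replies with {"log": [...]}, one value grid per iteration,
            // which play() below animates one frame per second.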
fetch("/plan", { 103 | method: "POST", 104 | credentials: "same-origin", 105 | headers: { 106 | "Content-Type": "application/json" 107 | }, 108 | body: JSON.stringify(data) 109 | }).then(function(resp){ 110 | return resp.json() 111 | }).then(function(resp){ 112 | self.log = resp["log"]; 113 | self.play(); 114 | }) 115 | }, 116 | play: function(){ 117 | this.logIndex = 0; 118 | this.simulation = true; 119 | var self = this; 120 | var timer = setInterval(function(){ 121 | if(self.logIndex < self.log.length - 1){ 122 | self.logIndex += 1; 123 | }else{ 124 | clearInterval(timer); 125 | } 126 | }, 1000); 127 | }, 128 | stop: function(){ 129 | this.init(); 130 | }, 131 | value: function(row, column){ 132 | var attribute = this.grid[row][column]; 133 | if(attribute != 0 || (row == (this.grid.length -1) && column == 0)){ 134 | return ""; 135 | } 136 | var value = this.log[this.logIndex][row][column]; 137 | var value = Math.floor(value * 1000) / 1000; 138 | return value; 139 | }, 140 | selectCell: function(row, column){ 141 | // [row, 0] is Agent point 142 | if(!(row == (this.grid.length - 1) && column == 0)){ 143 | this.selectedIndex = [row, column]; 144 | } 145 | }, 146 | setTreasure: function(){ 147 | this.setAttribute(1); 148 | }, 149 | setDanger: function(){ 150 | this.setAttribute(-1); 151 | }, 152 | setBlock: function(){ 153 | this.setAttribute(9); 154 | }, 155 | clearAttribute: function(row, column){ 156 | if(this.simulation){ 157 | this.init(); 158 | } 159 | this.selectedIndex = [row, column]; 160 | this.setAttribute(0); 161 | }, 162 | setAttribute: function(attribute){ 163 | var index = this.selectedIndex; 164 | if(this.selectedIndex != null){ 165 | this.grid[index[0]][index[1]] = attribute; 166 | this.selectedIndex = null; 167 | } 168 | } 169 | } 170 | }) 171 | -------------------------------------------------------------------------------- /DP/templates/base.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | MDP by Dynamic Programming Demo 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | {% block head %}{% end %} 15 | 16 | 17 | {% block body %}{% end %} 18 | {% block bottom %}{% end %} 19 | 20 | 21 | -------------------------------------------------------------------------------- /DP/templates/index.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block head %} 4 | 5 | 6 | {% end %} 7 | 8 | {% block bottom %} 9 | 10 | {% end %} 11 | 12 | {% block body %} 13 |
14 | ... 102 | [body markup of index.html was stripped during extraction; only the text labels survive: the "Dynamic Programming Simulator" heading, grid cells bound to [[value(rowIndex, columnIndex)]], controls labelled "Area (Row x Column)", "Cell Setting", "Move Prob" and "Simulation", and a "Result" panel reporting "[[log.length]] iterations have done to converge."]
103 | {% end %} 104 | -------------------------------------------------------------------------------- /DP/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/icoxfog417/baby-steps-of-rl-ja/1dadc208d6e9f50e010e6b5d2dcc5d9fc11eb51d/DP/tests/__init__.py -------------------------------------------------------------------------------- /DP/tests/test_environment.py: -------------------------------------------------------------------------------- 1 | import random 2 | import unittest 3 | from DP.environment import Environment 4 | 5 | 6 | class TestEnvironment(unittest.TestCase): 7 | 8 | def test_run_environment(self): 9 | grid = self.get_sample_grid() 10 | env = Environment(grid) 11 | for i in range(100): 12 | state = env.reset() # initialize agent position 13 | self.assertEqual(state.row, len(env.grid) - 1) 14 | self.assertEqual(state.column, 0) 15 | goal = False 16 | for t in range(10): 17 | action = random.choice(env.actions) 18 | state, reward, done = env.step(action) 19 | self.assertTrue(0 <= state.row < len(env.grid)) 20 | self.assertTrue(0 <= state.column < len(env.grid[0])) 21 | if done: 22 | print("Episode {}: get reward {}, {} timesteps".format( 23 | i, reward, t + 1)) 24 | goal = True 25 | break 26 | if not goal: 27 | print("Episode {}: no reward".format(i)) 28 | 29 | def get_sample_grid(self): 30 | # 3 x 4 grid 31 | grid = [ 32 | [0, 0, 0, 1], 33 | [0, 9, 0, -1], 34 | [0, 0, 0, 0] 35 | ] 36 | return grid 37 | -------------------------------------------------------------------------------- /DP/tests/test_planner.py: -------------------------------------------------------------------------------- 1 | import random 2 | import unittest 3 | from DP.environment import Environment 4 | from DP.planner import ValueIterationPlanner, PolicyIterationPlanner 5 | 6 | 7 | class TestPlanner(unittest.TestCase): 8 | 9 | def test_value_iteration(self): 10 | grid = self.get_sample_grid() 11 | env = Environment(grid) 12 | planner = ValueIterationPlanner(env) 13 | result = planner.plan() 14 | print("Value Iteration") 15 | for r in result: 16 | print(r) 17 | 18 | def test_policy_iteration(self): 19 | grid = self.get_sample_grid() 20 | env = Environment(grid) 21 | planner = PolicyIterationPlanner(env) 22 | result = planner.plan() 23 | print("Policy Iteration") 24 | for r in result: 25 | print(r) 26 | 27 | def get_sample_grid(self): 28 | # 3 x 4 grid 29 | grid = [ 30 | [0, 0, 0, 1], 31 | [0, 9, 0, -1], 32 | [0, 0, 0, 0] 33 | ] 34 | return grid 35 | -------------------------------------------------------------------------------- /EL/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/icoxfog417/baby-steps-of-rl-ja/1dadc208d6e9f50e010e6b5d2dcc5d9fc11eb51d/EL/__init__.py -------------------------------------------------------------------------------- /EL/actor_critic.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from el_agent import ELAgent 4 | from frozen_lake_util import show_q_value 5 | 6 | 7 | class Actor(ELAgent): 8 | 9 | def __init__(self, env): 10 | super().__init__(epsilon=-1) 11 | nrow = env.observation_space.n 12 | ncol = env.action_space.n 13 | self.actions = list(range(env.action_space.n)) 14 | self.Q = np.random.uniform(0, 1, nrow * ncol).reshape((nrow, ncol)) 15 | 16 | def softmax(self, x): 17 | return np.exp(x) / np.sum(np.exp(x), axis=0) 18 | 19 | def 
policy(self, s): 20 | a = np.random.choice(self.actions, 1, 21 | p=self.softmax(self.Q[s])) 22 | return a[0] 23 | 24 | 25 | class Critic(): 26 | 27 | def __init__(self, env): 28 | states = env.observation_space.n 29 | self.V = np.zeros(states) 30 | 31 | 32 | class ActorCritic(): 33 | 34 | def __init__(self, actor_class, critic_class): 35 | self.actor_class = actor_class 36 | self.critic_class = critic_class 37 | 38 | def train(self, env, episode_count=1000, gamma=0.9, 39 | learning_rate=0.1, render=False, report_interval=50): 40 | actor = self.actor_class(env) 41 | critic = self.critic_class(env) 42 | 43 | actor.init_log() 44 | for e in range(episode_count): 45 | s = env.reset() 46 | done = False 47 | while not done: 48 | if render: 49 | env.render() 50 | a = actor.policy(s) 51 | n_state, reward, done, info = env.step(a) 52 | 53 | gain = reward + gamma * critic.V[n_state] 54 | estimated = critic.V[s] 55 | td = gain - estimated 56 | actor.Q[s][a] += learning_rate * td 57 | critic.V[s] += learning_rate * td 58 | s = n_state 59 | 60 | else: 61 | actor.log(reward) 62 | 63 | if e != 0 and e % report_interval == 0: 64 | actor.show_reward_log(episode=e) 65 | 66 | return actor, critic 67 | 68 | 69 | def train(): 70 | trainer = ActorCritic(Actor, Critic) 71 | env = gym.make("FrozenLakeEasy-v0") 72 | actor, critic = trainer.train(env, episode_count=3000) 73 | show_q_value(actor.Q) 74 | actor.show_reward_log() 75 | 76 | 77 | if __name__ == "__main__": 78 | train() 79 | -------------------------------------------------------------------------------- /EL/compare_q_s.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Pool 2 | from collections import defaultdict 3 | import gym 4 | from el_agent import ELAgent 5 | from frozen_lake_util import show_q_value 6 | 7 | 8 | class CompareAgent(ELAgent): 9 | 10 | def __init__(self, q_learning=True, epsilon=0.33): 11 | self.q_learning = q_learning 12 | super().__init__(epsilon) 13 | 14 | def learn(self, env, episode_count=1000, gamma=0.9, 15 | learning_rate=0.1, render=False, report_interval=50): 16 | self.init_log() 17 | self.Q = defaultdict(lambda: [0] * len(actions)) 18 | actions = list(range(env.action_space.n)) 19 | for e in range(episode_count): 20 | s = env.reset() 21 | done = False 22 | a = self.policy(s, actions) 23 | while not done: 24 | if render: 25 | env.render() 26 | 27 | n_state, reward, done, info = env.step(a) 28 | 29 | if done and reward == 0: 30 | reward = -0.5 # Reward as penalty 31 | 32 | n_action = self.policy(n_state, actions) 33 | 34 | if self.q_learning: 35 | gain = reward + gamma * max(self.Q[n_state]) 36 | else: 37 | gain = reward + gamma * self.Q[n_state][n_action] 38 | 39 | estimated = self.Q[s][a] 40 | self.Q[s][a] += learning_rate * (gain - estimated) 41 | s = n_state 42 | 43 | if self.q_learning: 44 | a = self.policy(s, actions) 45 | else: 46 | a = n_action 47 | else: 48 | self.log(reward) 49 | 50 | if e != 0 and e % report_interval == 0: 51 | self.show_reward_log(episode=e) 52 | 53 | 54 | def train(q_learning): 55 | env = gym.make("FrozenLakeEasy-v0") 56 | agent = CompareAgent(q_learning=q_learning) 57 | agent.learn(env, episode_count=3000) 58 | return dict(agent.Q) 59 | 60 | 61 | if __name__ == "__main__": 62 | with Pool() as pool: 63 | results = pool.map(train, ([True, False])) 64 | for r in results: 65 | show_q_value(r) 66 | -------------------------------------------------------------------------------- /EL/el_agent.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | 5 | class ELAgent(): 6 | 7 | def __init__(self, epsilon): 8 | self.Q = {} 9 | self.epsilon = epsilon 10 | self.reward_log = [] 11 | 12 | def policy(self, s, actions): 13 | if np.random.random() < self.epsilon: 14 | return np.random.randint(len(actions)) 15 | else: 16 | if s in self.Q and sum(self.Q[s]) != 0: 17 | return np.argmax(self.Q[s]) 18 | else: 19 | return np.random.randint(len(actions)) 20 | 21 | def init_log(self): 22 | self.reward_log = [] 23 | 24 | def log(self, reward): 25 | self.reward_log.append(reward) 26 | 27 | def show_reward_log(self, interval=50, episode=-1): 28 | if episode > 0: 29 | rewards = self.reward_log[-interval:] 30 | mean = np.round(np.mean(rewards), 3) 31 | std = np.round(np.std(rewards), 3) 32 | print("At Episode {} average reward is {} (+/-{}).".format( 33 | episode, mean, std)) 34 | else: 35 | indices = list(range(0, len(self.reward_log), interval)) 36 | means = [] 37 | stds = [] 38 | for i in indices: 39 | rewards = self.reward_log[i:(i + interval)] 40 | means.append(np.mean(rewards)) 41 | stds.append(np.std(rewards)) 42 | means = np.array(means) 43 | stds = np.array(stds) 44 | plt.figure() 45 | plt.title("Reward History") 46 | plt.grid() 47 | plt.fill_between(indices, means - stds, means + stds, 48 | alpha=0.1, color="g") 49 | plt.plot(indices, means, "o-", color="g", 50 | label="Rewards for each {} episode".format(interval)) 51 | plt.legend(loc="best") 52 | plt.show() 53 | -------------------------------------------------------------------------------- /EL/epsilon_greedy.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | 4 | 5 | class CoinToss(): 6 | 7 | def __init__(self, head_probs, max_episode_steps=30): 8 | self.head_probs = head_probs 9 | self.max_episode_steps = max_episode_steps 10 | self.toss_count = 0 11 | 12 | def __len__(self): 13 | return len(self.head_probs) 14 | 15 | def reset(self): 16 | self.toss_count = 0 17 | 18 | def step(self, action): 19 | final = self.max_episode_steps - 1 20 | if self.toss_count > final: 21 | raise Exception("The step count exceeded maximum. \ 22 | Please reset env.") 23 | else: 24 | done = True if self.toss_count == final else False 25 | 26 | if action >= len(self.head_probs): 27 | raise Exception("The No.{} coin doesn't exist.".format(action)) 28 | else: 29 | head_prob = self.head_probs[action] 30 | if random.random() < head_prob: 31 | reward = 1.0 32 | else: 33 | reward = 0.0 34 | self.toss_count += 1 35 | return reward, done 36 | 37 | 38 | class EpsilonGreedyAgent(): 39 | 40 | def __init__(self, epsilon): 41 | self.epsilon = epsilon 42 | self.V = [] 43 | 44 | def policy(self): 45 | coins = range(len(self.V)) 46 | if random.random() < self.epsilon: 47 | return random.choice(coins) 48 | else: 49 | return np.argmax(self.V) 50 | 51 | def play(self, env): 52 | # Initialize estimation. 
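        # N[i] counts how often coin i has been tossed and V[i] is its running
        # average reward. The update below is the incremental mean
        #   new_average = (old_average * n + reward) / (n + 1)
        # which is equivalent to V <- V + (reward - V) / (n + 1).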
53 | N = [0] * len(env) 54 | self.V = [0] * len(env) 55 | 56 | env.reset() 57 | done = False 58 | rewards = [] 59 | while not done: 60 | selected_coin = self.policy() 61 | reward, done = env.step(selected_coin) 62 | rewards.append(reward) 63 | 64 | n = N[selected_coin] 65 | coin_average = self.V[selected_coin] 66 | new_average = (coin_average * n + reward) / (n + 1) 67 | N[selected_coin] += 1 68 | self.V[selected_coin] = new_average 69 | 70 | return rewards 71 | 72 | 73 | if __name__ == "__main__": 74 | import pandas as pd 75 | import matplotlib.pyplot as plt 76 | 77 | def main(): 78 | env = CoinToss([0.1, 0.5, 0.1, 0.9, 0.1]) 79 | epsilons = [0.0, 0.1, 0.2, 0.5, 0.8] 80 | game_steps = list(range(10, 310, 10)) 81 | result = {} 82 | for e in epsilons: 83 | agent = EpsilonGreedyAgent(epsilon=e) 84 | means = [] 85 | for s in game_steps: 86 | env.max_episode_steps = s 87 | rewards = agent.play(env) 88 | means.append(np.mean(rewards)) 89 | result["epsilon={}".format(e)] = means 90 | result["coin toss count"] = game_steps 91 | result = pd.DataFrame(result) 92 | result.set_index("coin toss count", drop=True, inplace=True) 93 | result.plot.line(figsize=(10, 5)) 94 | plt.show() 95 | 96 | main() 97 | -------------------------------------------------------------------------------- /EL/frozen_lake_util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import matplotlib.cm as cm 4 | import gym 5 | from gym.envs.registration import register 6 | register(id="FrozenLakeEasy-v0", entry_point="gym.envs.toy_text:FrozenLakeEnv", 7 | kwargs={"is_slippery": False}) 8 | 9 | 10 | def show_q_value(Q): 11 | """ 12 | Show Q-values for FrozenLake-v0. 13 | To show each action's evaluation, 14 | a state is shown as 3 x 3 matrix like following. 15 | 16 | +---+---+---+ 17 | | | u | | u: up value 18 | | l | m | r | l: left value, r: right value, m: mean value 19 | | | d | | d: down value 20 | +---+---+---+ 21 | """ 22 | env = gym.make("FrozenLake-v0") 23 | nrow = env.unwrapped.nrow 24 | ncol = env.unwrapped.ncol 25 | state_size = 3 26 | q_nrow = nrow * state_size 27 | q_ncol = ncol * state_size 28 | reward_map = np.zeros((q_nrow, q_ncol)) 29 | 30 | for r in range(nrow): 31 | for c in range(ncol): 32 | s = r * ncol + c 33 | state_exist = False 34 | if isinstance(Q, dict) and s in Q: 35 | state_exist = True 36 | elif isinstance(Q, (np.ndarray, np.generic)) and s < Q.shape[0]: 37 | state_exist = True 38 | 39 | if state_exist: 40 | # At the display map, the vertical index is reversed. 
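                # Each state s = r * ncol + c becomes a 3 x 3 block centred at (_r, _c);
                # the surrounding cells receive the left/down/right/up action values and
                # the centre their mean, as sketched in the docstring. The row index is
                # flipped (nrow - 1 - r) to match the plot's y-axis direction.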
41 | _r = 1 + (nrow - 1 - r) * state_size 42 | _c = 1 + c * state_size 43 | reward_map[_r][_c - 1] = Q[s][0] # LEFT = 0 44 | reward_map[_r - 1][_c] = Q[s][1] # DOWN = 1 45 | reward_map[_r][_c + 1] = Q[s][2] # RIGHT = 2 46 | reward_map[_r + 1][_c] = Q[s][3] # UP = 3 47 | reward_map[_r][_c] = np.mean(Q[s]) # Center 48 | 49 | fig = plt.figure() 50 | ax = fig.add_subplot(1, 1, 1) 51 | plt.imshow(reward_map, cmap=cm.RdYlGn, interpolation="bilinear", 52 | vmax=abs(reward_map).max(), vmin=-abs(reward_map).max()) 53 | ax.set_xlim(-0.5, q_ncol - 0.5) 54 | ax.set_ylim(-0.5, q_nrow - 0.5) 55 | ax.set_xticks(np.arange(-0.5, q_ncol, state_size)) 56 | ax.set_yticks(np.arange(-0.5, q_nrow, state_size)) 57 | ax.set_xticklabels(range(ncol + 1)) 58 | ax.set_yticklabels(range(nrow + 1)) 59 | ax.grid(which="both") 60 | plt.show() 61 | -------------------------------------------------------------------------------- /EL/monte_carlo.py: -------------------------------------------------------------------------------- 1 | import math 2 | from collections import defaultdict 3 | import gym 4 | from el_agent import ELAgent 5 | from frozen_lake_util import show_q_value 6 | 7 | 8 | class MonteCarloAgent(ELAgent): 9 | 10 | def __init__(self, epsilon=0.1): 11 | super().__init__(epsilon) 12 | 13 | def learn(self, env, episode_count=1000, gamma=0.9, 14 | render=False, report_interval=50): 15 | self.init_log() 16 | actions = list(range(env.action_space.n)) 17 | self.Q = defaultdict(lambda: [0] * len(actions)) 18 | N = defaultdict(lambda: [0] * len(actions)) 19 | 20 | for e in range(episode_count): 21 | s = env.reset() 22 | done = False 23 | # Play until the end of episode. 24 | experience = [] 25 | while not done: 26 | if render: 27 | env.render() 28 | a = self.policy(s, actions) 29 | n_state, reward, done, info = env.step(a) 30 | experience.append({"state": s, "action": a, "reward": reward}) 31 | s = n_state 32 | else: 33 | self.log(reward) 34 | 35 | # Evaluate each state, action. 36 | for i, x in enumerate(experience): 37 | s, a = x["state"], x["action"] 38 | 39 | # Calculate discounted future reward of s. 
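                # G is the Monte Carlo return observed from step i to the episode end:
                #   G = r_i + gamma * r_{i+1} + gamma^2 * r_{i+2} + ...
                # Q[s][a] is then nudged toward G with step size alpha = 1 / N[s][a],
                # i.e. a running average over every visit to the (s, a) pair.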
40 | G, t = 0, 0 41 | for j in range(i, len(experience)): 42 | G += math.pow(gamma, t) * experience[j]["reward"] 43 | t += 1 44 | 45 | N[s][a] += 1 # count of s, a pair 46 | alpha = 1 / N[s][a] 47 | self.Q[s][a] += alpha * (G - self.Q[s][a]) 48 | 49 | if e != 0 and e % report_interval == 0: 50 | self.show_reward_log(episode=e) 51 | 52 | 53 | def train(): 54 | agent = MonteCarloAgent(epsilon=0.1) 55 | env = gym.make("FrozenLakeEasy-v0") 56 | agent.learn(env, episode_count=500) 57 | show_q_value(agent.Q) 58 | agent.show_reward_log() 59 | 60 | 61 | if __name__ == "__main__": 62 | train() 63 | -------------------------------------------------------------------------------- /EL/q_learning.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import gym 3 | from el_agent import ELAgent 4 | from frozen_lake_util import show_q_value 5 | 6 | 7 | class QLearningAgent(ELAgent): 8 | 9 | def __init__(self, epsilon=0.1): 10 | super().__init__(epsilon) 11 | 12 | def learn(self, env, episode_count=1000, gamma=0.9, 13 | learning_rate=0.1, render=False, report_interval=50): 14 | self.init_log() 15 | actions = list(range(env.action_space.n)) 16 | self.Q = defaultdict(lambda: [0] * len(actions)) 17 | for e in range(episode_count): 18 | s = env.reset() 19 | done = False 20 | while not done: 21 | if render: 22 | env.render() 23 | a = self.policy(s, actions) 24 | n_state, reward, done, info = env.step(a) 25 | 26 | gain = reward + gamma * max(self.Q[n_state]) 27 | estimated = self.Q[s][a] 28 | self.Q[s][a] += learning_rate * (gain - estimated) 29 | s = n_state 30 | 31 | else: 32 | self.log(reward) 33 | 34 | if e != 0 and e % report_interval == 0: 35 | self.show_reward_log(episode=e) 36 | 37 | 38 | def train(): 39 | agent = QLearningAgent() 40 | env = gym.make("FrozenLakeEasy-v0") 41 | agent.learn(env, episode_count=500) 42 | show_q_value(agent.Q) 43 | agent.show_reward_log() 44 | 45 | 46 | if __name__ == "__main__": 47 | train() 48 | -------------------------------------------------------------------------------- /EL/sarsa.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import gym 3 | from el_agent import ELAgent 4 | from frozen_lake_util import show_q_value 5 | 6 | 7 | class SARSAAgent(ELAgent): 8 | 9 | def __init__(self, epsilon=0.1): 10 | super().__init__(epsilon) 11 | 12 | def learn(self, env, episode_count=1000, gamma=0.9, 13 | learning_rate=0.1, render=False, report_interval=50): 14 | self.init_log() 15 | actions = list(range(env.action_space.n)) 16 | self.Q = defaultdict(lambda: [0] * len(actions)) 17 | for e in range(episode_count): 18 | s = env.reset() 19 | done = False 20 | a = self.policy(s, actions) 21 | while not done: 22 | if render: 23 | env.render() 24 | n_state, reward, done, info = env.step(a) 25 | 26 | n_action = self.policy(n_state, actions) # On-policy 27 | gain = reward + gamma * self.Q[n_state][n_action] 28 | estimated = self.Q[s][a] 29 | self.Q[s][a] += learning_rate * (gain - estimated) 30 | s = n_state 31 | a = n_action 32 | else: 33 | self.log(reward) 34 | 35 | if e != 0 and e % report_interval == 0: 36 | self.show_reward_log(episode=e) 37 | 38 | 39 | def train(): 40 | agent = SARSAAgent() 41 | env = gym.make("FrozenLakeEasy-v0") 42 | agent.learn(env, episode_count=500) 43 | show_q_value(agent.Q) 44 | agent.show_reward_log() 45 | 46 | 47 | if __name__ == "__main__": 48 | train() 49 | 
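# Note: the only difference from q_learning.py is the update target. SARSA uses the
# value of the action it will actually take next (self.Q[n_state][n_action], on-policy),
# whereas Q-learning uses max(self.Q[n_state]) (off-policy). compare_q_s.py trains both
# variants on the same environment for a side-by-side comparison.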
-------------------------------------------------------------------------------- /EV/evolution.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import numpy as np 4 | from sklearn.externals.joblib import Parallel, delayed 5 | from PIL import Image 6 | import matplotlib.pyplot as plt 7 | import gym 8 | 9 | # Disable TensorFlow GPU for parallel execution 10 | if os.name == "nt": 11 | os.environ["CUDA_VISIBLE_DEVICES"] = "-1" 12 | else: 13 | os.environ["CUDA_VISIBLE_DEVICES"] = "" 14 | os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" 15 | 16 | from tensorflow.python import keras as K 17 | 18 | 19 | class EvolutionalAgent(): 20 | 21 | def __init__(self, actions): 22 | self.actions = actions 23 | self.model = None 24 | 25 | def save(self, model_path): 26 | self.model.save(model_path, overwrite=True, include_optimizer=False) 27 | 28 | @classmethod 29 | def load(cls, env, model_path): 30 | actions = list(range(env.action_space.n)) 31 | agent = cls(actions) 32 | agent.model = K.models.load_model(model_path) 33 | return agent 34 | 35 | def initialize(self, state, weights=()): 36 | normal = K.initializers.glorot_normal() 37 | model = K.Sequential() 38 | model.add(K.layers.Conv2D( 39 | 3, kernel_size=5, strides=3, 40 | input_shape=state.shape, kernel_initializer=normal, 41 | activation="relu")) 42 | model.add(K.layers.Flatten()) 43 | model.add(K.layers.Dense(len(self.actions), activation="softmax")) 44 | self.model = model 45 | if len(weights) > 0: 46 | self.model.set_weights(weights) 47 | 48 | def policy(self, state): 49 | action_probs = self.model.predict(np.array([state]))[0] 50 | action = np.random.choice(self.actions, 51 | size=1, p=action_probs)[0] 52 | return action 53 | 54 | def play(self, env, episode_count=5, render=True): 55 | for e in range(episode_count): 56 | s = env.reset() 57 | done = False 58 | episode_reward = 0 59 | while not done: 60 | if render: 61 | env.render() 62 | a = self.policy(s) 63 | n_state, reward, done, info = env.step(a) 64 | episode_reward += reward 65 | s = n_state 66 | else: 67 | print("Get reward {}".format(episode_reward)) 68 | 69 | 70 | class CatcherObserver(): 71 | 72 | def __init__(self, width, height, frame_count): 73 | import gym_ple 74 | self._env = gym.make("Catcher-v0") 75 | self.width = width 76 | self.height = height 77 | 78 | @property 79 | def action_space(self): 80 | return self._env.action_space 81 | 82 | @property 83 | def observation_space(self): 84 | return self._env.observation_space 85 | 86 | def reset(self): 87 | return self.transform(self._env.reset()) 88 | 89 | def render(self): 90 | self._env.render(mode="human") 91 | 92 | def step(self, action): 93 | n_state, reward, done, info = self._env.step(action) 94 | return self.transform(n_state), reward, done, info 95 | 96 | def transform(self, state): 97 | grayed = Image.fromarray(state).convert("L") 98 | resized = grayed.resize((self.width, self.height)) 99 | resized = np.array(resized).astype("float") 100 | normalized = resized / 255.0 # scale to 0~1 101 | normalized = np.expand_dims(normalized, axis=2) # H x W => W x W x C 102 | return normalized 103 | 104 | 105 | class EvolutionalTrainer(): 106 | 107 | def __init__(self, population_size=20, sigma=0.5, learning_rate=0.1, 108 | report_interval=10): 109 | self.population_size = population_size 110 | self.sigma = sigma 111 | self.learning_rate = learning_rate 112 | self.weights = () 113 | self.reward_log = [] 114 | 115 | def train(self, epoch=100, episode_per_agent=1, 
render=False): 116 | env = self.make_env() 117 | actions = list(range(env.action_space.n)) 118 | s = env.reset() 119 | agent = EvolutionalAgent(actions) 120 | agent.initialize(s) 121 | self.weights = agent.model.get_weights() 122 | 123 | with Parallel(n_jobs=-1) as parallel: 124 | for e in range(epoch): 125 | experiment = delayed(EvolutionalTrainer.run_agent) 126 | results = parallel(experiment( 127 | episode_per_agent, self.weights, self.sigma) 128 | for p in range(self.population_size)) 129 | self.update(results) 130 | self.log() 131 | 132 | agent.model.set_weights(self.weights) 133 | return agent 134 | 135 | @classmethod 136 | def make_env(cls): 137 | return CatcherObserver(width=50, height=50, frame_count=5) 138 | 139 | @classmethod 140 | def run_agent(cls, episode_per_agent, base_weights, sigma, 141 | max_step=1000): 142 | env = cls.make_env() 143 | actions = list(range(env.action_space.n)) 144 | agent = EvolutionalAgent(actions) 145 | 146 | noises = [] 147 | new_weights = [] 148 | 149 | # Make weight 150 | for w in base_weights: 151 | noise = np.random.randn(*w.shape) 152 | new_weights.append(w + sigma * noise) 153 | noises.append(noise) 154 | 155 | # Test Play 156 | total_reward = 0 157 | for e in range(episode_per_agent): 158 | s = env.reset() 159 | if agent.model is None: 160 | agent.initialize(s, new_weights) 161 | done = False 162 | step = 0 163 | while not done and step < max_step: 164 | a = agent.policy(s) 165 | n_state, reward, done, info = env.step(a) 166 | total_reward += reward 167 | s = n_state 168 | step += 1 169 | 170 | reward = total_reward / episode_per_agent 171 | return reward, noises 172 | 173 | def update(self, agent_results): 174 | rewards = np.array([r[0] for r in agent_results]) 175 | noises = np.array([r[1] for r in agent_results]) 176 | normalized_rs = (rewards - rewards.mean()) / rewards.std() 177 | 178 | # Update base weights 179 | new_weights = [] 180 | for i, w in enumerate(self.weights): 181 | noise_at_i = np.array([n[i] for n in noises]) 182 | rate = self.learning_rate / (self.population_size * self.sigma) 183 | w = w + rate * np.dot(noise_at_i.T, normalized_rs).T 184 | new_weights.append(w) 185 | 186 | self.weights = new_weights 187 | self.reward_log.append(rewards) 188 | 189 | def log(self): 190 | rewards = self.reward_log[-1] 191 | print("Epoch {}: reward {:.3}(max:{}, min:{})".format( 192 | len(self.reward_log), rewards.mean(), 193 | rewards.max(), rewards.min())) 194 | 195 | def plot_rewards(self): 196 | indices = range(len(self.reward_log)) 197 | means = np.array([rs.mean() for rs in self.reward_log]) 198 | stds = np.array([rs.std() for rs in self.reward_log]) 199 | plt.figure() 200 | plt.title("Reward History") 201 | plt.grid() 202 | plt.fill_between(indices, means - stds, means + stds, 203 | alpha=0.1, color="g") 204 | plt.plot(indices, means, "o-", color="g", 205 | label="reward") 206 | plt.legend(loc="best") 207 | plt.show() 208 | 209 | 210 | def main(play): 211 | model_path = os.path.join(os.path.dirname(__file__), "ev_agent.h5") 212 | 213 | if play: 214 | env = EvolutionalTrainer.make_env() 215 | agent = EvolutionalAgent.load(env, model_path) 216 | agent.play(env, episode_count=5, render=True) 217 | else: 218 | trainer = EvolutionalTrainer() 219 | trained = trainer.train() 220 | trained.save(model_path) 221 | trainer.plot_rewards() 222 | 223 | 224 | if __name__ == "__main__": 225 | parser = argparse.ArgumentParser(description="Evolutional Agent") 226 | parser.add_argument("--play", action="store_true", 227 | help="play with trained model") 
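    # Usage: "python evolution.py" trains and saves ev_agent.h5 next to this file;
    # "python evolution.py --play" loads that file and renders the trained agent.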
228 | 229 | args = parser.parse_args() 230 | main(args.play) 231 | -------------------------------------------------------------------------------- /FN/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/icoxfog417/baby-steps-of-rl-ja/1dadc208d6e9f50e010e6b5d2dcc5d9fc11eb51d/FN/__init__.py -------------------------------------------------------------------------------- /FN/a2c_agent.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from collections import deque 3 | import numpy as np 4 | from sklearn.preprocessing import StandardScaler 5 | import tensorflow as tf 6 | from tensorflow.python import keras as K 7 | from PIL import Image 8 | import gym 9 | import gym_ple 10 | from fn_framework import FNAgent, Trainer, Observer 11 | tf.compat.v1.disable_eager_execution() 12 | 13 | 14 | class ActorCriticAgent(FNAgent): 15 | 16 | def __init__(self, actions): 17 | # ActorCriticAgent uses self policy (doesn't use epsilon). 18 | super().__init__(epsilon=0.0, actions=actions) 19 | self._updater = None 20 | 21 | @classmethod 22 | def load(cls, env, model_path): 23 | actions = list(range(env.action_space.n)) 24 | agent = cls(actions) 25 | agent.model = K.models.load_model(model_path, custom_objects={ 26 | "SampleLayer": SampleLayer}) 27 | agent.initialized = True 28 | return agent 29 | 30 | def initialize(self, experiences, optimizer): 31 | feature_shape = experiences[0].s.shape 32 | self.make_model(feature_shape) 33 | self.set_updater(optimizer) 34 | self.initialized = True 35 | print("Done initialization. From now, begin training!") 36 | 37 | def make_model(self, feature_shape): 38 | normal = K.initializers.glorot_normal() 39 | model = K.Sequential() 40 | model.add(K.layers.Conv2D( 41 | 32, kernel_size=8, strides=4, padding="same", 42 | input_shape=feature_shape, 43 | kernel_initializer=normal, activation="relu")) 44 | model.add(K.layers.Conv2D( 45 | 64, kernel_size=4, strides=2, padding="same", 46 | kernel_initializer=normal, activation="relu")) 47 | model.add(K.layers.Conv2D( 48 | 64, kernel_size=3, strides=1, padding="same", 49 | kernel_initializer=normal, activation="relu")) 50 | model.add(K.layers.Flatten()) 51 | model.add(K.layers.Dense(256, kernel_initializer=normal, 52 | activation="relu")) 53 | 54 | actor_layer = K.layers.Dense(len(self.actions), 55 | kernel_initializer=normal) 56 | action_evals = actor_layer(model.output) 57 | actions = SampleLayer()(action_evals) 58 | 59 | critic_layer = K.layers.Dense(1, kernel_initializer=normal) 60 | values = critic_layer(model.output) 61 | 62 | self.model = K.Model(inputs=model.input, 63 | outputs=[actions, action_evals, values]) 64 | 65 | def set_updater(self, optimizer, 66 | value_loss_weight=1.0, entropy_weight=0.1): 67 | actions = tf.compat.v1.placeholder(shape=(None), dtype="int32") 68 | values = tf.compat.v1.placeholder(shape=(None), dtype="float32") 69 | 70 | _, action_evals, estimateds = self.model.output 71 | 72 | neg_logs = tf.nn.sparse_softmax_cross_entropy_with_logits( 73 | logits=action_evals, labels=actions) 74 | # tf.stop_gradient: Prevent policy_loss influences critic_layer. 
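        # "values" receives the standardised n-step returns built in make_batch, while
        # "estimateds" is the critic's V(s); their difference is the advantage A(s, a).
        # policy_loss = mean(-log pi(a|s) * A(s, a)) therefore reinforces actions whose
        # return beat the critic's estimate, and the critic itself is trained only
        # through value_loss.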
75 | advantages = values - tf.stop_gradient(estimateds) 76 | 77 | policy_loss = tf.reduce_mean(neg_logs * advantages) 78 | value_loss = tf.keras.losses.MeanSquaredError()(values, estimateds) 79 | action_entropy = tf.reduce_mean(self.categorical_entropy(action_evals)) 80 | 81 | loss = policy_loss + value_loss_weight * value_loss 82 | loss -= entropy_weight * action_entropy 83 | 84 | updates = optimizer.get_updates(loss=loss, 85 | params=self.model.trainable_weights) 86 | 87 | self._updater = K.backend.function( 88 | inputs=[self.model.input, 89 | actions, values], 90 | outputs=[loss, 91 | policy_loss, 92 | value_loss, 93 | tf.reduce_mean(neg_logs), 94 | tf.reduce_mean(advantages), 95 | action_entropy], 96 | updates=updates) 97 | 98 | def categorical_entropy(self, logits): 99 | """ 100 | From OpenAI baseline implementation. 101 | https://github.com/openai/baselines/blob/master/baselines/common/distributions.py#L192 102 | """ 103 | a0 = logits - tf.reduce_max(logits, axis=-1, keepdims=True) 104 | ea0 = tf.exp(a0) 105 | z0 = tf.reduce_sum(ea0, axis=-1, keepdims=True) 106 | p0 = ea0 / z0 107 | return tf.reduce_sum(p0 * (tf.math.log(z0) - a0), axis=-1) 108 | 109 | def policy(self, s): 110 | if not self.initialized: 111 | return np.random.randint(len(self.actions)) 112 | else: 113 | action, action_evals, values = self.model.predict(np.array([s])) 114 | return action[0] 115 | 116 | def estimate(self, s): 117 | action, action_evals, values = self.model.predict(np.array([s])) 118 | return values[0][0] 119 | 120 | def update(self, states, actions, rewards): 121 | return self._updater([states, actions, rewards]) 122 | 123 | 124 | class SampleLayer(K.layers.Layer): 125 | 126 | def __init__(self, **kwargs): 127 | self.output_dim = 1 # sample one action from evaluations 128 | super(SampleLayer, self).__init__(**kwargs) 129 | 130 | def build(self, input_shape): 131 | super(SampleLayer, self).build(input_shape) 132 | 133 | def call(self, x): 134 | noise = tf.random.uniform(tf.shape(x)) 135 | return tf.argmax(x - tf.math.log(-tf.math.log(noise)), axis=1) 136 | 137 | def compute_output_shape(self, input_shape): 138 | return (input_shape[0], self.output_dim) 139 | 140 | 141 | class ActorCriticAgentTest(ActorCriticAgent): 142 | 143 | def make_model(self, feature_shape): 144 | normal = K.initializers.glorot_normal() 145 | model = K.Sequential() 146 | model.add(K.layers.Dense(10, input_shape=feature_shape, 147 | kernel_initializer=normal, activation="relu")) 148 | model.add(K.layers.Dense(10, kernel_initializer=normal, 149 | activation="relu")) 150 | 151 | actor_layer = K.layers.Dense(len(self.actions), 152 | kernel_initializer=normal) 153 | 154 | action_evals = actor_layer(model.output) 155 | actions = SampleLayer()(action_evals) 156 | 157 | critic_layer = K.layers.Dense(1, kernel_initializer=normal) 158 | values = critic_layer(model.output) 159 | 160 | self.model = K.Model(inputs=model.input, 161 | outputs=[actions, action_evals, values]) 162 | 163 | 164 | class CatcherObserver(Observer): 165 | 166 | def __init__(self, env, width, height, frame_count): 167 | super().__init__(env) 168 | self.width = width 169 | self.height = height 170 | self.frame_count = frame_count 171 | self._frames = deque(maxlen=frame_count) 172 | 173 | def transform(self, state): 174 | grayed = Image.fromarray(state).convert("L") 175 | resized = grayed.resize((self.width, self.height)) 176 | resized = np.array(resized).astype("float") 177 | normalized = resized / 255.0 # scale to 0~1 178 | if len(self._frames) == 0: 179 | for i in 
range(self.frame_count): 180 | self._frames.append(normalized) 181 | else: 182 | self._frames.append(normalized) 183 | feature = np.array(self._frames) 184 | # Convert the feature shape (f, w, h) => (h, w, f). 185 | feature = np.transpose(feature, (1, 2, 0)) 186 | return feature 187 | 188 | 189 | class ActorCriticTrainer(Trainer): 190 | 191 | def __init__(self, buffer_size=256, batch_size=32, 192 | gamma=0.99, learning_rate=1e-3, 193 | report_interval=10, log_dir="", file_name=""): 194 | super().__init__(buffer_size, batch_size, gamma, 195 | report_interval, log_dir) 196 | self.file_name = file_name if file_name else "a2c_agent.h5" 197 | self.learning_rate = learning_rate 198 | self.losses = {} 199 | self.rewards = [] 200 | self._max_reward = -10 201 | 202 | def train(self, env, episode_count=900, initial_count=10, 203 | test_mode=False, render=False, observe_interval=100): 204 | actions = list(range(env.action_space.n)) 205 | if not test_mode: 206 | agent = ActorCriticAgent(actions) 207 | else: 208 | agent = ActorCriticAgentTest(actions) 209 | observe_interval = 0 210 | self.training_episode = episode_count 211 | 212 | self.train_loop(env, agent, episode_count, initial_count, render, 213 | observe_interval) 214 | return agent 215 | 216 | def episode_begin(self, episode, agent): 217 | self.rewards = [] 218 | 219 | def step(self, episode, step_count, agent, experience): 220 | self.rewards.append(experience.r) 221 | if not agent.initialized: 222 | if len(self.experiences) < self.buffer_size: 223 | # Store experience until buffer_size (enough to initialize). 224 | return False 225 | 226 | optimizer = K.optimizers.Adam(lr=self.learning_rate, 227 | clipnorm=5.0) 228 | agent.initialize(self.experiences, optimizer) 229 | self.logger.set_model(agent.model) 230 | self.training = True 231 | self.experiences.clear() 232 | else: 233 | if len(self.experiences) < self.batch_size: 234 | # Store experience until batch_size (enough to update). 235 | return False 236 | 237 | batch = self.make_batch(agent) 238 | loss, lp, lv, p_ng, p_ad, p_en = agent.update(*batch) 239 | # Record latest metrics. 240 | self.losses["loss/total"] = loss 241 | self.losses["loss/policy"] = lp 242 | self.losses["loss/value"] = lv 243 | self.losses["policy/neg_logs"] = p_ng 244 | self.losses["policy/advantage"] = p_ad 245 | self.losses["policy/entropy"] = p_en 246 | self.experiences.clear() 247 | 248 | def make_batch(self, agent): 249 | states = [] 250 | actions = [] 251 | values = [] 252 | experiences = list(self.experiences) 253 | states = np.array([e.s for e in experiences]) 254 | actions = np.array([e.a for e in experiences]) 255 | 256 | # Calculate values. 257 | # If the last experience isn't terminal (done) then estimates value. 
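# Illustrative note (made-up numbers): the loop below walks the buffer backwards
# and bootstraps from the critic when the episode did not end. With gamma = 0.99
# and rewards [0, 0, 1] where the last step is terminal, the targets become
# [0.99**2, 0.99, 1.0] before StandardScaler normalization; if the last step were
# not terminal, the 1.0 would instead be r + 0.99 * V(next_state) from agent.estimate().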
258 | last = experiences[-1] 259 | future = last.r if last.d else agent.estimate(last.n_s) 260 | for e in reversed(experiences): 261 | value = e.r 262 | if not e.d: 263 | value += self.gamma * future 264 | values.append(value) 265 | future = value 266 | values = np.array(list(reversed(values))) 267 | 268 | scaler = StandardScaler() 269 | values = scaler.fit_transform(values.reshape((-1, 1))).flatten() 270 | 271 | return states, actions, values 272 | 273 | def episode_end(self, episode, step_count, agent): 274 | reward = sum(self.rewards) 275 | self.reward_log.append(reward) 276 | 277 | if agent.initialized: 278 | self.logger.write(self.training_count, "reward", reward) 279 | self.logger.write(self.training_count, "reward_max", 280 | max(self.rewards)) 281 | 282 | for k in self.losses: 283 | self.logger.write(self.training_count, k, self.losses[k]) 284 | 285 | if reward > self._max_reward: 286 | agent.save(self.logger.path_of(self.file_name)) 287 | self._max_reward = reward 288 | 289 | if self.is_event(episode, self.report_interval): 290 | recent_rewards = self.reward_log[-self.report_interval:] 291 | self.logger.describe("reward", recent_rewards, episode=episode) 292 | 293 | 294 | def main(play, is_test): 295 | file_name = "a2c_agent.h5" if not is_test else "a2c_agent_test.h5" 296 | trainer = ActorCriticTrainer(file_name=file_name) 297 | path = trainer.logger.path_of(trainer.file_name) 298 | agent_class = ActorCriticAgent 299 | 300 | if is_test: 301 | print("Train on test mode") 302 | obs = gym.make("CartPole-v0") 303 | agent_class = ActorCriticAgentTest 304 | else: 305 | env = gym.make("Catcher-v0") 306 | obs = CatcherObserver(env, 80, 80, 4) 307 | trainer.learning_rate = 7e-5 308 | 309 | if play: 310 | agent = agent_class.load(obs, path) 311 | agent.play(obs, episode_count=10, render=True) 312 | else: 313 | trainer.train(obs, test_mode=is_test) 314 | 315 | 316 | if __name__ == "__main__": 317 | parser = argparse.ArgumentParser(description="A2C Agent") 318 | parser.add_argument("--play", action="store_true", 319 | help="play with trained model") 320 | parser.add_argument("--test", action="store_true", 321 | help="train by test mode") 322 | 323 | args = parser.parse_args() 324 | main(args.play, args.test) 325 | -------------------------------------------------------------------------------- /FN/dqn_agent.py: -------------------------------------------------------------------------------- 1 | import random 2 | import argparse 3 | from collections import deque 4 | import numpy as np 5 | from tensorflow.python import keras as K 6 | from PIL import Image 7 | import gym 8 | import gym_ple 9 | from fn_framework import FNAgent, Trainer, Observer 10 | 11 | 12 | class DeepQNetworkAgent(FNAgent): 13 | 14 | def __init__(self, epsilon, actions): 15 | super().__init__(epsilon, actions) 16 | self._scaler = None 17 | self._teacher_model = None 18 | 19 | def initialize(self, experiences, optimizer): 20 | feature_shape = experiences[0].s.shape 21 | self.make_model(feature_shape) 22 | self.model.compile(optimizer, loss="mse") 23 | self.initialized = True 24 | print("Done initialization. 
From now, begin training!") 25 | 26 | def make_model(self, feature_shape): 27 | normal = K.initializers.glorot_normal() 28 | model = K.Sequential() 29 | model.add(K.layers.Conv2D( 30 | 32, kernel_size=8, strides=4, padding="same", 31 | input_shape=feature_shape, kernel_initializer=normal, 32 | activation="relu")) 33 | model.add(K.layers.Conv2D( 34 | 64, kernel_size=4, strides=2, padding="same", 35 | kernel_initializer=normal, 36 | activation="relu")) 37 | model.add(K.layers.Conv2D( 38 | 64, kernel_size=3, strides=1, padding="same", 39 | kernel_initializer=normal, 40 | activation="relu")) 41 | model.add(K.layers.Flatten()) 42 | model.add(K.layers.Dense(256, kernel_initializer=normal, 43 | activation="relu")) 44 | model.add(K.layers.Dense(len(self.actions), 45 | kernel_initializer=normal)) 46 | self.model = model 47 | self._teacher_model = K.models.clone_model(self.model) 48 | 49 | def estimate(self, state): 50 | return self.model.predict(np.array([state]))[0] 51 | 52 | def update(self, experiences, gamma): 53 | states = np.array([e.s for e in experiences]) 54 | n_states = np.array([e.n_s for e in experiences]) 55 | 56 | estimateds = self.model.predict(states) 57 | future = self._teacher_model.predict(n_states) 58 | 59 | for i, e in enumerate(experiences): 60 | reward = e.r 61 | if not e.d: 62 | reward += gamma * np.max(future[i]) 63 | estimateds[i][e.a] = reward 64 | 65 | loss = self.model.train_on_batch(states, estimateds) 66 | return loss 67 | 68 | def update_teacher(self): 69 | self._teacher_model.set_weights(self.model.get_weights()) 70 | 71 | 72 | class DeepQNetworkAgentTest(DeepQNetworkAgent): 73 | 74 | def __init__(self, epsilon, actions): 75 | super().__init__(epsilon, actions) 76 | 77 | def make_model(self, feature_shape): 78 | normal = K.initializers.glorot_normal() 79 | model = K.Sequential() 80 | model.add(K.layers.Dense(64, input_shape=feature_shape, 81 | kernel_initializer=normal, activation="relu")) 82 | model.add(K.layers.Dense(len(self.actions), kernel_initializer=normal, 83 | activation="relu")) 84 | self.model = model 85 | self._teacher_model = K.models.clone_model(self.model) 86 | 87 | 88 | class CatcherObserver(Observer): 89 | 90 | def __init__(self, env, width, height, frame_count): 91 | super().__init__(env) 92 | self.width = width 93 | self.height = height 94 | self.frame_count = frame_count 95 | self._frames = deque(maxlen=frame_count) 96 | 97 | def transform(self, state): 98 | grayed = Image.fromarray(state).convert("L") 99 | resized = grayed.resize((self.width, self.height)) 100 | resized = np.array(resized).astype("float") 101 | normalized = resized / 255.0 # scale to 0~1 102 | if len(self._frames) == 0: 103 | for i in range(self.frame_count): 104 | self._frames.append(normalized) 105 | else: 106 | self._frames.append(normalized) 107 | feature = np.array(self._frames) 108 | # Convert the feature shape (f, w, h) => (h, w, f). 
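# For example, with the settings used in main() below (width=height=80,
# frame_count=4), `feature` is a (4, 80, 80) stack of grayscale frames and the
# transpose turns it into the channels-last (80, 80, 4) array that the Conv2D
# layers in make_model expect as input_shape.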
109 | feature = np.transpose(feature, (1, 2, 0)) 110 | 111 | return feature 112 | 113 | 114 | class DeepQNetworkTrainer(Trainer): 115 | 116 | def __init__(self, buffer_size=50000, batch_size=32, 117 | gamma=0.99, initial_epsilon=0.5, final_epsilon=1e-3, 118 | learning_rate=1e-3, teacher_update_freq=3, report_interval=10, 119 | log_dir="", file_name=""): 120 | super().__init__(buffer_size, batch_size, gamma, 121 | report_interval, log_dir) 122 | self.file_name = file_name if file_name else "dqn_agent.h5" 123 | self.initial_epsilon = initial_epsilon 124 | self.final_epsilon = final_epsilon 125 | self.learning_rate = learning_rate 126 | self.teacher_update_freq = teacher_update_freq 127 | self.loss = 0 128 | self.training_episode = 0 129 | self._max_reward = -10 130 | 131 | def train(self, env, episode_count=1200, initial_count=200, 132 | test_mode=False, render=False, observe_interval=100): 133 | actions = list(range(env.action_space.n)) 134 | if not test_mode: 135 | agent = DeepQNetworkAgent(1.0, actions) 136 | else: 137 | agent = DeepQNetworkAgentTest(1.0, actions) 138 | observe_interval = 0 139 | self.training_episode = episode_count 140 | 141 | self.train_loop(env, agent, episode_count, initial_count, render, 142 | observe_interval) 143 | return agent 144 | 145 | def episode_begin(self, episode, agent): 146 | self.loss = 0 147 | 148 | def begin_train(self, episode, agent): 149 | optimizer = K.optimizers.Adam(lr=self.learning_rate, clipvalue=1.0) 150 | agent.initialize(self.experiences, optimizer) 151 | self.logger.set_model(agent.model) 152 | agent.epsilon = self.initial_epsilon 153 | self.training_episode -= episode 154 | 155 | def step(self, episode, step_count, agent, experience): 156 | if self.training: 157 | batch = random.sample(self.experiences, self.batch_size) 158 | self.loss += agent.update(batch, self.gamma) 159 | 160 | def episode_end(self, episode, step_count, agent): 161 | reward = sum([e.r for e in self.get_recent(step_count)]) 162 | self.loss = self.loss / step_count 163 | self.reward_log.append(reward) 164 | if self.training: 165 | self.logger.write(self.training_count, "loss", self.loss) 166 | self.logger.write(self.training_count, "reward", reward) 167 | self.logger.write(self.training_count, "epsilon", agent.epsilon) 168 | if reward > self._max_reward: 169 | agent.save(self.logger.path_of(self.file_name)) 170 | self._max_reward = reward 171 | if self.is_event(self.training_count, self.teacher_update_freq): 172 | agent.update_teacher() 173 | 174 | diff = (self.initial_epsilon - self.final_epsilon) 175 | decay = diff / self.training_episode 176 | agent.epsilon = max(agent.epsilon - decay, self.final_epsilon) 177 | 178 | if self.is_event(episode, self.report_interval): 179 | recent_rewards = self.reward_log[-self.report_interval:] 180 | self.logger.describe("reward", recent_rewards, episode=episode) 181 | 182 | 183 | def main(play, is_test): 184 | file_name = "dqn_agent.h5" if not is_test else "dqn_agent_test.h5" 185 | trainer = DeepQNetworkTrainer(file_name=file_name) 186 | path = trainer.logger.path_of(trainer.file_name) 187 | agent_class = DeepQNetworkAgent 188 | 189 | if is_test: 190 | print("Train on test mode") 191 | obs = gym.make("CartPole-v0") 192 | agent_class = DeepQNetworkAgentTest 193 | else: 194 | env = gym.make("Catcher-v0") 195 | obs = CatcherObserver(env, 80, 80, 4) 196 | trainer.learning_rate = 1e-4 197 | 198 | if play: 199 | agent = agent_class.load(obs, path) 200 | agent.play(obs, render=True) 201 | else: 202 | trainer.train(obs, test_mode=is_test) 
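# A hedged usage note, based on the argparse flags defined below (the Catcher-v0
# environment assumes gym_ple is installed, as imported at the top of the file):
#   python dqn_agent.py --test    # quick run on CartPole-v0 with the small test network
#   python dqn_agent.py           # full training on Catcher-v0; best model saved as dqn_agent.h5 under the log dir
#   python dqn_agent.py --play    # reload the saved model and watch it play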
203 | 204 | 205 | if __name__ == "__main__": 206 | parser = argparse.ArgumentParser(description="DQN Agent") 207 | parser.add_argument("--play", action="store_true", 208 | help="play with trained model") 209 | parser.add_argument("--test", action="store_true", 210 | help="train by test mode") 211 | 212 | args = parser.parse_args() 213 | main(args.play, args.test) 214 | -------------------------------------------------------------------------------- /FN/fn_framework.py: -------------------------------------------------------------------------------- 1 | import os 2 | import io 3 | import re 4 | from collections import namedtuple 5 | from collections import deque 6 | import numpy as np 7 | import tensorflow as tf 8 | from tensorflow.python import keras as K 9 | from PIL import Image 10 | import matplotlib.pyplot as plt 11 | 12 | 13 | Experience = namedtuple("Experience", 14 | ["s", "a", "r", "n_s", "d"]) 15 | 16 | 17 | class FNAgent(): 18 | 19 | def __init__(self, epsilon, actions): 20 | self.epsilon = epsilon 21 | self.actions = actions 22 | self.model = None 23 | self.estimate_probs = False 24 | self.initialized = False 25 | 26 | def save(self, model_path): 27 | self.model.save(model_path, overwrite=True, include_optimizer=False) 28 | 29 | @classmethod 30 | def load(cls, env, model_path, epsilon=0.0001): 31 | actions = list(range(env.action_space.n)) 32 | agent = cls(epsilon, actions) 33 | agent.model = K.models.load_model(model_path) 34 | agent.initialized = True 35 | return agent 36 | 37 | def initialize(self, experiences): 38 | raise NotImplementedError("You have to implement initialize method.") 39 | 40 | def estimate(self, s): 41 | raise NotImplementedError("You have to implement estimate method.") 42 | 43 | def update(self, experiences, gamma): 44 | raise NotImplementedError("You have to implement update method.") 45 | 46 | def policy(self, s): 47 | if np.random.random() < self.epsilon or not self.initialized: 48 | return np.random.randint(len(self.actions)) 49 | else: 50 | estimates = self.estimate(s) 51 | if self.estimate_probs: 52 | action = np.random.choice(self.actions, 53 | size=1, p=estimates)[0] 54 | return action 55 | else: 56 | return np.argmax(estimates) 57 | 58 | def play(self, env, episode_count=5, render=True): 59 | for e in range(episode_count): 60 | s = env.reset() 61 | done = False 62 | episode_reward = 0 63 | while not done: 64 | if render: 65 | env.render() 66 | a = self.policy(s) 67 | n_state, reward, done, info = env.step(a) 68 | episode_reward += reward 69 | s = n_state 70 | else: 71 | print("Get reward {}.".format(episode_reward)) 72 | 73 | 74 | class Trainer(): 75 | 76 | def __init__(self, buffer_size=1024, batch_size=32, 77 | gamma=0.9, report_interval=10, log_dir=""): 78 | self.buffer_size = buffer_size 79 | self.batch_size = batch_size 80 | self.gamma = gamma 81 | self.report_interval = report_interval 82 | self.logger = Logger(log_dir, self.trainer_name) 83 | self.experiences = deque(maxlen=buffer_size) 84 | self.training = False 85 | self.training_count = 0 86 | self.reward_log = [] 87 | 88 | @property 89 | def trainer_name(self): 90 | class_name = self.__class__.__name__ 91 | snaked = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", class_name) 92 | snaked = re.sub("([a-z0-9])([A-Z])", r"\1_\2", snaked).lower() 93 | snaked = snaked.replace("_trainer", "") 94 | return snaked 95 | 96 | def train_loop(self, env, agent, episode=200, initial_count=-1, 97 | render=False, observe_interval=0): 98 | self.experiences = deque(maxlen=self.buffer_size) 99 | self.training = False 
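# Rough outline of the loop that follows (all hooks are no-ops here and are
# overridden by the concrete trainers): each episode runs
#   episode_begin -> (policy, env.step, buffer Experience, step) ... -> episode_end
# and begin_train fires exactly once, either when the experience buffer first
# reaches buffer_size or once `initial_count` episodes have elapsed, after which
# self.training is set to True.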
100 | self.training_count = 0 101 | self.reward_log = [] 102 | frames = [] 103 | 104 | for i in range(episode): 105 | s = env.reset() 106 | done = False 107 | step_count = 0 108 | self.episode_begin(i, agent) 109 | while not done: 110 | if render: 111 | env.render() 112 | if self.training and observe_interval > 0 and\ 113 | (self.training_count == 1 or 114 | self.training_count % observe_interval == 0): 115 | frames.append(s) 116 | 117 | a = agent.policy(s) 118 | n_state, reward, done, info = env.step(a) 119 | e = Experience(s, a, reward, n_state, done) 120 | self.experiences.append(e) 121 | if not self.training and \ 122 | len(self.experiences) == self.buffer_size: 123 | self.begin_train(i, agent) 124 | self.training = True 125 | 126 | self.step(i, step_count, agent, e) 127 | 128 | s = n_state 129 | step_count += 1 130 | else: 131 | self.episode_end(i, step_count, agent) 132 | 133 | if not self.training and \ 134 | initial_count > 0 and i >= initial_count: 135 | self.begin_train(i, agent) 136 | self.training = True 137 | 138 | if self.training: 139 | if len(frames) > 0: 140 | self.logger.write_image(self.training_count, 141 | frames) 142 | frames = [] 143 | self.training_count += 1 144 | 145 | def episode_begin(self, episode, agent): 146 | pass 147 | 148 | def begin_train(self, episode, agent): 149 | pass 150 | 151 | def step(self, episode, step_count, agent, experience): 152 | pass 153 | 154 | def episode_end(self, episode, step_count, agent): 155 | pass 156 | 157 | def is_event(self, count, interval): 158 | return True if count != 0 and count % interval == 0 else False 159 | 160 | def get_recent(self, count): 161 | recent = range(len(self.experiences) - count, len(self.experiences)) 162 | return [self.experiences[i] for i in recent] 163 | 164 | 165 | class Observer(): 166 | 167 | def __init__(self, env): 168 | self._env = env 169 | 170 | @property 171 | def action_space(self): 172 | return self._env.action_space 173 | 174 | @property 175 | def observation_space(self): 176 | return self._env.observation_space 177 | 178 | def reset(self): 179 | return self.transform(self._env.reset()) 180 | 181 | def render(self): 182 | self._env.render(mode="human") 183 | 184 | def step(self, action): 185 | n_state, reward, done, info = self._env.step(action) 186 | return self.transform(n_state), reward, done, info 187 | 188 | def transform(self, state): 189 | raise NotImplementedError("You have to implement transform method.") 190 | 191 | 192 | class Logger(): 193 | 194 | def __init__(self, log_dir="", dir_name=""): 195 | self.log_dir = log_dir 196 | if not log_dir: 197 | self.log_dir = os.path.join(os.path.dirname(__file__), "logs") 198 | if not os.path.exists(self.log_dir): 199 | os.mkdir(self.log_dir) 200 | 201 | if dir_name: 202 | self.log_dir = os.path.join(self.log_dir, dir_name) 203 | if not os.path.exists(self.log_dir): 204 | os.mkdir(self.log_dir) 205 | 206 | self._callback = tf.compat.v1.keras.callbacks.TensorBoard( 207 | self.log_dir) 208 | 209 | @property 210 | def writer(self): 211 | return self._callback.writer 212 | 213 | def set_model(self, model): 214 | self._callback.set_model(model) 215 | 216 | def path_of(self, file_name): 217 | return os.path.join(self.log_dir, file_name) 218 | 219 | def describe(self, name, values, episode=-1, step=-1): 220 | mean = np.round(np.mean(values), 3) 221 | std = np.round(np.std(values), 3) 222 | desc = "{} is {} (+/-{})".format(name, mean, std) 223 | if episode > 0: 224 | print("At episode {}, {}".format(episode, desc)) 225 | elif step > 0: 226 | 
print("At step {}, {}".format(step, desc)) 227 | 228 | def plot(self, name, values, interval=10): 229 | indices = list(range(0, len(values), interval)) 230 | means = [] 231 | stds = [] 232 | for i in indices: 233 | _values = values[i:(i + interval)] 234 | means.append(np.mean(_values)) 235 | stds.append(np.std(_values)) 236 | means = np.array(means) 237 | stds = np.array(stds) 238 | plt.figure() 239 | plt.title("{} History".format(name)) 240 | plt.grid() 241 | plt.fill_between(indices, means - stds, means + stds, 242 | alpha=0.1, color="g") 243 | plt.plot(indices, means, "o-", color="g", 244 | label="{} per {} episode".format(name.lower(), interval)) 245 | plt.legend(loc="best") 246 | plt.show() 247 | 248 | def write(self, index, name, value): 249 | summary = tf.compat.v1.Summary() 250 | summary_value = summary.value.add() 251 | summary_value.tag = name 252 | summary_value.simple_value = value 253 | self.writer.add_summary(summary, index) 254 | self.writer.flush() 255 | 256 | def write_image(self, index, frames): 257 | # Deal with a 'frames' as a list of sequential gray scaled image. 258 | last_frames = [f[:, :, -1] for f in frames] 259 | if np.min(last_frames[-1]) < 0: 260 | scale = 127 / np.abs(last_frames[-1]).max() 261 | offset = 128 262 | else: 263 | scale = 255 / np.max(last_frames[-1]) 264 | offset = 0 265 | channel = 1 # gray scale 266 | tag = "frames_at_training_{}".format(index) 267 | values = [] 268 | 269 | for f in last_frames: 270 | height, width = f.shape 271 | array = np.asarray(f * scale + offset, dtype=np.uint8) 272 | image = Image.fromarray(array) 273 | output = io.BytesIO() 274 | image.save(output, format="PNG") 275 | image_string = output.getvalue() 276 | output.close() 277 | image = tf.compat.v1.Summary.Image( 278 | height=height, width=width, colorspace=channel, 279 | encoded_image_string=image_string) 280 | value = tf.compat.v1.Summary.Value(tag=tag, image=image) 281 | values.append(value) 282 | 283 | summary = tf.compat.v1.Summary(value=values) 284 | self.writer.add_summary(summary, index) 285 | self.writer.flush() 286 | -------------------------------------------------------------------------------- /FN/nn_tutorial/explanation_keras.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from tensorflow.python import keras as K 3 | 4 | model = K.Sequential([ 5 | K.layers.Dense(units=4, input_shape=((2, ))), 6 | ]) 7 | 8 | weight, bias = model.layers[0].get_weights() 9 | print("Weight shape is {}.".format(weight.shape)) 10 | print("Bias shape is {}.".format(bias.shape)) 11 | 12 | x = np.random.rand(1, 2) 13 | y = model.predict(x) 14 | print("x is ({}) and y is ({}).".format(x.shape, y.shape)) 15 | -------------------------------------------------------------------------------- /FN/nn_tutorial/explanation_keras_batch.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from tensorflow.python import keras as K 3 | 4 | # 2-layer neural network. 5 | model = K.Sequential([ 6 | K.layers.Dense(units=4, input_shape=((2, )), 7 | activation="sigmoid"), 8 | K.layers.Dense(units=4), 9 | ]) 10 | 11 | # Make batch size = 3 data (dimension of x is 2). 
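# (Illustrative note) Every row of the batch is treated as one sample: the
# (3, 2) input is multiplied by the first layer's (2, 4) weight matrix, and the
# second Dense layer maps (3, 4) -> (3, 4), so the shape printed below is (3, 4).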
12 | batch = np.random.rand(3, 2) 13 | 14 | y = model.predict(batch) 15 | print(y.shape) # Will be (3, 4) 16 | -------------------------------------------------------------------------------- /FN/nn_tutorial/explanation_keras_boston.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.model_selection import train_test_split 3 | from sklearn.datasets import load_boston 4 | import pandas as pd 5 | import matplotlib.pyplot as plt 6 | from tensorflow.python import keras as K 7 | 8 | 9 | dataset = load_boston() 10 | 11 | y = dataset.target 12 | X = dataset.data 13 | 14 | X_train, X_test, y_train, y_test = train_test_split( 15 | X, y, test_size=0.33) 16 | 17 | model = K.Sequential([ 18 | K.layers.BatchNormalization(input_shape=(13,)), 19 | K.layers.Dense(units=13, activation="softplus", kernel_regularizer="l1"), 20 | K.layers.Dense(units=1) 21 | ]) 22 | model.compile(loss="mean_squared_error", optimizer="sgd") 23 | model.fit(X_train, y_train, epochs=8) 24 | 25 | predicts = model.predict(X_test) 26 | 27 | result = pd.DataFrame({ 28 | "predict": np.reshape(predicts, (-1,)), 29 | "actual": y_test 30 | }) 31 | limit = np.max(y_test) 32 | 33 | result.plot.scatter(x="actual", y="predict", xlim=(0, limit), ylim=(0, limit)) 34 | plt.show() 35 | -------------------------------------------------------------------------------- /FN/nn_tutorial/explanation_keras_mnist.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.model_selection import train_test_split 3 | from sklearn.datasets import load_digits 4 | from sklearn.metrics import classification_report 5 | from tensorflow.python import keras as K 6 | 7 | 8 | dataset = load_digits() 9 | image_shape = (8, 8, 1) 10 | num_class = 10 11 | 12 | y = dataset.target 13 | y = K.utils.to_categorical(y, num_class) 14 | X = dataset.data 15 | X = np.array([data.reshape(image_shape) for data in X]) 16 | 17 | X_train, X_test, y_train, y_test = train_test_split( 18 | X, y, test_size=0.33) 19 | 20 | model = K.Sequential([ 21 | K.layers.Conv2D( 22 | 5, kernel_size=3, strides=1, padding="same", 23 | input_shape=image_shape, activation="relu"), 24 | K.layers.Conv2D( 25 | 3, kernel_size=2, strides=1, padding="same", 26 | activation="relu"), 27 | K.layers.Flatten(), 28 | K.layers.Dense(units=num_class, activation="softmax") 29 | ]) 30 | model.compile(loss="categorical_crossentropy", optimizer="sgd") 31 | model.fit(X_train, y_train, epochs=8) 32 | 33 | predicts = model.predict(X_test) 34 | predicts = np.argmax(predicts, axis=1) 35 | actual = np.argmax(y_test, axis=1) 36 | print(classification_report(actual, predicts)) 37 | -------------------------------------------------------------------------------- /FN/nn_tutorial/explanation_tf.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | # Weight (row=4 x col=2). 5 | a = tf.Variable(np.random.rand(4, 2)) 6 | 7 | # Bias (row=4 x col=1). 8 | b = tf.Variable(np.random.rand(4, 1)) 9 | 10 | # Input(x) (row=2 x col=1). 11 | x = tf.compat.v1.placeholder(tf.float64, shape=(2, 1)) 12 | 13 | # Output(y) (row=4 x col=1). 14 | y = tf.matmul(a, x) + b 15 | 16 | 17 | with tf.Session() as sess: 18 | # Initialize variable. 19 | init = tf.global_variables_initializer() 20 | sess.run(init) 21 | 22 | # Make input to x. 23 | x_value = np.random.rand(2, 1) 24 | 25 | # Execute culculation. 
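# (Illustrative note) sess.run evaluates the graph node `y`, feeding x_value into
# the placeholder x, so the result is (4, 2) x (2, 1) + (4, 1) = (4, 1).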
26 | y_output = sess.run(y, feed_dict={x: x_value}) 27 | print(y_output.shape) # Will be (4, 1) 28 | -------------------------------------------------------------------------------- /FN/nn_tutorial/explanation_tf_batch.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | # Weight (row=4 x col=2). 5 | a = tf.Variable(np.random.rand(4, 2)) 6 | 7 | # Bias (row=4 x col=1). 8 | b = tf.Variable(np.random.rand(4, 1)) 9 | 10 | # Input(x) (row=2 x col=1). 11 | x = tf.compat.v1.placeholder(tf.float64, shape=(2, 1)) 12 | 13 | # Output(y) (row=4 x col=1). 14 | y = tf.matmul(a, x) + b 15 | 16 | 17 | with tf.Session() as sess: 18 | # Initialize variable. 19 | init = tf.global_variables_initializer() 20 | sess.run(init) 21 | 22 | # Make batch. 23 | batch = [] 24 | for i in range(3): 25 | x_value = np.random.rand(2, 1) 26 | batch.append(x_value) 27 | 28 | # Execute culculation. 29 | y_outputs = [] 30 | for x_value in batch: 31 | y_output = sess.run(y, feed_dict={x: x_value}) 32 | y_outputs.append(y_output) 33 | 34 | y_output = np.array(y_outputs) 35 | print(y_output.shape) # Will be (3, 4, 1) 36 | -------------------------------------------------------------------------------- /FN/nn_tutorial/gradient.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | tf.enable_eager_execution() 5 | tfe = tf.contrib.eager 6 | 7 | 8 | def f(x, a, b): 9 | return tf.add(tf.multiply(x, a), b) 10 | 11 | 12 | def grad(f): 13 | return lambda x, a, b: tfe.gradients_function(f)(x, a, b) 14 | 15 | 16 | x = 2.0 17 | a = 3.0 18 | b = -1.0 19 | 20 | print(f(x, a, b)) 21 | print(grad(f)(x, a, b)) 22 | -------------------------------------------------------------------------------- /FN/policy_gradient_agent.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import argparse 4 | import numpy as np 5 | from sklearn.preprocessing import StandardScaler 6 | from sklearn.externals import joblib 7 | import tensorflow as tf 8 | from tensorflow.python import keras as K 9 | import gym 10 | from fn_framework import FNAgent, Trainer, Observer, Experience 11 | tf.compat.v1.disable_eager_execution() 12 | 13 | 14 | class PolicyGradientAgent(FNAgent): 15 | 16 | def __init__(self, actions): 17 | # PolicyGradientAgent uses self policy (doesn't use epsilon). 
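# ("self policy" here means the agent samples actions directly from its own
#  softmax output, via estimate_probs = True below and FNAgent.policy, so no
#  epsilon-greedy exploration is layered on top.)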
18 | super().__init__(epsilon=0.0, actions=actions) 19 | self.estimate_probs = True 20 | self.scaler = StandardScaler() 21 | self._updater = None 22 | 23 | def save(self, model_path): 24 | super().save(model_path) 25 | joblib.dump(self.scaler, self.scaler_path(model_path)) 26 | 27 | @classmethod 28 | def load(cls, env, model_path): 29 | actions = list(range(env.action_space.n)) 30 | agent = cls(actions) 31 | agent.model = K.models.load_model(model_path) 32 | agent.initialized = True 33 | agent.scaler = joblib.load(agent.scaler_path(model_path)) 34 | return agent 35 | 36 | def scaler_path(self, model_path): 37 | fname, _ = os.path.splitext(model_path) 38 | fname += "_scaler.pkl" 39 | return fname 40 | 41 | def initialize(self, experiences, optimizer): 42 | states = np.vstack([e.s for e in experiences]) 43 | feature_size = states.shape[1] 44 | self.model = K.models.Sequential([ 45 | K.layers.Dense(10, activation="relu", input_shape=(feature_size,)), 46 | K.layers.Dense(10, activation="relu"), 47 | K.layers.Dense(len(self.actions), activation="softmax") 48 | ]) 49 | self.set_updater(optimizer) 50 | self.scaler.fit(states) 51 | self.initialized = True 52 | print("Done initialization. From now, begin training!") 53 | 54 | def set_updater(self, optimizer): 55 | actions = tf.compat.v1.placeholder(shape=(None), dtype="int32") 56 | rewards = tf.compat.v1.placeholder(shape=(None), dtype="float32") 57 | one_hot_actions = tf.one_hot(actions, len(self.actions), axis=1) 58 | action_probs = self.model.output 59 | selected_action_probs = tf.reduce_sum(one_hot_actions * action_probs, 60 | axis=1) 61 | clipped = tf.clip_by_value(selected_action_probs, 1e-10, 1.0) 62 | loss = - tf.math.log(clipped) * rewards 63 | loss = tf.reduce_mean(loss) 64 | 65 | updates = optimizer.get_updates(loss=loss, 66 | params=self.model.trainable_weights) 67 | self._updater = K.backend.function( 68 | inputs=[self.model.input, 69 | actions, rewards], 70 | outputs=[loss], 71 | updates=updates) 72 | 73 | def estimate(self, s): 74 | normalized = self.scaler.transform(s) 75 | action_probs = self.model.predict(normalized)[0] 76 | return action_probs 77 | 78 | def update(self, states, actions, rewards): 79 | normalizeds = self.scaler.transform(states) 80 | actions = np.array(actions) 81 | rewards = np.array(rewards) 82 | self._updater([normalizeds, actions, rewards]) 83 | 84 | 85 | class CartPoleObserver(Observer): 86 | 87 | def transform(self, state): 88 | return np.array(state).reshape((1, -1)) 89 | 90 | 91 | class PolicyGradientTrainer(Trainer): 92 | 93 | def __init__(self, buffer_size=256, batch_size=32, gamma=0.9, 94 | report_interval=10, log_dir=""): 95 | super().__init__(buffer_size, batch_size, gamma, 96 | report_interval, log_dir) 97 | 98 | def train(self, env, episode_count=220, initial_count=-1, render=False): 99 | actions = list(range(env.action_space.n)) 100 | agent = PolicyGradientAgent(actions) 101 | self.train_loop(env, agent, episode_count, initial_count, render) 102 | return agent 103 | 104 | def episode_begin(self, episode, agent): 105 | if agent.initialized: 106 | self.experiences = [] 107 | 108 | def make_batch(self, policy_experiences): 109 | length = min(self.batch_size, len(policy_experiences)) 110 | batch = random.sample(policy_experiences, length) 111 | states = np.vstack([e.s for e in batch]) 112 | actions = [e.a for e in batch] 113 | rewards = [e.r for e in batch] 114 | scaler = StandardScaler() 115 | rewards = np.array(rewards).reshape((-1, 1)) 116 | rewards = scaler.fit_transform(rewards).flatten() 117 | 
return states, actions, rewards 118 | 119 | def episode_end(self, episode, step_count, agent): 120 | rewards = [e.r for e in self.get_recent(step_count)] 121 | self.reward_log.append(sum(rewards)) 122 | 123 | if not agent.initialized: 124 | if len(self.experiences) == self.buffer_size: 125 | optimizer = K.optimizers.Adam(lr=0.01) 126 | agent.initialize(self.experiences, optimizer) 127 | self.training = True 128 | else: 129 | policy_experiences = [] 130 | for t, e in enumerate(self.experiences): 131 | s, a, r, n_s, d = e 132 | d_r = [_r * (self.gamma ** i) for i, _r in 133 | enumerate(rewards[t:])] 134 | d_r = sum(d_r) 135 | d_e = Experience(s, a, d_r, n_s, d) 136 | policy_experiences.append(d_e) 137 | 138 | agent.update(*self.make_batch(policy_experiences)) 139 | 140 | if self.is_event(episode, self.report_interval): 141 | recent_rewards = self.reward_log[-self.report_interval:] 142 | self.logger.describe("reward", recent_rewards, episode=episode) 143 | 144 | 145 | def main(play): 146 | env = CartPoleObserver(gym.make("CartPole-v0")) 147 | trainer = PolicyGradientTrainer() 148 | path = trainer.logger.path_of("policy_gradient_agent.h5") 149 | 150 | if play: 151 | agent = PolicyGradientAgent.load(env, path) 152 | agent.play(env) 153 | else: 154 | trained = trainer.train(env) 155 | trainer.logger.plot("Rewards", trainer.reward_log, 156 | trainer.report_interval) 157 | trained.save(path) 158 | 159 | 160 | if __name__ == "__main__": 161 | parser = argparse.ArgumentParser(description="PG Agent") 162 | parser.add_argument("--play", action="store_true", 163 | help="play with trained model") 164 | 165 | args = parser.parse_args() 166 | main(args.play) 167 | -------------------------------------------------------------------------------- /FN/policy_gradient_continuous_agent.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import random 4 | import numpy as np 5 | from sklearn.preprocessing import StandardScaler 6 | from sklearn.externals import joblib 7 | import tensorflow as tf 8 | from tensorflow.python import keras as K 9 | import gym 10 | from fn_framework import FNAgent, Trainer, Observer 11 | 12 | 13 | class PolicyGradientContinuousAgent(FNAgent): 14 | 15 | def __init__(self, epsilon, low, high): 16 | super().__init__(epsilon, [low, high]) 17 | self.scaler = None 18 | self._updater = None 19 | 20 | def save(self, model_path): 21 | super().save(model_path) 22 | joblib.dump(self.scaler, self.scaler_path(model_path)) 23 | 24 | @classmethod 25 | def load(cls, env, model_path, epsilon=0.0001): 26 | low, high = [env.action_space.low[0], env.action_space.high[0]] 27 | agent = cls(epsilon, low, high) 28 | agent.model = K.models.load_model(model_path, custom_objects={ 29 | "SampleLayer": SampleLayer}) 30 | agent.scaler = joblib.load(agent.scaler_path(model_path)) 31 | return agent 32 | 33 | def scaler_path(self, model_path): 34 | fname, _ = os.path.splitext(model_path) 35 | fname += "_scaler.pkl" 36 | return fname 37 | 38 | def initialize(self, experiences, actor_optimizer, critic_optimizer): 39 | self.scaler = StandardScaler() 40 | states = np.vstack([e.s for e in experiences]) 41 | self.scaler.fit(states) 42 | feature_size = states.shape[1] 43 | 44 | base = K.models.Sequential() 45 | base.add(K.layers.Dense(16, activation="relu", 46 | input_shape=(feature_size,))) 47 | base.add(K.layers.Dense(16, activation="relu")) 48 | base.add(K.layers.Dense(16, activation="relu")) 49 | 50 | # Actor 51 | # define action distribution 52 | mu = 
K.layers.Dense(1, activation="tanh")(base.output) 53 | mu = K.layers.Lambda(lambda m: m * 2)(mu) 54 | #sigma = K.layers.Dense(1, activation="softplus")(base.output) 55 | #self.dist_model = K.Model(inputs=base.input, outputs=[mu, sigma]) 56 | self.dist_model = K.Model(inputs=base.input, outputs=[mu]) 57 | 58 | # sample action from distribution 59 | low, high = self.actions 60 | action = SampleLayer(low, high)((mu)) 61 | self.model = K.Model(inputs=base.input, outputs=[action]) 62 | 63 | # Critic 64 | self.critic = K.models.Sequential([ 65 | K.layers.Dense(32, activation="relu", input_shape=(feature_size + 1,)), 66 | K.layers.Dense(32, activation="relu"), 67 | K.layers.Dense(32, activation="relu"), 68 | K.layers.Dense(1, activation="linear") 69 | ]) 70 | self.set_updater(actor_optimizer) 71 | self.critic.compile(loss="mse", optimizer=critic_optimizer) 72 | self.initialized = True 73 | print("Done initialize. From now, begin training!") 74 | 75 | def set_updater(self, optimizer): 76 | actions = tf.compat.v1.placeholder(shape=(None), dtype="float32") 77 | td_error = tf.compat.v1.placeholder(shape=(None), dtype="float32") 78 | 79 | # Actor loss 80 | mu = self.dist_model.output 81 | action_dist = tf.distributions.Normal(loc=tf.squeeze(mu), 82 | scale=0.1) 83 | action_probs = action_dist.prob(tf.squeeze(actions)) 84 | clipped = tf.clip_by_value(action_probs, 1e-10, 1.0) 85 | loss = - tf.math.log(clipped) * td_error 86 | loss = tf.reduce_mean(loss) 87 | 88 | updates = optimizer.get_updates(loss=loss, 89 | params=self.model.trainable_weights) 90 | self._updater = K.backend.function( 91 | inputs=[self.model.input, 92 | actions, td_error], 93 | outputs=[loss, action_probs, mu], 94 | updates=updates) 95 | 96 | def policy(self, s): 97 | if np.random.random() < self.epsilon or not self.initialized: 98 | low, high = self.actions 99 | return np.random.uniform(low, high) 100 | else: 101 | normalized_s = self.scaler.transform(s) 102 | action = self.model.predict(normalized_s)[0] 103 | return action[0] 104 | 105 | def update(self, batch, gamma): 106 | states = np.vstack([e.s for e in batch]) 107 | normalized_s = self.scaler.transform(states) 108 | actions = np.vstack([e.a for e in batch]) 109 | 110 | # Calculate value 111 | next_states = np.vstack([e.n_s for e in batch]) 112 | normalized_n_s = self.scaler.transform(next_states) 113 | n_s_actions = self.model.predict(normalized_n_s) 114 | feature_n = np.concatenate([normalized_n_s, n_s_actions], axis=1) 115 | n_s_values = self.critic.predict(feature_n) 116 | values = [b.r + gamma * (0 if b.d else 1) * n_s_values 117 | for b, n_s_values in zip(batch, n_s_values)] 118 | values = np.array(values) 119 | 120 | feature = np.concatenate([normalized_s, actions], axis=1) 121 | td_error = values - self.critic.predict(feature) 122 | a_loss, probs, mu = self._updater([normalized_s, actions, td_error]) 123 | c_loss = self.critic.train_on_batch(feature, values) 124 | 125 | """ 126 | print([a_loss, c_loss]) 127 | for x in zip(actions, mu, probs): 128 | print("Took action {}. 
(mu={}, its prob={})".format(*x)) 129 | """ 130 | 131 | 132 | class SampleLayer(K.layers.Layer): 133 | 134 | def __init__(self, low, high, **kwargs): 135 | self.low = low 136 | self.high = high 137 | super(SampleLayer, self).__init__(**kwargs) 138 | 139 | def build(self, input_shape): 140 | super(SampleLayer, self).build(input_shape) 141 | 142 | def call(self, x): 143 | mu = x 144 | actions = tf.distributions.Normal(loc=tf.squeeze(mu), 145 | scale=0.1).sample([1]) 146 | actions = tf.clip_by_value(actions, self.low, self.high) 147 | return tf.reshape(actions, (-1, 1)) 148 | 149 | def compute_output_shape(self, input_shape): 150 | return (input_shape[0], 1) 151 | 152 | def get_config(self): 153 | config = super().get_config() 154 | config["low"] = self.low 155 | config["high"] = self.high 156 | return config 157 | 158 | 159 | class PendulumObserver(Observer): 160 | 161 | def step(self, action): 162 | n_state, reward, done, info = self._env.step([action]) 163 | return self.transform(n_state), reward, done, info 164 | 165 | def transform(self, state): 166 | return np.reshape(state, (1, -1)) 167 | 168 | 169 | class PolicyGradientContinuousTrainer(Trainer): 170 | 171 | def __init__(self, buffer_size=100000, batch_size=32, 172 | gamma=0.99, report_interval=10, log_dir=""): 173 | super().__init__(buffer_size, batch_size, gamma, 174 | report_interval, log_dir) 175 | 176 | def train(self, env, episode_count=220, epsilon=1.0, initial_count=-1, 177 | render=False): 178 | low, high = [env.action_space.low[0], env.action_space.high[0]] 179 | agent = PolicyGradientContinuousAgent(epsilon, low, high) 180 | 181 | self.train_loop(env, agent, episode_count, initial_count, render) 182 | return agent 183 | 184 | def begin_train(self, episode, agent): 185 | actor_optimizer = K.optimizers.Adam(lr=0.001, clipnorm=1.0) 186 | critic_optimizer = K.optimizers.Adam(lr=0.001, clipnorm=1.0) 187 | agent.initialize(self.experiences, actor_optimizer, critic_optimizer) 188 | agent.epsilon = 0.01 189 | 190 | def step(self, episode, step_count, agent, experience): 191 | if self.training: 192 | batch = random.sample(self.experiences, self.batch_size) 193 | agent.update(batch, self.gamma) 194 | 195 | def episode_end(self, episode, step_count, agent): 196 | reward = sum([e.r for e in self.get_recent(step_count)]) 197 | self.reward_log.append(reward) 198 | 199 | if self.is_event(episode, self.report_interval): 200 | recent_rewards = self.reward_log[-self.report_interval:] 201 | self.logger.describe("reward", recent_rewards, episode=episode) 202 | 203 | 204 | def main(play): 205 | env = PendulumObserver(gym.make("Pendulum-v0")) 206 | trainer = PolicyGradientContinuousTrainer() 207 | path = trainer.logger.path_of("policy_gradient_continuous_agent.h5") 208 | 209 | if play: 210 | agent = PolicyGradientContinuousAgent.load(env, path) 211 | agent.play(env) 212 | else: 213 | trained = trainer.train(env, episode_count=1500, render=True) 214 | trainer.logger.plot("Rewards", trainer.reward_log, 215 | trainer.report_interval) 216 | trained.save(path) 217 | 218 | 219 | if __name__ == "__main__": 220 | parser = argparse.ArgumentParser(description="PG Agent Pendulum-v0") 221 | parser.add_argument("--play", action="store_true", 222 | help="play with trained model") 223 | 224 | args = parser.parse_args() 225 | main(args.play) 226 | -------------------------------------------------------------------------------- /FN/value_function_agent.py: -------------------------------------------------------------------------------- 1 | import random 2 | 
import argparse 3 | import numpy as np 4 | from sklearn.neural_network import MLPRegressor 5 | from sklearn.preprocessing import StandardScaler 6 | from sklearn.pipeline import Pipeline 7 | from sklearn.externals import joblib 8 | import gym 9 | from fn_framework import FNAgent, Trainer, Observer 10 | 11 | 12 | class ValueFunctionAgent(FNAgent): 13 | 14 | def save(self, model_path): 15 | joblib.dump(self.model, model_path) 16 | 17 | @classmethod 18 | def load(cls, env, model_path, epsilon=0.0001): 19 | actions = list(range(env.action_space.n)) 20 | agent = cls(epsilon, actions) 21 | agent.model = joblib.load(model_path) 22 | agent.initialized = True 23 | return agent 24 | 25 | def initialize(self, experiences): 26 | scaler = StandardScaler() 27 | estimator = MLPRegressor(hidden_layer_sizes=(10, 10), max_iter=1) 28 | self.model = Pipeline([("scaler", scaler), ("estimator", estimator)]) 29 | 30 | states = np.vstack([e.s for e in experiences]) 31 | self.model.named_steps["scaler"].fit(states) 32 | 33 | # Avoid the predict before fit. 34 | self.update([experiences[0]], gamma=0) 35 | self.initialized = True 36 | print("Done initialization. From now, begin training!") 37 | 38 | def estimate(self, s): 39 | estimated = self.model.predict(s)[0] 40 | return estimated 41 | 42 | def _predict(self, states): 43 | if self.initialized: 44 | predicteds = self.model.predict(states) 45 | else: 46 | size = len(self.actions) * len(states) 47 | predicteds = np.random.uniform(size=size) 48 | predicteds = predicteds.reshape((-1, len(self.actions))) 49 | return predicteds 50 | 51 | def update(self, experiences, gamma): 52 | states = np.vstack([e.s for e in experiences]) 53 | n_states = np.vstack([e.n_s for e in experiences]) 54 | 55 | estimateds = self._predict(states) 56 | future = self._predict(n_states) 57 | 58 | for i, e in enumerate(experiences): 59 | reward = e.r 60 | if not e.d: 61 | reward += gamma * np.max(future[i]) 62 | estimateds[i][e.a] = reward 63 | 64 | estimateds = np.array(estimateds) 65 | states = self.model.named_steps["scaler"].transform(states) 66 | self.model.named_steps["estimator"].partial_fit(states, estimateds) 67 | 68 | 69 | class CartPoleObserver(Observer): 70 | 71 | def transform(self, state): 72 | return np.array(state).reshape((1, -1)) 73 | 74 | 75 | class ValueFunctionTrainer(Trainer): 76 | 77 | def train(self, env, episode_count=220, epsilon=0.1, initial_count=-1, 78 | render=False): 79 | actions = list(range(env.action_space.n)) 80 | agent = ValueFunctionAgent(epsilon, actions) 81 | self.train_loop(env, agent, episode_count, initial_count, render) 82 | return agent 83 | 84 | def begin_train(self, episode, agent): 85 | agent.initialize(self.experiences) 86 | 87 | def step(self, episode, step_count, agent, experience): 88 | if self.training: 89 | batch = random.sample(self.experiences, self.batch_size) 90 | agent.update(batch, self.gamma) 91 | 92 | def episode_end(self, episode, step_count, agent): 93 | rewards = [e.r for e in self.get_recent(step_count)] 94 | self.reward_log.append(sum(rewards)) 95 | 96 | if self.is_event(episode, self.report_interval): 97 | recent_rewards = self.reward_log[-self.report_interval:] 98 | self.logger.describe("reward", recent_rewards, episode=episode) 99 | 100 | 101 | def main(play): 102 | env = CartPoleObserver(gym.make("CartPole-v0")) 103 | trainer = ValueFunctionTrainer() 104 | path = trainer.logger.path_of("value_function_agent.pkl") 105 | 106 | if play: 107 | agent = ValueFunctionAgent.load(env, path) 108 | agent.play(env) 109 | else: 110 | 
trained = trainer.train(env) 111 | trainer.logger.plot("Rewards", trainer.reward_log, 112 | trainer.report_interval) 113 | trained.save(path) 114 | 115 | 116 | if __name__ == "__main__": 117 | parser = argparse.ArgumentParser(description="VF Agent") 118 | parser.add_argument("--play", action="store_true", 119 | help="play with trained model") 120 | 121 | args = parser.parse_args() 122 | main(args.play) 123 | -------------------------------------------------------------------------------- /IM/dagger.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import warnings 4 | import numpy as np 5 | from sklearn.externals import joblib 6 | from sklearn.neural_network import MLPRegressor, MLPClassifier 7 | import gym 8 | from gym.envs.registration import register 9 | register(id="FrozenLakeEasy-v0", entry_point="gym.envs.toy_text:FrozenLakeEnv", 10 | kwargs={"is_slippery": False}) 11 | 12 | 13 | class TeacherAgent(): 14 | 15 | def __init__(self, env, epsilon=0.1): 16 | self.actions = list(range(env.action_space.n)) 17 | self.epsilon = epsilon 18 | self.model = None 19 | 20 | def save(self, model_path): 21 | joblib.dump(self.model, model_path) 22 | 23 | @classmethod 24 | def load(cls, env, model_path, epsilon=0.1): 25 | agent = cls(env, epsilon) 26 | agent.model = joblib.load(model_path) 27 | return agent 28 | 29 | def initialize(self, state): 30 | # Only state => action projection is needed. 31 | self.model = MLPRegressor(hidden_layer_sizes=(), max_iter=1) 32 | # Warmup to use predict method. 33 | dummy_label = [np.random.uniform(size=len(self.actions))] 34 | self.model.partial_fit([state], dummy_label) 35 | return self 36 | 37 | def estimate(self, state): 38 | q = self.model.predict([state])[0] 39 | return q 40 | 41 | def policy(self, state): 42 | if np.random.random() < self.epsilon: 43 | return np.random.randint(len(self.actions)) 44 | else: 45 | return np.argmax(self.estimate(state)) 46 | 47 | @classmethod 48 | def train(cls, env, episode_count=3000, gamma=0.9, 49 | initial_epsilon=1.0, final_epsilon=0.1, report_interval=100): 50 | agent = cls(env, initial_epsilon).initialize(env.reset()) 51 | rewards = [] 52 | decay = (initial_epsilon - final_epsilon) / episode_count 53 | for e in range(episode_count): 54 | s = env.reset() 55 | done = False 56 | goal_reward = 0 57 | while not done: 58 | a = agent.policy(s) 59 | estimated = agent.estimate(s) 60 | 61 | n_state, reward, done, info = env.step(a) 62 | gain = reward + gamma * max(agent.estimate(n_state)) 63 | 64 | estimated[a] = gain 65 | agent.model.partial_fit([s], [estimated]) 66 | s = n_state 67 | else: 68 | goal_reward = reward 69 | 70 | rewards.append(goal_reward) 71 | if e != 0 and e % report_interval == 0: 72 | recent = np.array(rewards[-report_interval:]) 73 | print("At episode {}, reward is {}".format( 74 | e, recent.mean())) 75 | agent.epsilon -= decay 76 | 77 | return agent 78 | 79 | 80 | class FrozenLakeObserver(): 81 | 82 | def __init__(self): 83 | self._env = gym.make("FrozenLakeEasy-v0") 84 | 85 | @property 86 | def action_space(self): 87 | return self._env.action_space 88 | 89 | @property 90 | def observation_space(self): 91 | return self._env.observation_space 92 | 93 | def reset(self): 94 | return self.transform(self._env.reset()) 95 | 96 | def render(self): 97 | self._env.render() 98 | 99 | def step(self, action): 100 | n_state, reward, done, info = self._env.step(action) 101 | return self.transform(n_state), reward, done, info 102 | 103 | def transform(self, 
state): 104 | feature = np.zeros(self.observation_space.n) 105 | feature[state] = 1.0 106 | return feature 107 | 108 | 109 | class Student(): 110 | 111 | def __init__(self, env): 112 | self.actions = list(range(env.action_space.n)) 113 | self.model = None 114 | 115 | def initialize(self, state): 116 | self.model = MLPClassifier(hidden_layer_sizes=(), max_iter=1) 117 | dummy_action = 0 118 | self.model.partial_fit([state], [dummy_action], 119 | classes=self.actions) 120 | return self 121 | 122 | def policy(self, state): 123 | return self.model.predict([state])[0] 124 | 125 | def imitate(self, env, teacher, initial_step=100, train_step=200, 126 | report_interval=10): 127 | states = [] 128 | actions = [] 129 | 130 | # Collect teacher's demonstrations. 131 | for e in range(initial_step): 132 | s = env.reset() 133 | done = False 134 | while not done: 135 | a = teacher.policy(s) 136 | n_state, reward, done, info = env.step(a) 137 | states.append(s) 138 | actions.append(a) 139 | s = n_state 140 | 141 | self.initialize(states[0]) 142 | self.model.partial_fit(states, actions) 143 | 144 | print("Start imitation.") 145 | # Student tries to learn teacher's actions. 146 | step_limit = 20 147 | for e in range(train_step): 148 | s = env.reset() 149 | done = False 150 | rewards = [] 151 | step = 0 152 | while not done and step < step_limit: 153 | a = self.policy(s) 154 | n_state, reward, done, info = env.step(a) 155 | states.append(s) 156 | actions.append(teacher.policy(s)) 157 | s = n_state 158 | step += 1 159 | else: 160 | goal_reward = reward 161 | 162 | rewards.append(goal_reward) 163 | if e != 0 and e % report_interval == 0: 164 | recent = np.array(rewards[-report_interval:]) 165 | print("At episode {}, reward is {}".format( 166 | e, recent.mean())) 167 | 168 | with warnings.catch_warnings(): 169 | # It will be fixed in latest scikit-learn. 
170 | # https://github.com/scikit-learn/scikit-learn/issues/10449 171 | warnings.filterwarnings("ignore", category=DeprecationWarning) 172 | self.model.partial_fit(states, actions) 173 | 174 | 175 | def main(teacher): 176 | env = FrozenLakeObserver() 177 | path = os.path.join(os.path.dirname(__file__), "imitation_teacher.pkl") 178 | 179 | if teacher: 180 | agent = TeacherAgent.train(env) 181 | agent.save(path) 182 | else: 183 | teacher_agent = TeacherAgent.load(env, path) 184 | student = Student(env) 185 | student.imitate(env, teacher_agent) 186 | 187 | 188 | if __name__ == "__main__": 189 | parser = argparse.ArgumentParser(description="Imitation Learning") 190 | parser.add_argument("--teacher", action="store_true", 191 | help="train teacher model") 192 | 193 | args = parser.parse_args() 194 | main(args.teacher) 195 | -------------------------------------------------------------------------------- /IRL/backups/environment.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | import numpy as np 3 | 4 | 5 | class Direction(Enum): 6 | UP = 1 7 | DOWN = -1 8 | LEFT = 2 9 | RIGHT = -2 10 | 11 | 12 | class State(): 13 | 14 | def __init__(self, row=-1, column=-1): 15 | self.row = row 16 | self.column = column 17 | 18 | def index(self, n_row): 19 | return self.row * n_row + self.column 20 | 21 | def __repr__(self): 22 | return "".format(self.row, self.column) 23 | 24 | def clone(self): 25 | return State(self.row, self.column) 26 | 27 | def __hash__(self): 28 | return hash((self.row, self.column)) 29 | 30 | def __eq__(self, other): 31 | return self.row == other.row and self.column == other.column 32 | 33 | 34 | class Environment(): 35 | 36 | def __init__(self, grid, move_prob=0.8): 37 | # Grid is 2d-array, and each value treated as attribute. 38 | # attribute is 39 | # 0: ordinary cell 40 | # -1: damage cell (game end) 41 | # 1: reward cell (game end) 42 | # 9: block cell (can't locate agent) 43 | self.grid = grid 44 | self.agent_state = State() 45 | 46 | # Default reward is minus like poison swamp. 47 | # It means agent have to reach the goal fast! 48 | self.default_reward = -0.04 49 | 50 | # Agent can move to decided direction in move_prob. 51 | # It means agent will move different direction in (1 - move_prob). 
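# For example, with the default move_prob=0.8 the chosen direction is taken with
# probability 0.8, each perpendicular direction with 0.1, and the opposite
# direction never (see transit_func below).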
52 | self.move_prob = move_prob 53 | self.reset() 54 | 55 | @property 56 | def row_length(self): 57 | return len(self.grid) 58 | 59 | @property 60 | def column_length(self): 61 | return len(self.grid[0]) 62 | 63 | @property 64 | def action_space(self): 65 | return [Direction.UP, Direction.DOWN, 66 | Direction.LEFT, Direction.RIGHT] 67 | 68 | @property 69 | def states(self): 70 | states = [] 71 | for row in range(self.row_length): 72 | for column in range(self.column_length): 73 | # Avoid the Block Cell 74 | if self.grid[row][column] != 9: 75 | states.append(State(row, column)) 76 | return states 77 | 78 | def reset(self): 79 | # Locate agent at lower left corner 80 | self.agent_state = State(self.row_length - 1, 0) 81 | return self.agent_state 82 | 83 | def step(self, action): 84 | next_state, reward, done = self.transit(self.agent_state, action) 85 | if next_state is not None: 86 | self.agent_state = next_state 87 | 88 | return next_state, reward, done 89 | 90 | def transit(self, state, action): 91 | transition_probs = self.transit_func(state, action) 92 | if len(transition_probs) == 0: 93 | return None, None, True 94 | 95 | next_states = [] 96 | probs = [] 97 | for s in transition_probs: 98 | next_states.append(s) 99 | probs.append(transition_probs[s]) 100 | 101 | next_state = np.random.choice(next_states, p=probs) 102 | reward, done = self.reward_func(next_state) 103 | return next_state, reward, done 104 | 105 | def transit_func(self, state, action): 106 | transition_probs = {} 107 | if not self.can_action_at(state): 108 | # Already on the terminal cell 109 | return transition_probs 110 | 111 | actions = self.action_space 112 | opposite_direction = Direction(action.value * -1) 113 | 114 | for a in actions: 115 | prob = 0 116 | if a == action: 117 | prob = self.move_prob 118 | elif a != opposite_direction: 119 | prob = (1 - self.move_prob) / 2 120 | 121 | next_state = self._move(state, a) 122 | if next_state not in transition_probs: 123 | transition_probs[next_state] = prob 124 | else: 125 | transition_probs[next_state] += prob 126 | 127 | return transition_probs 128 | 129 | def can_action_at(self, state): 130 | if self.grid[state.row][state.column] == 0: 131 | return True 132 | else: 133 | return False 134 | 135 | def _move(self, state, action): 136 | if not self.can_action_at(state): 137 | raise Exception("Can't move from here!") 138 | 139 | next_state = state.clone() 140 | 141 | # Move state by action 142 | if action == Direction.UP: 143 | next_state.row -= 1 144 | elif action == Direction.DOWN: 145 | next_state.row += 1 146 | elif action == Direction.LEFT: 147 | next_state.column -= 1 148 | elif action == Direction.RIGHT: 149 | next_state.column += 1 150 | 151 | # Check the out of grid 152 | if not (0 <= next_state.row < self.row_length): 153 | next_state = state 154 | if not (0 <= next_state.column < self.column_length): 155 | next_state = state 156 | 157 | # Check the Agent bumped the block 158 | if self.grid[next_state.row][next_state.column] == 9: 159 | next_state = state 160 | 161 | return next_state 162 | 163 | def reward_func(self, state): 164 | reward = self.default_reward 165 | done = False 166 | 167 | # Check the attribute of next state 168 | attribute = self.grid[state.row][state.column] 169 | if attribute == 1: 170 | # Get treasure! and game ends. 171 | reward = 1 172 | done = True 173 | elif attribute == -1: 174 | # Go to hell! and the game ends. 
175 | reward = -1 176 | done = True 177 | 178 | return reward, done 179 | -------------------------------------------------------------------------------- /IRL/backups/irl_from_traj.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import numpy as np 4 | from collections import defaultdict 5 | from sklearn.externals import joblib 6 | from sklearn.neural_network import MLPRegressor 7 | import tensorflow as tf 8 | import tensorflow.contrib.eager as tfe 9 | from tensorflow.python import keras as K 10 | import gym 11 | from gym.envs.registration import register 12 | register(id="FrozenLakeEasy-v0", entry_point="gym.envs.toy_text:FrozenLakeEnv", 13 | kwargs={"is_slippery": False}) 14 | 15 | 16 | tfe.enable_eager_execution() 17 | 18 | 19 | class TeacherAgent(): 20 | 21 | def __init__(self, env, epsilon=0.1): 22 | self.actions = list(range(env.action_space.n)) 23 | self.num_states = env.observation_space.n 24 | self.epsilon = epsilon 25 | self.model = None 26 | 27 | def save(self, model_path): 28 | joblib.dump(self.model, model_path) 29 | 30 | @classmethod 31 | def load(cls, env, model_path, epsilon=0.1): 32 | agent = cls(env, epsilon) 33 | agent.model = joblib.load(model_path) 34 | return agent 35 | 36 | def initialize(self, state): 37 | # Only state => action projection is needed 38 | self.model = MLPRegressor(hidden_layer_sizes=(), max_iter=1) 39 | # Warmup to use predict method 40 | dummy_label = [np.random.uniform(size=len(self.actions))] 41 | self.model.partial_fit(np.array([self.transform(state)]), 42 | np.array(dummy_label)) 43 | return self 44 | 45 | def estimate(self, state): 46 | feature = self.transform(state) 47 | q = self.model.predict([feature])[0] 48 | return q 49 | 50 | def policy(self, state): 51 | if np.random.random() < self.epsilon: 52 | return np.random.randint(len(self.actions)) 53 | else: 54 | return np.argmax(self.estimate(state)) 55 | 56 | def transform(self, state): 57 | feature = np.zeros(self.num_states) 58 | feature[state] = 1.0 59 | return feature 60 | 61 | @classmethod 62 | def train(cls, env, episode_count=3000, gamma=0.9, 63 | initial_epsilon=1.0, final_epsilon=0.1, report_interval=100): 64 | agent = cls(env, initial_epsilon).initialize(env.reset()) 65 | rewards = [] 66 | decay = (initial_epsilon - final_epsilon) / episode_count 67 | for e in range(episode_count): 68 | s = env.reset() 69 | done = False 70 | goal_reward = 0 71 | while not done: 72 | a = agent.policy(s) 73 | estimated = agent.estimate(s) 74 | 75 | n_state, reward, done, info = env.step(a) 76 | 77 | gain = reward + gamma * max(agent.estimate(n_state)) 78 | estimated[a] = gain 79 | agent.model.partial_fit([agent.transform(s)], [estimated]) 80 | s = n_state 81 | else: 82 | goal_reward = reward 83 | 84 | rewards.append(goal_reward) 85 | if e != 0 and e % report_interval == 0: 86 | recent = np.array(rewards[-report_interval:]) 87 | print("At episode {}, reward is {}".format( 88 | e, recent.mean())) 89 | agent.epsilon -= decay 90 | 91 | return agent 92 | 93 | 94 | class IRL(): 95 | 96 | def __init__(self, env): 97 | self.actions = list(range(env.action_space.n)) 98 | self.num_states = env.observation_space.n 99 | self.rewards = tfe.Variable(tf.random_uniform( 100 | [env.observation_space.n]), 101 | name="rewards") 102 | """ 103 | self.rewards = tfe.Variable(initial_value=[0.0, 0.0, 0.0, 0.0, 104 | 0.0, 0.0, 0.0, 0.0, 105 | 0.0, 0.0, 0.0, 0.0, 106 | 0.0, 0.0, 0.0, 1.0,], 107 | name="rewards") 108 | """ 109 | self._updater = 
tfe.implicit_gradients(self.loss) 110 | 111 | """ 112 | def value_estimate(self, steps, gamma): 113 | values = {} 114 | counts = {} 115 | for i, t in enumerate(steps): 116 | rewards = [self.rewards[s] for s in t] 117 | for j, s in enumerate(t): 118 | discounteds = [r * (gamma ** k) 119 | for k, r in enumerate(rewards[j:])] 120 | discounted = tf.reduce_sum(discounteds) 121 | if s not in values: 122 | values[s] = discounted 123 | counts[s] = 0.0 124 | 125 | counts[s] += 1 126 | values[s] = tf.add(values[s], tf.divide( 127 | tf.subtract(discounted, values[s]), 128 | counts[s])) 129 | 130 | value_tensors = [] 131 | total_count = sum([counts[s] for s in counts]) 132 | for i in range(self.rewards.shape[0].value): 133 | if i in values: 134 | visit = counts[i] / total_count 135 | value = tf.multiply(values[i], visit) 136 | else: 137 | value = tf.constant(0.0) 138 | value_tensors.append(value) 139 | values = tf.stack(value_tensors) 140 | return values 141 | """ 142 | 143 | def value_estimate(self, trajectory, gamma): 144 | values = {} 145 | one_host_trajectory = tf.one_hot(trajectory, self.num_states) 146 | rewards = tf.reduce_sum(one_host_trajectory * self.rewards, axis=1) 147 | for i, r in enumerate(rewards): 148 | future = [_r * (gamma ** (k + 1)) 149 | for k, _r in enumerate(rewards[(i + 1):])] 150 | reward = r + tf.reduce_sum(future) 151 | s = trajectory[i] 152 | values[s] = reward 153 | 154 | value_tensors = [] 155 | for i in range(self.num_states): 156 | if i in values: 157 | value = values[i] 158 | else: 159 | value = tf.constant(0.0) 160 | value_tensors.append(value) 161 | values = tf.stack(value_tensors) 162 | return values 163 | 164 | def get_rewards(self): 165 | return self.rewards.numpy() 166 | 167 | def loss(self, teacher_steps, steps, gamma): 168 | teacher_values = tf.stack([self.value_estimate(t, gamma) for t in teacher_steps]) 169 | values = tf.stack([self.value_estimate(t, gamma) for t in steps]) 170 | best = tf.reduce_mean(teacher_values, axis=0) 171 | diff = tf.reduce_min(best - values, axis=0) 172 | #print(">>>>>>>>") 173 | #print(tf.reshape(best, (4, 4))) 174 | #print(tf.reshape(tf.reduce_mean(values, axis=0), (4, 4))) 175 | 176 | loss = tf.reduce_sum(tf.boolean_mask(diff, diff > 0)) 177 | penalty = -2 * tf.reduce_sum(tf.boolean_mask(diff, diff < 0)) 178 | loss += penalty 179 | 180 | #_loss = _loss + 1.5 * tf.reduce_sum(tf.abs(self.rewards)) 181 | return loss 182 | 183 | def update(self, optimizer, teacher_steps, steps, gamma): 184 | loss = self.loss(teacher_steps, steps, gamma) 185 | optimizer.apply_gradients(self._updater(teacher_steps, steps, gamma)) 186 | return loss, self.get_rewards() 187 | 188 | def take_action(self, Q, state, actions, epsilon=0.1): 189 | rand_action = np.random.randint(len(actions)) 190 | if np.random.random() < epsilon: 191 | return rand_action 192 | elif state in Q and sum(Q[state]) != 0: 193 | return np.argmax(Q[state]) 194 | else: 195 | return rand_action 196 | 197 | def estimate(self, env, teacher, episode_count=3000, 198 | teacher_demo_size=256, batch_size=32, 199 | learning_rate=1e-3, max_step=10, 200 | gamma=0.9, report_interval=10): 201 | 202 | # Accumulate teacher's demonstration 203 | demos = [] 204 | for e in range(teacher_demo_size): 205 | s = env.reset() 206 | done = False 207 | trajectory = [s] 208 | while not done: 209 | a = teacher.policy(s) 210 | n_state, reward, done, info = env.step(a) 211 | s = n_state 212 | trajectory.append(s) 213 | demos.append(trajectory) 214 | 215 | print("Start reward estimation.") 216 | actions = 
list(range(env.action_space.n)) 217 | rewards = np.zeros((env.observation_space.n)) 218 | Q = defaultdict(lambda: [0] * len(actions)) 219 | optimizer = tf.train.AdamOptimizer(learning_rate) 220 | 221 | for e in range(episode_count): 222 | batch = [] 223 | total_reward = 0 224 | for b in range(batch_size): 225 | s = env.reset() 226 | done = False 227 | trajectory = [s] 228 | step = 0 229 | epsilon = 1.0 230 | while not done and step < max_step: 231 | a = self.take_action(Q, s, actions, epsilon) 232 | n_state, reward, done, info = env.step(a) 233 | 234 | estimated = Q[s][a] 235 | gain = rewards[n_state] + gamma * max(Q[n_state]) 236 | Q[s][a] += learning_rate * (gain - estimated) 237 | s = n_state 238 | trajectory.append(s) 239 | step += 1 240 | epsilon = epsilon * ((batch_size - b) / batch_size) 241 | else: 242 | total_reward += reward 243 | batch.append(trajectory) 244 | 245 | teacher_batch = np.random.choice(demos, size=batch_size) 246 | loss, new_rewards = self.update(optimizer, 247 | teacher_batch, batch, gamma) 248 | 249 | rewards = new_rewards 250 | 251 | if e % 10 == 0: 252 | print("At episode {}, reward={}, loss={}".format( 253 | e, total_reward, loss)) 254 | print("Reward") 255 | print(new_rewards.reshape(4, 4)) 256 | 257 | 258 | def main(train): 259 | env = gym.make("FrozenLakeEasy-v0") 260 | path = os.path.join(os.path.dirname(__file__), "irl_teacher.pkl") 261 | 262 | if train: 263 | agent = TeacherAgent.train(env) 264 | agent.save(path) 265 | else: 266 | teacher = TeacherAgent.load(env, path) 267 | irl = IRL(env) 268 | irl.estimate(env, teacher) 269 | 270 | 271 | if __name__ == "__main__": 272 | parser = argparse.ArgumentParser(description="Imitation Learning") 273 | parser.add_argument("--train", action="store_true", 274 | help="train teacher model") 275 | 276 | args = parser.parse_args() 277 | main(args.train) 278 | -------------------------------------------------------------------------------- /IRL/backups/linear.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from tensorflow.python import keras as K 4 | import tensorflow as tf 5 | from environment import Environment 6 | from planner import PolicyIterationPlanner 7 | import visualizer as viz 8 | 9 | 10 | class LinerIRL(): 11 | 12 | def __init__(self): 13 | self._updater = None 14 | self.rewards = None 15 | 16 | def initialize(self, num_states, num_actions, optimizer, C=1.0, r_max=2): 17 | # Variables 18 | best_trans_probs = tf.compat.v1.placeholder( 19 | tf.float32, 20 | shape=(num_states, num_states)) 21 | other_trans_probss = tf.compat.v1.placeholder( 22 | tf.float32, 23 | shape=(num_states, 24 | num_actions - 1, 25 | num_states)) 26 | gamma = tf.compat.v1.placeholder(tf.float32, shape=()) 27 | rewards = tf.Variable(tf.random_normal([num_states], mean=r_max/2), 28 | name="rewards") 29 | 30 | _indices = tf.constant([0] * num_states) 31 | _min_losses = tf.constant([1e+10] * num_states) 32 | eye = tf.eye(num_states) 33 | 34 | condition = lambda s, i, loss: tf.less(i, other_trans_probss.shape[1]) # noqa 35 | 36 | def process(s, i, loss): 37 | best_trans_prob = best_trans_probs[s] 38 | other_trans_prob = other_trans_probss[s][i] 39 | 40 | f_left = tf.reshape((best_trans_prob - other_trans_prob), (1, -1)) 41 | f_right = tf.matrix_inverse(eye - gamma * best_trans_prob) 42 | 43 | # Limit the rewards of other actions smaller than best's one. 
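                # Note: the clip on the next line only bounds each reward estimate
                # to [-r_max, r_max]; the preference for the expert's (best) action
                # is enforced by the loss terms computed from `formula` below,
                # not by this clipping.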
44 | R = tf.reshape(tf.clip_by_value(rewards, -r_max, r_max), (-1, 1)) 45 | 46 | formula = K.backend.dot(K.backend.dot(f_left, f_right), R) 47 | 48 | # Formula should be positive 49 | _loss = tf.abs(tf.squeeze(tf.nn.leaky_relu(formula))) 50 | loss = tf.reduce_min([loss, _loss]) 51 | i = tf.add(i, 1) 52 | return s, i, loss 53 | 54 | total_loss = tf.constant(0.0) 55 | for s in range(num_states): 56 | _, _, min_loss = tf.while_loop(condition, process, 57 | [s, _indices[s], _min_losses[s]]) 58 | total_loss = tf.add(total_loss, min_loss) 59 | 60 | total_loss -= C * tf.reduce_sum(tf.abs(rewards)) # L1 regularization 61 | total_loss = -total_loss # Maximize to Minimize 62 | 63 | # Get gradients 64 | updates = optimizer.get_updates(loss=total_loss, params=[rewards]) 65 | self._updater = K.backend.function( 66 | inputs=[best_trans_probs, 67 | other_trans_probss, 68 | gamma], 69 | outputs=[total_loss, rewards], 70 | updates=updates) 71 | 72 | def to_trans_prob(self, env, probs): 73 | states = env.states 74 | mx = np.zeros(len(states)) 75 | for s in states: 76 | if s in probs: 77 | mx[s.index(env.row_length)] = probs[s] 78 | return mx 79 | 80 | def estimate(self, env, teacher, episode_count=6000, learning_rate=1e-3, 81 | gamma=0.9, report_interval=100): 82 | optimizer = K.optimizers.Adam(learning_rate) 83 | num_actions = len(env.action_space) 84 | num_states = len(env.states) 85 | self.initialize(num_states, num_actions, optimizer) 86 | loss_history = [] 87 | for e in range(episode_count): 88 | best_trans_probs = [] 89 | other_trans_probss = [] 90 | for s in env.states: 91 | actions = teacher.policy[s] 92 | best_action = max(actions, key=actions.get) 93 | best_trans_prob = np.zeros(num_states) 94 | other_trans_probs = [] 95 | for a in env.action_space: 96 | probs = env.transit_func(s, a) 97 | if len(probs) == 0: 98 | continue 99 | if a == best_action: 100 | best_trans_prob = self.to_trans_prob(env, probs) 101 | else: 102 | other_trans_probs.append( 103 | self.to_trans_prob(env, probs) 104 | ) 105 | if len(other_trans_probs) == 0: 106 | other_trans_probs = [np.zeros(num_states)] * (num_actions - 1) 107 | 108 | other_trans_probs = np.array(other_trans_probs) 109 | 110 | best_trans_probs.append(best_trans_prob) 111 | other_trans_probss.append(other_trans_probs) 112 | 113 | best_trans_probs = np.array(best_trans_probs) 114 | other_trans_probss = np.array(other_trans_probss) 115 | 116 | loss, self.rewards = self._updater([best_trans_probs, 117 | other_trans_probss, 118 | gamma]) 119 | loss_history.append(loss) 120 | if e != 0 and e % report_interval == 0: 121 | viz.describe(e, "loss", loss_history, report_interval) 122 | 123 | return loss_history 124 | 125 | 126 | def main(): 127 | grid = [ 128 | [0, 0, 0, 1], 129 | [0, 0, 0, 0], 130 | [0, 0, 0, 0], 131 | [0, 0, 0, 0] 132 | ] 133 | # Prepare Teacher 134 | env = Environment(grid) 135 | planner = PolicyIterationPlanner(env) 136 | planner.plan() 137 | 138 | # Execute IRL 139 | irl = LinerIRL() 140 | irl.estimate(env, planner) 141 | print(irl.rewards) 142 | 143 | # Plot Reward Map 144 | ncol = env.column_length 145 | nrow = env.row_length 146 | import matplotlib.pyplot as plt 147 | import matplotlib.cm as cm 148 | fig, ax = plt.subplots() 149 | reward_map = irl.rewards.reshape((nrow, ncol)) 150 | ax.imshow(reward_map, cmap=cm.RdYlGn) 151 | ax.set_xticks(np.arange(ncol)) 152 | ax.set_yticks(np.arange(nrow)) 153 | fig.tight_layout() 154 | plt.show() 155 | 156 | 157 | if __name__ == "__main__": 158 | main() 159 | 
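The `linear.py` backup above builds a TensorFlow graph for a variant of linear IRL. As a rough reference, here is a minimal NumPy sketch of the underlying objective (the Ng & Russell, 2000 linear-programming formulation). `P_best` and `P_others` are hypothetical inputs, and this is an illustration rather than code from the repository:

```python
import numpy as np

def linear_irl_objective(R, P_best, P_others, gamma=0.9, l1_penalty=1.0):
    """Score a candidate reward vector R (one entry per state).

    P_best:   (S, S) transition matrix when the expert's action is taken.
    P_others: (A-1, S, S) transition matrices for the non-expert actions.
    """
    S = len(R)
    inv = np.linalg.inv(np.eye(S) - gamma * P_best)
    # Per-state margin of the expert's action over each alternative action.
    margins = np.stack([(P_best - P_a) @ inv @ R for P_a in P_others])
    objective = margins.min(axis=0).sum()       # keep the expert's action optimal
    objective -= l1_penalty * np.abs(R).sum()   # L1 term favors sparse rewards
    return objective


# Hypothetical usage: score a reward vector on a 2-state, 2-action MDP.
P_best = np.array([[0.9, 0.1], [0.1, 0.9]])
P_others = np.array([[[0.5, 0.5], [0.5, 0.5]]])
print(linear_irl_objective(np.array([0.0, 1.0]), P_best, P_others))
```

The `min` over alternative actions favors reward vectors under which the expert's action stays optimal in every state, and the L1 term keeps the recovered rewards sparse.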
-------------------------------------------------------------------------------- /IRL/backups/planner.py: -------------------------------------------------------------------------------- 1 | class Planner(): 2 | 3 | def __init__(self, env): 4 | self.env = env 5 | self.log = [] 6 | 7 | def initialize(self): 8 | self.env.reset() 9 | self.log = [] 10 | 11 | def transitions_at(self, state, action): 12 | transition_probs = self.env.transit_func(state, action) 13 | for next_state in transition_probs: 14 | prob = transition_probs[next_state] 15 | reward, _ = self.env.reward_func(next_state) 16 | yield prob, next_state, reward 17 | 18 | def plan(self, gamma=0.9, threshold=0.0001): 19 | raise Exception("Planner have to implements plan method.") 20 | 21 | def dict_to_grid(self, state_reward_dict): 22 | grid = [] 23 | for i in range(self.env.row_length): 24 | row = [0] * self.env.column_length 25 | grid.append(row) 26 | for s in state_reward_dict: 27 | grid[s.row][s.column] = state_reward_dict[s] 28 | 29 | return grid 30 | 31 | 32 | class ValueIterationPlanner(Planner): 33 | 34 | def __init__(self, env): 35 | super().__init__(env) 36 | 37 | def plan(self, gamma=0.9, threshold=0.0001): 38 | self.initialize() 39 | actions = self.env.action_space 40 | V = {} 41 | for s in self.env.states: 42 | # Initialize each state's expected reward 43 | V[s] = 0 44 | 45 | while True: 46 | delta = 0 47 | self.log.append(self.dict_to_grid(V)) 48 | for s in V: 49 | if not self.env.can_action_at(s): 50 | continue 51 | expected_rewards = [] 52 | for a in actions: 53 | r = 0 54 | for prob, next_state, reward in self.transitions_at(s, a): 55 | r += prob * (reward + gamma * V[next_state]) 56 | expected_rewards.append(r) 57 | max_reward = max(expected_rewards) 58 | delta = max(delta, abs(max_reward - V[s])) 59 | V[s] = max_reward 60 | 61 | if delta < threshold: 62 | break 63 | 64 | # Turn dictionary to grid 65 | V_grid = self.dict_to_grid(V) 66 | return V_grid 67 | 68 | 69 | class PolicyIterationPlanner(Planner): 70 | 71 | def __init__(self, env): 72 | super().__init__(env) 73 | self.policy = {} 74 | 75 | def initialize(self): 76 | super().initialize() 77 | self.policy = {} 78 | actions = self.env.action_space 79 | states = self.env.states 80 | for s in states: 81 | self.policy[s] = {} 82 | for a in actions: 83 | # Initialize policy. First each action is taken uniformly. 
84 | self.policy[s][a] = 1 / len(actions) 85 | 86 | def estimate_by_policy(self, gamma, threshold): 87 | V = {} 88 | for s in self.env.states: 89 | # Initialize each state's expected reward 90 | V[s] = 0 91 | 92 | while True: 93 | delta = 0 94 | for s in V: 95 | expected_rewards = [] 96 | for a in self.policy[s]: 97 | action_prob = self.policy[s][a] 98 | r = 0 99 | for prob, next_state, reward in self.transitions_at(s, a): 100 | r += action_prob * prob * \ 101 | (reward + gamma * V[next_state]) 102 | expected_rewards.append(r) 103 | max_reward = max(expected_rewards) 104 | delta = max(delta, abs(max_reward - V[s])) 105 | V[s] = max_reward 106 | if delta < threshold: 107 | break 108 | 109 | return V 110 | 111 | def plan(self, gamma=0.9, threshold=0.0001): 112 | self.initialize() 113 | states = self.env.states 114 | actions = self.env.action_space 115 | 116 | def take_max_action(action_value_dict): 117 | return max(action_value_dict, key=action_value_dict.get) 118 | 119 | while True: 120 | update_stable = True 121 | # Estimate expected rewards under current policy 122 | V = self.estimate_by_policy(gamma, threshold) 123 | self.log.append(self.dict_to_grid(V)) 124 | 125 | for s in states: 126 | # Get action following to the policy (choose max prob's action) 127 | policy_action = take_max_action(self.policy[s]) 128 | 129 | # Compare with other actions 130 | action_rewards = {} 131 | for a in actions: 132 | r = 0 133 | for prob, next_state, reward in self.transitions_at(s, a): 134 | r += prob * (reward + gamma * V[next_state]) 135 | action_rewards[a] = r 136 | best_action = take_max_action(action_rewards) 137 | if policy_action != best_action: 138 | update_stable = False 139 | 140 | # Update policy (set best_action prob=1, otherwise=0 (greedy)) 141 | for a in self.policy[s]: 142 | prob = 1 if a == best_action else 0 143 | self.policy[s][a] = prob 144 | 145 | if update_stable: 146 | # If policy isn't updated, stop iteration 147 | break 148 | 149 | # Turn dictionary to grid 150 | V_grid = self.dict_to_grid(V) 151 | return V_grid 152 | -------------------------------------------------------------------------------- /IRL/backups/visualizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import matplotlib.cm as cm 4 | 5 | 6 | def describe(episode, name, values, interval=10, round_count=-1): 7 | mean = np.mean(values[-interval:]) 8 | std = np.std(values[-interval:]) 9 | if round_count > 0: 10 | mean = np.round(mean, round_count) 11 | std = np.round(std, round_count) 12 | desc = "{} is {} (+/-{})".format(name, mean, std) 13 | print("At episode {}, {}".format(episode, desc)) 14 | 15 | 16 | def plot_values(name, values, interval=10): 17 | indices = list(range(0, len(values), interval)) 18 | means = [] 19 | stds = [] 20 | for i in indices: 21 | _values = values[i:(i + interval)] 22 | means.append(np.mean(_values)) 23 | stds.append(np.std(_values)) 24 | means = np.array(means) 25 | stds = np.array(stds) 26 | plt.figure() 27 | plt.title("{} History".format(name)) 28 | plt.grid() 29 | plt.fill_between(indices, means - stds, means + stds, 30 | alpha=0.1, color="g") 31 | plt.plot(indices, means, "o-", color="g", 32 | label="{} per {} episode".format(name.lower(), interval)) 33 | plt.legend(loc="best") 34 | plt.show() 35 | 36 | 37 | def plot_grid_rewards(env, Q): 38 | """ 39 | Show Q-values for FrozenLake-v0. 40 | To show each action's evaluation, 41 | a state is shown as 3 x 3 matrix like following. 
42 | XoX Up, 43 | oco Left, Center(set mean value), Right 44 | XoX Down 45 | actions are located on 3 x 3 grid. 46 | """ 47 | nrow = env.unwrapped.nrow 48 | ncol = env.unwrapped.ncol 49 | state_size = 3 50 | q_nrow = nrow * state_size 51 | q_ncol = ncol * state_size 52 | reward_map = np.zeros((q_nrow, q_ncol)) 53 | 54 | for r in range(nrow): 55 | for c in range(ncol): 56 | s = r * nrow + c 57 | state_exist = False 58 | if isinstance(Q, dict) and s in Q: 59 | state_exist = True 60 | elif isinstance(Q, (np.ndarray, np.generic)) and s < Q.shape[0]: 61 | state_exist = True 62 | 63 | if state_exist: 64 | # In the display map, vertical index reverse. 65 | _r = 1 + (nrow - 1 - r) * state_size 66 | _c = 1 + c * state_size 67 | reward_map[_r][_c - 1] = Q[s][0] # LEFT = 0 68 | reward_map[_r - 1][_c] = Q[s][1] # DOWN = 1 69 | reward_map[_r][_c + 1] = Q[s][2] # RIGHT = 2 70 | reward_map[_r + 1][_c] = Q[s][3] # UP = 3 71 | # Center 72 | reward_map[_r][_c] = np.mean(Q[s]) 73 | 74 | fig = plt.figure() 75 | ax = fig.add_subplot(1, 1, 1) 76 | plt.imshow(reward_map, cmap=cm.RdYlGn, interpolation="bilinear", 77 | vmax=abs(reward_map).max(), vmin=-abs(reward_map).max()) 78 | ax.set_xlim(-0.5, q_ncol - 0.5) 79 | ax.set_ylim(-0.5, q_nrow - 0.5) 80 | ax.set_xticks(np.arange(-0.5, q_ncol, state_size)) 81 | ax.set_yticks(np.arange(-0.5, q_nrow, state_size)) 82 | ax.set_xticklabels(range(ncol + 1)) 83 | ax.set_yticklabels(range(nrow + 1)) 84 | ax.grid(which="both") 85 | plt.show() 86 | -------------------------------------------------------------------------------- /IRL/bayesian.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.stats 3 | from scipy.special import logsumexp 4 | from planner import PolicyIterationPlanner 5 | from tqdm import tqdm 6 | 7 | 8 | class BayesianIRL(): 9 | 10 | def __init__(self, env, eta=0.8, prior_mean=0.0, prior_scale=0.5): 11 | self.env = env 12 | self.planner = PolicyIterationPlanner(env) 13 | self.eta = eta 14 | self._mean = prior_mean 15 | self._scale = prior_scale 16 | self.prior_dist = scipy.stats.norm(loc=prior_mean, 17 | scale=prior_scale) 18 | 19 | def estimate(self, trajectories, epoch=50, gamma=0.3, 20 | learning_rate=0.1, sigma=0.05, sample_size=20): 21 | num_states = len(self.env.states) 22 | reward = np.random.normal(size=num_states, 23 | loc=self._mean, scale=self._scale) 24 | 25 | def get_q(r, g): 26 | self.planner.reward_func = lambda s: r[s] 27 | V = self.planner.plan(g) 28 | Q = self.planner.policy_to_q(V, gamma) 29 | return Q 30 | 31 | for i in range(epoch): 32 | noises = np.random.randn(sample_size, num_states) 33 | scores = [] 34 | for n in tqdm(noises): 35 | _reward = reward + sigma * n 36 | Q = get_q(_reward, gamma) 37 | 38 | # Calculate prior (sum of log prob). 39 | reward_prior = np.sum(self.prior_dist.logpdf(_r) 40 | for _r in _reward) 41 | 42 | # Calculate likelihood. 43 | likelihood = self.calculate_likelihood(trajectories, Q) 44 | # Calculate posterior. 
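                # Both terms are log values, so their sum on the next line is the
                # unnormalized log-posterior: log P(R|D) = log P(D|R) + log P(R) + const.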
45 | posterior = likelihood + reward_prior 46 | scores.append(posterior) 47 | 48 | rate = learning_rate / (sample_size * sigma) 49 | scores = np.array(scores) 50 | normalized_scores = (scores - scores.mean()) / scores.std() 51 | noise = np.mean(noises * normalized_scores.reshape((-1, 1)), 52 | axis=0) 53 | reward = reward + rate * noise 54 | print("At iteration {} posterior={}.".format(i, scores.mean())) 55 | 56 | reward = reward.reshape(self.env.shape) 57 | return reward 58 | 59 | def calculate_likelihood(self, trajectories, Q): 60 | mean_log_prob = 0.0 61 | for t in trajectories: 62 | t_log_prob = 0.0 63 | for s, a in t: 64 | expert_value = self.eta * Q[s][a] 65 | total = [self.eta * Q[s][_a] for _a in self.env.actions] 66 | t_log_prob += (expert_value - logsumexp(total)) 67 | mean_log_prob += t_log_prob 68 | mean_log_prob /= len(trajectories) 69 | return mean_log_prob 70 | 71 | 72 | if __name__ == "__main__": 73 | def test_estimate(): 74 | from environment import GridWorldEnv 75 | env = GridWorldEnv(grid=[ 76 | [0, 0, 0, 1], 77 | [0, 0, 0, 0], 78 | [0, -1, 0, 0], 79 | [0, 0, 0, 0], 80 | ]) 81 | # Train Teacher 82 | teacher = PolicyIterationPlanner(env) 83 | teacher.plan() 84 | trajectories = [] 85 | print("Gather demonstrations of teacher.") 86 | for i in range(20): 87 | s = env.reset() 88 | done = False 89 | steps = [] 90 | while not done: 91 | a = teacher.act(s) 92 | steps.append((s, a)) 93 | n_s, r, done, _ = env.step(a) 94 | s = n_s 95 | trajectories.append(steps) 96 | 97 | print("Estimate reward.") 98 | irl = BayesianIRL(env) 99 | rewards = irl.estimate(trajectories) 100 | print(rewards) 101 | env.plot_on_grid(rewards) 102 | 103 | test_estimate() 104 | -------------------------------------------------------------------------------- /IRL/environment.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym.envs.toy_text import discrete 3 | import matplotlib.pyplot as plt 4 | import matplotlib.cm as cm 5 | 6 | 7 | class GridWorldEnv(discrete.DiscreteEnv): 8 | 9 | metadata = {"render.modes": ["human", "ansi"]} 10 | 11 | def __init__(self, grid, move_prob=0.8, default_reward=0.0): 12 | # grid is 2d-array, and each value treated as attribute. 
13 | # attribute is 14 | # 0: ordinary cell 15 | # -1: damage cell (game end) 16 | # 1: reward cell (game end) 17 | self.grid = grid 18 | if isinstance(grid, (list, tuple)): 19 | self.grid = np.array(grid) 20 | self._actions = { 21 | "LEFT": 0, 22 | "DOWN": 1, 23 | "RIGHT": 2, 24 | "UP": 3, 25 | } 26 | self.default_reward = default_reward 27 | self.move_prob = move_prob 28 | 29 | num_states = self.nrow * self.ncol 30 | num_actions = len(self._actions) 31 | 32 | # start from left down 33 | initial_state_prob = np.zeros(num_states) 34 | initial_state_prob[self.coordinate_to_state(self.nrow - 1, 0)] = 1.0 35 | 36 | # Make transitions 37 | P = {} 38 | 39 | for s in range(num_states): 40 | if s not in P: 41 | P[s] = {} 42 | 43 | reward = self.reward_func(s) 44 | done = self.has_done(s) 45 | if done: 46 | # Terminal state 47 | for a in range(num_actions): 48 | P[s][a] = [] 49 | P[s][a].append([1.0, None, reward, done]) 50 | else: 51 | for a in range(num_actions): 52 | P[s][a] = [] 53 | transition_probs = self.transit_func(s, a) 54 | for n_s in transition_probs: 55 | reward = self.reward_func(n_s) 56 | done = self.has_done(s) 57 | P[s][a].append([transition_probs[n_s], n_s, 58 | reward, done]) 59 | self.P = P 60 | super().__init__(num_states, num_actions, P, initial_state_prob) 61 | 62 | @property 63 | def nrow(self): 64 | return self.grid.shape[0] 65 | 66 | @property 67 | def ncol(self): 68 | return self.grid.shape[1] 69 | 70 | @property 71 | def shape(self): 72 | return self.grid.shape 73 | 74 | @property 75 | def actions(self): 76 | return list(range(self.action_space.n)) 77 | 78 | @property 79 | def states(self): 80 | return list(range(self.observation_space.n)) 81 | 82 | def state_to_coordinate(self, s): 83 | row, col = divmod(s, self.nrow) 84 | return row, col 85 | 86 | def coordinate_to_state(self, row, col): 87 | index = row * self.nrow + col 88 | return index 89 | 90 | def state_to_feature(self, s): 91 | feature = np.zeros(self.observation_space.n) 92 | feature[s] = 1.0 93 | return feature 94 | 95 | def transit_func(self, state, action): 96 | transition_probs = {} 97 | opposite_direction = (action + 2) % 4 98 | candidates = [a for a in range(len(self._actions)) 99 | if a != opposite_direction] 100 | 101 | for a in candidates: 102 | prob = 0 103 | if a == action: 104 | prob = self.move_prob 105 | else: 106 | prob = (1 - self.move_prob) / 2 107 | 108 | next_state = self._move(state, a) 109 | if next_state not in transition_probs: 110 | transition_probs[next_state] = prob 111 | else: 112 | transition_probs[next_state] += prob 113 | 114 | return transition_probs 115 | 116 | def reward_func(self, state): 117 | row, col = self.state_to_coordinate(state) 118 | reward = self.grid[row][col] 119 | return reward 120 | 121 | def has_done(self, state): 122 | row, col = self.state_to_coordinate(state) 123 | reward = self.grid[row][col] 124 | if np.abs(reward) == 1: 125 | return True 126 | else: 127 | return False 128 | 129 | def _move(self, state, action): 130 | next_state = state 131 | row, col = self.state_to_coordinate(state) 132 | next_row, next_col = row, col 133 | 134 | # Move state by action 135 | if action == self._actions["LEFT"]: 136 | next_col -= 1 137 | elif action == self._actions["DOWN"]: 138 | next_row += 1 139 | elif action == self._actions["RIGHT"]: 140 | next_col += 1 141 | elif action == self._actions["UP"]: 142 | next_row -= 1 143 | 144 | # Check the out of grid 145 | if not (0 <= next_row < self.nrow): 146 | next_row, next_col = row, col 147 | if not (0 <= next_col < self.ncol): 
148 | next_row, next_col = row, col 149 | 150 | next_state = self.coordinate_to_state(next_row, next_col) 151 | 152 | return next_state 153 | 154 | def plot_on_grid(self, values): 155 | if len(values.shape) < 2: 156 | values = values.reshape(self.shape) 157 | fig, ax = plt.subplots() 158 | ax.imshow(values, cmap=cm.RdYlGn) 159 | ax.set_xticks(np.arange(self.ncol)) 160 | ax.set_yticks(np.arange(self.nrow)) 161 | fig.tight_layout() 162 | plt.show() 163 | 164 | 165 | if __name__ == "__main__": 166 | def test_grid(): 167 | env = GridWorldEnv(grid=[ 168 | [1, 0, 0, 0], 169 | [0, 0, 0, 0], 170 | [0, 0, 0, 0], 171 | [0, 0, 0, 0], 172 | ], move_prob=1.0) 173 | s = env.reset() 174 | assert s == 12, "Start position is not left down" 175 | s, r, d, _ = env.step(0) # Move to left wall 176 | assert s == 12, "Agent should be bumped to left wall" 177 | s, r, d, _ = env.step(1) # Move to bottom wall 178 | assert s == 12, "Agent should be bumped to bottom wall" 179 | s, r, d, _ = env.step(2) # Move to right 180 | assert s == 13, "Agent should go to right" 181 | s, r, d, _ = env.step(3) # Move to up 182 | assert s == 9, "Agent should go to up" 183 | env.step(3) # UP 184 | env.step(3) # UP 185 | s, r, d, _ = env.step(0) # LEFT 186 | assert s == 0, "Agent locate last state" 187 | s, r, d, _ = env.step(0) # MOVE 188 | assert s is None, "Next state does not exist" 189 | assert d, "Agent should reache the goal" 190 | assert r == 1, "Agent should get reward" 191 | 192 | test_grid() 193 | -------------------------------------------------------------------------------- /IRL/maxent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from planner import PolicyIterationPlanner 3 | from tqdm import tqdm 4 | 5 | 6 | class MaxEntIRL(): 7 | 8 | def __init__(self, env): 9 | self.env = env 10 | self.planner = PolicyIterationPlanner(env) 11 | 12 | def estimate(self, trajectories, epoch=20, learning_rate=0.01, gamma=0.9): 13 | state_features = np.vstack([self.env.state_to_feature(s) 14 | for s in self.env.states]) 15 | theta = np.random.uniform(size=state_features.shape[1]) 16 | teacher_features = self.calculate_expected_feature(trajectories) 17 | 18 | for e in tqdm(range(epoch)): 19 | # Estimate reward. 20 | rewards = state_features.dot(theta.T) 21 | 22 | # Optimize policy under estimated reward. 23 | self.planner.reward_func = lambda s: rewards[s] 24 | self.planner.plan(gamma=gamma) 25 | 26 | # Estimate feature under policy. 27 | features = self.expected_features_under_policy( 28 | self.planner.policy, trajectories) 29 | 30 | # Update to close to teacher. 
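            # The next line is the gradient of the maximum-entropy IRL objective
            # with respect to theta: the expert's empirical feature expectations
            # minus the feature expectations induced by the current policy.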
31 | update = teacher_features - features.dot(state_features) 32 | theta += learning_rate * update 33 | 34 | estimated = state_features.dot(theta.T) 35 | estimated = estimated.reshape(self.env.shape) 36 | return estimated 37 | 38 | def calculate_expected_feature(self, trajectories): 39 | features = np.zeros(self.env.observation_space.n) 40 | for t in trajectories: 41 | for s in t: 42 | features[s] += 1 43 | 44 | features /= len(trajectories) 45 | return features 46 | 47 | def expected_features_under_policy(self, policy, trajectories): 48 | t_size = len(trajectories) 49 | states = self.env.states 50 | transition_probs = np.zeros((t_size, len(states))) 51 | 52 | initial_state_probs = np.zeros(len(states)) 53 | for t in trajectories: 54 | initial_state_probs[t[0]] += 1 55 | initial_state_probs /= t_size 56 | transition_probs[0] = initial_state_probs 57 | 58 | for t in range(1, t_size): 59 | for prev_s in states: 60 | prev_prob = transition_probs[t - 1][prev_s] 61 | a = self.planner.act(prev_s) 62 | probs = self.env.transit_func(prev_s, a) 63 | for s in probs: 64 | transition_probs[t][s] += prev_prob * probs[s] 65 | 66 | total = np.mean(transition_probs, axis=0) 67 | return total 68 | 69 | 70 | if __name__ == "__main__": 71 | def test_estimate(): 72 | from environment import GridWorldEnv 73 | env = GridWorldEnv(grid=[ 74 | [0, 0, 0, 1], 75 | [0, 0, 0, 0], 76 | [0, -1, 0, 0], 77 | [0, 0, 0, 0], 78 | ]) 79 | # Train Teacher 80 | teacher = PolicyIterationPlanner(env) 81 | teacher.plan() 82 | trajectories = [] 83 | print("Gather demonstrations of teacher.") 84 | for i in range(20): 85 | s = env.reset() 86 | done = False 87 | steps = [s] 88 | while not done: 89 | a = teacher.act(s) 90 | n_s, r, done, _ = env.step(a) 91 | steps.append(n_s) 92 | s = n_s 93 | trajectories.append(steps) 94 | 95 | print("Estimate reward.") 96 | irl = MaxEntIRL(env) 97 | rewards = irl.estimate(trajectories, epoch=100) 98 | print(rewards) 99 | env.plot_on_grid(rewards) 100 | 101 | test_estimate() 102 | -------------------------------------------------------------------------------- /IRL/planner.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class Planner(): 5 | 6 | def __init__(self, env, reward_func=None): 7 | self.env = env 8 | self.reward_func = reward_func 9 | if self.reward_func is None: 10 | self.reward_func = self.env.reward_func 11 | 12 | def initialize(self): 13 | self.env.reset() 14 | 15 | def transitions_at(self, state, action): 16 | reward = self.reward_func(state) 17 | done = self.env.has_done(state) 18 | transition = [] 19 | if not done: 20 | transition_probs = self.env.transit_func(state, action) 21 | for next_state in transition_probs: 22 | prob = transition_probs[next_state] 23 | reward = self.reward_func(next_state) 24 | done = self.env.has_done(state) 25 | transition.append((prob, next_state, reward, done)) 26 | else: 27 | transition.append((1.0, None, reward, done)) 28 | for p, n_s, r, d in transition: 29 | yield p, n_s, r, d 30 | 31 | def plan(self, gamma=0.9, threshold=0.0001): 32 | raise Exception("Planner have to implements plan method.") 33 | 34 | 35 | class ValueIterationPlanner(Planner): 36 | 37 | def __init__(self, env): 38 | super().__init__(env) 39 | 40 | def plan(self, gamma=0.9, threshold=0.0001): 41 | self.initialize() 42 | V = np.zeros(len(self.env.states)) 43 | while True: 44 | delta = 0 45 | for s in self.env.states: 46 | expected_rewards = [] 47 | for a in self.env.actions: 48 | reward = 0 49 | for p, n_s, r, done 
in self.transitions_at(s, a): 50 | if n_s is None: 51 | reward = r 52 | continue 53 | reward += p * (r + gamma * V[n_s] * (not done)) 54 | expected_rewards.append(reward) 55 | max_reward = max(expected_rewards) 56 | delta = max(delta, abs(max_reward - V[s])) 57 | V[s] = max_reward 58 | 59 | if delta < threshold: 60 | break 61 | 62 | return V 63 | 64 | 65 | class PolicyIterationPlanner(Planner): 66 | 67 | def __init__(self, env): 68 | super().__init__(env) 69 | self.policy = None 70 | self._limit_count = 1000 71 | 72 | def initialize(self): 73 | super().initialize() 74 | self.policy = np.ones((self.env.observation_space.n, 75 | self.env.action_space.n)) 76 | # First, take each action uniformly. 77 | self.policy = self.policy / self.env.action_space.n 78 | 79 | def policy_to_q(self, V, gamma): 80 | Q = np.zeros((self.env.observation_space.n, 81 | self.env.action_space.n)) 82 | 83 | for s in self.env.states: 84 | for a in self.env.actions: 85 | a_p = self.policy[s][a] 86 | for p, n_s, r, done in self.transitions_at(s, a): 87 | if done: 88 | Q[s][a] += p * a_p * r 89 | else: 90 | Q[s][a] += p * a_p * (r + gamma * V[n_s]) 91 | return Q 92 | 93 | def estimate_by_policy(self, gamma, threshold): 94 | V = np.zeros(self.env.observation_space.n) 95 | 96 | count = 0 97 | while True: 98 | delta = 0 99 | for s in self.env.states: 100 | expected_rewards = [] 101 | for a in self.env.actions: 102 | action_prob = self.policy[s][a] 103 | reward = 0 104 | for p, n_s, r, done in self.transitions_at(s, a): 105 | if n_s is None: 106 | reward = r 107 | continue 108 | reward += action_prob * p * \ 109 | (r + gamma * V[n_s] * (not done)) 110 | expected_rewards.append(reward) 111 | value = sum(expected_rewards) 112 | delta = max(delta, abs(value - V[s])) 113 | V[s] = value 114 | 115 | if delta < threshold or count > self._limit_count: 116 | break 117 | count += 1 118 | 119 | return V 120 | 121 | def act(self, s): 122 | return np.argmax(self.policy[s]) 123 | 124 | def plan(self, gamma=0.9, threshold=0.0001, keep_policy=False): 125 | if not keep_policy: 126 | self.initialize() 127 | 128 | count = 0 129 | while True: 130 | update_stable = True 131 | # Estimate expected reward under current policy. 132 | V = self.estimate_by_policy(gamma, threshold) 133 | 134 | for s in self.env.states: 135 | # Get action following to the policy (choose max prob's action). 136 | policy_action = self.act(s) 137 | 138 | # Compare with other actions. 139 | action_rewards = np.zeros(len(self.env.actions)) 140 | for a in self.env.actions: 141 | reward = 0 142 | for p, n_s, r, done in self.transitions_at(s, a): 143 | if n_s is None: 144 | reward = r 145 | continue 146 | reward += p * (r + gamma * V[n_s] * (not done)) 147 | action_rewards[a] = reward 148 | best_action = np.argmax(action_rewards) 149 | if policy_action != best_action: 150 | update_stable = False 151 | 152 | # Update policy (set best_action prob=1, otherwise=0 (greedy)). 153 | self.policy[s] = np.zeros(len(self.env.actions)) 154 | self.policy[s][best_action] = 1.0 155 | 156 | if update_stable or count > self._limit_count: 157 | # If policy isn't updated, stop iteration. 
158 | break 159 | count += 1 160 | 161 | return V 162 | 163 | 164 | if __name__ == "__main__": 165 | def test_plan(): 166 | from environment import GridWorldEnv 167 | env = GridWorldEnv(grid=[ 168 | [0, 0, 0, 1], 169 | [0, 0, 0, 0], 170 | [0, -1, 0, 0], 171 | [0, 0, 0, 0], 172 | ]) 173 | print("Value Iteration.") 174 | vp = ValueIterationPlanner(env) 175 | v = vp.plan() 176 | print(v.reshape(env.shape)) 177 | 178 | print("Policy Iteration.") 179 | pp = PolicyIterationPlanner(env) 180 | v = pp.plan() 181 | print(v.reshape(env.shape)) 182 | q = pp.policy_to_q(v, 0.9) 183 | print(np.sum(q, axis=1).reshape(env.shape)) 184 | 185 | test_plan() 186 | -------------------------------------------------------------------------------- /ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ### 指摘事項 2 | 3 | 4 | ### 指摘箇所 5 | 6 | * [ ] Day1: 強化学習の位置づけを知る 7 | * [ ] Day2: 強化学習の解法(1): 環境から計画を立てる 8 | * [ ] Day3: 強化学習の解法(2): 経験から計画を立てる 9 | * [ ] Day4: 強化学習に対するニューラルネットワークの適用 10 | * [ ] Day5: 強化学習の弱点 11 | * [ ] Day6: 強化学習の弱点を克服するための手法 12 | * [ ] Day7: 強化学習の活用領域 13 | 14 | ページ番号: p 15 | 16 | ### 実行環境 17 | 18 | * OS: 19 | * Python version: 20 | * `pip freeze`の実行結果 (下に添付) 21 | 22 | ### エラー内容 23 | 24 | (例外のメッセージ、ログ、画面ショットなどを添付) 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /MM/dyna.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | from collections import defaultdict, Counter 4 | import gym 5 | from gym.envs.registration import register 6 | register(id="FrozenLakeEasy-v0", entry_point="gym.envs.toy_text:FrozenLakeEnv", 7 | kwargs={"is_slippery": False}) 8 | 9 | 10 | class DynaAgent(): 11 | 12 | def __init__(self, epsilon=0.1): 13 | self.epsilon = epsilon 14 | self.actions = [] 15 | self.value = None 16 | 17 | def policy(self, state): 18 | if np.random.random() < self.epsilon: 19 | return np.random.randint(len(self.actions)) 20 | else: 21 | if sum(self.value[state]) == 0: 22 | return np.random.randint(len(self.actions)) 23 | else: 24 | return np.argmax(self.value[state]) 25 | 26 | def learn(self, env, episode_count=3000, gamma=0.9, learning_rate=0.1, 27 | steps_in_model=-1, report_interval=100): 28 | self.actions = list(range(env.action_space.n)) 29 | self.value = defaultdict(lambda: [0] * len(self.actions)) 30 | model = Model(self.actions) 31 | 32 | rewards = [] 33 | for e in range(episode_count): 34 | s = env.reset() 35 | done = False 36 | goal_reward = 0 37 | while not done: 38 | a = self.policy(s) 39 | n_state, reward, done, info = env.step(a) 40 | 41 | # Update from experiments in the real environment. 42 | gain = reward + gamma * max(self.value[n_state]) 43 | estimated = self.value[s][a] 44 | self.value[s][a] += learning_rate * (gain - estimated) 45 | 46 | if steps_in_model > 0: 47 | model.update(s, a, reward, n_state) 48 | for s, a, r, n_s in model.simulate(steps_in_model): 49 | gain = r + gamma * max(self.value[n_s]) 50 | estimated = self.value[s][a] 51 | self.value[s][a] += learning_rate * (gain - estimated) 52 | 53 | s = n_state 54 | else: 55 | goal_reward = reward 56 | 57 | rewards.append(goal_reward) 58 | if e != 0 and e % report_interval == 0: 59 | recent = np.array(rewards[-report_interval:]) 60 | print("At episode {}, reward is {}".format( 61 | e, recent.mean())) 62 | 63 | 64 | class Model(): 65 | 66 | def __init__(self, actions): 67 | self.num_actions = len(actions) 68 | self.transit_count = defaultdict(lambda: [Counter() for a in actions]) 69 | self.total_reward = defaultdict(lambda: [0] * 70 | self.num_actions) 71 | self.history = defaultdict(Counter) 72 | 73 | def update(self, state, action, reward, next_state): 74 | self.transit_count[state][action][next_state] += 1 75 | self.total_reward[state][action] += reward 76 | self.history[state][action] += 1 77 | 78 | def transit(self, state, action): 79 | counter = self.transit_count[state][action] 80 | states = [] 81 | counts = [] 82 | for s, c in counter.most_common(): 83 | states.append(s) 84 | counts.append(c) 85 | probs = np.array(counts) / sum(counts) 86 | return np.random.choice(states, p=probs) 87 | 88 | def reward(self, state, action): 89 | total_reward = self.total_reward[state][action] 90 | total_count = self.history[state][action] 91 | return total_reward / total_count 92 | 93 | def simulate(self, count): 94 | states = list(self.transit_count.keys()) 95 | actions = lambda s: [a for a, c in self.history[s].most_common() 96 | if c > 0] 97 | 98 | for i in range(count): 99 | state = np.random.choice(states) 100 | action = np.random.choice(actions(state)) 101 | 102 | next_state = self.transit(state, action) 103 | reward = self.reward(state, action) 104 | 105 | yield state, action, reward, next_state 106 | 107 | 108 | def 
main(steps_in_model): 109 | env = gym.make("FrozenLakeEasy-v0") 110 | agent = DynaAgent() 111 | agent.learn(env, steps_in_model=steps_in_model) 112 | 113 | 114 | if __name__ == "__main__": 115 | parser = argparse.ArgumentParser(description="Dyna Agent") 116 | parser.add_argument("--modelstep", type=int, default=-1, 117 | help="step count in the model") 118 | 119 | args = parser.parse_args() 120 | main(args.modelstep) 121 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Pythonで学ぶ強化学習 -入門から実践まで- 2 | 3 | [Pythonで学ぶ強化学習 -入門から実践まで-](https://www.amazon.co.jp/dp/4065142989/)の実装コードリポジトリです。 4 | 5 | 誤記、またサンプルコードの実行エラーについてはIssueで管理しています。 6 | 7 | **[Issue List](https://github.com/icoxfog417/baby-steps-of-rl-ja/issues)** 8 | 9 | * [3刷(2/4発行)での修正点](https://github.com/icoxfog417/baby-steps-of-rl-ja/milestone/1?closed=1) 10 | * [ソースコードの修正点](https://github.com/icoxfog417/baby-steps-of-rl-ja/pull/17/files) 11 | * [改訂第2版での修正点](https://github.com/icoxfog417/baby-steps-of-rl-ja/milestone/2?closed=1) 12 | * [ソースコードの修正点](https://github.com/icoxfog417/baby-steps-of-rl-ja/pull/35/files) 13 | * [改訂第2版4刷での修正点](https://github.com/icoxfog417/baby-steps-of-rl-ja/milestone/3) 14 | * [ソースコードの修正点](https://github.com/icoxfog417/baby-steps-of-rl-ja/pull/59/files) 15 | 16 | 17 | 誤記/表記についての指摘、またサンプルコードの実行エラーについては[Issueにて](https://github.com/icoxfog417/baby-steps-of-rl-ja/issues/new)ご連絡をお願いします。 18 | 19 | * 既に同じIssueが上がっていないか、事前に確認をお願いします。 20 | * 実行エラーについては、テンプレートに沿い実行環境、発生エラーについて記載をお願いします。 21 | 22 | ## Index 23 | 24 | * [Setup](https://github.com/icoxfog417/baby-steps-of-rl-ja#setup) 25 | * [Setup with GPU](https://github.com/icoxfog417/baby-steps-of-rl-ja#setup-with-gpu) 26 | * [Day1: 強化学習の位置づけを知る](https://github.com/icoxfog417/baby-steps-of-rl-ja#day1-%E5%BC%B7%E5%8C%96%E5%AD%A6%E7%BF%92%E3%81%AE%E4%BD%8D%E7%BD%AE%E3%81%A5%E3%81%91%E3%82%92%E7%9F%A5%E3%82%8B) 27 | * [Day2: 強化学習の解法(1): 環境から計画を立てる](https://github.com/icoxfog417/baby-steps-of-rl-ja#day2-%E5%BC%B7%E5%8C%96%E5%AD%A6%E7%BF%92%E3%81%AE%E8%A7%A3%E6%B3%951-%E7%92%B0%E5%A2%83%E3%81%8B%E3%82%89%E8%A8%88%E7%94%BB%E3%82%92%E7%AB%8B%E3%81%A6%E3%82%8B) 28 | * [Day3: 強化学習の解法(2): 経験から計画を立てる](https://github.com/icoxfog417/baby-steps-of-rl-ja#day3-%E5%BC%B7%E5%8C%96%E5%AD%A6%E7%BF%92%E3%81%AE%E8%A7%A3%E6%B3%952-%E7%B5%8C%E9%A8%93%E3%81%8B%E3%82%89%E8%A8%88%E7%94%BB%E3%82%92%E7%AB%8B%E3%81%A6%E3%82%8B) 29 | * [Day4: 強化学習に対するニューラルネットワークの適用](https://github.com/icoxfog417/baby-steps-of-rl-ja#day4-%E5%BC%B7%E5%8C%96%E5%AD%A6%E7%BF%92%E3%81%AB%E5%AF%BE%E3%81%99%E3%82%8B%E3%83%8B%E3%83%A5%E3%83%BC%E3%83%A9%E3%83%AB%E3%83%8D%E3%83%83%E3%83%88%E3%83%AF%E3%83%BC%E3%82%AF%E3%81%AE%E9%81%A9%E7%94%A8) 30 | * [Day5: 強化学習の弱点](https://github.com/icoxfog417/baby-steps-of-rl-ja#day5-%E5%BC%B7%E5%8C%96%E5%AD%A6%E7%BF%92%E3%81%AE%E5%BC%B1%E7%82%B9) 31 | * [Day6: 強化学習の弱点を克服するための手法](https://github.com/icoxfog417/baby-steps-of-rl-ja#day6-%E5%BC%B7%E5%8C%96%E5%AD%A6%E7%BF%92%E3%81%AE%E5%BC%B1%E7%82%B9%E3%82%92%E5%85%8B%E6%9C%8D%E3%81%99%E3%82%8B%E3%81%9F%E3%82%81%E3%81%AE%E6%89%8B%E6%B3%95) 32 | * [Day7: 強化学習の活用領域](https://github.com/icoxfog417/baby-steps-of-rl-ja#day7-%E5%BC%B7%E5%8C%96%E5%AD%A6%E7%BF%92%E3%81%AE%E6%B4%BB%E7%94%A8%E9%A0%98%E5%9F%9F) 33 | 34 | [Support Content](https://github.com/icoxfog417/baby-steps-of-rl-ja#support-content) 35 | 36 | ## Setup 37 | 38 | 
You need Git to download the sample code and a Python environment to run it. Please download and install the following two pieces of software. In this book, Miniconda is used to create the Python environment.

1. [Git](https://git-scm.com/)
2. [Python (Miniconda)](https://conda.io/miniconda.html)
   * Download the Python 3 version.

Once installation is finished, download the source code. Open a terminal/command prompt and run the following command in your working directory.

```
git clone https://github.com/icoxfog417/baby-steps-of-rl-ja.git
```

After running the command, a directory named `baby-steps-of-rl-ja` should have been created, and the download is complete. Move into the downloaded folder.

```
cd baby-steps-of-rl-ja
```

Next, create the environment for running the source code. We use the `conda` command, which becomes available by installing Miniconda, to create the book's environment, named `rl-book`.

```
conda create -n rl-book python=3.6
conda activate rl-book
```

After running `conda activate`, `(rl-book)` should appear at the beginning of your terminal prompt. This is the sign that the environment is active. Whenever you run the book's source code, first check that the environment is active, i.e. that `(rl-book)` is shown at the prompt. To deactivate it, run `conda deactivate`.

Install the required libraries into the environment (check that `(rl-book)` is shown before running this).

```
pip install -r requirements.txt
```

Try running `welcome.py` as follows. If a game screen comes up, setup is complete.

```
python welcome.py
```

## Setup with GPU

Day4 implements reinforcement learning that uses deep learning (DQN/A2C), and a GPU is essential for that training. Without a GPU, training takes several days.

Training on a GPU naturally requires one (more specifically, an NVIDIA GPU). There are two ways to get access to a GPU:

1. Prepare a machine with a GPU
2. Use a GPU in the cloud
   * Use a GPU instance on a cloud platform
   * Use a GPU on Google Colaboratory

### Local GPU Machine Setup

If you have a machine with a GPU, set it up with the following steps.

1. Download and install the NVIDIA driver
   * [NVIDIA Driver Downloads](https://www.nvidia.co.jp/Download/index.aspx?lang=jp)
   * The page usually selects the driver you should download automatically. If it does not, choose and download it manually.
2. Install the CUDA Toolkit
   * From the [CUDA Toolkit Archive](https://developer.nvidia.com/cuda-toolkit-archive), download and install the version supported by the TensorFlow you use ([TensorFlow 1.13 and later use CUDA 10](https://www.tensorflow.org/install/gpu)).
3. Install cuDNN
   * Download [cuDNN](https://developer.nvidia.com/cudnn) and extract it into the Toolkit's folder. Note that downloading cuDNN requires user registration.
4. Install `tensorflow-gpu` instead of `tensorflow` (if `tensorflow` is already installed, uninstall it first).

If `import tensorflow as tf` runs without any error, setup is complete.

```
> python
>>> import tensorflow as tf
```

### Cloud GPU Machine Setup

AWS, Azure, and GCP offer GPU instances. If you use them, you do not need to prepare a GPU machine yourself. The setup procedure on a GPU instance is the same as in [Local GPU Machine Setup](https://github.com/icoxfog417/baby-steps-of-rl-ja#local-gpu-machine-setup). For pre-configured instances (such as SageMaker), no setup is needed at all.

Using GPU instances naturally costs money, so here are Google Colaboratory notebooks that let you run GPU computations for free:

* [Day4: Applying deep learning to value estimation: Deep Q-Network](https://colab.research.google.com/drive/1QZs38jqCaSIpoKmoIl8XxVJUwdG78Hb8)
* [Day4: Applying deep learning to the policy: Advantage Actor Critic (A2C)](https://colab.research.google.com/drive/1IzXGuNj4ZbsuWC7ei98ZzKrVk7mPS1t-)

Google Colaboratory is a service that lets you use Jupyter Notebook online, including GPU-backed computation. Execution time is limited, though: long runs are not possible, but it is quite sufficient for training within the allowed time and downloading the resulting model.


## Day1: Understanding where reinforcement learning fits

**Day1's Goals**

* Understand how reinforcement learning relates to keywords such as machine learning and artificial intelligence
* Understand the advantages and disadvantages of reinforcement learning compared with other learning methods
* Understand the basic mechanism of machine learning

**Summary**

* What is reinforcement learning?
  * Reinforcement learning ⊂ machine learning ⊂ artificial intelligence.
  * Machine learning = a method for making a "machine" (= a model) "learn".
  * Reinforcement learning = one kind of "learning" method.
  * Reinforcement learning aims to maximize the "total reward" obtained through a sequence of actions.
  * It learns how to evaluate actions and how to choose actions based on that evaluation (= the policy).
* Advantages and disadvantages of reinforcement learning
  * Advantage: it can handle tasks that are hard to evaluate directly (because it learns how to evaluate actions).
  * Disadvantage: you cannot control what behavior is learned (the model acquires it on its own).
* The basic mechanism of reinforcement learning
  * Reinforcement learning assumes that the given "environment" follows a fixed set of rules.
  * These rules are called a **Markov Decision Process (MDP)**.
  * The components of an MDP and their relationships can be diagrammed as below.
  * In an MDP, the reward depends on "the immediately preceding state and the transition destination".
  * This reward is called the **immediate reward**.
  * The total reward (= the sum of immediate rewards) naturally cannot be known in advance.
  * It is therefore estimated; the estimated value is called the **expected reward**, or the **value**.
  * When estimating, future immediate rewards are discounted.
  * The coefficient used for discounting is called the **discount factor** (a small numeric sketch follows the figure below).

![Components of an MDP and their relationships](doc/mdp.PNG)
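To make the discounted total reward concrete, here is a minimal sketch (not code from this repository) of how a sequence of immediate rewards is collapsed into a single value using a discount factor:

```
# Minimal illustrative sketch: discounted sum of immediate rewards.
def discounted_return(immediate_rewards, gamma=0.9):
    total = 0.0
    for t, reward in enumerate(immediate_rewards):
        total += (gamma ** t) * reward
    return total


print(discounted_return([0, 0, 1]))  # 0.9 ** 2 * 1 = 0.81
```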

**Exercises**

* [Implementing an MDP](https://github.com/icoxfog417/baby-steps-of-rl-ja/blob/master/DP/environment.py)

## Day2: Reinforcement learning methods (1): planning from the environment

**Day2's Goals**

* Understand the definition of "value", the measure used to evaluate actions
* Understand how to learn the "value" of states with dynamic programming, and how to implement it
* Understand how to learn the "policy" with dynamic programming, and how to implement it
* Understand the difference between model-based and model-free methods

**Summary**

* Defining "value"
  * Computing the "value" defined in Day1 requires immediate rewards at future time steps.
  * Future immediate rewards are, of course, unknown at the time of calculation.
  * The formula is therefore defined recursively, so that the part involving unknown values can be deferred.
  * There are several candidate immediate rewards that may occur, and which one occurs is probabilistic.
  * The reward is therefore expressed as an expected value (probability x value), i.e. action probability x immediate reward.
  * The formula that computes "value" recursively and as an expectation is called the **Bellman Equation**.
* Learning the "value" of states vs. learning the "policy"
  * The **Bellman Equation** uses the policy (action probabilities) to compute the expectation.
  * Once the expected value (the value) is computed, the policy is revised based on it (so that a higher value is obtained).
  * Computing values, updating the policy, recomputing values... this process is repeated.
  * In dynamic programming, the process of updating policy and value alternately is called **Policy Iteration**.
  * On the other hand, there is the simple idea that if values can be computed, you can just pick the state with the highest value.
  * In that case, value = policy.
  * In dynamic programming, the process that treats value = policy and updates only the value is called **Value Iteration**.
  * Whether to keep an explicit policy (policy-based) or to treat value = policy (value-based) is an important distinction in reinforcement learning.
* Model-based vs. model-free
  * With dynamic programming, the policy/value was learned without moving the agent at all.
  * This feat is possible because the transition function and reward function are known, so simulation is possible.
  * Methods that learn from such information about the environment are called **model-based** methods.
  * Since the transition and reward functions are rarely known, in practice they have to be estimated.
  * Methods that instead learn from experience gained by actually moving the agent are called **model-free** methods.
  * They are called model "free" because no model information (transition/reward function) is needed.
  * The more complex the environment, the harder it is to estimate a model, so model-free methods are generally used more often.
  * However, with the arrival of highly expressive DNNs, this is no longer always the case.
  * There are also many attempts to combine model-free and model-based methods.

**Exercises**

* [Defining value: implementing the Bellman Equation](https://github.com/icoxfog417/baby-steps-of-rl-ja/blob/master/DP/bellman_equation.py)
* [Implementing Value Iteration and Policy Iteration](https://github.com/icoxfog417/baby-steps-of-rl-ja/blob/master/DP/planner.py)

A simulator is provided so you can try out Value Iteration/Policy Iteration.
Run the following script and access the server it starts ([you can also try it here](https://baby-step-of-rl-ja-dp.herokuapp.com/)).

```
python DP/run_server.py
```

http://localhost:8888/

![application.PNG](doc/application.PNG)

* Specify rows and columns under Area and press the Draw button to create a maze of that size.
* Select cells in the maze and press the Cell Setting button to configure them.
* Treasure is a goal with a positive reward, Danger a goal with a negative reward. Block is a cell that cannot be entered.
* Once the maze is set up, press a button under Simulation to plan.
* Pressing either the Value Iteration or Policy Iteration button shows the result solved with that algorithm.

## Day3: Reinforcement learning methods (2): planning from experience

**Day3's Goals**

* Understand the three perspectives for making use of experience:
  1. The balance between accumulating and exploiting experience
  2. Whether to revise the plan from actual results or from predictions
  3. Whether to use experience to update the value or the policy
* Understand the opposing pair within each perspective
* Learn how to implement representative methods for each perspective

**Summary**

* What is "experience"?
  * It is the difference between the value estimated "before acting" and the actual value that becomes known "after acting" (a small numeric sketch follows the figure below).
  * The more you act, the more actual immediate rewards become known, and the less you depend on estimates.
  * This can also be seen as a difference between time steps: the point "before acting" vs. the point "after acting".
  * It is therefore called the **TD error (Temporal Difference error)**.

![Experience = TD error](doc/td.PNG)
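The numeric sketch promised above (illustrative only, not the repository's implementation): the value estimated before acting is compared against "immediate reward + discounted estimate after acting", and the gap, the TD error, is used to correct the estimate.

```
# Illustrative TD(0)-style update of a single state-value estimate.
gamma = 0.9
learning_rate = 0.1
V = {"before": 0.5, "after": 0.8}   # current value estimates

reward = 1.0                        # immediate reward observed by acting
td_error = reward + gamma * V["after"] - V["before"]
V["before"] += learning_rate * td_error
print(td_error, V["before"])        # 1.22 and 0.622
```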

* The balance between accumulating and exploiting experience
  * In the model-free setting, the transition/reward functions are unknown.
  * Raising the reliability of "experience" therefore requires multiple trials.
  * (Buying one lottery ticket and winning does not mean the lottery's winning probability is 100%.)
  * The number of actions is usually limited.
  * The available actions therefore have to be split between "raising the reliability of experience" (improving the estimates) and "acting on the experience you trust".
  * This is called the **exploration-exploitation trade-off**.
  * (Exploration = improving reliability, exploitation = acting on what you trust.)
  * The method that switches between exploration and exploitation with probability epsilon is called the **epsilon-greedy method**.
* Revising the plan from actual results or from predictions
  * "After acting" can be, at the shortest, after a single action and, at the longest, after the episode has finished.
  * The former is called the **TD method (TD(0))**, the latter the **Monte Carlo method**.
  * The longer you wait "after acting", the more the revision is grounded in actual results, but the later the revision happens.
  * Which to favor, actual results or timing, is a trade-off.
  * It is of course possible to take something in between TD(0) and Monte Carlo.
  * Using several steps "after acting" is called **multi-step learning**.
  * Combining experience with different numbers of steps is called the **TD(λ) method**.
* Using experience to update the value or the policy
  * Experience can be used to update either the value or the policy (value-based / policy-based).
  * The method that updates action values based on the TD method is called **Q-learning**.
  * ("Q" is commonly used to denote action value, whereas state value is usually denoted "V".)
  * The method that updates the policy based on the TD method is called **SARSA (State-Action-Reward-State-Action)**.
  * In SARSA, value estimates assume that the next action is decided by the policy. This assumption is called **On-policy**.
  * Assuming, as in value-based methods, that the next action is "the action with the maximum value" is called **Off-policy**.
  * (It is called this because there is no policy = "off".)
  * Q-learning is off-policy; SARSA is on-policy (a small sketch of the difference follows the table below).
  * SARSA uses the same "Q" for both policy evaluation and the policy itself.
  * Alternatively, evaluation and policy can be separated, as in Policy Iteration.
  * The method that separates them, with the policy side as the Actor and the evaluation side as the Critic, is called **Actor-Critic**.
  * Actor-Critic can be seen as a combination of policy-based (Actor) and value-based (Critic).

Summarizing the methods along the three axes of revision basis (actual results/prediction), revision target (value/policy), and estimation assumption (On-policy/Off-policy) gives the following.
| Method | Revision basis (prediction / actual results) | Revision target (value / policy) | Estimation assumption (Off-policy / On-policy) |
| --- | --- | --- | --- |
| Q-learning | Prediction (TD) | Value | Off-policy |
| Monte Carlo | Actual results | Value | |
| SARSA | Prediction (TD) | Policy | On-policy |
| Actor Critic | Prediction (TD) | Value & policy | On-policy |
| Off-policy Actor Critic | Prediction (TD) | Value & policy | Off-policy |
| On-policy Monte Carlo | Actual results | | On-policy |
| Off-policy Monte Carlo | Actual results | | Off-policy |
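To make the Off-policy/On-policy column concrete, here is a hedged sketch (not taken from EL/q_learning.py or EL/sarsa.py) of how Q-learning and SARSA form their one-step targets from the same experience; the next state's action values below are made up:

```
import numpy as np

# Illustrative one-step targets for Q-learning (Off-policy) and SARSA (On-policy).
gamma = 0.9
next_q = np.array([0.2, 0.7, 0.4])      # Q values at the next state

reward = 0.0
q_learning_target = reward + gamma * next_q.max()           # assumes the greedy action
actual_next_action = 2                                       # action the current policy really picked
sarsa_target = reward + gamma * next_q[actual_next_action]  # follows the current policy
print(q_learning_target, sarsa_target)  # Off-policy target 0.63, On-policy target ~0.36
```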
**Exercises**

* The balance between accumulating and exploiting experience
  * [Epsilon-Greedy](https://github.com/icoxfog417/baby-steps-of-rl-ja/blob/master/EL/notebooks/Epsilon%26Greedy.ipynb)
* Revising the plan from actual results or from predictions
  * [Monte Carlo](https://github.com/icoxfog417/baby-steps-of-rl-ja/blob/master/EL/notebooks/Monte%20Carlo.ipynb)
  * [Temporal Difference](https://github.com/icoxfog417/baby-steps-of-rl-ja/blob/master/EL/notebooks/Q-learning.ipynb)
* Using experience to update the value or the policy
  * [Value-based & off-policy: Q-learning](https://github.com/icoxfog417/baby-steps-of-rl-ja/blob/master/EL/notebooks/Q-learning.ipynb)
  * [Policy-based & on-policy: SARSA](https://github.com/icoxfog417/baby-steps-of-rl-ja/blob/master/EL/notebooks/SARSA.ipynb)
  * [Value-based & policy-based: Actor Critic](https://github.com/icoxfog417/baby-steps-of-rl-ja/blob/master/EL/notebooks/Actor%26Critic.ipynb)

## Day4: Applying neural networks to reinforcement learning

**Day4's Goals**

* The advantages of using a neural network as the function
* How to implement value estimation as a parameterized function
* How to implement the policy as a parameterized function

**Summary**

* Turning value estimation/policy into functions
  * Up to Day3, the value of each action in each state was kept in a table Q[s][a].
  * It is obvious that this breaks down once the number of states/actions grows.
  * Turning the table into a function is one way to cope with this combinatorial explosion.
  * Reinforcement learning that uses a (deep) neural network as that function is specifically called "deep reinforcement learning".
* Advantages and disadvantages of using a neural network as the function
  * Data close to the "state" that humans actually observe can be used for the agent's learning.
  * This is because DNNs are good at feature extraction (e.g. CNNs for images).
  * However, using neural networks also brings disadvantages such as longer training time (details in [Day5](https://github.com/icoxfog417/baby-steps-of-rl-ja#day5-%E5%BC%B7%E5%8C%96%E5%AD%A6%E7%BF%92%E3%81%AE%E5%BC%B1%E7%82%B9)).
* Implementing value estimation as a parameterized function
  * A function (Q-function) that takes a state and outputs action values (= Q) is implemented with a neural network (a minimal sketch appears after the figure at the end of this section).
  * The implementation that uses a CNN as the neural network is called **Deep Q-Network (DQN)**.
  * There was research using neural networks before DQN, but it suffered from unstable training.
  * DQN overcomes this with three techniques that stabilize training.
  * The three are **Experience Replay**, **Fixed Target Q-Network**, and **reward clipping**.
  * [Rainbow](https://arxiv.org/abs/1710.02298) adds six more techniques on top of DQN.
* Implementing the policy as a parameterized function
  * The policy outputs action probabilities, which cannot be evaluated by a before/after difference the way values can.
  * (When choosing between A and B, you can evaluate how different the chosen A turned out from what you expected: a difference evaluation.)
  * (But "what would have happened had I taken B?" cannot be known without rewinding time.)
  * So instead of shrinking a difference, as with values, the expected value obtained by following the policy is maximized directly.
  * An expected value is computed as probability x value.
  * The policy's expected value can be computed as "probability of reaching a state" x "action probability" x "value obtained by the action" (J(θ)).
  * This expected value is maximized with gradient methods; this approach is called **Policy Gradient**.
  * As learned in Day3, "the value obtained by the action" comes in several variants: estimated by prediction, computed from actual results, and so on.
  * The value of an action minus the value of the state, i.e. the action's pure contribution, is called the **Advantage**.
  * The Advantage can be computed with the action value taken from actual results (Monte Carlo) and the state value from prediction (TD).
  * The method that treats the state value as the Critic and the policy as the Actor, learning with the Advantage, is called **Advantage Actor Critic (A2C)**.
  * Because Policy Gradient updates from "experience under the current policy", Experience Replay, which reuses past experience, cannot be used.
  * Policy Gradient is very delicate about how gradient updates are applied.
  * TRPO and PPO are methods that therefore learn (gradually) so that overly large updates do not occur.
* Value estimation or policy?
  * Value estimation has two drawbacks:
  * 1. Even if two actions have nearly equal values, only the "maximum" (the slightly larger one) is ever taken.
  * 2. It is hard to handle a large number of actions.
  * A policy assigns probabilities according to the size of the values and can also handle many (even continuous) actions.
  * (A2C outputs a probability per action, which is effectively the same as value estimation; there are two methods that overcome this.)
  * 1. Output a single best action, as in value estimation (Deterministic Policy Gradient, **DPG**).
  * 2. Output the parameters of the action distribution (mean, variance, etc.).
  * However, learning a policy tends to be less stable than learning value estimates.
  * As of 2018, it is also not well understood whether Policy Gradient learning really works as intended.
  * Existing methods can be classified as follows.

![Classification of reinforcement learning methods](doc/rl_ways.PNG)
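As a minimal sketch of what "turning the Q table into a parameterized function" looks like (the real implementations are under FN/; the layer sizes and shapes below are arbitrary assumptions):

```
import numpy as np
from tensorflow import keras as K

# Illustrative only: a state vector goes in, one estimated value per action comes out.
state_size, num_actions = 4, 2
model = K.Sequential([
    K.layers.Dense(16, activation="relu", input_shape=(state_size,)),
    K.layers.Dense(num_actions, activation="linear"),
])
model.compile(optimizer="adam", loss="mse")  # in practice trained toward TD targets

state = np.random.random((1, state_size))
print(model.predict(state))  # estimated value of each action in this state
```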

**Exercises**

* [How neural networks work](https://github.com/icoxfog417/baby-steps-of-rl-ja/tree/master/FN/nn_tutorial)
* [Implementing the value function with a neural network](https://github.com/icoxfog417/baby-steps-of-rl-ja/blob/master/FN/value_function_agent.py)
* [Implementing the value function with a DNN: DQN](https://github.com/icoxfog417/baby-steps-of-rl-ja/blob/master/FN/dqn_agent.py)
* [Implementing the policy with a neural network: Policy Gradient](https://github.com/icoxfog417/baby-steps-of-rl-ja/blob/master/FN/policy_gradient_agent.py)
* [Implementing the policy with a DNN: A2C](https://github.com/icoxfog417/baby-steps-of-rl-ja/blob/master/FN/a2c_agent.py)

## Day5: Weaknesses of reinforcement learning

**Day5's goals**

This day explains the weaknesses of reinforcement learning, in particular of deep reinforcement learning using neural networks. The weaknesses are the following three:

* Poor sample efficiency
* Often falls into locally optimal behavior or overfits
* Low reproducibility

**Summary**

* Poor sample efficiency
  * The Rainbow paper reports how much training is needed to reach human-level scores on Atari games.
  * According to it, even Rainbow needs roughly 166 hours of play time (at 30 fps).
* Often falls into locally optimal behavior or overfits
  * Local optimum: in a competitive game, learning a way to win only against a particular opponent.
  * Overfitting: learning what gamers would call cheat plays.
  * ...such things can happen.
* Low reproducibility
  * Results can change not only with hyperparameter settings but even from run to run.
* Countermeasures
  * Fundamental countermeasures are introduced in [Day6](https://github.com/icoxfog417/baby-steps-of-rl-ja#day6-%E5%BC%B7%E5%8C%96%E5%AD%A6%E7%BF%92%E3%81%AE%E5%BC%B1%E7%82%B9%E3%82%92%E5%85%8B%E6%9C%8D%E3%81%99%E3%82%8B%E3%81%9F%E3%82%81%E3%81%AE%E6%89%8B%E6%B3%95); Day5 introduces countermeasures that take these weaknesses as given.
  * The basic principle is "don't let a single training run go to waste" (a minimal logging sketch follows the figure below).
  * Because of "low reproducibility", multiple experiments are needed.
  * But because of "poor sample efficiency", training takes a lot of time.
  * A single experiment therefore consists of long runs times multiple executions, which naturally takes time.
  * You need to avoid, as much as possible, having to redo a time-consuming experiment because of a silly mistake.
  * You also want to extract as much information as possible from each experiment.
  * To achieve this, the implementations from Day4 onward do two things: "module separation" and "logging".

![Implementation framework for reinforcement learning](doc/train_architecture.PNG)
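In the spirit of "don't let a run go to waste", the logging meant here can be as small as appending per-episode results to a file so that separate runs can be compared later; the file name and columns below are arbitrary:

```
import csv

# Illustrative sketch: persist per-episode rewards so long runs are never lost.
def log_episode(path, episode, reward):
    with open(path, "a", newline="") as f:
        csv.writer(f).writerow([episode, reward])


for episode, reward in enumerate([0.0, 0.0, 1.0]):
    log_episode("training_log.csv", episode, reward)
```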

## Day6: Methods for overcoming the weaknesses of reinforcement learning

**Day6's goals**

Day6 explains fundamental remedies (algorithmic improvements) for the weaknesses introduced in Day5.

* Dealing with "poor sample efficiency"
* Dealing with "low reproducibility"
* Dealing with "often falls into locally optimal behavior or overfits"

**Summary**

* Dealing with "poor sample efficiency"
  * Many methods have been proposed to improve sample efficiency in reinforcement learning (see the figure below).
  * This book covers one of them, "improving environment perception".
  * Deep reinforcement learning deals with near-raw data such as screens (the kind of input humans receive).
  * The model therefore has to learn two things at once: "feature extraction from the input (screen)" and "how to act".
  * This can be seen as a cause of the low learning efficiency.
  * "Improving environment perception" supports the agent in obtaining information from the environment. There are two approaches (a small representation-learning sketch follows the figure below):
    * Combining with model-based methods: build a simulator (model) of the environment, making learning in an abstracted environment possible.
    * Representation learning: transform the state obtained from the environment into a representation that is easier to recognize.
  * This book introduces **Dyna** as the model-based combination and **World Models** as representation learning.

![Methods for improving sample efficiency](doc/sample_improve.PNG)
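A rough sketch of the representation-learning idea mentioned above (this is not the World Models implementation; the observation and code sizes are arbitrary): compress raw observations into a small code and let the agent learn from that code instead of the raw input.

```
from tensorflow import keras as K

# Illustrative autoencoder: raw observation -> small code -> reconstruction.
obs_size, code_size = 64, 8
encoder = K.Sequential([
    K.layers.Dense(code_size, activation="relu", input_shape=(obs_size,))
])
decoder = K.Sequential([
    K.layers.Dense(obs_size, activation="sigmoid", input_shape=(code_size,))
])
autoencoder = K.Sequential([encoder, decoder])
autoencoder.compile(optimizer="adam", loss="mse")
# autoencoder.fit(observations, observations, ...)   # train to reconstruct observations
# The agent would then learn from encoder.predict(observation) instead of raw pixels.
```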

* Dealing with "low reproducibility"
  * One factor behind the low reproducibility is that "training is unstable".
  * On this point, optimization algorithms different from gradient methods have recently been drawing attention.
  * One of them is **evolution strategies**.
  * Gradient methods take the approach of "gradually improving from an initial state".
  * Evolution strategies, by contrast, take the approach of "narrowing down from many candidates".
* Dealing with "often falls into locally optimal behavior or overfits"
  * A simple remedy is to "have a human guide the agent to some extent". There are two ways to do this (a small imitation-learning sketch follows the figure below):
    * **Imitation learning**: a human demonstrates examples, and the agent learns to act along those lines.
    * **Inverse reinforcement learning**: the reward is inferred backward from the demonstrations, and behavior is learned based on it.
  * Imitation learning resembles supervised learning, but it is hard to prepare demonstrations for every case.
  * (A demonstration of avoiding an accident, for example.)
  * The agent therefore needs to build on the demonstrations while also handling cases that were not demonstrated.
  * Inverse reinforcement learning goes through the learning process below (see the figure).
  * Ordinary reinforcement learning only needs process 3 (Train Agent under Reward Function).
  * Inverse reinforcement learning, however, then updates the reward function, retrains with the updated reward, and repeats.
  * Estimation therefore takes time, though several methods have been proposed to mitigate this.

![The inverse reinforcement learning process](doc/irl.png)
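For the imitation-learning idea above, its simplest form (behavioral cloning, which the DAgger exercise in IM/dagger.py builds on) can be sketched as supervised learning on expert state-action pairs; the data and network below are stand-ins, not the book's implementation:

```
import numpy as np
from tensorflow import keras as K

# Illustrative behavioral cloning: fit a policy to (state, expert action) pairs.
state_size, num_actions = 4, 2
expert_states = np.random.random((100, state_size))        # stand-in for demonstration states
expert_actions = np.random.randint(num_actions, size=100)  # stand-in for demonstrated actions

policy = K.Sequential([
    K.layers.Dense(16, activation="relu", input_shape=(state_size,)),
    K.layers.Dense(num_actions, activation="softmax"),
])
policy.compile(optimizer="adam", loss="sparse_categorical_crossentropy")
policy.fit(expert_states, expert_actions, epochs=5, verbose=0)

action = np.argmax(policy.predict(expert_states[:1]))      # imitate the expert's choice
```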

**Exercises**

* Dealing with "poor sample efficiency"
  * [Combining with model-based methods: Dyna](https://github.com/icoxfog417/baby-steps-of-rl-ja/tree/master/MM)
* Dealing with "low reproducibility"
  * [A new way of learning: evolution strategies](https://github.com/icoxfog417/baby-steps-of-rl-ja/tree/master/EV)
* Dealing with "often falls into locally optimal behavior or overfits"
  * [Imitation learning: DAgger](https://github.com/icoxfog417/baby-steps-of-rl-ja/tree/master/IM)
  * [Inverse reinforcement learning: MaxEntropy/Bayesian](https://github.com/icoxfog417/baby-steps-of-rl-ja/tree/master/IRL)


## Day7: Application areas of reinforcement learning

**Day7's goals**

* Understand the two patterns for applying reinforcement learning
* Know the research and real-world cases for each of the two patterns
* Know the tools/services that realize the two patterns

**Summary**

* The two patterns for applying reinforcement learning
  * Applications of reinforcement learning can be broadly divided into "optimizing behavior" and "optimizing learning" (see the figure below).
  * Optimizing behavior uses the behavior acquired through reinforcement learning as is.
  * Optimizing learning uses reinforcement learning's training process of "maximizing reward".
  * Research, case studies, tools, and services are introduced along these two categories.

![Patterns for applying reinforcement learning](doc/rl_application.PNG)

## Support Content

Reference content is provided for readers who are completely new to programming. Books for learning programming are plentiful these days, so feel free to supplement with those as well.

[python_exercises](https://github.com/icoxfog417/python_exercises)
-------------------------------------------------------------------------------- /doc/application.PNG: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/icoxfog417/baby-steps-of-rl-ja/1dadc208d6e9f50e010e6b5d2dcc5d9fc11eb51d/doc/application.PNG
-------------------------------------------------------------------------------- /doc/be.PNG: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/icoxfog417/baby-steps-of-rl-ja/1dadc208d6e9f50e010e6b5d2dcc5d9fc11eb51d/doc/be.PNG
-------------------------------------------------------------------------------- /doc/colab_a2c.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/icoxfog417/baby-steps-of-rl-ja/1dadc208d6e9f50e010e6b5d2dcc5d9fc11eb51d/doc/colab_a2c.png
-------------------------------------------------------------------------------- /doc/colab_dqn.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/icoxfog417/baby-steps-of-rl-ja/1dadc208d6e9f50e010e6b5d2dcc5d9fc11eb51d/doc/colab_dqn.png
-------------------------------------------------------------------------------- /doc/frozen_lake.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/icoxfog417/baby-steps-of-rl-ja/1dadc208d6e9f50e010e6b5d2dcc5d9fc11eb51d/doc/frozen_lake.png
-------------------------------------------------------------------------------- /doc/irl.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/icoxfog417/baby-steps-of-rl-ja/1dadc208d6e9f50e010e6b5d2dcc5d9fc11eb51d/doc/irl.png
-------------------------------------------------------------------------------- /doc/mdp.PNG: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/icoxfog417/baby-steps-of-rl-ja/1dadc208d6e9f50e010e6b5d2dcc5d9fc11eb51d/doc/mdp.PNG
-------------------------------------------------------------------------------- /doc/rl_application.PNG: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/icoxfog417/baby-steps-of-rl-ja/1dadc208d6e9f50e010e6b5d2dcc5d9fc11eb51d/doc/rl_application.PNG
-------------------------------------------------------------------------------- /doc/rl_ways.PNG: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/icoxfog417/baby-steps-of-rl-ja/1dadc208d6e9f50e010e6b5d2dcc5d9fc11eb51d/doc/rl_ways.PNG
-------------------------------------------------------------------------------- /doc/sample_improve.PNG: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/icoxfog417/baby-steps-of-rl-ja/1dadc208d6e9f50e010e6b5d2dcc5d9fc11eb51d/doc/sample_improve.PNG
-------------------------------------------------------------------------------- /doc/td.PNG: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/icoxfog417/baby-steps-of-rl-ja/1dadc208d6e9f50e010e6b5d2dcc5d9fc11eb51d/doc/td.PNG
-------------------------------------------------------------------------------- /doc/tradeoffs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/icoxfog417/baby-steps-of-rl-ja/1dadc208d6e9f50e010e6b5d2dcc5d9fc11eb51d/doc/tradeoffs.png -------------------------------------------------------------------------------- /doc/train_architecture.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/icoxfog417/baby-steps-of-rl-ja/1dadc208d6e9f50e010e6b5d2dcc5d9fc11eb51d/doc/train_architecture.PNG -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/icoxfog417/baby-steps-of-rl-ja/1dadc208d6e9f50e010e6b5d2dcc5d9fc11eb51d/environment.yml -------------------------------------------------------------------------------- /requirements-colab.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/icoxfog417/baby-steps-of-rl-ja/1dadc208d6e9f50e010e6b5d2dcc5d9fc11eb51d/requirements-colab.txt -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/icoxfog417/baby-steps-of-rl-ja/1dadc208d6e9f50e010e6b5d2dcc5d9fc11eb51d/requirements.txt -------------------------------------------------------------------------------- /welcome.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from tensorflow.python import keras as K 3 | import gym 4 | import gym_ple 5 | 6 | 7 | def welcome(): 8 | """ 9 | Code to check installation of basic libraries 10 | """ 11 | 12 | env = gym.make("Catcher-v0") 13 | num_action = env.action_space.n 14 | episode_count = 10 15 | 16 | s = env.reset() 17 | brain = K.Sequential() 18 | brain.add(K.layers.Dense(num_action, input_shape=[np.prod(s.shape)], 19 | activation="softmax")) 20 | 21 | def policy(s): 22 | evaluation = brain.predict(np.array([s.flatten()])) 23 | return np.argmax(evaluation) 24 | 25 | for e in range(episode_count): 26 | s = env.reset() 27 | done = False 28 | while not done: 29 | env.render(mode="human") 30 | a = policy(s) 31 | n_state, reward, done, info = env.step(a) 32 | s = n_state 33 | 34 | 35 | if __name__ == "__main__": 36 | welcome() 37 | --------------------------------------------------------------------------------