├── requirements.txt
├── .DS_Store
├── deeprl_hw1
│   ├── __init__.py
│   ├── rl1.pyc
│   ├── .DS_Store
│   ├── __init__.pyc
│   ├── lake_envs.pyc
│   ├── queue_envs.pyc
│   ├── rlvaliterchngd.pyc
│   ├── driver3.py
│   ├── lake_envs.py
│   ├── rl.py
│   ├── queue_envs.py
│   ├── rl1.py
│   └── rlvaliterchngd.py
├── results
│   ├── .DS_Store
│   ├── 2ca4x4.png
│   ├── results.docx
│   ├── part a4x4
│   │   ├── 2i.png
│   │   ├── 2c4x4.png
│   │   ├── 2e4x4.png
│   │   ├── 1gvalue.csv
│   │   └── 1gpolicy.csv
│   ├── part a4x4 with max
│   │   ├── 1c.png
│   │   ├── 1e.png
│   │   ├── 1gpolicy.csv
│   │   └── 1gvalue.csv
│   ├── part a8x8 with max
│   │   ├── .DS_Store
│   │   ├── 2c8x8.png
│   │   ├── 2e8x8.png
│   │   ├── 1gpolicy.csv
│   │   └── 1gvalue.csv
│   ├── Deterministic-4x4-neg-reward-FrozenLake-v0.png
│   ├── Deterministic-4x4-neg-reward-FrozenLake-v0gamma0.16.png
│   ├── Deterministic-4x4-neg-reward-FrozenLake-v0_2cvaluegamma0.16.csv
│   ├── Stochastic-4x4-FrozenLake-v0_2bvalue.csv
│   └── Deterministic-4x4-neg-reward-FrozenLake-v0_2cvalue.csv
├── Stochastic-4x4-FrozenLake-v0.png
├── .ipynb_checkpoints
│   └── 21-checkpoint.ipynb
├── Deterministic-4x4-neg-reward-FrozenLake-v0.png
├── setup.py
├── .idea
│   ├── modules.xml
│   ├── misc.xml
│   ├── deeprl_hw1_src.iml
│   ├── inspectionProfiles
│   │   └── Project_Default.xml
│   └── workspace.xml
├── README.md
├── example.py
├── DeterministicFrozenLake.py
├── StochasticFrozenLake.py
├── DeterministicFrozenNegReward.py
└── 21.ipynb
/requirements.txt:
--------------------------------------------------------------------------------
1 | future
2 | gym
3 | numpy
4 | six
5 | -e .
6 |
--------------------------------------------------------------------------------
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aaksham/frozenlake/HEAD/.DS_Store
--------------------------------------------------------------------------------
/deeprl_hw1/__init__.py:
--------------------------------------------------------------------------------
1 | import deeprl_hw1.lake_envs
2 | import deeprl_hw1.queue_envs
3 |
--------------------------------------------------------------------------------
/results/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aaksham/frozenlake/HEAD/results/.DS_Store
--------------------------------------------------------------------------------
/deeprl_hw1/rl1.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aaksham/frozenlake/HEAD/deeprl_hw1/rl1.pyc
--------------------------------------------------------------------------------
/results/2ca4x4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aaksham/frozenlake/HEAD/results/2ca4x4.png
--------------------------------------------------------------------------------
/deeprl_hw1/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aaksham/frozenlake/HEAD/deeprl_hw1/.DS_Store
--------------------------------------------------------------------------------
/results/results.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aaksham/frozenlake/HEAD/results/results.docx
--------------------------------------------------------------------------------
/deeprl_hw1/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aaksham/frozenlake/HEAD/deeprl_hw1/__init__.pyc
--------------------------------------------------------------------------------
/deeprl_hw1/lake_envs.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aaksham/frozenlake/HEAD/deeprl_hw1/lake_envs.pyc
--------------------------------------------------------------------------------
/results/part a4x4/2i.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aaksham/frozenlake/HEAD/results/part a4x4/2i.png
--------------------------------------------------------------------------------
/deeprl_hw1/queue_envs.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aaksham/frozenlake/HEAD/deeprl_hw1/queue_envs.pyc
--------------------------------------------------------------------------------
/results/part a4x4/2c4x4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aaksham/frozenlake/HEAD/results/part a4x4/2c4x4.png
--------------------------------------------------------------------------------
/results/part a4x4/2e4x4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aaksham/frozenlake/HEAD/results/part a4x4/2e4x4.png
--------------------------------------------------------------------------------
/deeprl_hw1/rlvaliterchngd.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aaksham/frozenlake/HEAD/deeprl_hw1/rlvaliterchngd.pyc
--------------------------------------------------------------------------------
/Stochastic-4x4-FrozenLake-v0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aaksham/frozenlake/HEAD/Stochastic-4x4-FrozenLake-v0.png
--------------------------------------------------------------------------------
/results/part a4x4 with max/1c.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aaksham/frozenlake/HEAD/results/part a4x4 with max/1c.png
--------------------------------------------------------------------------------
/results/part a4x4 with max/1e.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aaksham/frozenlake/HEAD/results/part a4x4 with max/1e.png
--------------------------------------------------------------------------------
/results/part a8x8 with max/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aaksham/frozenlake/HEAD/results/part a8x8 with max/.DS_Store
--------------------------------------------------------------------------------
/results/part a8x8 with max/2c8x8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aaksham/frozenlake/HEAD/results/part a8x8 with max/2c8x8.png
--------------------------------------------------------------------------------
/results/part a8x8 with max/2e8x8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aaksham/frozenlake/HEAD/results/part a8x8 with max/2e8x8.png
--------------------------------------------------------------------------------
/.ipynb_checkpoints/21-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [],
3 | "metadata": {},
4 | "nbformat": 4,
5 | "nbformat_minor": 0
6 | }
7 |
--------------------------------------------------------------------------------
/results/part a4x4/1gvalue.csv:
--------------------------------------------------------------------------------
1 | 0.592,0.657,0.730,0.657
2 | 0.657,0.001,0.811,0.002
3 | 0.730,0.811,0.901,0.002
4 | 0.007,0.901,1.001,0.001
--------------------------------------------------------------------------------
/Deterministic-4x4-neg-reward-FrozenLake-v0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aaksham/frozenlake/HEAD/Deterministic-4x4-neg-reward-FrozenLake-v0.png
--------------------------------------------------------------------------------
/results/part a4x4/1gpolicy.csv:
--------------------------------------------------------------------------------
1 | 0.5916,0.6572,0.7301,0.6571
2 | 0.6572,0.0005,0.8111,0.0023
3 | 0.7301,0.8111,0.9011,0.0021
4 | 0.0074,0.9011,1.0011,0.0011
--------------------------------------------------------------------------------
/results/Deterministic-4x4-neg-reward-FrozenLake-v0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aaksham/frozenlake/HEAD/results/Deterministic-4x4-neg-reward-FrozenLake-v0.png
--------------------------------------------------------------------------------
/results/Deterministic-4x4-neg-reward-FrozenLake-v0gamma0.16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aaksham/frozenlake/HEAD/results/Deterministic-4x4-neg-reward-FrozenLake-v0gamma0.16.png
--------------------------------------------------------------------------------
/results/Deterministic-4x4-neg-reward-FrozenLake-v0_2cvaluegamma0.16.csv:
--------------------------------------------------------------------------------
1 | -0.999935,0.000065,-0.999961,0.000039
2 | 0.000065,0.000065,0.000039,0.000039
3 | 0.000046,0.000010,0.000101,0.000101
4 | 0.000046,0.000007,1.190390,1.190390
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from distutils.core import setup
4 |
5 | setup(
6 | name='DeepRL Homework 1',
7 | version='1.0',
8 | description='Library for 10-703 Homework 1',
9 | packages=['deeprl_hw1'])
10 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/results/part a4x4 with max/1gpolicy.csv:
--------------------------------------------------------------------------------
1 | 5.923973887191247290e-01,6.580073887191245641e-01,7.309073887191246399e-01,6.578166498472122203e-01
2 | 6.580073887191245641e-01,5.799280908488830578e-03,8.119073887191247119e-01,7.018522510759056705e-03
3 | 7.309073887191246399e-01,8.119073887191247119e-01,9.019073887191245698e-01,4.045348128936829789e-03
4 | 8.663607960043642059e-03,9.019073887191245698e-01,1.001907388719124548e+00,1.907388719124641875e-03
5 |
--------------------------------------------------------------------------------
/results/part a4x4 with max/1gvalue.csv:
--------------------------------------------------------------------------------
1 | 5.940389180249019407e-01,6.596489180249021089e-01,7.325489180249019627e-01,6.592940262224117332e-01
2 | 6.596489180249021089e-01,5.975301847172851538e-03,8.135489180249019237e-01,6.305401447811602444e-03
3 | 7.325489180249019627e-01,8.135489180249019237e-01,9.035489180249020036e-01,8.526760647121702126e-04
4 | 8.310233909333586064e-03,9.035489180249020036e-01,1.003548918024901981e+00,3.548918024901988755e-03
5 |
--------------------------------------------------------------------------------
/deeprl_hw1/driver3.py:
--------------------------------------------------------------------------------
1 | import deeprl_hw1.queue_envs as qenv
2 | import numpy
3 | P1 = 0.1
4 | P2 = 0.9
5 | P3 = 0.1
6 |
7 | env=qenv.QueueEnv(P1,P2,P3)
8 | #ps=env.query_model((1,0,0,0),1)
9 | #print ps
10 | ps=env.query_model((1,5,3,4),3)
11 | print ps
12 | numpy.random.seed(0)
13 | env.reset()
14 | env.render()
15 | env._step(1)
16 | env.render()
17 | env._step(3)
18 | env.render()
19 | #
20 | # ps=env.query_model((1,5,5,5),3)
21 | # print ps
22 |
--------------------------------------------------------------------------------
/results/Stochastic-4x4-FrozenLake-v0_2bvalue.csv:
--------------------------------------------------------------------------------
1 | 7.210080319262232584e-02,6.500778087490913237e-02,7.867964029828880546e-02,5.967327973610045411e-02
2 | 9.499989900152405742e-02,8.464270518010267794e-03,1.174987390033538359e-01,2.065368390832942116e-03
3 | 1.488080916636685680e-01,2.513906886597030987e-01,3.040243032811825175e-01,1.272454070772601007e-03
4 | 6.766787927394104021e-03,3.841673769823148454e-01,6.439724196755912677e-01,6.094194795894227086e-03
5 |
--------------------------------------------------------------------------------
/results/Deterministic-4x4-neg-reward-FrozenLake-v0_2cvalue.csv:
--------------------------------------------------------------------------------
1 | -9.999608187719284391e-01,3.918122807154976087e-05,-9.999774521696255247e-01,2.254783037451089448e-05
2 | 3.918122807154976087e-05,3.918122807154976087e-05,2.254783037451089448e-05,2.254783037451089448e-05
3 | 9.128853355511469254e-05,6.268996491447961536e-06,5.862065414633444213e-05,5.862065414633444213e-05
4 | 9.128853355511469254e-05,1.460616536881835047e-05,1.190425913320509954e+00,1.190425913320509954e+00
5 |
--------------------------------------------------------------------------------
/.idea/deeprl_hw1_src.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
13 |
14 |
15 |
--------------------------------------------------------------------------------
/results/part a8x8 with max/1gpolicy.csv:
--------------------------------------------------------------------------------
1 | 2.579595231504817621e-01,2.862024767985817952e-01,3.175835364075818013e-01,3.524513804175817833e-01,3.911934293175818311e-01,4.342401503175817856e-01,4.820698403175819879e-01,5.352139403175819599e-01
2 | 2.862024767985817952e-01,3.175835364075818013e-01,3.524513804175817833e-01,3.911934293175818311e-01,4.342401503175817856e-01,4.820698403175819879e-01,5.352139403175819599e-01,5.942629403175818670e-01
3 | 3.175835364075818013e-01,3.524513804175817833e-01,3.911934293175818311e-01,3.767824654025735392e-03,4.820698403175819879e-01,5.352139403175819599e-01,5.942629403175818670e-01,6.598729403175817021e-01
4 | 3.524513804175817833e-01,3.911934293175818311e-01,4.342401503175817856e-01,4.820698403175819879e-01,5.352139403175819599e-01,3.942658874654073053e-03,6.598729403175817021e-01,7.327729403175817779e-01
5 | 3.172062423758236216e-01,3.520740863858236591e-01,3.908161352858235960e-01,9.125019350097088961e-04,5.942629403175818670e-01,6.598729403175817021e-01,7.327729403175817779e-01,8.137729403175818499e-01
6 | 2.854856181382412483e-01,8.370799156727023668e-03,6.671803637201679030e-03,5.942629403175818670e-01,6.598729403175817021e-01,7.327729403175817779e-01,7.436799765772327507e-03,9.037729403175817078e-01
7 | 3.168666777472413099e-01,4.493092146848743848e-03,4.816925462858237528e-01,5.348366462858237247e-01,7.133581024200938078e-03,8.137729403175818499e-01,2.763054665441602568e-04,1.003772940317581686e+00
8 | 3.517345217572412364e-01,3.904765706572413952e-01,4.335232916572414053e-01,5.762616389577770136e-03,8.137729403175818499e-01,9.037729403175817078e-01,1.003772940317581686e+00,3.772940317581741088e-03
9 |
--------------------------------------------------------------------------------
/results/part a8x8 with max/1gvalue.csv:
--------------------------------------------------------------------------------
1 | 2.542550444410008881e-01,2.824979980891009212e-01,3.138790576981008718e-01,3.487469017081009648e-01,3.874889506081009016e-01,4.305356716081008006e-01,4.783653616081008364e-01,5.315094616081008638e-01
2 | 2.824979980891009212e-01,3.138790576981008718e-01,3.487469017081009648e-01,3.874889506081009016e-01,4.305356716081008006e-01,4.783653616081008364e-01,5.315094616081008638e-01,5.905584616081007709e-01
3 | 3.138790576981008718e-01,3.487469017081009648e-01,3.874889506081009016e-01,7.373080087904547095e-03,4.783653616081008364e-01,5.315094616081008638e-01,5.905584616081007709e-01,6.561684616081009391e-01
4 | 3.487469017081009648e-01,3.874889506081009016e-01,4.305356716081008006e-01,4.783653616081008364e-01,5.315094616081008638e-01,1.307747491744603533e-03,6.561684616081009391e-01,7.290684616081009040e-01
5 | 3.138722115372908905e-01,3.487400555472908170e-01,3.874821044472907539e-01,1.526370220697159533e-03,5.905584616081007709e-01,6.561684616081009391e-01,7.290684616081009040e-01,8.100684616081009759e-01
6 | 2.824849903835617848e-01,4.610209748014226609e-03,1.384175249950048581e-03,5.905584616081007709e-01,6.561684616081009391e-01,7.290684616081009040e-01,1.963379439910021348e-04,9.000684616081008338e-01
7 | 3.138660499925617353e-01,3.178091108843970371e-03,4.783585154472907996e-01,5.315026154472907161e-01,8.674123898821699957e-03,8.100684616081009759e-01,5.057248319540319850e-03,1.000068461608100812e+00
8 | 3.487338940025616618e-01,3.874759429025617097e-01,4.305226639025617197e-01,6.594137748716060420e-03,8.100684616081009759e-01,9.000684616081008338e-01,1.000068461608100812e+00,6.846160810080188482e-05
9 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Reinforcement Learning
2 |
3 | ## OpenAI Gym Environments
4 | ### Creating the environments
5 |
6 | To create an environment, use the following code snippet:
7 |
8 | ```
9 | import gym
10 | import deeprl_hw1.lake_envs
11 |
12 | env = gym.make('Deterministic-4x4-FrozenLake-v0')
13 | ```
14 |
15 | ### Actions
16 |
17 | There are four actions: LEFT, UP, DOWN, and RIGHT, represented as
18 | integers. The `deeprl_hw1.lake_envs` module contains variables to
19 | reference these. For example:
20 |
21 | ```
22 | print(deeprl_hw1.lake_envs.LEFT)
23 | ```
24 |
25 | will print out the number 0.
26 |
27 | ### Environment Attributes
28 |
29 | This class contains the following important attributes:
30 |
31 | - `nS` :: number of states
32 | - `nA` :: number of actions
33 | - `P` :: transitions, rewards, terminals
34 |
35 | The `P` attribute will be the most important for your implementation
36 | of value iteration and policy iteration. This attribute contains the
37 | model for the particular map instance. It is a dictionary of
38 | dictionaries of lists with the following form:
39 |
40 | ```
41 | P[s][a] = [(prob, nextstate, reward, is_terminal), ...]
42 | ```
43 |
44 | For example, to look up the transitions for taking action LEFT in
45 | state 0 you would use the following code:
46 | 
47 | ```
48 | env.P[0][deeprl_hw1.lake_envs.LEFT]
49 | ```
50 |
51 | This would return the list `[(1.0, 0, 0.0, False)]` for the
52 | `Deterministic-4x4-FrozenLake-v0` domain. There is one tuple in the
53 | list, so there is only one possible next state. The next state is
54 | state 0, given by the second entry of the tuple, and it is reached
55 | 100% of the time, given by the first entry (the probability). The
56 | reward for this state-action pair is `R(0, LEFT) = 0`, given by the
57 | third entry. The final entry, `False`, says that the next state is
58 | not terminal.
59 |
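As a concrete illustration, here is a minimal sketch of the one-step lookahead
(Bellman backup) that both value iteration and policy iteration build on. The
helper name `q_value` and the placeholder estimate `values` are illustrative and
not part of the provided code:

```
import gym
import numpy as np
import deeprl_hw1.lake_envs

env = gym.make('Deterministic-4x4-FrozenLake-v0')
gamma = 0.9                 # example discount factor
values = np.zeros(env.nS)   # placeholder value estimate

def q_value(env, values, gamma, state, action):
    # Expected one-step return: sum over transitions of
    # prob * (reward + gamma * V(next state))
    return sum(prob * (reward + gamma * values[nextstate])
               for prob, nextstate, reward, is_terminal in env.P[state][action])

print(q_value(env, values, gamma, 0, deeprl_hw1.lake_envs.LEFT))
```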
60 | ### Running a random policy
61 | 
62 | `example.py` shows how to run a random policy on the domain.
63 | 
64 | ## Value Iteration
65 | 
66 | The optimal policies for the different environments are produced by the
67 | top-level .py scripts (e.g. `DeterministicFrozenLake.py`, `StochasticFrozenLake.py`).
--------------------------------------------------------------------------------
/example.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | from __future__ import (absolute_import, division, print_function,
5 | unicode_literals)
6 | from builtins import input
7 |
8 | import deeprl_hw1.lake_envs as lake_env
9 | import gym
10 | import time
11 |
12 |
13 | def run_random_policy(env):
14 | """Run a random policy for the given environment.
15 |
16 | Logs the total reward and the number of steps until the terminal
17 | state was reached.
18 |
19 | Parameters
20 | ----------
21 | env: gym.envs.Environment
22 | Instance of an OpenAI gym.
23 |
24 | Returns
25 | -------
26 | (float, int)
27 | First number is the total undiscounted reward received. The
28 | second number is the total number of actions taken before the
29 | episode finished.
30 | """
31 | initial_state = env.reset()
32 | env.render()
33 | time.sleep(1) # just pauses so you can see the output
34 |
35 | total_reward = 0
36 | num_steps = 0
37 | while True:
38 | nextstate, reward, is_terminal, debug_info = env.step(
39 | env.action_space.sample())
40 | env.render()
41 |
42 | total_reward += reward
43 | num_steps += 1
44 |
45 | if is_terminal:
46 | break
47 |
48 | time.sleep(1)
49 |
50 | return total_reward, num_steps
51 |
52 |
53 | def print_env_info(env):
54 | print('Environment has %d states and %d actions.' % (env.nS, env.nA))
55 |
56 |
57 | def print_model_info(env, state, action):
58 | transition_table_row = env.P[state][action]
59 | print(
60 | ('According to transition function, '
61 | 'taking action %s(%d) in state %d leads to'
62 | ' %d possible outcomes') % (lake_env.action_names[action],
63 | action, state, len(transition_table_row)))
64 | for prob, nextstate, reward, is_terminal in transition_table_row:
65 | state_type = 'terminal' if is_terminal else 'non-terminal'
66 | print(
67 | '\tTransitioning to %s state %d with probability %f and reward %f'
68 | % (state_type, nextstate, prob, reward))
69 |
70 |
71 | def main():
72 | # create the environment
73 | env = gym.make('FrozenLake-v0')
74 | # uncomment next line to try the deterministic version
75 | # env = gym.make('Deterministic-4x4-FrozenLake-v0')
76 |
77 | print_env_info(env)
78 | print_model_info(env, 0, lake_env.DOWN)
79 | print_model_info(env, 1, lake_env.DOWN)
80 | print_model_info(env, 14, lake_env.RIGHT)
81 |
82 | input('Hit enter to run a random policy...')
83 |
84 | total_reward, num_steps = run_random_policy(env)
85 | print('Agent received total reward of: %f' % total_reward)
86 | print('Agent took %d steps' % num_steps)
87 |
88 |
89 | if __name__ == '__main__':
90 | main()
91 |
--------------------------------------------------------------------------------
/DeterministicFrozenLake.py:
--------------------------------------------------------------------------------
1 | import deeprl_hw1.lake_envs as lake_env
2 | import gym
3 | import time
4 | import seaborn
5 | from tabulate import tabulate
6 | import matplotlib.pyplot as plt
7 | from deeprl_hw1.rl1 import *
8 |
9 | def run_policy(env,gamma,policy):
10 | initial_state = env.reset()
11 | env.render()
12 | time.sleep(1) # just pauses so you can see the output
13 |
14 | total_reward = 0
15 | num_steps = 0
16 | current_state=initial_state
17 | while True:
18 | nextstate, reward, is_terminal, debug_info = env.step(policy[current_state])
19 | env.render()
20 |
21 | total_reward += math.pow(gamma,num_steps)*reward
22 | num_steps += 1
23 |
24 | if is_terminal:
25 | break
26 |
27 | current_state=nextstate
28 | time.sleep(1)
29 |
30 | return total_reward, num_steps
31 |
32 | grid=8
33 | envname='Deterministic-'+str(grid)+'x'+str(grid)+'-FrozenLake-v0'
34 | env = gym.make(envname)
35 | env.render()
36 | gamma=0.9
37 | print "Executing Policy Iteration"
38 | start_time=time.time()
39 | policy, value_func, policy_iters, val_iters= policy_iteration(env,gamma)
40 | print "Total time taken: "+str((time.time()-start_time))
41 | print "Total Policy Improvement Steps: "+str(policy_iters)
42 | print "Total Policy Evaluation Steps: "+str(val_iters)
43 | print "Policy:"
44 | policy_str=print_policy(policy,lake_env.action_names)
45 | ps=[]
46 | for elem in policy_str:
47 | ps.append(elem[0])
48 | reshaped_policy=np.reshape(ps,(grid,grid))
49 | print tabulate(reshaped_policy,tablefmt='latex')
50 | f, ax = plt.subplots(figsize=(11, 9))
51 | cmap = seaborn.diverging_palette(220, 10, as_cmap=True)
52 | reshaped=np.reshape(value_func,(grid,grid))
53 | seaborn.heatmap(reshaped, cmap=cmap, vmax=1.1,
54 | square=True, xticklabels=grid+1, yticklabels=grid+1,
55 | linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
56 | plt.savefig('1c.png',bbox_inches='tight')
57 | np.savetxt('1gpolicy.csv',reshaped,delimiter=',')
58 |
59 | print "Executing Value Iteration"
60 | start_time=time.time()
61 | value_function,value_iters=value_iteration(env,gamma)
62 | print "Total time taken: "+str((time.time()-start_time))
63 | print "Total Value Iteration Steps: "+str(value_iters)
64 | print "Policy:"
65 | policy=value_function_to_policy(env,gamma,value_function)
66 | policy_str=print_policy(policy,lake_env.action_names)
67 | ps=[]
68 | for elem in policy_str:
69 | ps.append(elem[0])
70 | reshaped_policy=np.reshape(ps,(grid,grid))
71 | print tabulate(reshaped_policy,tablefmt='latex')
72 | f, ax = plt.subplots(figsize=(11, 9))
73 | cmap = seaborn.diverging_palette(220, 10, as_cmap=True)
74 | reshaped=np.reshape(value_function,(grid,grid))
75 | seaborn.heatmap(reshaped, cmap=cmap, vmax=1.1,
76 | square=True, xticklabels=grid+1, yticklabels=grid+1,
77 | linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
78 | plt.savefig('1e.png',bbox_inches='tight')
79 | np.savetxt('1gvalue.csv',reshaped,delimiter=',')
80 |
81 | cum_reward,nsteps=run_policy(env,gamma,policy)
82 | print "Cumulative Reward: "+str(cum_reward)
83 | print "No. of steps: "+str(nsteps)
84 |
85 |
--------------------------------------------------------------------------------
/deeprl_hw1/lake_envs.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | """Defines some frozen lake maps."""
3 |
4 | from __future__ import (absolute_import, division, print_function,
5 | unicode_literals)
6 |
7 | from gym.envs.toy_text.frozen_lake import LEFT, RIGHT, DOWN, UP
8 | from gym.envs.toy_text import frozen_lake, discrete
9 |
10 | from gym.envs.registration import register
11 |
12 | action_names = {LEFT: 'LEFT', RIGHT: 'RIGHT', DOWN: 'DOWN', UP: 'UP'}
13 |
14 | register(
15 | id='Deterministic-4x4-FrozenLake-v0',
16 | entry_point='gym.envs.toy_text.frozen_lake:FrozenLakeEnv',
17 | kwargs={'map_name': '4x4',
18 | 'is_slippery': False})
19 |
20 | register(
21 | id='Deterministic-8x8-FrozenLake-v0',
22 | entry_point='gym.envs.toy_text.frozen_lake:FrozenLakeEnv',
23 | kwargs={'map_name': '8x8',
24 | 'is_slippery': False})
25 |
26 | register(
27 | id='Stochastic-4x4-FrozenLake-v0',
28 | entry_point='gym.envs.toy_text.frozen_lake:FrozenLakeEnv',
29 | kwargs={'map_name': '4x4',
30 | 'is_slippery': True})
31 |
32 | register(
33 | id='Stochastic-8x8-FrozenLake-v0',
34 | entry_point='gym.envs.toy_text.frozen_lake:FrozenLakeEnv',
35 | kwargs={'map_name': '8x8',
36 | 'is_slippery': True})
37 |
38 |
39 | class NegRewardFrozenLake(frozen_lake.FrozenLakeEnv):
40 | def __init__(self, **kwargs):
41 | super(NegRewardFrozenLake, self).__init__(**kwargs)
42 |
43 | # modify the rewards
44 | for state in range(self.nS):
45 | for action in range(self.nA):
46 | new_transitions = []
47 | for (prob, nextstate, _, is_terminal) in self.P[state][action]:
48 | row = nextstate // self.ncol
49 | col = nextstate - row * self.ncol
50 | tile_type = self.desc[row, col]
51 | if tile_type == 'F' or tile_type == 'S':
52 | reward = -1
53 | elif tile_type == 'G':
54 | reward = 1
55 | else:
56 | reward = 0
57 |
58 | new_transitions.append(
59 | (prob, nextstate, reward, is_terminal))
60 | self.P[state][action] = new_transitions
61 |
62 |
63 | register(
64 | id='Deterministic-4x4-neg-reward-FrozenLake-v0',
65 | entry_point='deeprl_hw1.lake_envs:NegRewardFrozenLake',
66 | kwargs={'map_name': '4x4',
67 | 'is_slippery': False})
68 |
69 | register(
70 | id='Stochastic-4x4-neg-reward-FrozenLake-v0',
71 | entry_point='deeprl_hw1.lake_envs:NegRewardFrozenLake',
72 | kwargs={'map_name': '4x4',
73 | 'is_slippery': True})
74 |
75 | register(
76 | id='Deterministic-8x8-neg-reward-FrozenLake-v0',
77 | entry_point='deeprl_hw1.lake_envs:NegRewardFrozenLake',
78 | kwargs={'map_name': '8x8',
79 | 'is_slippery': False})
80 |
81 | register(
82 | id='Stochastic-8x8-neg-reward-FrozenLake-v0',
83 | entry_point='deeprl_hw1.lake_envs:NegRewardFrozenLake',
84 | kwargs={'map_name': '8x8',
85 | 'is_slippery': True})
86 |
--------------------------------------------------------------------------------
/StochasticFrozenLake.py:
--------------------------------------------------------------------------------
1 | import deeprl_hw1.lake_envs as lake_env
2 | import gym
3 | import time
4 | import seaborn
5 | from tabulate import tabulate
6 | import matplotlib.pyplot as plt
7 | from deeprl_hw1.rlvaliterchngd import *
8 |
9 | def run_policy(env,gamma,policy):
10 | initial_state = env.reset()
11 | #env.render()
12 | time.sleep(1) # just pauses so you can see the output
13 |
14 | total_reward = 0
15 | num_steps = 0
16 | current_state=initial_state
17 | while True:
18 | nextstate, reward, is_terminal, debug_info = env.step(policy[current_state])
19 | #env.render()
20 |
21 | total_reward += math.pow(gamma,num_steps)*reward
22 | num_steps += 1
23 |
24 | if is_terminal:
25 | break
26 |
27 | current_state=nextstate
28 | time.sleep(1)
29 |
30 | return total_reward, num_steps
31 |
32 | grid=4
33 | envname='Stochastic-'+str(grid)+'x'+str(grid)+'-FrozenLake-v0'
34 | env = gym.make(envname)
35 | env.render()
36 | gamma=0.9
37 |
38 | # print "Executing Policy Iteration"
39 | # start_time=time.time()
40 | # policy, value_func, policy_iters, val_iters= policy_iteration(env,gamma)
41 | # print "Total time taken: "+str((time.time()-start_time))
42 | # print "Total Policy Improvement Steps: "+str(policy_iters)
43 | # print "Total Policy Evaluation Steps: "+str(val_iters)
44 | # print "Policy:"
45 | # policy_str=print_policy(policy,lake_env.action_names)
46 | # ps=[]
47 | # for elem in policy_str:
48 | # ps.append(elem[0])
49 | # reshaped_policy=np.reshape(ps,(grid,grid))
50 | # print tabulate(reshaped_policy,tablefmt='latex')
51 | # f, ax = plt.subplots(figsize=(11, 9))
52 | # cmap = seaborn.diverging_palette(220, 10, as_cmap=True)
53 | # reshaped=np.reshape(value_func,(grid,grid))
54 | # seaborn.heatmap(reshaped, cmap=cmap, vmax=1.1,
55 | # square=True, xticklabels=grid+1, yticklabels=grid+1,
56 | # linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
57 | # plt.savefig('1c.png',bbox_inches='tight')
58 | # np.savetxt('1gpolicy.csv',reshaped,delimiter=',')
59 |
60 | print "Executing Value Iteration"
61 | start_time=time.time()
62 | value_function,value_iters=value_iteration(env,gamma)
63 | print "Total time taken: "+str((time.time()-start_time))
64 | print "Total Value Iteration Steps: "+str(value_iters)
65 | print "Policy:"
66 | policy=value_function_to_policy(env,gamma,value_function)
67 | policy_str=print_policy(policy,lake_env.action_names)
68 | ps=[]
69 | for elem in policy_str:
70 | ps.append(elem[0])
71 | reshaped_policy=np.reshape(ps,(grid,grid))
72 | print tabulate(reshaped_policy,tablefmt='latex')
73 | f, ax = plt.subplots(figsize=(11, 9))
74 | cmap = seaborn.diverging_palette(220, 10, as_cmap=True)
75 | reshaped=np.reshape(value_function,(grid,grid))
76 | seaborn.heatmap(reshaped, cmap=cmap, vmax=1.1,
77 | square=True, xticklabels=grid+1, yticklabels=grid+1,
78 | linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
79 | plt.savefig(envname+'.png',bbox_inches='tight')
80 | np.savetxt(envname+'_2bvalue.csv',reshaped,delimiter=',')
81 |
82 | total_cum_reward=0
83 | maxn=5
84 | start_time=time.time()
85 | for n in range(maxn):
86 | cum_reward,nsteps=run_policy(env,gamma,policy)
87 | total_cum_reward+=cum_reward
88 | if n%1==0: print "Done "+str(n)
89 | print ("Time: "+str((time.time()-start_time)/60))
90 |
91 | print "Average Cumulative Reward: "+str((total_cum_reward/maxn))
92 | print "No. of steps: "+str(nsteps)
93 |
94 |
--------------------------------------------------------------------------------
/DeterministicFrozenNegReward.py:
--------------------------------------------------------------------------------
1 | import deeprl_hw1.lake_envs as lake_env
2 | import gym
3 | import time
4 | import seaborn
5 | from tabulate import tabulate
6 | import matplotlib.pyplot as plt
7 | from deeprl_hw1.rlvaliterchngd import *
8 |
9 | def run_policy(env,gamma,policy):
10 | initial_state = env.reset()
11 | #env.render()
12 | time.sleep(1) # just pauses so you can see the output
13 |
14 | total_reward = 0
15 | num_steps = 0
16 | current_state=initial_state
17 | while True:
18 | nextstate, reward, is_terminal, debug_info = env.step(policy[current_state])
19 | #env.render()
20 |
21 | total_reward += math.pow(gamma,num_steps)*reward
22 | num_steps += 1
23 |
24 | if is_terminal:
25 | break
26 |
27 | current_state=nextstate
28 | time.sleep(1)
29 |
30 | return total_reward, num_steps
31 |
32 | grid=4
33 | envname='Deterministic-4x4-neg-reward-FrozenLake-v0'
34 | env = gym.make(envname)
35 | env.render()
36 | gamma=0.16
37 |
38 | # print "Executing Policy Iteration"
39 | # start_time=time.time()
40 | # policy, value_func, policy_iters, val_iters= policy_iteration(env,gamma)
41 | # print "Total time taken: "+str((time.time()-start_time))
42 | # print "Total Policy Improvement Steps: "+str(policy_iters)
43 | # print "Total Policy Evaluation Steps: "+str(val_iters)
44 | # print "Policy:"
45 | # policy_str=print_policy(policy,lake_env.action_names)
46 | # ps=[]
47 | # for elem in policy_str:
48 | # ps.append(elem[0])
49 | # reshaped_policy=np.reshape(ps,(grid,grid))
50 | # print tabulate(reshaped_policy,tablefmt='latex')
51 | # f, ax = plt.subplots(figsize=(11, 9))
52 | # cmap = seaborn.diverging_palette(220, 10, as_cmap=True)
53 | # reshaped=np.reshape(value_func,(grid,grid))
54 | # seaborn.heatmap(reshaped, cmap=cmap, vmax=1.1,
55 | # square=True, xticklabels=grid+1, yticklabels=grid+1,
56 | # linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
57 | # plt.savefig('1c.png',bbox_inches='tight')
58 | # np.savetxt('1gpolicy.csv',reshaped,delimiter=',')
59 |
60 | print "Executing Value Iteration"
61 | start_time=time.time()
62 | value_function,value_iters=value_iteration(env,gamma)
63 | print "Total time taken: "+str((time.time()-start_time))
64 | print "Total Value Iteration Steps: "+str(value_iters)
65 | print "Policy:"
66 | policy=value_function_to_policy(env,gamma,value_function)
67 | policy_str=print_policy(policy,lake_env.action_names)
68 | ps=[]
69 | for elem in policy_str:
70 | ps.append(elem[0])
71 | reshaped_policy=np.reshape(ps,(grid,grid))
72 | print tabulate(reshaped_policy,tablefmt='latex')
73 | f, ax = plt.subplots(figsize=(11, 9))
74 | cmap = seaborn.diverging_palette(220, 10, as_cmap=True)
75 | reshaped=np.reshape(value_function,(grid,grid))
76 | seaborn.heatmap(reshaped, cmap=cmap, vmax=5,
77 | square=True, xticklabels=grid+1, yticklabels=grid+1,
78 | linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
79 | plt.savefig(envname+'.png',bbox_inches='tight')
80 | np.savetxt(envname+'_2cvalue.csv',reshaped,delimiter=',')
81 |
82 | # total_cum_reward=0
83 | # maxn=5
84 | # start_time=time.time()
85 | # for n in range(maxn):
86 | # cum_reward,nsteps=run_policy(env,gamma,policy)
87 | # total_cum_reward+=cum_reward
88 | # if n%1==0: print "Done "+str(n)
89 | # print ("Time: "+str((time.time()-start_time)/60))
90 | #
91 | # print "Average Cumulative Reward: "+str((total_cum_reward/maxn))
92 | # print "No. of steps: "+str(nsteps)
93 |
94 |
--------------------------------------------------------------------------------
/deeprl_hw1/rl.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | from __future__ import division, absolute_import
3 | from __future__ import print_function, unicode_literals
4 |
5 | import numpy as np
6 |
7 |
8 | def evaluate_policy(env, gamma, policy, max_iterations=int(1e3), tol=1e-3):
9 | """Evaluate the value of a policy.
10 |
11 | See page 87 (pg 105 pdf) of the Sutton and Barto Second Edition
12 | book.
13 |
14 | http://webdocs.cs.ualberta.ca/~sutton/book/bookdraft2016sep.pdf
15 |
16 | Parameters
17 | ----------
18 | env: gym.core.Environment
19 | The environment to compute value iteration for. Must have nS,
20 | nA, and P as attributes.
21 | gamma: float
22 | Discount factor, must be in range [0, 1)
23 | policy: np.array
24 | The policy to evaluate. Maps states to actions.
25 | max_iterations: int
26 | The maximum number of iterations to run before stopping.
27 | tol: float
28 | Determines when value function has converged.
29 |
30 | Returns
31 | -------
32 | np.ndarray
33 | The value for the given policy
34 | """
35 | return np.zeros(env.nS)
36 |
37 |
38 | def value_function_to_policy(env, gamma, value_function):
39 | """Output action numbers for each state in value_function.
40 |
41 | Parameters
42 | ----------
43 | env: gym.core.Environment
44 | Environment to compute policy for. Must have nS, nA, and P as
45 | attributes.
46 | gamma: float
47 | Discount factor. Number in range [0, 1)
48 | value_function: np.ndarray
49 | Value of each state.
50 |
51 | Returns
52 | -------
53 | np.ndarray
54 | An array of integers. Each integer is the optimal action to take
55 | in that state according to the environment dynamics and the
56 | given value function.
57 | """
58 | return np.zeros(env.nS, dtype='int')
59 |
60 |
61 | def improve_policy(env, gamma, value_func, policy):
62 | """Given a policy and value function improve the policy.
63 |
64 | See page 87 (pg 105 pdf) of the Sutton and Barto Second Edition
65 | book.
66 |
67 | http://webdocs.cs.ualberta.ca/~sutton/book/bookdraft2016sep.pdf
68 |
69 | Parameters
70 | ----------
71 | env: gym.core.Environment
72 | The environment to compute value iteration for. Must have nS,
73 | nA, and P as attributes.
74 | gamma: float
75 | Discount factor, must be in range [0, 1)
76 | value_func: np.ndarray
77 | Value function for the given policy.
78 | policy: dict or np.array
79 | The policy to improve. Maps states to actions.
80 | max_iterations: int
81 | The maximum number of iterations to run before stopping.
82 | tol: float
83 | Determines when value function has converged.
84 |
85 | Returns
86 | -------
87 | bool, np.ndarray
88 | Returns true if policy changed. Also returns the new policy.
89 | """
90 | return False, policy
91 |
92 |
93 | def policy_iteration(env, gamma, max_iterations=int(1e3), tol=1e-3):
94 | """Runs policy iteration.
95 |
96 | See page 87 (pg 105 pdf) of the Sutton and Barto Second Edition
97 | book.
98 |
99 | http://webdocs.cs.ualberta.ca/~sutton/book/bookdraft2016sep.pdf
100 |
101 | You should use the improve_policy and evaluate_policy methods to
102 | implement this method.
103 |
104 | Parameters
105 | ----------
106 | env: gym.core.Environment
107 | The environment to compute value iteration for. Must have nS,
108 | nA, and P as attributes.
109 | gamma: float
110 | Discount factor, must be in range [0, 1)
111 | max_iterations: int
112 | The maximum number of iterations to run before stopping.
113 | tol: float
114 | Determines when value function has converged.
115 |
116 | Returns
117 | -------
118 | (np.ndarray, np.ndarray, int, int)
119 | Returns optimal policy, value function, number of policy
120 | improvement iterations, and number of value iterations.
121 | """
122 | policy = np.zeros(env.nS, dtype='int')
123 | value_func = np.zeros(env.nS)
124 |
125 | return policy, value_func, 0, 0
126 |
127 |
128 | def value_iteration(env, gamma, max_iterations=int(1e3), tol=1e-3):
129 | """Runs value iteration for a given gamma and environment.
130 |
131 | See page 90 (pg 108 pdf) of the Sutton and Barto Second Edition
132 | book.
133 |
134 | http://webdocs.cs.ualberta.ca/~sutton/book/bookdraft2016sep.pdf
135 |
136 | Parameters
137 | ----------
138 | env: gym.core.Environment
139 | The environment to compute value iteration for. Must have nS,
140 | nA, and P as attributes.
141 | gamma: float
142 | Discount factor, must be in range [0, 1)
143 | max_iterations: int
144 | The maximum number of iterations to run before stopping.
145 | tol: float
146 | Determines when value function has converged.
147 |
148 | Returns
149 | -------
150 | np.ndarray, iteration
151 | The value function and the number of iterations it took to converge.
152 | """
153 | return np.zeros(env.nS), 0
154 |
155 |
156 | def print_policy(policy, action_names):
157 | """Print the policy in human-readable format.
158 |
159 | Parameters
160 | ----------
161 | policy: np.ndarray
162 | Array of state to action number mappings
163 | action_names: dict
164 | Mapping of action numbers to characters representing the action.
165 | """
166 | str_policy = policy.astype('str')
167 | for action_num, action_name in action_names.items():
168 | np.place(str_policy, policy == action_num, action_name)
169 |
170 | print(str_policy)
171 |
--------------------------------------------------------------------------------
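The docstrings in `rl.py` above only specify the interfaces; the function bodies are
placeholders that return zeros. As a rough sketch (not the repository's own
implementation, which lives in `deeprl_hw1/rl1.py`), a value iteration matching the
documented signature could look like this:

```
import numpy as np

def value_iteration(env, gamma, max_iterations=int(1e3), tol=1e-3):
    # Repeatedly apply the Bellman optimality backup
    #   V(s) <- max_a sum_{s'} prob * (reward + gamma * V(s'))
    # until the largest per-state change falls below tol.
    value_func = np.zeros(env.nS)
    for iteration in range(max_iterations):
        delta = 0.0
        for s in range(env.nS):
            best = max(
                sum(prob * (reward + gamma * value_func[nextstate])
                    for prob, nextstate, reward, is_terminal in env.P[s][a])
                for a in range(env.nA))
            delta = max(delta, abs(best - value_func[s]))
            value_func[s] = best
        if delta <= tol:
            break
    return value_func, iteration
```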
/21.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 15,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import gym\n",
12 | "import deeprl_hw1\n",
13 | "from example import *"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": 37,
19 | "metadata": {
20 | "collapsed": false
21 | },
22 | "outputs": [
23 | {
24 | "name": "stderr",
25 | "output_type": "stream",
26 | "text": [
27 | "INFO:gym.envs.registration:Making new env: Deterministic-4x4-neg-reward-FrozenLake-v0\n",
28 | "[2017-02-15 00:08:57,815] Making new env: Deterministic-4x4-neg-reward-FrozenLake-v0\n"
29 | ]
30 | }
31 | ],
32 | "source": [
33 | "env=gym.make('Deterministic-4x4-neg-reward-FrozenLake-v0')"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 30,
39 | "metadata": {
40 | "collapsed": false
41 | },
42 | "outputs": [
43 | {
44 | "name": "stdout",
45 | "output_type": "stream",
46 | "text": [
47 | "0\n",
48 | "2\n",
49 | "3\n",
50 | "1\n"
51 | ]
52 | }
53 | ],
54 | "source": [
55 | "print(deeprl_hw1.lake_envs.LEFT)\n",
56 | "print(deeprl_hw1.lake_envs.RIGHT)\n",
57 | "print(deeprl_hw1.lake_envs.UP)\n",
58 | "print(deeprl_hw1.lake_envs.DOWN)"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 38,
64 | "metadata": {
65 | "collapsed": false
66 | },
67 | "outputs": [
68 | {
69 | "name": "stdout",
70 | "output_type": "stream",
71 | "text": [
72 | "{0: {0: [(1.0, 0, -1, False)], 1: [(1.0, 4, -1, False)], 2: [(1.0, 1, -1, False)], 3: [(1.0, 0, -1, False)]}, 1: {0: [(1.0, 0, -1, False)], 1: [(1.0, 5, 0, True)], 2: [(1.0, 2, -1, False)], 3: [(1.0, 1, -1, False)]}, 2: {0: [(1.0, 1, -1, False)], 1: [(1.0, 6, -1, False)], 2: [(1.0, 3, -1, False)], 3: [(1.0, 2, -1, False)]}, 3: {0: [(1.0, 2, -1, False)], 1: [(1.0, 7, 0, True)], 2: [(1.0, 3, -1, False)], 3: [(1.0, 3, -1, False)]}, 4: {0: [(1.0, 4, -1, False)], 1: [(1.0, 8, -1, False)], 2: [(1.0, 5, 0, True)], 3: [(1.0, 0, -1, False)]}, 5: {0: [(1.0, 5, 0, True)], 1: [(1.0, 5, 0, True)], 2: [(1.0, 5, 0, True)], 3: [(1.0, 5, 0, True)]}, 6: {0: [(1.0, 5, 0, True)], 1: [(1.0, 10, -1, False)], 2: [(1.0, 7, 0, True)], 3: [(1.0, 2, -1, False)]}, 7: {0: [(1.0, 7, 0, True)], 1: [(1.0, 7, 0, True)], 2: [(1.0, 7, 0, True)], 3: [(1.0, 7, 0, True)]}, 8: {0: [(1.0, 8, -1, False)], 1: [(1.0, 12, 0, True)], 2: [(1.0, 9, -1, False)], 3: [(1.0, 4, -1, False)]}, 9: {0: [(1.0, 8, -1, False)], 1: [(1.0, 13, -1, False)], 2: [(1.0, 10, -1, False)], 3: [(1.0, 5, 0, True)]}, 10: {0: [(1.0, 9, -1, False)], 1: [(1.0, 14, -1, False)], 2: [(1.0, 11, 0, True)], 3: [(1.0, 6, -1, False)]}, 11: {0: [(1.0, 11, 0, True)], 1: [(1.0, 11, 0, True)], 2: [(1.0, 11, 0, True)], 3: [(1.0, 11, 0, True)]}, 12: {0: [(1.0, 12, 0, True)], 1: [(1.0, 12, 0, True)], 2: [(1.0, 12, 0, True)], 3: [(1.0, 12, 0, True)]}, 13: {0: [(1.0, 12, 0, True)], 1: [(1.0, 13, -1, False)], 2: [(1.0, 14, -1, False)], 3: [(1.0, 9, -1, False)]}, 14: {0: [(1.0, 13, -1, False)], 1: [(1.0, 14, -1, False)], 2: [(1.0, 15, 1, True)], 3: [(1.0, 10, -1, False)]}, 15: {0: [(1.0, 15, 1, True)], 1: [(1.0, 15, 1, True)], 2: [(1.0, 15, 1, True)], 3: [(1.0, 15, 1, True)]}}\n"
73 | ]
74 | }
75 | ],
76 | "source": [
77 | "print env.P"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": 16,
83 | "metadata": {
84 | "collapsed": false
85 | },
86 | "outputs": [
87 | {
88 | "name": "stdout",
89 | "output_type": "stream",
90 | "text": [
91 | "Environment has 16 states and 4 actions.\n"
92 | ]
93 | }
94 | ],
95 | "source": [
96 | "print_env_info(env)"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": 17,
102 | "metadata": {
103 | "collapsed": false
104 | },
105 | "outputs": [
106 | {
107 | "name": "stdout",
108 | "output_type": "stream",
109 | "text": [
110 | "According to transition function, taking action RIGHT(2) in state 0 leads to 1 possible outcomes\n",
111 | "\tTransitioning to non-terminal state 1 with probability 1.000000 and reward 0.000000\n"
112 | ]
113 | }
114 | ],
115 | "source": [
116 | "print_model_info(env,0,deeprl_hw1.lake_envs.RIGHT)"
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": 18,
122 | "metadata": {
123 | "collapsed": false
124 | },
125 | "outputs": [
126 | {
127 | "name": "stdout",
128 | "output_type": "stream",
129 | "text": [
130 | "\u001b[41mS\u001b[0mFFF\n",
131 | "FHFH\n",
132 | "FFFH\n",
133 | "HFFG\n",
134 | "\n"
135 | ]
136 | },
137 | {
138 | "data": {
139 | "text/plain": [
140 | ""
141 | ]
142 | },
143 | "execution_count": 18,
144 | "metadata": {},
145 | "output_type": "execute_result"
146 | }
147 | ],
148 | "source": [
149 | "env.render()"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": 24,
155 | "metadata": {
156 | "collapsed": false
157 | },
158 | "outputs": [
159 | {
160 | "name": "stdout",
161 | "output_type": "stream",
162 | "text": [
163 | "4\n"
164 | ]
165 | }
166 | ],
167 | "source": [
168 | "print env.nA"
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": 23,
174 | "metadata": {
175 | "collapsed": false
176 | },
177 | "outputs": [
178 | {
179 | "name": "stdout",
180 | "output_type": "stream",
181 | "text": [
182 | "4\n"
183 | ]
184 | }
185 | ],
186 | "source": [
187 | "print env.action_space.n"
188 | ]
189 | },
190 | {
191 | "cell_type": "code",
192 | "execution_count": null,
193 | "metadata": {
194 | "collapsed": true
195 | },
196 | "outputs": [],
197 | "source": []
198 | }
199 | ],
200 | "metadata": {
201 | "kernelspec": {
202 | "display_name": "Python 2",
203 | "language": "python",
204 | "name": "python2"
205 | },
206 | "language_info": {
207 | "codemirror_mode": {
208 | "name": "ipython",
209 | "version": 2
210 | },
211 | "file_extension": ".py",
212 | "mimetype": "text/x-python",
213 | "name": "python",
214 | "nbconvert_exporter": "python",
215 | "pygments_lexer": "ipython2",
216 | "version": "2.7.12"
217 | }
218 | },
219 | "nbformat": 4,
220 | "nbformat_minor": 0
221 | }
222 |
--------------------------------------------------------------------------------
/deeprl_hw1/queue_envs.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | """Define the Queue environment from problem 3 here."""
3 |
4 | from __future__ import (absolute_import, division, print_function,
5 | unicode_literals)
6 |
7 | from gym import Env, spaces
8 | from gym.envs.registration import register
9 | import numpy
10 | import itertools
11 |
12 | class QueueEnv(Env):
13 | """Implement the Queue environment from problem 3.
14 |
15 | Parameters
16 | ----------
17 | p1: float
18 | Value between [0, 1]. The probability of queue 1 receiving a new item.
19 | p2: float
20 | Value between [0, 1]. The probability of queue 2 receiving a new item.
21 | p3: float
22 | Value between [0, 1]. The probability of queue 3 receiving a new item.
23 |
24 | Attributes
25 | ----------
26 | nS: number of states
27 | nA: number of actions
28 | P: environment model
29 | """
30 | metadata = {'render.modes': ['human']}
31 |
32 | SWITCH_TO_1 = 0
33 | SWITCH_TO_2 = 1
34 | SWITCH_TO_3 = 2
35 | SERVICE_QUEUE = 3
36 |
37 |
38 |
39 |
40 | def __init__(self, p1, p2, p3):
41 | self.action_space = spaces.Discrete(4)
42 | self.observation_space = spaces.MultiDiscrete(
43 | [(1, 3), (0, 5), (0, 5), (0, 5)])
44 | self.nS = 0
45 | self.nA = 4
46 | self.P = dict()
47 | self.current_state=(1,0,0,0)
48 | self.p1=p1
49 | self.p2=p2
50 | self.p3=p3
51 |
52 |
53 | def _reset(self):
54 | """Reset the environment.
55 |
56 | The server should always start on Queue 1.
57 |
58 | Returns
59 | -------
60 | (int, int, int, int)
61 | A tuple representing the current state with meanings
62 | (current queue, num items in 1, num items in 2, num items in
63 | 3).
64 | """
65 | self.current_state=(1,0,0,0)
66 | return self.current_state
67 |
68 | def _step(self, action):
69 | """Execute the specified action.
70 |
71 | Parameters
72 | ----------
73 | action: int
74 | A number in range [0, 3]. Represents the action.
75 |
76 | Returns
77 | -------
78 | (state, reward, is_terminal, debug_info)
79 | State is the tuple in the same format as the reset
80 | method. Reward is a floating point number. is_terminal is a
81 | boolean representing if the new state is a terminal
82 | state. debug_info is a dictionary. You can fill debug_info
83 | with any additional information you deem useful.
84 | """
85 | possible_next_states=self.query_model(self.current_state,action)
86 | probarray=[]
87 | for ps in possible_next_states:
88 | probarray.append(ps[0])
89 | probs=numpy.asarray(probarray)
90 |         randomarray=numpy.random.rand()  # single uniform draw for the categorical sample
91 | next_state_index=self.categorical_sample(probs,randomarray)
92 | pns=possible_next_states[next_state_index]
93 | next_state=(pns[1],pns[2],pns[3],dict())
94 | self.current_state=next_state[0]
95 | return next_state
96 |
97 |
98 |
99 | def _render(self, mode='human', close=False):
100 | print ("Current Q: "+str(self.current_state[0]))
101 | print ("Items in Q1: "+str(self.current_state[1]))
102 | print ("Items in Q2: "+str(self.current_state[2]))
103 | print ("Items in Q3: "+str(self.current_state[3]))
104 | print ("\n")
105 |
106 |
107 | def _seed(self, seed=None):
108 | """Set the random seed.
109 |
110 | Parameters
111 | ----------
112 | seed: int, None
113 | Random seed used by numpy.random and random.
114 | """
115 | pass
116 |
117 | def query_model(self, state, action):
118 | """Return the possible transition outcomes for a state-action pair.
119 |
120 |         This should be in the same format as the provided environments
121 | in section 2.
122 |
123 | Parameters
124 | ----------
125 | state
126 |             State used in query. Should be in the same format as
127 | the states returned by reset and step.
128 | action: int
129 | The action used in query.
130 |
131 | Returns
132 | -------
133 | [(prob, nextstate, reward, is_terminal), ...]
134 | List of possible outcomes
135 | """
136 | lst=list(itertools.product([0,1],repeat=3))
137 | reward=0
138 | newstate=list(state)
139 | if action==QueueEnv.SERVICE_QUEUE:
140 | currq=newstate[0]
141 | if newstate[currq]>0:
142 | newstate[currq]-=1
143 | reward=1
144 | elif action==QueueEnv.SWITCH_TO_1:
145 | newstate[0]=1
146 | elif action==QueueEnv.SWITCH_TO_2:
147 | newstate[0]=2
148 | elif action==QueueEnv.SWITCH_TO_3:
149 | newstate[0]=3
150 | blockq1=1
151 | blockq2=1
152 | blockq3=1
153 | if newstate[1]>=5: blockq1=0
154 | if newstate[2]>=5: blockq2=0
155 | if newstate[3]>=5: blockq3=0
156 | possible_states=[]
157 | for combination in lst:
158 | q1=combination[0]
159 | q2=combination[1]
160 | q3=combination[2]
161 | state_prob=0
162 | newpstate=newstate[:]
163 | if blockq1==0 or q1==0: state_prob+=(1-self.p1)
164 | else:
165 | state_prob+=self.p1
166 | newpstate[1]+=1
167 | if blockq2==0 or q2==0: state_prob=state_prob*(1-self.p2)
168 | else:
169 | state_prob=state_prob*self.p2
170 | newpstate[2]+=1
171 | if blockq3==0 or q3==0: state_prob=state_prob*(1-self.p3)
172 | else:
173 | state_prob=state_prob*self.p3
174 | newpstate[3]+=1
175 | found=False
176 | for psalready in possible_states:
177 | if tuple(newpstate) == psalready[1]:
178 | found=True
179 | break
180 | if not found: possible_states.append((state_prob,tuple(newpstate)))
181 | total_prob=0
182 | for ps in possible_states:
183 | total_prob+=ps[0]
184 | for i in range(len(possible_states)):
185 | unnormalized_state=possible_states[i]
186 | possible_states[i]=(float(unnormalized_state[0])/float(total_prob),unnormalized_state[1])
187 | final_list=[]
188 | for ps in possible_states:
189 | final_list.append((ps[0],ps[1],reward,False))
190 | return final_list
191 |
192 | def get_action_name(self, action):
193 | if action == QueueEnv.SERVICE_QUEUE:
194 | return 'SERVICE_QUEUE'
195 | elif action == QueueEnv.SWITCH_TO_1:
196 | return 'SWITCH_TO_1'
197 | elif action == QueueEnv.SWITCH_TO_2:
198 | return 'SWITCH_TO_2'
199 | elif action == QueueEnv.SWITCH_TO_3:
200 | return 'SWITCH_TO_3'
201 | return 'UNKNOWN'
202 |
203 | def categorical_sample(self, prob_n, np_random):
204 | """
205 | Sample from categorical distribution
206 | Each row specifies class probabilities
207 | """
208 | csprob_n = numpy.cumsum(prob_n)
209 | return (csprob_n > np_random).argmax()
210 |
211 | register(
212 | id='Queue-1-v0',
213 | entry_point='deeprl_hw1.queue_envs:QueueEnv',
214 | kwargs={'p1': .1,
215 | 'p2': .9,
216 | 'p3': .1})
217 |
218 | register(
219 | id='Queue-2-v0',
220 | entry_point='deeprl_hw1.queue_envs:QueueEnv',
221 | kwargs={'p1': .1,
222 | 'p2': .1,
223 | 'p3': .1})
224 |
--------------------------------------------------------------------------------
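Because `queue_envs.py` registers `Queue-1-v0` and `Queue-2-v0`, the queue environment
can also be created through `gym.make` rather than constructed directly as in
`driver3.py`. A small usage sketch, assuming the installed gym version resolves these
ids and exposes `query_model` on the returned object:

```
import gym
from deeprl_hw1.queue_envs import QueueEnv  # importing the module registers the Queue-*-v0 ids

env = gym.make('Queue-1-v0')   # p1=0.1, p2=0.9, p3=0.1 per the registration above
state = env.reset()            # (current queue, items in Q1, Q2, Q3)
print(env.query_model(state, QueueEnv.SERVICE_QUEUE))
env.render()
```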
/deeprl_hw1/rl1.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | from __future__ import division, absolute_import
3 | from __future__ import print_function, unicode_literals
4 |
5 | import numpy as np
6 | import math
7 |
8 | def evaluate_policy(env, gamma, policy, max_iterations=int(1e3), tol=1e-3):
9 | """Evaluate the value of a policy.
10 |
11 | See page 87 (pg 105 pdf) of the Sutton and Barto Second Edition
12 | book.
13 |
14 | http://webdocs.cs.ualberta.ca/~sutton/book/bookdraft2016sep.pdf
15 |
16 | Parameters
17 | ----------
18 | env: gym.core.Environment
19 | The environment to compute value iteration for. Must have nS,
20 | nA, and P as attributes.
21 | gamma: float
22 | Discount factor, must be in range [0, 1)
23 | policy: np.array
24 | The policy to evaluate. Maps states to actions.
25 | max_iterations: int
26 | The maximum number of iterations to run before stopping.
27 | tol: float
28 | Determines when value function has converged.
29 |
30 | Returns
31 | -------
32 | np.ndarray
33 | The value for the given policy
34 | """
35 | value_func_old = np.random.rand(env.nS)
36 | value_func_new = np.zeros(env.nS)
37 | for iteration in range(max_iterations):
38 | delta=0
39 | for s in range(env.nS):
40 | vs=0
41 | actions=[policy[s]]
42 | #if len(actions)==1: actions=[actions]
43 | for a in actions:
44 | for possible_next_state in env.P[s][a]:
45 | prob_action = possible_next_state[0]
46 | cur_reward=possible_next_state[2]
47 | future_reward=gamma*value_func_old[possible_next_state[1]]
48 | vs+=prob_action*(cur_reward+future_reward)
49 | #if env.P[s][a][3]:break
50 | diff=abs(value_func_old[s]-vs)
51 | delta=max(delta,diff)
52 | value_func_new[s]=vs
53 | #delta=math.sqrt(delta)
54 | if delta<=tol: break
55 | value_func_old = value_func_new
56 | return value_func_new, iteration
57 |
58 |
59 | def value_function_to_policy(env, gamma, value_function):
60 | """Output action numbers for each state in value_function.
61 |
62 | Parameters
63 | ----------
64 | env: gym.core.Environment
65 | Environment to compute policy for. Must have nS, nA, and P as
66 | attributes.
67 | gamma: float
68 | Discount factor. Number in range [0, 1)
69 | value_function: np.ndarray
70 | Value of each state.
71 |
72 | Returns
73 | -------
74 | np.ndarray
75 | An array of integers. Each integer is the optimal action to take
76 | in that state according to the environment dynamics and the
77 | given value function.
78 | """
79 | policy=np.zeros(env.nS,dtype='int')
80 | for s in range(env.nS):
81 |         maxvsa=-np.inf  # start below any attainable value so all-negative action values still pick an action
82 | maxa=-1
83 | for a in range(env.nA):
84 | vsa=0
85 | for possible_next_state in env.P[s][a]:
86 | prob_action = possible_next_state[0]
87 | cur_reward = possible_next_state[2]
88 | future_reward = gamma * value_function[possible_next_state[1]]
89 | vsa+=prob_action * (cur_reward + future_reward)
90 | if vsa>maxvsa:
91 | maxvsa=vsa
92 | maxa=a
93 | policy[s]=maxa
94 |
95 | return policy
96 |
97 |
98 | def improve_policy(env, gamma, value_func, policy):
99 | """Given a policy and value function improve the policy.
100 |
101 | See page 87 (pg 105 pdf) of the Sutton and Barto Second Edition
102 | book.
103 |
104 | http://webdocs.cs.ualberta.ca/~sutton/book/bookdraft2016sep.pdf
105 |
106 | Parameters
107 | ----------
108 | env: gym.core.Environment
109 | The environment to compute value iteration for. Must have nS,
110 | nA, and P as attributes.
111 | gamma: float
112 | Discount factor, must be in range [0, 1)
113 | value_func: np.ndarray
114 | Value function for the given policy.
115 | policy: dict or np.array
116 | The policy to improve. Maps states to actions.
117 | max_iterations: int
118 | The maximum number of iterations to run before stopping.
119 | tol: float
120 | Determines when value function has converged.
121 |
122 | Returns
123 | -------
124 | bool, np.ndarray
125 |       Returns True if the policy is stable (unchanged). Also returns the improved policy.
126 | """
127 | stable=True
128 | for s in range(env.nS):
129 | old_action=policy[s]
130 |         maxvsa=-float('inf')  # -1 is not a safe lower bound when action values can be negative
131 | maxa=-1
132 | for a in range(env.nA):
133 | vsa=0
134 | for possible_next_state in env.P[s][a]:
135 | prob_action = possible_next_state[0]
136 | cur_reward = possible_next_state[2]
137 | future_reward = gamma * value_func[possible_next_state[1]]
138 | vsa+=prob_action * (cur_reward + future_reward)
139 | if vsa>maxvsa:
140 | maxvsa=vsa
141 | maxa=a
142 | if maxa!=old_action: stable=False
143 | policy[s]=maxa
144 | return stable, policy
145 |
146 |
147 | def policy_iteration(env, gamma, max_iterations=int(1e3), tol=1e-3):
148 | """Runs policy iteration.
149 |
150 | See page 87 (pg 105 pdf) of the Sutton and Barto Second Edition
151 | book.
152 |
153 | http://webdocs.cs.ualberta.ca/~sutton/book/bookdraft2016sep.pdf
154 |
155 | You should use the improve_policy and evaluate_policy methods to
156 | implement this method.
157 |
158 | Parameters
159 | ----------
160 | env: gym.core.Environment
161 | The environment to compute value iteration for. Must have nS,
162 | nA, and P as attributes.
163 | gamma: float
164 | Discount factor, must be in range [0, 1)
165 | max_iterations: int
166 | The maximum number of iterations to run before stopping.
167 | tol: float
168 | Determines when value function has converged.
169 |
170 | Returns
171 | -------
172 | (np.ndarray, np.ndarray, int, int)
173 | Returns optimal policy, value function, number of policy
174 | improvement iterations, and number of value iterations.
175 | """
176 | policy = np.zeros(env.nS, dtype='int')
177 | value_func = np.zeros(env.nS)
178 | stable=False
179 | iters=0
180 | eval_iters=0
181 | while not stable:
182 |         value_func, n_eval = evaluate_policy(env, gamma, policy, max_iterations, tol)  # forward max_iterations/tol; avoid shadowing built-in iter
183 |         eval_iters += n_eval
184 | stable,policy=improve_policy(env,gamma,value_func,policy)
185 | iters+=1
186 | return policy, value_func, iters, eval_iters
187 |
188 |
189 | def value_iteration(env, gamma, max_iterations=int(1e3), tol=1e-3):
190 | """Runs value iteration for a given gamma and environment.
191 |
192 | See page 90 (pg 108 pdf) of the Sutton and Barto Second Edition
193 | book.
194 |
195 | http://webdocs.cs.ualberta.ca/~sutton/book/bookdraft2016sep.pdf
196 |
197 | Parameters
198 | ----------
199 | env: gym.core.Environment
200 | The environment to compute value iteration for. Must have nS,
201 | nA, and P as attributes.
202 | gamma: float
203 | Discount factor, must be in range [0, 1)
204 | max_iterations: int
205 | The maximum number of iterations to run before stopping.
206 | tol: float
207 | Determines when value function has converged.
208 |
209 | Returns
210 | -------
211 |     np.ndarray, int
212 | The value function and the number of iterations it took to converge.
213 | """
214 | value_func_old = np.random.rand(env.nS)
215 | value_func_new = np.zeros(env.nS)
216 | for iteration in range(max_iterations):
217 | delta=0
218 | for s in range(env.nS):
219 |             maxvsa = -float('inf')  # -1 is not a safe lower bound when action values can be negative
220 | for a in range(env.nA):
221 | vsa=0
222 | for possible_next_state in env.P[s][a]:
223 | prob_action = possible_next_state[0]
224 | cur_reward=possible_next_state[2]
225 | if possible_next_state[3]:
226 | future_reward=0
227 | else: future_reward=gamma*value_func_old[possible_next_state[1]]
228 | vsa+=prob_action*(cur_reward+future_reward)
229 | if vsa>maxvsa:
230 | maxvsa=vsa
231 | #diff=math.pow((value_func_old[s]-maxvsa),2)
232 | diff=abs(value_func_old[s]-maxvsa)
233 | delta=max(delta,diff)
234 | value_func_new[s]=maxvsa
235 | #delta=math.sqrt(delta)
236 | if delta<=tol: break
237 |         value_func_old = value_func_new.copy()  # copy, otherwise old and new become aliases and later sweeps are silently in-place
238 |
239 | return value_func_new, iteration
240 |
241 |
242 | def print_policy(policy, action_names):
243 | """Print the policy in human-readable format.
244 |
245 | Parameters
246 | ----------
247 | policy: np.ndarray
248 | Array of state to action number mappings
249 | action_names: dict
250 | Mapping of action numbers to characters representing the action.
251 | """
252 | str_policy = policy.astype('str')
253 | for action_num, action_name in action_names.items():
254 | np.place(str_policy, policy == action_num, action_name)
255 |
256 | print(str_policy)
257 | return str_policy
258 |
--------------------------------------------------------------------------------
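A minimal driver sketch for the routines in rl1.py above (not part of the repository). The environment id, the discount factor, and the action-name mapping are assumptions made for illustration; substitute whatever deeprl_hw1.lake_envs actually registers and defines.

import gym
import deeprl_hw1.lake_envs      # assumed to register the FrozenLake variants
import deeprl_hw1.rl1 as rl1

env = gym.make('Deterministic-4x4-FrozenLake-v0')   # assumed environment id
env = getattr(env, 'unwrapped', env)                # nS/nA/P may only exist on the raw env, depending on gym version
gamma = 0.9

# Policy iteration: alternate evaluate_policy / improve_policy until the policy is stable.
policy, value_func, improve_steps, eval_sweeps = rl1.policy_iteration(env, gamma)
print('improvement steps:', improve_steps, 'evaluation sweeps:', eval_sweeps)

# Value iteration, then read a greedy policy off the converged value function.
vi_values, vi_sweeps = rl1.value_iteration(env, gamma)
vi_policy = rl1.value_function_to_policy(env, gamma, vi_values)

# Render both policies; the action ordering below is assumed, not taken from lake_envs.
action_names = {0: 'L', 1: 'D', 2: 'R', 3: 'U'}
rl1.print_policy(policy, action_names)
rl1.print_policy(vi_policy, action_names)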
/deeprl_hw1/rlvaliterchngd.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | from __future__ import division, absolute_import
3 | from __future__ import print_function, unicode_literals
4 |
5 | import numpy as np
6 | import math
7 |
8 | def evaluate_policy(env, gamma, policy, max_iterations=int(1e3), tol=1e-3):
9 | """Evaluate the value of a policy.
10 |
11 | See page 87 (pg 105 pdf) of the Sutton and Barto Second Edition
12 | book.
13 |
14 | http://webdocs.cs.ualberta.ca/~sutton/book/bookdraft2016sep.pdf
15 |
16 | Parameters
17 | ----------
18 | env: gym.core.Environment
19 | The environment to compute value iteration for. Must have nS,
20 | nA, and P as attributes.
21 | gamma: float
22 | Discount factor, must be in range [0, 1)
23 | policy: np.array
24 | The policy to evaluate. Maps states to actions.
25 | max_iterations: int
26 | The maximum number of iterations to run before stopping.
27 | tol: float
28 | Determines when value function has converged.
29 |
30 | Returns
31 | -------
 32 |     np.ndarray, int
 33 |       The value function for the given policy and the number of evaluation sweeps performed.
34 | """
35 | value_func_old = np.random.rand(env.nS)
36 | value_func_new = np.zeros(env.nS)
37 | for iteration in range(max_iterations):
38 | delta=0
39 | for s in range(env.nS):
40 | vs=0
41 | actions=[policy[s]]
42 | #if len(actions)==1: actions=[actions]
43 | for a in actions:
44 | for possible_next_state in env.P[s][a]:
45 | prob_action = possible_next_state[0]
46 | cur_reward=possible_next_state[2]
47 | future_reward=gamma*value_func_old[possible_next_state[1]]
48 | vs+=prob_action*(cur_reward+future_reward)
49 | #if env.P[s][a][3]:break
50 | diff=abs(value_func_old[s]-vs)
51 | delta=max(delta,diff)
52 | value_func_new[s]=vs
53 | #delta=math.sqrt(delta)
54 | if delta<=tol: break
 55 |         value_func_old = value_func_new.copy()  # copy, otherwise old and new become aliases and later sweeps are silently in-place
56 | return value_func_new, iteration
57 |
58 |
59 | def value_function_to_policy(env, gamma, value_function):
60 | """Output action numbers for each state in value_function.
61 |
62 | Parameters
63 | ----------
64 | env: gym.core.Environment
65 | Environment to compute policy for. Must have nS, nA, and P as
66 | attributes.
67 | gamma: float
68 | Discount factor. Number in range [0, 1)
69 | value_function: np.ndarray
70 | Value of each state.
71 |
72 | Returns
73 | -------
74 | np.ndarray
75 | An array of integers. Each integer is the optimal action to take
76 | in that state according to the environment dynamics and the
77 | given value function.
78 | """
79 | policy=np.zeros(env.nS,dtype='int')
80 | for s in range(env.nS):
 81 |         maxvsa=-float('inf')  # -1 is not a safe lower bound when action values can be negative
82 | maxa=-1
83 | for a in range(env.nA):
84 | vsa=0
85 | for possible_next_state in env.P[s][a]:
86 | prob_action = possible_next_state[0]
87 | cur_reward = possible_next_state[2]
88 | future_reward = gamma * value_function[possible_next_state[1]]
89 | vsa+=prob_action * (cur_reward + future_reward)
90 | if vsa>maxvsa:
91 | maxvsa=vsa
92 | maxa=a
93 | policy[s]=maxa
94 |
95 | return policy
96 |
97 |
98 | def improve_policy(env, gamma, value_func, policy):
99 | """Given a policy and value function improve the policy.
100 |
101 | See page 87 (pg 105 pdf) of the Sutton and Barto Second Edition
102 | book.
103 |
104 | http://webdocs.cs.ualberta.ca/~sutton/book/bookdraft2016sep.pdf
105 |
106 | Parameters
107 | ----------
108 | env: gym.core.Environment
109 | The environment to compute value iteration for. Must have nS,
110 | nA, and P as attributes.
111 | gamma: float
112 | Discount factor, must be in range [0, 1)
113 | value_func: np.ndarray
114 | Value function for the given policy.
115 | policy: dict or np.array
116 | The policy to improve. Maps states to actions.
117 | max_iterations: int
118 | The maximum number of iterations to run before stopping.
119 | tol: float
120 | Determines when value function has converged.
121 |
122 | Returns
123 | -------
124 | bool, np.ndarray
125 |       Returns True if the policy is stable (unchanged). Also returns the improved policy.
126 | """
127 | stable=True
128 | for s in range(env.nS):
129 | old_action=policy[s]
130 |         maxvsa=-float('inf')  # -1 is not a safe lower bound when action values can be negative
131 | maxa=-1
132 | for a in range(env.nA):
133 | vsa=0
134 | for possible_next_state in env.P[s][a]:
135 | prob_action = possible_next_state[0]
136 | cur_reward = possible_next_state[2]
137 | future_reward = gamma * value_func[possible_next_state[1]]
138 | vsa+=prob_action * (cur_reward + future_reward)
139 | if vsa>maxvsa:
140 | maxvsa=vsa
141 | maxa=a
142 | if maxa!=old_action: stable=False
143 | policy[s]=maxa
144 | return stable, policy
145 |
146 |
147 | def policy_iteration(env, gamma, max_iterations=int(1e3), tol=1e-3):
148 | """Runs policy iteration.
149 |
150 | See page 87 (pg 105 pdf) of the Sutton and Barto Second Edition
151 | book.
152 |
153 | http://webdocs.cs.ualberta.ca/~sutton/book/bookdraft2016sep.pdf
154 |
155 | You should use the improve_policy and evaluate_policy methods to
156 | implement this method.
157 |
158 | Parameters
159 | ----------
160 | env: gym.core.Environment
161 | The environment to compute value iteration for. Must have nS,
162 | nA, and P as attributes.
163 | gamma: float
164 | Discount factor, must be in range [0, 1)
165 | max_iterations: int
166 | The maximum number of iterations to run before stopping.
167 | tol: float
168 | Determines when value function has converged.
169 |
170 | Returns
171 | -------
172 | (np.ndarray, np.ndarray, int, int)
173 | Returns optimal policy, value function, number of policy
174 | improvement iterations, and number of value iterations.
175 | """
176 | policy = np.zeros(env.nS, dtype='int')
177 | value_func = np.zeros(env.nS)
178 | stable=False
179 | iters=0
180 | eval_iters=0
181 | while not stable:
182 |         value_func, n_eval = evaluate_policy(env, gamma, policy, max_iterations, tol)  # forward max_iterations/tol; avoid shadowing built-in iter
183 |         eval_iters += n_eval
184 | stable,policy=improve_policy(env,gamma,value_func,policy)
185 | iters+=1
186 | return policy, value_func, iters, eval_iters
187 |
188 |
189 | def value_iteration(env, gamma, max_iterations=int(1e3), tol=1e-3):
190 | """Runs value iteration for a given gamma and environment.
191 |
192 | See page 90 (pg 108 pdf) of the Sutton and Barto Second Edition
193 | book.
194 |
195 | http://webdocs.cs.ualberta.ca/~sutton/book/bookdraft2016sep.pdf
196 |
197 | Parameters
198 | ----------
199 | env: gym.core.Environment
200 | The environment to compute value iteration for. Must have nS,
201 | nA, and P as attributes.
202 | gamma: float
203 | Discount factor, must be in range [0, 1)
204 | max_iterations: int
205 | The maximum number of iterations to run before stopping.
206 | tol: float
207 | Determines when value function has converged.
208 |
209 | Returns
210 | -------
211 |     np.ndarray, int
212 | The value function and the number of iterations it took to converge.
213 | """
214 | value_func_old = np.random.rand(env.nS)
215 | value_func_new = np.zeros(env.nS)
216 | for iteration in range(max_iterations):
217 | delta=0
218 | for s in range(env.nS):
219 |             maxvsa = -float('inf')  # -1 is not a safe lower bound when action values can be negative
220 | for a in range(env.nA):
221 | vsa=0
222 | for possible_next_state in env.P[s][a]:
223 | prob_action = possible_next_state[0]
224 | cur_reward=possible_next_state[2]
225 | if value_func_new[possible_next_state[1]]==0:
226 | future_reward=gamma*value_func_old[possible_next_state[1]]
227 | else:
228 | future_reward = gamma * value_func_new[possible_next_state[1]]
229 | vsa+=prob_action*(cur_reward+future_reward)
230 | if vsa>maxvsa:
231 | maxvsa=vsa
232 | #diff=math.pow((value_func_old[s]-maxvsa),2)
233 | diff=abs(value_func_old[s]-maxvsa)
234 | delta=max(delta,diff)
235 | value_func_new[s]=maxvsa
236 | #delta=math.sqrt(delta)
237 | if delta<=tol: break
238 |         value_func_old = value_func_new.copy()  # real copy so value_func_old and value_func_new are separate arrays, which the check above relies on
239 |
240 | return value_func_new, iteration
241 |
242 |
243 | def print_policy(policy, action_names):
244 | """Print the policy in human-readable format.
245 |
246 | Parameters
247 | ----------
248 | policy: np.ndarray
249 | Array of state to action number mappings
250 | action_names: dict
251 | Mapping of action numbers to characters representing the action.
252 | """
253 | str_policy = policy.astype('str')
254 | for action_num, action_name in action_names.items():
255 | np.place(str_policy, policy == action_num, action_name)
256 |
257 | print(str_policy)
258 | return str_policy
259 |
--------------------------------------------------------------------------------
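As listed above, rlvaliterchngd.py differs from rl1.py only in how value_iteration forms its backup: rl1.py drops the discounted future term whenever a transition's done flag (possible_next_state[3]) is set, while rlvaliterchngd.py reuses a value already written into value_func_new during the current sweep when it is non-zero, falling back to value_func_old otherwise (an in-place, Gauss-Seidel-style update). A small sketch of the two backup rules on a hand-built transition list; the (prob, next_state, reward, done) tuple layout matches env.P above, and the numbers are made up.

import numpy as np

# One (state, action) entry of a gym-style P table: a list of (prob, next_state, reward, done).
transitions = [(0.8, 1, 0.0, False), (0.2, 2, 1.0, True)]
gamma = 0.9
v_old = np.array([0.0, 0.5, 0.3])   # values from the previous sweep
v_new = np.array([0.0, 0.7, 0.0])   # state 1 already refreshed this sweep, state 2 not yet

# rl1.py-style backup: a terminal transition contributes no future value.
q_terminal_aware = sum(p * (r + (0.0 if done else gamma * v_old[s2]))
                       for p, s2, r, done in transitions)

# rlvaliterchngd.py-style backup: prefer a value already updated in this sweep.
q_in_place = sum(p * (r + gamma * (v_new[s2] if v_new[s2] != 0 else v_old[s2]))
                 for p, s2, r, done in transitions)

print(q_terminal_aware, q_in_place)   # 0.56 vs 0.758 for these made-up numbers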