├── requirements.txt ├── .DS_Store ├── deeprl_hw1 ├── __init__.py ├── rl1.pyc ├── .DS_Store ├── __init__.pyc ├── lake_envs.pyc ├── queue_envs.pyc ├── rlvaliterchngd.pyc ├── driver3.py ├── lake_envs.py ├── rl.py ├── queue_envs.py ├── rl1.py └── rlvaliterchngd.py ├── results ├── .DS_Store ├── 2ca4x4.png ├── results.docx ├── part a4x4 │ ├── 2i.png │ ├── 2c4x4.png │ ├── 2e4x4.png │ ├── 1gvalue.csv │ └── 1gpolicy.csv ├── part a4x4 with max │ ├── 1c.png │ ├── 1e.png │ ├── 1gpolicy.csv │ └── 1gvalue.csv ├── part a8x8 with max │ ├── .DS_Store │ ├── 2c8x8.png │ ├── 2e8x8.png │ ├── 1gpolicy.csv │ └── 1gvalue.csv ├── Deterministic-4x4-neg-reward-FrozenLake-v0.png ├── Deterministic-4x4-neg-reward-FrozenLake-v0gamma0.16.png ├── Deterministic-4x4-neg-reward-FrozenLake-v0_2cvaluegamma0.16.csv ├── Stochastic-4x4-FrozenLake-v0_2bvalue.csv └── Deterministic-4x4-neg-reward-FrozenLake-v0_2cvalue.csv ├── Stochastic-4x4-FrozenLake-v0.png ├── .ipynb_checkpoints └── 21-checkpoint.ipynb ├── Deterministic-4x4-neg-reward-FrozenLake-v0.png ├── setup.py ├── .idea ├── modules.xml ├── misc.xml ├── deeprl_hw1_src.iml ├── inspectionProfiles │ └── Project_Default.xml └── workspace.xml ├── README.md ├── example.py ├── DeterministicFrozenLake.py ├── StochasticFrozenLake.py ├── DeterministicFrozenNegReward.py └── 21.ipynb /requirements.txt: -------------------------------------------------------------------------------- 1 | future 2 | gym 3 | numpy 4 | six 5 | -e . 6 | -------------------------------------------------------------------------------- /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aaksham/frozenlake/HEAD/.DS_Store -------------------------------------------------------------------------------- /deeprl_hw1/__init__.py: -------------------------------------------------------------------------------- 1 | import deeprl_hw1.lake_envs 2 | import deeprl_hw1.queue_envs 3 | -------------------------------------------------------------------------------- /results/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aaksham/frozenlake/HEAD/results/.DS_Store -------------------------------------------------------------------------------- /deeprl_hw1/rl1.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aaksham/frozenlake/HEAD/deeprl_hw1/rl1.pyc -------------------------------------------------------------------------------- /results/2ca4x4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aaksham/frozenlake/HEAD/results/2ca4x4.png -------------------------------------------------------------------------------- /deeprl_hw1/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aaksham/frozenlake/HEAD/deeprl_hw1/.DS_Store -------------------------------------------------------------------------------- /results/results.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aaksham/frozenlake/HEAD/results/results.docx -------------------------------------------------------------------------------- /deeprl_hw1/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aaksham/frozenlake/HEAD/deeprl_hw1/__init__.pyc 
-------------------------------------------------------------------------------- /deeprl_hw1/lake_envs.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aaksham/frozenlake/HEAD/deeprl_hw1/lake_envs.pyc -------------------------------------------------------------------------------- /results/part a4x4/2i.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aaksham/frozenlake/HEAD/results/part a4x4/2i.png -------------------------------------------------------------------------------- /deeprl_hw1/queue_envs.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aaksham/frozenlake/HEAD/deeprl_hw1/queue_envs.pyc -------------------------------------------------------------------------------- /results/part a4x4/2c4x4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aaksham/frozenlake/HEAD/results/part a4x4/2c4x4.png -------------------------------------------------------------------------------- /results/part a4x4/2e4x4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aaksham/frozenlake/HEAD/results/part a4x4/2e4x4.png -------------------------------------------------------------------------------- /deeprl_hw1/rlvaliterchngd.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aaksham/frozenlake/HEAD/deeprl_hw1/rlvaliterchngd.pyc -------------------------------------------------------------------------------- /Stochastic-4x4-FrozenLake-v0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aaksham/frozenlake/HEAD/Stochastic-4x4-FrozenLake-v0.png -------------------------------------------------------------------------------- /results/part a4x4 with max/1c.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aaksham/frozenlake/HEAD/results/part a4x4 with max/1c.png -------------------------------------------------------------------------------- /results/part a4x4 with max/1e.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aaksham/frozenlake/HEAD/results/part a4x4 with max/1e.png -------------------------------------------------------------------------------- /results/part a8x8 with max/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aaksham/frozenlake/HEAD/results/part a8x8 with max/.DS_Store -------------------------------------------------------------------------------- /results/part a8x8 with max/2c8x8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aaksham/frozenlake/HEAD/results/part a8x8 with max/2c8x8.png -------------------------------------------------------------------------------- /results/part a8x8 with max/2e8x8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aaksham/frozenlake/HEAD/results/part a8x8 with max/2e8x8.png -------------------------------------------------------------------------------- /.ipynb_checkpoints/21-checkpoint.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /results/part a4x4/1gvalue.csv: -------------------------------------------------------------------------------- 1 | 0.592,0.657,0.730,0.657 2 | 0.657,0.001,0.811,0.002 3 | 0.730,0.811,0.901,0.002 4 | 0.007,0.901,1.001,0.001 -------------------------------------------------------------------------------- /Deterministic-4x4-neg-reward-FrozenLake-v0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aaksham/frozenlake/HEAD/Deterministic-4x4-neg-reward-FrozenLake-v0.png -------------------------------------------------------------------------------- /results/part a4x4/1gpolicy.csv: -------------------------------------------------------------------------------- 1 | 0.5916,0.6572,0.7301,0.6571 2 | 0.6572,0.0005,0.8111,0.0023 3 | 0.7301,0.8111,0.9011,0.0021 4 | 0.0074,0.9011,1.0011,0.0011 -------------------------------------------------------------------------------- /results/Deterministic-4x4-neg-reward-FrozenLake-v0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aaksham/frozenlake/HEAD/results/Deterministic-4x4-neg-reward-FrozenLake-v0.png -------------------------------------------------------------------------------- /results/Deterministic-4x4-neg-reward-FrozenLake-v0gamma0.16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aaksham/frozenlake/HEAD/results/Deterministic-4x4-neg-reward-FrozenLake-v0gamma0.16.png -------------------------------------------------------------------------------- /results/Deterministic-4x4-neg-reward-FrozenLake-v0_2cvaluegamma0.16.csv: -------------------------------------------------------------------------------- 1 | -0.999935,0.000065,-0.999961,0.000039 2 | 0.000065,0.000065,0.000039,0.000039 3 | 0.000046,0.000010,0.000101,0.000101 4 | 0.000046,0.000007,1.190390,1.190390 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from distutils.core import setup 4 | 5 | setup( 6 | name='DeepRL Homework 1', 7 | version='1.0', 8 | description='Library for 10-703 Homework 1', 9 | packages=['deeprl_hw1']) 10 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 7 | -------------------------------------------------------------------------------- /results/part a4x4 with max/1gpolicy.csv: -------------------------------------------------------------------------------- 1 | 5.923973887191247290e-01,6.580073887191245641e-01,7.309073887191246399e-01,6.578166498472122203e-01 2 | 6.580073887191245641e-01,5.799280908488830578e-03,8.119073887191247119e-01,7.018522510759056705e-03 3 | 7.309073887191246399e-01,8.119073887191247119e-01,9.019073887191245698e-01,4.045348128936829789e-03 4 | 
8.663607960043642059e-03,9.019073887191245698e-01,1.001907388719124548e+00,1.907388719124641875e-03 5 | -------------------------------------------------------------------------------- /results/part a4x4 with max/1gvalue.csv: -------------------------------------------------------------------------------- 1 | 5.940389180249019407e-01,6.596489180249021089e-01,7.325489180249019627e-01,6.592940262224117332e-01 2 | 6.596489180249021089e-01,5.975301847172851538e-03,8.135489180249019237e-01,6.305401447811602444e-03 3 | 7.325489180249019627e-01,8.135489180249019237e-01,9.035489180249020036e-01,8.526760647121702126e-04 4 | 8.310233909333586064e-03,9.035489180249020036e-01,1.003548918024901981e+00,3.548918024901988755e-03 5 | -------------------------------------------------------------------------------- /deeprl_hw1/driver3.py: -------------------------------------------------------------------------------- 1 | import deeprl_hw1.queue_envs as qenv 2 | import numpy 3 | P1 = 0.1 4 | P2 = 0.9 5 | P3 = 0.1 6 | 7 | env=qenv.QueueEnv(P1,P2,P3) 8 | #ps=env.query_model((1,0,0,0),1) 9 | #print ps 10 | ps=env.query_model((1,5,3,4),3) 11 | print ps 12 | numpy.random.seed(0) 13 | env.reset() 14 | env.render() 15 | env._step(1) 16 | env.render() 17 | env._step(3) 18 | env.render() 19 | # 20 | # ps=env.query_model((1,5,5,5),3) 21 | # print ps 22 | -------------------------------------------------------------------------------- /results/Stochastic-4x4-FrozenLake-v0_2bvalue.csv: -------------------------------------------------------------------------------- 1 | 7.210080319262232584e-02,6.500778087490913237e-02,7.867964029828880546e-02,5.967327973610045411e-02 2 | 9.499989900152405742e-02,8.464270518010267794e-03,1.174987390033538359e-01,2.065368390832942116e-03 3 | 1.488080916636685680e-01,2.513906886597030987e-01,3.040243032811825175e-01,1.272454070772601007e-03 4 | 6.766787927394104021e-03,3.841673769823148454e-01,6.439724196755912677e-01,6.094194795894227086e-03 5 | -------------------------------------------------------------------------------- /results/Deterministic-4x4-neg-reward-FrozenLake-v0_2cvalue.csv: -------------------------------------------------------------------------------- 1 | -9.999608187719284391e-01,3.918122807154976087e-05,-9.999774521696255247e-01,2.254783037451089448e-05 2 | 3.918122807154976087e-05,3.918122807154976087e-05,2.254783037451089448e-05,2.254783037451089448e-05 3 | 9.128853355511469254e-05,6.268996491447961536e-06,5.862065414633444213e-05,5.862065414633444213e-05 4 | 9.128853355511469254e-05,1.460616536881835047e-05,1.190425913320509954e+00,1.190425913320509954e+00 5 | -------------------------------------------------------------------------------- /.idea/deeprl_hw1_src.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/Project_Default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 15 | -------------------------------------------------------------------------------- /results/part a8x8 with max/1gpolicy.csv: -------------------------------------------------------------------------------- 1 | 2.579595231504817621e-01,2.862024767985817952e-01,3.175835364075818013e-01,3.524513804175817833e-01,3.911934293175818311e-01,4.342401503175817856e-01,4.820698403175819879e-01,5.352139403175819599e-01 2 | 
2.862024767985817952e-01,3.175835364075818013e-01,3.524513804175817833e-01,3.911934293175818311e-01,4.342401503175817856e-01,4.820698403175819879e-01,5.352139403175819599e-01,5.942629403175818670e-01 3 | 3.175835364075818013e-01,3.524513804175817833e-01,3.911934293175818311e-01,3.767824654025735392e-03,4.820698403175819879e-01,5.352139403175819599e-01,5.942629403175818670e-01,6.598729403175817021e-01 4 | 3.524513804175817833e-01,3.911934293175818311e-01,4.342401503175817856e-01,4.820698403175819879e-01,5.352139403175819599e-01,3.942658874654073053e-03,6.598729403175817021e-01,7.327729403175817779e-01 5 | 3.172062423758236216e-01,3.520740863858236591e-01,3.908161352858235960e-01,9.125019350097088961e-04,5.942629403175818670e-01,6.598729403175817021e-01,7.327729403175817779e-01,8.137729403175818499e-01 6 | 2.854856181382412483e-01,8.370799156727023668e-03,6.671803637201679030e-03,5.942629403175818670e-01,6.598729403175817021e-01,7.327729403175817779e-01,7.436799765772327507e-03,9.037729403175817078e-01 7 | 3.168666777472413099e-01,4.493092146848743848e-03,4.816925462858237528e-01,5.348366462858237247e-01,7.133581024200938078e-03,8.137729403175818499e-01,2.763054665441602568e-04,1.003772940317581686e+00 8 | 3.517345217572412364e-01,3.904765706572413952e-01,4.335232916572414053e-01,5.762616389577770136e-03,8.137729403175818499e-01,9.037729403175817078e-01,1.003772940317581686e+00,3.772940317581741088e-03 9 | -------------------------------------------------------------------------------- /results/part a8x8 with max/1gvalue.csv: -------------------------------------------------------------------------------- 1 | 2.542550444410008881e-01,2.824979980891009212e-01,3.138790576981008718e-01,3.487469017081009648e-01,3.874889506081009016e-01,4.305356716081008006e-01,4.783653616081008364e-01,5.315094616081008638e-01 2 | 2.824979980891009212e-01,3.138790576981008718e-01,3.487469017081009648e-01,3.874889506081009016e-01,4.305356716081008006e-01,4.783653616081008364e-01,5.315094616081008638e-01,5.905584616081007709e-01 3 | 3.138790576981008718e-01,3.487469017081009648e-01,3.874889506081009016e-01,7.373080087904547095e-03,4.783653616081008364e-01,5.315094616081008638e-01,5.905584616081007709e-01,6.561684616081009391e-01 4 | 3.487469017081009648e-01,3.874889506081009016e-01,4.305356716081008006e-01,4.783653616081008364e-01,5.315094616081008638e-01,1.307747491744603533e-03,6.561684616081009391e-01,7.290684616081009040e-01 5 | 3.138722115372908905e-01,3.487400555472908170e-01,3.874821044472907539e-01,1.526370220697159533e-03,5.905584616081007709e-01,6.561684616081009391e-01,7.290684616081009040e-01,8.100684616081009759e-01 6 | 2.824849903835617848e-01,4.610209748014226609e-03,1.384175249950048581e-03,5.905584616081007709e-01,6.561684616081009391e-01,7.290684616081009040e-01,1.963379439910021348e-04,9.000684616081008338e-01 7 | 3.138660499925617353e-01,3.178091108843970371e-03,4.783585154472907996e-01,5.315026154472907161e-01,8.674123898821699957e-03,8.100684616081009759e-01,5.057248319540319850e-03,1.000068461608100812e+00 8 | 3.487338940025616618e-01,3.874759429025617097e-01,4.305226639025617197e-01,6.594137748716060420e-03,8.100684616081009759e-01,9.000684616081008338e-01,1.000068461608100812e+00,6.846160810080188482e-05 9 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Reinforcement Learning 2 | 3 | ## OpenAI Gym Environments 4 | ### Creating the environments 5 | 6 | To 
create the environment use the following code snippet: 7 | 8 | ``` 9 | import gym 10 | import deeprl_hw1.envs 11 | 12 | env = gym.make('Deterministic-4x4-FrozenLake-v0') 13 | ``` 14 | 15 | ### Actions 16 | 17 | There are four actions: LEFT, UP, DOWN, RIGHT represented as 18 | integers. The `deep_rl_hw1.envs` contains variables to reference 19 | these. For example: 20 | 21 | ``` 22 | print(deeprl_hw1.envs.LEFT) 23 | ``` 24 | 25 | will print out the number 0. 26 | 27 | ### Environment Attributes 28 | 29 | This class contains the following important attributes: 30 | 31 | - `nS` :: number of states 32 | - `nA` :: number of actions 33 | - `P` :: transitions, rewards, terminals 34 | 35 | The `P` attribute will be the most important for your implementation 36 | of value iteration and policy iteration. This attribute contains the 37 | model for the particular map instance. It is a dictionary of 38 | dictionary of lists with the following form: 39 | 40 | ``` 41 | P[s][a] = [(prob, nextstate, reward, is_terminal), ...] 42 | ``` 43 | 44 | For example, to get the probability of taking action LEFT in state 0 45 | you would use the following code: 46 | 47 | ``` 48 | env.P[0][deeprl_hw1.envs.LEFT] 49 | ``` 50 | 51 | This would return the list: `[(1.0, 0, 0.0, False)]` for the 52 | `Deterministic-4x4-FrozenLake-v0` domain. There is one tuple in the 53 | list, so there is only one possible next state. The next state will be 54 | state 0, according to the second number in the tuple. This will be the 55 | next state 100\% of the time according to the first number in the 56 | tuple. The reward function for this state action pair `R(0,LEFT) = 0` 57 | according to the third number. The final tuple value says that the 58 | next state is not terminal. 59 | 60 | ## 61 | ### Running a random policy 62 | 63 | example.py has an example of how to run a random policy on the domain. 64 | 65 | #Value Iteration 66 | The optimal policies for the different environments is in the .py files. 67 | -------------------------------------------------------------------------------- /example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | from __future__ import (absolute_import, division, print_function, 5 | unicode_literals) 6 | from builtins import input 7 | 8 | import deeprl_hw1.lake_envs as lake_env 9 | import gym 10 | import time 11 | 12 | 13 | def run_random_policy(env): 14 | """Run a random policy for the given environment. 15 | 16 | Logs the total reward and the number of steps until the terminal 17 | state was reached. 18 | 19 | Parameters 20 | ---------- 21 | env: gym.envs.Environment 22 | Instance of an OpenAI gym. 23 | 24 | Returns 25 | ------- 26 | (float, int) 27 | First number is the total undiscounted reward received. The 28 | second number is the total number of actions taken before the 29 | episode finished. 30 | """ 31 | initial_state = env.reset() 32 | env.render() 33 | time.sleep(1) # just pauses so you can see the output 34 | 35 | total_reward = 0 36 | num_steps = 0 37 | while True: 38 | nextstate, reward, is_terminal, debug_info = env.step( 39 | env.action_space.sample()) 40 | env.render() 41 | 42 | total_reward += reward 43 | num_steps += 1 44 | 45 | if is_terminal: 46 | break 47 | 48 | time.sleep(1) 49 | 50 | return total_reward, num_steps 51 | 52 | 53 | def print_env_info(env): 54 | print('Environment has %d states and %d actions.' 
% (env.nS, env.nA)) 55 | 56 | 57 | def print_model_info(env, state, action): 58 | transition_table_row = env.P[state][action] 59 | print( 60 | ('According to transition function, ' 61 | 'taking action %s(%d) in state %d leads to' 62 | ' %d possible outcomes') % (lake_env.action_names[action], 63 | action, state, len(transition_table_row))) 64 | for prob, nextstate, reward, is_terminal in transition_table_row: 65 | state_type = 'terminal' if is_terminal else 'non-terminal' 66 | print( 67 | '\tTransitioning to %s state %d with probability %f and reward %f' 68 | % (state_type, nextstate, prob, reward)) 69 | 70 | 71 | def main(): 72 | # create the environment 73 | env = gym.make('FrozenLake-v0') 74 | # uncomment next line to try the deterministic version 75 | # env = gym.make('Deterministic-4x4-FrozenLake-v0') 76 | 77 | print_env_info(env) 78 | print_model_info(env, 0, lake_env.DOWN) 79 | print_model_info(env, 1, lake_env.DOWN) 80 | print_model_info(env, 14, lake_env.RIGHT) 81 | 82 | input('Hit enter to run a random policy...') 83 | 84 | total_reward, num_steps = run_random_policy(env) 85 | print('Agent received total reward of: %f' % total_reward) 86 | print('Agent took %d steps' % num_steps) 87 | 88 | 89 | if __name__ == '__main__': 90 | main() 91 | -------------------------------------------------------------------------------- /DeterministicFrozenLake.py: -------------------------------------------------------------------------------- 1 | import deeprl_hw1.lake_envs as lake_env 2 | import gym 3 | import time 4 | import seaborn 5 | from tabulate import tabulate 6 | import matplotlib.pyplot as plt 7 | from deeprl_hw1.rl1 import * 8 | 9 | def run_policy(env,gamma,policy): 10 | initial_state = env.reset() 11 | env.render() 12 | time.sleep(1) # just pauses so you can see the output 13 | 14 | total_reward = 0 15 | num_steps = 0 16 | current_state=initial_state 17 | while True: 18 | nextstate, reward, is_terminal, debug_info = env.step(policy[current_state]) 19 | env.render() 20 | 21 | total_reward += math.pow(gamma,num_steps)*reward 22 | num_steps += 1 23 | 24 | if is_terminal: 25 | break 26 | 27 | current_state=nextstate 28 | time.sleep(1) 29 | 30 | return total_reward, num_steps 31 | 32 | grid=8 33 | envname='Deterministic-'+str(grid)+'x'+str(grid)+'-FrozenLake-v0' 34 | env = gym.make(envname) 35 | env.render() 36 | gamma=0.9 37 | print "Executing Policy Iteration" 38 | start_time=time.time() 39 | policy, value_func, policy_iters, val_iters= policy_iteration(env,gamma) 40 | print "Total time taken: "+str((time.time()-start_time)) 41 | print "Total Policy Improvement Steps: "+str(policy_iters) 42 | print "Total Policy Evaluation Steps: "+str(val_iters) 43 | print "Policy:" 44 | policy_str=print_policy(policy,lake_env.action_names) 45 | ps=[] 46 | for elem in policy_str: 47 | ps.append(elem[0]) 48 | reshaped_policy=np.reshape(ps,(grid,grid)) 49 | print tabulate(reshaped_policy,tablefmt='latex') 50 | f, ax = plt.subplots(figsize=(11, 9)) 51 | cmap = seaborn.diverging_palette(220, 10, as_cmap=True) 52 | reshaped=np.reshape(value_func,(grid,grid)) 53 | seaborn.heatmap(reshaped, cmap=cmap, vmax=1.1, 54 | square=True, xticklabels=grid+1, yticklabels=grid+1, 55 | linewidths=.5, cbar_kws={"shrink": .5}, ax=ax) 56 | plt.savefig('1c.png',bbox_inches='tight') 57 | np.savetxt('1gpolicy.csv',reshaped,delimiter=',') 58 | 59 | print "Executing Value Iteration" 60 | start_time=time.time() 61 | value_function,value_iters=value_iteration(env,gamma) 62 | print "Total time taken: "+str((time.time()-start_time)) 63 
| print "Total Value Iteration Steps: "+str(value_iters) 64 | print "Policy:" 65 | policy=value_function_to_policy(env,gamma,value_function) 66 | policy_str=print_policy(policy,lake_env.action_names) 67 | ps=[] 68 | for elem in policy_str: 69 | ps.append(elem[0]) 70 | reshaped_policy=np.reshape(ps,(grid,grid)) 71 | print tabulate(reshaped_policy,tablefmt='latex') 72 | f, ax = plt.subplots(figsize=(11, 9)) 73 | cmap = seaborn.diverging_palette(220, 10, as_cmap=True) 74 | reshaped=np.reshape(value_function,(grid,grid)) 75 | seaborn.heatmap(reshaped, cmap=cmap, vmax=1.1, 76 | square=True, xticklabels=grid+1, yticklabels=grid+1, 77 | linewidths=.5, cbar_kws={"shrink": .5}, ax=ax) 78 | plt.savefig('1e.png',bbox_inches='tight') 79 | np.savetxt('1gvalue.csv',reshaped,delimiter=',') 80 | 81 | cum_reward,nsteps=run_policy(env,gamma,policy) 82 | print "Cumulative Reward: "+str(cum_reward) 83 | print "No. of steps: "+str(nsteps) 84 | 85 | -------------------------------------------------------------------------------- /deeprl_hw1/lake_envs.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """Defines some frozen lake maps.""" 3 | 4 | from __future__ import (absolute_import, division, print_function, 5 | unicode_literals) 6 | 7 | from gym.envs.toy_text.frozen_lake import LEFT, RIGHT, DOWN, UP 8 | from gym.envs.toy_text import frozen_lake, discrete 9 | 10 | from gym.envs.registration import register 11 | 12 | action_names = {LEFT: 'LEFT', RIGHT: 'RIGHT', DOWN: 'DOWN', UP: 'UP'} 13 | 14 | register( 15 | id='Deterministic-4x4-FrozenLake-v0', 16 | entry_point='gym.envs.toy_text.frozen_lake:FrozenLakeEnv', 17 | kwargs={'map_name': '4x4', 18 | 'is_slippery': False}) 19 | 20 | register( 21 | id='Deterministic-8x8-FrozenLake-v0', 22 | entry_point='gym.envs.toy_text.frozen_lake:FrozenLakeEnv', 23 | kwargs={'map_name': '8x8', 24 | 'is_slippery': False}) 25 | 26 | register( 27 | id='Stochastic-4x4-FrozenLake-v0', 28 | entry_point='gym.envs.toy_text.frozen_lake:FrozenLakeEnv', 29 | kwargs={'map_name': '4x4', 30 | 'is_slippery': True}) 31 | 32 | register( 33 | id='Stochastic-8x8-FrozenLake-v0', 34 | entry_point='gym.envs.toy_text.frozen_lake:FrozenLakeEnv', 35 | kwargs={'map_name': '8x8', 36 | 'is_slippery': True}) 37 | 38 | 39 | class NegRewardFrozenLake(frozen_lake.FrozenLakeEnv): 40 | def __init__(self, **kwargs): 41 | super(NegRewardFrozenLake, self).__init__(**kwargs) 42 | 43 | # modify the rewards 44 | for state in range(self.nS): 45 | for action in range(self.nA): 46 | new_transitions = [] 47 | for (prob, nextstate, _, is_terminal) in self.P[state][action]: 48 | row = nextstate // self.ncol 49 | col = nextstate - row * self.ncol 50 | tile_type = self.desc[row, col] 51 | if tile_type == 'F' or tile_type == 'S': 52 | reward = -1 53 | elif tile_type == 'G': 54 | reward = 1 55 | else: 56 | reward = 0 57 | 58 | new_transitions.append( 59 | (prob, nextstate, reward, is_terminal)) 60 | self.P[state][action] = new_transitions 61 | 62 | 63 | register( 64 | id='Deterministic-4x4-neg-reward-FrozenLake-v0', 65 | entry_point='deeprl_hw1.lake_envs:NegRewardFrozenLake', 66 | kwargs={'map_name': '4x4', 67 | 'is_slippery': False}) 68 | 69 | register( 70 | id='Stochastic-4x4-neg-reward-FrozenLake-v0', 71 | entry_point='deeprl_hw1.lake_envs:NegRewardFrozenLake', 72 | kwargs={'map_name': '4x4', 73 | 'is_slippery': True}) 74 | 75 | register( 76 | id='Deterministic-8x8-neg-reward-FrozenLake-v0', 77 | entry_point='deeprl_hw1.lake_envs:NegRewardFrozenLake', 78 | 
kwargs={'map_name': '8x8', 79 | 'is_slippery': False}) 80 | 81 | register( 82 | id='Stochastic-8x8-neg-reward-FrozenLake-v0', 83 | entry_point='deeprl_hw1.lake_envs:NegRewardFrozenLake', 84 | kwargs={'map_name': '8x8', 85 | 'is_slippery': True}) 86 | -------------------------------------------------------------------------------- /StochasticFrozenLake.py: -------------------------------------------------------------------------------- 1 | import deeprl_hw1.lake_envs as lake_env 2 | import gym 3 | import time 4 | import seaborn 5 | from tabulate import tabulate 6 | import matplotlib.pyplot as plt 7 | from deeprl_hw1.rlvaliterchngd import * 8 | 9 | def run_policy(env,gamma,policy): 10 | initial_state = env.reset() 11 | #env.render() 12 | time.sleep(1) # just pauses so you can see the output 13 | 14 | total_reward = 0 15 | num_steps = 0 16 | current_state=initial_state 17 | while True: 18 | nextstate, reward, is_terminal, debug_info = env.step(policy[current_state]) 19 | #env.render() 20 | 21 | total_reward += math.pow(gamma,num_steps)*reward 22 | num_steps += 1 23 | 24 | if is_terminal: 25 | break 26 | 27 | current_state=nextstate 28 | time.sleep(1) 29 | 30 | return total_reward, num_steps 31 | 32 | grid=4 33 | envname='Stochastic-'+str(grid)+'x'+str(grid)+'-FrozenLake-v0' 34 | env = gym.make(envname) 35 | env.render() 36 | gamma=0.9 37 | 38 | # print "Executing Policy Iteration" 39 | # start_time=time.time() 40 | # policy, value_func, policy_iters, val_iters= policy_iteration(env,gamma) 41 | # print "Total time taken: "+str((time.time()-start_time)) 42 | # print "Total Policy Improvement Steps: "+str(policy_iters) 43 | # print "Total Policy Evaluation Steps: "+str(val_iters) 44 | # print "Policy:" 45 | # policy_str=print_policy(policy,lake_env.action_names) 46 | # ps=[] 47 | # for elem in policy_str: 48 | # ps.append(elem[0]) 49 | # reshaped_policy=np.reshape(ps,(grid,grid)) 50 | # print tabulate(reshaped_policy,tablefmt='latex') 51 | # f, ax = plt.subplots(figsize=(11, 9)) 52 | # cmap = seaborn.diverging_palette(220, 10, as_cmap=True) 53 | # reshaped=np.reshape(value_func,(grid,grid)) 54 | # seaborn.heatmap(reshaped, cmap=cmap, vmax=1.1, 55 | # square=True, xticklabels=grid+1, yticklabels=grid+1, 56 | # linewidths=.5, cbar_kws={"shrink": .5}, ax=ax) 57 | # plt.savefig('1c.png',bbox_inches='tight') 58 | # np.savetxt('1gpolicy.csv',reshaped,delimiter=',') 59 | 60 | print "Executing Value Iteration" 61 | start_time=time.time() 62 | value_function,value_iters=value_iteration(env,gamma) 63 | print "Total time taken: "+str((time.time()-start_time)) 64 | print "Total Value Iteration Steps: "+str(value_iters) 65 | print "Policy:" 66 | policy=value_function_to_policy(env,gamma,value_function) 67 | policy_str=print_policy(policy,lake_env.action_names) 68 | ps=[] 69 | for elem in policy_str: 70 | ps.append(elem[0]) 71 | reshaped_policy=np.reshape(ps,(grid,grid)) 72 | print tabulate(reshaped_policy,tablefmt='latex') 73 | f, ax = plt.subplots(figsize=(11, 9)) 74 | cmap = seaborn.diverging_palette(220, 10, as_cmap=True) 75 | reshaped=np.reshape(value_function,(grid,grid)) 76 | seaborn.heatmap(reshaped, cmap=cmap, vmax=1.1, 77 | square=True, xticklabels=grid+1, yticklabels=grid+1, 78 | linewidths=.5, cbar_kws={"shrink": .5}, ax=ax) 79 | plt.savefig(envname+'.png',bbox_inches='tight') 80 | np.savetxt(envname+'_2bvalue.csv',reshaped,delimiter=',') 81 | 82 | total_cum_reward=0 83 | maxn=5 84 | start_time=time.time() 85 | for n in range(maxn): 86 | cum_reward,nsteps=run_policy(env,gamma,policy) 87 | 
total_cum_reward+=cum_reward 88 | if n%1==0: print "Done "+str(n) 89 | print ("Time: "+str((time.time()-start_time)/60)) 90 | 91 | print "Average Cumulative Reward: "+str((total_cum_reward/maxn)) 92 | print "No. of steps: "+str(nsteps) 93 | 94 | -------------------------------------------------------------------------------- /DeterministicFrozenNegReward.py: -------------------------------------------------------------------------------- 1 | import deeprl_hw1.lake_envs as lake_env 2 | import gym 3 | import time 4 | import seaborn 5 | from tabulate import tabulate 6 | import matplotlib.pyplot as plt 7 | from deeprl_hw1.rlvaliterchngd import * 8 | 9 | def run_policy(env,gamma,policy): 10 | initial_state = env.reset() 11 | #env.render() 12 | time.sleep(1) # just pauses so you can see the output 13 | 14 | total_reward = 0 15 | num_steps = 0 16 | current_state=initial_state 17 | while True: 18 | nextstate, reward, is_terminal, debug_info = env.step(policy[current_state]) 19 | #env.render() 20 | 21 | total_reward += math.pow(gamma,num_steps)*reward 22 | num_steps += 1 23 | 24 | if is_terminal: 25 | break 26 | 27 | current_state=nextstate 28 | time.sleep(1) 29 | 30 | return total_reward, num_steps 31 | 32 | grid=4 33 | envname='Deterministic-4x4-neg-reward-FrozenLake-v0' 34 | env = gym.make(envname) 35 | env.render() 36 | gamma=0.16 37 | 38 | # print "Executing Policy Iteration" 39 | # start_time=time.time() 40 | # policy, value_func, policy_iters, val_iters= policy_iteration(env,gamma) 41 | # print "Total time taken: "+str((time.time()-start_time)) 42 | # print "Total Policy Improvement Steps: "+str(policy_iters) 43 | # print "Total Policy Evaluation Steps: "+str(val_iters) 44 | # print "Policy:" 45 | # policy_str=print_policy(policy,lake_env.action_names) 46 | # ps=[] 47 | # for elem in policy_str: 48 | # ps.append(elem[0]) 49 | # reshaped_policy=np.reshape(ps,(grid,grid)) 50 | # print tabulate(reshaped_policy,tablefmt='latex') 51 | # f, ax = plt.subplots(figsize=(11, 9)) 52 | # cmap = seaborn.diverging_palette(220, 10, as_cmap=True) 53 | # reshaped=np.reshape(value_func,(grid,grid)) 54 | # seaborn.heatmap(reshaped, cmap=cmap, vmax=1.1, 55 | # square=True, xticklabels=grid+1, yticklabels=grid+1, 56 | # linewidths=.5, cbar_kws={"shrink": .5}, ax=ax) 57 | # plt.savefig('1c.png',bbox_inches='tight') 58 | # np.savetxt('1gpolicy.csv',reshaped,delimiter=',') 59 | 60 | print "Executing Value Iteration" 61 | start_time=time.time() 62 | value_function,value_iters=value_iteration(env,gamma) 63 | print "Total time taken: "+str((time.time()-start_time)) 64 | print "Total Value Iteration Steps: "+str(value_iters) 65 | print "Policy:" 66 | policy=value_function_to_policy(env,gamma,value_function) 67 | policy_str=print_policy(policy,lake_env.action_names) 68 | ps=[] 69 | for elem in policy_str: 70 | ps.append(elem[0]) 71 | reshaped_policy=np.reshape(ps,(grid,grid)) 72 | print tabulate(reshaped_policy,tablefmt='latex') 73 | f, ax = plt.subplots(figsize=(11, 9)) 74 | cmap = seaborn.diverging_palette(220, 10, as_cmap=True) 75 | reshaped=np.reshape(value_function,(grid,grid)) 76 | seaborn.heatmap(reshaped, cmap=cmap, vmax=5, 77 | square=True, xticklabels=grid+1, yticklabels=grid+1, 78 | linewidths=.5, cbar_kws={"shrink": .5}, ax=ax) 79 | plt.savefig(envname+'.png',bbox_inches='tight') 80 | np.savetxt(envname+'_2cvalue.csv',reshaped,delimiter=',') 81 | 82 | # total_cum_reward=0 83 | # maxn=5 84 | # start_time=time.time() 85 | # for n in range(maxn): 86 | # cum_reward,nsteps=run_policy(env,gamma,policy) 87 | # 
total_cum_reward+=cum_reward 88 | # if n%1==0: print "Done "+str(n) 89 | # print ("Time: "+str((time.time()-start_time)/60)) 90 | # 91 | # print "Average Cumulative Reward: "+str((total_cum_reward/maxn)) 92 | # print "No. of steps: "+str(nsteps) 93 | 94 | -------------------------------------------------------------------------------- /deeprl_hw1/rl.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import division, absolute_import 3 | from __future__ import print_function, unicode_literals 4 | 5 | import numpy as np 6 | 7 | 8 | def evaluate_policy(env, gamma, policy, max_iterations=int(1e3), tol=1e-3): 9 | """Evaluate the value of a policy. 10 | 11 | See page 87 (pg 105 pdf) of the Sutton and Barto Second Edition 12 | book. 13 | 14 | http://webdocs.cs.ualberta.ca/~sutton/book/bookdraft2016sep.pdf 15 | 16 | Parameters 17 | ---------- 18 | env: gym.core.Environment 19 | The environment to compute value iteration for. Must have nS, 20 | nA, and P as attributes. 21 | gamma: float 22 | Discount factor, must be in range [0, 1) 23 | policy: np.array 24 | The policy to evaluate. Maps states to actions. 25 | max_iterations: int 26 | The maximum number of iterations to run before stopping. 27 | tol: float 28 | Determines when value function has converged. 29 | 30 | Returns 31 | ------- 32 | np.ndarray 33 | The value for the given policy 34 | """ 35 | return np.zeros(env.nS) 36 | 37 | 38 | def value_function_to_policy(env, gamma, value_function): 39 | """Output action numbers for each state in value_function. 40 | 41 | Parameters 42 | ---------- 43 | env: gym.core.Environment 44 | Environment to compute policy for. Must have nS, nA, and P as 45 | attributes. 46 | gamma: float 47 | Discount factor. Number in range [0, 1) 48 | value_function: np.ndarray 49 | Value of each state. 50 | 51 | Returns 52 | ------- 53 | np.ndarray 54 | An array of integers. Each integer is the optimal action to take 55 | in that state according to the environment dynamics and the 56 | given value function. 57 | """ 58 | return np.zeros(env.nS, dtype='int') 59 | 60 | 61 | def improve_policy(env, gamma, value_func, policy): 62 | """Given a policy and value function improve the policy. 63 | 64 | See page 87 (pg 105 pdf) of the Sutton and Barto Second Edition 65 | book. 66 | 67 | http://webdocs.cs.ualberta.ca/~sutton/book/bookdraft2016sep.pdf 68 | 69 | Parameters 70 | ---------- 71 | env: gym.core.Environment 72 | The environment to compute value iteration for. Must have nS, 73 | nA, and P as attributes. 74 | gamma: float 75 | Discount factor, must be in range [0, 1) 76 | value_func: np.ndarray 77 | Value function for the given policy. 78 | policy: dict or np.array 79 | The policy to improve. Maps states to actions. 80 | max_iterations: int 81 | The maximum number of iterations to run before stopping. 82 | tol: float 83 | Determines when value function has converged. 84 | 85 | Returns 86 | ------- 87 | bool, np.ndarray 88 | Returns true if policy changed. Also returns the new policy. 89 | """ 90 | return False, policy 91 | 92 | 93 | def policy_iteration(env, gamma, max_iterations=int(1e3), tol=1e-3): 94 | """Runs policy iteration. 95 | 96 | See page 87 (pg 105 pdf) of the Sutton and Barto Second Edition 97 | book. 98 | 99 | http://webdocs.cs.ualberta.ca/~sutton/book/bookdraft2016sep.pdf 100 | 101 | You should use the improve_policy and evaluate_policy methods to 102 | implement this method. 
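A minimal sketch of one possible loop, mirroring the reference implementation in deeprl_hw1/rl1.py (where evaluate_policy also returns the number of evaluation sweeps it ran, and the boolean returned by improve_policy is True once the policy is stable):

    policy = np.zeros(env.nS, dtype='int')
    value_func = np.zeros(env.nS)
    improvement_steps, evaluation_steps = 0, 0
    stable = False
    while not stable:
        # evaluate the current policy, then act greedily with respect to it
        value_func, n_eval = evaluate_policy(env, gamma, policy)
        evaluation_steps += n_eval
        stable, policy = improve_policy(env, gamma, value_func, policy)
        improvement_steps += 1
    return policy, value_func, improvement_steps, evaluation_steps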
103 | 104 | Parameters 105 | ---------- 106 | env: gym.core.Environment 107 | The environment to compute value iteration for. Must have nS, 108 | nA, and P as attributes. 109 | gamma: float 110 | Discount factor, must be in range [0, 1) 111 | max_iterations: int 112 | The maximum number of iterations to run before stopping. 113 | tol: float 114 | Determines when value function has converged. 115 | 116 | Returns 117 | ------- 118 | (np.ndarray, np.ndarray, int, int) 119 | Returns optimal policy, value function, number of policy 120 | improvement iterations, and number of value iterations. 121 | """ 122 | policy = np.zeros(env.nS, dtype='int') 123 | value_func = np.zeros(env.nS) 124 | 125 | return policy, value_func, 0, 0 126 | 127 | 128 | def value_iteration(env, gamma, max_iterations=int(1e3), tol=1e-3): 129 | """Runs value iteration for a given gamma and environment. 130 | 131 | See page 90 (pg 108 pdf) of the Sutton and Barto Second Edition 132 | book. 133 | 134 | http://webdocs.cs.ualberta.ca/~sutton/book/bookdraft2016sep.pdf 135 | 136 | Parameters 137 | ---------- 138 | env: gym.core.Environment 139 | The environment to compute value iteration for. Must have nS, 140 | nA, and P as attributes. 141 | gamma: float 142 | Discount factor, must be in range [0, 1) 143 | max_iterations: int 144 | The maximum number of iterations to run before stopping. 145 | tol: float 146 | Determines when value function has converged. 147 | 148 | Returns 149 | ------- 150 | np.ndarray, iteration 151 | The value function and the number of iterations it took to converge. 152 | """ 153 | return np.zeros(env.nS), 0 154 | 155 | 156 | def print_policy(policy, action_names): 157 | """Print the policy in human-readable format. 158 | 159 | Parameters 160 | ---------- 161 | policy: np.ndarray 162 | Array of state to action number mappings 163 | action_names: dict 164 | Mapping of action numbers to characters representing the action. 
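For example, the driver scripts call print_policy(policy, lake_env.action_names) with the action_names dictionary defined in deeprl_hw1/lake_envs.py, which turns the integer actions into LEFT/RIGHT/DOWN/UP labels.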
165 | """ 166 | str_policy = policy.astype('str') 167 | for action_num, action_name in action_names.items(): 168 | np.place(str_policy, policy == action_num, action_name) 169 | 170 | print(str_policy) 171 | -------------------------------------------------------------------------------- /21.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 15, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import gym\n", 12 | "import deeprl_hw1\n", 13 | "from example import *" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 37, 19 | "metadata": { 20 | "collapsed": false 21 | }, 22 | "outputs": [ 23 | { 24 | "name": "stderr", 25 | "output_type": "stream", 26 | "text": [ 27 | "INFO:gym.envs.registration:Making new env: Deterministic-4x4-neg-reward-FrozenLake-v0\n", 28 | "[2017-02-15 00:08:57,815] Making new env: Deterministic-4x4-neg-reward-FrozenLake-v0\n" 29 | ] 30 | } 31 | ], 32 | "source": [ 33 | "env=gym.make('Deterministic-4x4-neg-reward-FrozenLake-v0')" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 30, 39 | "metadata": { 40 | "collapsed": false 41 | }, 42 | "outputs": [ 43 | { 44 | "name": "stdout", 45 | "output_type": "stream", 46 | "text": [ 47 | "0\n", 48 | "2\n", 49 | "3\n", 50 | "1\n" 51 | ] 52 | } 53 | ], 54 | "source": [ 55 | "print(deeprl_hw1.lake_envs.LEFT)\n", 56 | "print(deeprl_hw1.lake_envs.RIGHT)\n", 57 | "print(deeprl_hw1.lake_envs.UP)\n", 58 | "print(deeprl_hw1.lake_envs.DOWN)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 38, 64 | "metadata": { 65 | "collapsed": false 66 | }, 67 | "outputs": [ 68 | { 69 | "name": "stdout", 70 | "output_type": "stream", 71 | "text": [ 72 | "{0: {0: [(1.0, 0, -1, False)], 1: [(1.0, 4, -1, False)], 2: [(1.0, 1, -1, False)], 3: [(1.0, 0, -1, False)]}, 1: {0: [(1.0, 0, -1, False)], 1: [(1.0, 5, 0, True)], 2: [(1.0, 2, -1, False)], 3: [(1.0, 1, -1, False)]}, 2: {0: [(1.0, 1, -1, False)], 1: [(1.0, 6, -1, False)], 2: [(1.0, 3, -1, False)], 3: [(1.0, 2, -1, False)]}, 3: {0: [(1.0, 2, -1, False)], 1: [(1.0, 7, 0, True)], 2: [(1.0, 3, -1, False)], 3: [(1.0, 3, -1, False)]}, 4: {0: [(1.0, 4, -1, False)], 1: [(1.0, 8, -1, False)], 2: [(1.0, 5, 0, True)], 3: [(1.0, 0, -1, False)]}, 5: {0: [(1.0, 5, 0, True)], 1: [(1.0, 5, 0, True)], 2: [(1.0, 5, 0, True)], 3: [(1.0, 5, 0, True)]}, 6: {0: [(1.0, 5, 0, True)], 1: [(1.0, 10, -1, False)], 2: [(1.0, 7, 0, True)], 3: [(1.0, 2, -1, False)]}, 7: {0: [(1.0, 7, 0, True)], 1: [(1.0, 7, 0, True)], 2: [(1.0, 7, 0, True)], 3: [(1.0, 7, 0, True)]}, 8: {0: [(1.0, 8, -1, False)], 1: [(1.0, 12, 0, True)], 2: [(1.0, 9, -1, False)], 3: [(1.0, 4, -1, False)]}, 9: {0: [(1.0, 8, -1, False)], 1: [(1.0, 13, -1, False)], 2: [(1.0, 10, -1, False)], 3: [(1.0, 5, 0, True)]}, 10: {0: [(1.0, 9, -1, False)], 1: [(1.0, 14, -1, False)], 2: [(1.0, 11, 0, True)], 3: [(1.0, 6, -1, False)]}, 11: {0: [(1.0, 11, 0, True)], 1: [(1.0, 11, 0, True)], 2: [(1.0, 11, 0, True)], 3: [(1.0, 11, 0, True)]}, 12: {0: [(1.0, 12, 0, True)], 1: [(1.0, 12, 0, True)], 2: [(1.0, 12, 0, True)], 3: [(1.0, 12, 0, True)]}, 13: {0: [(1.0, 12, 0, True)], 1: [(1.0, 13, -1, False)], 2: [(1.0, 14, -1, False)], 3: [(1.0, 9, -1, False)]}, 14: {0: [(1.0, 13, -1, False)], 1: [(1.0, 14, -1, False)], 2: [(1.0, 15, 1, True)], 3: [(1.0, 10, -1, False)]}, 15: {0: [(1.0, 15, 1, True)], 1: [(1.0, 15, 1, True)], 2: [(1.0, 15, 1, True)], 3: [(1.0, 15, 1, True)]}}\n" 
73 | ] 74 | } 75 | ], 76 | "source": [ 77 | "print env.P" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 16, 83 | "metadata": { 84 | "collapsed": false 85 | }, 86 | "outputs": [ 87 | { 88 | "name": "stdout", 89 | "output_type": "stream", 90 | "text": [ 91 | "Environment has 16 states and 4 actions.\n" 92 | ] 93 | } 94 | ], 95 | "source": [ 96 | "print_env_info(env)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 17, 102 | "metadata": { 103 | "collapsed": false 104 | }, 105 | "outputs": [ 106 | { 107 | "name": "stdout", 108 | "output_type": "stream", 109 | "text": [ 110 | "According to transition function, taking action RIGHT(2) in state 0 leads to 1 possible outcomes\n", 111 | "\tTransitioning to non-terminal state 1 with probability 1.000000 and reward 0.000000\n" 112 | ] 113 | } 114 | ], 115 | "source": [ 116 | "print_model_info(env,0,deeprl_hw1.lake_envs.RIGHT)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 18, 122 | "metadata": { 123 | "collapsed": false 124 | }, 125 | "outputs": [ 126 | { 127 | "name": "stdout", 128 | "output_type": "stream", 129 | "text": [ 130 | "\u001b[41mS\u001b[0mFFF\n", 131 | "FHFH\n", 132 | "FFFH\n", 133 | "HFFG\n", 134 | "\n" 135 | ] 136 | }, 137 | { 138 | "data": { 139 | "text/plain": [ 140 | "" 141 | ] 142 | }, 143 | "execution_count": 18, 144 | "metadata": {}, 145 | "output_type": "execute_result" 146 | } 147 | ], 148 | "source": [ 149 | "env.render()" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 24, 155 | "metadata": { 156 | "collapsed": false 157 | }, 158 | "outputs": [ 159 | { 160 | "name": "stdout", 161 | "output_type": "stream", 162 | "text": [ 163 | "4\n" 164 | ] 165 | } 166 | ], 167 | "source": [ 168 | "print env.nA" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 23, 174 | "metadata": { 175 | "collapsed": false 176 | }, 177 | "outputs": [ 178 | { 179 | "name": "stdout", 180 | "output_type": "stream", 181 | "text": [ 182 | "4\n" 183 | ] 184 | } 185 | ], 186 | "source": [ 187 | "print env.action_space.n" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": { 194 | "collapsed": true 195 | }, 196 | "outputs": [], 197 | "source": [] 198 | } 199 | ], 200 | "metadata": { 201 | "kernelspec": { 202 | "display_name": "Python 2", 203 | "language": "python", 204 | "name": "python2" 205 | }, 206 | "language_info": { 207 | "codemirror_mode": { 208 | "name": "ipython", 209 | "version": 2 210 | }, 211 | "file_extension": ".py", 212 | "mimetype": "text/x-python", 213 | "name": "python", 214 | "nbconvert_exporter": "python", 215 | "pygments_lexer": "ipython2", 216 | "version": "2.7.12" 217 | } 218 | }, 219 | "nbformat": 4, 220 | "nbformat_minor": 0 221 | } 222 | -------------------------------------------------------------------------------- /deeprl_hw1/queue_envs.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """Define the Queue environment from problem 3 here.""" 3 | 4 | from __future__ import (absolute_import, division, print_function, 5 | unicode_literals) 6 | 7 | from gym import Env, spaces 8 | from gym.envs.registration import register 9 | import numpy 10 | import itertools 11 | 12 | class QueueEnv(Env): 13 | """Implement the Queue environment from problem 3. 14 | 15 | Parameters 16 | ---------- 17 | p1: float 18 | Value between [0, 1]. The probability of queue 1 receiving a new item. 
19 | p2: float 20 | Value between [0, 1]. The probability of queue 2 receiving a new item. 21 | p3: float 22 | Value between [0, 1]. The probability of queue 3 receiving a new item. 23 | 24 | Attributes 25 | ---------- 26 | nS: number of states 27 | nA: number of actions 28 | P: environment model 29 | """ 30 | metadata = {'render.modes': ['human']} 31 | 32 | SWITCH_TO_1 = 0 33 | SWITCH_TO_2 = 1 34 | SWITCH_TO_3 = 2 35 | SERVICE_QUEUE = 3 36 | 37 | 38 | 39 | 40 | def __init__(self, p1, p2, p3): 41 | self.action_space = spaces.Discrete(4) 42 | self.observation_space = spaces.MultiDiscrete( 43 | [(1, 3), (0, 5), (0, 5), (0, 5)]) 44 | self.nS = 0 45 | self.nA = 4 46 | self.P = dict() 47 | self.current_state=(1,0,0,0) 48 | self.p1=p1 49 | self.p2=p2 50 | self.p3=p3 51 | 52 | 53 | def _reset(self): 54 | """Reset the environment. 55 | 56 | The server should always start on Queue 1. 57 | 58 | Returns 59 | ------- 60 | (int, int, int, int) 61 | A tuple representing the current state with meanings 62 | (current queue, num items in 1, num items in 2, num items in 63 | 3). 64 | """ 65 | self.current_state=(1,0,0,0) 66 | return self.current_state 67 | 68 | def _step(self, action): 69 | """Execute the specified action. 70 | 71 | Parameters 72 | ---------- 73 | action: int 74 | A number in range [0, 3]. Represents the action. 75 | 76 | Returns 77 | ------- 78 | (state, reward, is_terminal, debug_info) 79 | State is the tuple in the same format as the reset 80 | method. Reward is a floating point number. is_terminal is a 81 | boolean representing if the new state is a terminal 82 | state. debug_info is a dictionary. You can fill debug_info 83 | with any additional information you deem useful. 84 | """ 85 | possible_next_states=self.query_model(self.current_state,action) 86 | probarray=[] 87 | for ps in possible_next_states: 88 | probarray.append(ps[0]) 89 | probs=numpy.asarray(probarray) 90 | randomarray=numpy.random.rand(len(possible_next_states),1) 91 | next_state_index=self.categorical_sample(probs,randomarray) 92 | pns=possible_next_states[next_state_index] 93 | next_state=(pns[1],pns[2],pns[3],dict()) 94 | self.current_state=next_state[0] 95 | return next_state 96 | 97 | 98 | 99 | def _render(self, mode='human', close=False): 100 | print ("Current Q: "+str(self.current_state[0])) 101 | print ("Items in Q1: "+str(self.current_state[1])) 102 | print ("Items in Q2: "+str(self.current_state[2])) 103 | print ("Items in Q3: "+str(self.current_state[3])) 104 | print ("\n") 105 | 106 | 107 | def _seed(self, seed=None): 108 | """Set the random seed. 109 | 110 | Parameters 111 | ---------- 112 | seed: int, None 113 | Random seed used by numpy.random and random. 114 | """ 115 | pass 116 | 117 | def query_model(self, state, action): 118 | """Return the possible transition outcomes for a state-action pair. 119 | 120 | This should be in the same format at the provided environments 121 | in section 2. 122 | 123 | Parameters 124 | ---------- 125 | state 126 | State used in query. Should be in the same format at 127 | the states returned by reset and step. 128 | action: int 129 | The action used in query. 130 | 131 | Returns 132 | ------- 133 | [(prob, nextstate, reward, is_terminal), ...] 
134 | List of possible outcomes 135 | """ 136 | lst=list(itertools.product([0,1],repeat=3)) 137 | reward=0 138 | newstate=list(state) 139 | if action==QueueEnv.SERVICE_QUEUE: 140 | currq=newstate[0] 141 | if newstate[currq]>0: 142 | newstate[currq]-=1 143 | reward=1 144 | elif action==QueueEnv.SWITCH_TO_1: 145 | newstate[0]=1 146 | elif action==QueueEnv.SWITCH_TO_2: 147 | newstate[0]=2 148 | elif action==QueueEnv.SWITCH_TO_3: 149 | newstate[0]=3 150 | blockq1=1 151 | blockq2=1 152 | blockq3=1 153 | if newstate[1]>=5: blockq1=0 154 | if newstate[2]>=5: blockq2=0 155 | if newstate[3]>=5: blockq3=0 156 | possible_states=[] 157 | for combination in lst: 158 | q1=combination[0] 159 | q2=combination[1] 160 | q3=combination[2] 161 | state_prob=0 162 | newpstate=newstate[:] 163 | if blockq1==0 or q1==0: state_prob+=(1-self.p1) 164 | else: 165 | state_prob+=self.p1 166 | newpstate[1]+=1 167 | if blockq2==0 or q2==0: state_prob=state_prob*(1-self.p2) 168 | else: 169 | state_prob=state_prob*self.p2 170 | newpstate[2]+=1 171 | if blockq3==0 or q3==0: state_prob=state_prob*(1-self.p3) 172 | else: 173 | state_prob=state_prob*self.p3 174 | newpstate[3]+=1 175 | found=False 176 | for psalready in possible_states: 177 | if tuple(newpstate) == psalready[1]: 178 | found=True 179 | break 180 | if not found: possible_states.append((state_prob,tuple(newpstate))) 181 | total_prob=0 182 | for ps in possible_states: 183 | total_prob+=ps[0] 184 | for i in range(len(possible_states)): 185 | unnormalized_state=possible_states[i] 186 | possible_states[i]=(float(unnormalized_state[0])/float(total_prob),unnormalized_state[1]) 187 | final_list=[] 188 | for ps in possible_states: 189 | final_list.append((ps[0],ps[1],reward,False)) 190 | return final_list 191 | 192 | def get_action_name(self, action): 193 | if action == QueueEnv.SERVICE_QUEUE: 194 | return 'SERVICE_QUEUE' 195 | elif action == QueueEnv.SWITCH_TO_1: 196 | return 'SWITCH_TO_1' 197 | elif action == QueueEnv.SWITCH_TO_2: 198 | return 'SWITCH_TO_2' 199 | elif action == QueueEnv.SWITCH_TO_3: 200 | return 'SWITCH_TO_3' 201 | return 'UNKNOWN' 202 | 203 | def categorical_sample(self, prob_n, np_random): 204 | """ 205 | Sample from categorical distribution 206 | Each row specifies class probabilities 207 | """ 208 | csprob_n = numpy.cumsum(prob_n) 209 | return (csprob_n > np_random).argmax() 210 | 211 | register( 212 | id='Queue-1-v0', 213 | entry_point='deeprl_hw1.queue_envs:QueueEnv', 214 | kwargs={'p1': .1, 215 | 'p2': .9, 216 | 'p3': .1}) 217 | 218 | register( 219 | id='Queue-2-v0', 220 | entry_point='deeprl_hw1.queue_envs:QueueEnv', 221 | kwargs={'p1': .1, 222 | 'p2': .1, 223 | 'p3': .1}) 224 | -------------------------------------------------------------------------------- /deeprl_hw1/rl1.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import division, absolute_import 3 | from __future__ import print_function, unicode_literals 4 | 5 | import numpy as np 6 | import math 7 | 8 | def evaluate_policy(env, gamma, policy, max_iterations=int(1e3), tol=1e-3): 9 | """Evaluate the value of a policy. 10 | 11 | See page 87 (pg 105 pdf) of the Sutton and Barto Second Edition 12 | book. 13 | 14 | http://webdocs.cs.ualberta.ca/~sutton/book/bookdraft2016sep.pdf 15 | 16 | Parameters 17 | ---------- 18 | env: gym.core.Environment 19 | The environment to compute value iteration for. Must have nS, 20 | nA, and P as attributes. 
21 | gamma: float 22 | Discount factor, must be in range [0, 1) 23 | policy: np.array 24 | The policy to evaluate. Maps states to actions. 25 | max_iterations: int 26 | The maximum number of iterations to run before stopping. 27 | tol: float 28 | Determines when value function has converged. 29 | 30 | Returns 31 | ------- 32 | np.ndarray 33 | The value for the given policy 34 | """ 35 | value_func_old = np.random.rand(env.nS) 36 | value_func_new = np.zeros(env.nS) 37 | for iteration in range(max_iterations): 38 | delta=0 39 | for s in range(env.nS): 40 | vs=0 41 | actions=[policy[s]] 42 | #if len(actions)==1: actions=[actions] 43 | for a in actions: 44 | for possible_next_state in env.P[s][a]: 45 | prob_action = possible_next_state[0] 46 | cur_reward=possible_next_state[2] 47 | future_reward=gamma*value_func_old[possible_next_state[1]] 48 | vs+=prob_action*(cur_reward+future_reward) 49 | #if env.P[s][a][3]:break 50 | diff=abs(value_func_old[s]-vs) 51 | delta=max(delta,diff) 52 | value_func_new[s]=vs 53 | #delta=math.sqrt(delta) 54 | if delta<=tol: break 55 | value_func_old = value_func_new 56 | return value_func_new, iteration 57 | 58 | 59 | def value_function_to_policy(env, gamma, value_function): 60 | """Output action numbers for each state in value_function. 61 | 62 | Parameters 63 | ---------- 64 | env: gym.core.Environment 65 | Environment to compute policy for. Must have nS, nA, and P as 66 | attributes. 67 | gamma: float 68 | Discount factor. Number in range [0, 1) 69 | value_function: np.ndarray 70 | Value of each state. 71 | 72 | Returns 73 | ------- 74 | np.ndarray 75 | An array of integers. Each integer is the optimal action to take 76 | in that state according to the environment dynamics and the 77 | given value function. 78 | """ 79 | policy=np.zeros(env.nS,dtype='int') 80 | for s in range(env.nS): 81 | maxvsa=-1 82 | maxa=-1 83 | for a in range(env.nA): 84 | vsa=0 85 | for possible_next_state in env.P[s][a]: 86 | prob_action = possible_next_state[0] 87 | cur_reward = possible_next_state[2] 88 | future_reward = gamma * value_function[possible_next_state[1]] 89 | vsa+=prob_action * (cur_reward + future_reward) 90 | if vsa>maxvsa: 91 | maxvsa=vsa 92 | maxa=a 93 | policy[s]=maxa 94 | 95 | return policy 96 | 97 | 98 | def improve_policy(env, gamma, value_func, policy): 99 | """Given a policy and value function improve the policy. 100 | 101 | See page 87 (pg 105 pdf) of the Sutton and Barto Second Edition 102 | book. 103 | 104 | http://webdocs.cs.ualberta.ca/~sutton/book/bookdraft2016sep.pdf 105 | 106 | Parameters 107 | ---------- 108 | env: gym.core.Environment 109 | The environment to compute value iteration for. Must have nS, 110 | nA, and P as attributes. 111 | gamma: float 112 | Discount factor, must be in range [0, 1) 113 | value_func: np.ndarray 114 | Value function for the given policy. 115 | policy: dict or np.array 116 | The policy to improve. Maps states to actions. 117 | max_iterations: int 118 | The maximum number of iterations to run before stopping. 119 | tol: float 120 | Determines when value function has converged. 121 | 122 | Returns 123 | ------- 124 | bool, np.ndarray 125 | Returns true if policy changed. Also returns the new policy. 
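In the implementation below the boolean is actually True when the policy is already stable, i.e. no state changed its greedy action during the sweep; policy_iteration loops while this flag is False. The greedy action for each state s is the argmax over a of the one-step backup sum over s' of P(s'|s,a) * (R(s,a,s') + gamma * value_func(s')), read directly from env.P.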
126 | """ 127 | stable=True 128 | for s in range(env.nS): 129 | old_action=policy[s] 130 | maxvsa=-1 131 | maxa=-1 132 | for a in range(env.nA): 133 | vsa=0 134 | for possible_next_state in env.P[s][a]: 135 | prob_action = possible_next_state[0] 136 | cur_reward = possible_next_state[2] 137 | future_reward = gamma * value_func[possible_next_state[1]] 138 | vsa+=prob_action * (cur_reward + future_reward) 139 | if vsa>maxvsa: 140 | maxvsa=vsa 141 | maxa=a 142 | if maxa!=old_action: stable=False 143 | policy[s]=maxa 144 | return stable, policy 145 | 146 | 147 | def policy_iteration(env, gamma, max_iterations=int(1e3), tol=1e-3): 148 | """Runs policy iteration. 149 | 150 | See page 87 (pg 105 pdf) of the Sutton and Barto Second Edition 151 | book. 152 | 153 | http://webdocs.cs.ualberta.ca/~sutton/book/bookdraft2016sep.pdf 154 | 155 | You should use the improve_policy and evaluate_policy methods to 156 | implement this method. 157 | 158 | Parameters 159 | ---------- 160 | env: gym.core.Environment 161 | The environment to compute value iteration for. Must have nS, 162 | nA, and P as attributes. 163 | gamma: float 164 | Discount factor, must be in range [0, 1) 165 | max_iterations: int 166 | The maximum number of iterations to run before stopping. 167 | tol: float 168 | Determines when value function has converged. 169 | 170 | Returns 171 | ------- 172 | (np.ndarray, np.ndarray, int, int) 173 | Returns optimal policy, value function, number of policy 174 | improvement iterations, and number of value iterations. 175 | """ 176 | policy = np.zeros(env.nS, dtype='int') 177 | value_func = np.zeros(env.nS) 178 | stable=False 179 | iters=0 180 | eval_iters=0 181 | while not stable: 182 | value_func,iter=evaluate_policy(env,gamma,policy) 183 | eval_iters+=iter 184 | stable,policy=improve_policy(env,gamma,value_func,policy) 185 | iters+=1 186 | return policy, value_func, iters, eval_iters 187 | 188 | 189 | def value_iteration(env, gamma, max_iterations=int(1e3), tol=1e-3): 190 | """Runs value iteration for a given gamma and environment. 191 | 192 | See page 90 (pg 108 pdf) of the Sutton and Barto Second Edition 193 | book. 194 | 195 | http://webdocs.cs.ualberta.ca/~sutton/book/bookdraft2016sep.pdf 196 | 197 | Parameters 198 | ---------- 199 | env: gym.core.Environment 200 | The environment to compute value iteration for. Must have nS, 201 | nA, and P as attributes. 202 | gamma: float 203 | Discount factor, must be in range [0, 1) 204 | max_iterations: int 205 | The maximum number of iterations to run before stopping. 206 | tol: float 207 | Determines when value function has converged. 208 | 209 | Returns 210 | ------- 211 | np.ndarray, iteration 212 | The value function and the number of iterations it took to converge. 
213 | """ 214 | value_func_old = np.random.rand(env.nS) 215 | value_func_new = np.zeros(env.nS) 216 | for iteration in range(max_iterations): 217 | delta=0 218 | for s in range(env.nS): 219 | maxvsa = -1 220 | for a in range(env.nA): 221 | vsa=0 222 | for possible_next_state in env.P[s][a]: 223 | prob_action = possible_next_state[0] 224 | cur_reward=possible_next_state[2] 225 | if possible_next_state[3]: 226 | future_reward=0 227 | else: future_reward=gamma*value_func_old[possible_next_state[1]] 228 | vsa+=prob_action*(cur_reward+future_reward) 229 | if vsa>maxvsa: 230 | maxvsa=vsa 231 | #diff=math.pow((value_func_old[s]-maxvsa),2) 232 | diff=abs(value_func_old[s]-maxvsa) 233 | delta=max(delta,diff) 234 | value_func_new[s]=maxvsa 235 | #delta=math.sqrt(delta) 236 | if delta<=tol: break 237 | value_func_old = value_func_new 238 | 239 | return value_func_new, iteration 240 | 241 | 242 | def print_policy(policy, action_names): 243 | """Print the policy in human-readable format. 244 | 245 | Parameters 246 | ---------- 247 | policy: np.ndarray 248 | Array of state to action number mappings 249 | action_names: dict 250 | Mapping of action numbers to characters representing the action. 251 | """ 252 | str_policy = policy.astype('str') 253 | for action_num, action_name in action_names.items(): 254 | np.place(str_policy, policy == action_num, action_name) 255 | 256 | print(str_policy) 257 | return str_policy 258 | -------------------------------------------------------------------------------- /deeprl_hw1/rlvaliterchngd.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import division, absolute_import 3 | from __future__ import print_function, unicode_literals 4 | 5 | import numpy as np 6 | import math 7 | 8 | def evaluate_policy(env, gamma, policy, max_iterations=int(1e3), tol=1e-3): 9 | """Evaluate the value of a policy. 10 | 11 | See page 87 (pg 105 pdf) of the Sutton and Barto Second Edition 12 | book. 13 | 14 | http://webdocs.cs.ualberta.ca/~sutton/book/bookdraft2016sep.pdf 15 | 16 | Parameters 17 | ---------- 18 | env: gym.core.Environment 19 | The environment to compute value iteration for. Must have nS, 20 | nA, and P as attributes. 21 | gamma: float 22 | Discount factor, must be in range [0, 1) 23 | policy: np.array 24 | The policy to evaluate. Maps states to actions. 25 | max_iterations: int 26 | The maximum number of iterations to run before stopping. 27 | tol: float 28 | Determines when value function has converged. 29 | 30 | Returns 31 | ------- 32 | np.ndarray 33 | The value for the given policy 34 | """ 35 | value_func_old = np.random.rand(env.nS) 36 | value_func_new = np.zeros(env.nS) 37 | for iteration in range(max_iterations): 38 | delta=0 39 | for s in range(env.nS): 40 | vs=0 41 | actions=[policy[s]] 42 | #if len(actions)==1: actions=[actions] 43 | for a in actions: 44 | for possible_next_state in env.P[s][a]: 45 | prob_action = possible_next_state[0] 46 | cur_reward=possible_next_state[2] 47 | future_reward=gamma*value_func_old[possible_next_state[1]] 48 | vs+=prob_action*(cur_reward+future_reward) 49 | #if env.P[s][a][3]:break 50 | diff=abs(value_func_old[s]-vs) 51 | delta=max(delta,diff) 52 | value_func_new[s]=vs 53 | #delta=math.sqrt(delta) 54 | if delta<=tol: break 55 | value_func_old = value_func_new 56 | return value_func_new, iteration 57 | 58 | 59 | def value_function_to_policy(env, gamma, value_function): 60 | """Output action numbers for each state in value_function. 
61 | 62 | Parameters 63 | ---------- 64 | env: gym.core.Environment 65 | Environment to compute policy for. Must have nS, nA, and P as 66 | attributes. 67 | gamma: float 68 | Discount factor. Number in range [0, 1) 69 | value_function: np.ndarray 70 | Value of each state. 71 | 72 | Returns 73 | ------- 74 | np.ndarray 75 | An array of integers. Each integer is the optimal action to take 76 | in that state according to the environment dynamics and the 77 | given value function. 78 | """ 79 | policy=np.zeros(env.nS,dtype='int') 80 | for s in range(env.nS): 81 | maxvsa=-1 82 | maxa=-1 83 | for a in range(env.nA): 84 | vsa=0 85 | for possible_next_state in env.P[s][a]: 86 | prob_action = possible_next_state[0] 87 | cur_reward = possible_next_state[2] 88 | future_reward = gamma * value_function[possible_next_state[1]] 89 | vsa+=prob_action * (cur_reward + future_reward) 90 | if vsa>maxvsa: 91 | maxvsa=vsa 92 | maxa=a 93 | policy[s]=maxa 94 | 95 | return policy 96 | 97 | 98 | def improve_policy(env, gamma, value_func, policy): 99 | """Given a policy and value function improve the policy. 100 | 101 | See page 87 (pg 105 pdf) of the Sutton and Barto Second Edition 102 | book. 103 | 104 | http://webdocs.cs.ualberta.ca/~sutton/book/bookdraft2016sep.pdf 105 | 106 | Parameters 107 | ---------- 108 | env: gym.core.Environment 109 | The environment to compute value iteration for. Must have nS, 110 | nA, and P as attributes. 111 | gamma: float 112 | Discount factor, must be in range [0, 1) 113 | value_func: np.ndarray 114 | Value function for the given policy. 115 | policy: dict or np.array 116 | The policy to improve. Maps states to actions. 117 | max_iterations: int 118 | The maximum number of iterations to run before stopping. 119 | tol: float 120 | Determines when value function has converged. 121 | 122 | Returns 123 | ------- 124 | bool, np.ndarray 125 | Returns true if policy changed. Also returns the new policy. 126 | """ 127 | stable=True 128 | for s in range(env.nS): 129 | old_action=policy[s] 130 | maxvsa=-1 131 | maxa=-1 132 | for a in range(env.nA): 133 | vsa=0 134 | for possible_next_state in env.P[s][a]: 135 | prob_action = possible_next_state[0] 136 | cur_reward = possible_next_state[2] 137 | future_reward = gamma * value_func[possible_next_state[1]] 138 | vsa+=prob_action * (cur_reward + future_reward) 139 | if vsa>maxvsa: 140 | maxvsa=vsa 141 | maxa=a 142 | if maxa!=old_action: stable=False 143 | policy[s]=maxa 144 | return stable, policy 145 | 146 | 147 | def policy_iteration(env, gamma, max_iterations=int(1e3), tol=1e-3): 148 | """Runs policy iteration. 149 | 150 | See page 87 (pg 105 pdf) of the Sutton and Barto Second Edition 151 | book. 152 | 153 | http://webdocs.cs.ualberta.ca/~sutton/book/bookdraft2016sep.pdf 154 | 155 | You should use the improve_policy and evaluate_policy methods to 156 | implement this method. 157 | 158 | Parameters 159 | ---------- 160 | env: gym.core.Environment 161 | The environment to compute value iteration for. Must have nS, 162 | nA, and P as attributes. 163 | gamma: float 164 | Discount factor, must be in range [0, 1) 165 | max_iterations: int 166 | The maximum number of iterations to run before stopping. 167 | tol: float 168 | Determines when value function has converged. 169 | 170 | Returns 171 | ------- 172 | (np.ndarray, np.ndarray, int, int) 173 | Returns optimal policy, value function, number of policy 174 | improvement iterations, and number of value iterations. 
175 | """ 176 | policy = np.zeros(env.nS, dtype='int') 177 | value_func = np.zeros(env.nS) 178 | stable=False 179 | iters=0 180 | eval_iters=0 181 | while not stable: 182 | value_func,iter=evaluate_policy(env,gamma,policy) 183 | eval_iters+=iter 184 | stable,policy=improve_policy(env,gamma,value_func,policy) 185 | iters+=1 186 | return policy, value_func, iters, eval_iters 187 | 188 | 189 | def value_iteration(env, gamma, max_iterations=int(1e3), tol=1e-3): 190 | """Runs value iteration for a given gamma and environment. 191 | 192 | See page 90 (pg 108 pdf) of the Sutton and Barto Second Edition 193 | book. 194 | 195 | http://webdocs.cs.ualberta.ca/~sutton/book/bookdraft2016sep.pdf 196 | 197 | Parameters 198 | ---------- 199 | env: gym.core.Environment 200 | The environment to compute value iteration for. Must have nS, 201 | nA, and P as attributes. 202 | gamma: float 203 | Discount factor, must be in range [0, 1) 204 | max_iterations: int 205 | The maximum number of iterations to run before stopping. 206 | tol: float 207 | Determines when value function has converged. 208 | 209 | Returns 210 | ------- 211 | np.ndarray, iteration 212 | The value function and the number of iterations it took to converge. 213 | """ 214 | value_func_old = np.random.rand(env.nS) 215 | value_func_new = np.zeros(env.nS) 216 | for iteration in range(max_iterations): 217 | delta=0 218 | for s in range(env.nS): 219 | maxvsa = -1 220 | for a in range(env.nA): 221 | vsa=0 222 | for possible_next_state in env.P[s][a]: 223 | prob_action = possible_next_state[0] 224 | cur_reward=possible_next_state[2] 225 | if value_func_new[possible_next_state[1]]==0: 226 | future_reward=gamma*value_func_old[possible_next_state[1]] 227 | else: 228 | future_reward = gamma * value_func_new[possible_next_state[1]] 229 | vsa+=prob_action*(cur_reward+future_reward) 230 | if vsa>maxvsa: 231 | maxvsa=vsa 232 | #diff=math.pow((value_func_old[s]-maxvsa),2) 233 | diff=abs(value_func_old[s]-maxvsa) 234 | delta=max(delta,diff) 235 | value_func_new[s]=maxvsa 236 | #delta=math.sqrt(delta) 237 | if delta<=tol: break 238 | value_func_old = value_func_new 239 | 240 | return value_func_new, iteration 241 | 242 | 243 | def print_policy(policy, action_names): 244 | """Print the policy in human-readable format. 245 | 246 | Parameters 247 | ---------- 248 | policy: np.ndarray 249 | Array of state to action number mappings 250 | action_names: dict 251 | Mapping of action numbers to characters representing the action. 
252 | """ 253 | str_policy = policy.astype('str') 254 | for action_num, action_name in action_names.items(): 255 | np.place(str_policy, policy == action_num, action_name) 256 | 257 | print(str_policy) 258 | return str_policy 259 | --------------------------------------------------------------------------------
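
The transition helper at the top of queue_envs.py combines three independent Bernoulli arrival processes: after the chosen action is applied, queue i gains a customer with probability p_i unless it already holds 5 customers, and itertools.product([0, 1], repeat=3) enumerates the eight arrival patterns. The standalone sketch below re-derives that arrival distribution; the function name build_arrival_distribution and the MAX_LEN constant are illustrative assumptions, and it merges colliding outcomes by summing their probabilities rather than deduplicating and renormalizing as the method above does (both yield the same proper distribution).

import itertools

MAX_LEN = 5  # assumed capacity, matching the ">= 5" blocking checks above

def build_arrival_distribution(queue_lengths, probs):
    """Map next queue-length tuples to their probabilities (sketch only)."""
    dist = {}
    for arrivals in itertools.product([0, 1], repeat=len(queue_lengths)):
        prob = 1.0
        next_lengths = list(queue_lengths)
        for i, (arrived, p) in enumerate(zip(arrivals, probs)):
            prob *= p if arrived else (1.0 - p)
            if arrived and next_lengths[i] < MAX_LEN:  # full queues drop arrivals
                next_lengths[i] += 1
        key = tuple(next_lengths)
        dist[key] = dist.get(key, 0.0) + prob  # merge colliding outcomes
    return dist

if __name__ == '__main__':
    d = build_arrival_distribution([5, 2, 0], [0.1, 0.9, 0.1])
    print(sorted(d.items()), sum(d.values()))  # probabilities sum to 1.0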
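
All of the solvers in rl1.py assume the classic gym DiscreteEnv interface (nS, nA, and the P[s][a] transition table). A minimal driver, in the spirit of example.py and driver3.py, could look like the sketch below. It assumes that importing deeprl_hw1.lake_envs registers the custom FrozenLake variants (the id Stochastic-4x4-FrozenLake-v0 appears in the results filenames), and the action_names mapping is a display-only assumption rather than the one actually defined in lake_envs.py.

import gym
import deeprl_hw1.lake_envs  # assumed side effect: registers the custom FrozenLake variants
from deeprl_hw1 import rl1

# Depending on the gym version, a TimeLimit wrapper may hide nS/nA/P;
# env.unwrapped exposes the underlying discrete environment if so.
env = gym.make('Stochastic-4x4-FrozenLake-v0')
gamma = 0.9

policy, value_func, improve_steps, eval_sweeps = rl1.policy_iteration(env, gamma)
print('policy improvement steps:', improve_steps, 'evaluation sweeps:', eval_sweeps)

vi_values, vi_sweeps = rl1.value_iteration(env, gamma)
vi_policy = rl1.value_function_to_policy(env, gamma, vi_values)
print('value iteration sweeps:', vi_sweeps)

action_names = {0: 'L', 1: 'D', 2: 'R', 3: 'U'}  # assumed action ordering, for display only
rl1.print_policy(policy, action_names)
rl1.print_policy(vi_policy, action_names)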
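
rl1.py and rlvaliterchngd.py differ only in how value_iteration computes the bootstrap term: rl1.py zeroes the future reward on terminal transitions and always reads from the previous sweep (value_func_old), while the changed variant reuses a value already written in the current sweep whenever that entry is non-zero, which is essentially an in-place (Gauss-Seidel-style) update. The toy comparison below makes that distinction concrete on a hand-built two-state P table in the same (prob, next_state, reward, terminal) tuple format; the MDP and the sweep helper are invented for this sketch, and it shows the plain in-place update rather than the non-zero fallback used above.

import numpy as np

# Toy MDP in the same P[s][a] = [(prob, next_state, reward, done), ...] format;
# the numbers are made up purely for illustration.
P = {
    0: {0: [(1.0, 0, 1.0, False)], 1: [(1.0, 1, 0.0, False)]},
    1: {0: [(1.0, 0, 0.0, False)], 1: [(1.0, 1, 0.0, False)]},
}
nS, nA, gamma = 2, 2, 0.9

def sweep(values, in_place):
    """One Bellman-optimality sweep; in_place=True mimics the changed variant."""
    new_values = values.copy()
    source = new_values if in_place else values
    for s in range(nS):
        new_values[s] = max(
            sum(p * (r + gamma * source[ns]) for p, ns, r, _ in P[s][a])
            for a in range(nA))
    return new_values

print('synchronous sweep:', sweep(np.zeros(nS), in_place=False))  # [1.0, 0.0]
print('in-place sweep   :', sweep(np.zeros(nS), in_place=True))   # [1.0, 0.9]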