├── codecov.yml ├── LICENSE ├── .gitignore ├── man ├── figures │ └── logo.png ├── RandomPolicy.Rd ├── SoftmaxPolicy.Rd ├── getStateValues.Rd ├── getEligibilityTraces.Rd ├── getReplayMemory.Rd ├── getValueFunction.Rd ├── EpsilonGreedyPolicy.Rd ├── Eligibility.Rd ├── nHot.Rd ├── QLearning.Rd ├── makeReplayMemory.Rd ├── ValueNetwork.Rd ├── makeAlgorithm.Rd ├── ValueTable.Rd ├── interact.Rd ├── makePolicy.Rd ├── MountainCar.Rd ├── makeValueFunction.Rd ├── GymEnvironment.Rd ├── makeAgent.Rd ├── reinforcelearn.Rd ├── Environment.Rd ├── MdpEnvironment.Rd ├── CliffWalking.Rd ├── windyGridworld.Rd ├── tilecoding.Rd ├── makeEnvironment.Rd └── gridworld.Rd ├── docs ├── reinforcelearn.png ├── reference │ ├── nHot.html │ ├── interact.html │ ├── QLearning.html │ ├── ValueTable.html │ ├── gridworld.html │ ├── makeAgent.html │ ├── makePolicy.html │ ├── tilecoding.html │ ├── CliffWalking.html │ ├── Eligibility.html │ ├── Environment.html │ ├── RandomPolicy.html │ ├── SoftmaxPolicy.html │ ├── ValueNetwork.html │ ├── figures │ │ └── logo.png │ ├── makeAlgorithm.html │ ├── mountainCar.html │ ├── GymEnvironment.html │ ├── MdpEnvironment.html │ ├── getReplayMemory.html │ ├── getStateValues.html │ ├── makeEnvironment.html │ ├── reinforcelearn.html │ ├── windyGridworld.html │ ├── getValueFunction.html │ ├── makeReplayMemory.html │ ├── makeValueFunction.html │ ├── EpsilonGreedyPolicy.html │ ├── getEligibilityTraces.html │ └── index.html ├── articles │ ├── gridworld.JPG │ ├── mountaincar.JPG │ ├── environments.R │ ├── agents.R │ └── index.html ├── pkgdown.yml ├── link.svg ├── pkgdown.js ├── jquery.sticky-kit.min.js ├── session_info.txt ├── pkgdown.css ├── authors.html ├── news │ └── index.html └── LICENSE.html ├── tests ├── testthat.R └── testthat │ ├── test_policy.R │ ├── test_accessor_functions.R │ ├── test_environment.R │ └── test_agent.R ├── vignettes ├── gridworld.JPG ├── mountaincar.JPG ├── environments.R ├── agents.R ├── agents.Rmd └── environments.Rmd ├── benchmark ├── Images │ ├── qlearning_windygrid-1.png │ ├── qlearning_windygrid_elig-1.png │ ├── qlearning_windygrid_expreplay-1.png │ └── qlearning_windygrid_neuralnetwork-1.png ├── benchmark_windy_gridworld.md └── benchmark_windy_gridworld.Rmd ├── NEWS.md ├── .Rbuildignore ├── cran-comments.md ├── .travis.yml ├── NAMESPACE ├── DESCRIPTION ├── R ├── reinforcelearn.R ├── eligibility.R ├── algorithm.R ├── accessor_functions.R ├── environment_mountaincar.R ├── environment_mdp.R ├── environment_gym.R ├── experience_replay.R ├── policy.R ├── interact.R ├── tiles.R ├── valuefunction.R └── environment.R ├── _pkgdown.yml ├── session_info.txt ├── README.Rmd ├── README.md └── examples └── user_interface.R /codecov.yml: -------------------------------------------------------------------------------- 1 | comment: false 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2018 2 | COPYRIGHT HOLDER: Markus Dumke -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | reinforcelearn.Rproj 6 | -------------------------------------------------------------------------------- /man/figures/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/man/figures/logo.png 
-------------------------------------------------------------------------------- /docs/reinforcelearn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reinforcelearn.png -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(reinforcelearn) 3 | 4 | test_check("reinforcelearn") 5 | -------------------------------------------------------------------------------- /vignettes/gridworld.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/vignettes/gridworld.JPG -------------------------------------------------------------------------------- /docs/reference/nHot.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/nHot.html -------------------------------------------------------------------------------- /vignettes/mountaincar.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/vignettes/mountaincar.JPG -------------------------------------------------------------------------------- /docs/articles/gridworld.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/articles/gridworld.JPG -------------------------------------------------------------------------------- /docs/reference/interact.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/interact.html -------------------------------------------------------------------------------- /docs/articles/mountaincar.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/articles/mountaincar.JPG -------------------------------------------------------------------------------- /docs/reference/QLearning.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/QLearning.html -------------------------------------------------------------------------------- /docs/reference/ValueTable.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/ValueTable.html -------------------------------------------------------------------------------- /docs/reference/gridworld.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/gridworld.html -------------------------------------------------------------------------------- /docs/reference/makeAgent.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/makeAgent.html -------------------------------------------------------------------------------- /docs/reference/makePolicy.html: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/makePolicy.html -------------------------------------------------------------------------------- /docs/reference/tilecoding.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/tilecoding.html -------------------------------------------------------------------------------- /docs/reference/CliffWalking.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/CliffWalking.html -------------------------------------------------------------------------------- /docs/reference/Eligibility.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/Eligibility.html -------------------------------------------------------------------------------- /docs/reference/Environment.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/Environment.html -------------------------------------------------------------------------------- /docs/reference/RandomPolicy.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/RandomPolicy.html -------------------------------------------------------------------------------- /docs/reference/SoftmaxPolicy.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/SoftmaxPolicy.html -------------------------------------------------------------------------------- /docs/reference/ValueNetwork.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/ValueNetwork.html -------------------------------------------------------------------------------- /docs/reference/figures/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/figures/logo.png -------------------------------------------------------------------------------- /docs/reference/makeAlgorithm.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/makeAlgorithm.html -------------------------------------------------------------------------------- /docs/reference/mountainCar.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/mountainCar.html -------------------------------------------------------------------------------- /docs/reference/GymEnvironment.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/GymEnvironment.html -------------------------------------------------------------------------------- /docs/reference/MdpEnvironment.html: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/MdpEnvironment.html -------------------------------------------------------------------------------- /docs/reference/getReplayMemory.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/getReplayMemory.html -------------------------------------------------------------------------------- /docs/reference/getStateValues.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/getStateValues.html -------------------------------------------------------------------------------- /docs/reference/makeEnvironment.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/makeEnvironment.html -------------------------------------------------------------------------------- /docs/reference/reinforcelearn.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/reinforcelearn.html -------------------------------------------------------------------------------- /docs/reference/windyGridworld.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/windyGridworld.html -------------------------------------------------------------------------------- /docs/reference/getValueFunction.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/getValueFunction.html -------------------------------------------------------------------------------- /docs/reference/makeReplayMemory.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/makeReplayMemory.html -------------------------------------------------------------------------------- /docs/reference/makeValueFunction.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/makeValueFunction.html -------------------------------------------------------------------------------- /docs/reference/EpsilonGreedyPolicy.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/EpsilonGreedyPolicy.html -------------------------------------------------------------------------------- /docs/reference/getEligibilityTraces.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/getEligibilityTraces.html -------------------------------------------------------------------------------- /benchmark/Images/qlearning_windygrid-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/benchmark/Images/qlearning_windygrid-1.png 
-------------------------------------------------------------------------------- /benchmark/Images/qlearning_windygrid_elig-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/benchmark/Images/qlearning_windygrid_elig-1.png -------------------------------------------------------------------------------- /benchmark/Images/qlearning_windygrid_expreplay-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/benchmark/Images/qlearning_windygrid_expreplay-1.png -------------------------------------------------------------------------------- /benchmark/Images/qlearning_windygrid_neuralnetwork-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/benchmark/Images/qlearning_windygrid_neuralnetwork-1.png -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # reinforcelearn 2 | 3 | # 0.2.0 4 | 5 | * Fixed failing tests due to random number generation in R 3.6.0. 6 | 7 | # 0.1.0 8 | 9 | * Initial release. 10 | * Added a `NEWS.md` file to track changes to the package. 11 | -------------------------------------------------------------------------------- /docs/pkgdown.yml: -------------------------------------------------------------------------------- 1 | urls: 2 | reference: http://markusdumke.github.io/reinforcelearn/reference 3 | article: http://markusdumke.github.io/reinforcelearn/articles 4 | articles: 5 | agents: agents.html 6 | environments: environments.html 7 | 8 | -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^\.travis\.yml$ 4 | ^README\.Rmd$ 5 | ^README-.*\.png$ 6 | ^cran-comments\.md$ 7 | ^codecov\.yml$ 8 | ^docs$ 9 | ^_pkgdown\.yml$ 10 | ^reinforcelearn\.png$ 11 | ^examples$ 12 | ^benchmark$ 13 | ^session_info.txt$ 14 | -------------------------------------------------------------------------------- /man/RandomPolicy.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/policy.R 3 | \name{RandomPolicy} 4 | \alias{RandomPolicy} 5 | \title{Random Policy} 6 | \description{ 7 | Random Policy 8 | } 9 | \section{Usage}{ 10 | 11 | \code{makePolicy("random")} 12 | } 13 | 14 | \examples{ 15 | pol = makePolicy("random") 16 | } 17 | -------------------------------------------------------------------------------- /man/SoftmaxPolicy.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/policy.R 3 | \name{SoftmaxPolicy} 4 | \alias{SoftmaxPolicy} 5 | \title{Softmax Policy} 6 | \description{ 7 | Softmax Policy 8 | } 9 | \section{Usage}{ 10 | 11 | \code{makePolicy("softmax")} 12 | } 13 | 14 | \examples{ 15 | pol = makePolicy("softmax") 16 | } 17 | -------------------------------------------------------------------------------- /tests/testthat/test_policy.R: -------------------------------------------------------------------------------- 1 | context("policy") 2 | policy1 = makePolicy("random") 
3 | policy2 = makePolicy("greedy") 4 | policy3 = makePolicy("epsilon.greedy", epsilon = 0.2) 5 | policy4 = makePolicy("softmax") 6 | test_that("policy creation returns list", { 7 | expect_equivalent(policy1, list(name = "random", args = list())) 8 | expect_equal(class(policy2), "Policy") 9 | expect_equal(policy3$args$epsilon, 0.2) 10 | }) 11 | -------------------------------------------------------------------------------- /man/getStateValues.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/accessor_functions.R 3 | \name{getStateValues} 4 | \alias{getStateValues} 5 | \title{Get state values.} 6 | \usage{ 7 | getStateValues(action.vals) 8 | } 9 | \arguments{ 10 | \item{action.vals}{[\code{matrix}] \cr Action value matrix.} 11 | } 12 | \description{ 13 | Get state value function from action value function. 14 | } 15 | -------------------------------------------------------------------------------- /man/getEligibilityTraces.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/accessor_functions.R 3 | \name{getEligibilityTraces} 4 | \alias{getEligibilityTraces} 5 | \title{Get eligibility traces} 6 | \usage{ 7 | getEligibilityTraces(agent) 8 | } 9 | \arguments{ 10 | \item{agent}{[Agent] \cr An agent created by \link{makeAgent}.} 11 | } 12 | \value{ 13 | A matrix with the eligibility traces. 14 | } 15 | \description{ 16 | Returns the eligibility traces of the agent. 17 | } 18 | -------------------------------------------------------------------------------- /man/getReplayMemory.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/accessor_functions.R 3 | \name{getReplayMemory} 4 | \alias{getReplayMemory} 5 | \title{Get replay memory.} 6 | \usage{ 7 | getReplayMemory(agent) 8 | } 9 | \arguments{ 10 | \item{agent}{[Agent] \cr An agent created by \link{makeAgent}.} 11 | } 12 | \value{ 13 | A list containing the experienced observations, actions and rewards. 14 | } 15 | \description{ 16 | Returns the replay memory of the agent. 17 | } 18 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | ## Resubmission 2 | This is a resubmission. In this version I have: 3 | 4 | * Fixed test that failed due to change in R random number generation in R 3.6.0. 5 | 6 | ## Test environments 7 | * local x86_64-w64-mingw32/x64 (64-bit), R 3.6.0 8 | * linux_gnu x_86_64, R Under development (unstable) (2017-12-22 r73943) on travis-ci 9 | 10 | ## R CMD check results 11 | 12 | 0 errors | 0 warnings | 0 notes 13 | 14 | ## Reverse dependencies 15 | 16 | There are no reverse dependencies. 
17 | 18 | --- 19 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # R for travis: see documentation at https://docs.travis-ci.com/user/languages/r 2 | 3 | language: R 4 | sudo: false 5 | cache: packages 6 | 7 | warnings_are_errors: true 8 | 9 | r: 10 | - devel 11 | 12 | before_install: 13 | - Rscript -e 'if (length(find.package("devtools", quiet = TRUE)) == 0) install.packages("devtools")' 14 | - Rscript -e 'devtools::install_deps()' 15 | 16 | after_success: 17 | - Rscript -e 'covr::codecov(type = "tests")' 18 | 19 | notifications: 20 | slack: 21 | on_success: change 22 | on_failure: change 23 | -------------------------------------------------------------------------------- /man/getValueFunction.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/accessor_functions.R 3 | \name{getValueFunction} 4 | \alias{getValueFunction} 5 | \title{Get weights of value function.} 6 | \usage{ 7 | getValueFunction(agent) 8 | } 9 | \arguments{ 10 | \item{agent}{[Agent] \cr An agent created by \link{makeAgent}.} 11 | } 12 | \value{ 13 | For a value function table this will return a matrix, for a neural 14 | network a list with the weights of the layers. 15 | } 16 | \description{ 17 | Returns the weights of the value function representation of the agent. 18 | } 19 | -------------------------------------------------------------------------------- /man/EpsilonGreedyPolicy.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/policy.R 3 | \name{EpsilonGreedyPolicy} 4 | \alias{EpsilonGreedyPolicy} 5 | \alias{GreedyPolicy} 6 | \title{Epsilon Greedy Policy} 7 | \arguments{ 8 | \item{epsilon}{[\code{numeric(1) in [0, 1]}] \cr 9 | Ratio of random exploration in epsilon-greedy action selection.} 10 | } 11 | \description{ 12 | Epsilon Greedy Policy 13 | } 14 | \section{Usage}{ 15 | 16 | \code{makePolicy("epsilon.greedy", epsilon = 0.1)} \cr 17 | \code{makePolicy("greedy")} 18 | } 19 | 20 | \examples{ 21 | policy = makePolicy("epsilon.greedy", epsilon = 0.1) 22 | } 23 | -------------------------------------------------------------------------------- /man/Eligibility.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/eligibility.R 3 | \name{Eligibility} 4 | \alias{Eligibility} 5 | \alias{eligibility} 6 | \title{Eligibility traces} 7 | \arguments{ 8 | \item{lambda}{[\code{numeric(1)} in (0, 1)] \cr Trace decay parameter.} 9 | 10 | \item{traces}{[\code{character(1)}] \cr Type of eligibility trace update. One of \code{c("replace", "accumulate")}.} 11 | } 12 | \description{ 13 | Eligibility traces. 
14 | } 15 | \details{ 16 | Algorithms supporting eligibility traces: 17 | \itemize{ 18 | \item \link{QLearning} 19 | } 20 | } 21 | \examples{ 22 | alg = makeAlgorithm("qlearning", lambda = 0.8, traces = "accumulate") 23 | } 24 | -------------------------------------------------------------------------------- /man/nHot.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tiles.R 3 | \name{nHot} 4 | \alias{nHot} 5 | \title{Make n hot vector.} 6 | \usage{ 7 | nHot(x, len, out = "matrix") 8 | } 9 | \arguments{ 10 | \item{x}{[\code{integer}] \cr Which features are active?} 11 | 12 | \item{len}{[\code{integer(1)}] \cr Length of the feature vector.} 13 | 14 | \item{out}{[\code{character(1)}] \cr Format of the output. Can be a vector or a matrix.} 15 | } 16 | \value{ 17 | [\code{matrix(1, len)}] A one-row matrix with \code{len} columns with every 18 | entry 0 except the columns specified by \code{x} which are 1. 19 | } 20 | \description{ 21 | Make n hot vector. 22 | } 23 | \examples{ 24 | nHot(c(1, 3), 5) 25 | nHot(c(1, 3), 5, out = "vector") 26 | } 27 | -------------------------------------------------------------------------------- /man/QLearning.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/algorithm.R 3 | \name{QLearning} 4 | \alias{QLearning} 5 | \alias{qlearning} 6 | \title{Q-Learning} 7 | \arguments{ 8 | \item{lambda}{[\code{numeric(1)} in (0, 1)] \cr Trace decay parameter.} 9 | 10 | \item{traces}{[\code{character(1)}] \cr Type of eligibility trace update. One of \code{c("replace", "accumulate")}.} 11 | } 12 | \description{ 13 | Q-Learning algorithm. 14 | } 15 | \details{ 16 | To use eligibility traces specify \code{lambda} and \code{traces}. 
17 | } 18 | \section{Usage}{ 19 | 20 | \code{makeAlgorithm("qlearning", lambda, traces)} 21 | } 22 | 23 | \examples{ 24 | alg = makeAlgorithm("qlearning", lambda = 0.8, traces = "accumulate") 25 | } 26 | \seealso{ 27 | \link{Eligibility} 28 | } 29 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(go,down) 4 | S3method(go,left) 5 | S3method(go,leftdown) 6 | S3method(go,leftup) 7 | S3method(go,right) 8 | S3method(go,rightdown) 9 | S3method(go,rightup) 10 | S3method(go,up) 11 | export(CliffWalking) 12 | export(Environment) 13 | export(EpsilonGreedyPolicy) 14 | export(Gridworld) 15 | export(GymEnvironment) 16 | export(MdpEnvironment) 17 | export(RandomPolicy) 18 | export(SoftmaxPolicy) 19 | export(WindyGridworld) 20 | export(getEligibilityTraces) 21 | export(getReplayMemory) 22 | export(getStateValues) 23 | export(getValueFunction) 24 | export(iht) 25 | export(interact) 26 | export(makeAgent) 27 | export(makeAlgorithm) 28 | export(makeEnvironment) 29 | export(makePolicy) 30 | export(makeReplayMemory) 31 | export(makeValueFunction) 32 | export(nHot) 33 | export(tiles) 34 | -------------------------------------------------------------------------------- /docs/link.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 8 | 12 | 13 | -------------------------------------------------------------------------------- /man/makeReplayMemory.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/experience_replay.R 3 | \name{makeReplayMemory} 4 | \alias{makeReplayMemory} 5 | \alias{experience.replay,} 6 | \alias{replay.memory} 7 | \title{Experience Replay} 8 | \usage{ 9 | makeReplayMemory(size = 100L, batch.size = 16L) 10 | } 11 | \arguments{ 12 | \item{size}{[\code{integer(1)}] \cr Size of replay memory.} 13 | 14 | \item{batch.size}{[\code{integer(1)}] \cr Batch size.} 15 | } 16 | \value{ 17 | [\code{list(size, batch.size)}] 18 | This list can then be passed onto \link{makeAgent}, which will construct the 19 | replay memory accordingly. 20 | } 21 | \description{ 22 | Create replay memory for experience replay. 23 | } 24 | \details{ 25 | Sampling from replay memory will be uniform. 26 | } 27 | \examples{ 28 | memory = makeReplayMemory(size = 100L, batch.size = 16L) 29 | } 30 | -------------------------------------------------------------------------------- /man/ValueNetwork.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/valuefunction.R 3 | \name{ValueNetwork} 4 | \alias{ValueNetwork} 5 | \alias{neural.network} 6 | \title{Value Network} 7 | \arguments{ 8 | \item{model}{[\code{keras model}] \cr A keras model. 9 | Make sure that the model has been compiled.} 10 | } 11 | \description{ 12 | Neural network representing the action value function Q. 
13 | } 14 | \section{Usage}{ 15 | 16 | \code{makeValueFunction("neural.network", model)} 17 | } 18 | 19 | \examples{ 20 | \dontrun{ 21 | library(keras) 22 | model = keras_model_sequential() 23 | model \%>\% layer_dense(20, input_shape = 10, activation = "relu") 24 | model \%>\% layer_dense(4, activation = "softmax") 25 | keras::compile(model, loss = "mae", optimizer = keras::optimizer_sgd(lr = 0.4)) 26 | 27 | val = makeValueFunction("neural.network", model = model) 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /man/makeAlgorithm.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/algorithm.R 3 | \name{makeAlgorithm} 4 | \alias{makeAlgorithm} 5 | \title{Make reinforcement learning algorithm.} 6 | \usage{ 7 | makeAlgorithm(class, args = list(), ...) 8 | } 9 | \arguments{ 10 | \item{class}{[\code{character(1)}] \cr Algorithm. One of \code{c("qlearning")}.} 11 | 12 | \item{args}{[\code{list}] \cr Optional list of named arguments passed on to the 13 | subclass. The arguments in ... take precedence over values in this list. 14 | We strongly encourage you to use one or the other to pass arguments 15 | to the function but not both.} 16 | 17 | \item{...}{[\code{any}] \cr Optional named arguments passed on to the subclass. Alternatively 18 | these can be given using the \code{args} argument.} 19 | } 20 | \description{ 21 | Make reinforcement learning algorithm. 22 | } 23 | \section{Representations}{ 24 | 25 | \itemize{ 26 | \item \link{QLearning} 27 | } 28 | } 29 | 30 | \examples{ 31 | alg = makeAlgorithm("qlearning") 32 | } 33 | -------------------------------------------------------------------------------- /tests/testthat/test_accessor_functions.R: -------------------------------------------------------------------------------- 1 | context("getValueFunction") 2 | val.fun = makeValueFunction("table", n.states = 8L, n.actions = 4L) 3 | agent = makeAgent("random", val.fun) 4 | 5 | test_that("getValueFunction returns action value function", { 6 | expect_equal(getValueFunction(agent), matrix(0, nrow = 8, ncol = 4)) 7 | }) 8 | 9 | test_that("getStateValues returns row max of action value function", { 10 | expect_equal(getStateValues(matrix(c(1, 2, 3, 4), ncol = 2)), c(3, 4)) 11 | }) 12 | 13 | set.seed(1) 14 | context("getReplayMemory") 15 | env = makeEnvironment("windy.gridworld") 16 | memory = makeReplayMemory(size = 2L, batch.size = 2L) 17 | agent = makeAgent("random", replay.memory = memory) 18 | interact(env, agent, n.steps = 2L) 19 | 20 | test_that("getReplayMemory returns list", { 21 | expect_equal(typeof(getReplayMemory(agent)), "list") 22 | expect_equal(getReplayMemory(agent), list(list(state = 30, action = 0, 23 | reward = -1, next.state = 30), list(state = 30, action = 2, reward = -1, 24 | next.state = 20))) 25 | }) 26 | 27 | context("getEligibilityTraces") 28 | -------------------------------------------------------------------------------- /man/ValueTable.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/valuefunction.R 3 | \name{ValueTable} 4 | \alias{ValueTable} 5 | \alias{table} 6 | \title{Value Table} 7 | \arguments{ 8 | \item{n.states}{[\code{integer(1)}] \cr Number of states (rows in the value function).} 9 | 10 | \item{n.actions}{[\code{integer(1)}] \cr Number of actions (columns in the value 
function).} 11 | 12 | \item{step.size}{[\code{numeric(1)}] \cr Step size (learning rate) for gradient descent update.} 13 | } 14 | \description{ 15 | Table representing the action value function Q. 16 | } 17 | \details{ 18 | You can specify the shape of the value table. If omitted the agent will try 19 | to configure these automatically from the environment during interaction 20 | (therefore the environment needs to have a \code{n.states} and \code{n.actions} attribute). 21 | } 22 | \section{Usage}{ 23 | 24 | \code{makeValueFunction("table", n.states = NULL, n.actions = 1L, step.size = 0.1, initial.value = NULL)} 25 | } 26 | 27 | \examples{ 28 | val = makeValueFunction("table", n.states = 20L, n.actions = 4L) 29 | } 30 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: reinforcelearn 2 | Type: Package 3 | Title: Reinforcement Learning 4 | Version: 0.2.0 5 | Authors@R: person("Markus", "Dumke", email = {"markusdumke@gmail.com"}, role = c("aut", "cre")) 6 | Description: Implements reinforcement learning environments and algorithms as described in Sutton & Barto (1998, ISBN:0262193981). 7 | The Q-Learning algorithm can be used with function approximation, 8 | eligibility traces (Singh & Sutton (1996) ) 9 | and experience replay (Mnih et al. (2013) ). 10 | License: MIT + file LICENSE 11 | Encoding: UTF-8 12 | LazyData: true 13 | Depends: R (>= 3.0.0) 14 | RoxygenNote: 6.1.1 15 | BugReports: https://github.com/markusdumke/reinforcelearn/issues 16 | URL: http://markusdumke.github.io/reinforcelearn 17 | SystemRequirements: (Python and gym only required if gym environments are used) 18 | Imports: 19 | checkmate (>= 1.8.4), 20 | R6 (>= 2.2.2), 21 | nnet (>= 7.3-12), 22 | purrr (>= 0.2.4) 23 | Suggests: 24 | reticulate, 25 | keras, 26 | knitr, 27 | rmarkdown, 28 | testthat, 29 | covr, 30 | lintr 31 | VignetteBuilder: knitr 32 | -------------------------------------------------------------------------------- /R/reinforcelearn.R: -------------------------------------------------------------------------------- 1 | #' Reinforcement Learning. 2 | #' 3 | #' Implementations of reinforcement learning algorithms and environments. 
4 | #' 5 | #' @md 6 | #' 7 | #' @section Environments: 8 | #' * [makeEnvironment] 9 | #' * [Environment] 10 | #' * [GymEnvironment] 11 | #' * [MdpEnvironment] 12 | #' * [Gridworld] 13 | #' * [WindyGridworld] 14 | #' * [CliffWalking] 15 | #' * [MountainCar] 16 | #' * [MountainCarContinuous] 17 | #' 18 | #' @section Policies: 19 | #' * [makePolicy] 20 | #' * [EpsilonGreedyPolicy] 21 | #' * [GreedyPolicy] 22 | #' * [SoftmaxPolicy] 23 | #' * [RandomPolicy] 24 | #' 25 | #' @section Value Function Representations: 26 | #' * [makeValueFunction] 27 | #' * [ValueTable] 28 | #' * [ValueNetwork] 29 | #' 30 | #' @section Algorithms: 31 | #' * [makeAlgorithm] 32 | #' * [QLearning] 33 | #' 34 | #' @section Extensions: 35 | #' * [makeReplayMemory] 36 | #' * [Eligibility] 37 | #' 38 | #' @section Agent: 39 | #' * [makeAgent] 40 | #' * [getValueFunction] 41 | #' * [getReplayMemory] 42 | #' * [getEligibilityTraces] 43 | #' 44 | #' @section Interaction: 45 | #' * [interact] 46 | #' 47 | #' @name reinforcelearn 48 | #' @aliases reinforcementlearning 49 | #' @docType package 50 | NULL 51 | -------------------------------------------------------------------------------- /docs/pkgdown.js: -------------------------------------------------------------------------------- 1 | $(function() { 2 | $("#sidebar").stick_in_parent({offset_top: 40}); 3 | $('body').scrollspy({ 4 | target: '#sidebar', 5 | offset: 60 6 | }); 7 | 8 | var cur_path = paths(location.pathname); 9 | $("#navbar ul li a").each(function(index, value) { 10 | if (value.text == "Home") 11 | return; 12 | if (value.getAttribute("href") === "#") 13 | return; 14 | 15 | var path = paths(value.pathname); 16 | if (is_prefix(cur_path, path)) { 17 | // Add class to parent
<li>, and enclosing
  • if in dropdown 18 | var menu_anchor = $(value); 19 | menu_anchor.parent().addClass("active"); 20 | menu_anchor.closest("li.dropdown").addClass("active"); 21 | } 22 | }); 23 | }); 24 | 25 | function paths(pathname) { 26 | var pieces = pathname.split("/"); 27 | pieces.shift(); // always starts with / 28 | 29 | var end = pieces[pieces.length - 1]; 30 | if (end === "index.html" || end === "") 31 | pieces.pop(); 32 | return(pieces); 33 | } 34 | 35 | function is_prefix(needle, haystack) { 36 | if (needle.length > haystack.lengh) 37 | return(false); 38 | 39 | for (var i = 0; i < haystack.length; i++) { 40 | if (needle[i] != haystack[i]) 41 | return(false); 42 | } 43 | 44 | return(true); 45 | } 46 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | url: http://markusdumke.github.io/reinforcelearn 2 | 3 | template: 4 | params: 5 | bootswatch: cerulean 6 | 7 | reference: 8 | - title: Package help page 9 | contents: 10 | - reinforcelearn 11 | - title: Environments 12 | desc: Creation of reinforcement learning environments. 13 | contents: 14 | - makeEnvironment 15 | - Environment 16 | - GymEnvironment 17 | - MdpEnvironment 18 | - Gridworld 19 | - CliffWalking 20 | - WindyGridworld 21 | - MountainCar 22 | - MountainCarContinuous 23 | - title: Policies 24 | contents: 25 | - makePolicy 26 | - RandomPolicy 27 | - GreedyPolicy 28 | - EpsilonGreedyPolicy 29 | - SoftmaxPolicy 30 | - title: Value Function Representations 31 | contents: 32 | - makeValueFunction 33 | - ValueTable 34 | - ValueNetwork 35 | - title: Algorithms 36 | contents: 37 | - makeAlgorithm 38 | - QLearning 39 | - title: Agent 40 | contents: 41 | - makeAgent 42 | - title: Interaction 43 | contents: 44 | - interact 45 | - title: Helper functions 46 | contents: 47 | - makeReplayMemory 48 | - getReplayMemory 49 | - Eligibility 50 | - getEligibilityTraces 51 | - getValueFunction 52 | - getStateValues 53 | - tiles 54 | - iht 55 | - nHot 56 | 57 | -------------------------------------------------------------------------------- /R/eligibility.R: -------------------------------------------------------------------------------- 1 | #' Eligibility traces 2 | #' 3 | #' Eligibility traces. 4 | #' 5 | #' Algorithms supporting eligibility traces: 6 | #' * [QLearning] 7 | #' 8 | #' @param lambda \[`numeric(1)` in (0, 1)] \cr Trace decay parameter. 9 | #' @param traces \[`character(1)`] \cr Type of eligibility trace update. One of `c("replace", "accumulate")`. 
10 | #' 11 | #' @name Eligibility 12 | #' @md 13 | #' 14 | #' @aliases eligibility 15 | #' 16 | #' @examples 17 | #' alg = makeAlgorithm("qlearning", lambda = 0.8, traces = "accumulate") 18 | NULL 19 | 20 | Eligibility = R6::R6Class("Eligibility", 21 | public = list( 22 | lambda = 0, 23 | eligibility.type = NULL, 24 | E = NULL, 25 | initialize = function(lambda = 0, traces = "accumulate") { 26 | self$lambda = lambda 27 | if (traces == "replace") { 28 | self$eligibility.type = 1 29 | } else if (traces == "accumulate") { 30 | self$eligibility.type = 0 31 | } 32 | }, 33 | reset = function(val.fun) { 34 | self$E = matrix(0, nrow = nrow(val.fun), ncol = ncol(val.fun)) 35 | }, 36 | increase = function(s, a) { 37 | self$E[s + 1L, a + 1L] = (1 - self$eligibility.type) * self$E[s + 1L, a + 1L] + 1 38 | }, 39 | decrease = function(discount) { 40 | self$E = discount * self$lambda * self$E # sarsa 41 | } 42 | ) 43 | ) 44 | -------------------------------------------------------------------------------- /man/interact.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/interact.R 3 | \name{interact} 4 | \alias{interact} 5 | \title{Interaction between agent and environment.} 6 | \usage{ 7 | interact(env, agent, n.steps = Inf, n.episodes = Inf, 8 | max.steps.per.episode = Inf, learn = TRUE, visualize = FALSE) 9 | } 10 | \arguments{ 11 | \item{env}{[\code{Environment}] \cr Reinforcement learning environment created by \link{makeEnvironment}.} 12 | 13 | \item{agent}{[\code{Agent}] \cr Agent created by \link{makeAgent}.} 14 | 15 | \item{n.steps}{[\code{integer(1)}] \cr Number of steps to run.} 16 | 17 | \item{n.episodes}{[\code{integer(1)}] \cr Number of episodes to run.} 18 | 19 | \item{max.steps.per.episode}{[\code{integer(1)}] \cr Maximal number of steps allowed per episode.} 20 | 21 | \item{learn}{[\code{logical(1)}] \cr Should the agent learn?} 22 | 23 | \item{visualize}{[\code{logical(1)}] \cr Visualize the interaction between agent and environment?} 24 | } 25 | \value{ 26 | [\code{list}] Return and number of steps per episode. 27 | } 28 | \description{ 29 | Run interaction between agent and environment for specified number of steps 30 | or episodes. 31 | } 32 | \examples{ 33 | env = makeEnvironment("windy.gridworld") 34 | agent = makeAgent("softmax", "table", "qlearning") 35 | interact(env, agent, n.episodes = 10L) 36 | } 37 | -------------------------------------------------------------------------------- /man/makePolicy.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/policy.R 3 | \name{makePolicy} 4 | \alias{makePolicy} 5 | \alias{Policy} 6 | \title{Create policy.} 7 | \usage{ 8 | makePolicy(class = "random", args = list(), ...) 9 | } 10 | \arguments{ 11 | \item{class}{[\code{character(1)}] \cr 12 | Class of policy. One of \code{c("random", "epsilon.greedy", "greedy", "softmax")}.} 13 | 14 | \item{args}{[\code{list}] \cr Optional list of named arguments passed on to the 15 | subclass. The arguments in ... take precedence over values in this list. 16 | We strongly encourage you to use one or the other to pass arguments 17 | to the function but not both.} 18 | 19 | \item{...}{[\code{any}] \cr Optional named arguments passed on to the subclass. 
Alternatively 20 | these can be given using the \code{args} argument.} 21 | } 22 | \value{ 23 | [\code{list(name, args)}] List with the name and optional args. 24 | This list can then be passed onto \link{makeAgent}, which will construct the 25 | policy accordingly. 26 | } 27 | \description{ 28 | Reinforcement learning policies. 29 | } 30 | \section{Policies}{ 31 | 32 | \itemize{ 33 | \item \link{RandomPolicy} 34 | \item \link{GreedyPolicy} 35 | \item \link{EpsilonGreedyPolicy} 36 | \item \link{SoftmaxPolicy} 37 | } 38 | } 39 | 40 | \examples{ 41 | policy = makePolicy("random") 42 | policy = makePolicy("epsilon.greedy", epsilon = 0.1) 43 | } 44 | -------------------------------------------------------------------------------- /man/MountainCar.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/environment_mountaincar.R 3 | \docType{data} 4 | \name{MountainCar} 5 | \alias{MountainCar} 6 | \alias{MountainCarContinuous,} 7 | \alias{mountain.car} 8 | \alias{MountainCarContinuous} 9 | \title{Mountain Car} 10 | \format{An object of class \code{R6ClassGenerator} of length 24.} 11 | \arguments{ 12 | \item{...}{[\code{any}] \cr Arguments passed on to \link{makeEnvironment}.} 13 | } 14 | \description{ 15 | The classical mountain car problem for reinforcement learning. 16 | } 17 | \details{ 18 | The classical Mountain Car task the action is one of {0, 1, 2}, 19 | in the continuous version the action is in [-1, 1]. 20 | } 21 | \section{Usage}{ 22 | 23 | \code{makeEnvironment("MountainCar", ...)} \cr 24 | \code{makeEnvironment("MountainCarContinuous", ...)} 25 | } 26 | 27 | \section{Methods}{ 28 | 29 | \itemize{ 30 | \item \code{$step(action)} \cr 31 | Take action in environment. 32 | Returns a list with \code{state}, \code{reward}, \code{done}. 33 | \item \code{$reset()} \cr 34 | Resets the \code{done} flag of the environment and returns an initial state. 35 | Useful when starting a new episode. 36 | \item \code{$visualize()} \cr 37 | Visualizes the environment (if there is a visualization function). 38 | } 39 | } 40 | 41 | \examples{ 42 | env = makeEnvironment("mountain.car") 43 | env$reset() 44 | env$step(1L) 45 | 46 | env = makeEnvironment("mountain.car.continuous") 47 | env$reset() 48 | env$step(0.62) 49 | } 50 | \keyword{datasets} 51 | -------------------------------------------------------------------------------- /man/makeValueFunction.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/valuefunction.R 3 | \name{makeValueFunction} 4 | \alias{makeValueFunction} 5 | \title{Value Function Representation} 6 | \usage{ 7 | makeValueFunction(class, args = list(), ...) 8 | } 9 | \arguments{ 10 | \item{class}{[\code{character(1)}] \cr Class of value function approximation. 11 | One of \code{c("table", "neural.network")}.} 12 | 13 | \item{args}{[\code{list}] \cr Optional list of named arguments passed on to the 14 | subclass. The arguments in ... take precedence over values in this list. 15 | We strongly encourage you to use one or the other to pass arguments 16 | to the function but not both.} 17 | 18 | \item{...}{[\code{any}] \cr Optional named arguments passed on to the subclass. Alternatively 19 | these can be given using the \code{args} argument.} 20 | } 21 | \value{ 22 | [\code{list(name, args)}] List with the name and optional args. 
23 | This list can then be passed onto \link{makeAgent}, which will construct the 24 | value function accordingly. 25 | } 26 | \description{ 27 | A representation of the value function. 28 | } 29 | \section{Representations}{ 30 | 31 | \itemize{ 32 | \item \link{ValueTable} 33 | \item \link{ValueNetwork} 34 | } 35 | } 36 | 37 | \examples{ 38 | val = makeValueFunction("table", n.states = 16L, n.actions = 4L) 39 | # If the number of states and actions is not supplied, the agent will try 40 | # to figure these out from the environment object during interaction. 41 | val = makeValueFunction("table") 42 | } 43 | -------------------------------------------------------------------------------- /man/GymEnvironment.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/environment_gym.R 3 | \name{GymEnvironment} 4 | \alias{GymEnvironment} 5 | \title{Gym Environment} 6 | \arguments{ 7 | \item{gym.name}{[\code{character(1)}] \cr 8 | Name of gym environment, e.g. \code{"CartPole-v0"}.} 9 | 10 | \item{...}{[\code{any}] \cr Arguments passed on to \link{makeEnvironment}.} 11 | } 12 | \description{ 13 | Reinforcement learning environment from OpenAI Gym. 14 | } 15 | \details{ 16 | For available gym environments take a look at https://gym.openai.com/envs. 17 | } 18 | \section{Usage}{ 19 | 20 | \code{makeEnvironment("gym", gym.name, ...)} 21 | } 22 | 23 | \section{Installation}{ 24 | 25 | For installation of the python package \code{gym} see 26 | https://github.com/openai/gym#installation. 27 | Then install the R package \code{reticulate}. 28 | } 29 | 30 | \section{Methods}{ 31 | 32 | \itemize{ 33 | \item \code{$close()} 34 | Close visualization window. 35 | } 36 | 37 | 38 | \itemize{ 39 | \item \code{$step(action)} \cr 40 | Take action in environment. 41 | Returns a list with \code{state}, \code{reward}, \code{done}. 42 | \item \code{$reset()} \cr 43 | Resets the \code{done} flag of the environment and returns an initial state. 44 | Useful when starting a new episode. 45 | \item \code{$visualize()} \cr 46 | Visualizes the environment (if there is a visualization function). 47 | } 48 | } 49 | 50 | \examples{ 51 | \dontrun{ 52 | # Create an OpenAI Gym environment. 53 | # Make sure you have Python, gym and reticulate installed. 54 | env = makeEnvironment("gym", gym.name = "MountainCar-v0") 55 | env$reset() 56 | env$close() 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /man/makeAgent.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/agent.R 3 | \name{makeAgent} 4 | \alias{makeAgent} 5 | \title{Create Agent.} 6 | \usage{ 7 | makeAgent(policy, val.fun = NULL, algorithm = NULL, 8 | preprocess = identity, replay.memory = NULL, policy.args = list(), 9 | val.fun.args = list(), algorithm.args = list()) 10 | } 11 | \arguments{ 12 | \item{policy}{[\code{character(1)} | Policy] \cr A policy. 13 | If you pass a string the policy will be created via \link{makePolicy}.} 14 | 15 | \item{val.fun}{[\code{character(1)} | ValueFunction] \cr A value function representation. 16 | If you pass a string the value function will be created via \link{makeValueFunction}.} 17 | 18 | \item{algorithm}{[\code{character(1)} | Algorithm] \cr An algorithm. 
19 | If you pass a string the algorithm will be created via \link{makeAlgorithm}.} 20 | 21 | \item{preprocess}{[\code{function}] \cr A function which preprocesses the state so that the agent can learn on this.} 22 | 23 | \item{replay.memory}{[\code{ReplayMemory}] \cr Replay memory for experience replay created by \link{makeReplayMemory}.} 24 | 25 | \item{policy.args}{[\code{list}] \cr Arguments passed on to \code{args} in \link{makePolicy}.} 26 | 27 | \item{val.fun.args}{[\code{list}] \cr Arguments passed on to \code{args} in \link{makeValueFunction}.} 28 | 29 | \item{algorithm.args}{[\code{list}] \cr Arguments passed on to \code{args} in \link{makeAlgorithm}.} 30 | } 31 | \description{ 32 | An agent consists of a policy and (optional) a value function representation 33 | and (optional) a learning algorithm. 34 | } 35 | \examples{ 36 | agent = makeAgent("softmax", "table", "qlearning") 37 | } 38 | -------------------------------------------------------------------------------- /man/reinforcelearn.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/reinforcelearn.R 3 | \docType{package} 4 | \name{reinforcelearn} 5 | \alias{reinforcelearn} 6 | \alias{reinforcementlearning} 7 | \alias{reinforcelearn-package} 8 | \title{Reinforcement Learning.} 9 | \description{ 10 | Implementations of reinforcement learning algorithms and environments. 11 | } 12 | \section{Environments}{ 13 | 14 | \itemize{ 15 | \item \link{makeEnvironment} 16 | \item \link{Environment} 17 | \item \link{GymEnvironment} 18 | \item \link{MdpEnvironment} 19 | \item \link{Gridworld} 20 | \item \link{WindyGridworld} 21 | \item \link{CliffWalking} 22 | \item \link{MountainCar} 23 | \item \link{MountainCarContinuous} 24 | } 25 | } 26 | 27 | \section{Policies}{ 28 | 29 | \itemize{ 30 | \item \link{makePolicy} 31 | \item \link{EpsilonGreedyPolicy} 32 | \item \link{GreedyPolicy} 33 | \item \link{SoftmaxPolicy} 34 | \item \link{RandomPolicy} 35 | } 36 | } 37 | 38 | \section{Value Function Representations}{ 39 | 40 | \itemize{ 41 | \item \link{makeValueFunction} 42 | \item \link{ValueTable} 43 | \item \link{ValueNetwork} 44 | } 45 | } 46 | 47 | \section{Algorithms}{ 48 | 49 | \itemize{ 50 | \item \link{makeAlgorithm} 51 | \item \link{QLearning} 52 | } 53 | } 54 | 55 | \section{Extensions}{ 56 | 57 | \itemize{ 58 | \item \link{makeReplayMemory} 59 | \item \link{Eligibility} 60 | } 61 | } 62 | 63 | \section{Agent}{ 64 | 65 | \itemize{ 66 | \item \link{makeAgent} 67 | \item \link{getValueFunction} 68 | \item \link{getReplayMemory} 69 | \item \link{getEligibilityTraces} 70 | } 71 | } 72 | 73 | \section{Interaction}{ 74 | 75 | \itemize{ 76 | \item \link{interact} 77 | } 78 | } 79 | 80 | -------------------------------------------------------------------------------- /tests/testthat/test_environment.R: -------------------------------------------------------------------------------- 1 | context("environment creation") 2 | 3 | env1 = makeEnvironment("mountain.car", 4 | action.names = c("stop" = 0L, "pause" = 1L, "go" = 2L)) 5 | env2 = makeEnvironment("windy.gridworld", discount = 0.8) 6 | env3 = makeEnvironment("gridworld", shape = c(2, 3), initial.state = 5L, 7 | goal.states = 0L, discount = 0.9, diagonal.moves = TRUE) 8 | 9 | context("discount") 10 | test_that("discount will be initialized correctly", { 11 | expect_equal(env1$discount, 1) 12 | expect_equal(env2$discount, 0.8) 13 | expect_equal(env3$discount, 0.9) 14 | }) 15 | 
16 | context("action.names") 17 | test_that("action names will be initialized correctly", { 18 | expect_equal(env1$action.names, c("stop" = 0L, "pause" = 1L, "go" = 2L)) 19 | expect_equal(env2$action.names, c("left" = 0L, "right" = 1L, "up" = 2L, "down" = 3L)) 20 | expect_equal(env3$action.names, c("left" = 0L, "right" = 1L, "up" = 2L, "down" = 3L, 21 | "leftup" = 4L, "leftdown" = 5L, "rightup" = 6L, "rightdown" = 7L)) 22 | }) 23 | 24 | env4 = makeEnvironment("windy.gridworld") 25 | env5 = makeEnvironment("windy.gridworld") 26 | test_that("action.names are equivalent to integer actions", { 27 | env4$step("left") 28 | env5$step(0L) 29 | expect_equal(env4, env5) 30 | }) 31 | 32 | context("visualization") 33 | test_that("gridworld visualization works", { 34 | expect_equal(visualizeGridworld(c(2, 2), current.state = 3L), 35 | paste0(" - ", "- ", "\n", " - ", "o")) 36 | }) 37 | 38 | context("counter of steps, returns") 39 | env3$step(0L) 40 | test_that("env$episode.return computes discounted reward sum", { 41 | expect_equal(env3$episode.return, -1) 42 | env3$step(0L) 43 | expect_equal(env3$episode.return, -1 + env3$discount * env3$reward) 44 | }) 45 | -------------------------------------------------------------------------------- /man/Environment.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/environment.R 3 | \name{Environment} 4 | \alias{Environment} 5 | \title{Custom Reinforcement Learning Environment} 6 | \arguments{ 7 | \item{step}{[\code{function(self, action)}] \cr 8 | Custom step function.} 9 | 10 | \item{reset}{[\code{function(self)}] \cr 11 | Custom reset function.} 12 | 13 | \item{visualize}{[\code{function(self)}] \cr 14 | Optional custom visualization function.} 15 | 16 | \item{discount}{[\code{numeric(1)} in (0, 1)] \cr Discount factor.} 17 | 18 | \item{action.names}{[\code{named integer}] \cr 19 | Optional action names for a discrete action space.} 20 | } 21 | \description{ 22 | Custom Reinforcement Learning Environment 23 | } 24 | \section{Usage}{ 25 | 26 | \code{makeEnvironment("custom", step, reset, visualize = NULL, discount = 1, action.names = NULL)} 27 | } 28 | 29 | \section{Methods}{ 30 | 31 | \itemize{ 32 | \item \code{$step(action)} \cr 33 | Take action in environment. 34 | Returns a list with \code{state}, \code{reward}, \code{done}. 35 | \item \code{$reset()} \cr 36 | Resets the \code{done} flag of the environment and returns an initial state. 37 | Useful when starting a new episode. 38 | \item \code{$visualize()} \cr 39 | Visualizes the environment (if there is a visualization function). 40 | } 41 | } 42 | 43 | \examples{ 44 | step = function(self, action) { 45 | state = list(mean = action + rnorm(1), sd = runif(1)) 46 | reward = rnorm(1, state[[1]], state[[2]]) 47 | done = FALSE 48 | list(state, reward, done) 49 | } 50 | 51 | reset = function(self) { 52 | state = list(mean = 0, sd = 1) 53 | state 54 | } 55 | 56 | env = makeEnvironment(step = step, reset = reset) 57 | env$reset() 58 | env$step(100) 59 | } 60 | -------------------------------------------------------------------------------- /R/algorithm.R: -------------------------------------------------------------------------------- 1 | #' Make reinforcement learning algorithm. 2 | #' 3 | #' @param class \[`character(1)`] \cr Algorithm. One of `c("qlearning")`. 
4 | #' @inheritParams makePolicy 5 | #' 6 | #' @md 7 | #' 8 | #' @section Representations: 9 | #' * [QLearning] 10 | #' 11 | #' @export 12 | #' @examples 13 | #' alg = makeAlgorithm("qlearning") 14 | makeAlgorithm = function(class, args = list(), ...) { 15 | checkmate::assertChoice(class, 16 | c("qlearning"))#, "sarsa")) 17 | checkmate::assertList(args, names = "unique") 18 | args = append(list(...), args) 19 | # remove duplicate entries in args list 20 | args = args[unique(names(args))] 21 | 22 | x = list(name = class, args = args) 23 | class(x) = "Algorithm" 24 | x 25 | } 26 | 27 | 28 | #' Q-Learning 29 | #' 30 | #' Q-Learning algorithm. 31 | #' 32 | #' To use eligibility traces specify `lambda` and `traces`. 33 | #' 34 | #' @section Usage: 35 | #' `makeAlgorithm("qlearning", lambda, traces)` 36 | #' 37 | #' @param lambda \[`numeric(1)` in (0, 1)] \cr Trace decay parameter. 38 | #' @param traces \[`character(1)`] \cr Type of eligibility trace update. One of `c("replace", "accumulate")`. 39 | #' 40 | #' @name QLearning 41 | #' @aliases qlearning 42 | #' 43 | #' @seealso [Eligibility] 44 | #' 45 | #' @md 46 | #' 47 | #' @examples 48 | #' alg = makeAlgorithm("qlearning", lambda = 0.8, traces = "accumulate") 49 | NULL 50 | 51 | QLearning = R6::R6Class("QLearning", 52 | public = list( 53 | getTarget = function(reward, action.values, discount) { 54 | reward + discount * apply(action.values, 1L, max) 55 | } 56 | ) 57 | ) 58 | 59 | # Sarsa = R6::R6Class("Sarsa", 60 | # public = list( 61 | # getTarget = function(reward, action.values, discount, next.action) { 62 | # reward + discount * action.values[, next.action + 1L] 63 | # } 64 | # ) 65 | # ) 66 | -------------------------------------------------------------------------------- /man/MdpEnvironment.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/environment_mdp.R 3 | \name{MdpEnvironment} 4 | \alias{MdpEnvironment} 5 | \title{MDP Environment} 6 | \arguments{ 7 | \item{transitions}{[\code{array (n.states x n.states x n.actions)}] \cr 8 | State transition array.} 9 | 10 | \item{rewards}{[\code{matrix (n.states x n.actions)}] \cr 11 | Reward array.} 12 | 13 | \item{initial.state}{[\code{integer}] \cr 14 | Optional starting state. 15 | If a vector is given a starting state will be 16 | randomly sampled from this vector whenever \code{reset} is called. 17 | Note that states are numerated starting with 18 | 0. If \code{initial.state = NULL} all non-terminal states are 19 | possible starting states.} 20 | 21 | \item{...}{[\code{any}] \cr Arguments passed on to \link{makeEnvironment}.} 22 | } 23 | \description{ 24 | Markov Decision Process environment. 25 | } 26 | \section{Usage}{ 27 | 28 | \code{makeEnvironment("MDP", transitions, rewards, initial.state, ...)} 29 | } 30 | 31 | \section{Methods}{ 32 | 33 | \itemize{ 34 | \item \code{$step(action)} \cr 35 | Take action in environment. 36 | Returns a list with \code{state}, \code{reward}, \code{done}. 37 | \item \code{$reset()} \cr 38 | Resets the \code{done} flag of the environment and returns an initial state. 39 | Useful when starting a new episode. 40 | \item \code{$visualize()} \cr 41 | Visualizes the environment (if there is a visualization function). 42 | } 43 | } 44 | 45 | \examples{ 46 | # Create a Markov Decision Process. 
47 | P = array(0, c(2, 2, 2)) 48 | P[, , 1] = matrix(c(0.5, 0.5, 0, 1), 2, 2, byrow = TRUE) 49 | P[, , 2] = matrix(c(0, 1, 0, 1), 2, 2, byrow = TRUE) 50 | R = matrix(c(5, 10, -1, 2), 2, 2, byrow = TRUE) 51 | env = makeEnvironment("mdp", transitions = P, rewards = R) 52 | env$reset() 53 | env$step(1L) 54 | } 55 | -------------------------------------------------------------------------------- /man/CliffWalking.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/environment_gridworld.R 3 | \name{CliffWalking} 4 | \alias{CliffWalking} 5 | \alias{cliff.walking} 6 | \title{Cliff Walking} 7 | \arguments{ 8 | \item{...}{[\code{any}] \cr Arguments passed on to \link{makeEnvironment}.} 9 | } 10 | \description{ 11 | Gridworld environment for reinforcement learning from Sutton & Barto (2017). 12 | Grid of shape 4x12 with a goal state in the bottom right of the grid. 13 | Episodes start in the lower left state. Possible actions include going left, right, up and down. 14 | Some states in the lower part of the grid are a cliff, 15 | so taking a step into this cliff will yield a high negative reward of - 100 and move the agent 16 | back to the starting state. 17 | Elsewise rewards are - 1, for the goal state 0. 18 | } 19 | \details{ 20 | This is the gridworld (goal state denoted G, cliff states denoted C, start state denoted S): 21 | \tabular{rrrrrrrrrrrr}{ 22 | . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \cr 23 | . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \cr 24 | . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \cr 25 | S \tab C \tab C \tab C \tab C \tab C \tab C \tab C \tab C \tab C \tab C \tab G \cr 26 | } 27 | } 28 | \section{Usage}{ 29 | 30 | \code{makeEnvironment("cliff.walking", ...)} 31 | } 32 | 33 | \section{Methods}{ 34 | 35 | \itemize{ 36 | \item \code{$step(action)} \cr 37 | Take action in environment. 38 | Returns a list with \code{state}, \code{reward}, \code{done}. 39 | \item \code{$reset()} \cr 40 | Resets the \code{done} flag of the environment and returns an initial state. 41 | Useful when starting a new episode. 42 | \item \code{$visualize()} \cr 43 | Visualizes the environment (if there is a visualization function). 44 | } 45 | } 46 | 47 | \examples{ 48 | env = makeEnvironment("cliff.walking") 49 | } 50 | \references{ 51 | Sutton and Barto (Book draft 2017): Reinforcement Learning: An Introduction Example 6.6 52 | } 53 | -------------------------------------------------------------------------------- /R/accessor_functions.R: -------------------------------------------------------------------------------- 1 | #' Get weights of value function. 2 | #' 3 | #' Returns the weights of the value function representation of the agent. 4 | #' 5 | #' @param agent \[Agent] \cr An agent created by [makeAgent]. 6 | #' 7 | #' @md 8 | #' 9 | #' @return For a value function table this will return a matrix, for a neural 10 | #' network a list with the weights of the layers. 11 | #' 12 | #' @export 13 | getValueFunction = function(agent) { 14 | checkmate::assertClass(agent, "Agent") 15 | if (!is.null(agent$val.fun)) { 16 | Q = agent$val.fun$getWeights() 17 | } else { 18 | stop("No value function weights found in the agent object.") 19 | } 20 | Q 21 | } 22 | 23 | #' Get replay memory. 24 | #' 25 | #' Returns the replay memory of the agent. 
26 | #' 27 | #' @param agent \[Agent] \cr An agent created by [makeAgent]. 28 | #' 29 | #' @md 30 | #' 31 | #' @return A list containing the experienced observations, actions and rewards. 32 | #' 33 | #' @export 34 | getReplayMemory = function(agent) { 35 | checkmate::assertClass(agent, "Agent") 36 | if (!is.null(agent$exp.replay)) { 37 | mem = agent$exp.replay$memory 38 | } else { 39 | stop("No replay memory found in the agent object.") 40 | } 41 | mem 42 | } 43 | 44 | #' Get eligibility traces 45 | #' 46 | #' Returns the eligibility traces of the agent. 47 | #' 48 | #' @param agent \[Agent] \cr An agent created by [makeAgent]. 49 | #' 50 | #' @md 51 | #' 52 | #' @return A matrix with the eligibility traces. 53 | #' 54 | #' @export 55 | getEligibilityTraces = function(agent) { 56 | checkmate::assertClass(agent, "Agent") 57 | if (!is.null(agent$eligibility)) { 58 | e = agent$eligibility$E 59 | } else { 60 | stop("No eligibility traces found in the agent object.") 61 | } 62 | e 63 | } 64 | 65 | 66 | #' Get state values. 67 | #' 68 | #' Get state value function from action value function. 69 | #' 70 | #' @param action.vals \[`matrix`] \cr Action value matrix. 71 | #' 72 | #' @md 73 | #' 74 | #' @export 75 | getStateValues = function(action.vals) { 76 | checkmate::assertMatrix(action.vals) 77 | apply(action.vals, 1L, max) 78 | } 79 | -------------------------------------------------------------------------------- /man/windyGridworld.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/environment_gridworld.R 3 | \name{WindyGridworld} 4 | \alias{WindyGridworld} 5 | \alias{windy.gridworld} 6 | \title{Windy Gridworld} 7 | \arguments{ 8 | \item{...}{[\code{any}] \cr Arguments passed on to \link{makeEnvironment}.} 9 | } 10 | \description{ 11 | Windy Gridworld problem for reinforcement learning. Actions include 12 | going left, right, up and down. In each column the wind pushes you up a 13 | specific number of steps (for the next action). If an action would 14 | take you off the grid, you remain in the previous state. For each step you 15 | get a reward of -1, until you reach into a terminal state. 16 | } 17 | \details{ 18 | This is the gridworld (goal state denoted G, start state denoted S). 19 | The last row specifies the upward wind in each column. 20 | \tabular{rrrrrrrrrr}{ 21 | . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \cr 22 | . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \cr 23 | . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \cr 24 | S \tab . \tab . \tab . \tab . \tab . \tab . \tab G \tab . \tab . \cr 25 | . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \cr 26 | . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \cr 27 | . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \cr 28 | . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \cr 29 | . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \cr 30 | . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \cr 31 | 0 \tab 0 \tab 0 \tab 1 \tab 1 \tab 1 \tab 2 \tab 2 \tab 1 \tab 0 \cr 32 | } 33 | } 34 | \section{Usage}{ 35 | 36 | \code{makeEnvironment("windy.gridworld", ...)} 37 | } 38 | 39 | \section{Methods}{ 40 | 41 | \itemize{ 42 | \item \code{$step(action)} \cr 43 | Take action in environment. 44 | Returns a list with \code{state}, \code{reward}, \code{done}. 
45 | \item \code{$reset()} \cr 46 | Resets the \code{done} flag of the environment and returns an initial state. 47 | Useful when starting a new episode. 48 | \item \code{$visualize()} \cr 49 | Visualizes the environment (if there is a visualization function). 50 | } 51 | } 52 | 53 | \examples{ 54 | env = makeEnvironment("windy.gridworld") 55 | } 56 | \references{ 57 | Sutton and Barto (Book draft 2017): Reinforcement Learning: An Introduction Example 6.5 58 | } 59 | -------------------------------------------------------------------------------- /benchmark/benchmark_windy_gridworld.md: -------------------------------------------------------------------------------- 1 | Benchmark Algorithms on Windy Gridworld Task 2 | ================ 3 | Markus Dumke 4 | 2017-12-21 5 | 6 | ``` r 7 | library(reinforcelearn) 8 | env = makeEnvironment("windy.gridworld") 9 | ``` 10 | 11 | The optimal solution is 15 steps. 12 | 13 | Simple Q-Learning 14 | ----------------- 15 | 16 | ``` r 17 | policy = makePolicy("epsilon.greedy", epsilon = 0.1) 18 | agent = makeAgent(policy, "table", "qlearning", epsilon = 0.1) 19 | 20 | res = interact(env, agent, n.episodes = 500L) 21 | ``` 22 | 23 | 24 | 25 | Q-Learning with Eligibility Traces 26 | ---------------------------------- 27 | 28 | ``` r 29 | env$resetEverything() 30 | #> [1] 30 31 | policy = makePolicy("epsilon.greedy", epsilon = 0.1) 32 | alg = makeAlgorithm("qlearning", lambda = 0.8, traces = "accumulate") 33 | agent = makeAgent(policy, "table", alg) 34 | 35 | res = interact(env, agent, n.episodes = 500L) 36 | ``` 37 | 38 | 39 | 40 | Q-Learning with Experience replay 41 | --------------------------------- 42 | 43 | ``` r 44 | env$resetEverything() 45 | #> [1] 30 46 | policy = makePolicy("epsilon.greedy", epsilon = 0.1) 47 | mem = makeReplayMemory(size = 10L, batch.size = 10L) 48 | agent = makeAgent(policy, "table", "qlearning", experience.replay = mem) 49 | 50 | res = interact(env, agent, n.episodes = 500L) 51 | ``` 52 | 53 | 54 | 55 | Q-Learning with neural network and experience replay 56 | ---------------------------------------------------- 57 | 58 | ``` r 59 | env$resetEverything() 60 | #> [1] 30 61 | library(keras) 62 | model = keras_model_sequential() %>% 63 | layer_dense(units = env$n.actions, activation = "linear", 64 | input_shape = c(env$n.states), kernel_initializer = initializer_zeros(), 65 | use_bias = FALSE) %>% 66 | compile(loss = "mae", optimizer = optimizer_sgd(lr = 1)) 67 | mem = makeReplayMemory(size = 2L, batch.size = 2L) 68 | val = makeValueFunction("neural.network", model = model) 69 | policy = makePolicy("epsilon.greedy", epsilon = 0.1) 70 | preprocess = function(x) to_categorical(x, num_classes = env$n.states) 71 | agent = makeAgent(policy, val, "qlearning", 72 | preprocess = preprocess, experience.replay = mem) 73 | 74 | res = interact(env, agent, n.episodes = 500L) 75 | ``` 76 | 77 | 78 | -------------------------------------------------------------------------------- /man/tilecoding.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tiles.R 3 | \name{tiles} 4 | \alias{tiles} 5 | \alias{iht} 6 | \title{Tile Coding} 7 | \usage{ 8 | tiles(iht, n.tilings, state, action = integer(0)) 9 | 10 | iht(max.size) 11 | } 12 | \arguments{ 13 | \item{iht}{[\code{IHT}] \cr A hash table created with \code{iht}.} 14 | 15 | \item{n.tilings}{[\code{integer(1)}] \cr Number of tilings.} 16 | 17 | \item{state}{[\code{vector(2)}] \cr A 
two-dimensional state observation. 18 | Make sure to scale the observation to unit variance before.} 19 | 20 | \item{action}{[\code{integer(1)}] \cr Optional: If supplied the action space 21 | will also be tiled. All distinct actions will result in different tile numbers.} 22 | 23 | \item{max.size}{[\code{integer(1)}] \cr Maximal size of hash table.} 24 | } 25 | \value{ 26 | \code{iht} creates a hash table, which can then be passed on to \code{tiles}. 27 | \code{tiles} returns an integer vector of size \code{n.tilings} with the active tile numbers. 28 | } 29 | \description{ 30 | Implementation of Sutton's tile coding software version 3. 31 | } 32 | \details{ 33 | Tile coding is a way of representing the values of a vector of continuous variables as a large 34 | binary vector with few 1s and many 0s. The binary vector is not represented explicitly, 35 | but as a list of the components that are 1s. The main step is to partition, or tile, 36 | the continuous space multiple times and select one tile from each tiling, that corresponding 37 | the the vector's value. Each tile is converted to an element in the big binary vector, 38 | and the list of the tile (element) numbers is returned as the representation of the vector's value. 39 | Tile coding is recommended as a way of applying online learning methods to domains with continuous 40 | state or action variables. [copied from manual] 41 | 42 | See detailed manual on the web. 43 | In comparison to the Python implementation indices start with 1 instead of 0. The hash table is 44 | implemented as an environment, which is an attribute of an R6 class. 45 | 46 | Make sure that the size of the hash table is large enough, else an error will be triggered, 47 | when trying to assign a value to a full hash table. 48 | } 49 | \examples{ 50 | # Create hash table 51 | hash = iht(1024) 52 | 53 | # Partition state space using 8 tilings 54 | tiles(hash, n.tilings = 8, state = c(3.6, 7.21)) 55 | tiles(hash, n.tilings = 8, state = c(3.7, 7.21)) 56 | tiles(hash, n.tilings = 8, state = c(4, 7)) 57 | tiles(hash, n.tilings = 8, state = c(- 37.2, 7)) 58 | 59 | } 60 | \references{ 61 | Sutton and Barto (Book draft 2017): Reinforcement Learning: An Introduction 62 | } 63 | -------------------------------------------------------------------------------- /session_info.txt: -------------------------------------------------------------------------------- 1 | - Session info ---------------------------------------------------------- 2 | setting value 3 | version R version 3.4.3 (2017-11-30) 4 | os Windows 10 x64 5 | system x86_64, mingw32 6 | ui RTerm 7 | language (EN) 8 | collate English_Germany.1252 9 | tz Europe/Berlin 10 | date 2017-12-23 11 | 12 | - Packages -------------------------------------------------------------- 13 | package * version date source 14 | assertthat 0.2.0 2017-04-11 CRAN (R 3.4.2) 15 | backports 1.1.1 2017-09-25 CRAN (R 3.4.1) 16 | cli 1.0.0 2017-12-22 Github (r-lib/cli@ab1c3aa) 17 | clisymbols 1.2.0 2017-05-21 CRAN (R 3.4.3) 18 | crayon 1.3.4 2017-09-16 CRAN (R 3.4.2) 19 | desc 1.1.1 2017-08-03 CRAN (R 3.4.2) 20 | devtools 1.13.3.9000 2017-12-22 Github (hadley/devtools@0bcfd6e) 21 | digest 0.6.13 2017-12-14 CRAN (R 3.4.3) 22 | evaluate 0.10.1 2017-06-24 CRAN (R 3.4.2) 23 | htmltools 0.3.6 2017-04-28 CRAN (R 3.4.2) 24 | knitr 1.17 2017-08-10 CRAN (R 3.4.2) 25 | magrittr 1.5 2014-11-22 CRAN (R 3.4.2) 26 | memoise 1.1.0 2017-04-21 CRAN (R 3.4.2) 27 | pkgbuild 0.0.0.9000 2017-12-22 Github (r-lib/pkgbuild@ce7f6d1) 28 | pkgload 0.0.0.9000 2017-12-22 
Github (r-lib/pkgload@70eaef8) 29 | R6 2.2.2 2017-06-17 CRAN (R 3.4.2) 30 | Rcpp 0.12.13 2017-09-28 CRAN (R 3.4.2) 31 | rlang 0.1.4.9000 2017-12-22 Github (tidyverse/rlang@cc7587c) 32 | rmarkdown 1.8 2017-11-17 CRAN (R 3.4.2) 33 | rprojroot 1.3-1 2017-12-18 CRAN (R 3.4.3) 34 | sessioninfo 1.0.1.9000 2017-12-22 Github (r-lib/sessioninfo@c871d01) 35 | stringi 1.1.6 2017-11-17 CRAN (R 3.4.2) 36 | stringr 1.2.0 2017-02-18 CRAN (R 3.4.2) 37 | testthat 2.0.0 2017-12-13 CRAN (R 3.4.3) 38 | usethis 1.1.0.9000 2017-12-22 Github (r-lib/usethis@973bcab) 39 | withr 2.1.1 2017-12-19 CRAN (R 3.4.3) 40 | yaml 2.1.16 2017-12-12 CRAN (R 3.4.3) 41 | -------------------------------------------------------------------------------- /docs/jquery.sticky-kit.min.js: -------------------------------------------------------------------------------- 1 | /* 2 | Sticky-kit v1.1.2 | WTFPL | Leaf Corcoran 2015 | http://leafo.net 3 | */ 4 | (function(){var b,f;b=this.jQuery||window.jQuery;f=b(window);b.fn.stick_in_parent=function(d){var A,w,J,n,B,K,p,q,k,E,t;null==d&&(d={});t=d.sticky_class;B=d.inner_scrolling;E=d.recalc_every;k=d.parent;q=d.offset_top;p=d.spacer;w=d.bottoming;null==q&&(q=0);null==k&&(k=void 0);null==B&&(B=!0);null==t&&(t="is_stuck");A=b(document);null==w&&(w=!0);J=function(a,d,n,C,F,u,r,G){var v,H,m,D,I,c,g,x,y,z,h,l;if(!a.data("sticky_kit")){a.data("sticky_kit",!0);I=A.height();g=a.parent();null!=k&&(g=g.closest(k)); 5 | if(!g.length)throw"failed to find stick parent";v=m=!1;(h=null!=p?p&&a.closest(p):b("
    "))&&h.css("position",a.css("position"));x=function(){var c,f,e;if(!G&&(I=A.height(),c=parseInt(g.css("border-top-width"),10),f=parseInt(g.css("padding-top"),10),d=parseInt(g.css("padding-bottom"),10),n=g.offset().top+c+f,C=g.height(),m&&(v=m=!1,null==p&&(a.insertAfter(h),h.detach()),a.css({position:"",top:"",width:"",bottom:""}).removeClass(t),e=!0),F=a.offset().top-(parseInt(a.css("margin-top"),10)||0)-q, 6 | u=a.outerHeight(!0),r=a.css("float"),h&&h.css({width:a.outerWidth(!0),height:u,display:a.css("display"),"vertical-align":a.css("vertical-align"),"float":r}),e))return l()};x();if(u!==C)return D=void 0,c=q,z=E,l=function(){var b,l,e,k;if(!G&&(e=!1,null!=z&&(--z,0>=z&&(z=E,x(),e=!0)),e||A.height()===I||x(),e=f.scrollTop(),null!=D&&(l=e-D),D=e,m?(w&&(k=e+u+c>C+n,v&&!k&&(v=!1,a.css({position:"fixed",bottom:"",top:c}).trigger("sticky_kit:unbottom"))),eb&&!v&&(c-=l,c=Math.max(b-u,c),c=Math.min(q,c),m&&a.css({top:c+"px"})))):e>F&&(m=!0,b={position:"fixed",top:c},b.width="border-box"===a.css("box-sizing")?a.outerWidth()+"px":a.width()+"px",a.css(b).addClass(t),null==p&&(a.after(h),"left"!==r&&"right"!==r||h.append(a)),a.trigger("sticky_kit:stick")),m&&w&&(null==k&&(k=e+u+c>C+n),!v&&k)))return v=!0,"static"===g.css("position")&&g.css({position:"relative"}), 8 | a.css({position:"absolute",bottom:d,top:"auto"}).trigger("sticky_kit:bottom")},y=function(){x();return l()},H=function(){G=!0;f.off("touchmove",l);f.off("scroll",l);f.off("resize",y);b(document.body).off("sticky_kit:recalc",y);a.off("sticky_kit:detach",H);a.removeData("sticky_kit");a.css({position:"",bottom:"",top:"",width:""});g.position("position","");if(m)return null==p&&("left"!==r&&"right"!==r||a.insertAfter(h),h.remove()),a.removeClass(t)},f.on("touchmove",l),f.on("scroll",l),f.on("resize", 9 | y),b(document.body).on("sticky_kit:recalc",y),a.on("sticky_kit:detach",H),setTimeout(l,0)}};n=0;for(K=this.length;n= 0.5) { 28 | done = TRUE 29 | reward = 0 30 | } else { 31 | done = FALSE 32 | } 33 | list(state, reward, done) 34 | } 35 | 36 | super$initialize(step_, reset_, ...) 37 | } 38 | ) 39 | ) 40 | 41 | #' Mountain Car 42 | #' 43 | #' The classical mountain car problem for reinforcement learning. 44 | #' 45 | #' The classical Mountain Car task the action is one of \{0, 1, 2\}, 46 | #' in the continuous version the action is in \[-1, 1]. 47 | #' 48 | #' @param ... \[`any`] \cr Arguments passed on to [makeEnvironment]. 
49 | #' 50 | #' @section Usage: 51 | #' `makeEnvironment("MountainCar", ...)` \cr 52 | #' `makeEnvironment("MountainCarContinuous", ...)` 53 | #' 54 | #' @md 55 | #' 56 | #' @inheritSection Environment Methods 57 | #' @name MountainCar 58 | #' @aliases MountainCarContinuous, mountain.car 59 | #' @examples 60 | #' env = makeEnvironment("mountain.car") 61 | #' env$reset() 62 | #' env$step(1L) 63 | #' 64 | #' env = makeEnvironment("mountain.car.continuous") 65 | #' env$reset() 66 | #' env$step(0.62) 67 | NULL 68 | 69 | #' @rdname MountainCar 70 | #' @usage NULL 71 | MountainCar = R6::R6Class("MountainCar", 72 | inherit = MountainCarBase, 73 | public = list( 74 | action.space = "Discrete", 75 | actions = 0:2, 76 | n.actions = 3L 77 | ), 78 | private = list( 79 | getVelocity = function(self, action) { 80 | self$velocity + 0.001 * (action - 1L) - 0.0025 * cos(3 * self$position) 81 | } 82 | ) 83 | ) 84 | 85 | #' @rdname MountainCar 86 | #' @usage NULL 87 | MountainCarContinuous = R6::R6Class("MountainCarContinuous", 88 | inherit = MountainCarBase, 89 | public = list( 90 | action.space = "Box", 91 | action.space.bounds = list(c(-1, 1)) 92 | ), 93 | private = list( 94 | getVelocity = function(self, action) { 95 | force = min(max(action, self$action.space.bounds[[1]][1]), self$action.space.bounds[[1]][2]) 96 | self$velocity + 0.0015 * force - 0.0025 * cos(3 * self$position) 97 | } 98 | ) 99 | ) 100 | -------------------------------------------------------------------------------- /vignettes/environments.R: -------------------------------------------------------------------------------- 1 | ## ----setup, include=FALSE------------------------------------------------ 2 | knitr::opts_chunk$set(message = TRUE, eval = TRUE, collapse = TRUE, comment = "#>") 3 | 4 | ## ------------------------------------------------------------------------ 5 | library(reinforcelearn) 6 | 7 | ## ---- out.width = "200px", fig.align="center", echo = FALSE-------------- 8 | knitr::include_graphics("mountaincar.JPG") 9 | 10 | ## ------------------------------------------------------------------------ 11 | reset = function(self) { 12 | position = runif(1, -0.6, -0.4) 13 | velocity = 0 14 | state = matrix(c(position, velocity), ncol = 2) 15 | state 16 | } 17 | 18 | ## ------------------------------------------------------------------------ 19 | step = function(self, action) { 20 | position = self$state[1] 21 | velocity = self$state[2] 22 | velocity = (action - 1L) * 0.001 + cos(3 * position) * (-0.0025) 23 | velocity = min(max(velocity, -0.07), 0.07) 24 | position = position + velocity 25 | if (position < -1.2) { 26 | position = -1.2 27 | velocity = 0 28 | } 29 | state = matrix(c(position, velocity), ncol = 2) 30 | reward = -1 31 | if (position >= 0.5) { 32 | done = TRUE 33 | reward = 0 34 | } else { 35 | done = FALSE 36 | } 37 | list(state, reward, done) 38 | } 39 | 40 | ## ------------------------------------------------------------------------ 41 | env = makeEnvironment(step = step, reset = reset) 42 | 43 | ## ---- eval = FALSE------------------------------------------------------- 44 | # # Create a gym environment. 
45 | # env = makeEnvironment("gym", gym.name = "MountainCar-v0") 46 | 47 | ## ------------------------------------------------------------------------ 48 | # State transition array 49 | P = array(0, c(2, 2, 2)) 50 | P[, , 1] = matrix(c(0.5, 0.5, 0, 1), 2, 2, byrow = TRUE) 51 | P[, , 2] = matrix(c(0.1, 0.9, 0, 1), 2, 2, byrow = TRUE) 52 | 53 | # Reward matrix 54 | R = matrix(c(5, 10, -1, 2), 2, 2, byrow = TRUE) 55 | 56 | env = makeEnvironment("mdp", transitions = P, rewards = R) 57 | 58 | ## ---- out.width = "200px", fig.align="center", echo = FALSE-------------- 59 | knitr::include_graphics("gridworld.JPG") 60 | 61 | ## ------------------------------------------------------------------------ 62 | # Gridworld Environment (Sutton & Barto (2017) Example 4.1) 63 | env = makeEnvironment("gridworld", shape = c(4, 4), goal.states = c(0, 15)) 64 | 65 | ## ------------------------------------------------------------------------ 66 | env = makeEnvironment("gridworld", shape = c(4, 4), 67 | goal.states = 0L, initial.state = 15L) 68 | 69 | ## ------------------------------------------------------------------------ 70 | # The initial state of the environment. 71 | env$reset() 72 | 73 | env$visualize() 74 | 75 | # Actions are encoded as integers. 76 | env$step(0L) 77 | 78 | env$visualize() 79 | 80 | # But can also have character names. 81 | env$step("left") 82 | 83 | env$visualize() 84 | 85 | ## ------------------------------------------------------------------------ 86 | env = makeEnvironment("mountain.car") 87 | env$n.actions 88 | env$state.space.bounds 89 | 90 | ## ------------------------------------------------------------------------ 91 | env = makeEnvironment("gridworld", shape = c(4, 4), 92 | goal.states = 0L, initial.state = 15L, discount = 0.99) 93 | 94 | env$step("up") 95 | env$n.step 96 | env$episode.return 97 | 98 | env$step("left") 99 | env$n.step 100 | env$episode.return 101 | 102 | -------------------------------------------------------------------------------- /docs/articles/environments.R: -------------------------------------------------------------------------------- 1 | ## ----setup, include=FALSE------------------------------------------------ 2 | knitr::opts_chunk$set(message = TRUE, eval = TRUE, collapse = TRUE, comment = "#>") 3 | 4 | ## ------------------------------------------------------------------------ 5 | library(reinforcelearn) 6 | 7 | ## ---- out.width = "200px", fig.align="center", echo = FALSE-------------- 8 | knitr::include_graphics("mountaincar.JPG") 9 | 10 | ## ------------------------------------------------------------------------ 11 | reset = function(self) { 12 | position = runif(1, -0.6, -0.4) 13 | velocity = 0 14 | state = matrix(c(position, velocity), ncol = 2) 15 | state 16 | } 17 | 18 | ## ------------------------------------------------------------------------ 19 | step = function(self, action) { 20 | position = self$state[1] 21 | velocity = self$state[2] 22 | velocity = (action - 1L) * 0.001 + cos(3 * position) * (-0.0025) 23 | velocity = min(max(velocity, -0.07), 0.07) 24 | position = position + velocity 25 | if (position < -1.2) { 26 | position = -1.2 27 | velocity = 0 28 | } 29 | state = matrix(c(position, velocity), ncol = 2) 30 | reward = -1 31 | if (position >= 0.5) { 32 | done = TRUE 33 | reward = 0 34 | } else { 35 | done = FALSE 36 | } 37 | list(state, reward, done) 38 | } 39 | 40 | ## ------------------------------------------------------------------------ 41 | env = makeEnvironment(step = step, reset = reset) 42 | 43 | ## ---- eval = 
FALSE------------------------------------------------------- 44 | # # Create a gym environment. 45 | # env = makeEnvironment("gym", gym.name = "MountainCar-v0") 46 | 47 | ## ------------------------------------------------------------------------ 48 | # State transition array 49 | P = array(0, c(2, 2, 2)) 50 | P[, , 1] = matrix(c(0.5, 0.5, 0, 1), 2, 2, byrow = TRUE) 51 | P[, , 2] = matrix(c(0.1, 0.9, 0, 1), 2, 2, byrow = TRUE) 52 | 53 | # Reward matrix 54 | R = matrix(c(5, 10, -1, 2), 2, 2, byrow = TRUE) 55 | 56 | env = makeEnvironment("mdp", transitions = P, rewards = R) 57 | 58 | ## ---- out.width = "200px", fig.align="center", echo = FALSE-------------- 59 | knitr::include_graphics("gridworld.JPG") 60 | 61 | ## ------------------------------------------------------------------------ 62 | # Gridworld Environment (Sutton & Barto (2017) Example 4.1) 63 | env = makeEnvironment("gridworld", shape = c(4, 4), goal.states = c(0, 15)) 64 | 65 | ## ------------------------------------------------------------------------ 66 | env = makeEnvironment("gridworld", shape = c(4, 4), 67 | goal.states = 0L, initial.state = 15L) 68 | 69 | ## ------------------------------------------------------------------------ 70 | # The initial state of the environment. 71 | env$reset() 72 | 73 | env$visualize() 74 | 75 | # Actions are encoded as integers. 76 | env$step(0L) 77 | 78 | env$visualize() 79 | 80 | # But can also have character names. 81 | env$step("left") 82 | 83 | env$visualize() 84 | 85 | ## ------------------------------------------------------------------------ 86 | env = makeEnvironment("mountain.car") 87 | env$n.actions 88 | env$state.space.bounds 89 | 90 | ## ------------------------------------------------------------------------ 91 | env = makeEnvironment("gridworld", shape = c(4, 4), 92 | goal.states = 0L, initial.state = 15L, discount = 0.99) 93 | 94 | env$step("up") 95 | env$n.step 96 | env$episode.return 97 | 98 | env$step("left") 99 | env$n.step 100 | env$episode.return 101 | 102 | -------------------------------------------------------------------------------- /docs/pkgdown.css: -------------------------------------------------------------------------------- 1 | /* Sticker footer */ 2 | body > .container { 3 | display: flex; 4 | padding-top: 60px; 5 | min-height: calc(100vh); 6 | flex-direction: column; 7 | } 8 | 9 | body > .container .row { 10 | flex: 1; 11 | } 12 | 13 | footer { 14 | margin-top: 45px; 15 | padding: 35px 0 36px; 16 | border-top: 1px solid #e5e5e5; 17 | color: #666; 18 | display: flex; 19 | } 20 | footer p { 21 | margin-bottom: 0; 22 | } 23 | footer div { 24 | flex: 1; 25 | } 26 | footer .pkgdown { 27 | text-align: right; 28 | } 29 | footer p { 30 | margin-bottom: 0; 31 | } 32 | 33 | img.icon { 34 | float: right; 35 | } 36 | 37 | img { 38 | max-width: 100%; 39 | } 40 | 41 | /* Section anchors ---------------------------------*/ 42 | 43 | a.anchor { 44 | margin-left: -30px; 45 | display:inline-block; 46 | width: 30px; 47 | height: 30px; 48 | visibility: hidden; 49 | 50 | background-image: url(./link.svg); 51 | background-repeat: no-repeat; 52 | background-size: 20px 20px; 53 | background-position: center center; 54 | } 55 | 56 | .hasAnchor:hover a.anchor { 57 | visibility: visible; 58 | } 59 | 60 | @media (max-width: 767px) { 61 | .hasAnchor:hover a.anchor { 62 | visibility: hidden; 63 | } 64 | } 65 | 66 | 67 | /* Fixes for fixed navbar --------------------------*/ 68 | 69 | .contents h1, .contents h2, .contents h3, .contents h4 { 70 | padding-top: 60px; 71 | margin-top: 
-60px; 72 | } 73 | 74 | /* Static header placement on mobile devices */ 75 | @media (max-width: 767px) { 76 | .navbar-fixed-top { 77 | position: absolute; 78 | } 79 | .navbar { 80 | padding: 0; 81 | } 82 | } 83 | 84 | 85 | /* Sidebar --------------------------*/ 86 | 87 | #sidebar { 88 | margin-top: 30px; 89 | } 90 | #sidebar h2 { 91 | font-size: 1.5em; 92 | margin-top: 1em; 93 | } 94 | 95 | #sidebar h2:first-child { 96 | margin-top: 0; 97 | } 98 | 99 | #sidebar .list-unstyled li { 100 | margin-bottom: 0.5em; 101 | } 102 | 103 | /* Reference index & topics ----------------------------------------------- */ 104 | 105 | .ref-index th {font-weight: normal;} 106 | .ref-index h2 {font-size: 20px;} 107 | 108 | .ref-index td {vertical-align: top;} 109 | .ref-index .alias {width: 40%;} 110 | .ref-index .title {width: 60%;} 111 | 112 | .ref-index .alias {width: 40%;} 113 | .ref-index .title {width: 60%;} 114 | 115 | .ref-arguments th {text-align: right; padding-right: 10px;} 116 | .ref-arguments th, .ref-arguments td {vertical-align: top;} 117 | .ref-arguments .name {width: 20%;} 118 | .ref-arguments .desc {width: 80%;} 119 | 120 | /* Nice scrolling for wide elements --------------------------------------- */ 121 | 122 | table { 123 | display: block; 124 | overflow: auto; 125 | } 126 | 127 | /* Syntax highlighting ---------------------------------------------------- */ 128 | 129 | pre { 130 | word-wrap: normal; 131 | word-break: normal; 132 | border: 1px solid #eee; 133 | } 134 | 135 | pre, code { 136 | background-color: #f8f8f8; 137 | color: #333; 138 | } 139 | 140 | pre .img { 141 | margin: 5px 0; 142 | } 143 | 144 | pre .img img { 145 | background-color: #fff; 146 | display: block; 147 | height: auto; 148 | } 149 | 150 | code a, pre a { 151 | color: #375f84; 152 | } 153 | 154 | .fl {color: #1514b5;} 155 | .fu {color: #000000;} /* function */ 156 | .ch,.st {color: #036a07;} /* string */ 157 | .kw {color: #264D66;} /* keyword */ 158 | .co {color: #888888;} /* comment */ 159 | 160 | .message { color: black; font-weight: bolder;} 161 | .error { color: orange; font-weight: bolder;} 162 | .warning { color: #6A0366; font-weight: bolder;} 163 | 164 | -------------------------------------------------------------------------------- /tests/testthat/test_agent.R: -------------------------------------------------------------------------------- 1 | context("check input combinations") 2 | test_that("softmax and epsilon greedy policies need value function", { 3 | expect_error(makeAgent("softmax"), 4 | "Cannot use this policy without specifying a value function!") 5 | expect_error(makeAgent("greedy"), 6 | "Cannot use this policy without specifying a value function!") 7 | expect_error(makeAgent("epsilon.greedy"), 8 | "Cannot use this policy without specifying a value function!") 9 | }) 10 | 11 | memory = makeReplayMemory() 12 | test_that("experience replay and eligibility traces cannot be used simultaneously", { 13 | expect_error(makeAgent("random", "table", "qlearning", 14 | replay.memory = memory, algorithm.args = list(lambda = 0.8, traces = "replace")), 15 | "Experience replay with eligibility traces is not supported!") 16 | }) 17 | 18 | # #------------- 19 | # # Test observing 20 | # 21 | # env = makeEnvironment("windy.gridworld") 22 | # 23 | # agent = makeAgent("random") 24 | # interact(env, agent, n.steps = 10L, learn = FALSE) 25 | # 26 | # agent = makeAgent("softmax", "table") 27 | # interact(env, agent, n.steps = 10L, learn = FALSE) 28 | # 29 | # agent = makeAgent("random", "table", "qlearning") 30 | # 
interact(env, agent, n.steps = 10L, learn = FALSE) 31 | # 32 | # agent = makeAgent("random", "table", "qlearning", lambda = 0.8, traces = "replace") 33 | # interact(env, agent, n.steps = 2L, learn = FALSE) 34 | # getEligibilityTraces(agent) 35 | # 36 | # mem = makeReplayMemory(size = 2, batch.size = 1) 37 | # agent = makeAgent("random", "table", "qlearning", replay.memory = mem) 38 | # interact(env, agent, n.steps = 10L, learn = FALSE) 39 | # getReplayMemory(agent) 40 | # 41 | # 42 | # #------------- 43 | # # Test learning 44 | # 45 | # # qlearning table base 46 | # agent = makeAgent("random", "table", "qlearning") 47 | # interact(env, agent, n.steps = 2L, learn = TRUE) 48 | # getValueFunction(agent) 49 | # 50 | # # qlearning table eligibility 51 | # agent = makeAgent("random", "table", "qlearning", lambda = 0.8, traces = "replace") 52 | # interact(env, agent, n.steps = 2L, learn = TRUE) 53 | # getValueFunction(agent) 54 | # 55 | # # qlearning table exp replay 56 | # mem = makeReplayMemory(size = 2L, batch.size = 2L) 57 | # agent = makeAgent("random", "table", "qlearning", replay.memory = mem) 58 | # interact(env, agent, n.steps = 2L, learn = TRUE) 59 | # getValueFunction(agent) 60 | # 61 | # # qlearning neural.network base 62 | # library(keras) 63 | # model = keras_model_sequential() %>% 64 | # layer_dense(units = env$n.actions, activation = "linear", 65 | # input_shape = c(env$n.states), kernel_initializer = initializer_zeros(), 66 | # use_bias = FALSE) %>% 67 | # compile(loss = "mae", optimizer = optimizer_sgd(lr = 1)) 68 | # val = makeValueFunction("neural.network", model = model) 69 | # preprocess = function(x) to_categorical(x, num_classes = env$n.states) 70 | # agent = makeAgent("softmax", val, "qlearning", preprocess = preprocess) 71 | # interact(env, agent, n.steps = 2L, learn = TRUE) 72 | # getValueFunction(agent) 73 | # 74 | # # qlearning neural.network exp.replay 75 | # library(keras) 76 | # model = keras_model_sequential() %>% 77 | # layer_dense(units = env$n.actions, activation = "linear", 78 | # input_shape = c(env$n.states), kernel_initializer = initializer_zeros(), 79 | # use_bias = FALSE) %>% 80 | # compile(loss = "mae", optimizer = optimizer_sgd(lr = 1)) 81 | # mem = makeReplayMemory(size = 2L, batch.size = 2L) 82 | # val = makeValueFunction("neural.network", model = model) 83 | # preprocess = function(x) to_categorical(x, num_classes = env$n.states) 84 | # agent = makeAgent("softmax", val, "qlearning", 85 | # preprocess = preprocess, replay.memory = mem) 86 | # interact(env, agent, n.steps = 2L, learn = TRUE) 87 | # getValueFunction(agent) 88 | -------------------------------------------------------------------------------- /vignettes/agents.R: -------------------------------------------------------------------------------- 1 | ## ----setup, include=FALSE------------------------------------------------ 2 | knitr::opts_chunk$set(message = TRUE, eval = TRUE, collapse = TRUE, comment = "#>") 3 | 4 | ## ------------------------------------------------------------------------ 5 | set.seed(12) 6 | library(reinforcelearn) 7 | 8 | ## ------------------------------------------------------------------------ 9 | env = makeEnvironment("gridworld", shape = c(3, 3), goal.states = 0L) 10 | agent = makeAgent(policy = "softmax", val.fun = "table", algorithm = "qlearning") 11 | 12 | ## ------------------------------------------------------------------------ 13 | interact(env, agent, n.episodes = 5L) 14 | 15 | ## ------------------------------------------------------------------------ 16 
| getValueFunction(agent) 17 | 18 | ## ------------------------------------------------------------------------ 19 | # Uniform random policy 20 | makePolicy("random") 21 | 22 | # Epsilon-greedy policy 23 | makePolicy("epsilon.greedy", epsilon = 0.2) 24 | 25 | # Softmax policy 26 | makePolicy("softmax") 27 | 28 | ## ------------------------------------------------------------------------ 29 | makeValueFunction("table", n.states = 9L, n.actions = 4L) 30 | 31 | ## ---- eval = FALSE------------------------------------------------------- 32 | # library(keras) 33 | # model = keras_model_sequential() %>% 34 | # layer_dense(shape = 10L, input_shape = 4L, activation = "linear") %>% 35 | # compile(optimizer = optimizer_sgd(lr = 0.1), loss = "mae") 36 | # makeValueFunction("neural.network", model) 37 | 38 | ## ------------------------------------------------------------------------ 39 | makeAlgorithm("qlearning") 40 | 41 | ## ------------------------------------------------------------------------ 42 | policy = makePolicy("epsilon.greedy", epsilon = 0.2) 43 | val.fun = makeValueFunction("table", n.states = 9L, n.actions = 4L) 44 | algorithm = makeAlgorithm("qlearning") 45 | 46 | agent = makeAgent(policy, val.fun, algorithm) 47 | 48 | ## ------------------------------------------------------------------------ 49 | agent = makeAgent("epsilon.greedy", "table", "qlearning", 50 | policy.args = list(epsilon = 0.2)) 51 | 52 | ## ------------------------------------------------------------------------ 53 | env = makeEnvironment("gridworld", shape = c(3, 2), goal.states = 0L) 54 | agent = makeAgent("random") 55 | 56 | interact(env, agent, n.steps = 3L, visualize = TRUE) 57 | 58 | ## ------------------------------------------------------------------------ 59 | env = makeEnvironment("gridworld", shape = c(4, 4), goal.states = 0L, 60 | initial.state = 15L) 61 | agent = makeAgent("random") 62 | 63 | for (i in 1:3L) { 64 | ## comment in the next line to wait on enter press before taking the next action. 
65 | # invisible(readline(prompt = "Press [enter] to take the next action")) 66 | interact(env, agent, n.steps = 1L, learn = FALSE, visualize = TRUE) 67 | } 68 | 69 | ## ------------------------------------------------------------------------ 70 | (memory = makeReplayMemory(size = 2L, batch.size = 1L)) 71 | 72 | agent = makeAgent("random", replay.memory = memory) 73 | 74 | interact(env, agent, n.steps = 2L, visualize = TRUE) 75 | 76 | getReplayMemory(agent) 77 | 78 | ## ---- message = FALSE---------------------------------------------------- 79 | env = makeEnvironment("gridworld", shape = c(4, 4), goal.states = c(0, 15)) 80 | 81 | policy = makePolicy("epsilon.greedy", epsilon = 0.1) 82 | memory = makeReplayMemory(size = 100L, batch.size = 20L) 83 | 84 | agent = makeAgent(policy, "table", "qlearning", replay.memory = memory) 85 | 86 | for (i in 1:100) { 87 | interact(env, agent, n.steps = 20L, learn = FALSE) 88 | interact(env, agent, n.steps = 1L, learn = TRUE) 89 | } 90 | action.vals = getValueFunction(agent) 91 | matrix(getStateValues(action.vals), ncol = 4L) 92 | 93 | -------------------------------------------------------------------------------- /docs/articles/agents.R: -------------------------------------------------------------------------------- 1 | ## ----setup, include=FALSE------------------------------------------------ 2 | knitr::opts_chunk$set(message = TRUE, eval = TRUE, collapse = TRUE, comment = "#>") 3 | 4 | ## ------------------------------------------------------------------------ 5 | set.seed(12) 6 | library(reinforcelearn) 7 | 8 | ## ------------------------------------------------------------------------ 9 | env = makeEnvironment("gridworld", shape = c(3, 3), goal.states = 0L) 10 | agent = makeAgent(policy = "softmax", val.fun = "table", algorithm = "qlearning") 11 | 12 | ## ------------------------------------------------------------------------ 13 | interact(env, agent, n.episodes = 5L) 14 | 15 | ## ------------------------------------------------------------------------ 16 | getValueFunction(agent) 17 | 18 | ## ------------------------------------------------------------------------ 19 | # Uniform random policy 20 | makePolicy("random") 21 | 22 | # Epsilon-greedy policy 23 | makePolicy("epsilon.greedy", epsilon = 0.2) 24 | 25 | # Softmax policy 26 | makePolicy("softmax") 27 | 28 | ## ------------------------------------------------------------------------ 29 | makeValueFunction("table", n.states = 9L, n.actions = 4L) 30 | 31 | ## ---- eval = FALSE------------------------------------------------------- 32 | # library(keras) 33 | # model = keras_model_sequential() %>% 34 | # layer_dense(shape = 10L, input_shape = 4L, activation = "linear") %>% 35 | # compile(optimizer = optimizer_sgd(lr = 0.1), loss = "mae") 36 | # makeValueFunction("neural.network", model) 37 | 38 | ## ------------------------------------------------------------------------ 39 | makeAlgorithm("qlearning") 40 | 41 | ## ------------------------------------------------------------------------ 42 | policy = makePolicy("epsilon.greedy", epsilon = 0.2) 43 | val.fun = makeValueFunction("table", n.states = 9L, n.actions = 4L) 44 | algorithm = makeAlgorithm("qlearning") 45 | 46 | agent = makeAgent(policy, val.fun, algorithm) 47 | 48 | ## ------------------------------------------------------------------------ 49 | agent = makeAgent("epsilon.greedy", "table", "qlearning", 50 | policy.args = list(epsilon = 0.2)) 51 | 52 | ## ------------------------------------------------------------------------ 53 | env = 
makeEnvironment("gridworld", shape = c(3, 2), goal.states = 0L) 54 | agent = makeAgent("random") 55 | 56 | interact(env, agent, n.steps = 3L, visualize = TRUE) 57 | 58 | ## ------------------------------------------------------------------------ 59 | env = makeEnvironment("gridworld", shape = c(4, 4), goal.states = 0L, 60 | initial.state = 15L) 61 | agent = makeAgent("random") 62 | 63 | for (i in 1:3L) { 64 | ## comment in the next line to wait on enter press before taking the next action. 65 | # invisible(readline(prompt = "Press [enter] to take the next action")) 66 | interact(env, agent, n.steps = 1L, learn = FALSE, visualize = TRUE) 67 | } 68 | 69 | ## ------------------------------------------------------------------------ 70 | (memory = makeReplayMemory(size = 2L, batch.size = 1L)) 71 | 72 | agent = makeAgent("random", replay.memory = memory) 73 | 74 | interact(env, agent, n.steps = 2L, visualize = TRUE) 75 | 76 | getReplayMemory(agent) 77 | 78 | ## ---- message = FALSE---------------------------------------------------- 79 | env = makeEnvironment("gridworld", shape = c(4, 4), goal.states = c(0, 15)) 80 | 81 | policy = makePolicy("epsilon.greedy", epsilon = 0.1) 82 | memory = makeReplayMemory(size = 100L, batch.size = 20L) 83 | 84 | agent = makeAgent(policy, "table", "qlearning", replay.memory = memory) 85 | 86 | for (i in 1:100) { 87 | interact(env, agent, n.steps = 20L, learn = FALSE) 88 | interact(env, agent, n.steps = 1L, learn = TRUE) 89 | } 90 | action.vals = getValueFunction(agent) 91 | matrix(getStateValues(action.vals), ncol = 4L) 92 | 93 | -------------------------------------------------------------------------------- /R/environment_mdp.R: -------------------------------------------------------------------------------- 1 | #' MDP Environment 2 | #' 3 | #' Markov Decision Process environment. 4 | #' 5 | #' @section Usage: 6 | #' `makeEnvironment("MDP", transitions, rewards, initial.state, ...)` 7 | #' 8 | #' @param transitions \[`array (n.states x n.states x n.actions)`] \cr 9 | #' State transition array. 10 | #' @param rewards \[`matrix (n.states x n.actions)`] \cr 11 | #' Reward array. 12 | #' @param initial.state \[`integer`] \cr 13 | #' Optional starting state. 14 | #' If a vector is given a starting state will be 15 | #' randomly sampled from this vector whenever `reset` is called. 16 | #' Note that states are numerated starting with 17 | #' 0. If `initial.state = NULL` all non-terminal states are 18 | #' possible starting states. 19 | #' @param ... \[`any`] \cr Arguments passed on to [makeEnvironment]. 20 | #' 21 | #' @md 22 | #' 23 | #' @name MdpEnvironment 24 | #' @inheritSection Environment Methods 25 | #' @export 26 | #' 27 | #' @examples 28 | #' # Create a Markov Decision Process. 29 | #' P = array(0, c(2, 2, 2)) 30 | #' P[, , 1] = matrix(c(0.5, 0.5, 0, 1), 2, 2, byrow = TRUE) 31 | #' P[, , 2] = matrix(c(0, 1, 0, 1), 2, 2, byrow = TRUE) 32 | #' R = matrix(c(5, 10, -1, 2), 2, 2, byrow = TRUE) 33 | #' env = makeEnvironment("mdp", transitions = P, rewards = R) 34 | #' env$reset() 35 | #' env$step(1L) 36 | NULL 37 | 38 | MdpEnvironment = R6::R6Class("MdpEnvironment", 39 | inherit = Environment, 40 | 41 | public = list( 42 | action.space = NULL, 43 | actions = NULL, 44 | initial.state = NULL, 45 | n.actions = NULL, 46 | n.states = NULL, 47 | rewards = NULL, 48 | state.space = NULL, 49 | states = NULL, 50 | terminal.states = NULL, 51 | transitions = NULL, 52 | 53 | initialize = function(transitions, rewards, initial.state, ...) 
{ 54 | checkmate::assertArray(transitions, any.missing = FALSE, d = 3L) 55 | checkmate::assertArray(rewards, any.missing = FALSE, d = 2L) 56 | 57 | self$state.space = "Discrete" 58 | self$action.space = "Discrete" 59 | self$n.actions = dim(transitions)[3] 60 | self$n.states = dim(transitions)[1] 61 | self$actions = seq_len(self$n.actions) - 1L 62 | self$states = seq_len(self$n.states) - 1L 63 | self$transitions = transitions 64 | self$rewards = rewards 65 | terminal.states = apply(transitions, 3L, function(x) diag(x)) 66 | self$terminal.states = which(apply(terminal.states, 1L, function(x) all(x == 1L))) - 1L 67 | if (length(self$terminal.states) == 0) { 68 | warning("There are no terminal states in the MDP!") 69 | self$terminal.states = -1L 70 | } 71 | if (missing(initial.state)) { 72 | self$initial.state = setdiff(self$states, self$terminal.states) 73 | } else { 74 | checkmate::assertIntegerish(initial.state, upper = self$n.states - 1L) 75 | self$initial.state = initial.state 76 | } 77 | 78 | step_ = function(env, action) { 79 | # if (is.character(action)) { 80 | # action = self$action.names[action] 81 | # } 82 | reward = self$rewards[self$state + 1L, action + 1L] # use old state here! 83 | state = sample(self$states, size = 1L, 84 | prob = self$transitions[self$state + 1L, , action + 1L]) 85 | if (state %in% self$terminal.states) { 86 | done = TRUE 87 | } else { 88 | done = FALSE 89 | } 90 | list(state, reward, done) 91 | } 92 | 93 | reset_ = function(env) { 94 | state = ifelse(length(self$initial.state) > 1L, 95 | sample(self$initial.state, size = 1L), self$initial.state) 96 | state 97 | } 98 | # call initialize of superclass with mdp step and reset function 99 | super$initialize(step_, reset_, ...) 100 | } 101 | ) 102 | ) 103 | -------------------------------------------------------------------------------- /docs/authors.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Authors • reinforcelearn 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 35 | 36 | 37 | 38 | 39 | 40 |
Markus Dumke. Author, maintainer.
Site built with pkgdown.
    122 | 123 | 124 | 125 | -------------------------------------------------------------------------------- /R/environment_gym.R: -------------------------------------------------------------------------------- 1 | #' Gym Environment 2 | #' 3 | #' Reinforcement learning environment from OpenAI Gym. 4 | #' 5 | #' For available gym environments take a look at https://gym.openai.com/envs. 6 | #' 7 | #' @section Usage: 8 | #' `makeEnvironment("gym", gym.name, ...)` 9 | #' 10 | #' @section Installation: 11 | #' For installation of the python package `gym` see 12 | #' https://github.com/openai/gym#installation. 13 | #' Then install the R package `reticulate`. 14 | #' 15 | #' @param gym.name \[`character(1)`] \cr 16 | #' Name of gym environment, e.g. \code{"CartPole-v0"}. 17 | #' @param ... \[`any`] \cr Arguments passed on to [makeEnvironment]. 18 | #' 19 | #' @md 20 | #' 21 | #' @section Methods: 22 | #' * `$close()` 23 | #' Close visualization window. 24 | #' 25 | #' @name GymEnvironment 26 | #' @inheritSection Environment Methods 27 | #' @export 28 | #' 29 | #' @examples 30 | #' \dontrun{ 31 | #' # Create an OpenAI Gym environment. 32 | #' # Make sure you have Python, gym and reticulate installed. 33 | #' env = makeEnvironment("gym", gym.name = "MountainCar-v0") 34 | #' env$reset() 35 | #' env$close() 36 | #' } 37 | NULL 38 | 39 | GymEnvironment = R6::R6Class("GymEnvironment", 40 | inherit = Environment, 41 | 42 | public = list( 43 | gym.env = NULL, 44 | gym.name = NULL, 45 | 46 | action.space = NULL, 47 | actions = NULL, 48 | action.shape = NULL, 49 | n.actions = NULL, 50 | action.space.bounds = NULL, 51 | 52 | state.space = NULL, 53 | state.shape = NULL, 54 | states = NULL, 55 | n.states = NULL, 56 | state.space.bounds = NULL, 57 | 58 | close = function() { 59 | self$gym.env$close() 60 | }, 61 | 62 | initialize = function(gym.name, ...) { 63 | if (!requireNamespace("reticulate", quietly = TRUE)) { 64 | stop("Please install the reticulate package to use environments from OpenAI Gym. 65 | Also make sure you have the python package gym installed.", 66 | call. = FALSE) 67 | } 68 | checkmate::assertCharacter(gym.name, len = 1) 69 | self$gym.name = gym.name 70 | 71 | gym = reticulate::import("gym") 72 | self$gym.env = gym$make(gym.name) 73 | 74 | action.space.info = self$gym.env$action_space 75 | self$action.space = extractSpaceClass(action.space.info) 76 | 77 | state.space.info = self$gym.env$observation_space 78 | self$state.space = extractSpaceClass(state.space.info) 79 | 80 | if (self$action.space == "Discrete") { 81 | res = extractDiscreteInfo(action.space.info) 82 | self$n.actions = res$n 83 | self$actions = res$x 84 | } 85 | 86 | if (self$action.space == "Box") { 87 | res = extractBoxInfo(action.space.info) 88 | self$action.space.bounds = res$bounds 89 | self$action.shape = res$shape 90 | } 91 | 92 | if (self$state.space == "Discrete") { 93 | res = extractDiscreteInfo(state.space.info) 94 | self$n.actions = res$n 95 | self$actions = res$x 96 | } 97 | 98 | if (self$state.space == "Box") { 99 | res = extractBoxInfo(state.space.info) 100 | self$state.space.bounds = res$bounds 101 | self$state.shape = res$shape 102 | } 103 | 104 | step_ = function(self, action) { 105 | res = self$gym.env$step(action) 106 | res[1:3] 107 | } 108 | 109 | reset_ = function(self) { 110 | state = self$gym.env$reset() 111 | state 112 | } 113 | 114 | visualize_ = function(self) { 115 | self$gym.env$render() 116 | } 117 | 118 | super$initialize(step_, reset_, visualize_, ...) 
119 | } 120 | ) 121 | ) 122 | 123 | 124 | extractDiscreteInfo = function(info) { 125 | n = info$n 126 | x = seq(0, n - 1) 127 | list(n = n, x = x) 128 | } 129 | 130 | extractBoxInfo = function(info) { 131 | list(bounds = list(info$low, info$high), shape = info$shape[[1]]) # does [[1]] work in all cases? 132 | } 133 | 134 | extractSpaceClass = function(info) { 135 | sub(".*\\.", "", class(info)[1]) 136 | } 137 | -------------------------------------------------------------------------------- /R/experience_replay.R: -------------------------------------------------------------------------------- 1 | #' Experience Replay 2 | #' 3 | #' Create replay memory for experience replay. 4 | #' 5 | #' Sampling from replay memory will be uniform. 6 | #' 7 | #' @param size \[`integer(1)`] \cr Size of replay memory. 8 | #' @param batch.size \[`integer(1)`] \cr Batch size. 9 | #' 10 | #' @return \[`list(size, batch.size)`] 11 | #' This list can then be passed onto [makeAgent], which will construct the 12 | #' replay memory accordingly. 13 | #' 14 | #' @md 15 | #' @aliases experience.replay, replay.memory 16 | #' @export 17 | #' 18 | #' @examples 19 | #' memory = makeReplayMemory(size = 100L, batch.size = 16L) 20 | makeReplayMemory = function(size = 100L, batch.size = 16L) { # add arguments for priorization 21 | checkmate::assertInt(size, lower = 1) 22 | checkmate::assertInt(batch.size, lower = 1, upper = size) 23 | x = list(size = size, batch.size = batch.size) 24 | class(x) = "ReplayMemory" 25 | x 26 | } 27 | 28 | ReplayMemory = R6::R6Class("ReplayMemory", 29 | public = list( 30 | memory = NULL, 31 | size = NULL, 32 | batch.size = NULL, 33 | index = 0L, 34 | index.full = 0L, 35 | 36 | # fixme allow growing replay memory? 37 | initialize = function(size, batch.size) { 38 | self$size = size 39 | self$batch.size = batch.size 40 | self$memory = vector("list", length = self$size) 41 | }, 42 | 43 | # # initialize following policy 44 | # initializeMemory = function(env, policy) { 45 | # for (i in seq_len(self$size)) { 46 | # action = policy$sampleAction() 47 | # env$step(action) 48 | # data = list(state = preprocessState(envir$previous.state), action = action, 49 | # reward = envir$reward, next.state = preprocessState(envir$state)) 50 | # } 51 | # }, 52 | 53 | observe = function(state, action, reward, next.state) { 54 | self$index = self$index + 1L 55 | self$index.full = self$index.full + 1L 56 | self$index.full = min(self$size, self$index.full) 57 | index = self$getReplacementIndex() 58 | obs = self$getReplayObservation(state, action, reward, next.state) 59 | self$add(obs, index) 60 | }, 61 | 62 | getReplayObservation = function(state, action, reward, next.state) { 63 | list(state = state, action = action, reward = reward, next.state = next.state) 64 | }, 65 | 66 | # e.g. 
oldest entry 67 | getReplacementIndex = function() { 68 | if (self$index > self$size) { 69 | self$index = 1L 70 | } 71 | self$index 72 | }, 73 | 74 | add = function(observation, index) { 75 | self$memory[[index]] = observation 76 | }, 77 | 78 | isFull = function(memory = self$memory) { 79 | # maybe it is enough to check the last entry 80 | full = !(any(purrr::map_lgl(memory, is.null))) 81 | full 82 | }, 83 | 84 | extract = function(batch, member, fun = lapply) { 85 | states = fun(batch, "[[", member) 86 | states 87 | }, 88 | 89 | # checkMemory = function(memory = self$memory, batch.size = self$batch.size) { 90 | # if (!self$isFull()) { 91 | # if (self$index < batch.size) { 92 | # return(FALSE) 93 | # } 94 | # } 95 | # }, 96 | 97 | sampleBatch = function(memory = self$memory[seq_len(self$index.full)], batch.size = self$batch.size) { 98 | if (length(memory) >= batch.size) { 99 | indices = self$getIndices(length(memory), batch.size) 100 | batch = memory[indices] 101 | return(purrr::transpose(batch)) 102 | } else { 103 | message("Cannot sample from replay memory because batch size > number of non-empty entries in replay memory.") 104 | } 105 | }, 106 | 107 | getIndices = function(memory.size, batch.size) { 108 | indices = sample(seq_len(memory.size), size = batch.size) 109 | indices 110 | } 111 | ) 112 | ) 113 | 114 | # ideas: maybe replay memory in future not list but hash table / dictionary etc 115 | # data frame with list columns? 116 | # fixme allow dynamic change of replay memory length 117 | # store preprocessed state? 118 | -------------------------------------------------------------------------------- /docs/articles/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Articles • reinforcelearn 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 35 | 36 | 37 | 38 | 39 | 40 |
All vignettes
Site built with pkgdown.
    122 | 123 | 124 | 125 | -------------------------------------------------------------------------------- /docs/news/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | All news • reinforcelearn 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 35 | 36 | 37 | 38 | 39 | 40 |
    122 | 123 |
    124 |
    125 | 126 | 127 | 128 | -------------------------------------------------------------------------------- /R/policy.R: -------------------------------------------------------------------------------- 1 | #' Create policy. 2 | #' 3 | #' Reinforcement learning policies. 4 | #' 5 | #' @param class \[`character(1)`] \cr 6 | #' Class of policy. One of `c("random", "epsilon.greedy", "greedy", "softmax")`. 7 | #' @param ... \[`any`] \cr Optional named arguments passed on to the subclass. Alternatively 8 | #' these can be given using the `args` argument. 9 | #' @param args \[`list`] \cr Optional list of named arguments passed on to the 10 | #' subclass. The arguments in ... take precedence over values in this list. 11 | #' We strongly encourage you to use one or the other to pass arguments 12 | #' to the function but not both. 13 | #' 14 | #' @return \[`list(name, args)`] List with the name and optional args. 15 | #' This list can then be passed onto [makeAgent], which will construct the 16 | #' policy accordingly. 17 | #' 18 | #' @md 19 | #' @aliases Policy 20 | #' 21 | #' @section Policies: 22 | #' * [RandomPolicy] 23 | #' * [GreedyPolicy] 24 | #' * [EpsilonGreedyPolicy] 25 | #' * [SoftmaxPolicy] 26 | #' 27 | #' @export 28 | #' @examples 29 | #' policy = makePolicy("random") 30 | #' policy = makePolicy("epsilon.greedy", epsilon = 0.1) 31 | makePolicy = function(class = "random", args = list(), ...) { 32 | checkmate::assertChoice(class, 33 | c("random", "epsilon.greedy", "greedy", "softmax")) #, "gaussian")) 34 | checkmate::assertList(args, names = "unique") 35 | args = append(list(...), args) 36 | # remove duplicate entries in args list 37 | args = args[unique(names(args))] 38 | 39 | # fixme: check arguments of policy here 40 | x = list(name = class, args = args) 41 | class(x) = "Policy" 42 | x 43 | } 44 | 45 | 46 | Policy = R6::R6Class("Policy", 47 | public = list( 48 | sampleAction = function(policy) { 49 | action = sample(seq_along(policy), prob = policy, 50 | size = 1, replace = TRUE) - 1L 51 | action 52 | } 53 | ) 54 | ) 55 | 56 | #' Epsilon Greedy Policy 57 | #' 58 | #' @aliases GreedyPolicy 59 | #' @export 60 | #' @section Usage: 61 | #' \code{makePolicy("epsilon.greedy", epsilon = 0.1)} \cr 62 | #' \code{makePolicy("greedy")} 63 | #' 64 | #' @param epsilon [\code{numeric(1) in [0, 1]}] \cr 65 | #' Ratio of random exploration in epsilon-greedy action selection. 66 | #' 67 | #' @name EpsilonGreedyPolicy 68 | #' @examples 69 | #' policy = makePolicy("epsilon.greedy", epsilon = 0.1) 70 | NULL 71 | 72 | EpsilonGreedyPolicy = R6::R6Class("EpsilonGreedyPolicy", 73 | inherit = Policy, 74 | public = list( 75 | epsilon = NULL, 76 | getActionProbs = function(Q, n.actions) { # fixme: break ties 77 | greedy.action = nnet::which.is.max(Q) 78 | policy = matrix(0, nrow = 1, ncol = n.actions) 79 | policy[, greedy.action] = 1 - self$epsilon 80 | policy = policy + self$epsilon / n.actions 81 | policy 82 | }, 83 | initialize = function(epsilon = 0.1) { 84 | checkmate::assertNumber(epsilon, lower = 0, upper = 1) 85 | self$epsilon = epsilon 86 | } 87 | ) 88 | ) 89 | 90 | GreedyPolicy = R6::R6Class("GreedyPolicy", 91 | # inherit = EpsilonGreedyPolicy, 92 | public = list( 93 | getActionProbs = function(Q, n.actions) { 94 | greedy.action = nnet::which.is.max(Q) # this is duplicate code! 
95 | policy = matrix(0, nrow = 1, ncol = n.actions) 96 | policy[, greedy.action] = 1 97 | policy 98 | } 99 | ) 100 | ) 101 | 102 | #' Random Policy 103 | #' 104 | #' @export 105 | #' @section Usage: 106 | #' \code{makePolicy("random")} 107 | #' 108 | #' @name RandomPolicy 109 | #' @examples 110 | #' pol = makePolicy("random") 111 | NULL 112 | 113 | RandomPolicy = R6::R6Class("RandomPolicy", 114 | inherit = Policy, 115 | public = list( 116 | getActionProbs = function(Q, n.actions) { 117 | policy = matrix(1 / n.actions, nrow = 1, ncol = n.actions) 118 | policy 119 | } 120 | ) 121 | ) 122 | 123 | # GaussianPolicy = R6::R6Class("GaussianPolicy", 124 | # inherit = Policy, 125 | # public = list( 126 | # sampleAction = function(mean, sd) { 127 | # rnorm(1L, mean, sd) 128 | # } 129 | # ) 130 | # ) 131 | 132 | #' Softmax Policy 133 | #' 134 | #' @export 135 | #' @section Usage: 136 | #' \code{makePolicy("softmax")} 137 | #' 138 | #' @name SoftmaxPolicy 139 | #' @examples 140 | #' pol = makePolicy("softmax") 141 | NULL 142 | 143 | SoftmaxPolicy = R6::R6Class("SoftmaxPolicy", 144 | inherit = Policy, 145 | public = list( 146 | getActionProbs = function(Q, n.actions) { 147 | policy = exp(Q) / rowSums(exp(Q)) 148 | policy 149 | } 150 | ) 151 | ) 152 | -------------------------------------------------------------------------------- /man/gridworld.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/environment_gridworld.R 3 | \name{Gridworld} 4 | \alias{Gridworld} 5 | \title{Gridworld} 6 | \arguments{ 7 | \item{shape}{[\code{integer(2)}] \cr 8 | Shape of the gridworld (number of rows x number of columns).} 9 | 10 | \item{goal.states}{[\code{integer}] \cr 11 | Goal states in the gridworld.} 12 | 13 | \item{cliff.states}{[\code{integer}] \cr 14 | Cliff states in the gridworld.} 15 | 16 | \item{reward.step}{[\code{integer(1)}] \cr 17 | Reward for taking a step.} 18 | 19 | \item{cliff.transition.states}{[\code{integer}] \cr 20 | States to which the environment transitions if stepping into the cliff. 21 | If it is a vector, all states will have equal probability. 22 | Only used when \code{cliff.transition.done == FALSE}, 23 | else specify the \code{initial.state} argument.} 24 | 25 | \item{reward.cliff}{[\code{integer(1)}] \cr 26 | Reward for taking a step in the cliff state.} 27 | 28 | \item{diagonal.moves}{[\code{logical(1)}] \cr 29 | Should diagonal moves be allowed?} 30 | 31 | \item{wind}{[\code{integer}] \cr 32 | Strength of the upward wind in each cell.} 33 | 34 | \item{cliff.transition.done}{[\code{logical(1)}] \cr 35 | Should the episode end after stepping into the cliff?} 36 | 37 | \item{stochasticity}{[\code{numeric(1)}] \cr 38 | Probability of random transition to any of the neighboring states when taking any action.} 39 | 40 | \item{...}{[\code{any}] \cr Arguments passed on to \link{makeEnvironment}.} 41 | } 42 | \description{ 43 | Creates gridworld environments. 44 | } 45 | \details{ 46 | A gridworld is an episodic navigation task, the goal is to get from start state to goal state. 47 | 48 | Possible actions include going left, right, up or down. If \code{diagonal.moves = TRUE} diagonal 49 | moves are also possible, leftup, leftdown, rightup and rightdown. 50 | 51 | When stepping into a cliff state you get a reward of \code{reward.cliff}, 52 | usually a high negative reward and transition to a state specified by \code{cliff.transition.states}. 
53 | 54 | In each column a deterministic wind specified via \code{wind} pushes you up a specific number of 55 | grid cells (for the next action). 56 | 57 | A stochastic gridworld is a gridworld where with probability \code{stochasticity} the next state 58 | is chosen at random from all neighbor states independent of the actual action. 59 | 60 | If an action would take you off the grid, the new state is the nearest cell inside the grid. 61 | For each step you get a reward of \code{reward.step}, until you reach a goal state, 62 | then the episode is done. 63 | 64 | States are enumerated row-wise and numeration starts with 0. 65 | Here is an example 4x4 grid: 66 | \tabular{rrrr}{ 67 | 0 \tab 1 \tab 2 \tab 3 \cr 68 | 4 \tab 5 \tab 6 \tab 7 \cr 69 | 8 \tab 9 \tab 10 \tab 11 \cr 70 | 12 \tab 13 \tab 14 \tab 15 \cr 71 | } 72 | So a board position could look like this (G: goal state, x: current state, C: cliff state): 73 | \tabular{rrrr}{ 74 | G \tab o \tab o \tab o \cr 75 | o \tab o \tab o \tab o \cr 76 | o \tab x \tab o \tab o \cr 77 | o \tab o \tab o \tab C \cr 78 | } 79 | } 80 | \section{Usage}{ 81 | 82 | \code{makeEnvironment("gridworld", shape = NULL, goal.states = NULL, cliff.states = NULL, reward.step = -1, reward.cliff = -100, diagonal.moves = FALSE, wind = rep(0, shape[2]), cliff.transition.states = NULL, cliff.transition.done = FALSE, stochasticity = 0, ...)} 83 | } 84 | 85 | \section{Methods}{ 86 | 87 | \itemize{ 88 | \item \code{$step(action)} \cr 89 | Take action in environment. 90 | Returns a list with \code{state}, \code{reward}, \code{done}. 91 | \item \code{$reset()} \cr 92 | Resets the \code{done} flag of the environment and returns an initial state. 93 | Useful when starting a new episode. 94 | \item \code{$visualize()} \cr 95 | Visualizes the environment (if there is a visualization function). 96 | } 97 | } 98 | 99 | \examples{ 100 | # Gridworld Environment (Sutton & Barto Example 4.1) 101 | env1 = makeEnvironment("gridworld", shape = c(4L, 4L), goal.states = 0L, 102 | initial.state = 15L) 103 | env1$reset() 104 | env1$visualize() 105 | env1$step(0L) 106 | env1$visualize() 107 | 108 | # Windy Gridworld (Sutton & Barto Example 6.5) 109 | env2 = makeEnvironment("gridworld", shape = c(7, 10), goal.states = 37L, 110 | reward.step = -1, wind = c(0, 0, 0, 1, 1, 1, 2, 2, 1, 0), 111 | initial.state = 30L) 112 | 113 | # Cliff Walking (Sutton & Barto Example 6.6) 114 | env3 = makeEnvironment("gridworld", shape = c(4, 12), goal.states = 47L, 115 | cliff.states = 37:46, reward.step = -1, reward.cliff = -100, 116 | cliff.transition.states = 36L, initial.state = 36L) 117 | } 118 | -------------------------------------------------------------------------------- /benchmark/benchmark_windy_gridworld.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Benchmark Algorithms on Windy Gridworld Task" 3 | author: "Markus Dumke" 4 | date: "`r Sys.Date()`" 5 | output: github_document 6 | --- 7 | 8 | ```{r setup, include = FALSE} 9 | knitr::opts_chunk$set(comment = "#>", collapse = FALSE, message = FALSE) 10 | knitr::opts_chunk$set(fig.path = 'Images/', eval = TRUE, cache = FALSE, 11 | size = "footnotesize", fig.asp = 0.618, fig.width = 4.5, fig.align = "center", 12 | message = FALSE, comment = "#>", collapse = TRUE, echo = TRUE) 13 | ``` 14 | 15 | 16 | ```{r} 17 | library(reinforcelearn) 18 | env = makeEnvironment("windy.gridworld") 19 | ``` 20 | 21 | The optimal solution is 15 steps. 
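A quick sanity check, not part of the original benchmark: the preset windy gridworld is the 7 x 10 grid with four moves, and the agents below size their value tables from the `n.states` and `n.actions` attributes printed here.

```{r}
# Task dimensions the value functions below are built on.
c(n.states = env$n.states, n.actions = env$n.actions)
```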
22 | 23 | ## Simple Q-Learning 24 | 25 | ```{r} 26 | policy = makePolicy("epsilon.greedy", epsilon = 0.1) 27 | agent = makeAgent(policy, "table", "qlearning") 28 | 29 | res = interact(env, agent, n.episodes = 500L) 30 | ``` 31 | 32 | ```{r qlearning_windygrid, echo = FALSE, fig.align = "center"} 33 | library(ggplot2) 34 | df = data.frame(episode = seq_along(res$steps), 35 | steps = res$steps) 36 | 37 | ggplot(df, aes(episode, steps), col = "brown1") + 38 | geom_point(alpha = 0.2) + 39 | theme_bw() + 40 | labs( 41 | title = "Q-Learning", 42 | x = "Episode", 43 | y = "Steps per episode" 44 | ) + 45 | coord_cartesian(ylim = c(0, 200)) + 46 | geom_smooth(se = FALSE, size = 1) + 47 | geom_hline(yintercept = 15, size = 1, col = "black", lty = 2) 48 | ``` 49 | 50 | ## Q-Learning with Eligibility Traces 51 | 52 | ```{r} 53 | env$resetEverything() 54 | policy = makePolicy("epsilon.greedy", epsilon = 0.1) 55 | alg = makeAlgorithm("qlearning", lambda = 0.8, traces = "accumulate") 56 | agent = makeAgent(policy, "table", alg) 57 | 58 | res = interact(env, agent, n.episodes = 500L) 59 | ``` 60 | 61 | ```{r qlearning_windygrid_elig, echo = FALSE, fig.align = "center"} 62 | library(ggplot2) 63 | df = data.frame(episode = seq_along(res$steps), 64 | steps = res$steps) 65 | 66 | ggplot(df, aes(episode, steps), col = "brown1") + 67 | geom_point(alpha = 0.2) + 68 | theme_bw() + 69 | labs( 70 | title = "Q-Learning", 71 | subtitle = "Eligibility traces", 72 | x = "Episode", 73 | y = "Steps per episode" 74 | ) + 75 | coord_cartesian(ylim = c(0, 200)) + 76 | geom_smooth(se = FALSE, size = 1) + 77 | geom_hline(yintercept = 15, size = 1, col = "black", lty = 2) 78 | ``` 79 | 80 | ## Q-Learning with Experience replay 81 | 82 | ```{r} 83 | env$resetEverything() 84 | policy = makePolicy("epsilon.greedy", epsilon = 0.1) 85 | mem = makeReplayMemory(size = 10L, batch.size = 10L) 86 | agent = makeAgent(policy, "table", "qlearning", replay.memory = mem) 87 | 88 | res = interact(env, agent, n.episodes = 500L) 89 | ``` 90 | 91 | ```{r qlearning_windygrid_expreplay, echo = FALSE, fig.align = "center"} 92 | library(ggplot2) 93 | df = data.frame(episode = seq_along(res$steps), 94 | steps = res$steps) 95 | 96 | ggplot(df, aes(episode, steps), col = "brown1") + 97 | geom_point(alpha = 0.2) + 98 | theme_bw() + 99 | labs( 100 | title = "Q-Learning", 101 | subtitle = "Experience replay", 102 | x = "Episode", 103 | y = "Steps per episode" 104 | ) + 105 | coord_cartesian(ylim = c(0, 200)) + 106 | geom_smooth(se = FALSE, size = 1) + 107 | geom_hline(yintercept = 15, size = 1, col = "black", lty = 2) 108 | ``` 109 | 110 | ## Q-Learning with neural network and experience replay 111 | 112 | ```{r} 113 | env$resetEverything() 114 | library(keras) 115 | model = keras_model_sequential() %>% 116 | layer_dense(units = env$n.actions, activation = "linear", 117 | input_shape = c(env$n.states), kernel_initializer = initializer_zeros(), 118 | use_bias = FALSE) %>% 119 | compile(loss = "mae", optimizer = optimizer_sgd(lr = 1)) 120 | mem = makeReplayMemory(size = 2L, batch.size = 2L) 121 | val = makeValueFunction("neural.network", model = model) 122 | policy = makePolicy("epsilon.greedy", epsilon = 0.1) 123 | preprocess = function(x) to_categorical(x, num_classes = env$n.states) 124 | agent = makeAgent(policy, val, "qlearning", 125 | preprocess = preprocess, replay.memory = mem) 126 | 127 | res = interact(env, agent, n.episodes = 500L) 128 | ``` 129 | 130 | ```{r qlearning_windygrid_neuralnetwork, echo = FALSE, fig.align = "center"} 131 | 
library(ggplot2) 132 | df = data.frame(episode = seq_along(res$steps), 133 | steps = res$steps) 134 | 135 | ggplot(df, aes(episode, steps), col = "brown1") + 136 | geom_point(alpha = 0.2) + 137 | theme_bw() + 138 | labs( 139 | title = "Q-Learning", 140 | subtitle = "Experience replay and neural network", 141 | x = "Episode", 142 | y = "Steps per episode" 143 | ) + 144 | coord_cartesian(ylim = c(0, 200)) + 145 | geom_smooth(se = FALSE, size = 1) + 146 | geom_hline(yintercept = 15, size = 1, col = "black", lty = 2) 147 | ``` 148 | -------------------------------------------------------------------------------- /docs/LICENSE.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | License • reinforcelearn 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 35 | 36 | 37 | 38 | 39 | 40 |
    YEAR: 2017
    100 | COPYRIGHT HOLDER: Markus Dumke
    101 | 
    102 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
    103 | 
    104 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
    105 | 
    106 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
    107 | 
    125 | 126 | 127 | 128 | -------------------------------------------------------------------------------- /R/interact.R: -------------------------------------------------------------------------------- 1 | #' Interaction between agent and environment. 2 | #' 3 | #' Run interaction between agent and environment for specified number of steps 4 | #' or episodes. 5 | #' 6 | #' @param env \[`Environment`] \cr Reinforcement learning environment created by [makeEnvironment]. 7 | #' @param agent \[`Agent`] \cr Agent created by [makeAgent]. 8 | #' @param n.steps \[`integer(1)`] \cr Number of steps to run. 9 | #' @param n.episodes \[`integer(1)`] \cr Number of episodes to run. 10 | #' @param max.steps.per.episode \[`integer(1)`] \cr Maximal number of steps allowed per episode. 11 | #' @param learn \[`logical(1)`] \cr Should the agent learn? 12 | #' @param visualize \[`logical(1)`] \cr Visualize the interaction between agent and environment? 13 | #' 14 | #' @return \[`list`] Return and number of steps per episode. 15 | #' 16 | #' @md 17 | #' 18 | #' @export 19 | #' @examples 20 | #' env = makeEnvironment("windy.gridworld") 21 | #' agent = makeAgent("softmax", "table", "qlearning") 22 | #' interact(env, agent, n.episodes = 10L) 23 | interact = function(env, agent, n.steps = Inf, n.episodes = Inf, 24 | max.steps.per.episode = Inf, learn = TRUE, visualize = FALSE) { 25 | 26 | checkmate::assertClass(env, "Environment") 27 | checkmate::assertClass(agent, "Agent") 28 | if (!is.infinite(n.steps)) checkmate::assertInt(n.steps, lower = 1) 29 | if (!is.infinite(n.episodes)) checkmate::assertInt(n.episodes, lower = 1) 30 | if (!is.infinite(max.steps.per.episode)) checkmate::assertInt(max.steps.per.episode, lower = 1) 31 | checkmate::assertFlag(learn) 32 | checkmate::assertFlag(visualize) 33 | 34 | # one of steps / episodes must be finite! 35 | if (is.infinite(n.steps) && is.infinite(n.episodes)) { 36 | stop("Specify finite number of steps or finite number of episodes!") 37 | } 38 | 39 | # preallocation if number of episodes | steps is known in advance else append to list 40 | if (n.episodes < Inf) { 41 | episode.returns = rep(NA_real_, n.episodes) 42 | } else { 43 | episode.returns = vector(mode = "double") 44 | } 45 | if (n.episodes < Inf) { 46 | episode.steps = rep(NA_integer_, n.episodes) 47 | } else { 48 | episode.steps = vector(mode = "integer") 49 | } 50 | 51 | # index to fill in 52 | episode = 0L 53 | 54 | # get episode | step number of when to stop 55 | stop.step = env$n.step + n.steps 56 | stop.episode = env$episode + n.episodes 57 | 58 | # # check if environment has been resetted, if not reset else get current state 59 | # if (is.null(env$state)) { 60 | # message("Reset environment.") 61 | # state = env$reset() 62 | # if (visualize) { 63 | # env$visualize() 64 | # } 65 | # } else { 66 | state = env$state 67 | #} 68 | 69 | agent$n.actions = env$n.actions 70 | 71 | if (agent$initialized == FALSE) { 72 | agent$init(env) # if e.g. 
value fun has not been initialized do this here 73 | agent$initialized = TRUE 74 | } 75 | 76 | while (TRUE) { 77 | # print(paste0("episode: ", env$episode, "; step: ", env$n.step)) 78 | # # agent$observeBeforeAct() # observe before act 79 | action = agent$act(state) # fixme: store action also in agent attribute 80 | res = env$step(action) 81 | 82 | if (visualize) { 83 | env$visualize() 84 | } 85 | 86 | # # keep track of visited states, actions, rewards 87 | # agent$history = append(agent$history, list(list(state = state, action = action, 88 | # reward = res$reward, episode = env$episode + 1L))) 89 | 90 | # observe: e.g. add observation to replay memory 91 | agent$observe(state, action, res$reward, res$state, env) 92 | 93 | # optional learning (check whether to learn maybe as agent method) 94 | if (learn) { 95 | #browser() 96 | agent$learn(env, learn) 97 | } 98 | 99 | state = res$state # set state to next state for new iteration 100 | 101 | # when episode is finished print out information and reset environment 102 | if (res$done || env$episode.step == max.steps.per.episode) { 103 | if (!res$done) { 104 | env$episode = env$episode + 1L 105 | } 106 | message(paste("Episode", env$episode, "finished after", 107 | env$episode.step, "steps with a return of", env$episode.return)) # let this be customizable by having his in a function argument 108 | episode = episode + 1L 109 | episode.returns[episode] = env$episode.return 110 | episode.steps[episode] = env$episode.step 111 | state = env$reset() 112 | # if (visualize) { 113 | # env$visualize() 114 | # } 115 | agent$reset() 116 | } 117 | 118 | # stop criteria 119 | if (env$n.step == stop.step || env$episode == stop.episode) { 120 | break 121 | } 122 | } 123 | # return information about returns, steps 124 | list(returns = episode.returns, steps = episode.steps) # return history 125 | } 126 | # fixme: logging 127 | # fixme: control when to learn 128 | # fixme: print out average return of last n episodes ... 129 | # fixme: maybe return training time, history ... 130 | # make message after done configurable as function argument 131 | -------------------------------------------------------------------------------- /R/tiles.R: -------------------------------------------------------------------------------- 1 | #' Tile Coding 2 | #' 3 | #' Implementation of Sutton's tile coding software version 3. 4 | #' 5 | #' @param iht \[`IHT`] \cr A hash table created with `iht`. 6 | #' @param n.tilings \[`integer(1)`] \cr Number of tilings. 7 | #' @param state \[`vector(2)`] \cr A two-dimensional state observation. 8 | #' Make sure to scale the observation to unit variance before. 9 | #' @param action \[`integer(1)`] \cr Optional: If supplied the action space 10 | #' will also be tiled. All distinct actions will result in different tile numbers. 11 | #' 12 | #' @return `iht` creates a hash table, which can then be passed on to `tiles`. 13 | #' `tiles` returns an integer vector of size `n.tilings` with the active tile numbers. 14 | #' 15 | #' @md 16 | #' 17 | #' @details 18 | #' Tile coding is a way of representing the values of a vector of continuous variables as a large 19 | #' binary vector with few 1s and many 0s. The binary vector is not represented explicitly, 20 | #' but as a list of the components that are 1s. The main step is to partition, or tile, 21 | #' the continuous space multiple times and select one tile from each tiling, that corresponding 22 | #' the the vector's value. 
Each tile is converted to an element in the big binary vector, 23 | #' and the list of the tile (element) numbers is returned as the representation of the vector's value. 24 | #' Tile coding is recommended as a way of applying online learning methods to domains with continuous 25 | #' state or action variables. \[copied from manual] 26 | #' 27 | #' See detailed manual on the web. 28 | #' In comparison to the Python implementation indices start with 1 instead of 0. The hash table is 29 | #' implemented as an environment, which is an attribute of an R6 class. 30 | #' 31 | #' Make sure that the size of the hash table is large enough, else an error will be triggered, 32 | #' when trying to assign a value to a full hash table. 33 | #' 34 | #' @references Sutton and Barto (Book draft 2017): Reinforcement Learning: An Introduction 35 | #' @rdname tilecoding 36 | #' @export 37 | #' @examples 38 | #' # Create hash table 39 | #' hash = iht(1024) 40 | #' 41 | #' # Partition state space using 8 tilings 42 | #' tiles(hash, n.tilings = 8, state = c(3.6, 7.21)) 43 | #' tiles(hash, n.tilings = 8, state = c(3.7, 7.21)) 44 | #' tiles(hash, n.tilings = 8, state = c(4, 7)) 45 | #' tiles(hash, n.tilings = 8, state = c(- 37.2, 7)) 46 | #' 47 | tiles = function(iht, n.tilings, state, action = integer(0)) { 48 | checkmate::assertClass(iht, "IHT") 49 | checkmate::assertInt(n.tilings) 50 | checkmate::assertVector(state) 51 | checkmate::assertIntegerish(action, max.len = 1) 52 | 53 | qfloats = floor(state * n.tilings) 54 | active.tiles = rep(0, n.tilings) 55 | coords = rep(0, length(state) + 1) 56 | 57 | for (tiling in seq_len(n.tilings)) { 58 | tiling = tiling - 1 59 | tiling2 = tiling * 2 60 | coords[1] = tiling 61 | b = tiling 62 | for (q in seq_along(qfloats)) { 63 | coords[q + 1] = (qfloats[q] + b) %/% n.tilings 64 | b = b + tiling2 65 | } 66 | coords = append(coords, action) 67 | active.tiles[tiling + 1] = hashcoords(paste(coords, collapse = ""), iht) 68 | } 69 | 70 | return(active.tiles) 71 | } 72 | 73 | hashcoords = function(coords, iht) { 74 | iht$add2Env(coords) 75 | iht$checkFull() 76 | iht$getIndex(coords) 77 | } 78 | 79 | #' @rdname tilecoding 80 | #' @param max.size \[`integer(1)`] \cr Maximal size of hash table. 81 | #' @export 82 | #' @md 83 | iht = function(max.size) { 84 | checkmate::assertInt(max.size) 85 | IHTClass$new(max.size) 86 | } 87 | 88 | IHTClass = R6::R6Class("IHT", 89 | public = list( 90 | i = 0, 91 | max.size = NULL, 92 | e = NULL, 93 | 94 | initialize = function(max.size) { 95 | self$max.size = max.size 96 | self$e = new.env(size = max.size) 97 | }, 98 | 99 | checkFull = function() { 100 | if (length(self$e) > self$max.size) { 101 | stop("Tile Coding failed because hash table IHT is full!") 102 | } 103 | }, 104 | 105 | add2Env = function(coords) { 106 | if (!exists(coords, envir = self$e, inherits = FALSE)) { 107 | self$i = self$i + 1 108 | self$checkFull() 109 | self$e[[coords]] = self$i 110 | } 111 | }, 112 | 113 | getIndex = function(coords) { 114 | return(self$e[[coords]]) 115 | } 116 | ) 117 | ) 118 | 119 | #' Make n hot vector. 120 | #' 121 | #' @param x \[`integer`] \cr Which features are active? 122 | #' @param len \[`integer(1)`] \cr Length of the feature vector. 123 | #' @param out \[`character(1)`] \cr Format of the output. Can be a vector or a matrix. 124 | #' 125 | #' @return \[`matrix(1, len)`] A one-row matrix with `len` columns with every 126 | #' entry 0 except the columns specified by `x` which are 1. 
127 | #' 128 | #' @md 129 | #' 130 | #' @export 131 | #' @examples 132 | #' nHot(c(1, 3), 5) 133 | #' nHot(c(1, 3), 5, out = "vector") 134 | nHot = function(x, len, out = "matrix") { 135 | checkmate::assertIntegerish(x, max.len = len) 136 | checkmate::assertInt(len) 137 | if (out == "matrix") { 138 | m = matrix(rep(0, len), nrow = 1) 139 | m[1, x] = 1 140 | } else { 141 | m = rep(0, len) 142 | m[x] = 1 143 | } 144 | m 145 | } 146 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | ```{r, echo = FALSE} 6 | knitr::opts_chunk$set( 7 | collapse = TRUE, 8 | comment = "#>", 9 | message = FALSE, 10 | fig.path = "README-" 11 | ) 12 | ``` 13 | 14 | # Reinforcement Learning in R 15 | 16 | [![Travis-CI Build Status](https://travis-ci.org/markusdumke/reinforcelearn.svg?branch=master)](https://travis-ci.org/markusdumke/reinforcelearn) 17 | [![CRAN_Status_Badge](http://www.r-pkg.org/badges/version/reinforcelearn)](https://cran.r-project.org/package=reinforcelearn) 18 | [![Coverage Status](https://img.shields.io/codecov/c/github/markusdumke/reinforcelearn/master.svg?maxAge=600)](https://codecov.io/github/markusdumke/reinforcelearn?branch=master) 19 | 20 | ```{r, include = FALSE} 21 | writeLines(capture.output(devtools::session_info()), "session_info.txt") 22 | ``` 23 | 24 | 25 | ### Documentation 26 | 27 | [Website](https://markusdumke.github.io/reinforcelearn) 28 | 29 | ---- 30 | 31 | ### Installation 32 | 33 | ```{r, eval = FALSE} 34 | # Install from CRAN. 35 | install.packages("reinforcelearn") 36 | 37 | # Install development version from github. 38 | devtools::install_github("markusdumke/reinforcelearn") 39 | ``` 40 | 41 | ---- 42 | 43 | ### Get started 44 | 45 | Reinforcement Learning with the package `reinforcelearn` is as easy as 46 | 47 | ```{r} 48 | library(reinforcelearn) 49 | 50 | env = makeEnvironment("windy.gridworld") 51 | agent = makeAgent("softmax", "table", "qlearning") 52 | 53 | # Run interaction for 10 episodes. 54 | interact(env, agent, n.episodes = 10L) 55 | ``` 56 | 57 | ---- 58 | 59 | ### Environments 60 | 61 | With `makeEnvironment` you can create reinforcement learning environments. 62 | 63 | ```{r} 64 | # Create environment. 65 | step = function(self, action) { 66 | state = list(mean = action + rnorm(1), sd = runif(1)) 67 | reward = rnorm(1, state[[1]], state[[2]]) 68 | done = FALSE 69 | list(state, reward, done) 70 | } 71 | 72 | reset = function(self) { 73 | state = list(mean = 0, sd = 1) 74 | state 75 | } 76 | 77 | env = makeEnvironment("custom", step = step, reset = reset) 78 | ``` 79 | 80 | The environment is an `R6` class with a set of attributes and methods. 81 | You can interact with the environment via the `reset` and `step` method. 82 | 83 | ```{r} 84 | # Reset environment. 85 | env$reset() 86 | 87 | # Take action. 88 | env$step(100) 89 | ``` 90 | 91 | There are some predefined environment classes, e.g. `MDPEnvironment`, which allows you to create a Markov Decision Process by passing on state transition array and reward matrix, or `GymEnvironment`, where you can use toy problems from [OpenAI Gym](https://gym.openai.com/). 92 | 93 | ```{r, eval = FALSE} 94 | # Create a gym environment. 95 | # Make sure you have Python, gym and reticulate installed. 96 | env = makeEnvironment("gym", gym.name = "MountainCar-v0") 97 | 98 | # Take random actions for 200 steps. 
99 | env$reset() 100 | for (i in 1:200) { 101 | action = sample(0:2, 1) 102 | env$step(action) 103 | env$visualize() 104 | } 105 | env$close() 106 | ``` 107 | 108 | This should open a window showing a graphical visualization of the environment during interaction. 109 | 110 | For more details on how to create an environment have a look at the vignette: [Environments](https://markusdumke.github.io/reinforcelearn/articles/environments.html) 111 | 112 | ---- 113 | 114 | ### Agents 115 | 116 | With `makeAgent` you can set up a reinforcement learning agent to solve the environment, i.e. to find the best action in each time step. 117 | 118 | The first step is to set up the policy, which defines which action to choose. For example we could use a uniform random policy. 119 | 120 | ```{r} 121 | # Create the environment. 122 | env = makeEnvironment("windy.gridworld") 123 | 124 | # Create agent with uniform random policy. 125 | policy = makePolicy("random") 126 | agent = makeAgent(policy) 127 | 128 | # Run interaction for 10 steps. 129 | interact(env, agent, n.steps = 10L) 130 | ``` 131 | 132 | In this scenario the agent chooses all actions with equal probability and will not learn anything from the interaction. Usually we want the agent to be able to learn something. Value-based algorithms learn a value function from interaction with the environment and adjust the policy according to the value function. For example we could set up Q-Learning with a softmax policy. 133 | 134 | ```{r} 135 | # Create the environment. 136 | env = makeEnvironment("windy.gridworld") 137 | 138 | # Create qlearning agent with softmax policy and tabular value function. 139 | policy = makePolicy("softmax") 140 | values = makeValueFunction("table", n.states = env$n.states, n.actions = env$n.actions) 141 | algorithm = makeAlgorithm("qlearning") 142 | agent = makeAgent(policy, values, algorithm) 143 | 144 | # Run interaction for 10 steps. 145 | interact(env, agent, n.episodes = 10L) 146 | ``` 147 | 148 | ---- 149 | 150 | ### Vignettes 151 | 152 | Also have a look at the vignettes for further examples. 153 | 154 | - [Environments](https://markusdumke.github.io/reinforcelearn/articles/environments.html) 155 | - [Agents](https://markusdumke.github.io/reinforcelearn/articles/agents.html) 156 | 157 | ---- 158 | 159 | Logo is a modification of https://www.r-project.org/logo/. 160 | -------------------------------------------------------------------------------- /R/valuefunction.R: -------------------------------------------------------------------------------- 1 | #' Value Function Representation 2 | #' 3 | #' A representation of the value function. 4 | #' 5 | #' @param class \[`character(1)`] \cr Class of value function approximation. 6 | #' One of `c("table", "neural.network")`. 7 | #' @inheritParams makePolicy 8 | #' 9 | #' @return \[`list(name, args)`] List with the name and optional args. 10 | #' This list can then be passed onto [makeAgent], which will construct the 11 | #' value function accordingly. 12 | #' 13 | #' @md 14 | #' 15 | #' @section Representations: 16 | #' * [ValueTable] 17 | #' * [ValueNetwork] 18 | #' 19 | #' @export 20 | #' @examples 21 | #' val = makeValueFunction("table", n.states = 16L, n.actions = 4L) 22 | #' # If the number of states and actions is not supplied, the agent will try 23 | #' # to figure these out from the environment object during interaction. 24 | #' val = makeValueFunction("table") 25 | makeValueFunction = function(class, args = list(), ...) 
{ 26 | checkmate::assertChoice(class, c("table", "neural.network")) #, "keras.neural.network", "mxnet.neural.network")) 27 | # fixme: check arguments here 28 | checkmate::assertList(args, names = "unique") 29 | args = append(list(...), args) 30 | # remove duplicate entries in args list 31 | args = args[unique(names(args))] 32 | 33 | x = list(name = class, args = args) 34 | class(x) = "ValueFunction" 35 | x 36 | } 37 | # comment: this could also be used for policy params -> better name? 38 | 39 | 40 | #' Value Table 41 | #' 42 | #' Table representing the action value function Q. 43 | #' 44 | #' You can specify the shape of the value table. If omitted the agent will try 45 | #' to configure these automatically from the environment during interaction 46 | #' (therefore the environment needs to have a `n.states` and `n.actions` attribute). 47 | #' 48 | #' @section Usage: 49 | #' `makeValueFunction("table", n.states = NULL, n.actions = 1L, 50 | #' step.size = 0.1, initial.value = NULL)` 51 | #' 52 | #' @param n.states \[`integer(1)`] \cr Number of states (rows in the value function). 53 | #' @param n.actions \[`integer(1)`] \cr Number of actions (columns in the value function). 54 | #' @param step.size \[`numeric(1)`] \cr Step size (learning rate) for gradient descent update. 55 | #' 56 | #' @name ValueTable 57 | #' @aliases table 58 | #' @md 59 | #' 60 | #' @examples 61 | #' val = makeValueFunction("table", n.states = 20L, n.actions = 4L) 62 | NULL 63 | 64 | ValueTable = R6::R6Class("ValueTable", 65 | public = list( 66 | Q = NULL, 67 | step.size = NULL, 68 | 69 | # fixme: get number of states and actions automatically from environment 70 | # fixme: custom initializer, e.g. not to 0 71 | initialize = function(n.states = NULL, n.actions = 1L, step.size = 0.1, 72 | initial.value = NULL) { 73 | 74 | checkmate::assertInt(n.states, lower = 1) 75 | checkmate::assertInt(n.actions, lower = 1) 76 | checkmate::assertNumber(step.size, lower = 0) 77 | checkmate::assertMatrix(initial.value, null.ok = TRUE) 78 | 79 | # state or action value function 80 | if (!is.null(initial.value)) { 81 | self$Q = initial.value 82 | } else { 83 | self$Q = matrix(0, nrow = n.states, ncol = n.actions) 84 | } 85 | self$step.size = step.size 86 | }, 87 | 88 | predictQ = function(state) { 89 | self$Q[state + 1L, , drop = FALSE] 90 | }, 91 | 92 | # fixme: make this vectorised -> ok 93 | # caveat: states must be unique! 94 | train = function(state, target, step.size = self$step.size) { 95 | self$Q[state + 1L, ] = self$Q[state + 1L, ] + step.size * (target - self$Q[state + 1L, ]) # drop = FALSE ? 96 | }, 97 | 98 | # train with td error and eligibility traces 99 | trainWithError = function(eligibility, error, step.size = self$step.size) { 100 | self$Q = self$Q + step.size * error * eligibility 101 | }, 102 | 103 | processBatch = function(batch) { 104 | data = data.frame(state = unlist(batch[["state"]]), action = unlist(batch[["action"]]), 105 | reward = unlist(batch[["reward"]]), next.state = unlist(batch[["next.state"]])) 106 | data 107 | }, 108 | 109 | getWeights = function() { 110 | self$Q 111 | } 112 | ) 113 | ) 114 | 115 | #' Value Network 116 | #' 117 | #' Neural network representing the action value function Q. 118 | #' 119 | #' @section Usage: 120 | #' `makeValueFunction("neural.network", model)` 121 | #' 122 | #' @param model \[`keras model`] \cr A keras model. 123 | #' Make sure that the model has been compiled. 
124 | #' 125 | #' @name ValueNetwork 126 | #' @aliases neural.network 127 | #' @md 128 | #' 129 | #' @examples 130 | #' \dontrun{ 131 | #' library(keras) 132 | #' model = keras_model_sequential() 133 | #' model %>% layer_dense(20, input_shape = 10, activation = "relu") 134 | #' model %>% layer_dense(4, activation = "softmax") 135 | #' keras::compile(model, loss = "mae", optimizer = keras::optimizer_sgd(lr = 0.4)) 136 | #' 137 | #' val = makeValueFunction("neural.network", model = model) 138 | #' } 139 | NULL 140 | 141 | ValueNetwork = R6::R6Class("ValueNetwork", 142 | public = list( 143 | model = NULL, 144 | 145 | # keras model # fixme: add support for mxnet 146 | initialize = function(model) { 147 | checkmate::assertClass(model, "keras.models.Sequential") 148 | self$model = model 149 | }, 150 | 151 | predictQ = function(state) { 152 | predict(self$model, state) # another function? 153 | }, 154 | 155 | train = function(state, target) { # add ... argument to pass on arguments to fit 156 | keras::fit(self$model, state, target, verbose = 0L) 157 | }, 158 | 159 | processBatch = function(batch) { 160 | data = list( 161 | state = do.call(rbind, batch[["state"]]), # problematic for matrix with many columns, purrr::reduce 162 | action = unlist(batch[["action"]]), 163 | reward = unlist(batch[["reward"]]), 164 | next.state = purrr::reduce(batch[["next.state"]], rbind) 165 | ) 166 | data 167 | }, 168 | 169 | getWeights = function() { 170 | self$model %>% get_weights() 171 | } 172 | ) 173 | ) 174 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | Reinforcement Learning in R 3 | ======================================================================================= 4 | 5 | [![Travis-CI Build Status](https://travis-ci.org/markusdumke/reinforcelearn.svg?branch=master)](https://travis-ci.org/markusdumke/reinforcelearn) [![CRAN\_Status\_Badge](http://www.r-pkg.org/badges/version/reinforcelearn)](https://cran.r-project.org/package=reinforcelearn) [![Coverage Status](https://img.shields.io/codecov/c/github/markusdumke/reinforcelearn/master.svg?maxAge=600)](https://codecov.io/github/markusdumke/reinforcelearn?branch=master) 6 | 7 | WARNING: This package is not maintained anymore! 8 | 9 | ### Documentation 10 | 11 | [Website](https://markusdumke.github.io/reinforcelearn) 12 | 13 | ------------------------------------------------------------------------ 14 | 15 | ### Installation 16 | 17 | ``` r 18 | # Install from CRAN. 19 | install.packages("reinforcelearn") 20 | 21 | # Install development version from github. 22 | devtools::install_github("markusdumke/reinforcelearn") 23 | ``` 24 | 25 | ------------------------------------------------------------------------ 26 | 27 | ### Get started 28 | 29 | Reinforcement Learning with the package `reinforcelearn` is as easy as 30 | 31 | ``` r 32 | library(reinforcelearn) 33 | 34 | env = makeEnvironment("windy.gridworld") 35 | agent = makeAgent("softmax", "table", "qlearning") 36 | 37 | # Run interaction for 10 episodes. 38 | interact(env, agent, n.episodes = 10L) 39 | #> $returns 40 | #> [1] -3244 -2335 -1734 -169 -879 -798 -216 -176 -699 -232 41 | #> 42 | #> $steps 43 | #> [1] 3244 2335 1734 169 879 798 216 176 699 232 44 | ``` 45 | 46 | ------------------------------------------------------------------------ 47 | 48 | ### Environments 49 | 50 | With `makeEnvironment` you can create reinforcement learning environments. 
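Besides the fully custom environment built below, the predefined problems listed in `makeEnvironment`'s documentation can be created from their class name alone, e.g. (a quick sketch):

``` r
# Predefined environments only need their class name.
env = makeEnvironment("cliff.walking")
env = makeEnvironment("mountain.car")
```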
51 | 52 | ``` r 53 | # Create environment. 54 | step = function(self, action) { 55 | state = list(mean = action + rnorm(1), sd = runif(1)) 56 | reward = rnorm(1, state[[1]], state[[2]]) 57 | done = FALSE 58 | list(state, reward, done) 59 | } 60 | 61 | reset = function(self) { 62 | state = list(mean = 0, sd = 1) 63 | state 64 | } 65 | 66 | env = makeEnvironment("custom", step = step, reset = reset) 67 | ``` 68 | 69 | The environment is an `R6` class with a set of attributes and methods. You can interact with the environment via the `reset` and `step` method. 70 | 71 | ``` r 72 | # Reset environment. 73 | env$reset() 74 | #> $mean 75 | #> [1] 0 76 | #> 77 | #> $sd 78 | #> [1] 1 79 | 80 | # Take action. 81 | env$step(100) 82 | #> $state 83 | #> $state$mean 84 | #> [1] 99.56104 85 | #> 86 | #> $state$sd 87 | #> [1] 0.5495179 88 | #> 89 | #> 90 | #> $reward 91 | #> [1] 99.40968 92 | #> 93 | #> $done 94 | #> [1] FALSE 95 | ``` 96 | 97 | There are some predefined environment classes, e.g. `MDPEnvironment`, which allows you to create a Markov Decision Process by passing on state transition array and reward matrix, or `GymEnvironment`, where you can use toy problems from [OpenAI Gym](https://gym.openai.com/). 98 | 99 | ``` r 100 | # Create a gym environment. 101 | # Make sure you have Python, gym and reticulate installed. 102 | env = makeEnvironment("gym", gym.name = "MountainCar-v0") 103 | 104 | # Take random actions for 200 steps. 105 | env$reset() 106 | for (i in 1:200) { 107 | action = sample(0:2, 1) 108 | env$step(action) 109 | env$visualize() 110 | } 111 | env$close() 112 | ``` 113 | 114 | This should open a window showing a graphical visualization of the environment during interaction. 115 | 116 | For more details on how to create an environment have a look at the vignette: [Environments](https://markusdumke.github.io/reinforcelearn/articles/environments.html) 117 | 118 | ------------------------------------------------------------------------ 119 | 120 | ### Agents 121 | 122 | With `makeAgent` you can set up a reinforcement learning agent to solve the environment, i.e. to find the best action in each time step. 123 | 124 | The first step is to set up the policy, which defines which action to choose. For example we could use a uniform random policy. 125 | 126 | ``` r 127 | # Create the environment. 128 | env = makeEnvironment("windy.gridworld") 129 | 130 | # Create agent with uniform random policy. 131 | policy = makePolicy("random") 132 | agent = makeAgent(policy) 133 | 134 | # Run interaction for 10 steps. 135 | interact(env, agent, n.steps = 10L) 136 | #> $returns 137 | #> numeric(0) 138 | #> 139 | #> $steps 140 | #> integer(0) 141 | ``` 142 | 143 | In this scenario the agent chooses all actions with equal probability and will not learn anything from the interaction. Usually we want the agent to be able to learn something. Value-based algorithms learn a value function from interaction with the environment and adjust the policy according to the value function. For example we could set up Q-Learning with a softmax policy. 144 | 145 | ``` r 146 | # Create the environment. 147 | env = makeEnvironment("windy.gridworld") 148 | 149 | # Create qlearning agent with softmax policy and tabular value function. 150 | policy = makePolicy("softmax") 151 | values = makeValueFunction("table", n.states = env$n.states, n.actions = env$n.actions) 152 | algorithm = makeAlgorithm("qlearning") 153 | agent = makeAgent(policy, values, algorithm) 154 | 155 | # Run interaction for 10 steps. 
156 | interact(env, agent, n.episodes = 10L) 157 | #> $returns 158 | #> [1] -1524 -3496 -621 -374 -173 -1424 -1742 -468 -184 -39 159 | #> 160 | #> $steps 161 | #> [1] 1524 3496 621 374 173 1424 1742 468 184 39 162 | ``` 163 | 164 | ------------------------------------------------------------------------ 165 | 166 | ### Vignettes 167 | 168 | Also have a look at the vignettes for further examples. 169 | 170 | - [Environments](https://markusdumke.github.io/reinforcelearn/articles/environments.html) 171 | - [Agents](https://markusdumke.github.io/reinforcelearn/articles/agents.html) 172 | 173 | ------------------------------------------------------------------------ 174 | 175 | Logo is a modification of . 176 | -------------------------------------------------------------------------------- /vignettes/agents.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Agents" 3 | author: Markus Dumke 4 | date: "`r Sys.Date()`" 5 | output:rmarkdown::html_vignette: 6 | fig_caption: yes 7 | bibliography: references.bib 8 | vignette: > 9 | %\VignetteIndexEntry{Agents} 10 | %\VignetteEngine{knitr::rmarkdown} 11 | %\VignetteEncoding{UTF-8} 12 | --- 13 | 14 | 19 | 20 | ```{r setup, include=FALSE} 21 | knitr::opts_chunk$set(message = TRUE, eval = TRUE, collapse = TRUE, comment = "#>") 22 | ``` 23 | 24 | ```{r} 25 | set.seed(12) 26 | library(reinforcelearn) 27 | ``` 28 | 29 | A reinforcement learning agent usually consists of three parts: a policy, a value function representation and an algorithm which updates the value function or policy parameters. In the following it will be explained how to create an agent in `reinforcelearn` to solve an environment. 30 | 31 | You can create an agent with the function `makeAgent`. This will create an R6 class object with the corresponding policy, value function and algorithm. 32 | 33 | ```{r} 34 | env = makeEnvironment("gridworld", shape = c(3, 3), goal.states = 0L) 35 | agent = makeAgent(policy = "softmax", val.fun = "table", algorithm = "qlearning") 36 | ``` 37 | 38 | Then you can run the agent in the environment by calling `interact` for a specified number of steps or episodes. 39 | 40 | ```{r} 41 | interact(env, agent, n.episodes = 5L) 42 | ``` 43 | 44 | Note that `interact` returns a list with the number of steps and returns per episode. Furthermore it will change the environment and agent object. So the environment's state or the agent's value function weights will have most likely changed after the interaction. 45 | 46 | Although you can directly access the agent object, this is not recommended as this will be very likely to change in the next package versions. Instead use one of the accessor functions to e.g. get the weights of the action value function. 47 | 48 | ```{r} 49 | getValueFunction(agent) 50 | ``` 51 | 52 | ## Policies 53 | 54 | A policy is the agent's behavior function. We can define the policy with `makePolicy`. 55 | 56 | ```{r} 57 | # Uniform random policy 58 | makePolicy("random") 59 | 60 | # Epsilon-greedy policy 61 | makePolicy("epsilon.greedy", epsilon = 0.2) 62 | 63 | # Softmax policy 64 | makePolicy("softmax") 65 | ``` 66 | 67 | This will just capture what policy to use and the policy will then be created when we create the agent. 68 | 69 | ## Value Functions 70 | 71 | Many reinforcement learning algorithms use a value function to learn values of state and action pairs. 72 | The value function can be represented with different types of function approximation, e.g. as a table or neural network. 
73 | 74 | ```{r} 75 | makeValueFunction("table", n.states = 9L, n.actions = 4L) 76 | ``` 77 | 78 | For a neural network you can use the `keras` package. Therefore you need to specify a the model's architecture and pass these on to `makeValueFunction`. 79 | 80 | ```{r, eval = FALSE} 81 | library(keras) 82 | model = keras_model_sequential() %>% 83 | layer_dense(shape = 10L, input_shape = 4L, activation = "linear") %>% 84 | compile(optimizer = optimizer_sgd(lr = 0.1), loss = "mae") 85 | makeValueFunction("neural.network", model) 86 | ``` 87 | 88 | Note that online neural network training is currently very slow. One way to work with this is to make updates to the value function not after every interaction, but to store all interactions in a replay memory and make updates to the neural network only once in a while. Read more about this in Section Experience Replay. 89 | 90 | Often you need to preprocess the state observation in a way the agent can work with this. Therefore you can pass on a function to the `preprocess` argument of `makeAgent`, which will then be applied to the state observation before the agent learns on this. 91 | 92 | For neural network training the outcome of `preprocess` must be a one-row matrix in order to be able to learn. 93 | 94 | ## Algorithms 95 | 96 | The algorithm defines how to learn from an interaction with the environment. We can set up an algorithm using the function `makeAlgorithm`. 97 | 98 | ```{r} 99 | makeAlgorithm("qlearning") 100 | ``` 101 | 102 | ## Agent 103 | 104 | If we have defined policy, value function and algorithm we can create the agent by calling `makeAgent`. 105 | 106 | ```{r} 107 | policy = makePolicy("epsilon.greedy", epsilon = 0.2) 108 | val.fun = makeValueFunction("table", n.states = 9L, n.actions = 4L) 109 | algorithm = makeAlgorithm("qlearning") 110 | 111 | agent = makeAgent(policy, val.fun, algorithm) 112 | ``` 113 | 114 | Note that you can also call `makeAgent` with character arguments which can save some typing. 115 | 116 | ```{r} 117 | agent = makeAgent("epsilon.greedy", "table", "qlearning", 118 | policy.args = list(epsilon = 0.2)) 119 | ``` 120 | 121 | ## Interaction 122 | 123 | You can run an interaction between an agent and environment with the `interact` function. 124 | 125 | ```{r} 126 | env = makeEnvironment("gridworld", shape = c(3, 2), goal.states = 0L) 127 | agent = makeAgent("random") 128 | 129 | interact(env, agent, n.steps = 3L, visualize = TRUE) 130 | ``` 131 | 132 | It allows you to run an interaction for a specified number of steps or episodes and you can also specify a maximum number of steps per episode. 133 | This makes it very flexible to step through the environment one action after the other. Note you can also run an interaction without learning. 134 | 135 | ```{r} 136 | env = makeEnvironment("gridworld", shape = c(4, 4), goal.states = 0L, 137 | initial.state = 15L) 138 | agent = makeAgent("random") 139 | 140 | for (i in 1:3L) { 141 | ## comment in the next line to wait on enter press before taking the next action. 142 | # invisible(readline(prompt = "Press [enter] to take the next action")) 143 | interact(env, agent, n.steps = 1L, learn = FALSE, visualize = TRUE) 144 | } 145 | ``` 146 | 147 | ### Experience replay 148 | 149 | Experience replay is a technique to learn at once from multiple past observations. Therefore all the states, actions, rewards and subsequent states will be stored in a list (the so called replay memory) and at each step a random batch from this memory will be replayed. 
150 | 151 | ```{r} 152 | (memory = makeReplayMemory(size = 2L, batch.size = 1L)) 153 | 154 | agent = makeAgent("random", replay.memory = memory) 155 | 156 | interact(env, agent, n.steps = 2L, visualize = TRUE) 157 | 158 | getReplayMemory(agent) 159 | ``` 160 | 161 | Here is an example training with experience replay, where the value function is updated only every 21 steps. 162 | 163 | ```{r, message = FALSE} 164 | env = makeEnvironment("gridworld", shape = c(4, 4), goal.states = c(0, 15)) 165 | 166 | policy = makePolicy("epsilon.greedy", epsilon = 0.1) 167 | memory = makeReplayMemory(size = 100L, batch.size = 20L) 168 | 169 | agent = makeAgent(policy, "table", "qlearning", replay.memory = memory) 170 | 171 | for (i in 1:100) { 172 | interact(env, agent, n.steps = 20L, learn = FALSE) 173 | interact(env, agent, n.steps = 1L, learn = TRUE) 174 | } 175 | action.vals = getValueFunction(agent) 176 | matrix(getStateValues(action.vals), ncol = 4L) 177 | ``` 178 | -------------------------------------------------------------------------------- /R/environment.R: -------------------------------------------------------------------------------- 1 | #' Create reinforcement learning environment. 2 | #' 3 | #' This function creates an environment for reinforcement learning. 4 | #' 5 | #' Use the `step` method to interact with the environment. 6 | #' 7 | #' Note that all states and actions are numerated starting with 0! 8 | #' 9 | #' For a detailed explanation and more examples 10 | #' have a look at the vignette "How to create an environment?". 11 | #' 12 | #' @param class \[`character(1)`] \cr 13 | #' Class of environment. One of `c("custom", "mdp", "gym", "gridworld")`. 14 | #' @param discount \[`numeric(1)` in (0, 1)] \cr Discount factor. 15 | #' @param ... \[`any`] \cr Arguments passed on to the specific environment. 16 | #' 17 | #' @md 18 | #' 19 | #' @return R6 class of class Environment. 20 | #' 21 | #' @section Methods: 22 | #' * `$step(action)` \cr 23 | #' Take action in environment. 24 | #' Returns a list with `state`, `reward`, `done`. 25 | #' * `$reset()` \cr 26 | #' Resets the `done` flag of the environment and returns an initial state. 27 | #' Useful when starting a new episode. 28 | #' * `$visualize()` \cr 29 | #' Visualizes the environment (if there is a visualization function). 30 | #' 31 | #' @section Environments: 32 | #' * [Environment] 33 | #' * [GymEnvironment] 34 | #' * [MdpEnvironment] 35 | #' * [Gridworld] 36 | #' * [MountainCar] 37 | #' 38 | #' @export 39 | #' @examples 40 | #' step = function(self, action) { 41 | #' state = list(mean = action + rnorm(1), sd = runif(1)) 42 | #' reward = rnorm(1, state[[1]], state[[2]]) 43 | #' done = FALSE 44 | #' list(state, reward, done) 45 | #' } 46 | #' 47 | #' reset = function(self) { 48 | #' state = list(mean = 0, sd = 1) 49 | #' state 50 | #' } 51 | #' 52 | #' env = makeEnvironment(step = step, reset = reset, discount = 0.9) 53 | #' env$reset() 54 | #' env$step(100) 55 | #' 56 | #' # Create a Markov Decision Process. 57 | #' P = array(0, c(2, 2, 2)) 58 | #' P[, , 1] = matrix(c(0.5, 0.5, 0, 1), 2, 2, byrow = TRUE) 59 | #' P[, , 2] = matrix(c(0, 1, 0, 1), 2, 2, byrow = TRUE) 60 | #' R = matrix(c(5, 10, -1, 2), 2, 2, byrow = TRUE) 61 | #' env = makeEnvironment("mdp", transitions = P, rewards = R) 62 | #' 63 | #' env$reset() 64 | #' env$step(1L) 65 | #' 66 | #' # Create a Gridworld. 
67 | #' grid = makeEnvironment("gridworld", shape = c(4, 4), 68 | #' goal.states = 15, initial.state = 0) 69 | #' grid$visualize() 70 | #' 71 | #' \dontrun{ 72 | #' # Create an OpenAI Gym environment. 73 | #' # Make sure you have Python, gym and reticulate installed. 74 | #' env = makeEnvironment("gym", gym.name = "MountainCar-v0") 75 | #' 76 | #' # Take random actions for 200 steps. 77 | #' env$reset() 78 | #' for (i in 1:200) { 79 | #' action = sample(env$actions, 1) 80 | #' env$step(action) 81 | #' env$visualize() 82 | #' } 83 | #' env$close() 84 | #' } 85 | makeEnvironment = function(class = "custom", discount = 1, ...) { 86 | checkmate::assertChoice(class, 87 | c("custom", "mdp", "gym", "gridworld", "windy.gridworld", "cliff.walking", 88 | "mountain.car", "mountain.car.continuous")) 89 | switch(class, 90 | custom = Environment$new(discount = discount, ...), # default 91 | mdp = MdpEnvironment$new(discount = discount, ...), 92 | gym = GymEnvironment$new(discount = discount, ...), 93 | gridworld = Gridworld$new(discount = discount, ...), 94 | windy.gridworld = WindyGridworld$new(discount = discount, ...), 95 | cliff.walking = CliffWalking$new(discount = discount, ...), 96 | mountain.car = MountainCar$new(discount = discount, ...), 97 | mountain.car.continuous = MountainCarContinuous$new(discount = discount, ...) 98 | ) 99 | } 100 | 101 | #' Custom Reinforcement Learning Environment 102 | #' 103 | #' @section Usage: 104 | #' `makeEnvironment("custom", step, reset, visualize = NULL, discount = 1, action.names = NULL)` 105 | #' 106 | #' @param step \[`function(self, action)`] \cr 107 | #' Custom step function. 108 | #' @param reset \[`function(self)`] \cr 109 | #' Custom reset function. 110 | #' @param visualize \[`function(self)`] \cr 111 | #' Optional custom visualization function. 112 | #' @param discount \[`numeric(1)` in (0, 1)] \cr Discount factor. 113 | #' @param action.names \[`named integer`] \cr 114 | #' Optional action names for a discrete action space. 
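#'
#' As an illustrative sketch (not from the original documentation): with a
#' named integer vector such as `action.names = c(left = 0L, right = 1L)`,
#' actions can also be passed by name, e.g. `env$step("left")`, which `step()`
#' translates back to the underlying integer action `0L`.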
115 | #' 116 | #' @md 117 | #' 118 | #' @inheritSection makeEnvironment Methods 119 | #' 120 | #' @name Environment 121 | #' @export 122 | #' 123 | #' @examples 124 | #' step = function(self, action) { 125 | #' state = list(mean = action + rnorm(1), sd = runif(1)) 126 | #' reward = rnorm(1, state[[1]], state[[2]]) 127 | #' done = FALSE 128 | #' list(state, reward, done) 129 | #' } 130 | #' 131 | #' reset = function(self) { 132 | #' state = list(mean = 0, sd = 1) 133 | #' state 134 | #' } 135 | #' 136 | #' env = makeEnvironment(step = step, reset = reset) 137 | #' env$reset() 138 | #' env$step(100) 139 | NULL 140 | 141 | Environment = R6::R6Class("Environment", 142 | public = list( 143 | action.names = NULL, 144 | n.step = 0L, 145 | episode = 0L, 146 | episode.step = 0L, 147 | episode.return = 0, 148 | previous.state = NULL, 149 | state = NULL, 150 | reward = NULL, 151 | done = FALSE, 152 | discount = NULL, 153 | 154 | resetEverything = function() { 155 | self$n.step = 0L 156 | self$episode = 0 157 | self$reset() 158 | }, 159 | 160 | reset = function() { 161 | self$episode.step = 0L 162 | self$episode.return = 0 163 | self$done = FALSE 164 | self$state = private$reset_(self) 165 | self$state 166 | }, 167 | 168 | step = function(action) { 169 | if (is.character(action)) { 170 | action = self$action.names[action] 171 | } 172 | self$previous.state = self$state 173 | res = private$step_(self, action) 174 | self$episode.return = self$episode.return + 175 | self$discount ^ self$episode.step * res[[2]] 176 | self$n.step = self$n.step + 1L 177 | self$episode.step = self$episode.step + 1L 178 | self$state = res[[1]] 179 | self$reward = res[[2]] 180 | self$done = res[[3]] 181 | if (self$done) { 182 | self$episode = self$episode + 1L 183 | } 184 | list(state = res[[1]], reward = res[[2]], done = res[[3]]) 185 | }, 186 | 187 | visualize = function() { 188 | private$visualize_(self) 189 | }, 190 | 191 | initialize = function(step, reset, visualize = NULL, discount, action.names = NULL) { 192 | checkmate::assertFunction(step) 193 | checkmate::assertFunction(reset) 194 | checkmate::assertFunction(visualize, null.ok = TRUE) 195 | checkmate::assertNumber(discount, lower = 0, upper = 1) 196 | checkmate::assertIntegerish(action.names, null.ok = TRUE) 197 | 198 | private$step_ = step 199 | private$reset_ = reset 200 | self$discount = discount 201 | self$action.names = action.names 202 | if (!missing(visualize)) { 203 | checkmate::assertFunction(visualize) 204 | private$visualize_ = visualize 205 | } else { 206 | private$visualize_ = function(self) {} 207 | } 208 | self$reset() 209 | } 210 | ), 211 | 212 | private = list( 213 | # step_: custom step method depending on problem that returns list with 214 | # next state, reward, done 215 | step_ = NULL, 216 | # reset_: custom reset method depending on problem that returns state 217 | reset_ = NULL, 218 | visualize_ = NULL 219 | ) 220 | ) 221 | -------------------------------------------------------------------------------- /examples/user_interface.R: -------------------------------------------------------------------------------- 1 | #' #' --- 2 | #' #' title: "User interface" 3 | #' #' author: Markus Dumke 4 | #' #' output: github_document 5 | #' #' --- 6 | #' 7 | #' #+ setup, include=FALSE 8 | #' library(knitr) 9 | #' opts_chunk$set(comment = "#>", collapse = FALSE, message = FALSE) 10 | #' 11 | #' library(reinforcelearn) 12 | #' 13 | #' env = makeEnvironment("windy.gridworld") 14 | #' 15 | #' # policy without val.fun or algorithm 16 | #' agent = 
makeAgent("random") 17 | #' interact(env, agent, n.steps = 10L) 18 | #' 19 | #' # policy with val.fun, without algorithm 20 | #' agent = makeAgent("softmax", "table") 21 | #' interact(env, agent, n.steps = 10L) 22 | #' 23 | #' # policy, table, qlearning 24 | #' agent = makeAgent("softmax", "table", "qlearning") 25 | #' interact(env, agent, n.steps = 10L) 26 | #' 27 | #' # policy, table, qlearning, eligibility 28 | #' alg = makeAlgorithm("qlearning", lambda = 0.8, traces = "accumulate") 29 | #' agent = makeAgent("softmax", "table", alg) 30 | #' interact(env, agent, n.steps = 10L) 31 | #' 32 | #' # policy, table, qlearning, exp.replay 33 | #' mem = makeReplayMemory(size = 5, batch.size = 5) 34 | #' agent = makeAgent("softmax", "table", "qlearning", experience.replay = mem) 35 | #' interact(env, agent, n.steps = 10L) 36 | #' 37 | #' # policy, neuralnet, qlearning 38 | #' library(keras) 39 | #' model = keras_model_sequential() %>% 40 | #' layer_dense(units = env$n.actions, activation = "linear", 41 | #' input_shape = c(env$n.states), kernel_initializer = initializer_zeros(), 42 | #' use_bias = FALSE) %>% 43 | #' compile(loss = "mae", optimizer = optimizer_sgd(lr = 1)) 44 | #' val = makeValueFunction("neural.network", model = model) 45 | #' preprocess = function(x) to_categorical(x, num_classes = env$n.states) 46 | #' agent = makeAgent("softmax", val, "qlearning", preprocess = preprocess) 47 | #' 48 | #' # policy, neuralnet, qlearning, exp. replay 49 | #' 50 | #' 51 | #' 52 | #' 53 | #' 54 | #' 55 | #' 56 | #' 57 | #' 58 | #' 59 | #' 60 | #' 61 | #' 62 | #' # 63 | #' # 64 | #' # 65 | #' # 66 | #' # # run random policy without learning 67 | #' # env = makeEnvironment("gridworld", shape = c(4, 4), 68 | #' # goal.states = 0L, initial.state = 15L, discount = 0.99) 69 | #' # policy = makePolicy("random") 70 | #' # agent = makeAgent(policy) 71 | #' # interact(env, agent, n.steps = 200L) 72 | #' # 73 | #' # # qlearning table 74 | #' # env = makeEnvironment("gridworld", shape = c(4, 4), 75 | #' # goal.states = c(0, 15), initial.state = 1:14, discount = 1) 76 | #' # val = makeValueFunction("table", n.states = env$n.states, n.actions = env$n.actions) 77 | #' # alg = makeAlgorithm("qlearning") 78 | #' # agent = makeAgent(policy, val, alg) 79 | #' # interact(env, agent, n.episodes = 50L) # fail 80 | #' # getStateValues(agent$val.fun$Q) 81 | #' # 82 | #' # # qlearning simple 83 | #' # env = makeEnvironment("windy.gridworld") 84 | #' # val = makeValueFunction("table", n.states = env$n.states, n.actions = env$n.actions) 85 | #' # policy = makePolicy("epsilon.greedy", epsilon = 0.1) 86 | #' # alg = makeAlgorithm("qlearning") 87 | #' # agent = makeAgent(policy, val, alg) 88 | #' # interact(env, agent, n.episodes = 100L) 89 | #' # 90 | #' # # sarsa simple 91 | #' # env = makeEnvironment("windy.gridworld") 92 | #' # val = makeValueFunction("table", n.states = env$n.states, n.actions = env$n.actions) 93 | #' # policy = makePolicy("epsilon.greedy", epsilon = 0.1) 94 | #' # alg = makeAlgorithm("sarsa") 95 | #' # agent = makeAgent(policy, val, alg) 96 | #' # interact(env, agent, n.episodes = 100L) 97 | #' # 98 | #' # # sarsa simple with softmax policy 99 | #' # env = makeEnvironment("windy.gridworld") 100 | #' # val = makeValueFunction("table", n.states = env$n.states, n.actions = env$n.actions) 101 | #' # policy = makePolicy("softmax") 102 | #' # alg = makeAlgorithm("sarsa") 103 | #' # agent = makeAgent(policy, val, alg) 104 | #' # interact(env, agent, n.episodes = 100L) 105 | #' # 106 | #' # # qlearning eligibility 
traces 107 | #' # env = makeEnvironment("windy.gridworld") 108 | #' # val = makeValueFunction("table", n.states = env$n.states, n.actions = env$n.actions) 109 | #' # policy = makePolicy("epsilon.greedy", epsilon = 0.1) 110 | #' # alg = makeAlgorithm("qlearning", lambda = 0.9, traces = "accumulate") 111 | #' # agent = makeAgent(policy, val, alg) 112 | #' # interact(env, agent, n.episodes = 100L) 113 | #' # 114 | #' # # character arguments 115 | #' # env = makeEnvironment("windy.gridworld") 116 | #' # agent = makeAgent("softmax", "table", "qlearning") 117 | #' # interact(env, agent, n.episodes = 10L) 118 | #' # 119 | #' # env = makeEnvironment("windy.gridworld") 120 | #' # alg = makeAlgorithm("qlearning", lambda = 0.9, traces = "replace") 121 | #' # agent = makeAgent("softmax", "table", alg) 122 | #' # interact(env, agent, n.episodes = 10L) 123 | #' # 124 | #' # # qlearning experience replay 125 | #' # env = makeEnvironment("windy.gridworld") 126 | #' # policy = makePolicy("epsilon.greedy", epsilon = 0.1) 127 | #' # replay = makeReplayMemory(size = 200L, batch.size = 150L) 128 | #' # agent = makeAgent(policy, "table", "qlearning", experience.replay = replay) # a bit slow 129 | #' # interact(env, agent, n.episodes = 100L) 130 | #' # 131 | #' # # exp replay train every 10 steps 132 | #' # env = makeEnvironment("windy.gridworld") 133 | #' # policy = makePolicy("epsilon.greedy", epsilon = 0.1) 134 | #' # replay = makeReplayMemory(size = 100L, batch.size = 100L) 135 | #' # agent = makeAgent(policy, "table", "qlearning", experience.replay = replay) # a bit slow 136 | #' # for (i in 1:10000) { 137 | #' # interact(env, agent, n.steps = 100L, learn = FALSE) 138 | #' # interact(env, agent, n.steps = 1L, learn = TRUE) 139 | #' # } 140 | #' # 141 | #' # 142 | #' # # keras neural network 143 | #' # env = makeEnvironment("windy.gridworld") 144 | #' # library(keras) 145 | #' # model = keras_model_sequential() 146 | #' # # "input_shape" parameter for layer_dense should be c(batchsize(None), input_dim), dim in keras is row major 147 | #' # model %>% 148 | #' # layer_dense(units = env$n.actions, activation = "linear", input_shape = c(env$n.states), 149 | #' # kernel_initializer = initializer_zeros(), use_bias = FALSE) 150 | #' # #layer_dense(units = env$n.actions, activation = "linear") 151 | #' # model$compile(loss = "mae", optimizer = optimizer_sgd(lr = 1)) 152 | #' # val = makeValueFunction("neural.network", model = model) 153 | #' # replay = makeReplayMemory(size = 100L, batch.size = 10L) 154 | #' # preprocess = function(x) to_categorical(x, num_classes = env$n.states) 155 | #' # agent = makeAgent("softmax", val, "qlearning", 156 | #' # preprocess = preprocess, experience.replay = replay) 157 | #' # for (i in 1:100) { 158 | #' # interact(env, agent, n.steps = 10L, learn = FALSE, max.steps.per.episode = 100L) 159 | #' # interact(env, agent, n.steps = 1L, learn = TRUE, max.steps.per.episode = 100L) 160 | #' # } 161 | #' # agent$val.fun$model %>% get_weights() 162 | #' # 163 | #' # # solve mountain car with exp replay 164 | #' # m = makeEnvironment("gym", "MountainCar-v0") 165 | #' # library(keras) 166 | #' # model = keras_model_sequential() 167 | #' # # "input_shape" parameter for layer_dense should be c(batchsize(None), input_dim), dim in keras is row major 168 | #' # model %>% 169 | #' # layer_dense(units = 64L, activation = 'relu', input_shape = c(2L)) %>% 170 | #' # layer_dense(units = 3L, activation = 'linear') 171 | #' # model$compile(loss = 'mse', optimizer = optimizer_rmsprop(lr = 0.0025)) 172 | #' 
# val = makeValueFunction("neural.network", model = model) 173 | #' # replay = makeReplayMemory(size = 100L, batch.size = 10L) 174 | #' # preprocess = function(x) matrix(x, ncol = 2) 175 | #' # agent = makeAgent("softmax", val, "qlearning", 176 | #' # preprocess = preprocess, experience.replay = replay) 177 | #' # for (i in 1:1000) { 178 | #' # interact(m, agent, n.steps = 10L, learn = FALSE) 179 | #' # interact(m, agent, n.steps = 1L, learn = TRUE) 180 | #' # } 181 | #' # #agent$val.fun$model %>% get_weights() 182 | -------------------------------------------------------------------------------- /vignettes/environments.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Environments" 3 | author: Markus Dumke 4 | date: "`r Sys.Date()`" 5 | output: rmarkdown::html_vignette: 6 | fig_caption: yes 7 | bibliography: references.bib 8 | vignette: > 9 | %\VignetteIndexEntry{Environments} 10 | %\VignetteEngine{knitr::rmarkdown} 11 | %\VignetteEncoding{UTF-8} 12 | --- 13 | 14 | 19 | 20 | ```{r setup, include=FALSE} 21 | knitr::opts_chunk$set(message = TRUE, eval = TRUE, collapse = TRUE, comment = "#>") 22 | ``` 23 | 24 | This vignette explains the different ways to create and use a reinforcement learning environment in `reinforcelearn`. Section [Creation](#creation) explains how to create an environment and Section [Interaction](#interaction) describes how to use the created environment object for interaction. 25 | 26 | ```{r} 27 | library(reinforcelearn) 28 | ``` 29 | 30 | ## Creation 31 | 32 | The `makeEnvironment` function provides different ways to create an environment. 33 | It is called with the class name as the first argument. You can pass arguments of the specific environment class (e.g. the state transition array for an MDP) to the `...` argument. 34 | 35 | ### Create a custom environment 36 | 37 | To create a custom environment you have to set up a `step` and `reset` function, which define the rewards the agent receives and ultimately what the agent should learn. 38 | 39 | Here is an example setting up the famous Mountain Car problem. 40 | 41 | ```{r, out.width = "200px", fig.align="center", echo = FALSE} 42 | knitr::include_graphics("mountaincar.JPG") 43 | ``` 44 | 45 | The task of the `reset` function is to initialize the starting state of the environment; it is usually called when starting a new episode. It returns the `state` of the environment. It takes an argument `self`, which is the newly created R6 class object and can be used e.g. to access the current state of the environment. 46 | 47 | ```{r} 48 | reset = function(self) { 49 | position = runif(1, -0.6, -0.4) 50 | velocity = 0 51 | state = matrix(c(position, velocity), ncol = 2) 52 | state 53 | } 54 | ``` 55 | 56 | The `step` function is used for interaction; it controls the transition to the next state and the reward given an action. It takes `self` and `action` as arguments and returns a list with the next `state`, `reward` and whether an episode is finished (`done`).
57 | 58 | ```{r} 59 | step = function(self, action) { 60 | position = self$state[1] 61 | velocity = self$state[2] 62 | velocity = velocity + (action - 1L) * 0.001 + cos(3 * position) * (-0.0025) 63 | velocity = min(max(velocity, -0.07), 0.07) 64 | position = position + velocity 65 | if (position < -1.2) { 66 | position = -1.2 67 | velocity = 0 68 | } 69 | state = matrix(c(position, velocity), ncol = 2) 70 | reward = -1 71 | if (position >= 0.5) { 72 | done = TRUE 73 | reward = 0 74 | } else { 75 | done = FALSE 76 | } 77 | list(state, reward, done) 78 | } 79 | ``` 80 | 81 | Then we can create the environment with 82 | 83 | ```{r} 84 | env = makeEnvironment(step = step, reset = reset) 85 | ``` 86 | 87 | --- 88 | 89 | ### OpenAI Gym 90 | 91 | OpenAI Gym [@gym_openai] provides a set of environments, which can be used for benchmarking. 92 | 93 | To use a gym environment you have to install 94 | 95 | * Python 96 | * `gym` (Python package, installation instructions here: https://github.com/openai/gym#installation) 97 | * `reticulate` (R package) 98 | 99 | Then you can create a gym environment by passing the name of the environment. 100 | 101 | ```{r, eval = FALSE} 102 | # Create a gym environment. 103 | env = makeEnvironment("gym", gym.name = "MountainCar-v0") 104 | ``` 105 | 106 | Have a look at [https://gym.openai.com/envs](https://gym.openai.com/envs) for possible environments. 107 | 108 | --- 109 | 110 | ### Markov Decision Process 111 | 112 | A Markov Decision Process (MDP) is a stochastic process which is commonly used to model reinforcement learning environments. 113 | When the problem can be formulated as an MDP, all you need to pass to `makeEnvironment` is the state transition array $P^a_{ss'}$ and reward matrix $R_s^a$ of the MDP. 114 | 115 | We can create a simple MDP with 2 states and 2 actions with the following code. 116 | 117 | ```{r} 118 | # State transition array 119 | P = array(0, c(2, 2, 2)) 120 | P[, , 1] = matrix(c(0.5, 0.5, 0, 1), 2, 2, byrow = TRUE) 121 | P[, , 2] = matrix(c(0.1, 0.9, 0, 1), 2, 2, byrow = TRUE) 122 | 123 | # Reward matrix 124 | R = matrix(c(5, 10, -1, 2), 2, 2, byrow = TRUE) 125 | 126 | env = makeEnvironment("mdp", transitions = P, rewards = R) 127 | ``` 128 | 129 | --- 130 | 131 | ### Gridworld 132 | 133 | A gridworld is a simple MDP navigation task with a discrete state and action space. The agent has to move through a grid from a start state to a goal state. Possible actions are the standard moves (left, right, up, down); they could also include the diagonal moves (leftup, leftdown, rightup, rightdown). 134 | 135 | Here is an example of a 4x4 gridworld [@sutton2017, Example 4.1] with two terminal states in the lower right and upper left of the grid. Rewards are -1 for every transition until reaching a terminal state. 136 | 137 | ```{r, out.width = "200px", fig.align="center", echo = FALSE} 138 | knitr::include_graphics("gridworld.JPG") 139 | ``` 140 | 141 | The following code creates this gridworld. 142 | 143 | ```{r} 144 | # Gridworld Environment (Sutton & Barto (2017) Example 4.1) 145 | env = makeEnvironment("gridworld", shape = c(4, 4), goal.states = c(0, 15)) 146 | ``` 147 | 148 | --- 149 | 150 | ## Interaction 151 | 152 | `makeEnvironment` returns an R6 class object which can be used for the interaction between agent and environment. 153 | 154 | ```{r} 155 | env = makeEnvironment("gridworld", shape = c(4, 4), 156 | goal.states = 0L, initial.state = 15L) 157 | ``` 158 | 159 | To take an action you can call the `step(action)` method.
It is called with an action as an argument and internally computes the next `state`, the `reward` and whether an episode is finished (`done`). 160 | 161 | ```{r} 162 | # The initial state of the environment. 163 | env$reset() 164 | 165 | env$visualize() 166 | 167 | # Actions are encoded as integers. 168 | env$step(0L) 169 | 170 | env$visualize() 171 | 172 | # But actions can also be passed as character names. 173 | env$step("left") 174 | 175 | env$visualize() 176 | ``` 177 | 178 | Note that the R6 class object changes whenever `step` or `reset` is called! Therefore calling `step` with the same action twice will most likely return different states and rewards! 179 | 180 | Note also that all discrete states and actions are numbered starting with 0 to be consistent with OpenAI Gym! 181 | 182 | The environment object often also contains information about the number of states and actions or the bounds in case of a continuous space. 183 | 184 | ```{r} 185 | env = makeEnvironment("mountain.car") 186 | env$n.actions 187 | env$state.space.bounds 188 | ``` 189 | 190 | It also keeps track of the number of interactions, i.e. the number of times `step` has been called, the number of steps in the current episode, the number of episodes and the return in the current episode. 191 | 192 | ```{r} 193 | env = makeEnvironment("gridworld", shape = c(4, 4), 194 | goal.states = 0L, initial.state = 15L, discount = 0.99) 195 | 196 | env$step("up") 197 | env$n.step 198 | env$episode.return 199 | 200 | env$step("left") 201 | env$n.step 202 | env$episode.return 203 | ``` 204 | 205 | --- 206 | 207 | ### Full list of attributes and methods: 208 | 209 | Here is a full list describing the attributes and methods of the `R6` class created by `makeEnvironment`. 210 | 211 | **Attributes**: 212 | 213 | - `state` [any]: The current state observation of the environment. Depending on the problem this can be anything, e.g. a scalar integer, a matrix or a list. 214 | 215 | - `reward` [numeric(1)]: The current reward of the environment. It is always a scalar numeric value. 216 | 217 | - `done` [logical(1)]: A logical flag specifying whether an episode is finished. 218 | 219 | - `discount` [numeric(1) in [0, 1]]: The discount factor. 220 | 221 | - `n.step` [integer(1)]: Number of steps, i.e. number of times `$step()` has been called. 222 | 223 | - `episode.step` [integer(1)]: Number of steps in the current episode. In comparison to `n.step` it will be reset to 0 when `reset` is called. Each time `step` is called it is increased by 1. 224 | 225 | - `episode.return` [numeric(1)]: The return in the current episode. Each time `step` is called the discounted `reward` is added. Will be reset to 0 when `reset` is called. 226 | 227 | - `previous.state` [any]: The previous state of the environment. This is often the state which is updated in a reinforcement learning algorithm. 228 | 229 | **Methods**: 230 | 231 | - `reset()`: Resets the environment, i.e. it sets the `state` attribute to a starting state and sets the `done` flag to `FALSE`. It is usually called at the beginning of an episode. 232 | 233 | - `step(action)`: The interaction function between agent and environment. `step` is called with an action as an argument. It then takes the action, internally computes the next state, the reward and whether an episode is finished, and returns a list with `state`, `reward` and `done`. 234 | 235 | - `visualize()`: Visualizes the current state of the environment.
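To see how these attributes and methods play together, here is a minimal sketch of a hand-written interaction loop: it takes random actions in the gridworld until the episode is finished (capped at 100 steps) and then inspects the counters. It assumes the gridworld exposes `n.actions` like the environments shown above; in practice the `interact` function runs loops like this for you.

```{r, eval = FALSE}
env = makeEnvironment("gridworld", shape = c(4, 4),
  goal.states = 0L, initial.state = 15L, discount = 0.99)

env$reset()
while (!env$done && env$episode.step < 100L) {
  # Sample one of the available actions (actions are numbered starting with 0).
  action = sample(0:(env$n.actions - 1L), 1L)
  env$step(action)
}

# Inspect the counters documented above.
env$episode.step
env$episode.return
```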
236 | 237 | --- 238 | ### References 239 | 240 | -------------------------------------------------------------------------------- /docs/reference/index.html: --------------------------------------------------------------------------------
Function reference • reinforcelearn

Package help page
    reinforcelearn: Reinforcement Learning.

Environments (Creation of reinforcement learning environments.)
    makeEnvironment: Create reinforcement learning environment.
    Environment: Custom Reinforcement Learning Environment
    GymEnvironment: Gym Environment
    MdpEnvironment: MDP Environment
    Gridworld: Gridworld
    CliffWalking: Cliff Walking
    WindyGridworld: Windy Gridworld
    MountainCar: Mountain Car

Policies
    makePolicy: Create policy.
    RandomPolicy: Random Policy
    EpsilonGreedyPolicy: Epsilon Greedy Policy
    SoftmaxPolicy: Softmax Policy

Value Function Representations
    makeValueFunction: Value Function Representation
    ValueTable: Value Table
    ValueNetwork: Value Network

Algorithms
    makeAlgorithm: Make reinforcement learning algorithm.
    QLearning: Q-Learning

Agent
    makeAgent: Create Agent.

Interaction
    interact: Interaction between agent and environment.

Helper functions
    makeReplayMemory: Experience Replay
    getReplayMemory: Get replay memory.
    Eligibility: Eligibility traces
    getEligibilityTraces: Get eligibility traces
    getValueFunction: Get weights of value function.
    getStateValues: Get state values.
    tiles iht: Tile Coding
    nHot: Make n hot vector.
--------------------------------------------------------------------------------