├── codecov.yml ├── LICENSE ├── .gitignore ├── man ├── figures │ └── logo.png ├── RandomPolicy.Rd ├── SoftmaxPolicy.Rd ├── getStateValues.Rd ├── getEligibilityTraces.Rd ├── getReplayMemory.Rd ├── getValueFunction.Rd ├── EpsilonGreedyPolicy.Rd ├── Eligibility.Rd ├── nHot.Rd ├── QLearning.Rd ├── makeReplayMemory.Rd ├── ValueNetwork.Rd ├── makeAlgorithm.Rd ├── ValueTable.Rd ├── interact.Rd ├── makePolicy.Rd ├── MountainCar.Rd ├── makeValueFunction.Rd ├── GymEnvironment.Rd ├── makeAgent.Rd ├── reinforcelearn.Rd ├── Environment.Rd ├── MdpEnvironment.Rd ├── CliffWalking.Rd ├── windyGridworld.Rd ├── tilecoding.Rd ├── makeEnvironment.Rd └── gridworld.Rd ├── docs ├── reinforcelearn.png ├── reference │ ├── nHot.html │ ├── interact.html │ ├── QLearning.html │ ├── ValueTable.html │ ├── gridworld.html │ ├── makeAgent.html │ ├── makePolicy.html │ ├── tilecoding.html │ ├── CliffWalking.html │ ├── Eligibility.html │ ├── Environment.html │ ├── RandomPolicy.html │ ├── SoftmaxPolicy.html │ ├── ValueNetwork.html │ ├── figures │ │ └── logo.png │ ├── makeAlgorithm.html │ ├── mountainCar.html │ ├── GymEnvironment.html │ ├── MdpEnvironment.html │ ├── getReplayMemory.html │ ├── getStateValues.html │ ├── makeEnvironment.html │ ├── reinforcelearn.html │ ├── windyGridworld.html │ ├── getValueFunction.html │ ├── makeReplayMemory.html │ ├── makeValueFunction.html │ ├── EpsilonGreedyPolicy.html │ ├── getEligibilityTraces.html │ └── index.html ├── articles │ ├── gridworld.JPG │ ├── mountaincar.JPG │ ├── environments.R │ ├── agents.R │ └── index.html ├── pkgdown.yml ├── link.svg ├── pkgdown.js ├── jquery.sticky-kit.min.js ├── session_info.txt ├── pkgdown.css ├── authors.html ├── news │ └── index.html └── LICENSE.html ├── tests ├── testthat.R └── testthat │ ├── test_policy.R │ ├── test_accessor_functions.R │ ├── test_environment.R │ └── test_agent.R ├── vignettes ├── gridworld.JPG ├── mountaincar.JPG ├── environments.R ├── agents.R ├── agents.Rmd └── environments.Rmd ├── benchmark ├── Images │ ├── qlearning_windygrid-1.png │ ├── qlearning_windygrid_elig-1.png │ ├── qlearning_windygrid_expreplay-1.png │ └── qlearning_windygrid_neuralnetwork-1.png ├── benchmark_windy_gridworld.md └── benchmark_windy_gridworld.Rmd ├── NEWS.md ├── .Rbuildignore ├── cran-comments.md ├── .travis.yml ├── NAMESPACE ├── DESCRIPTION ├── R ├── reinforcelearn.R ├── eligibility.R ├── algorithm.R ├── accessor_functions.R ├── environment_mountaincar.R ├── environment_mdp.R ├── environment_gym.R ├── experience_replay.R ├── policy.R ├── interact.R ├── tiles.R ├── valuefunction.R └── environment.R ├── _pkgdown.yml ├── session_info.txt ├── README.Rmd ├── README.md └── examples └── user_interface.R /codecov.yml: -------------------------------------------------------------------------------- 1 | comment: false 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2018 2 | COPYRIGHT HOLDER: Markus Dumke -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | reinforcelearn.Rproj 6 | -------------------------------------------------------------------------------- /man/figures/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/man/figures/logo.png 
-------------------------------------------------------------------------------- /docs/reinforcelearn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reinforcelearn.png -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(reinforcelearn) 3 | 4 | test_check("reinforcelearn") 5 | -------------------------------------------------------------------------------- /vignettes/gridworld.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/vignettes/gridworld.JPG -------------------------------------------------------------------------------- /docs/reference/nHot.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/nHot.html -------------------------------------------------------------------------------- /vignettes/mountaincar.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/vignettes/mountaincar.JPG -------------------------------------------------------------------------------- /docs/articles/gridworld.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/articles/gridworld.JPG -------------------------------------------------------------------------------- /docs/reference/interact.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/interact.html -------------------------------------------------------------------------------- /docs/articles/mountaincar.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/articles/mountaincar.JPG -------------------------------------------------------------------------------- /docs/reference/QLearning.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/QLearning.html -------------------------------------------------------------------------------- /docs/reference/ValueTable.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/ValueTable.html -------------------------------------------------------------------------------- /docs/reference/gridworld.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/gridworld.html -------------------------------------------------------------------------------- /docs/reference/makeAgent.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/makeAgent.html -------------------------------------------------------------------------------- /docs/reference/makePolicy.html: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/makePolicy.html -------------------------------------------------------------------------------- /docs/reference/tilecoding.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/tilecoding.html -------------------------------------------------------------------------------- /docs/reference/CliffWalking.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/CliffWalking.html -------------------------------------------------------------------------------- /docs/reference/Eligibility.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/Eligibility.html -------------------------------------------------------------------------------- /docs/reference/Environment.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/Environment.html -------------------------------------------------------------------------------- /docs/reference/RandomPolicy.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/RandomPolicy.html -------------------------------------------------------------------------------- /docs/reference/SoftmaxPolicy.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/SoftmaxPolicy.html -------------------------------------------------------------------------------- /docs/reference/ValueNetwork.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/ValueNetwork.html -------------------------------------------------------------------------------- /docs/reference/figures/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/figures/logo.png -------------------------------------------------------------------------------- /docs/reference/makeAlgorithm.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/makeAlgorithm.html -------------------------------------------------------------------------------- /docs/reference/mountainCar.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/mountainCar.html -------------------------------------------------------------------------------- /docs/reference/GymEnvironment.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/GymEnvironment.html -------------------------------------------------------------------------------- /docs/reference/MdpEnvironment.html: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/MdpEnvironment.html -------------------------------------------------------------------------------- /docs/reference/getReplayMemory.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/getReplayMemory.html -------------------------------------------------------------------------------- /docs/reference/getStateValues.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/getStateValues.html -------------------------------------------------------------------------------- /docs/reference/makeEnvironment.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/makeEnvironment.html -------------------------------------------------------------------------------- /docs/reference/reinforcelearn.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/reinforcelearn.html -------------------------------------------------------------------------------- /docs/reference/windyGridworld.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/windyGridworld.html -------------------------------------------------------------------------------- /docs/reference/getValueFunction.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/getValueFunction.html -------------------------------------------------------------------------------- /docs/reference/makeReplayMemory.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/makeReplayMemory.html -------------------------------------------------------------------------------- /docs/reference/makeValueFunction.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/makeValueFunction.html -------------------------------------------------------------------------------- /docs/reference/EpsilonGreedyPolicy.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/EpsilonGreedyPolicy.html -------------------------------------------------------------------------------- /docs/reference/getEligibilityTraces.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/docs/reference/getEligibilityTraces.html -------------------------------------------------------------------------------- /benchmark/Images/qlearning_windygrid-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/benchmark/Images/qlearning_windygrid-1.png 
-------------------------------------------------------------------------------- /benchmark/Images/qlearning_windygrid_elig-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/benchmark/Images/qlearning_windygrid_elig-1.png -------------------------------------------------------------------------------- /benchmark/Images/qlearning_windygrid_expreplay-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/benchmark/Images/qlearning_windygrid_expreplay-1.png -------------------------------------------------------------------------------- /benchmark/Images/qlearning_windygrid_neuralnetwork-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markusdumke/reinforcelearn/HEAD/benchmark/Images/qlearning_windygrid_neuralnetwork-1.png -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # reinforcelearn 2 | 3 | # 0.2.0 4 | 5 | * Fixed failing tests due to random number generation in R 3.6.0. 6 | 7 | # 0.1.0 8 | 9 | * Initial release. 10 | * Added a `NEWS.md` file to track changes to the package. 11 | -------------------------------------------------------------------------------- /docs/pkgdown.yml: -------------------------------------------------------------------------------- 1 | urls: 2 | reference: http://markusdumke.github.io/reinforcelearn/reference 3 | article: http://markusdumke.github.io/reinforcelearn/articles 4 | articles: 5 | agents: agents.html 6 | environments: environments.html 7 | 8 | -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^\.travis\.yml$ 4 | ^README\.Rmd$ 5 | ^README-.*\.png$ 6 | ^cran-comments\.md$ 7 | ^codecov\.yml$ 8 | ^docs$ 9 | ^_pkgdown\.yml$ 10 | ^reinforcelearn\.png$ 11 | ^examples$ 12 | ^benchmark$ 13 | ^session_info.txt$ 14 | -------------------------------------------------------------------------------- /man/RandomPolicy.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/policy.R 3 | \name{RandomPolicy} 4 | \alias{RandomPolicy} 5 | \title{Random Policy} 6 | \description{ 7 | Random Policy 8 | } 9 | \section{Usage}{ 10 | 11 | \code{makePolicy("random")} 12 | } 13 | 14 | \examples{ 15 | pol = makePolicy("random") 16 | } 17 | -------------------------------------------------------------------------------- /man/SoftmaxPolicy.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/policy.R 3 | \name{SoftmaxPolicy} 4 | \alias{SoftmaxPolicy} 5 | \title{Softmax Policy} 6 | \description{ 7 | Softmax Policy 8 | } 9 | \section{Usage}{ 10 | 11 | \code{makePolicy("softmax")} 12 | } 13 | 14 | \examples{ 15 | pol = makePolicy("softmax") 16 | } 17 | -------------------------------------------------------------------------------- /tests/testthat/test_policy.R: -------------------------------------------------------------------------------- 1 | context("policy") 2 | policy1 = makePolicy("random") 
3 | policy2 = makePolicy("greedy") 4 | policy3 = makePolicy("epsilon.greedy", epsilon = 0.2) 5 | policy4 = makePolicy("softmax") 6 | test_that("policy creation returns list", { 7 | expect_equivalent(policy1, list(name = "random", args = list())) 8 | expect_equal(class(policy2), "Policy") 9 | expect_equal(policy3$args$epsilon, 0.2) 10 | }) 11 | -------------------------------------------------------------------------------- /man/getStateValues.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/accessor_functions.R 3 | \name{getStateValues} 4 | \alias{getStateValues} 5 | \title{Get state values.} 6 | \usage{ 7 | getStateValues(action.vals) 8 | } 9 | \arguments{ 10 | \item{action.vals}{[\code{matrix}] \cr Action value matrix.} 11 | } 12 | \description{ 13 | Get state value function from action value function. 14 | } 15 | -------------------------------------------------------------------------------- /man/getEligibilityTraces.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/accessor_functions.R 3 | \name{getEligibilityTraces} 4 | \alias{getEligibilityTraces} 5 | \title{Get eligibility traces} 6 | \usage{ 7 | getEligibilityTraces(agent) 8 | } 9 | \arguments{ 10 | \item{agent}{[Agent] \cr An agent created by \link{makeAgent}.} 11 | } 12 | \value{ 13 | A matrix with the eligibility traces. 14 | } 15 | \description{ 16 | Returns the eligibility traces of the agent. 17 | } 18 | -------------------------------------------------------------------------------- /man/getReplayMemory.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/accessor_functions.R 3 | \name{getReplayMemory} 4 | \alias{getReplayMemory} 5 | \title{Get replay memory.} 6 | \usage{ 7 | getReplayMemory(agent) 8 | } 9 | \arguments{ 10 | \item{agent}{[Agent] \cr An agent created by \link{makeAgent}.} 11 | } 12 | \value{ 13 | A list containing the experienced observations, actions and rewards. 14 | } 15 | \description{ 16 | Returns the replay memory of the agent. 17 | } 18 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | ## Resubmission 2 | This is a resubmission. In this version I have: 3 | 4 | * Fixed test that failed due to change in R random number generation in R 3.6.0. 5 | 6 | ## Test environments 7 | * local x86_64-w64-mingw32/x64 (64-bit), R 3.6.0 8 | * linux_gnu x_86_64, R Under development (unstable) (2017-12-22 r73943) on travis-ci 9 | 10 | ## R CMD check results 11 | 12 | 0 errors | 0 warnings | 0 notes 13 | 14 | ## Reverse dependencies 15 | 16 | There are no reverse dependencies. 
17 | 18 | --- 19 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # R for travis: see documentation at https://docs.travis-ci.com/user/languages/r 2 | 3 | language: R 4 | sudo: false 5 | cache: packages 6 | 7 | warnings_are_errors: true 8 | 9 | r: 10 | - devel 11 | 12 | before_install: 13 | - Rscript -e 'if (length(find.package("devtools", quiet = TRUE)) == 0) install.packages("devtools")' 14 | - Rscript -e 'devtools::install_deps()' 15 | 16 | after_success: 17 | - Rscript -e 'covr::codecov(type = "tests")' 18 | 19 | notifications: 20 | slack: 21 | on_success: change 22 | on_failure: change 23 | -------------------------------------------------------------------------------- /man/getValueFunction.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/accessor_functions.R 3 | \name{getValueFunction} 4 | \alias{getValueFunction} 5 | \title{Get weights of value function.} 6 | \usage{ 7 | getValueFunction(agent) 8 | } 9 | \arguments{ 10 | \item{agent}{[Agent] \cr An agent created by \link{makeAgent}.} 11 | } 12 | \value{ 13 | For a value function table this will return a matrix, for a neural 14 | network a list with the weights of the layers. 15 | } 16 | \description{ 17 | Returns the weights of the value function representation of the agent. 18 | } 19 | -------------------------------------------------------------------------------- /man/EpsilonGreedyPolicy.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/policy.R 3 | \name{EpsilonGreedyPolicy} 4 | \alias{EpsilonGreedyPolicy} 5 | \alias{GreedyPolicy} 6 | \title{Epsilon Greedy Policy} 7 | \arguments{ 8 | \item{epsilon}{[\code{numeric(1) in [0, 1]}] \cr 9 | Ratio of random exploration in epsilon-greedy action selection.} 10 | } 11 | \description{ 12 | Epsilon Greedy Policy 13 | } 14 | \section{Usage}{ 15 | 16 | \code{makePolicy("epsilon.greedy", epsilon = 0.1)} \cr 17 | \code{makePolicy("greedy")} 18 | } 19 | 20 | \examples{ 21 | policy = makePolicy("epsilon.greedy", epsilon = 0.1) 22 | } 23 | -------------------------------------------------------------------------------- /man/Eligibility.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/eligibility.R 3 | \name{Eligibility} 4 | \alias{Eligibility} 5 | \alias{eligibility} 6 | \title{Eligibility traces} 7 | \arguments{ 8 | \item{lambda}{[\code{numeric(1)} in (0, 1)] \cr Trace decay parameter.} 9 | 10 | \item{traces}{[\code{character(1)}] \cr Type of eligibility trace update. One of \code{c("replace", "accumulate")}.} 11 | } 12 | \description{ 13 | Eligibility traces. 
14 | } 15 | \details{ 16 | Algorithms supporting eligibility traces: 17 | \itemize{ 18 | \item \link{QLearning} 19 | } 20 | } 21 | \examples{ 22 | alg = makeAlgorithm("qlearning", lambda = 0.8, traces = "accumulate") 23 | } 24 | -------------------------------------------------------------------------------- /man/nHot.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tiles.R 3 | \name{nHot} 4 | \alias{nHot} 5 | \title{Make n hot vector.} 6 | \usage{ 7 | nHot(x, len, out = "matrix") 8 | } 9 | \arguments{ 10 | \item{x}{[\code{integer}] \cr Which features are active?} 11 | 12 | \item{len}{[\code{integer(1)}] \cr Length of the feature vector.} 13 | 14 | \item{out}{[\code{character(1)}] \cr Format of the output. Can be a vector or a matrix.} 15 | } 16 | \value{ 17 | [\code{matrix(1, len)}] A one-row matrix with \code{len} columns with every 18 | entry 0 except the columns specified by \code{x} which are 1. 19 | } 20 | \description{ 21 | Make n hot vector. 22 | } 23 | \examples{ 24 | nHot(c(1, 3), 5) 25 | nHot(c(1, 3), 5, out = "vector") 26 | } 27 | -------------------------------------------------------------------------------- /man/QLearning.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/algorithm.R 3 | \name{QLearning} 4 | \alias{QLearning} 5 | \alias{qlearning} 6 | \title{Q-Learning} 7 | \arguments{ 8 | \item{lambda}{[\code{numeric(1)} in (0, 1)] \cr Trace decay parameter.} 9 | 10 | \item{traces}{[\code{character(1)}] \cr Type of eligibility trace update. One of \code{c("replace", "accumulate")}.} 11 | } 12 | \description{ 13 | Q-Learning algorithm. 14 | } 15 | \details{ 16 | To use eligibility traces specify \code{lambda} and \code{traces}. 
17 | } 18 | \section{Usage}{ 19 | 20 | \code{makeAlgorithm("qlearning", lambda, traces)} 21 | } 22 | 23 | \examples{ 24 | alg = makeAlgorithm("qlearning", lambda = 0.8, traces = "accumulate") 25 | } 26 | \seealso{ 27 | \link{Eligibility} 28 | } 29 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(go,down) 4 | S3method(go,left) 5 | S3method(go,leftdown) 6 | S3method(go,leftup) 7 | S3method(go,right) 8 | S3method(go,rightdown) 9 | S3method(go,rightup) 10 | S3method(go,up) 11 | export(CliffWalking) 12 | export(Environment) 13 | export(EpsilonGreedyPolicy) 14 | export(Gridworld) 15 | export(GymEnvironment) 16 | export(MdpEnvironment) 17 | export(RandomPolicy) 18 | export(SoftmaxPolicy) 19 | export(WindyGridworld) 20 | export(getEligibilityTraces) 21 | export(getReplayMemory) 22 | export(getStateValues) 23 | export(getValueFunction) 24 | export(iht) 25 | export(interact) 26 | export(makeAgent) 27 | export(makeAlgorithm) 28 | export(makeEnvironment) 29 | export(makePolicy) 30 | export(makeReplayMemory) 31 | export(makeValueFunction) 32 | export(nHot) 33 | export(tiles) 34 | -------------------------------------------------------------------------------- /docs/link.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 8 | 12 | 13 | -------------------------------------------------------------------------------- /man/makeReplayMemory.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/experience_replay.R 3 | \name{makeReplayMemory} 4 | \alias{makeReplayMemory} 5 | \alias{experience.replay,} 6 | \alias{replay.memory} 7 | \title{Experience Replay} 8 | \usage{ 9 | makeReplayMemory(size = 100L, batch.size = 16L) 10 | } 11 | \arguments{ 12 | \item{size}{[\code{integer(1)}] \cr Size of replay memory.} 13 | 14 | \item{batch.size}{[\code{integer(1)}] \cr Batch size.} 15 | } 16 | \value{ 17 | [\code{list(size, batch.size)}] 18 | This list can then be passed onto \link{makeAgent}, which will construct the 19 | replay memory accordingly. 20 | } 21 | \description{ 22 | Create replay memory for experience replay. 23 | } 24 | \details{ 25 | Sampling from replay memory will be uniform. 26 | } 27 | \examples{ 28 | memory = makeReplayMemory(size = 100L, batch.size = 16L) 29 | } 30 | -------------------------------------------------------------------------------- /man/ValueNetwork.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/valuefunction.R 3 | \name{ValueNetwork} 4 | \alias{ValueNetwork} 5 | \alias{neural.network} 6 | \title{Value Network} 7 | \arguments{ 8 | \item{model}{[\code{keras model}] \cr A keras model. 9 | Make sure that the model has been compiled.} 10 | } 11 | \description{ 12 | Neural network representing the action value function Q. 
13 | } 14 | \section{Usage}{ 15 | 16 | \code{makeValueFunction("neural.network", model)} 17 | } 18 | 19 | \examples{ 20 | \dontrun{ 21 | library(keras) 22 | model = keras_model_sequential() 23 | model \%>\% layer_dense(20, input_shape = 10, activation = "relu") 24 | model \%>\% layer_dense(4, activation = "softmax") 25 | keras::compile(model, loss = "mae", optimizer = keras::optimizer_sgd(lr = 0.4)) 26 | 27 | val = makeValueFunction("neural.network", model = model) 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /man/makeAlgorithm.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/algorithm.R 3 | \name{makeAlgorithm} 4 | \alias{makeAlgorithm} 5 | \title{Make reinforcement learning algorithm.} 6 | \usage{ 7 | makeAlgorithm(class, args = list(), ...) 8 | } 9 | \arguments{ 10 | \item{class}{[\code{character(1)}] \cr Algorithm. One of \code{c("qlearning")}.} 11 | 12 | \item{args}{[\code{list}] \cr Optional list of named arguments passed on to the 13 | subclass. The arguments in ... take precedence over values in this list. 14 | We strongly encourage you to use one or the other to pass arguments 15 | to the function but not both.} 16 | 17 | \item{...}{[\code{any}] \cr Optional named arguments passed on to the subclass. Alternatively 18 | these can be given using the \code{args} argument.} 19 | } 20 | \description{ 21 | Make reinforcement learning algorithm. 22 | } 23 | \section{Representations}{ 24 | 25 | \itemize{ 26 | \item \link{QLearning} 27 | } 28 | } 29 | 30 | \examples{ 31 | alg = makeAlgorithm("qlearning") 32 | } 33 | -------------------------------------------------------------------------------- /tests/testthat/test_accessor_functions.R: -------------------------------------------------------------------------------- 1 | context("getValueFunction") 2 | val.fun = makeValueFunction("table", n.states = 8L, n.actions = 4L) 3 | agent = makeAgent("random", val.fun) 4 | 5 | test_that("getValueFunction returns action value function", { 6 | expect_equal(getValueFunction(agent), matrix(0, nrow = 8, ncol = 4)) 7 | }) 8 | 9 | test_that("getStateValues returns row max of action value function", { 10 | expect_equal(getStateValues(matrix(c(1, 2, 3, 4), ncol = 2)), c(3, 4)) 11 | }) 12 | 13 | set.seed(1) 14 | context("getReplayMemory") 15 | env = makeEnvironment("windy.gridworld") 16 | memory = makeReplayMemory(size = 2L, batch.size = 2L) 17 | agent = makeAgent("random", replay.memory = memory) 18 | interact(env, agent, n.steps = 2L) 19 | 20 | test_that("getReplayMemory returns list", { 21 | expect_equal(typeof(getReplayMemory(agent)), "list") 22 | expect_equal(getReplayMemory(agent), list(list(state = 30, action = 0, 23 | reward = -1, next.state = 30), list(state = 30, action = 2, reward = -1, 24 | next.state = 20))) 25 | }) 26 | 27 | context("getEligibilityTraces") 28 | -------------------------------------------------------------------------------- /man/ValueTable.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/valuefunction.R 3 | \name{ValueTable} 4 | \alias{ValueTable} 5 | \alias{table} 6 | \title{Value Table} 7 | \arguments{ 8 | \item{n.states}{[\code{integer(1)}] \cr Number of states (rows in the value function).} 9 | 10 | \item{n.actions}{[\code{integer(1)}] \cr Number of actions (columns in the value 
function).} 11 | 12 | \item{step.size}{[\code{numeric(1)}] \cr Step size (learning rate) for gradient descent update.} 13 | } 14 | \description{ 15 | Table representing the action value function Q. 16 | } 17 | \details{ 18 | You can specify the shape of the value table. If omitted the agent will try 19 | to configure these automatically from the environment during interaction 20 | (therefore the environment needs to have a \code{n.states} and \code{n.actions} attribute). 21 | } 22 | \section{Usage}{ 23 | 24 | \code{makeValueFunction("table", n.states = NULL, n.actions = 1L, step.size = 0.1, initial.value = NULL)} 25 | } 26 | 27 | \examples{ 28 | val = makeValueFunction("table", n.states = 20L, n.actions = 4L) 29 | } 30 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: reinforcelearn 2 | Type: Package 3 | Title: Reinforcement Learning 4 | Version: 0.2.0 5 | Authors@R: person("Markus", "Dumke", email = {"markusdumke@gmail.com"}, role = c("aut", "cre")) 6 | Description: Implements reinforcement learning environments and algorithms as described in Sutton & Barto (1998, ISBN:0262193981). 7 | The Q-Learning algorithm can be used with function approximation, 8 | eligibility traces (Singh & Sutton (1996) ) 9 | and experience replay (Mnih et al. (2013) ). 10 | License: MIT + file LICENSE 11 | Encoding: UTF-8 12 | LazyData: true 13 | Depends: R (>= 3.0.0) 14 | RoxygenNote: 6.1.1 15 | BugReports: https://github.com/markusdumke/reinforcelearn/issues 16 | URL: http://markusdumke.github.io/reinforcelearn 17 | SystemRequirements: (Python and gym only required if gym environments are used) 18 | Imports: 19 | checkmate (>= 1.8.4), 20 | R6 (>= 2.2.2), 21 | nnet (>= 7.3-12), 22 | purrr (>= 0.2.4) 23 | Suggests: 24 | reticulate, 25 | keras, 26 | knitr, 27 | rmarkdown, 28 | testthat, 29 | covr, 30 | lintr 31 | VignetteBuilder: knitr 32 | -------------------------------------------------------------------------------- /R/reinforcelearn.R: -------------------------------------------------------------------------------- 1 | #' Reinforcement Learning. 2 | #' 3 | #' Implementations of reinforcement learning algorithms and environments. 
4 | #' 5 | #' @md 6 | #' 7 | #' @section Environments: 8 | #' * [makeEnvironment] 9 | #' * [Environment] 10 | #' * [GymEnvironment] 11 | #' * [MdpEnvironment] 12 | #' * [Gridworld] 13 | #' * [WindyGridworld] 14 | #' * [CliffWalking] 15 | #' * [MountainCar] 16 | #' * [MountainCarContinuous] 17 | #' 18 | #' @section Policies: 19 | #' * [makePolicy] 20 | #' * [EpsilonGreedyPolicy] 21 | #' * [GreedyPolicy] 22 | #' * [SoftmaxPolicy] 23 | #' * [RandomPolicy] 24 | #' 25 | #' @section Value Function Representations: 26 | #' * [makeValueFunction] 27 | #' * [ValueTable] 28 | #' * [ValueNetwork] 29 | #' 30 | #' @section Algorithms: 31 | #' * [makeAlgorithm] 32 | #' * [QLearning] 33 | #' 34 | #' @section Extensions: 35 | #' * [makeReplayMemory] 36 | #' * [Eligibility] 37 | #' 38 | #' @section Agent: 39 | #' * [makeAgent] 40 | #' * [getValueFunction] 41 | #' * [getReplayMemory] 42 | #' * [getEligibilityTraces] 43 | #' 44 | #' @section Interaction: 45 | #' * [interact] 46 | #' 47 | #' @name reinforcelearn 48 | #' @aliases reinforcementlearning 49 | #' @docType package 50 | NULL 51 | -------------------------------------------------------------------------------- /docs/pkgdown.js: -------------------------------------------------------------------------------- 1 | $(function() { 2 | $("#sidebar").stick_in_parent({offset_top: 40}); 3 | $('body').scrollspy({ 4 | target: '#sidebar', 5 | offset: 60 6 | }); 7 | 8 | var cur_path = paths(location.pathname); 9 | $("#navbar ul li a").each(function(index, value) { 10 | if (value.text == "Home") 11 | return; 12 | if (value.getAttribute("href") === "#") 13 | return; 14 | 15 | var path = paths(value.pathname); 16 | if (is_prefix(cur_path, path)) { 17 | // Add class to parent
<li>, and enclosing
  • if in dropdown 18 | var menu_anchor = $(value); 19 | menu_anchor.parent().addClass("active"); 20 | menu_anchor.closest("li.dropdown").addClass("active"); 21 | } 22 | }); 23 | }); 24 | 25 | function paths(pathname) { 26 | var pieces = pathname.split("/"); 27 | pieces.shift(); // always starts with / 28 | 29 | var end = pieces[pieces.length - 1]; 30 | if (end === "index.html" || end === "") 31 | pieces.pop(); 32 | return(pieces); 33 | } 34 | 35 | function is_prefix(needle, haystack) { 36 | if (needle.length > haystack.lengh) 37 | return(false); 38 | 39 | for (var i = 0; i < haystack.length; i++) { 40 | if (needle[i] != haystack[i]) 41 | return(false); 42 | } 43 | 44 | return(true); 45 | } 46 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | url: http://markusdumke.github.io/reinforcelearn 2 | 3 | template: 4 | params: 5 | bootswatch: cerulean 6 | 7 | reference: 8 | - title: Package help page 9 | contents: 10 | - reinforcelearn 11 | - title: Environments 12 | desc: Creation of reinforcement learning environments. 13 | contents: 14 | - makeEnvironment 15 | - Environment 16 | - GymEnvironment 17 | - MdpEnvironment 18 | - Gridworld 19 | - CliffWalking 20 | - WindyGridworld 21 | - MountainCar 22 | - MountainCarContinuous 23 | - title: Policies 24 | contents: 25 | - makePolicy 26 | - RandomPolicy 27 | - GreedyPolicy 28 | - EpsilonGreedyPolicy 29 | - SoftmaxPolicy 30 | - title: Value Function Representations 31 | contents: 32 | - makeValueFunction 33 | - ValueTable 34 | - ValueNetwork 35 | - title: Algorithms 36 | contents: 37 | - makeAlgorithm 38 | - QLearning 39 | - title: Agent 40 | contents: 41 | - makeAgent 42 | - title: Interaction 43 | contents: 44 | - interact 45 | - title: Helper functions 46 | contents: 47 | - makeReplayMemory 48 | - getReplayMemory 49 | - Eligibility 50 | - getEligibilityTraces 51 | - getValueFunction 52 | - getStateValues 53 | - tiles 54 | - iht 55 | - nHot 56 | 57 | -------------------------------------------------------------------------------- /R/eligibility.R: -------------------------------------------------------------------------------- 1 | #' Eligibility traces 2 | #' 3 | #' Eligibility traces. 4 | #' 5 | #' Algorithms supporting eligibility traces: 6 | #' * [QLearning] 7 | #' 8 | #' @param lambda \[`numeric(1)` in (0, 1)] \cr Trace decay parameter. 9 | #' @param traces \[`character(1)`] \cr Type of eligibility trace update. One of `c("replace", "accumulate")`. 
10 | #' 11 | #' @name Eligibility 12 | #' @md 13 | #' 14 | #' @aliases eligibility 15 | #' 16 | #' @examples 17 | #' alg = makeAlgorithm("qlearning", lambda = 0.8, traces = "accumulate") 18 | NULL 19 | 20 | Eligibility = R6::R6Class("Eligibility", 21 | public = list( 22 | lambda = 0, 23 | eligibility.type = NULL, 24 | E = NULL, 25 | initialize = function(lambda = 0, traces = "accumulate") { 26 | self$lambda = lambda 27 | if (traces == "replace") { 28 | self$eligibility.type = 1 29 | } else if (traces == "accumulate") { 30 | self$eligibility.type = 0 31 | } 32 | }, 33 | reset = function(val.fun) { 34 | self$E = matrix(0, nrow = nrow(val.fun), ncol = ncol(val.fun)) 35 | }, 36 | increase = function(s, a) { 37 | self$E[s + 1L, a + 1L] = (1 - self$eligibility.type) * self$E[s + 1L, a + 1L] + 1 38 | }, 39 | decrease = function(discount) { 40 | self$E = discount * self$lambda * self$E # sarsa 41 | } 42 | ) 43 | ) 44 | -------------------------------------------------------------------------------- /man/interact.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/interact.R 3 | \name{interact} 4 | \alias{interact} 5 | \title{Interaction between agent and environment.} 6 | \usage{ 7 | interact(env, agent, n.steps = Inf, n.episodes = Inf, 8 | max.steps.per.episode = Inf, learn = TRUE, visualize = FALSE) 9 | } 10 | \arguments{ 11 | \item{env}{[\code{Environment}] \cr Reinforcement learning environment created by \link{makeEnvironment}.} 12 | 13 | \item{agent}{[\code{Agent}] \cr Agent created by \link{makeAgent}.} 14 | 15 | \item{n.steps}{[\code{integer(1)}] \cr Number of steps to run.} 16 | 17 | \item{n.episodes}{[\code{integer(1)}] \cr Number of episodes to run.} 18 | 19 | \item{max.steps.per.episode}{[\code{integer(1)}] \cr Maximal number of steps allowed per episode.} 20 | 21 | \item{learn}{[\code{logical(1)}] \cr Should the agent learn?} 22 | 23 | \item{visualize}{[\code{logical(1)}] \cr Visualize the interaction between agent and environment?} 24 | } 25 | \value{ 26 | [\code{list}] Return and number of steps per episode. 27 | } 28 | \description{ 29 | Run interaction between agent and environment for specified number of steps 30 | or episodes. 31 | } 32 | \examples{ 33 | env = makeEnvironment("windy.gridworld") 34 | agent = makeAgent("softmax", "table", "qlearning") 35 | interact(env, agent, n.episodes = 10L) 36 | } 37 | -------------------------------------------------------------------------------- /man/makePolicy.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/policy.R 3 | \name{makePolicy} 4 | \alias{makePolicy} 5 | \alias{Policy} 6 | \title{Create policy.} 7 | \usage{ 8 | makePolicy(class = "random", args = list(), ...) 9 | } 10 | \arguments{ 11 | \item{class}{[\code{character(1)}] \cr 12 | Class of policy. One of \code{c("random", "epsilon.greedy", "greedy", "softmax")}.} 13 | 14 | \item{args}{[\code{list}] \cr Optional list of named arguments passed on to the 15 | subclass. The arguments in ... take precedence over values in this list. 16 | We strongly encourage you to use one or the other to pass arguments 17 | to the function but not both.} 18 | 19 | \item{...}{[\code{any}] \cr Optional named arguments passed on to the subclass. 
Alternatively 20 | these can be given using the \code{args} argument.} 21 | } 22 | \value{ 23 | [\code{list(name, args)}] List with the name and optional args. 24 | This list can then be passed onto \link{makeAgent}, which will construct the 25 | policy accordingly. 26 | } 27 | \description{ 28 | Reinforcement learning policies. 29 | } 30 | \section{Policies}{ 31 | 32 | \itemize{ 33 | \item \link{RandomPolicy} 34 | \item \link{GreedyPolicy} 35 | \item \link{EpsilonGreedyPolicy} 36 | \item \link{SoftmaxPolicy} 37 | } 38 | } 39 | 40 | \examples{ 41 | policy = makePolicy("random") 42 | policy = makePolicy("epsilon.greedy", epsilon = 0.1) 43 | } 44 | -------------------------------------------------------------------------------- /man/MountainCar.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/environment_mountaincar.R 3 | \docType{data} 4 | \name{MountainCar} 5 | \alias{MountainCar} 6 | \alias{MountainCarContinuous,} 7 | \alias{mountain.car} 8 | \alias{MountainCarContinuous} 9 | \title{Mountain Car} 10 | \format{An object of class \code{R6ClassGenerator} of length 24.} 11 | \arguments{ 12 | \item{...}{[\code{any}] \cr Arguments passed on to \link{makeEnvironment}.} 13 | } 14 | \description{ 15 | The classical mountain car problem for reinforcement learning. 16 | } 17 | \details{ 18 | The classical Mountain Car task the action is one of {0, 1, 2}, 19 | in the continuous version the action is in [-1, 1]. 20 | } 21 | \section{Usage}{ 22 | 23 | \code{makeEnvironment("MountainCar", ...)} \cr 24 | \code{makeEnvironment("MountainCarContinuous", ...)} 25 | } 26 | 27 | \section{Methods}{ 28 | 29 | \itemize{ 30 | \item \code{$step(action)} \cr 31 | Take action in environment. 32 | Returns a list with \code{state}, \code{reward}, \code{done}. 33 | \item \code{$reset()} \cr 34 | Resets the \code{done} flag of the environment and returns an initial state. 35 | Useful when starting a new episode. 36 | \item \code{$visualize()} \cr 37 | Visualizes the environment (if there is a visualization function). 38 | } 39 | } 40 | 41 | \examples{ 42 | env = makeEnvironment("mountain.car") 43 | env$reset() 44 | env$step(1L) 45 | 46 | env = makeEnvironment("mountain.car.continuous") 47 | env$reset() 48 | env$step(0.62) 49 | } 50 | \keyword{datasets} 51 | -------------------------------------------------------------------------------- /man/makeValueFunction.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/valuefunction.R 3 | \name{makeValueFunction} 4 | \alias{makeValueFunction} 5 | \title{Value Function Representation} 6 | \usage{ 7 | makeValueFunction(class, args = list(), ...) 8 | } 9 | \arguments{ 10 | \item{class}{[\code{character(1)}] \cr Class of value function approximation. 11 | One of \code{c("table", "neural.network")}.} 12 | 13 | \item{args}{[\code{list}] \cr Optional list of named arguments passed on to the 14 | subclass. The arguments in ... take precedence over values in this list. 15 | We strongly encourage you to use one or the other to pass arguments 16 | to the function but not both.} 17 | 18 | \item{...}{[\code{any}] \cr Optional named arguments passed on to the subclass. Alternatively 19 | these can be given using the \code{args} argument.} 20 | } 21 | \value{ 22 | [\code{list(name, args)}] List with the name and optional args. 
23 | This list can then be passed onto \link{makeAgent}, which will construct the 24 | value function accordingly. 25 | } 26 | \description{ 27 | A representation of the value function. 28 | } 29 | \section{Representations}{ 30 | 31 | \itemize{ 32 | \item \link{ValueTable} 33 | \item \link{ValueNetwork} 34 | } 35 | } 36 | 37 | \examples{ 38 | val = makeValueFunction("table", n.states = 16L, n.actions = 4L) 39 | # If the number of states and actions is not supplied, the agent will try 40 | # to figure these out from the environment object during interaction. 41 | val = makeValueFunction("table") 42 | } 43 | -------------------------------------------------------------------------------- /man/GymEnvironment.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/environment_gym.R 3 | \name{GymEnvironment} 4 | \alias{GymEnvironment} 5 | \title{Gym Environment} 6 | \arguments{ 7 | \item{gym.name}{[\code{character(1)}] \cr 8 | Name of gym environment, e.g. \code{"CartPole-v0"}.} 9 | 10 | \item{...}{[\code{any}] \cr Arguments passed on to \link{makeEnvironment}.} 11 | } 12 | \description{ 13 | Reinforcement learning environment from OpenAI Gym. 14 | } 15 | \details{ 16 | For available gym environments take a look at https://gym.openai.com/envs. 17 | } 18 | \section{Usage}{ 19 | 20 | \code{makeEnvironment("gym", gym.name, ...)} 21 | } 22 | 23 | \section{Installation}{ 24 | 25 | For installation of the python package \code{gym} see 26 | https://github.com/openai/gym#installation. 27 | Then install the R package \code{reticulate}. 28 | } 29 | 30 | \section{Methods}{ 31 | 32 | \itemize{ 33 | \item \code{$close()} 34 | Close visualization window. 35 | } 36 | 37 | 38 | \itemize{ 39 | \item \code{$step(action)} \cr 40 | Take action in environment. 41 | Returns a list with \code{state}, \code{reward}, \code{done}. 42 | \item \code{$reset()} \cr 43 | Resets the \code{done} flag of the environment and returns an initial state. 44 | Useful when starting a new episode. 45 | \item \code{$visualize()} \cr 46 | Visualizes the environment (if there is a visualization function). 47 | } 48 | } 49 | 50 | \examples{ 51 | \dontrun{ 52 | # Create an OpenAI Gym environment. 53 | # Make sure you have Python, gym and reticulate installed. 54 | env = makeEnvironment("gym", gym.name = "MountainCar-v0") 55 | env$reset() 56 | env$close() 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /man/makeAgent.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/agent.R 3 | \name{makeAgent} 4 | \alias{makeAgent} 5 | \title{Create Agent.} 6 | \usage{ 7 | makeAgent(policy, val.fun = NULL, algorithm = NULL, 8 | preprocess = identity, replay.memory = NULL, policy.args = list(), 9 | val.fun.args = list(), algorithm.args = list()) 10 | } 11 | \arguments{ 12 | \item{policy}{[\code{character(1)} | Policy] \cr A policy. 13 | If you pass a string the policy will be created via \link{makePolicy}.} 14 | 15 | \item{val.fun}{[\code{character(1)} | ValueFunction] \cr A value function representation. 16 | If you pass a string the value function will be created via \link{makeValueFunction}.} 17 | 18 | \item{algorithm}{[\code{character(1)} | Algorithm] \cr An algorithm. 
19 | If you pass a string the algorithm will be created via \link{makeAlgorithm}.} 20 | 21 | \item{preprocess}{[\code{function}] \cr A function which preprocesses the state so that the agent can learn on this.} 22 | 23 | \item{replay.memory}{[\code{ReplayMemory}] \cr Replay memory for experience replay created by \link{makeReplayMemory}.} 24 | 25 | \item{policy.args}{[\code{list}] \cr Arguments passed on to \code{args} in \link{makePolicy}.} 26 | 27 | \item{val.fun.args}{[\code{list}] \cr Arguments passed on to \code{args} in \link{makeValueFunction}.} 28 | 29 | \item{algorithm.args}{[\code{list}] \cr Arguments passed on to \code{args} in \link{makeAlgorithm}.} 30 | } 31 | \description{ 32 | An agent consists of a policy and (optional) a value function representation 33 | and (optional) a learning algorithm. 34 | } 35 | \examples{ 36 | agent = makeAgent("softmax", "table", "qlearning") 37 | } 38 | -------------------------------------------------------------------------------- /man/reinforcelearn.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/reinforcelearn.R 3 | \docType{package} 4 | \name{reinforcelearn} 5 | \alias{reinforcelearn} 6 | \alias{reinforcementlearning} 7 | \alias{reinforcelearn-package} 8 | \title{Reinforcement Learning.} 9 | \description{ 10 | Implementations of reinforcement learning algorithms and environments. 11 | } 12 | \section{Environments}{ 13 | 14 | \itemize{ 15 | \item \link{makeEnvironment} 16 | \item \link{Environment} 17 | \item \link{GymEnvironment} 18 | \item \link{MdpEnvironment} 19 | \item \link{Gridworld} 20 | \item \link{WindyGridworld} 21 | \item \link{CliffWalking} 22 | \item \link{MountainCar} 23 | \item \link{MountainCarContinuous} 24 | } 25 | } 26 | 27 | \section{Policies}{ 28 | 29 | \itemize{ 30 | \item \link{makePolicy} 31 | \item \link{EpsilonGreedyPolicy} 32 | \item \link{GreedyPolicy} 33 | \item \link{SoftmaxPolicy} 34 | \item \link{RandomPolicy} 35 | } 36 | } 37 | 38 | \section{Value Function Representations}{ 39 | 40 | \itemize{ 41 | \item \link{makeValueFunction} 42 | \item \link{ValueTable} 43 | \item \link{ValueNetwork} 44 | } 45 | } 46 | 47 | \section{Algorithms}{ 48 | 49 | \itemize{ 50 | \item \link{makeAlgorithm} 51 | \item \link{QLearning} 52 | } 53 | } 54 | 55 | \section{Extensions}{ 56 | 57 | \itemize{ 58 | \item \link{makeReplayMemory} 59 | \item \link{Eligibility} 60 | } 61 | } 62 | 63 | \section{Agent}{ 64 | 65 | \itemize{ 66 | \item \link{makeAgent} 67 | \item \link{getValueFunction} 68 | \item \link{getReplayMemory} 69 | \item \link{getEligibilityTraces} 70 | } 71 | } 72 | 73 | \section{Interaction}{ 74 | 75 | \itemize{ 76 | \item \link{interact} 77 | } 78 | } 79 | 80 | -------------------------------------------------------------------------------- /tests/testthat/test_environment.R: -------------------------------------------------------------------------------- 1 | context("environment creation") 2 | 3 | env1 = makeEnvironment("mountain.car", 4 | action.names = c("stop" = 0L, "pause" = 1L, "go" = 2L)) 5 | env2 = makeEnvironment("windy.gridworld", discount = 0.8) 6 | env3 = makeEnvironment("gridworld", shape = c(2, 3), initial.state = 5L, 7 | goal.states = 0L, discount = 0.9, diagonal.moves = TRUE) 8 | 9 | context("discount") 10 | test_that("discount will be initialized correctly", { 11 | expect_equal(env1$discount, 1) 12 | expect_equal(env2$discount, 0.8) 13 | expect_equal(env3$discount, 0.9) 14 | }) 15 | 
16 | context("action.names") 17 | test_that("action names will be initialized correctly", { 18 | expect_equal(env1$action.names, c("stop" = 0L, "pause" = 1L, "go" = 2L)) 19 | expect_equal(env2$action.names, c("left" = 0L, "right" = 1L, "up" = 2L, "down" = 3L)) 20 | expect_equal(env3$action.names, c("left" = 0L, "right" = 1L, "up" = 2L, "down" = 3L, 21 | "leftup" = 4L, "leftdown" = 5L, "rightup" = 6L, "rightdown" = 7L)) 22 | }) 23 | 24 | env4 = makeEnvironment("windy.gridworld") 25 | env5 = makeEnvironment("windy.gridworld") 26 | test_that("action.names are equivalent to integer actions", { 27 | env4$step("left") 28 | env5$step(0L) 29 | expect_equal(env4, env5) 30 | }) 31 | 32 | context("visualization") 33 | test_that("gridworld visualization works", { 34 | expect_equal(visualizeGridworld(c(2, 2), current.state = 3L), 35 | paste0(" - ", "- ", "\n", " - ", "o")) 36 | }) 37 | 38 | context("counter of steps, returns") 39 | env3$step(0L) 40 | test_that("env$episode.return computes discounted reward sum", { 41 | expect_equal(env3$episode.return, -1) 42 | env3$step(0L) 43 | expect_equal(env3$episode.return, -1 + env3$discount * env3$reward) 44 | }) 45 | -------------------------------------------------------------------------------- /man/Environment.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/environment.R 3 | \name{Environment} 4 | \alias{Environment} 5 | \title{Custom Reinforcement Learning Environment} 6 | \arguments{ 7 | \item{step}{[\code{function(self, action)}] \cr 8 | Custom step function.} 9 | 10 | \item{reset}{[\code{function(self)}] \cr 11 | Custom reset function.} 12 | 13 | \item{visualize}{[\code{function(self)}] \cr 14 | Optional custom visualization function.} 15 | 16 | \item{discount}{[\code{numeric(1)} in (0, 1)] \cr Discount factor.} 17 | 18 | \item{action.names}{[\code{named integer}] \cr 19 | Optional action names for a discrete action space.} 20 | } 21 | \description{ 22 | Custom Reinforcement Learning Environment 23 | } 24 | \section{Usage}{ 25 | 26 | \code{makeEnvironment("custom", step, reset, visualize = NULL, discount = 1, action.names = NULL)} 27 | } 28 | 29 | \section{Methods}{ 30 | 31 | \itemize{ 32 | \item \code{$step(action)} \cr 33 | Take action in environment. 34 | Returns a list with \code{state}, \code{reward}, \code{done}. 35 | \item \code{$reset()} \cr 36 | Resets the \code{done} flag of the environment and returns an initial state. 37 | Useful when starting a new episode. 38 | \item \code{$visualize()} \cr 39 | Visualizes the environment (if there is a visualization function). 40 | } 41 | } 42 | 43 | \examples{ 44 | step = function(self, action) { 45 | state = list(mean = action + rnorm(1), sd = runif(1)) 46 | reward = rnorm(1, state[[1]], state[[2]]) 47 | done = FALSE 48 | list(state, reward, done) 49 | } 50 | 51 | reset = function(self) { 52 | state = list(mean = 0, sd = 1) 53 | state 54 | } 55 | 56 | env = makeEnvironment(step = step, reset = reset) 57 | env$reset() 58 | env$step(100) 59 | } 60 | -------------------------------------------------------------------------------- /R/algorithm.R: -------------------------------------------------------------------------------- 1 | #' Make reinforcement learning algorithm. 2 | #' 3 | #' @param class \[`character(1)`] \cr Algorithm. One of `c("qlearning")`. 
4 | #' @inheritParams makePolicy 5 | #' 6 | #' @md 7 | #' 8 | #' @section Representations: 9 | #' * [QLearning] 10 | #' 11 | #' @export 12 | #' @examples 13 | #' alg = makeAlgorithm("qlearning") 14 | makeAlgorithm = function(class, args = list(), ...) { 15 | checkmate::assertChoice(class, 16 | c("qlearning"))#, "sarsa")) 17 | checkmate::assertList(args, names = "unique") 18 | args = append(list(...), args) 19 | # remove duplicate entries in args list 20 | args = args[unique(names(args))] 21 | 22 | x = list(name = class, args = args) 23 | class(x) = "Algorithm" 24 | x 25 | } 26 | 27 | 28 | #' Q-Learning 29 | #' 30 | #' Q-Learning algorithm. 31 | #' 32 | #' To use eligibility traces specify `lambda` and `traces`. 33 | #' 34 | #' @section Usage: 35 | #' `makeAlgorithm("qlearning", lambda, traces)` 36 | #' 37 | #' @param lambda \[`numeric(1)` in (0, 1)] \cr Trace decay parameter. 38 | #' @param traces \[`character(1)`] \cr Type of eligibility trace update. One of `c("replace", "accumulate")`. 39 | #' 40 | #' @name QLearning 41 | #' @aliases qlearning 42 | #' 43 | #' @seealso [Eligibility] 44 | #' 45 | #' @md 46 | #' 47 | #' @examples 48 | #' alg = makeAlgorithm("qlearning", lambda = 0.8, traces = "accumulate") 49 | NULL 50 | 51 | QLearning = R6::R6Class("QLearning", 52 | public = list( 53 | getTarget = function(reward, action.values, discount) { 54 | reward + discount * apply(action.values, 1L, max) 55 | } 56 | ) 57 | ) 58 | 59 | # Sarsa = R6::R6Class("Sarsa", 60 | # public = list( 61 | # getTarget = function(reward, action.values, discount, next.action) { 62 | # reward + discount * action.values[, next.action + 1L] 63 | # } 64 | # ) 65 | # ) 66 | -------------------------------------------------------------------------------- /man/MdpEnvironment.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/environment_mdp.R 3 | \name{MdpEnvironment} 4 | \alias{MdpEnvironment} 5 | \title{MDP Environment} 6 | \arguments{ 7 | \item{transitions}{[\code{array (n.states x n.states x n.actions)}] \cr 8 | State transition array.} 9 | 10 | \item{rewards}{[\code{matrix (n.states x n.actions)}] \cr 11 | Reward array.} 12 | 13 | \item{initial.state}{[\code{integer}] \cr 14 | Optional starting state. 15 | If a vector is given a starting state will be 16 | randomly sampled from this vector whenever \code{reset} is called. 17 | Note that states are numerated starting with 18 | 0. If \code{initial.state = NULL} all non-terminal states are 19 | possible starting states.} 20 | 21 | \item{...}{[\code{any}] \cr Arguments passed on to \link{makeEnvironment}.} 22 | } 23 | \description{ 24 | Markov Decision Process environment. 25 | } 26 | \section{Usage}{ 27 | 28 | \code{makeEnvironment("MDP", transitions, rewards, initial.state, ...)} 29 | } 30 | 31 | \section{Methods}{ 32 | 33 | \itemize{ 34 | \item \code{$step(action)} \cr 35 | Take action in environment. 36 | Returns a list with \code{state}, \code{reward}, \code{done}. 37 | \item \code{$reset()} \cr 38 | Resets the \code{done} flag of the environment and returns an initial state. 39 | Useful when starting a new episode. 40 | \item \code{$visualize()} \cr 41 | Visualizes the environment (if there is a visualization function). 42 | } 43 | } 44 | 45 | \examples{ 46 | # Create a Markov Decision Process. 
47 | P = array(0, c(2, 2, 2)) 48 | P[, , 1] = matrix(c(0.5, 0.5, 0, 1), 2, 2, byrow = TRUE) 49 | P[, , 2] = matrix(c(0, 1, 0, 1), 2, 2, byrow = TRUE) 50 | R = matrix(c(5, 10, -1, 2), 2, 2, byrow = TRUE) 51 | env = makeEnvironment("mdp", transitions = P, rewards = R) 52 | env$reset() 53 | env$step(1L) 54 | } 55 | -------------------------------------------------------------------------------- /man/CliffWalking.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/environment_gridworld.R 3 | \name{CliffWalking} 4 | \alias{CliffWalking} 5 | \alias{cliff.walking} 6 | \title{Cliff Walking} 7 | \arguments{ 8 | \item{...}{[\code{any}] \cr Arguments passed on to \link{makeEnvironment}.} 9 | } 10 | \description{ 11 | Gridworld environment for reinforcement learning from Sutton & Barto (2017). 12 | Grid of shape 4x12 with a goal state in the bottom right of the grid. 13 | Episodes start in the lower left state. Possible actions include going left, right, up and down. 14 | Some states in the lower part of the grid are a cliff, 15 | so taking a step into this cliff will yield a high negative reward of - 100 and move the agent 16 | back to the starting state. 17 | Elsewise rewards are - 1, for the goal state 0. 18 | } 19 | \details{ 20 | This is the gridworld (goal state denoted G, cliff states denoted C, start state denoted S): 21 | \tabular{rrrrrrrrrrrr}{ 22 | . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \cr 23 | . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \cr 24 | . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \cr 25 | S \tab C \tab C \tab C \tab C \tab C \tab C \tab C \tab C \tab C \tab C \tab G \cr 26 | } 27 | } 28 | \section{Usage}{ 29 | 30 | \code{makeEnvironment("cliff.walking", ...)} 31 | } 32 | 33 | \section{Methods}{ 34 | 35 | \itemize{ 36 | \item \code{$step(action)} \cr 37 | Take action in environment. 38 | Returns a list with \code{state}, \code{reward}, \code{done}. 39 | \item \code{$reset()} \cr 40 | Resets the \code{done} flag of the environment and returns an initial state. 41 | Useful when starting a new episode. 42 | \item \code{$visualize()} \cr 43 | Visualizes the environment (if there is a visualization function). 44 | } 45 | } 46 | 47 | \examples{ 48 | env = makeEnvironment("cliff.walking") 49 | } 50 | \references{ 51 | Sutton and Barto (Book draft 2017): Reinforcement Learning: An Introduction Example 6.6 52 | } 53 | -------------------------------------------------------------------------------- /R/accessor_functions.R: -------------------------------------------------------------------------------- 1 | #' Get weights of value function. 2 | #' 3 | #' Returns the weights of the value function representation of the agent. 4 | #' 5 | #' @param agent \[Agent] \cr An agent created by [makeAgent]. 6 | #' 7 | #' @md 8 | #' 9 | #' @return For a value function table this will return a matrix, for a neural 10 | #' network a list with the weights of the layers. 11 | #' 12 | #' @export 13 | getValueFunction = function(agent) { 14 | checkmate::assertClass(agent, "Agent") 15 | if (!is.null(agent$val.fun)) { 16 | Q = agent$val.fun$getWeights() 17 | } else { 18 | stop("No value function weights found in the agent object.") 19 | } 20 | Q 21 | } 22 | 23 | #' Get replay memory. 24 | #' 25 | #' Returns the replay memory of the agent. 
26 | #' 27 | #' @param agent \[Agent] \cr An agent created by [makeAgent]. 28 | #' 29 | #' @md 30 | #' 31 | #' @return A list containing the experienced observations, actions and rewards. 32 | #' 33 | #' @export 34 | getReplayMemory = function(agent) { 35 | checkmate::assertClass(agent, "Agent") 36 | if (!is.null(agent$exp.replay)) { 37 | mem = agent$exp.replay$memory 38 | } else { 39 | stop("No replay memory found in the agent object.") 40 | } 41 | mem 42 | } 43 | 44 | #' Get eligibility traces 45 | #' 46 | #' Returns the eligibility traces of the agent. 47 | #' 48 | #' @param agent \[Agent] \cr An agent created by [makeAgent]. 49 | #' 50 | #' @md 51 | #' 52 | #' @return A matrix with the eligibility traces. 53 | #' 54 | #' @export 55 | getEligibilityTraces = function(agent) { 56 | checkmate::assertClass(agent, "Agent") 57 | if (!is.null(agent$eligibility)) { 58 | e = agent$eligibility$E 59 | } else { 60 | stop("No eligibility traces found in the agent object.") 61 | } 62 | e 63 | } 64 | 65 | 66 | #' Get state values. 67 | #' 68 | #' Get state value function from action value function. 69 | #' 70 | #' @param action.vals \[`matrix`] \cr Action value matrix. 71 | #' 72 | #' @md 73 | #' 74 | #' @export 75 | getStateValues = function(action.vals) { 76 | checkmate::assertMatrix(action.vals) 77 | apply(action.vals, 1L, max) 78 | } 79 | -------------------------------------------------------------------------------- /man/windyGridworld.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/environment_gridworld.R 3 | \name{WindyGridworld} 4 | \alias{WindyGridworld} 5 | \alias{windy.gridworld} 6 | \title{Windy Gridworld} 7 | \arguments{ 8 | \item{...}{[\code{any}] \cr Arguments passed on to \link{makeEnvironment}.} 9 | } 10 | \description{ 11 | Windy Gridworld problem for reinforcement learning. Actions include 12 | going left, right, up and down. In each column the wind pushes you up a 13 | specific number of steps (for the next action). If an action would 14 | take you off the grid, you remain in the previous state. For each step you 15 | get a reward of -1, until you reach into a terminal state. 16 | } 17 | \details{ 18 | This is the gridworld (goal state denoted G, start state denoted S). 19 | The last row specifies the upward wind in each column. 20 | \tabular{rrrrrrrrrr}{ 21 | . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \cr 22 | . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \cr 23 | . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \cr 24 | S \tab . \tab . \tab . \tab . \tab . \tab . \tab G \tab . \tab . \cr 25 | . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \cr 26 | . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \cr 27 | . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \cr 28 | . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \cr 29 | . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \cr 30 | . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \cr 31 | 0 \tab 0 \tab 0 \tab 1 \tab 1 \tab 1 \tab 2 \tab 2 \tab 1 \tab 0 \cr 32 | } 33 | } 34 | \section{Usage}{ 35 | 36 | \code{makeEnvironment("windy.gridworld", ...)} 37 | } 38 | 39 | \section{Methods}{ 40 | 41 | \itemize{ 42 | \item \code{$step(action)} \cr 43 | Take action in environment. 44 | Returns a list with \code{state}, \code{reward}, \code{done}. 
45 | \item \code{$reset()} \cr 46 | Resets the \code{done} flag of the environment and returns an initial state. 47 | Useful when starting a new episode. 48 | \item \code{$visualize()} \cr 49 | Visualizes the environment (if there is a visualization function). 50 | } 51 | } 52 | 53 | \examples{ 54 | env = makeEnvironment("windy.gridworld") 55 | } 56 | \references{ 57 | Sutton and Barto (Book draft 2017): Reinforcement Learning: An Introduction Example 6.5 58 | } 59 | -------------------------------------------------------------------------------- /benchmark/benchmark_windy_gridworld.md: -------------------------------------------------------------------------------- 1 | Benchmark Algorithms on Windy Gridworld Task 2 | ================ 3 | Markus Dumke 4 | 2017-12-21 5 | 6 | ``` r 7 | library(reinforcelearn) 8 | env = makeEnvironment("windy.gridworld") 9 | ``` 10 | 11 | The optimal solution is 15 steps. 12 | 13 | Simple Q-Learning 14 | ----------------- 15 | 16 | ``` r 17 | policy = makePolicy("epsilon.greedy", epsilon = 0.1) 18 | agent = makeAgent(policy, "table", "qlearning", epsilon = 0.1) 19 | 20 | res = interact(env, agent, n.episodes = 500L) 21 | ``` 22 | 23 | 24 | 25 | Q-Learning with Eligibility Traces 26 | ---------------------------------- 27 | 28 | ``` r 29 | env$resetEverything() 30 | #> [1] 30 31 | policy = makePolicy("epsilon.greedy", epsilon = 0.1) 32 | alg = makeAlgorithm("qlearning", lambda = 0.8, traces = "accumulate") 33 | agent = makeAgent(policy, "table", alg) 34 | 35 | res = interact(env, agent, n.episodes = 500L) 36 | ``` 37 | 38 | 39 | 40 | Q-Learning with Experience replay 41 | --------------------------------- 42 | 43 | ``` r 44 | env$resetEverything() 45 | #> [1] 30 46 | policy = makePolicy("epsilon.greedy", epsilon = 0.1) 47 | mem = makeReplayMemory(size = 10L, batch.size = 10L) 48 | agent = makeAgent(policy, "table", "qlearning", experience.replay = mem) 49 | 50 | res = interact(env, agent, n.episodes = 500L) 51 | ``` 52 | 53 | 54 | 55 | Q-Learning with neural network and experience replay 56 | ---------------------------------------------------- 57 | 58 | ``` r 59 | env$resetEverything() 60 | #> [1] 30 61 | library(keras) 62 | model = keras_model_sequential() %>% 63 | layer_dense(units = env$n.actions, activation = "linear", 64 | input_shape = c(env$n.states), kernel_initializer = initializer_zeros(), 65 | use_bias = FALSE) %>% 66 | compile(loss = "mae", optimizer = optimizer_sgd(lr = 1)) 67 | mem = makeReplayMemory(size = 2L, batch.size = 2L) 68 | val = makeValueFunction("neural.network", model = model) 69 | policy = makePolicy("epsilon.greedy", epsilon = 0.1) 70 | preprocess = function(x) to_categorical(x, num_classes = env$n.states) 71 | agent = makeAgent(policy, val, "qlearning", 72 | preprocess = preprocess, experience.replay = mem) 73 | 74 | res = interact(env, agent, n.episodes = 500L) 75 | ``` 76 | 77 | 78 | -------------------------------------------------------------------------------- /man/tilecoding.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tiles.R 3 | \name{tiles} 4 | \alias{tiles} 5 | \alias{iht} 6 | \title{Tile Coding} 7 | \usage{ 8 | tiles(iht, n.tilings, state, action = integer(0)) 9 | 10 | iht(max.size) 11 | } 12 | \arguments{ 13 | \item{iht}{[\code{IHT}] \cr A hash table created with \code{iht}.} 14 | 15 | \item{n.tilings}{[\code{integer(1)}] \cr Number of tilings.} 16 | 17 | \item{state}{[\code{vector(2)}] \cr A 
two-dimensional state observation. 18 | Make sure to scale the observation to unit variance before.} 19 | 20 | \item{action}{[\code{integer(1)}] \cr Optional: If supplied the action space 21 | will also be tiled. All distinct actions will result in different tile numbers.} 22 | 23 | \item{max.size}{[\code{integer(1)}] \cr Maximal size of hash table.} 24 | } 25 | \value{ 26 | \code{iht} creates a hash table, which can then be passed on to \code{tiles}. 27 | \code{tiles} returns an integer vector of size \code{n.tilings} with the active tile numbers. 28 | } 29 | \description{ 30 | Implementation of Sutton's tile coding software version 3. 31 | } 32 | \details{ 33 | Tile coding is a way of representing the values of a vector of continuous variables as a large 34 | binary vector with few 1s and many 0s. The binary vector is not represented explicitly, 35 | but as a list of the components that are 1s. The main step is to partition, or tile, 36 | the continuous space multiple times and select one tile from each tiling, that corresponding 37 | the the vector's value. Each tile is converted to an element in the big binary vector, 38 | and the list of the tile (element) numbers is returned as the representation of the vector's value. 39 | Tile coding is recommended as a way of applying online learning methods to domains with continuous 40 | state or action variables. [copied from manual] 41 | 42 | See detailed manual on the web. 43 | In comparison to the Python implementation indices start with 1 instead of 0. The hash table is 44 | implemented as an environment, which is an attribute of an R6 class. 45 | 46 | Make sure that the size of the hash table is large enough, else an error will be triggered, 47 | when trying to assign a value to a full hash table. 48 | } 49 | \examples{ 50 | # Create hash table 51 | hash = iht(1024) 52 | 53 | # Partition state space using 8 tilings 54 | tiles(hash, n.tilings = 8, state = c(3.6, 7.21)) 55 | tiles(hash, n.tilings = 8, state = c(3.7, 7.21)) 56 | tiles(hash, n.tilings = 8, state = c(4, 7)) 57 | tiles(hash, n.tilings = 8, state = c(- 37.2, 7)) 58 | 59 | } 60 | \references{ 61 | Sutton and Barto (Book draft 2017): Reinforcement Learning: An Introduction 62 | } 63 | -------------------------------------------------------------------------------- /session_info.txt: -------------------------------------------------------------------------------- 1 | - Session info ---------------------------------------------------------- 2 | setting value 3 | version R version 3.4.3 (2017-11-30) 4 | os Windows 10 x64 5 | system x86_64, mingw32 6 | ui RTerm 7 | language (EN) 8 | collate English_Germany.1252 9 | tz Europe/Berlin 10 | date 2017-12-23 11 | 12 | - Packages -------------------------------------------------------------- 13 | package * version date source 14 | assertthat 0.2.0 2017-04-11 CRAN (R 3.4.2) 15 | backports 1.1.1 2017-09-25 CRAN (R 3.4.1) 16 | cli 1.0.0 2017-12-22 Github (r-lib/cli@ab1c3aa) 17 | clisymbols 1.2.0 2017-05-21 CRAN (R 3.4.3) 18 | crayon 1.3.4 2017-09-16 CRAN (R 3.4.2) 19 | desc 1.1.1 2017-08-03 CRAN (R 3.4.2) 20 | devtools 1.13.3.9000 2017-12-22 Github (hadley/devtools@0bcfd6e) 21 | digest 0.6.13 2017-12-14 CRAN (R 3.4.3) 22 | evaluate 0.10.1 2017-06-24 CRAN (R 3.4.2) 23 | htmltools 0.3.6 2017-04-28 CRAN (R 3.4.2) 24 | knitr 1.17 2017-08-10 CRAN (R 3.4.2) 25 | magrittr 1.5 2014-11-22 CRAN (R 3.4.2) 26 | memoise 1.1.0 2017-04-21 CRAN (R 3.4.2) 27 | pkgbuild 0.0.0.9000 2017-12-22 Github (r-lib/pkgbuild@ce7f6d1) 28 | pkgload 0.0.0.9000 2017-12-22 
Github (r-lib/pkgload@70eaef8) 29 | R6 2.2.2 2017-06-17 CRAN (R 3.4.2) 30 | Rcpp 0.12.13 2017-09-28 CRAN (R 3.4.2) 31 | rlang 0.1.4.9000 2017-12-22 Github (tidyverse/rlang@cc7587c) 32 | rmarkdown 1.8 2017-11-17 CRAN (R 3.4.2) 33 | rprojroot 1.3-1 2017-12-18 CRAN (R 3.4.3) 34 | sessioninfo 1.0.1.9000 2017-12-22 Github (r-lib/sessioninfo@c871d01) 35 | stringi 1.1.6 2017-11-17 CRAN (R 3.4.2) 36 | stringr 1.2.0 2017-02-18 CRAN (R 3.4.2) 37 | testthat 2.0.0 2017-12-13 CRAN (R 3.4.3) 38 | usethis 1.1.0.9000 2017-12-22 Github (r-lib/usethis@973bcab) 39 | withr 2.1.1 2017-12-19 CRAN (R 3.4.3) 40 | yaml 2.1.16 2017-12-12 CRAN (R 3.4.3) 41 | -------------------------------------------------------------------------------- /docs/jquery.sticky-kit.min.js: -------------------------------------------------------------------------------- 1 | /* 2 | Sticky-kit v1.1.2 | WTFPL | Leaf Corcoran 2015 | http://leafo.net 3 | */ 4 | (function(){var b,f;b=this.jQuery||window.jQuery;f=b(window);b.fn.stick_in_parent=function(d){var A,w,J,n,B,K,p,q,k,E,t;null==d&&(d={});t=d.sticky_class;B=d.inner_scrolling;E=d.recalc_every;k=d.parent;q=d.offset_top;p=d.spacer;w=d.bottoming;null==q&&(q=0);null==k&&(k=void 0);null==B&&(B=!0);null==t&&(t="is_stuck");A=b(document);null==w&&(w=!0);J=function(a,d,n,C,F,u,r,G){var v,H,m,D,I,c,g,x,y,z,h,l;if(!a.data("sticky_kit")){a.data("sticky_kit",!0);I=A.height();g=a.parent();null!=k&&(g=g.closest(k)); 5 | if(!g.length)throw"failed to find stick parent";v=m=!1;(h=null!=p?p&&a.closest(p):b("
    "))&&h.css("position",a.css("position"));x=function(){var c,f,e;if(!G&&(I=A.height(),c=parseInt(g.css("border-top-width"),10),f=parseInt(g.css("padding-top"),10),d=parseInt(g.css("padding-bottom"),10),n=g.offset().top+c+f,C=g.height(),m&&(v=m=!1,null==p&&(a.insertAfter(h),h.detach()),a.css({position:"",top:"",width:"",bottom:""}).removeClass(t),e=!0),F=a.offset().top-(parseInt(a.css("margin-top"),10)||0)-q, 6 | u=a.outerHeight(!0),r=a.css("float"),h&&h.css({width:a.outerWidth(!0),height:u,display:a.css("display"),"vertical-align":a.css("vertical-align"),"float":r}),e))return l()};x();if(u!==C)return D=void 0,c=q,z=E,l=function(){var b,l,e,k;if(!G&&(e=!1,null!=z&&(--z,0>=z&&(z=E,x(),e=!0)),e||A.height()===I||x(),e=f.scrollTop(),null!=D&&(l=e-D),D=e,m?(w&&(k=e+u+c>C+n,v&&!k&&(v=!1,a.css({position:"fixed",bottom:"",top:c}).trigger("sticky_kit:unbottom"))),eb&&!v&&(c-=l,c=Math.max(b-u,c),c=Math.min(q,c),m&&a.css({top:c+"px"})))):e>F&&(m=!0,b={position:"fixed",top:c},b.width="border-box"===a.css("box-sizing")?a.outerWidth()+"px":a.width()+"px",a.css(b).addClass(t),null==p&&(a.after(h),"left"!==r&&"right"!==r||h.append(a)),a.trigger("sticky_kit:stick")),m&&w&&(null==k&&(k=e+u+c>C+n),!v&&k)))return v=!0,"static"===g.css("position")&&g.css({position:"relative"}), 8 | a.css({position:"absolute",bottom:d,top:"auto"}).trigger("sticky_kit:bottom")},y=function(){x();return l()},H=function(){G=!0;f.off("touchmove",l);f.off("scroll",l);f.off("resize",y);b(document.body).off("sticky_kit:recalc",y);a.off("sticky_kit:detach",H);a.removeData("sticky_kit");a.css({position:"",bottom:"",top:"",width:""});g.position("position","");if(m)return null==p&&("left"!==r&&"right"!==r||a.insertAfter(h),h.remove()),a.removeClass(t)},f.on("touchmove",l),f.on("scroll",l),f.on("resize", 9 | y),b(document.body).on("sticky_kit:recalc",y),a.on("sticky_kit:detach",H),setTimeout(l,0)}};n=0;for(K=this.length;n= 0.5) { 28 | done = TRUE 29 | reward = 0 30 | } else { 31 | done = FALSE 32 | } 33 | list(state, reward, done) 34 | } 35 | 36 | super$initialize(step_, reset_, ...) 37 | } 38 | ) 39 | ) 40 | 41 | #' Mountain Car 42 | #' 43 | #' The classical mountain car problem for reinforcement learning. 44 | #' 45 | #' The classical Mountain Car task the action is one of \{0, 1, 2\}, 46 | #' in the continuous version the action is in \[-1, 1]. 47 | #' 48 | #' @param ... \[`any`] \cr Arguments passed on to [makeEnvironment]. 
49 | #' 50 | #' @section Usage: 51 | #' `makeEnvironment("MountainCar", ...)` \cr 52 | #' `makeEnvironment("MountainCarContinuous", ...)` 53 | #' 54 | #' @md 55 | #' 56 | #' @inheritSection Environment Methods 57 | #' @name MountainCar 58 | #' @aliases MountainCarContinuous, mountain.car 59 | #' @examples 60 | #' env = makeEnvironment("mountain.car") 61 | #' env$reset() 62 | #' env$step(1L) 63 | #' 64 | #' env = makeEnvironment("mountain.car.continuous") 65 | #' env$reset() 66 | #' env$step(0.62) 67 | NULL 68 | 69 | #' @rdname MountainCar 70 | #' @usage NULL 71 | MountainCar = R6::R6Class("MountainCar", 72 | inherit = MountainCarBase, 73 | public = list( 74 | action.space = "Discrete", 75 | actions = 0:2, 76 | n.actions = 3L 77 | ), 78 | private = list( 79 | getVelocity = function(self, action) { 80 | self$velocity + 0.001 * (action - 1L) - 0.0025 * cos(3 * self$position) 81 | } 82 | ) 83 | ) 84 | 85 | #' @rdname MountainCar 86 | #' @usage NULL 87 | MountainCarContinuous = R6::R6Class("MountainCarContinuous", 88 | inherit = MountainCarBase, 89 | public = list( 90 | action.space = "Box", 91 | action.space.bounds = list(c(-1, 1)) 92 | ), 93 | private = list( 94 | getVelocity = function(self, action) { 95 | force = min(max(action, self$action.space.bounds[[1]][1]), self$action.space.bounds[[1]][2]) 96 | self$velocity + 0.0015 * force - 0.0025 * cos(3 * self$position) 97 | } 98 | ) 99 | ) 100 | -------------------------------------------------------------------------------- /vignettes/environments.R: -------------------------------------------------------------------------------- 1 | ## ----setup, include=FALSE------------------------------------------------ 2 | knitr::opts_chunk$set(message = TRUE, eval = TRUE, collapse = TRUE, comment = "#>") 3 | 4 | ## ------------------------------------------------------------------------ 5 | library(reinforcelearn) 6 | 7 | ## ---- out.width = "200px", fig.align="center", echo = FALSE-------------- 8 | knitr::include_graphics("mountaincar.JPG") 9 | 10 | ## ------------------------------------------------------------------------ 11 | reset = function(self) { 12 | position = runif(1, -0.6, -0.4) 13 | velocity = 0 14 | state = matrix(c(position, velocity), ncol = 2) 15 | state 16 | } 17 | 18 | ## ------------------------------------------------------------------------ 19 | step = function(self, action) { 20 | position = self$state[1] 21 | velocity = self$state[2] 22 | velocity = (action - 1L) * 0.001 + cos(3 * position) * (-0.0025) 23 | velocity = min(max(velocity, -0.07), 0.07) 24 | position = position + velocity 25 | if (position < -1.2) { 26 | position = -1.2 27 | velocity = 0 28 | } 29 | state = matrix(c(position, velocity), ncol = 2) 30 | reward = -1 31 | if (position >= 0.5) { 32 | done = TRUE 33 | reward = 0 34 | } else { 35 | done = FALSE 36 | } 37 | list(state, reward, done) 38 | } 39 | 40 | ## ------------------------------------------------------------------------ 41 | env = makeEnvironment(step = step, reset = reset) 42 | 43 | ## ---- eval = FALSE------------------------------------------------------- 44 | # # Create a gym environment. 
45 | # env = makeEnvironment("gym", gym.name = "MountainCar-v0") 46 | 47 | ## ------------------------------------------------------------------------ 48 | # State transition array 49 | P = array(0, c(2, 2, 2)) 50 | P[, , 1] = matrix(c(0.5, 0.5, 0, 1), 2, 2, byrow = TRUE) 51 | P[, , 2] = matrix(c(0.1, 0.9, 0, 1), 2, 2, byrow = TRUE) 52 | 53 | # Reward matrix 54 | R = matrix(c(5, 10, -1, 2), 2, 2, byrow = TRUE) 55 | 56 | env = makeEnvironment("mdp", transitions = P, rewards = R) 57 | 58 | ## ---- out.width = "200px", fig.align="center", echo = FALSE-------------- 59 | knitr::include_graphics("gridworld.JPG") 60 | 61 | ## ------------------------------------------------------------------------ 62 | # Gridworld Environment (Sutton & Barto (2017) Example 4.1) 63 | env = makeEnvironment("gridworld", shape = c(4, 4), goal.states = c(0, 15)) 64 | 65 | ## ------------------------------------------------------------------------ 66 | env = makeEnvironment("gridworld", shape = c(4, 4), 67 | goal.states = 0L, initial.state = 15L) 68 | 69 | ## ------------------------------------------------------------------------ 70 | # The initial state of the environment. 71 | env$reset() 72 | 73 | env$visualize() 74 | 75 | # Actions are encoded as integers. 76 | env$step(0L) 77 | 78 | env$visualize() 79 | 80 | # But can also have character names. 81 | env$step("left") 82 | 83 | env$visualize() 84 | 85 | ## ------------------------------------------------------------------------ 86 | env = makeEnvironment("mountain.car") 87 | env$n.actions 88 | env$state.space.bounds 89 | 90 | ## ------------------------------------------------------------------------ 91 | env = makeEnvironment("gridworld", shape = c(4, 4), 92 | goal.states = 0L, initial.state = 15L, discount = 0.99) 93 | 94 | env$step("up") 95 | env$n.step 96 | env$episode.return 97 | 98 | env$step("left") 99 | env$n.step 100 | env$episode.return 101 | 102 | -------------------------------------------------------------------------------- /docs/articles/environments.R: -------------------------------------------------------------------------------- 1 | ## ----setup, include=FALSE------------------------------------------------ 2 | knitr::opts_chunk$set(message = TRUE, eval = TRUE, collapse = TRUE, comment = "#>") 3 | 4 | ## ------------------------------------------------------------------------ 5 | library(reinforcelearn) 6 | 7 | ## ---- out.width = "200px", fig.align="center", echo = FALSE-------------- 8 | knitr::include_graphics("mountaincar.JPG") 9 | 10 | ## ------------------------------------------------------------------------ 11 | reset = function(self) { 12 | position = runif(1, -0.6, -0.4) 13 | velocity = 0 14 | state = matrix(c(position, velocity), ncol = 2) 15 | state 16 | } 17 | 18 | ## ------------------------------------------------------------------------ 19 | step = function(self, action) { 20 | position = self$state[1] 21 | velocity = self$state[2] 22 | velocity = (action - 1L) * 0.001 + cos(3 * position) * (-0.0025) 23 | velocity = min(max(velocity, -0.07), 0.07) 24 | position = position + velocity 25 | if (position < -1.2) { 26 | position = -1.2 27 | velocity = 0 28 | } 29 | state = matrix(c(position, velocity), ncol = 2) 30 | reward = -1 31 | if (position >= 0.5) { 32 | done = TRUE 33 | reward = 0 34 | } else { 35 | done = FALSE 36 | } 37 | list(state, reward, done) 38 | } 39 | 40 | ## ------------------------------------------------------------------------ 41 | env = makeEnvironment(step = step, reset = reset) 42 | 43 | ## ---- eval = 
FALSE------------------------------------------------------- 44 | # # Create a gym environment. 45 | # env = makeEnvironment("gym", gym.name = "MountainCar-v0") 46 | 47 | ## ------------------------------------------------------------------------ 48 | # State transition array 49 | P = array(0, c(2, 2, 2)) 50 | P[, , 1] = matrix(c(0.5, 0.5, 0, 1), 2, 2, byrow = TRUE) 51 | P[, , 2] = matrix(c(0.1, 0.9, 0, 1), 2, 2, byrow = TRUE) 52 | 53 | # Reward matrix 54 | R = matrix(c(5, 10, -1, 2), 2, 2, byrow = TRUE) 55 | 56 | env = makeEnvironment("mdp", transitions = P, rewards = R) 57 | 58 | ## ---- out.width = "200px", fig.align="center", echo = FALSE-------------- 59 | knitr::include_graphics("gridworld.JPG") 60 | 61 | ## ------------------------------------------------------------------------ 62 | # Gridworld Environment (Sutton & Barto (2017) Example 4.1) 63 | env = makeEnvironment("gridworld", shape = c(4, 4), goal.states = c(0, 15)) 64 | 65 | ## ------------------------------------------------------------------------ 66 | env = makeEnvironment("gridworld", shape = c(4, 4), 67 | goal.states = 0L, initial.state = 15L) 68 | 69 | ## ------------------------------------------------------------------------ 70 | # The initial state of the environment. 71 | env$reset() 72 | 73 | env$visualize() 74 | 75 | # Actions are encoded as integers. 76 | env$step(0L) 77 | 78 | env$visualize() 79 | 80 | # But can also have character names. 81 | env$step("left") 82 | 83 | env$visualize() 84 | 85 | ## ------------------------------------------------------------------------ 86 | env = makeEnvironment("mountain.car") 87 | env$n.actions 88 | env$state.space.bounds 89 | 90 | ## ------------------------------------------------------------------------ 91 | env = makeEnvironment("gridworld", shape = c(4, 4), 92 | goal.states = 0L, initial.state = 15L, discount = 0.99) 93 | 94 | env$step("up") 95 | env$n.step 96 | env$episode.return 97 | 98 | env$step("left") 99 | env$n.step 100 | env$episode.return 101 | 102 | -------------------------------------------------------------------------------- /docs/pkgdown.css: -------------------------------------------------------------------------------- 1 | /* Sticker footer */ 2 | body > .container { 3 | display: flex; 4 | padding-top: 60px; 5 | min-height: calc(100vh); 6 | flex-direction: column; 7 | } 8 | 9 | body > .container .row { 10 | flex: 1; 11 | } 12 | 13 | footer { 14 | margin-top: 45px; 15 | padding: 35px 0 36px; 16 | border-top: 1px solid #e5e5e5; 17 | color: #666; 18 | display: flex; 19 | } 20 | footer p { 21 | margin-bottom: 0; 22 | } 23 | footer div { 24 | flex: 1; 25 | } 26 | footer .pkgdown { 27 | text-align: right; 28 | } 29 | footer p { 30 | margin-bottom: 0; 31 | } 32 | 33 | img.icon { 34 | float: right; 35 | } 36 | 37 | img { 38 | max-width: 100%; 39 | } 40 | 41 | /* Section anchors ---------------------------------*/ 42 | 43 | a.anchor { 44 | margin-left: -30px; 45 | display:inline-block; 46 | width: 30px; 47 | height: 30px; 48 | visibility: hidden; 49 | 50 | background-image: url(./link.svg); 51 | background-repeat: no-repeat; 52 | background-size: 20px 20px; 53 | background-position: center center; 54 | } 55 | 56 | .hasAnchor:hover a.anchor { 57 | visibility: visible; 58 | } 59 | 60 | @media (max-width: 767px) { 61 | .hasAnchor:hover a.anchor { 62 | visibility: hidden; 63 | } 64 | } 65 | 66 | 67 | /* Fixes for fixed navbar --------------------------*/ 68 | 69 | .contents h1, .contents h2, .contents h3, .contents h4 { 70 | padding-top: 60px; 71 | margin-top: 
-60px; 72 | } 73 | 74 | /* Static header placement on mobile devices */ 75 | @media (max-width: 767px) { 76 | .navbar-fixed-top { 77 | position: absolute; 78 | } 79 | .navbar { 80 | padding: 0; 81 | } 82 | } 83 | 84 | 85 | /* Sidebar --------------------------*/ 86 | 87 | #sidebar { 88 | margin-top: 30px; 89 | } 90 | #sidebar h2 { 91 | font-size: 1.5em; 92 | margin-top: 1em; 93 | } 94 | 95 | #sidebar h2:first-child { 96 | margin-top: 0; 97 | } 98 | 99 | #sidebar .list-unstyled li { 100 | margin-bottom: 0.5em; 101 | } 102 | 103 | /* Reference index & topics ----------------------------------------------- */ 104 | 105 | .ref-index th {font-weight: normal;} 106 | .ref-index h2 {font-size: 20px;} 107 | 108 | .ref-index td {vertical-align: top;} 109 | .ref-index .alias {width: 40%;} 110 | .ref-index .title {width: 60%;} 111 | 112 | .ref-index .alias {width: 40%;} 113 | .ref-index .title {width: 60%;} 114 | 115 | .ref-arguments th {text-align: right; padding-right: 10px;} 116 | .ref-arguments th, .ref-arguments td {vertical-align: top;} 117 | .ref-arguments .name {width: 20%;} 118 | .ref-arguments .desc {width: 80%;} 119 | 120 | /* Nice scrolling for wide elements --------------------------------------- */ 121 | 122 | table { 123 | display: block; 124 | overflow: auto; 125 | } 126 | 127 | /* Syntax highlighting ---------------------------------------------------- */ 128 | 129 | pre { 130 | word-wrap: normal; 131 | word-break: normal; 132 | border: 1px solid #eee; 133 | } 134 | 135 | pre, code { 136 | background-color: #f8f8f8; 137 | color: #333; 138 | } 139 | 140 | pre .img { 141 | margin: 5px 0; 142 | } 143 | 144 | pre .img img { 145 | background-color: #fff; 146 | display: block; 147 | height: auto; 148 | } 149 | 150 | code a, pre a { 151 | color: #375f84; 152 | } 153 | 154 | .fl {color: #1514b5;} 155 | .fu {color: #000000;} /* function */ 156 | .ch,.st {color: #036a07;} /* string */ 157 | .kw {color: #264D66;} /* keyword */ 158 | .co {color: #888888;} /* comment */ 159 | 160 | .message { color: black; font-weight: bolder;} 161 | .error { color: orange; font-weight: bolder;} 162 | .warning { color: #6A0366; font-weight: bolder;} 163 | 164 | -------------------------------------------------------------------------------- /tests/testthat/test_agent.R: -------------------------------------------------------------------------------- 1 | context("check input combinations") 2 | test_that("softmax and epsilon greedy policies need value function", { 3 | expect_error(makeAgent("softmax"), 4 | "Cannot use this policy without specifying a value function!") 5 | expect_error(makeAgent("greedy"), 6 | "Cannot use this policy without specifying a value function!") 7 | expect_error(makeAgent("epsilon.greedy"), 8 | "Cannot use this policy without specifying a value function!") 9 | }) 10 | 11 | memory = makeReplayMemory() 12 | test_that("experience replay and eligibility traces cannot be used simultaneously", { 13 | expect_error(makeAgent("random", "table", "qlearning", 14 | replay.memory = memory, algorithm.args = list(lambda = 0.8, traces = "replace")), 15 | "Experience replay with eligibility traces is not supported!") 16 | }) 17 | 18 | # #------------- 19 | # # Test observing 20 | # 21 | # env = makeEnvironment("windy.gridworld") 22 | # 23 | # agent = makeAgent("random") 24 | # interact(env, agent, n.steps = 10L, learn = FALSE) 25 | # 26 | # agent = makeAgent("softmax", "table") 27 | # interact(env, agent, n.steps = 10L, learn = FALSE) 28 | # 29 | # agent = makeAgent("random", "table", "qlearning") 30 | # 
interact(env, agent, n.steps = 10L, learn = FALSE) 31 | # 32 | # agent = makeAgent("random", "table", "qlearning", lambda = 0.8, traces = "replace") 33 | # interact(env, agent, n.steps = 2L, learn = FALSE) 34 | # getEligibilityTraces(agent) 35 | # 36 | # mem = makeReplayMemory(size = 2, batch.size = 1) 37 | # agent = makeAgent("random", "table", "qlearning", replay.memory = mem) 38 | # interact(env, agent, n.steps = 10L, learn = FALSE) 39 | # getReplayMemory(agent) 40 | # 41 | # 42 | # #------------- 43 | # # Test learning 44 | # 45 | # # qlearning table base 46 | # agent = makeAgent("random", "table", "qlearning") 47 | # interact(env, agent, n.steps = 2L, learn = TRUE) 48 | # getValueFunction(agent) 49 | # 50 | # # qlearning table eligibility 51 | # agent = makeAgent("random", "table", "qlearning", lambda = 0.8, traces = "replace") 52 | # interact(env, agent, n.steps = 2L, learn = TRUE) 53 | # getValueFunction(agent) 54 | # 55 | # # qlearning table exp replay 56 | # mem = makeReplayMemory(size = 2L, batch.size = 2L) 57 | # agent = makeAgent("random", "table", "qlearning", replay.memory = mem) 58 | # interact(env, agent, n.steps = 2L, learn = TRUE) 59 | # getValueFunction(agent) 60 | # 61 | # # qlearning neural.network base 62 | # library(keras) 63 | # model = keras_model_sequential() %>% 64 | # layer_dense(units = env$n.actions, activation = "linear", 65 | # input_shape = c(env$n.states), kernel_initializer = initializer_zeros(), 66 | # use_bias = FALSE) %>% 67 | # compile(loss = "mae", optimizer = optimizer_sgd(lr = 1)) 68 | # val = makeValueFunction("neural.network", model = model) 69 | # preprocess = function(x) to_categorical(x, num_classes = env$n.states) 70 | # agent = makeAgent("softmax", val, "qlearning", preprocess = preprocess) 71 | # interact(env, agent, n.steps = 2L, learn = TRUE) 72 | # getValueFunction(agent) 73 | # 74 | # # qlearning neural.network exp.replay 75 | # library(keras) 76 | # model = keras_model_sequential() %>% 77 | # layer_dense(units = env$n.actions, activation = "linear", 78 | # input_shape = c(env$n.states), kernel_initializer = initializer_zeros(), 79 | # use_bias = FALSE) %>% 80 | # compile(loss = "mae", optimizer = optimizer_sgd(lr = 1)) 81 | # mem = makeReplayMemory(size = 2L, batch.size = 2L) 82 | # val = makeValueFunction("neural.network", model = model) 83 | # preprocess = function(x) to_categorical(x, num_classes = env$n.states) 84 | # agent = makeAgent("softmax", val, "qlearning", 85 | # preprocess = preprocess, replay.memory = mem) 86 | # interact(env, agent, n.steps = 2L, learn = TRUE) 87 | # getValueFunction(agent) 88 | -------------------------------------------------------------------------------- /vignettes/agents.R: -------------------------------------------------------------------------------- 1 | ## ----setup, include=FALSE------------------------------------------------ 2 | knitr::opts_chunk$set(message = TRUE, eval = TRUE, collapse = TRUE, comment = "#>") 3 | 4 | ## ------------------------------------------------------------------------ 5 | set.seed(12) 6 | library(reinforcelearn) 7 | 8 | ## ------------------------------------------------------------------------ 9 | env = makeEnvironment("gridworld", shape = c(3, 3), goal.states = 0L) 10 | agent = makeAgent(policy = "softmax", val.fun = "table", algorithm = "qlearning") 11 | 12 | ## ------------------------------------------------------------------------ 13 | interact(env, agent, n.episodes = 5L) 14 | 15 | ## ------------------------------------------------------------------------ 16 
| getValueFunction(agent) 17 | 18 | ## ------------------------------------------------------------------------ 19 | # Uniform random policy 20 | makePolicy("random") 21 | 22 | # Epsilon-greedy policy 23 | makePolicy("epsilon.greedy", epsilon = 0.2) 24 | 25 | # Softmax policy 26 | makePolicy("softmax") 27 | 28 | ## ------------------------------------------------------------------------ 29 | makeValueFunction("table", n.states = 9L, n.actions = 4L) 30 | 31 | ## ---- eval = FALSE------------------------------------------------------- 32 | # library(keras) 33 | # model = keras_model_sequential() %>% 34 | # layer_dense(shape = 10L, input_shape = 4L, activation = "linear") %>% 35 | # compile(optimizer = optimizer_sgd(lr = 0.1), loss = "mae") 36 | # makeValueFunction("neural.network", model) 37 | 38 | ## ------------------------------------------------------------------------ 39 | makeAlgorithm("qlearning") 40 | 41 | ## ------------------------------------------------------------------------ 42 | policy = makePolicy("epsilon.greedy", epsilon = 0.2) 43 | val.fun = makeValueFunction("table", n.states = 9L, n.actions = 4L) 44 | algorithm = makeAlgorithm("qlearning") 45 | 46 | agent = makeAgent(policy, val.fun, algorithm) 47 | 48 | ## ------------------------------------------------------------------------ 49 | agent = makeAgent("epsilon.greedy", "table", "qlearning", 50 | policy.args = list(epsilon = 0.2)) 51 | 52 | ## ------------------------------------------------------------------------ 53 | env = makeEnvironment("gridworld", shape = c(3, 2), goal.states = 0L) 54 | agent = makeAgent("random") 55 | 56 | interact(env, agent, n.steps = 3L, visualize = TRUE) 57 | 58 | ## ------------------------------------------------------------------------ 59 | env = makeEnvironment("gridworld", shape = c(4, 4), goal.states = 0L, 60 | initial.state = 15L) 61 | agent = makeAgent("random") 62 | 63 | for (i in 1:3L) { 64 | ## comment in the next line to wait on enter press before taking the next action. 
65 | # invisible(readline(prompt = "Press [enter] to take the next action")) 66 | interact(env, agent, n.steps = 1L, learn = FALSE, visualize = TRUE) 67 | } 68 | 69 | ## ------------------------------------------------------------------------ 70 | (memory = makeReplayMemory(size = 2L, batch.size = 1L)) 71 | 72 | agent = makeAgent("random", replay.memory = memory) 73 | 74 | interact(env, agent, n.steps = 2L, visualize = TRUE) 75 | 76 | getReplayMemory(agent) 77 | 78 | ## ---- message = FALSE---------------------------------------------------- 79 | env = makeEnvironment("gridworld", shape = c(4, 4), goal.states = c(0, 15)) 80 | 81 | policy = makePolicy("epsilon.greedy", epsilon = 0.1) 82 | memory = makeReplayMemory(size = 100L, batch.size = 20L) 83 | 84 | agent = makeAgent(policy, "table", "qlearning", replay.memory = memory) 85 | 86 | for (i in 1:100) { 87 | interact(env, agent, n.steps = 20L, learn = FALSE) 88 | interact(env, agent, n.steps = 1L, learn = TRUE) 89 | } 90 | action.vals = getValueFunction(agent) 91 | matrix(getStateValues(action.vals), ncol = 4L) 92 | 93 | -------------------------------------------------------------------------------- /docs/articles/agents.R: -------------------------------------------------------------------------------- 1 | ## ----setup, include=FALSE------------------------------------------------ 2 | knitr::opts_chunk$set(message = TRUE, eval = TRUE, collapse = TRUE, comment = "#>") 3 | 4 | ## ------------------------------------------------------------------------ 5 | set.seed(12) 6 | library(reinforcelearn) 7 | 8 | ## ------------------------------------------------------------------------ 9 | env = makeEnvironment("gridworld", shape = c(3, 3), goal.states = 0L) 10 | agent = makeAgent(policy = "softmax", val.fun = "table", algorithm = "qlearning") 11 | 12 | ## ------------------------------------------------------------------------ 13 | interact(env, agent, n.episodes = 5L) 14 | 15 | ## ------------------------------------------------------------------------ 16 | getValueFunction(agent) 17 | 18 | ## ------------------------------------------------------------------------ 19 | # Uniform random policy 20 | makePolicy("random") 21 | 22 | # Epsilon-greedy policy 23 | makePolicy("epsilon.greedy", epsilon = 0.2) 24 | 25 | # Softmax policy 26 | makePolicy("softmax") 27 | 28 | ## ------------------------------------------------------------------------ 29 | makeValueFunction("table", n.states = 9L, n.actions = 4L) 30 | 31 | ## ---- eval = FALSE------------------------------------------------------- 32 | # library(keras) 33 | # model = keras_model_sequential() %>% 34 | # layer_dense(shape = 10L, input_shape = 4L, activation = "linear") %>% 35 | # compile(optimizer = optimizer_sgd(lr = 0.1), loss = "mae") 36 | # makeValueFunction("neural.network", model) 37 | 38 | ## ------------------------------------------------------------------------ 39 | makeAlgorithm("qlearning") 40 | 41 | ## ------------------------------------------------------------------------ 42 | policy = makePolicy("epsilon.greedy", epsilon = 0.2) 43 | val.fun = makeValueFunction("table", n.states = 9L, n.actions = 4L) 44 | algorithm = makeAlgorithm("qlearning") 45 | 46 | agent = makeAgent(policy, val.fun, algorithm) 47 | 48 | ## ------------------------------------------------------------------------ 49 | agent = makeAgent("epsilon.greedy", "table", "qlearning", 50 | policy.args = list(epsilon = 0.2)) 51 | 52 | ## ------------------------------------------------------------------------ 53 | env = 
makeEnvironment("gridworld", shape = c(3, 2), goal.states = 0L) 54 | agent = makeAgent("random") 55 | 56 | interact(env, agent, n.steps = 3L, visualize = TRUE) 57 | 58 | ## ------------------------------------------------------------------------ 59 | env = makeEnvironment("gridworld", shape = c(4, 4), goal.states = 0L, 60 | initial.state = 15L) 61 | agent = makeAgent("random") 62 | 63 | for (i in 1:3L) { 64 | ## comment in the next line to wait on enter press before taking the next action. 65 | # invisible(readline(prompt = "Press [enter] to take the next action")) 66 | interact(env, agent, n.steps = 1L, learn = FALSE, visualize = TRUE) 67 | } 68 | 69 | ## ------------------------------------------------------------------------ 70 | (memory = makeReplayMemory(size = 2L, batch.size = 1L)) 71 | 72 | agent = makeAgent("random", replay.memory = memory) 73 | 74 | interact(env, agent, n.steps = 2L, visualize = TRUE) 75 | 76 | getReplayMemory(agent) 77 | 78 | ## ---- message = FALSE---------------------------------------------------- 79 | env = makeEnvironment("gridworld", shape = c(4, 4), goal.states = c(0, 15)) 80 | 81 | policy = makePolicy("epsilon.greedy", epsilon = 0.1) 82 | memory = makeReplayMemory(size = 100L, batch.size = 20L) 83 | 84 | agent = makeAgent(policy, "table", "qlearning", replay.memory = memory) 85 | 86 | for (i in 1:100) { 87 | interact(env, agent, n.steps = 20L, learn = FALSE) 88 | interact(env, agent, n.steps = 1L, learn = TRUE) 89 | } 90 | action.vals = getValueFunction(agent) 91 | matrix(getStateValues(action.vals), ncol = 4L) 92 | 93 | -------------------------------------------------------------------------------- /R/environment_mdp.R: -------------------------------------------------------------------------------- 1 | #' MDP Environment 2 | #' 3 | #' Markov Decision Process environment. 4 | #' 5 | #' @section Usage: 6 | #' `makeEnvironment("MDP", transitions, rewards, initial.state, ...)` 7 | #' 8 | #' @param transitions \[`array (n.states x n.states x n.actions)`] \cr 9 | #' State transition array. 10 | #' @param rewards \[`matrix (n.states x n.actions)`] \cr 11 | #' Reward array. 12 | #' @param initial.state \[`integer`] \cr 13 | #' Optional starting state. 14 | #' If a vector is given a starting state will be 15 | #' randomly sampled from this vector whenever `reset` is called. 16 | #' Note that states are numerated starting with 17 | #' 0. If `initial.state = NULL` all non-terminal states are 18 | #' possible starting states. 19 | #' @param ... \[`any`] \cr Arguments passed on to [makeEnvironment]. 20 | #' 21 | #' @md 22 | #' 23 | #' @name MdpEnvironment 24 | #' @inheritSection Environment Methods 25 | #' @export 26 | #' 27 | #' @examples 28 | #' # Create a Markov Decision Process. 29 | #' P = array(0, c(2, 2, 2)) 30 | #' P[, , 1] = matrix(c(0.5, 0.5, 0, 1), 2, 2, byrow = TRUE) 31 | #' P[, , 2] = matrix(c(0, 1, 0, 1), 2, 2, byrow = TRUE) 32 | #' R = matrix(c(5, 10, -1, 2), 2, 2, byrow = TRUE) 33 | #' env = makeEnvironment("mdp", transitions = P, rewards = R) 34 | #' env$reset() 35 | #' env$step(1L) 36 | NULL 37 | 38 | MdpEnvironment = R6::R6Class("MdpEnvironment", 39 | inherit = Environment, 40 | 41 | public = list( 42 | action.space = NULL, 43 | actions = NULL, 44 | initial.state = NULL, 45 | n.actions = NULL, 46 | n.states = NULL, 47 | rewards = NULL, 48 | state.space = NULL, 49 | states = NULL, 50 | terminal.states = NULL, 51 | transitions = NULL, 52 | 53 | initialize = function(transitions, rewards, initial.state, ...) 
{ 54 | checkmate::assertArray(transitions, any.missing = FALSE, d = 3L) 55 | checkmate::assertArray(rewards, any.missing = FALSE, d = 2L) 56 | 57 | self$state.space = "Discrete" 58 | self$action.space = "Discrete" 59 | self$n.actions = dim(transitions)[3] 60 | self$n.states = dim(transitions)[1] 61 | self$actions = seq_len(self$n.actions) - 1L 62 | self$states = seq_len(self$n.states) - 1L 63 | self$transitions = transitions 64 | self$rewards = rewards 65 | terminal.states = apply(transitions, 3L, function(x) diag(x)) 66 | self$terminal.states = which(apply(terminal.states, 1L, function(x) all(x == 1L))) - 1L 67 | if (length(self$terminal.states) == 0) { 68 | warning("There are no terminal states in the MDP!") 69 | self$terminal.states = -1L 70 | } 71 | if (missing(initial.state)) { 72 | self$initial.state = setdiff(self$states, self$terminal.states) 73 | } else { 74 | checkmate::assertIntegerish(initial.state, upper = self$n.states - 1L) 75 | self$initial.state = initial.state 76 | } 77 | 78 | step_ = function(env, action) { 79 | # if (is.character(action)) { 80 | # action = self$action.names[action] 81 | # } 82 | reward = self$rewards[self$state + 1L, action + 1L] # use old state here! 83 | state = sample(self$states, size = 1L, 84 | prob = self$transitions[self$state + 1L, , action + 1L]) 85 | if (state %in% self$terminal.states) { 86 | done = TRUE 87 | } else { 88 | done = FALSE 89 | } 90 | list(state, reward, done) 91 | } 92 | 93 | reset_ = function(env) { 94 | state = ifelse(length(self$initial.state) > 1L, 95 | sample(self$initial.state, size = 1L), self$initial.state) 96 | state 97 | } 98 | # call initialize of superclass with mdp step and reset function 99 | super$initialize(step_, reset_, ...) 100 | } 101 | ) 102 | ) 103 | -------------------------------------------------------------------------------- /docs/authors.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Authors • reinforcelearn 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 35 | 36 | 37 | 38 | 39 | 40 |
Markus Dumke. Author, maintainer.
Site built with pkgdown.
    122 | 123 | 124 | 125 | -------------------------------------------------------------------------------- /R/environment_gym.R: -------------------------------------------------------------------------------- 1 | #' Gym Environment 2 | #' 3 | #' Reinforcement learning environment from OpenAI Gym. 4 | #' 5 | #' For available gym environments take a look at https://gym.openai.com/envs. 6 | #' 7 | #' @section Usage: 8 | #' `makeEnvironment("gym", gym.name, ...)` 9 | #' 10 | #' @section Installation: 11 | #' For installation of the python package `gym` see 12 | #' https://github.com/openai/gym#installation. 13 | #' Then install the R package `reticulate`. 14 | #' 15 | #' @param gym.name \[`character(1)`] \cr 16 | #' Name of gym environment, e.g. \code{"CartPole-v0"}. 17 | #' @param ... \[`any`] \cr Arguments passed on to [makeEnvironment]. 18 | #' 19 | #' @md 20 | #' 21 | #' @section Methods: 22 | #' * `$close()` 23 | #' Close visualization window. 24 | #' 25 | #' @name GymEnvironment 26 | #' @inheritSection Environment Methods 27 | #' @export 28 | #' 29 | #' @examples 30 | #' \dontrun{ 31 | #' # Create an OpenAI Gym environment. 32 | #' # Make sure you have Python, gym and reticulate installed. 33 | #' env = makeEnvironment("gym", gym.name = "MountainCar-v0") 34 | #' env$reset() 35 | #' env$close() 36 | #' } 37 | NULL 38 | 39 | GymEnvironment = R6::R6Class("GymEnvironment", 40 | inherit = Environment, 41 | 42 | public = list( 43 | gym.env = NULL, 44 | gym.name = NULL, 45 | 46 | action.space = NULL, 47 | actions = NULL, 48 | action.shape = NULL, 49 | n.actions = NULL, 50 | action.space.bounds = NULL, 51 | 52 | state.space = NULL, 53 | state.shape = NULL, 54 | states = NULL, 55 | n.states = NULL, 56 | state.space.bounds = NULL, 57 | 58 | close = function() { 59 | self$gym.env$close() 60 | }, 61 | 62 | initialize = function(gym.name, ...) { 63 | if (!requireNamespace("reticulate", quietly = TRUE)) { 64 | stop("Please install the reticulate package to use environments from OpenAI Gym. 65 | Also make sure you have the python package gym installed.", 66 | call. = FALSE) 67 | } 68 | checkmate::assertCharacter(gym.name, len = 1) 69 | self$gym.name = gym.name 70 | 71 | gym = reticulate::import("gym") 72 | self$gym.env = gym$make(gym.name) 73 | 74 | action.space.info = self$gym.env$action_space 75 | self$action.space = extractSpaceClass(action.space.info) 76 | 77 | state.space.info = self$gym.env$observation_space 78 | self$state.space = extractSpaceClass(state.space.info) 79 | 80 | if (self$action.space == "Discrete") { 81 | res = extractDiscreteInfo(action.space.info) 82 | self$n.actions = res$n 83 | self$actions = res$x 84 | } 85 | 86 | if (self$action.space == "Box") { 87 | res = extractBoxInfo(action.space.info) 88 | self$action.space.bounds = res$bounds 89 | self$action.shape = res$shape 90 | } 91 | 92 | if (self$state.space == "Discrete") { 93 | res = extractDiscreteInfo(state.space.info) 94 | self$n.actions = res$n 95 | self$actions = res$x 96 | } 97 | 98 | if (self$state.space == "Box") { 99 | res = extractBoxInfo(state.space.info) 100 | self$state.space.bounds = res$bounds 101 | self$state.shape = res$shape 102 | } 103 | 104 | step_ = function(self, action) { 105 | res = self$gym.env$step(action) 106 | res[1:3] 107 | } 108 | 109 | reset_ = function(self) { 110 | state = self$gym.env$reset() 111 | state 112 | } 113 | 114 | visualize_ = function(self) { 115 | self$gym.env$render() 116 | } 117 | 118 | super$initialize(step_, reset_, visualize_, ...) 
119 | } 120 | ) 121 | ) 122 | 123 | 124 | extractDiscreteInfo = function(info) { 125 | n = info$n 126 | x = seq(0, n - 1) 127 | list(n = n, x = x) 128 | } 129 | 130 | extractBoxInfo = function(info) { 131 | list(bounds = list(info$low, info$high), shape = info$shape[[1]]) # does [[1]] work in all cases? 132 | } 133 | 134 | extractSpaceClass = function(info) { 135 | sub(".*\\.", "", class(info)[1]) 136 | } 137 | -------------------------------------------------------------------------------- /R/experience_replay.R: -------------------------------------------------------------------------------- 1 | #' Experience Replay 2 | #' 3 | #' Create replay memory for experience replay. 4 | #' 5 | #' Sampling from replay memory will be uniform. 6 | #' 7 | #' @param size \[`integer(1)`] \cr Size of replay memory. 8 | #' @param batch.size \[`integer(1)`] \cr Batch size. 9 | #' 10 | #' @return \[`list(size, batch.size)`] 11 | #' This list can then be passed onto [makeAgent], which will construct the 12 | #' replay memory accordingly. 13 | #' 14 | #' @md 15 | #' @aliases experience.replay, replay.memory 16 | #' @export 17 | #' 18 | #' @examples 19 | #' memory = makeReplayMemory(size = 100L, batch.size = 16L) 20 | makeReplayMemory = function(size = 100L, batch.size = 16L) { # add arguments for priorization 21 | checkmate::assertInt(size, lower = 1) 22 | checkmate::assertInt(batch.size, lower = 1, upper = size) 23 | x = list(size = size, batch.size = batch.size) 24 | class(x) = "ReplayMemory" 25 | x 26 | } 27 | 28 | ReplayMemory = R6::R6Class("ReplayMemory", 29 | public = list( 30 | memory = NULL, 31 | size = NULL, 32 | batch.size = NULL, 33 | index = 0L, 34 | index.full = 0L, 35 | 36 | # fixme allow growing replay memory? 37 | initialize = function(size, batch.size) { 38 | self$size = size 39 | self$batch.size = batch.size 40 | self$memory = vector("list", length = self$size) 41 | }, 42 | 43 | # # initialize following policy 44 | # initializeMemory = function(env, policy) { 45 | # for (i in seq_len(self$size)) { 46 | # action = policy$sampleAction() 47 | # env$step(action) 48 | # data = list(state = preprocessState(envir$previous.state), action = action, 49 | # reward = envir$reward, next.state = preprocessState(envir$state)) 50 | # } 51 | # }, 52 | 53 | observe = function(state, action, reward, next.state) { 54 | self$index = self$index + 1L 55 | self$index.full = self$index.full + 1L 56 | self$index.full = min(self$size, self$index.full) 57 | index = self$getReplacementIndex() 58 | obs = self$getReplayObservation(state, action, reward, next.state) 59 | self$add(obs, index) 60 | }, 61 | 62 | getReplayObservation = function(state, action, reward, next.state) { 63 | list(state = state, action = action, reward = reward, next.state = next.state) 64 | }, 65 | 66 | # e.g. 
oldest entry 67 | getReplacementIndex = function() { 68 | if (self$index > self$size) { 69 | self$index = 1L 70 | } 71 | self$index 72 | }, 73 | 74 | add = function(observation, index) { 75 | self$memory[[index]] = observation 76 | }, 77 | 78 | isFull = function(memory = self$memory) { 79 | # maybe it is enough to check the last entry 80 | full = !(any(purrr::map_lgl(memory, is.null))) 81 | full 82 | }, 83 | 84 | extract = function(batch, member, fun = lapply) { 85 | states = fun(batch, "[[", member) 86 | states 87 | }, 88 | 89 | # checkMemory = function(memory = self$memory, batch.size = self$batch.size) { 90 | # if (!self$isFull()) { 91 | # if (self$index < batch.size) { 92 | # return(FALSE) 93 | # } 94 | # } 95 | # }, 96 | 97 | sampleBatch = function(memory = self$memory[seq_len(self$index.full)], batch.size = self$batch.size) { 98 | if (length(memory) >= batch.size) { 99 | indices = self$getIndices(length(memory), batch.size) 100 | batch = memory[indices] 101 | return(purrr::transpose(batch)) 102 | } else { 103 | message("Cannot sample from replay memory because batch size > number of non-empty entries in replay memory.") 104 | } 105 | }, 106 | 107 | getIndices = function(memory.size, batch.size) { 108 | indices = sample(seq_len(memory.size), size = batch.size) 109 | indices 110 | } 111 | ) 112 | ) 113 | 114 | # ideas: maybe replay memory in future not list but hash table / dictionary etc 115 | # data frame with list columns? 116 | # fixme allow dynamic change of replay memory length 117 | # store preprocessed state? 118 | -------------------------------------------------------------------------------- /docs/articles/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Articles • reinforcelearn 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 35 | 36 | 37 | 38 | 39 | 40 |
All vignettes
Site built with pkgdown.
    122 | 123 | 124 | 125 | -------------------------------------------------------------------------------- /docs/news/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | All news • reinforcelearn 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 35 | 36 | 37 | 38 | 39 | 40 |
    122 | 123 |
    124 |
    125 | 126 | 127 | 128 | -------------------------------------------------------------------------------- /R/policy.R: -------------------------------------------------------------------------------- 1 | #' Create policy. 2 | #' 3 | #' Reinforcement learning policies. 4 | #' 5 | #' @param class \[`character(1)`] \cr 6 | #' Class of policy. One of `c("random", "epsilon.greedy", "greedy", "softmax")`. 7 | #' @param ... \[`any`] \cr Optional named arguments passed on to the subclass. Alternatively 8 | #' these can be given using the `args` argument. 9 | #' @param args \[`list`] \cr Optional list of named arguments passed on to the 10 | #' subclass. The arguments in ... take precedence over values in this list. 11 | #' We strongly encourage you to use one or the other to pass arguments 12 | #' to the function but not both. 13 | #' 14 | #' @return \[`list(name, args)`] List with the name and optional args. 15 | #' This list can then be passed onto [makeAgent], which will construct the 16 | #' policy accordingly. 17 | #' 18 | #' @md 19 | #' @aliases Policy 20 | #' 21 | #' @section Policies: 22 | #' * [RandomPolicy] 23 | #' * [GreedyPolicy] 24 | #' * [EpsilonGreedyPolicy] 25 | #' * [SoftmaxPolicy] 26 | #' 27 | #' @export 28 | #' @examples 29 | #' policy = makePolicy("random") 30 | #' policy = makePolicy("epsilon.greedy", epsilon = 0.1) 31 | makePolicy = function(class = "random", args = list(), ...) { 32 | checkmate::assertChoice(class, 33 | c("random", "epsilon.greedy", "greedy", "softmax")) #, "gaussian")) 34 | checkmate::assertList(args, names = "unique") 35 | args = append(list(...), args) 36 | # remove duplicate entries in args list 37 | args = args[unique(names(args))] 38 | 39 | # fixme: check arguments of policy here 40 | x = list(name = class, args = args) 41 | class(x) = "Policy" 42 | x 43 | } 44 | 45 | 46 | Policy = R6::R6Class("Policy", 47 | public = list( 48 | sampleAction = function(policy) { 49 | action = sample(seq_along(policy), prob = policy, 50 | size = 1, replace = TRUE) - 1L 51 | action 52 | } 53 | ) 54 | ) 55 | 56 | #' Epsilon Greedy Policy 57 | #' 58 | #' @aliases GreedyPolicy 59 | #' @export 60 | #' @section Usage: 61 | #' \code{makePolicy("epsilon.greedy", epsilon = 0.1)} \cr 62 | #' \code{makePolicy("greedy")} 63 | #' 64 | #' @param epsilon [\code{numeric(1) in [0, 1]}] \cr 65 | #' Ratio of random exploration in epsilon-greedy action selection. 66 | #' 67 | #' @name EpsilonGreedyPolicy 68 | #' @examples 69 | #' policy = makePolicy("epsilon.greedy", epsilon = 0.1) 70 | NULL 71 | 72 | EpsilonGreedyPolicy = R6::R6Class("EpsilonGreedyPolicy", 73 | inherit = Policy, 74 | public = list( 75 | epsilon = NULL, 76 | getActionProbs = function(Q, n.actions) { # fixme: break ties 77 | greedy.action = nnet::which.is.max(Q) 78 | policy = matrix(0, nrow = 1, ncol = n.actions) 79 | policy[, greedy.action] = 1 - self$epsilon 80 | policy = policy + self$epsilon / n.actions 81 | policy 82 | }, 83 | initialize = function(epsilon = 0.1) { 84 | checkmate::assertNumber(epsilon, lower = 0, upper = 1) 85 | self$epsilon = epsilon 86 | } 87 | ) 88 | ) 89 | 90 | GreedyPolicy = R6::R6Class("GreedyPolicy", 91 | # inherit = EpsilonGreedyPolicy, 92 | public = list( 93 | getActionProbs = function(Q, n.actions) { 94 | greedy.action = nnet::which.is.max(Q) # this is duplicate code! 
95 | policy = matrix(0, nrow = 1, ncol = n.actions) 96 | policy[, greedy.action] = 1 97 | policy 98 | } 99 | ) 100 | ) 101 | 102 | #' Random Policy 103 | #' 104 | #' @export 105 | #' @section Usage: 106 | #' \code{makePolicy("random")} 107 | #' 108 | #' @name RandomPolicy 109 | #' @examples 110 | #' pol = makePolicy("random") 111 | NULL 112 | 113 | RandomPolicy = R6::R6Class("RandomPolicy", 114 | inherit = Policy, 115 | public = list( 116 | getActionProbs = function(Q, n.actions) { 117 | policy = matrix(1 / n.actions, nrow = 1, ncol = n.actions) 118 | policy 119 | } 120 | ) 121 | ) 122 | 123 | # GaussianPolicy = R6::R6Class("GaussianPolicy", 124 | # inherit = Policy, 125 | # public = list( 126 | # sampleAction = function(mean, sd) { 127 | # rnorm(1L, mean, sd) 128 | # } 129 | # ) 130 | # ) 131 | 132 | #' Softmax Policy 133 | #' 134 | #' @export 135 | #' @section Usage: 136 | #' \code{makePolicy("softmax")} 137 | #' 138 | #' @name SoftmaxPolicy 139 | #' @examples 140 | #' pol = makePolicy("softmax") 141 | NULL 142 | 143 | SoftmaxPolicy = R6::R6Class("SoftmaxPolicy", 144 | inherit = Policy, 145 | public = list( 146 | getActionProbs = function(Q, n.actions) { 147 | policy = exp(Q) / rowSums(exp(Q)) 148 | policy 149 | } 150 | ) 151 | ) 152 | -------------------------------------------------------------------------------- /man/gridworld.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/environment_gridworld.R 3 | \name{Gridworld} 4 | \alias{Gridworld} 5 | \title{Gridworld} 6 | \arguments{ 7 | \item{shape}{[\code{integer(2)}] \cr 8 | Shape of the gridworld (number of rows x number of columns).} 9 | 10 | \item{goal.states}{[\code{integer}] \cr 11 | Goal states in the gridworld.} 12 | 13 | \item{cliff.states}{[\code{integer}] \cr 14 | Cliff states in the gridworld.} 15 | 16 | \item{reward.step}{[\code{integer(1)}] \cr 17 | Reward for taking a step.} 18 | 19 | \item{cliff.transition.states}{[\code{integer}] \cr 20 | States to which the environment transitions if stepping into the cliff. 21 | If it is a vector, all states will have equal probability. 22 | Only used when \code{cliff.transition.done == FALSE}, 23 | else specify the \code{initial.state} argument.} 24 | 25 | \item{reward.cliff}{[\code{integer(1)}] \cr 26 | Reward for taking a step in the cliff state.} 27 | 28 | \item{diagonal.moves}{[\code{logical(1)}] \cr 29 | Should diagonal moves be allowed?} 30 | 31 | \item{wind}{[\code{integer}] \cr 32 | Strength of the upward wind in each cell.} 33 | 34 | \item{cliff.transition.done}{[\code{logical(1)}] \cr 35 | Should the episode end after stepping into the cliff?} 36 | 37 | \item{stochasticity}{[\code{numeric(1)}] \cr 38 | Probability of random transition to any of the neighboring states when taking any action.} 39 | 40 | \item{...}{[\code{any}] \cr Arguments passed on to \link{makeEnvironment}.} 41 | } 42 | \description{ 43 | Creates gridworld environments. 44 | } 45 | \details{ 46 | A gridworld is an episodic navigation task, the goal is to get from start state to goal state. 47 | 48 | Possible actions include going left, right, up or down. If \code{diagonal.moves = TRUE} diagonal 49 | moves are also possible, leftup, leftdown, rightup and rightdown. 50 | 51 | When stepping into a cliff state you get a reward of \code{reward.cliff}, 52 | usually a high negative reward and transition to a state specified by \code{cliff.transition.states}. 
53 | 54 | In each column a deterministic wind specified via \code{wind} pushes you up a specific number of 55 | grid cells (for the next action). 56 | 57 | A stochastic gridworld is a gridworld where with probability \code{stochasticity} the next state 58 | is chosen at random from all neighbor states independent of the actual action. 59 | 60 | If an action would take you off the grid, the new state is the nearest cell inside the grid. 61 | For each step you get a reward of \code{reward.step}, until you reach a goal state, 62 | then the episode is done. 63 | 64 | States are enumerated row-wise and numeration starts with 0. 65 | Here is an example 4x4 grid: 66 | \tabular{rrrr}{ 67 | 0 \tab 1 \tab 2 \tab 3 \cr 68 | 4 \tab 5 \tab 6 \tab 7 \cr 69 | 8 \tab 9 \tab 10 \tab 11 \cr 70 | 12 \tab 13 \tab 14 \tab 15 \cr 71 | } 72 | So a board position could look like this (G: goal state, x: current state, C: cliff state): 73 | \tabular{rrrr}{ 74 | G \tab o \tab o \tab o \cr 75 | o \tab o \tab o \tab o \cr 76 | o \tab x \tab o \tab o \cr 77 | o \tab o \tab o \tab C \cr 78 | } 79 | } 80 | \section{Usage}{ 81 | 82 | \code{makeEnvironment("gridworld", shape = NULL, goal.states = NULL, cliff.states = NULL, reward.step = -1, reward.cliff = -100, diagonal.moves = FALSE, wind = rep(0, shape[2]), cliff.transition.states = NULL, cliff.transition.done = FALSE, stochasticity = 0, ...)} 83 | } 84 | 85 | \section{Methods}{ 86 | 87 | \itemize{ 88 | \item \code{$step(action)} \cr 89 | Take action in environment. 90 | Returns a list with \code{state}, \code{reward}, \code{done}. 91 | \item \code{$reset()} \cr 92 | Resets the \code{done} flag of the environment and returns an initial state. 93 | Useful when starting a new episode. 94 | \item \code{$visualize()} \cr 95 | Visualizes the environment (if there is a visualization function). 96 | } 97 | } 98 | 99 | \examples{ 100 | # Gridworld Environment (Sutton & Barto Example 4.1) 101 | env1 = makeEnvironment("gridworld", shape = c(4L, 4L), goal.states = 0L, 102 | initial.state = 15L) 103 | env1$reset() 104 | env1$visualize() 105 | env1$step(0L) 106 | env1$visualize() 107 | 108 | # Windy Gridworld (Sutton & Barto Example 6.5) 109 | env2 = makeEnvironment("gridworld", shape = c(7, 10), goal.states = 37L, 110 | reward.step = -1, wind = c(0, 0, 0, 1, 1, 1, 2, 2, 1, 0), 111 | initial.state = 30L) 112 | 113 | # Cliff Walking (Sutton & Barto Example 6.6) 114 | env3 = makeEnvironment("gridworld", shape = c(4, 12), goal.states = 47L, 115 | cliff.states = 37:46, reward.step = -1, reward.cliff = -100, 116 | cliff.transition.states = 36L, initial.state = 36L) 117 | } 118 | -------------------------------------------------------------------------------- /benchmark/benchmark_windy_gridworld.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Benchmark Algorithms on Windy Gridworld Task" 3 | author: "Markus Dumke" 4 | date: "`r Sys.Date()`" 5 | output: github_document 6 | --- 7 | 8 | ```{r setup, include = FALSE} 9 | knitr::opts_chunk$set(comment = "#>", collapse = FALSE, message = FALSE) 10 | knitr::opts_chunk$set(fig.path = 'Images/', eval = TRUE, cache = FALSE, 11 | size = "footnotesize", fig.asp = 0.618, fig.width = 4.5, fig.align = "center", 12 | message = FALSE, comment = "#>", collapse = TRUE, echo = TRUE) 13 | ``` 14 | 15 | 16 | ```{r} 17 | library(reinforcelearn) 18 | env = makeEnvironment("windy.gridworld") 19 | ``` 20 | 21 | The optimal solution is 15 steps. 
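A quick sanity check, not part of the original benchmark: the preset windy gridworld is the 7 x 10 grid with four moves, and the agents below size their value tables from the `n.states` and `n.actions` attributes printed here.

```{r}
# Task dimensions the value functions below are built on.
c(n.states = env$n.states, n.actions = env$n.actions)
```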
22 | 23 | ## Simple Q-Learning 24 | 25 | ```{r} 26 | policy = makePolicy("epsilon.greedy", epsilon = 0.1) 27 | agent = makeAgent(policy, "table", "qlearning") 28 | 29 | res = interact(env, agent, n.episodes = 500L) 30 | ``` 31 | 32 | ```{r qlearning_windygrid, echo = FALSE, fig.align = "center"} 33 | library(ggplot2) 34 | df = data.frame(episode = seq_along(res$steps), 35 | steps = res$steps) 36 | 37 | ggplot(df, aes(episode, steps), col = "brown1") + 38 | geom_point(alpha = 0.2) + 39 | theme_bw() + 40 | labs( 41 | title = "Q-Learning", 42 | x = "Episode", 43 | y = "Steps per episode" 44 | ) + 45 | coord_cartesian(ylim = c(0, 200)) + 46 | geom_smooth(se = FALSE, size = 1) + 47 | geom_hline(yintercept = 15, size = 1, col = "black", lty = 2) 48 | ``` 49 | 50 | ## Q-Learning with Eligibility Traces 51 | 52 | ```{r} 53 | env$resetEverything() 54 | policy = makePolicy("epsilon.greedy", epsilon = 0.1) 55 | alg = makeAlgorithm("qlearning", lambda = 0.8, traces = "accumulate") 56 | agent = makeAgent(policy, "table", alg) 57 | 58 | res = interact(env, agent, n.episodes = 500L) 59 | ``` 60 | 61 | ```{r qlearning_windygrid_elig, echo = FALSE, fig.align = "center"} 62 | library(ggplot2) 63 | df = data.frame(episode = seq_along(res$steps), 64 | steps = res$steps) 65 | 66 | ggplot(df, aes(episode, steps), col = "brown1") + 67 | geom_point(alpha = 0.2) + 68 | theme_bw() + 69 | labs( 70 | title = "Q-Learning", 71 | subtitle = "Eligibility traces", 72 | x = "Episode", 73 | y = "Steps per episode" 74 | ) + 75 | coord_cartesian(ylim = c(0, 200)) + 76 | geom_smooth(se = FALSE, size = 1) + 77 | geom_hline(yintercept = 15, size = 1, col = "black", lty = 2) 78 | ``` 79 | 80 | ## Q-Learning with Experience replay 81 | 82 | ```{r} 83 | env$resetEverything() 84 | policy = makePolicy("epsilon.greedy", epsilon = 0.1) 85 | mem = makeReplayMemory(size = 10L, batch.size = 10L) 86 | agent = makeAgent(policy, "table", "qlearning", replay.memory = mem) 87 | 88 | res = interact(env, agent, n.episodes = 500L) 89 | ``` 90 | 91 | ```{r qlearning_windygrid_expreplay, echo = FALSE, fig.align = "center"} 92 | library(ggplot2) 93 | df = data.frame(episode = seq_along(res$steps), 94 | steps = res$steps) 95 | 96 | ggplot(df, aes(episode, steps), col = "brown1") + 97 | geom_point(alpha = 0.2) + 98 | theme_bw() + 99 | labs( 100 | title = "Q-Learning", 101 | subtitle = "Experience replay", 102 | x = "Episode", 103 | y = "Steps per episode" 104 | ) + 105 | coord_cartesian(ylim = c(0, 200)) + 106 | geom_smooth(se = FALSE, size = 1) + 107 | geom_hline(yintercept = 15, size = 1, col = "black", lty = 2) 108 | ``` 109 | 110 | ## Q-Learning with neural network and experience replay 111 | 112 | ```{r} 113 | env$resetEverything() 114 | library(keras) 115 | model = keras_model_sequential() %>% 116 | layer_dense(units = env$n.actions, activation = "linear", 117 | input_shape = c(env$n.states), kernel_initializer = initializer_zeros(), 118 | use_bias = FALSE) %>% 119 | compile(loss = "mae", optimizer = optimizer_sgd(lr = 1)) 120 | mem = makeReplayMemory(size = 2L, batch.size = 2L) 121 | val = makeValueFunction("neural.network", model = model) 122 | policy = makePolicy("epsilon.greedy", epsilon = 0.1) 123 | preprocess = function(x) to_categorical(x, num_classes = env$n.states) 124 | agent = makeAgent(policy, val, "qlearning", 125 | preprocess = preprocess, replay.memory = mem) 126 | 127 | res = interact(env, agent, n.episodes = 500L) 128 | ``` 129 | 130 | ```{r qlearning_windygrid_neuralnetwork, echo = FALSE, fig.align = "center"} 131 | 
library(ggplot2) 132 | df = data.frame(episode = seq_along(res$steps), 133 | steps = res$steps) 134 | 135 | ggplot(df, aes(episode, steps), col = "brown1") + 136 | geom_point(alpha = 0.2) + 137 | theme_bw() + 138 | labs( 139 | title = "Q-Learning", 140 | subtitle = "Experience replay and neural network", 141 | x = "Episode", 142 | y = "Steps per episode" 143 | ) + 144 | coord_cartesian(ylim = c(0, 200)) + 145 | geom_smooth(se = FALSE, size = 1) + 146 | geom_hline(yintercept = 15, size = 1, col = "black", lty = 2) 147 | ``` 148 | -------------------------------------------------------------------------------- /docs/LICENSE.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | License • reinforcelearn 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 35 | 36 | 37 | 38 | 39 | 40 |
    YEAR: 2017
    100 | COPYRIGHT HOLDER: Markus Dumke
    101 | 
    102 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
    103 | 
    104 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
    105 | 
    106 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
    107 | 
    125 | 126 | 127 | 128 | -------------------------------------------------------------------------------- /R/interact.R: -------------------------------------------------------------------------------- 1 | #' Interaction between agent and environment. 2 | #' 3 | #' Run interaction between agent and environment for specified number of steps 4 | #' or episodes. 5 | #' 6 | #' @param env \[`Environment`] \cr Reinforcement learning environment created by [makeEnvironment]. 7 | #' @param agent \[`Agent`] \cr Agent created by [makeAgent]. 8 | #' @param n.steps \[`integer(1)`] \cr Number of steps to run. 9 | #' @param n.episodes \[`integer(1)`] \cr Number of episodes to run. 10 | #' @param max.steps.per.episode \[`integer(1)`] \cr Maximal number of steps allowed per episode. 11 | #' @param learn \[`logical(1)`] \cr Should the agent learn? 12 | #' @param visualize \[`logical(1)`] \cr Visualize the interaction between agent and environment? 13 | #' 14 | #' @return \[`list`] Return and number of steps per episode. 15 | #' 16 | #' @md 17 | #' 18 | #' @export 19 | #' @examples 20 | #' env = makeEnvironment("windy.gridworld") 21 | #' agent = makeAgent("softmax", "table", "qlearning") 22 | #' interact(env, agent, n.episodes = 10L) 23 | interact = function(env, agent, n.steps = Inf, n.episodes = Inf, 24 | max.steps.per.episode = Inf, learn = TRUE, visualize = FALSE) { 25 | 26 | checkmate::assertClass(env, "Environment") 27 | checkmate::assertClass(agent, "Agent") 28 | if (!is.infinite(n.steps)) checkmate::assertInt(n.steps, lower = 1) 29 | if (!is.infinite(n.episodes)) checkmate::assertInt(n.episodes, lower = 1) 30 | if (!is.infinite(max.steps.per.episode)) checkmate::assertInt(max.steps.per.episode, lower = 1) 31 | checkmate::assertFlag(learn) 32 | checkmate::assertFlag(visualize) 33 | 34 | # one of steps / episodes must be finite! 35 | if (is.infinite(n.steps) && is.infinite(n.episodes)) { 36 | stop("Specify finite number of steps or finite number of episodes!") 37 | } 38 | 39 | # preallocation if number of episodes | steps is known in advance else append to list 40 | if (n.episodes < Inf) { 41 | episode.returns = rep(NA_real_, n.episodes) 42 | } else { 43 | episode.returns = vector(mode = "double") 44 | } 45 | if (n.episodes < Inf) { 46 | episode.steps = rep(NA_integer_, n.episodes) 47 | } else { 48 | episode.steps = vector(mode = "integer") 49 | } 50 | 51 | # index to fill in 52 | episode = 0L 53 | 54 | # get episode | step number of when to stop 55 | stop.step = env$n.step + n.steps 56 | stop.episode = env$episode + n.episodes 57 | 58 | # # check if environment has been resetted, if not reset else get current state 59 | # if (is.null(env$state)) { 60 | # message("Reset environment.") 61 | # state = env$reset() 62 | # if (visualize) { 63 | # env$visualize() 64 | # } 65 | # } else { 66 | state = env$state 67 | #} 68 | 69 | agent$n.actions = env$n.actions 70 | 71 | if (agent$initialized == FALSE) { 72 | agent$init(env) # if e.g. 
value fun has not been initialized do this here 73 | agent$initialized = TRUE 74 | } 75 | 76 | while (TRUE) { 77 | # print(paste0("episode: ", env$episode, "; step: ", env$n.step)) 78 | # # agent$observeBeforeAct() # observe before act 79 | action = agent$act(state) # fixme: store action also in agent attribute 80 | res = env$step(action) 81 | 82 | if (visualize) { 83 | env$visualize() 84 | } 85 | 86 | # # keep track of visited states, actions, rewards 87 | # agent$history = append(agent$history, list(list(state = state, action = action, 88 | # reward = res$reward, episode = env$episode + 1L))) 89 | 90 | # observe: e.g. add observation to replay memory 91 | agent$observe(state, action, res$reward, res$state, env) 92 | 93 | # optional learning (check whether to learn maybe as agent method) 94 | if (learn) { 95 | #browser() 96 | agent$learn(env, learn) 97 | } 98 | 99 | state = res$state # set state to next state for new iteration 100 | 101 | # when episode is finished print out information and reset environment 102 | if (res$done || env$episode.step == max.steps.per.episode) { 103 | if (!res$done) { 104 | env$episode = env$episode + 1L 105 | } 106 | message(paste("Episode", env$episode, "finished after", 107 | env$episode.step, "steps with a return of", env$episode.return)) # let this be customizable by having his in a function argument 108 | episode = episode + 1L 109 | episode.returns[episode] = env$episode.return 110 | episode.steps[episode] = env$episode.step 111 | state = env$reset() 112 | # if (visualize) { 113 | # env$visualize() 114 | # } 115 | agent$reset() 116 | } 117 | 118 | # stop criteria 119 | if (env$n.step == stop.step || env$episode == stop.episode) { 120 | break 121 | } 122 | } 123 | # return information about returns, steps 124 | list(returns = episode.returns, steps = episode.steps) # return history 125 | } 126 | # fixme: logging 127 | # fixme: control when to learn 128 | # fixme: print out average return of last n episodes ... 129 | # fixme: maybe return training time, history ... 130 | # make message after done configurable as function argument 131 | -------------------------------------------------------------------------------- /R/tiles.R: -------------------------------------------------------------------------------- 1 | #' Tile Coding 2 | #' 3 | #' Implementation of Sutton's tile coding software version 3. 4 | #' 5 | #' @param iht \[`IHT`] \cr A hash table created with `iht`. 6 | #' @param n.tilings \[`integer(1)`] \cr Number of tilings. 7 | #' @param state \[`vector(2)`] \cr A two-dimensional state observation. 8 | #' Make sure to scale the observation to unit variance before. 9 | #' @param action \[`integer(1)`] \cr Optional: If supplied the action space 10 | #' will also be tiled. All distinct actions will result in different tile numbers. 11 | #' 12 | #' @return `iht` creates a hash table, which can then be passed on to `tiles`. 13 | #' `tiles` returns an integer vector of size `n.tilings` with the active tile numbers. 14 | #' 15 | #' @md 16 | #' 17 | #' @details 18 | #' Tile coding is a way of representing the values of a vector of continuous variables as a large 19 | #' binary vector with few 1s and many 0s. The binary vector is not represented explicitly, 20 | #' but as a list of the components that are 1s. The main step is to partition, or tile, 21 | #' the continuous space multiple times and select one tile from each tiling, that corresponding 22 | #' the the vector's value. 
Each tile is converted to an element in the big binary vector, 23 | #' and the list of the tile (element) numbers is returned as the representation of the vector's value. 24 | #' Tile coding is recommended as a way of applying online learning methods to domains with continuous 25 | #' state or action variables. \[copied from manual] 26 | #' 27 | #' See detailed manual on the web. 28 | #' In comparison to the Python implementation indices start with 1 instead of 0. The hash table is 29 | #' implemented as an environment, which is an attribute of an R6 class. 30 | #' 31 | #' Make sure that the size of the hash table is large enough, else an error will be triggered, 32 | #' when trying to assign a value to a full hash table. 33 | #' 34 | #' @references Sutton and Barto (Book draft 2017): Reinforcement Learning: An Introduction 35 | #' @rdname tilecoding 36 | #' @export 37 | #' @examples 38 | #' # Create hash table 39 | #' hash = iht(1024) 40 | #' 41 | #' # Partition state space using 8 tilings 42 | #' tiles(hash, n.tilings = 8, state = c(3.6, 7.21)) 43 | #' tiles(hash, n.tilings = 8, state = c(3.7, 7.21)) 44 | #' tiles(hash, n.tilings = 8, state = c(4, 7)) 45 | #' tiles(hash, n.tilings = 8, state = c(- 37.2, 7)) 46 | #' 47 | tiles = function(iht, n.tilings, state, action = integer(0)) { 48 | checkmate::assertClass(iht, "IHT") 49 | checkmate::assertInt(n.tilings) 50 | checkmate::assertVector(state) 51 | checkmate::assertIntegerish(action, max.len = 1) 52 | 53 | qfloats = floor(state * n.tilings) 54 | active.tiles = rep(0, n.tilings) 55 | coords = rep(0, length(state) + 1) 56 | 57 | for (tiling in seq_len(n.tilings)) { 58 | tiling = tiling - 1 59 | tiling2 = tiling * 2 60 | coords[1] = tiling 61 | b = tiling 62 | for (q in seq_along(qfloats)) { 63 | coords[q + 1] = (qfloats[q] + b) %/% n.tilings 64 | b = b + tiling2 65 | } 66 | coords = append(coords, action) 67 | active.tiles[tiling + 1] = hashcoords(paste(coords, collapse = ""), iht) 68 | } 69 | 70 | return(active.tiles) 71 | } 72 | 73 | hashcoords = function(coords, iht) { 74 | iht$add2Env(coords) 75 | iht$checkFull() 76 | iht$getIndex(coords) 77 | } 78 | 79 | #' @rdname tilecoding 80 | #' @param max.size \[`integer(1)`] \cr Maximal size of hash table. 81 | #' @export 82 | #' @md 83 | iht = function(max.size) { 84 | checkmate::assertInt(max.size) 85 | IHTClass$new(max.size) 86 | } 87 | 88 | IHTClass = R6::R6Class("IHT", 89 | public = list( 90 | i = 0, 91 | max.size = NULL, 92 | e = NULL, 93 | 94 | initialize = function(max.size) { 95 | self$max.size = max.size 96 | self$e = new.env(size = max.size) 97 | }, 98 | 99 | checkFull = function() { 100 | if (length(self$e) > self$max.size) { 101 | stop("Tile Coding failed because hash table IHT is full!") 102 | } 103 | }, 104 | 105 | add2Env = function(coords) { 106 | if (!exists(coords, envir = self$e, inherits = FALSE)) { 107 | self$i = self$i + 1 108 | self$checkFull() 109 | self$e[[coords]] = self$i 110 | } 111 | }, 112 | 113 | getIndex = function(coords) { 114 | return(self$e[[coords]]) 115 | } 116 | ) 117 | ) 118 | 119 | #' Make n hot vector. 120 | #' 121 | #' @param x \[`integer`] \cr Which features are active? 122 | #' @param len \[`integer(1)`] \cr Length of the feature vector. 123 | #' @param out \[`character(1)`] \cr Format of the output. Can be a vector or a matrix. 124 | #' 125 | #' @return \[`matrix(1, len)`] A one-row matrix with `len` columns with every 126 | #' entry 0 except the columns specified by `x` which are 1. 
127 | #' 128 | #' @md 129 | #' 130 | #' @export 131 | #' @examples 132 | #' nHot(c(1, 3), 5) 133 | #' nHot(c(1, 3), 5, out = "vector") 134 | nHot = function(x, len, out = "matrix") { 135 | checkmate::assertIntegerish(x, max.len = len) 136 | checkmate::assertInt(len) 137 | if (out == "matrix") { 138 | m = matrix(rep(0, len), nrow = 1) 139 | m[1, x] = 1 140 | } else { 141 | m = rep(0, len) 142 | m[x] = 1 143 | } 144 | m 145 | } 146 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | ```{r, echo = FALSE} 6 | knitr::opts_chunk$set( 7 | collapse = TRUE, 8 | comment = "#>", 9 | message = FALSE, 10 | fig.path = "README-" 11 | ) 12 | ``` 13 | 14 | # Reinforcement Learning in R 15 | 16 | [![Travis-CI Build Status](https://travis-ci.org/markusdumke/reinforcelearn.svg?branch=master)](https://travis-ci.org/markusdumke/reinforcelearn) 17 | [![CRAN_Status_Badge](http://www.r-pkg.org/badges/version/reinforcelearn)](https://cran.r-project.org/package=reinforcelearn) 18 | [![Coverage Status](https://img.shields.io/codecov/c/github/markusdumke/reinforcelearn/master.svg?maxAge=600)](https://codecov.io/github/markusdumke/reinforcelearn?branch=master) 19 | 20 | ```{r, include = FALSE} 21 | writeLines(capture.output(devtools::session_info()), "session_info.txt") 22 | ``` 23 | 24 | 25 | ### Documentation 26 | 27 | [Website](https://markusdumke.github.io/reinforcelearn) 28 | 29 | ---- 30 | 31 | ### Installation 32 | 33 | ```{r, eval = FALSE} 34 | # Install from CRAN. 35 | install.packages("reinforcelearn") 36 | 37 | # Install development version from github. 38 | devtools::install_github("markusdumke/reinforcelearn") 39 | ``` 40 | 41 | ---- 42 | 43 | ### Get started 44 | 45 | Reinforcement Learning with the package `reinforcelearn` is as easy as 46 | 47 | ```{r} 48 | library(reinforcelearn) 49 | 50 | env = makeEnvironment("windy.gridworld") 51 | agent = makeAgent("softmax", "table", "qlearning") 52 | 53 | # Run interaction for 10 episodes. 54 | interact(env, agent, n.episodes = 10L) 55 | ``` 56 | 57 | ---- 58 | 59 | ### Environments 60 | 61 | With `makeEnvironment` you can create reinforcement learning environments. 62 | 63 | ```{r} 64 | # Create environment. 65 | step = function(self, action) { 66 | state = list(mean = action + rnorm(1), sd = runif(1)) 67 | reward = rnorm(1, state[[1]], state[[2]]) 68 | done = FALSE 69 | list(state, reward, done) 70 | } 71 | 72 | reset = function(self) { 73 | state = list(mean = 0, sd = 1) 74 | state 75 | } 76 | 77 | env = makeEnvironment("custom", step = step, reset = reset) 78 | ``` 79 | 80 | The environment is an `R6` class with a set of attributes and methods. 81 | You can interact with the environment via the `reset` and `step` method. 82 | 83 | ```{r} 84 | # Reset environment. 85 | env$reset() 86 | 87 | # Take action. 88 | env$step(100) 89 | ``` 90 | 91 | There are some predefined environment classes, e.g. `MDPEnvironment`, which allows you to create a Markov Decision Process by passing on state transition array and reward matrix, or `GymEnvironment`, where you can use toy problems from [OpenAI Gym](https://gym.openai.com/). 92 | 93 | ```{r, eval = FALSE} 94 | # Create a gym environment. 95 | # Make sure you have Python, gym and reticulate installed. 96 | env = makeEnvironment("gym", gym.name = "MountainCar-v0") 97 | 98 | # Take random actions for 200 steps. 
99 | env$reset() 100 | for (i in 1:200) { 101 | action = sample(0:2, 1) 102 | env$step(action) 103 | env$visualize() 104 | } 105 | env$close() 106 | ``` 107 | 108 | This should open a window showing a graphical visualization of the environment during interaction. 109 | 110 | For more details on how to create an environment have a look at the vignette: [Environments](https://markusdumke.github.io/reinforcelearn/articles/environments.html) 111 | 112 | ---- 113 | 114 | ### Agents 115 | 116 | With `makeAgent` you can set up a reinforcement learning agent to solve the environment, i.e. to find the best action in each time step. 117 | 118 | The first step is to set up the policy, which defines which action to choose. For example we could use a uniform random policy. 119 | 120 | ```{r} 121 | # Create the environment. 122 | env = makeEnvironment("windy.gridworld") 123 | 124 | # Create agent with uniform random policy. 125 | policy = makePolicy("random") 126 | agent = makeAgent(policy) 127 | 128 | # Run interaction for 10 steps. 129 | interact(env, agent, n.steps = 10L) 130 | ``` 131 | 132 | In this scenario the agent chooses all actions with equal probability and will not learn anything from the interaction. Usually we want the agent to be able to learn something. Value-based algorithms learn a value function from interaction with the environment and adjust the policy according to the value function. For example we could set up Q-Learning with a softmax policy. 133 | 134 | ```{r} 135 | # Create the environment. 136 | env = makeEnvironment("windy.gridworld") 137 | 138 | # Create qlearning agent with softmax policy and tabular value function. 139 | policy = makePolicy("softmax") 140 | values = makeValueFunction("table", n.states = env$n.states, n.actions = env$n.actions) 141 | algorithm = makeAlgorithm("qlearning") 142 | agent = makeAgent(policy, values, algorithm) 143 | 144 | # Run interaction for 10 steps. 145 | interact(env, agent, n.episodes = 10L) 146 | ``` 147 | 148 | ---- 149 | 150 | ### Vignettes 151 | 152 | Also have a look at the vignettes for further examples. 153 | 154 | - [Environments](https://markusdumke.github.io/reinforcelearn/articles/environments.html) 155 | - [Agents](https://markusdumke.github.io/reinforcelearn/articles/agents.html) 156 | 157 | ---- 158 | 159 | Logo is a modification of https://www.r-project.org/logo/. 160 | -------------------------------------------------------------------------------- /R/valuefunction.R: -------------------------------------------------------------------------------- 1 | #' Value Function Representation 2 | #' 3 | #' A representation of the value function. 4 | #' 5 | #' @param class \[`character(1)`] \cr Class of value function approximation. 6 | #' One of `c("table", "neural.network")`. 7 | #' @inheritParams makePolicy 8 | #' 9 | #' @return \[`list(name, args)`] List with the name and optional args. 10 | #' This list can then be passed onto [makeAgent], which will construct the 11 | #' value function accordingly. 12 | #' 13 | #' @md 14 | #' 15 | #' @section Representations: 16 | #' * [ValueTable] 17 | #' * [ValueNetwork] 18 | #' 19 | #' @export 20 | #' @examples 21 | #' val = makeValueFunction("table", n.states = 16L, n.actions = 4L) 22 | #' # If the number of states and actions is not supplied, the agent will try 23 | #' # to figure these out from the environment object during interaction. 24 | #' val = makeValueFunction("table") 25 | makeValueFunction = function(class, args = list(), ...) 
{ 26 | checkmate::assertChoice(class, c("table", "neural.network")) #, "keras.neural.network", "mxnet.neural.network")) 27 | # fixme: check arguments here 28 | checkmate::assertList(args, names = "unique") 29 | args = append(list(...), args) 30 | # remove duplicate entries in args list 31 | args = args[unique(names(args))] 32 | 33 | x = list(name = class, args = args) 34 | class(x) = "ValueFunction" 35 | x 36 | } 37 | # comment: this could also be used for policy params -> better name? 38 | 39 | 40 | #' Value Table 41 | #' 42 | #' Table representing the action value function Q. 43 | #' 44 | #' You can specify the shape of the value table. If omitted the agent will try 45 | #' to configure these automatically from the environment during interaction 46 | #' (therefore the environment needs to have a `n.states` and `n.actions` attribute). 47 | #' 48 | #' @section Usage: 49 | #' `makeValueFunction("table", n.states = NULL, n.actions = 1L, 50 | #' step.size = 0.1, initial.value = NULL)` 51 | #' 52 | #' @param n.states \[`integer(1)`] \cr Number of states (rows in the value function). 53 | #' @param n.actions \[`integer(1)`] \cr Number of actions (columns in the value function). 54 | #' @param step.size \[`numeric(1)`] \cr Step size (learning rate) for gradient descent update. 55 | #' 56 | #' @name ValueTable 57 | #' @aliases table 58 | #' @md 59 | #' 60 | #' @examples 61 | #' val = makeValueFunction("table", n.states = 20L, n.actions = 4L) 62 | NULL 63 | 64 | ValueTable = R6::R6Class("ValueTable", 65 | public = list( 66 | Q = NULL, 67 | step.size = NULL, 68 | 69 | # fixme: get number of states and actions automatically from environment 70 | # fixme: custom initializer, e.g. not to 0 71 | initialize = function(n.states = NULL, n.actions = 1L, step.size = 0.1, 72 | initial.value = NULL) { 73 | 74 | checkmate::assertInt(n.states, lower = 1) 75 | checkmate::assertInt(n.actions, lower = 1) 76 | checkmate::assertNumber(step.size, lower = 0) 77 | checkmate::assertMatrix(initial.value, null.ok = TRUE) 78 | 79 | # state or action value function 80 | if (!is.null(initial.value)) { 81 | self$Q = initial.value 82 | } else { 83 | self$Q = matrix(0, nrow = n.states, ncol = n.actions) 84 | } 85 | self$step.size = step.size 86 | }, 87 | 88 | predictQ = function(state) { 89 | self$Q[state + 1L, , drop = FALSE] 90 | }, 91 | 92 | # fixme: make this vectorised -> ok 93 | # caveat: states must be unique! 94 | train = function(state, target, step.size = self$step.size) { 95 | self$Q[state + 1L, ] = self$Q[state + 1L, ] + step.size * (target - self$Q[state + 1L, ]) # drop = FALSE ? 96 | }, 97 | 98 | # train with td error and eligibility traces 99 | trainWithError = function(eligibility, error, step.size = self$step.size) { 100 | self$Q = self$Q + step.size * error * eligibility 101 | }, 102 | 103 | processBatch = function(batch) { 104 | data = data.frame(state = unlist(batch[["state"]]), action = unlist(batch[["action"]]), 105 | reward = unlist(batch[["reward"]]), next.state = unlist(batch[["next.state"]])) 106 | data 107 | }, 108 | 109 | getWeights = function() { 110 | self$Q 111 | } 112 | ) 113 | ) 114 | 115 | #' Value Network 116 | #' 117 | #' Neural network representing the action value function Q. 118 | #' 119 | #' @section Usage: 120 | #' `makeValueFunction("neural.network", model)` 121 | #' 122 | #' @param model \[`keras model`] \cr A keras model. 123 | #' Make sure that the model has been compiled. 
124 | #' 125 | #' @name ValueNetwork 126 | #' @aliases neural.network 127 | #' @md 128 | #' 129 | #' @examples 130 | #' \dontrun{ 131 | #' library(keras) 132 | #' model = keras_model_sequential() 133 | #' model %>% layer_dense(20, input_shape = 10, activation = "relu") 134 | #' model %>% layer_dense(4, activation = "softmax") 135 | #' keras::compile(model, loss = "mae", optimizer = keras::optimizer_sgd(lr = 0.4)) 136 | #' 137 | #' val = makeValueFunction("neural.network", model = model) 138 | #' } 139 | NULL 140 | 141 | ValueNetwork = R6::R6Class("ValueNetwork", 142 | public = list( 143 | model = NULL, 144 | 145 | # keras model # fixme: add support for mxnet 146 | initialize = function(model) { 147 | checkmate::assertClass(model, "keras.models.Sequential") 148 | self$model = model 149 | }, 150 | 151 | predictQ = function(state) { 152 | predict(self$model, state) # another function? 153 | }, 154 | 155 | train = function(state, target) { # add ... argument to pass on arguments to fit 156 | keras::fit(self$model, state, target, verbose = 0L) 157 | }, 158 | 159 | processBatch = function(batch) { 160 | data = list( 161 | state = do.call(rbind, batch[["state"]]), # problematic for matrix with many columns, purrr::reduce 162 | action = unlist(batch[["action"]]), 163 | reward = unlist(batch[["reward"]]), 164 | next.state = purrr::reduce(batch[["next.state"]], rbind) 165 | ) 166 | data 167 | }, 168 | 169 | getWeights = function() { 170 | self$model %>% get_weights() 171 | } 172 | ) 173 | ) 174 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | Reinforcement Learning in R 3 | ======================================================================================= 4 | 5 | [![Travis-CI Build Status](https://travis-ci.org/markusdumke/reinforcelearn.svg?branch=master)](https://travis-ci.org/markusdumke/reinforcelearn) [![CRAN\_Status\_Badge](http://www.r-pkg.org/badges/version/reinforcelearn)](https://cran.r-project.org/package=reinforcelearn) [![Coverage Status](https://img.shields.io/codecov/c/github/markusdumke/reinforcelearn/master.svg?maxAge=600)](https://codecov.io/github/markusdumke/reinforcelearn?branch=master) 6 | 7 | WARNING: This package is not maintained anymore! 8 | 9 | ### Documentation 10 | 11 | [Website](https://markusdumke.github.io/reinforcelearn) 12 | 13 | ------------------------------------------------------------------------ 14 | 15 | ### Installation 16 | 17 | ``` r 18 | # Install from CRAN. 19 | install.packages("reinforcelearn") 20 | 21 | # Install development version from github. 22 | devtools::install_github("markusdumke/reinforcelearn") 23 | ``` 24 | 25 | ------------------------------------------------------------------------ 26 | 27 | ### Get started 28 | 29 | Reinforcement Learning with the package `reinforcelearn` is as easy as 30 | 31 | ``` r 32 | library(reinforcelearn) 33 | 34 | env = makeEnvironment("windy.gridworld") 35 | agent = makeAgent("softmax", "table", "qlearning") 36 | 37 | # Run interaction for 10 episodes. 38 | interact(env, agent, n.episodes = 10L) 39 | #> $returns 40 | #> [1] -3244 -2335 -1734 -169 -879 -798 -216 -176 -699 -232 41 | #> 42 | #> $steps 43 | #> [1] 3244 2335 1734 169 879 798 216 176 699 232 44 | ``` 45 | 46 | ------------------------------------------------------------------------ 47 | 48 | ### Environments 49 | 50 | With `makeEnvironment` you can create reinforcement learning environments. 
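Besides the fully custom environment built below, the predefined problems listed in `makeEnvironment`'s documentation can be created from their class name alone, e.g. (a quick sketch):

``` r
# Predefined environments only need their class name.
env = makeEnvironment("cliff.walking")
env = makeEnvironment("mountain.car")
```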
51 | 52 | ``` r 53 | # Create environment. 54 | step = function(self, action) { 55 | state = list(mean = action + rnorm(1), sd = runif(1)) 56 | reward = rnorm(1, state[[1]], state[[2]]) 57 | done = FALSE 58 | list(state, reward, done) 59 | } 60 | 61 | reset = function(self) { 62 | state = list(mean = 0, sd = 1) 63 | state 64 | } 65 | 66 | env = makeEnvironment("custom", step = step, reset = reset) 67 | ``` 68 | 69 | The environment is an `R6` class with a set of attributes and methods. You can interact with the environment via the `reset` and `step` method. 70 | 71 | ``` r 72 | # Reset environment. 73 | env$reset() 74 | #> $mean 75 | #> [1] 0 76 | #> 77 | #> $sd 78 | #> [1] 1 79 | 80 | # Take action. 81 | env$step(100) 82 | #> $state 83 | #> $state$mean 84 | #> [1] 99.56104 85 | #> 86 | #> $state$sd 87 | #> [1] 0.5495179 88 | #> 89 | #> 90 | #> $reward 91 | #> [1] 99.40968 92 | #> 93 | #> $done 94 | #> [1] FALSE 95 | ``` 96 | 97 | There are some predefined environment classes, e.g. `MDPEnvironment`, which allows you to create a Markov Decision Process by passing on state transition array and reward matrix, or `GymEnvironment`, where you can use toy problems from [OpenAI Gym](https://gym.openai.com/). 98 | 99 | ``` r 100 | # Create a gym environment. 101 | # Make sure you have Python, gym and reticulate installed. 102 | env = makeEnvironment("gym", gym.name = "MountainCar-v0") 103 | 104 | # Take random actions for 200 steps. 105 | env$reset() 106 | for (i in 1:200) { 107 | action = sample(0:2, 1) 108 | env$step(action) 109 | env$visualize() 110 | } 111 | env$close() 112 | ``` 113 | 114 | This should open a window showing a graphical visualization of the environment during interaction. 115 | 116 | For more details on how to create an environment have a look at the vignette: [Environments](https://markusdumke.github.io/reinforcelearn/articles/environments.html) 117 | 118 | ------------------------------------------------------------------------ 119 | 120 | ### Agents 121 | 122 | With `makeAgent` you can set up a reinforcement learning agent to solve the environment, i.e. to find the best action in each time step. 123 | 124 | The first step is to set up the policy, which defines which action to choose. For example we could use a uniform random policy. 125 | 126 | ``` r 127 | # Create the environment. 128 | env = makeEnvironment("windy.gridworld") 129 | 130 | # Create agent with uniform random policy. 131 | policy = makePolicy("random") 132 | agent = makeAgent(policy) 133 | 134 | # Run interaction for 10 steps. 135 | interact(env, agent, n.steps = 10L) 136 | #> $returns 137 | #> numeric(0) 138 | #> 139 | #> $steps 140 | #> integer(0) 141 | ``` 142 | 143 | In this scenario the agent chooses all actions with equal probability and will not learn anything from the interaction. Usually we want the agent to be able to learn something. Value-based algorithms learn a value function from interaction with the environment and adjust the policy according to the value function. For example we could set up Q-Learning with a softmax policy. 144 | 145 | ``` r 146 | # Create the environment. 147 | env = makeEnvironment("windy.gridworld") 148 | 149 | # Create qlearning agent with softmax policy and tabular value function. 150 | policy = makePolicy("softmax") 151 | values = makeValueFunction("table", n.states = env$n.states, n.actions = env$n.actions) 152 | algorithm = makeAlgorithm("qlearning") 153 | agent = makeAgent(policy, values, algorithm) 154 | 155 | # Run interaction for 10 steps. 
156 | interact(env, agent, n.episodes = 10L) 157 | #> $returns 158 | #> [1] -1524 -3496 -621 -374 -173 -1424 -1742 -468 -184 -39 159 | #> 160 | #> $steps 161 | #> [1] 1524 3496 621 374 173 1424 1742 468 184 39 162 | ``` 163 | 164 | ------------------------------------------------------------------------ 165 | 166 | ### Vignettes 167 | 168 | Also have a look at the vignettes for further examples. 169 | 170 | - [Environments](https://markusdumke.github.io/reinforcelearn/articles/environments.html) 171 | - [Agents](https://markusdumke.github.io/reinforcelearn/articles/agents.html) 172 | 173 | ------------------------------------------------------------------------ 174 | 175 | Logo is a modification of . 176 | -------------------------------------------------------------------------------- /vignettes/agents.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Agents" 3 | author: Markus Dumke 4 | date: "`r Sys.Date()`" 5 | output:rmarkdown::html_vignette: 6 | fig_caption: yes 7 | bibliography: references.bib 8 | vignette: > 9 | %\VignetteIndexEntry{Agents} 10 | %\VignetteEngine{knitr::rmarkdown} 11 | %\VignetteEncoding{UTF-8} 12 | --- 13 | 14 | 19 | 20 | ```{r setup, include=FALSE} 21 | knitr::opts_chunk$set(message = TRUE, eval = TRUE, collapse = TRUE, comment = "#>") 22 | ``` 23 | 24 | ```{r} 25 | set.seed(12) 26 | library(reinforcelearn) 27 | ``` 28 | 29 | A reinforcement learning agent usually consists of three parts: a policy, a value function representation and an algorithm which updates the value function or policy parameters. In the following it will be explained how to create an agent in `reinforcelearn` to solve an environment. 30 | 31 | You can create an agent with the function `makeAgent`. This will create an R6 class object with the corresponding policy, value function and algorithm. 32 | 33 | ```{r} 34 | env = makeEnvironment("gridworld", shape = c(3, 3), goal.states = 0L) 35 | agent = makeAgent(policy = "softmax", val.fun = "table", algorithm = "qlearning") 36 | ``` 37 | 38 | Then you can run the agent in the environment by calling `interact` for a specified number of steps or episodes. 39 | 40 | ```{r} 41 | interact(env, agent, n.episodes = 5L) 42 | ``` 43 | 44 | Note that `interact` returns a list with the number of steps and returns per episode. Furthermore it will change the environment and agent object. So the environment's state or the agent's value function weights will have most likely changed after the interaction. 45 | 46 | Although you can directly access the agent object, this is not recommended as this will be very likely to change in the next package versions. Instead use one of the accessor functions to e.g. get the weights of the action value function. 47 | 48 | ```{r} 49 | getValueFunction(agent) 50 | ``` 51 | 52 | ## Policies 53 | 54 | A policy is the agent's behavior function. We can define the policy with `makePolicy`. 55 | 56 | ```{r} 57 | # Uniform random policy 58 | makePolicy("random") 59 | 60 | # Epsilon-greedy policy 61 | makePolicy("epsilon.greedy", epsilon = 0.2) 62 | 63 | # Softmax policy 64 | makePolicy("softmax") 65 | ``` 66 | 67 | This will just capture what policy to use and the policy will then be created when we create the agent. 68 | 69 | ## Value Functions 70 | 71 | Many reinforcement learning algorithms use a value function to learn values of state and action pairs. 72 | The value function can be represented with different types of function approximation, e.g. as a table or neural network. 
73 | 74 | ```{r} 75 | makeValueFunction("table", n.states = 9L, n.actions = 4L) 76 | ``` 77 | 78 | For a neural network you can use the `keras` package. Therefore you need to specify a the model's architecture and pass these on to `makeValueFunction`. 79 | 80 | ```{r, eval = FALSE} 81 | library(keras) 82 | model = keras_model_sequential() %>% 83 | layer_dense(shape = 10L, input_shape = 4L, activation = "linear") %>% 84 | compile(optimizer = optimizer_sgd(lr = 0.1), loss = "mae") 85 | makeValueFunction("neural.network", model) 86 | ``` 87 | 88 | Note that online neural network training is currently very slow. One way to work with this is to make updates to the value function not after every interaction, but to store all interactions in a replay memory and make updates to the neural network only once in a while. Read more about this in Section Experience Replay. 89 | 90 | Often you need to preprocess the state observation in a way the agent can work with this. Therefore you can pass on a function to the `preprocess` argument of `makeAgent`, which will then be applied to the state observation before the agent learns on this. 91 | 92 | For neural network training the outcome of `preprocess` must be a one-row matrix in order to be able to learn. 93 | 94 | ## Algorithms 95 | 96 | The algorithm defines how to learn from an interaction with the environment. We can set up an algorithm using the function `makeAlgorithm`. 97 | 98 | ```{r} 99 | makeAlgorithm("qlearning") 100 | ``` 101 | 102 | ## Agent 103 | 104 | If we have defined policy, value function and algorithm we can create the agent by calling `makeAgent`. 105 | 106 | ```{r} 107 | policy = makePolicy("epsilon.greedy", epsilon = 0.2) 108 | val.fun = makeValueFunction("table", n.states = 9L, n.actions = 4L) 109 | algorithm = makeAlgorithm("qlearning") 110 | 111 | agent = makeAgent(policy, val.fun, algorithm) 112 | ``` 113 | 114 | Note that you can also call `makeAgent` with character arguments which can save some typing. 115 | 116 | ```{r} 117 | agent = makeAgent("epsilon.greedy", "table", "qlearning", 118 | policy.args = list(epsilon = 0.2)) 119 | ``` 120 | 121 | ## Interaction 122 | 123 | You can run an interaction between an agent and environment with the `interact` function. 124 | 125 | ```{r} 126 | env = makeEnvironment("gridworld", shape = c(3, 2), goal.states = 0L) 127 | agent = makeAgent("random") 128 | 129 | interact(env, agent, n.steps = 3L, visualize = TRUE) 130 | ``` 131 | 132 | It allows you to run an interaction for a specified number of steps or episodes and you can also specify a maximum number of steps per episode. 133 | This makes it very flexible to step through the environment one action after the other. Note you can also run an interaction without learning. 134 | 135 | ```{r} 136 | env = makeEnvironment("gridworld", shape = c(4, 4), goal.states = 0L, 137 | initial.state = 15L) 138 | agent = makeAgent("random") 139 | 140 | for (i in 1:3L) { 141 | ## comment in the next line to wait on enter press before taking the next action. 142 | # invisible(readline(prompt = "Press [enter] to take the next action")) 143 | interact(env, agent, n.steps = 1L, learn = FALSE, visualize = TRUE) 144 | } 145 | ``` 146 | 147 | ### Experience replay 148 | 149 | Experience replay is a technique to learn at once from multiple past observations. Therefore all the states, actions, rewards and subsequent states will be stored in a list (the so called replay memory) and at each step a random batch from this memory will be replayed. 
150 | 151 | ```{r} 152 | (memory = makeReplayMemory(size = 2L, batch.size = 1L)) 153 | 154 | agent = makeAgent("random", replay.memory = memory) 155 | 156 | interact(env, agent, n.steps = 2L, visualize = TRUE) 157 | 158 | getReplayMemory(agent) 159 | ``` 160 | 161 | Here is an example training with experience replay, where the value function is updated only every 21 steps. 162 | 163 | ```{r, message = FALSE} 164 | env = makeEnvironment("gridworld", shape = c(4, 4), goal.states = c(0, 15)) 165 | 166 | policy = makePolicy("epsilon.greedy", epsilon = 0.1) 167 | memory = makeReplayMemory(size = 100L, batch.size = 20L) 168 | 169 | agent = makeAgent(policy, "table", "qlearning", replay.memory = memory) 170 | 171 | for (i in 1:100) { 172 | interact(env, agent, n.steps = 20L, learn = FALSE) 173 | interact(env, agent, n.steps = 1L, learn = TRUE) 174 | } 175 | action.vals = getValueFunction(agent) 176 | matrix(getStateValues(action.vals), ncol = 4L) 177 | ``` 178 | -------------------------------------------------------------------------------- /R/environment.R: -------------------------------------------------------------------------------- 1 | #' Create reinforcement learning environment. 2 | #' 3 | #' This function creates an environment for reinforcement learning. 4 | #' 5 | #' Use the `step` method to interact with the environment. 6 | #' 7 | #' Note that all states and actions are numerated starting with 0! 8 | #' 9 | #' For a detailed explanation and more examples 10 | #' have a look at the vignette "How to create an environment?". 11 | #' 12 | #' @param class \[`character(1)`] \cr 13 | #' Class of environment. One of `c("custom", "mdp", "gym", "gridworld")`. 14 | #' @param discount \[`numeric(1)` in (0, 1)] \cr Discount factor. 15 | #' @param ... \[`any`] \cr Arguments passed on to the specific environment. 16 | #' 17 | #' @md 18 | #' 19 | #' @return R6 class of class Environment. 20 | #' 21 | #' @section Methods: 22 | #' * `$step(action)` \cr 23 | #' Take action in environment. 24 | #' Returns a list with `state`, `reward`, `done`. 25 | #' * `$reset()` \cr 26 | #' Resets the `done` flag of the environment and returns an initial state. 27 | #' Useful when starting a new episode. 28 | #' * `$visualize()` \cr 29 | #' Visualizes the environment (if there is a visualization function). 30 | #' 31 | #' @section Environments: 32 | #' * [Environment] 33 | #' * [GymEnvironment] 34 | #' * [MdpEnvironment] 35 | #' * [Gridworld] 36 | #' * [MountainCar] 37 | #' 38 | #' @export 39 | #' @examples 40 | #' step = function(self, action) { 41 | #' state = list(mean = action + rnorm(1), sd = runif(1)) 42 | #' reward = rnorm(1, state[[1]], state[[2]]) 43 | #' done = FALSE 44 | #' list(state, reward, done) 45 | #' } 46 | #' 47 | #' reset = function(self) { 48 | #' state = list(mean = 0, sd = 1) 49 | #' state 50 | #' } 51 | #' 52 | #' env = makeEnvironment(step = step, reset = reset, discount = 0.9) 53 | #' env$reset() 54 | #' env$step(100) 55 | #' 56 | #' # Create a Markov Decision Process. 57 | #' P = array(0, c(2, 2, 2)) 58 | #' P[, , 1] = matrix(c(0.5, 0.5, 0, 1), 2, 2, byrow = TRUE) 59 | #' P[, , 2] = matrix(c(0, 1, 0, 1), 2, 2, byrow = TRUE) 60 | #' R = matrix(c(5, 10, -1, 2), 2, 2, byrow = TRUE) 61 | #' env = makeEnvironment("mdp", transitions = P, rewards = R) 62 | #' 63 | #' env$reset() 64 | #' env$step(1L) 65 | #' 66 | #' # Create a Gridworld. 
67 | #' grid = makeEnvironment("gridworld", shape = c(4, 4), 68 | #' goal.states = 15, initial.state = 0) 69 | #' grid$visualize() 70 | #' 71 | #' \dontrun{ 72 | #' # Create an OpenAI Gym environment. 73 | #' # Make sure you have Python, gym and reticulate installed. 74 | #' env = makeEnvironment("gym", gym.name = "MountainCar-v0") 75 | #' 76 | #' # Take random actions for 200 steps. 77 | #' env$reset() 78 | #' for (i in 1:200) { 79 | #' action = sample(env$actions, 1) 80 | #' env$step(action) 81 | #' env$visualize() 82 | #' } 83 | #' env$close() 84 | #' } 85 | makeEnvironment = function(class = "custom", discount = 1, ...) { 86 | checkmate::assertChoice(class, 87 | c("custom", "mdp", "gym", "gridworld", "windy.gridworld", "cliff.walking", 88 | "mountain.car", "mountain.car.continuous")) 89 | switch(class, 90 | custom = Environment$new(discount = discount, ...), # default 91 | mdp = MdpEnvironment$new(discount = discount, ...), 92 | gym = GymEnvironment$new(discount = discount, ...), 93 | gridworld = Gridworld$new(discount = discount, ...), 94 | windy.gridworld = WindyGridworld$new(discount = discount, ...), 95 | cliff.walking = CliffWalking$new(discount = discount, ...), 96 | mountain.car = MountainCar$new(discount = discount, ...), 97 | mountain.car.continuous = MountainCarContinuous$new(discount = discount, ...) 98 | ) 99 | } 100 | 101 | #' Custom Reinforcement Learning Environment 102 | #' 103 | #' @section Usage: 104 | #' `makeEnvironment("custom", step, reset, visualize = NULL, discount = 1, action.names = NULL)` 105 | #' 106 | #' @param step \[`function(self, action)`] \cr 107 | #' Custom step function. 108 | #' @param reset \[`function(self)`] \cr 109 | #' Custom reset function. 110 | #' @param visualize \[`function(self)`] \cr 111 | #' Optional custom visualization function. 112 | #' @param discount \[`numeric(1)` in (0, 1)] \cr Discount factor. 113 | #' @param action.names \[`named integer`] \cr 114 | #' Optional action names for a discrete action space. 
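#'
#' As an illustrative sketch (not from the original documentation): with a
#' named integer vector such as `action.names = c(left = 0L, right = 1L)`,
#' actions can also be passed by name, e.g. `env$step("left")`, which `step()`
#' translates back to the underlying integer action `0L`.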
115 | #' 116 | #' @md 117 | #' 118 | #' @inheritSection makeEnvironment Methods 119 | #' 120 | #' @name Environment 121 | #' @export 122 | #' 123 | #' @examples 124 | #' step = function(self, action) { 125 | #' state = list(mean = action + rnorm(1), sd = runif(1)) 126 | #' reward = rnorm(1, state[[1]], state[[2]]) 127 | #' done = FALSE 128 | #' list(state, reward, done) 129 | #' } 130 | #' 131 | #' reset = function(self) { 132 | #' state = list(mean = 0, sd = 1) 133 | #' state 134 | #' } 135 | #' 136 | #' env = makeEnvironment(step = step, reset = reset) 137 | #' env$reset() 138 | #' env$step(100) 139 | NULL 140 | 141 | Environment = R6::R6Class("Environment", 142 | public = list( 143 | action.names = NULL, 144 | n.step = 0L, 145 | episode = 0L, 146 | episode.step = 0L, 147 | episode.return = 0, 148 | previous.state = NULL, 149 | state = NULL, 150 | reward = NULL, 151 | done = FALSE, 152 | discount = NULL, 153 | 154 | resetEverything = function() { 155 | self$n.step = 0L 156 | self$episode = 0 157 | self$reset() 158 | }, 159 | 160 | reset = function() { 161 | self$episode.step = 0L 162 | self$episode.return = 0 163 | self$done = FALSE 164 | self$state = private$reset_(self) 165 | self$state 166 | }, 167 | 168 | step = function(action) { 169 | if (is.character(action)) { 170 | action = self$action.names[action] 171 | } 172 | self$previous.state = self$state 173 | res = private$step_(self, action) 174 | self$episode.return = self$episode.return + 175 | self$discount ^ self$episode.step * res[[2]] 176 | self$n.step = self$n.step + 1L 177 | self$episode.step = self$episode.step + 1L 178 | self$state = res[[1]] 179 | self$reward = res[[2]] 180 | self$done = res[[3]] 181 | if (self$done) { 182 | self$episode = self$episode + 1L 183 | } 184 | list(state = res[[1]], reward = res[[2]], done = res[[3]]) 185 | }, 186 | 187 | visualize = function() { 188 | private$visualize_(self) 189 | }, 190 | 191 | initialize = function(step, reset, visualize = NULL, discount, action.names = NULL) { 192 | checkmate::assertFunction(step) 193 | checkmate::assertFunction(reset) 194 | checkmate::assertFunction(visualize, null.ok = TRUE) 195 | checkmate::assertNumber(discount, lower = 0, upper = 1) 196 | checkmate::assertIntegerish(action.names, null.ok = TRUE) 197 | 198 | private$step_ = step 199 | private$reset_ = reset 200 | self$discount = discount 201 | self$action.names = action.names 202 | if (!missing(visualize)) { 203 | checkmate::assertFunction(visualize) 204 | private$visualize_ = visualize 205 | } else { 206 | private$visualize_ = function(self) {} 207 | } 208 | self$reset() 209 | } 210 | ), 211 | 212 | private = list( 213 | # step_: custom step method depending on problem that returns list with 214 | # next state, reward, done 215 | step_ = NULL, 216 | # reset_: custom reset method depending on problem that returns state 217 | reset_ = NULL, 218 | visualize_ = NULL 219 | ) 220 | ) 221 | -------------------------------------------------------------------------------- /examples/user_interface.R: -------------------------------------------------------------------------------- 1 | #' #' --- 2 | #' #' title: "User interface" 3 | #' #' author: Markus Dumke 4 | #' #' output: github_document 5 | #' #' --- 6 | #' 7 | #' #+ setup, include=FALSE 8 | #' library(knitr) 9 | #' opts_chunk$set(comment = "#>", collapse = FALSE, message = FALSE) 10 | #' 11 | #' library(reinforcelearn) 12 | #' 13 | #' env = makeEnvironment("windy.gridworld") 14 | #' 15 | #' # policy without val.fun or algorithm 16 | #' agent = 
makeAgent("random") 17 | #' interact(env, agent, n.steps = 10L) 18 | #' 19 | #' # policy with val.fun, without algorithm 20 | #' agent = makeAgent("softmax", "table") 21 | #' interact(env, agent, n.steps = 10L) 22 | #' 23 | #' # policy, table, qlearning 24 | #' agent = makeAgent("softmax", "table", "qlearning") 25 | #' interact(env, agent, n.steps = 10L) 26 | #' 27 | #' # policy, table, qlearning, eligibility 28 | #' alg = makeAlgorithm("qlearning", lambda = 0.8, traces = "accumulate") 29 | #' agent = makeAgent("softmax", "table", alg) 30 | #' interact(env, agent, n.steps = 10L) 31 | #' 32 | #' # policy, table, qlearning, exp.replay 33 | #' mem = makeReplayMemory(size = 5, batch.size = 5) 34 | #' agent = makeAgent("softmax", "table", "qlearning", experience.replay = mem) 35 | #' interact(env, agent, n.steps = 10L) 36 | #' 37 | #' # policy, neuralnet, qlearning 38 | #' library(keras) 39 | #' model = keras_model_sequential() %>% 40 | #' layer_dense(units = env$n.actions, activation = "linear", 41 | #' input_shape = c(env$n.states), kernel_initializer = initializer_zeros(), 42 | #' use_bias = FALSE) %>% 43 | #' compile(loss = "mae", optimizer = optimizer_sgd(lr = 1)) 44 | #' val = makeValueFunction("neural.network", model = model) 45 | #' preprocess = function(x) to_categorical(x, num_classes = env$n.states) 46 | #' agent = makeAgent("softmax", val, "qlearning", preprocess = preprocess) 47 | #' 48 | #' # policy, neuralnet, qlearning, exp. replay 49 | #' 50 | #' 51 | #' 52 | #' 53 | #' 54 | #' 55 | #' 56 | #' 57 | #' 58 | #' 59 | #' 60 | #' 61 | #' 62 | #' # 63 | #' # 64 | #' # 65 | #' # 66 | #' # # run random policy without learning 67 | #' # env = makeEnvironment("gridworld", shape = c(4, 4), 68 | #' # goal.states = 0L, initial.state = 15L, discount = 0.99) 69 | #' # policy = makePolicy("random") 70 | #' # agent = makeAgent(policy) 71 | #' # interact(env, agent, n.steps = 200L) 72 | #' # 73 | #' # # qlearning table 74 | #' # env = makeEnvironment("gridworld", shape = c(4, 4), 75 | #' # goal.states = c(0, 15), initial.state = 1:14, discount = 1) 76 | #' # val = makeValueFunction("table", n.states = env$n.states, n.actions = env$n.actions) 77 | #' # alg = makeAlgorithm("qlearning") 78 | #' # agent = makeAgent(policy, val, alg) 79 | #' # interact(env, agent, n.episodes = 50L) # fail 80 | #' # getStateValues(agent$val.fun$Q) 81 | #' # 82 | #' # # qlearning simple 83 | #' # env = makeEnvironment("windy.gridworld") 84 | #' # val = makeValueFunction("table", n.states = env$n.states, n.actions = env$n.actions) 85 | #' # policy = makePolicy("epsilon.greedy", epsilon = 0.1) 86 | #' # alg = makeAlgorithm("qlearning") 87 | #' # agent = makeAgent(policy, val, alg) 88 | #' # interact(env, agent, n.episodes = 100L) 89 | #' # 90 | #' # # sarsa simple 91 | #' # env = makeEnvironment("windy.gridworld") 92 | #' # val = makeValueFunction("table", n.states = env$n.states, n.actions = env$n.actions) 93 | #' # policy = makePolicy("epsilon.greedy", epsilon = 0.1) 94 | #' # alg = makeAlgorithm("sarsa") 95 | #' # agent = makeAgent(policy, val, alg) 96 | #' # interact(env, agent, n.episodes = 100L) 97 | #' # 98 | #' # # sarsa simple with softmax policy 99 | #' # env = makeEnvironment("windy.gridworld") 100 | #' # val = makeValueFunction("table", n.states = env$n.states, n.actions = env$n.actions) 101 | #' # policy = makePolicy("softmax") 102 | #' # alg = makeAlgorithm("sarsa") 103 | #' # agent = makeAgent(policy, val, alg) 104 | #' # interact(env, agent, n.episodes = 100L) 105 | #' # 106 | #' # # qlearning eligibility 
traces 107 | #' # env = makeEnvironment("windy.gridworld") 108 | #' # val = makeValueFunction("table", n.states = env$n.states, n.actions = env$n.actions) 109 | #' # policy = makePolicy("epsilon.greedy", epsilon = 0.1) 110 | #' # alg = makeAlgorithm("qlearning", lambda = 0.9, traces = "accumulate") 111 | #' # agent = makeAgent(policy, val, alg) 112 | #' # interact(env, agent, n.episodes = 100L) 113 | #' # 114 | #' # # character arguments 115 | #' # env = makeEnvironment("windy.gridworld") 116 | #' # agent = makeAgent("softmax", "table", "qlearning") 117 | #' # interact(env, agent, n.episodes = 10L) 118 | #' # 119 | #' # env = makeEnvironment("windy.gridworld") 120 | #' # alg = makeAlgorithm("qlearning", lambda = 0.9, traces = "replace") 121 | #' # agent = makeAgent("softmax", "table", alg) 122 | #' # interact(env, agent, n.episodes = 10L) 123 | #' # 124 | #' # # qlearning experience replay 125 | #' # env = makeEnvironment("windy.gridworld") 126 | #' # policy = makePolicy("epsilon.greedy", epsilon = 0.1) 127 | #' # replay = makeReplayMemory(size = 200L, batch.size = 150L) 128 | #' # agent = makeAgent(policy, "table", "qlearning", experience.replay = replay) # a bit slow 129 | #' # interact(env, agent, n.episodes = 100L) 130 | #' # 131 | #' # # exp replay train every 10 steps 132 | #' # env = makeEnvironment("windy.gridworld") 133 | #' # policy = makePolicy("epsilon.greedy", epsilon = 0.1) 134 | #' # replay = makeReplayMemory(size = 100L, batch.size = 100L) 135 | #' # agent = makeAgent(policy, "table", "qlearning", experience.replay = replay) # a bit slow 136 | #' # for (i in 1:10000) { 137 | #' # interact(env, agent, n.steps = 100L, learn = FALSE) 138 | #' # interact(env, agent, n.steps = 1L, learn = TRUE) 139 | #' # } 140 | #' # 141 | #' # 142 | #' # # keras neural network 143 | #' # env = makeEnvironment("windy.gridworld") 144 | #' # library(keras) 145 | #' # model = keras_model_sequential() 146 | #' # # "input_shape" parameter for layer_dense should be c(batchsize(None), input_dim), dim in keras is row major 147 | #' # model %>% 148 | #' # layer_dense(units = env$n.actions, activation = "linear", input_shape = c(env$n.states), 149 | #' # kernel_initializer = initializer_zeros(), use_bias = FALSE) 150 | #' # #layer_dense(units = env$n.actions, activation = "linear") 151 | #' # model$compile(loss = "mae", optimizer = optimizer_sgd(lr = 1)) 152 | #' # val = makeValueFunction("neural.network", model = model) 153 | #' # replay = makeReplayMemory(size = 100L, batch.size = 10L) 154 | #' # preprocess = function(x) to_categorical(x, num_classes = env$n.states) 155 | #' # agent = makeAgent("softmax", val, "qlearning", 156 | #' # preprocess = preprocess, experience.replay = replay) 157 | #' # for (i in 1:100) { 158 | #' # interact(env, agent, n.steps = 10L, learn = FALSE, max.steps.per.episode = 100L) 159 | #' # interact(env, agent, n.steps = 1L, learn = TRUE, max.steps.per.episode = 100L) 160 | #' # } 161 | #' # agent$val.fun$model %>% get_weights() 162 | #' # 163 | #' # # solve mountain car with exp replay 164 | #' # m = makeEnvironment("gym", "MountainCar-v0") 165 | #' # library(keras) 166 | #' # model = keras_model_sequential() 167 | #' # # "input_shape" parameter for layer_dense should be c(batchsize(None), input_dim), dim in keras is row major 168 | #' # model %>% 169 | #' # layer_dense(units = 64L, activation = 'relu', input_shape = c(2L)) %>% 170 | #' # layer_dense(units = 3L, activation = 'linear') 171 | #' # model$compile(loss = 'mse', optimizer = optimizer_rmsprop(lr = 0.0025)) 172 | #' 
# val = makeValueFunction("neural.network", model = model) 173 | #' # replay = makeReplayMemory(size = 100L, batch.size = 10L) 174 | #' # preprocess = function(x) matrix(x, ncol = 2) 175 | #' # agent = makeAgent("softmax", val, "qlearning", 176 | #' # preprocess = preprocess, experience.replay = replay) 177 | #' # for (i in 1:1000) { 178 | #' # interact(m, agent, n.steps = 10L, learn = FALSE) 179 | #' # interact(m, agent, n.steps = 1L, learn = TRUE) 180 | #' # } 181 | #' # #agent$val.fun$model %>% get_weights() 182 | -------------------------------------------------------------------------------- /vignettes/environments.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Environments" 3 | author: Markus Dumke 4 | date: "`r Sys.Date()`" 5 | output: rmarkdown::html_vignette: 6 | fig_caption: yes 7 | bibliography: references.bib 8 | vignette: > 9 | %\VignetteIndexEntry{Environments} 10 | %\VignetteEngine{knitr::rmarkdown} 11 | %\VignetteEncoding{UTF-8} 12 | --- 13 | 14 | 19 | 20 | ```{r setup, include=FALSE} 21 | knitr::opts_chunk$set(message = TRUE, eval = TRUE, collapse = TRUE, comment = "#>") 22 | ``` 23 | 24 | This vignette explains the different ways to create and use a reinforcement learning environment in `reinforcelearn`. Section [Creation](#creation) explains how to create an environment and Section [Interaction](#interaction) describes how to use the created environment object for interaction. 25 | 26 | ```{r} 27 | library(reinforcelearn) 28 | ``` 29 | 30 | ## Creation 31 | 32 | The `makeEnvironment` function provides different ways to create an environment. 33 | It is called with the class name as the first argument. You can pass arguments of the specific environment class (e.g. the state transition array for an MDP) to the `...` argument. 34 | 35 | ### Create a custom environment 36 | 37 | To create a custom environment you have to set up a `step` and `reset` function, which define the rewards the agent receives and ultimately what the agent should learn. 38 | 39 | Here is an example setting up the famous Mountain Car problem. 40 | 41 | ```{r, out.width = "200px", fig.align="center", echo = FALSE} 42 | knitr::include_graphics("mountaincar.JPG") 43 | ``` 44 | 45 | The task of the `reset` function is to initialize the starting state of the environment; it is usually called when starting a new episode. It returns the `state` of the environment. It takes an argument `self`, which is the newly created R6 class object and can be used e.g. to access the current state of the environment. 46 | 47 | ```{r} 48 | reset = function(self) { 49 | position = runif(1, -0.6, -0.4) 50 | velocity = 0 51 | state = matrix(c(position, velocity), ncol = 2) 52 | state 53 | } 54 | ``` 55 | 56 | The `step` function is used for interaction; it controls the transition to the next state and the reward given an action. It takes `self` and `action` as arguments and returns a list with the next `state`, `reward` and whether an episode is finished (`done`).
57 | 58 | ```{r} 59 | step = function(self, action) { 60 | position = self$state[1] 61 | velocity = self$state[2] 62 | velocity = velocity + (action - 1L) * 0.001 + cos(3 * position) * (-0.0025) 63 | velocity = min(max(velocity, -0.07), 0.07) 64 | position = position + velocity 65 | if (position < -1.2) { 66 | position = -1.2 67 | velocity = 0 68 | } 69 | state = matrix(c(position, velocity), ncol = 2) 70 | reward = -1 71 | if (position >= 0.5) { 72 | done = TRUE 73 | reward = 0 74 | } else { 75 | done = FALSE 76 | } 77 | list(state, reward, done) 78 | } 79 | ``` 80 | 81 | Then we can create the environment with 82 | 83 | ```{r} 84 | env = makeEnvironment(step = step, reset = reset) 85 | ``` 86 | 87 | --- 88 | 89 | ### OpenAI Gym 90 | 91 | OpenAI Gym [@gym_openai] provides a set of environments, which can be used for benchmarking. 92 | 93 | To use a gym environment you have to install 94 | 95 | * Python 96 | * `gym` (Python package, installation instructions here: https://github.com/openai/gym#installation) 97 | * `reticulate` (R package) 98 | 99 | Then you can create a gym environment by passing the name of the environment. 100 | 101 | ```{r, eval = FALSE} 102 | # Create a gym environment. 103 | env = makeEnvironment("gym", gym.name = "MountainCar-v0") 104 | ``` 105 | 106 | Have a look at [https://gym.openai.com/envs](https://gym.openai.com/envs) for possible environments. 107 | 108 | --- 109 | 110 | ### Markov Decision Process 111 | 112 | A Markov Decision Process (MDP) is a stochastic process which is commonly used to model reinforcement learning environments. 113 | When the problem can be formulated as an MDP, all you need to pass to `makeEnvironment` is the state transition array $P^a_{ss'}$ and reward matrix $R_s^a$ of the MDP. 114 | 115 | We can create a simple MDP with 2 states and 2 actions with the following code. 116 | 117 | ```{r} 118 | # State transition array 119 | P = array(0, c(2, 2, 2)) 120 | P[, , 1] = matrix(c(0.5, 0.5, 0, 1), 2, 2, byrow = TRUE) 121 | P[, , 2] = matrix(c(0.1, 0.9, 0, 1), 2, 2, byrow = TRUE) 122 | 123 | # Reward matrix 124 | R = matrix(c(5, 10, -1, 2), 2, 2, byrow = TRUE) 125 | 126 | env = makeEnvironment("mdp", transitions = P, rewards = R) 127 | ``` 128 | 129 | --- 130 | 131 | ### Gridworld 132 | 133 | A gridworld is a simple MDP navigation task with a discrete state and action space. The agent has to move through a grid from a start state to a goal state. Possible actions are the standard moves (left, right, up, down); they could also include the diagonal moves (leftup, leftdown, rightup, rightdown). 134 | 135 | Here is an example of a 4x4 gridworld [@sutton2017, Example 4.1] with two terminal states in the lower right and upper left of the grid. Rewards are -1 for every transition until reaching a terminal state. 136 | 137 | ```{r, out.width = "200px", fig.align="center", echo = FALSE} 138 | knitr::include_graphics("gridworld.JPG") 139 | ``` 140 | 141 | The following code creates this gridworld. 142 | 143 | ```{r} 144 | # Gridworld Environment (Sutton & Barto (2017) Example 4.1) 145 | env = makeEnvironment("gridworld", shape = c(4, 4), goal.states = c(0, 15)) 146 | ``` 147 | 148 | --- 149 | 150 | ## Interaction 151 | 152 | `makeEnvironment` returns an R6 class object which can be used for the interaction between agent and environment. 153 | 154 | ```{r} 155 | env = makeEnvironment("gridworld", shape = c(4, 4), 156 | goal.states = 0L, initial.state = 15L) 157 | ``` 158 | 159 | To take an action you can call the `step(action)` method.
It is called with an action as an argument and internally computes the next `state`, the `reward` and whether an episode is finished (`done`). 160 | 161 | ```{r} 162 | # The initial state of the environment. 163 | env$reset() 164 | 165 | env$visualize() 166 | 167 | # Actions are encoded as integers. 168 | env$step(0L) 169 | 170 | env$visualize() 171 | 172 | # But actions can also be passed as character names. 173 | env$step("left") 174 | 175 | env$visualize() 176 | ``` 177 | 178 | Note that the R6 class object changes whenever `step` or `reset` is called! Therefore calling `step` with the same action twice will most likely return different states and rewards! 179 | 180 | Note also that all discrete states and actions are numbered starting with 0 to be consistent with OpenAI Gym! 181 | 182 | The environment object often also contains information about the number of states and actions or the bounds in case of a continuous space. 183 | 184 | ```{r} 185 | env = makeEnvironment("mountain.car") 186 | env$n.actions 187 | env$state.space.bounds 188 | ``` 189 | 190 | It also keeps track of the number of interactions, i.e. the number of times `step` has been called, the number of steps in the current episode, the number of episodes and the return in the current episode. 191 | 192 | ```{r} 193 | env = makeEnvironment("gridworld", shape = c(4, 4), 194 | goal.states = 0L, initial.state = 15L, discount = 0.99) 195 | 196 | env$step("up") 197 | env$n.step 198 | env$episode.return 199 | 200 | env$step("left") 201 | env$n.step 202 | env$episode.return 203 | ``` 204 | 205 | --- 206 | 207 | ### Full list of attributes and methods: 208 | 209 | Here is a full list describing the attributes and methods of the `R6` class created by `makeEnvironment`. 210 | 211 | **Attributes**: 212 | 213 | - `state` [any]: The current state observation of the environment. Depending on the problem this can be anything, e.g. a scalar integer, a matrix or a list. 214 | 215 | - `reward` [numeric(1)]: The current reward of the environment. It is always a scalar numeric value. 216 | 217 | - `done` [logical(1)]: A logical flag specifying whether an episode is finished. 218 | 219 | - `discount` [numeric(1) in [0, 1]]: The discount factor. 220 | 221 | - `n.step` [integer(1)]: Number of steps, i.e. number of times `$step()` has been called. 222 | 223 | - `episode.step` [integer(1)]: Number of steps in the current episode. In comparison to `n.step` it will be reset to 0 when `reset` is called. Each time `step` is called it is increased by 1. 224 | 225 | - `episode.return` [numeric(1)]: The return in the current episode. Each time `step` is called the discounted `reward` is added. Will be reset to 0 when `reset` is called. 226 | 227 | - `previous.state` [any]: The previous state of the environment. This is often the state which is updated in a reinforcement learning algorithm. 228 | 229 | **Methods**: 230 | 231 | - `reset()`: Resets the environment, i.e. it sets the `state` attribute to a starting state and sets the `done` flag to `FALSE`. It is usually called at the beginning of an episode. 232 | 233 | - `step(action)`: The interaction function between agent and environment. `step` is called with an action as an argument. It then takes the action, internally computes the next state, the reward and whether an episode is finished, and returns a list with `state`, `reward` and `done`. 234 | 235 | - `visualize()`: Visualizes the current state of the environment.
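To see how these attributes and methods play together, here is a minimal sketch of a hand-written interaction loop: it takes random actions in the gridworld until the episode is finished (capped at 100 steps) and then inspects the counters. It assumes the gridworld exposes `n.actions` like the environments shown above; in practice the `interact` function runs loops like this for you.

```{r, eval = FALSE}
env = makeEnvironment("gridworld", shape = c(4, 4),
  goal.states = 0L, initial.state = 15L, discount = 0.99)

env$reset()
while (!env$done && env$episode.step < 100L) {
  # Sample one of the available actions (actions are numbered starting with 0).
  action = sample(0:(env$n.actions - 1L), 1L)
  env$step(action)
}

# Inspect the counters documented above.
env$episode.step
env$episode.return
```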
236 | 237 | --- 238 | ### References 239 | 240 | -------------------------------------------------------------------------------- /docs/reference/index.html: --------------------------------------------------------------------------------
Function reference • reinforcelearn

Package help page
    reinforcelearn: Reinforcement Learning.

Environments (Creation of reinforcement learning environments.)
    makeEnvironment: Create reinforcement learning environment.
    Environment: Custom Reinforcement Learning Environment
    GymEnvironment: Gym Environment
    MdpEnvironment: MDP Environment
    Gridworld: Gridworld
    CliffWalking: Cliff Walking
    WindyGridworld: Windy Gridworld
    MountainCar: Mountain Car

Policies
    makePolicy: Create policy.
    RandomPolicy: Random Policy
    EpsilonGreedyPolicy: Epsilon Greedy Policy
    SoftmaxPolicy: Softmax Policy

Value Function Representations
    makeValueFunction: Value Function Representation
    ValueTable: Value Table
    ValueNetwork: Value Network

Algorithms
    makeAlgorithm: Make reinforcement learning algorithm.
    QLearning: Q-Learning

Agent
    makeAgent: Create Agent.

Interaction
    interact: Interaction between agent and environment.

Helper functions
    makeReplayMemory: Experience Replay
    getReplayMemory: Get replay memory.
    Eligibility: Eligibility traces
    getEligibilityTraces: Get eligibility traces
    getValueFunction: Get weights of value function.
    getStateValues: Get state values.
    tiles iht: Tile Coding
    nHot: Make n hot vector.
--------------------------------------------------------------------------------