├── .gitignore
├── LICENSE
├── README.md
├── build.gradle
├── gradle.properties
├── gradle
    └── wrapper
    │   ├── gradle-wrapper.jar
    │   └── gradle-wrapper.properties
├── gradlew
├── gradlew.bat
├── settings.gradle
└── src
    ├── main
        ├── kotlin
        │   └── lab
        │   │   └── mars
        │   │       └── rl
        │   │           ├── algo
        │   │               ├── dp
        │   │               │   ├── PolicyIteration.kt
        │   │               │   └── ValueIteration.kt
        │   │               ├── dyna
        │   │               │   ├── Dyna-Q+.kt
        │   │               │   ├── Dyna-Q-OnPolicy.kt
        │   │               │   ├── Dyna-Q.kt
        │   │               │   ├── PrioritizedSweeping.kt
        │   │               │   ├── PrioritizedSweepingStochasticEnv.kt
        │   │               │   └── RandomSampleOneStepTabularQLearning.kt
        │   │               ├── eligibility_trace
        │   │               │   ├── control
        │   │               │   │   ├── Sarsa(λ).kt
        │   │               │   │   └── True Online Sarsa(λ).kt
        │   │               │   └── prediction
        │   │               │   │   ├── Off-line λ-return.kt
        │   │               │   │   ├── Semi-gradient TD(λ) prediction.kt
        │   │               │   │   └── True Online TD(λ) prediction.kt
        │   │               ├── func_approx
        │   │               │   ├── Play.kt
        │   │               │   ├── off_policy
        │   │               │   │   ├── Semi-gradient Expected Sarsa.kt
        │   │               │   │   ├── Semi-gradient off-policy TD(0).kt
        │   │               │   │   ├── n-step semi-gradient off-policy Q(σ).kt
        │   │               │   │   └── n-step semi-gradient off-policy sarsa.kt
        │   │               │   ├── on_policy
        │   │               │   │   ├── Differential semi-gradient Sarsa.kt
        │   │               │   │   ├── Differential semi-gradient n-step Sarsa.kt
        │   │               │   │   ├── Episodic Semi-gradient QLearning control.kt
        │   │               │   │   ├── Episodic Semi-gradient Sarsa control.kt
        │   │               │   │   └── Episodic semi-gradient n-step Sarsa.kt
        │   │               │   └── prediction
        │   │               │   │   ├── Gradient Monte Carlo algorithm.kt
        │   │               │   │   ├── LSTD.kt
        │   │               │   │   ├── Semi-gradient TD(0).kt
        │   │               │   │   └── n-step semi-gradient TD.kt
        │   │               ├── mc
        │   │               │   ├── ExploringStarts.kt
        │   │               │   ├── First visit Monte Carlo Prediction.kt
        │   │               │   ├── Off-Policy Prediction.kt
        │   │               │   ├── Off-policy Optimal.kt
        │   │               │   └── On-Policy Optimal.kt
        │   │               ├── ntd
        │   │               │   ├── N-step Off-policy Sarsa.kt
        │   │               │   ├── N-step Off-policy n-step Q(σ).kt
        │   │               │   ├── N-step Sarsa.kt
        │   │               │   ├── N-step TD prediction.kt
        │   │               │   └── N-step Treebackup.kt
        │   │               ├── package.kt
        │   │               ├── policy_gradient
        │   │               │   ├── Actor-Critic with Eligibility Traces (continuing).kt
        │   │               │   ├── Actor-Critic with Eligibility Traces (episodic).kt
        │   │               │   ├── One-step Actor-Critic (episodic).kt
        │   │               │   ├── REINFORCE with Baseline (episodic).kt
        │   │               │   └── REINFORCE.kt
        │   │               └── td
        │   │               │   ├── DoubleQLearning.kt
        │   │               │   ├── ExpectedSarsa.kt
        │   │               │   ├── QLearning.kt
        │   │               │   ├── Sarsa.kt
        │   │               │   └── Tabular TD(0).kt
        │   │           ├── model
        │   │               ├── ApproximateFunction.kt
        │   │               ├── MDP.kt
        │   │               └── impl
        │   │               │   ├── func
        │   │               │       ├── LinearFunc.kt
        │   │               │       ├── SimpleCoarseCoding.kt
        │   │               │       ├── SimpleTileCoding.kt
        │   │               │       ├── StateAggregation.kt
        │   │               │       └── SuttonTileCoding.kt
        │   │               │   └── mdp
        │   │               │       ├── DefaultAction.kt
        │   │               │       ├── DefaultMDP.kt
        │   │               │       ├── EpsilonGreedyFunctionPolicy.kt
        │   │               │       ├── IndexedAction.kt
        │   │               │       ├── IndexedMDP.kt
        │   │               │       ├── IndexedPolicy.kt
        │   │               │       ├── IndexedPossible.kt
        │   │               │       ├── IndexedState.kt
        │   │               │       ├── NSetMDP.kt
        │   │               │       ├── SoftmaxpPolicy.kt
        │   │               │       └── package.kt
        │   │           ├── problem
        │   │               ├── 1000-state RandomWalk.kt
        │   │               ├── 19-state RandomWalk.kt
        │   │               ├── AccessControl.kt
        │   │               ├── Blackjack.kt
        │   │               ├── CarRental.kt
        │   │               ├── CliffWalking.kt
        │   │               ├── DynaMaze.kt
        │   │               ├── FlyPlane.kt
        │   │               ├── Gambler.kt
        │   │               ├── GridWorld.kt
        │   │               ├── MaximizationBias.kt
        │   │               ├── MountainCar.kt
        │   │               ├── RandomWalk.kt
        │   │               ├── RodManeuvering.kt
        │   │               ├── SquareWave.kt
        │   │               ├── WindyGridworld.kt
        │   │               └── package.kt
        │   │           └── util
        │   │               ├── buf
        │   │                   ├── Buf.kt
        │   │                   ├── DefaultBuf.kt
        │   │                   ├── DefaultIntBuf.kt
        │   │                   ├── Index.kt
        │   │                   ├── IntBuf.kt
        │   │                   ├── MutableBuf.kt
        │   │                   └── MutableIntBuf.kt
        │   │               ├── collection
        │   │                   ├── CompactNSet.kt
        │   │                   ├── Gettable.kt
        │   │                   ├── HashMapRAC.kt
        │   │                   ├── IndexedCollection.kt
        │   │                   ├── NSet.kt
        │   │                   └── extensions.kt
        │   │               ├── dimension
        │   │                   ├── Dimension.kt
        │   │                   └── DimensionBuilder.kt
        │   │               ├── exception
        │   │                   ├── IndexOutOfDimensionException.kt
        │   │                   └── NoMoreElementsException.kt
        │   │               ├── log
        │   │                   └── LoggerHelpers.kt
        │   │               ├── math
        │   │                   ├── Binomial.kt
        │   │                   ├── MathHelpers.kt
        │   │                   ├── Poisson.kt
        │   │                   └── Vector.kt
        │   │               ├── matrix
        │   │                   └── Matrix.kt
        │   │               ├── resource
        │   │                   ├── ClasspathLocation.kt
        │   │                   ├── FileSystemLocation.kt
        │   │                   ├── ResourceLoader.kt
        │   │                   └── ResourceLocation.kt
        │   │               ├── tuples
        │   │                   ├── tuple2.kt
        │   │                   ├── tuple3.kt
        │   │                   ├── tuple4.kt
        │   │                   ├── tuple5.kt
        │   │                   └── tuple6.kt
        │   │               └── ui
        │   │                   ├── ChartApp.kt
        │   │                   ├── D2DGameUI.kt
        │   │                   ├── D3DChartUI.kt
        │   │                   ├── GridWorldUI.kt
        │   │                   ├── MountainCarUI.kt
        │   │                   ├── RawD3DChartUI.kt
        │   │                   └── RodManeuveringUI.kt
        └── resources
        │   └── StockLineChart.css
    └── test
        ├── kotlin
            └── lab
            │   └── mars
            │       └── rl
            │           ├── algo
            │               ├── dp
            │               │   ├── Test Policy Iteration.kt
            │               │   └── Test Value Iteration.kt
            │               ├── dyna
            │               │   ├── Test Optimal Dyna-Q on-policy.kt
            │               │   ├── Test Optimal Dyna-Q+.kt
            │               │   ├── Test Optimal Dyna-Q.kt
            │               │   ├── Test Optimal Prioritized Sweeping Stochastic.kt
            │               │   ├── Test Optimal Prioritized Sweeping.kt
            │               │   └── Test Optimal RandomSampleOneStepTabularQLearning.kt
            │               ├── eligibility_trace
            │               │   ├── control
            │               │   │   └── Test Optimal Sarsa(λ).kt
            │               │   └── prediction
            │               │   │   ├── Test Prediction Off-line λ-return.kt
            │               │   │   ├── Test Prediction Semi-gradient TD(λ).kt
            │               │   │   └── Test Prediction True Online TD(λ).kt
            │               ├── func_approx
            │               │   ├── on_policy
            │               │   │   ├── Test Optimal Differential semi-gradient Sarsa.kt
            │               │   │   ├── Test Optimal Episodic Semi-gradient QLearning control.kt
            │               │   │   ├── Test Optimal Episodic Semi-gradient Sarsa control.kt
            │               │   │   └── Test Optimal n-step semi-gradient Sarsa.kt
            │               │   └── prediction
            │               │   │   ├── Test Function Approximator Coarse Coding.kt
            │               │   │   ├── Test Function Approximator Fourier vs Poly.kt
            │               │   │   ├── Test Function Approximator Tile coding.kt
            │               │   │   ├── Test Prediction Gradient MC.kt
            │               │   │   ├── Test Prediction LSTD.kt
            │               │   │   ├── Test Prediction Semi-gradient TD(0).kt
            │               │   │   └── Test Prediction n-step Semi-gradient TD.kt
            │               ├── mc
            │               │   ├── Test Optimal MC Exploring Starts.kt
            │               │   ├── Test Optimal MC Off-policy.kt
            │               │   ├── Test Optimal MC On-policy first-visit.kt
            │               │   ├── Test Prediction MC Off-policy.kt
            │               │   └── Test Prediction Monte Carlo Prediction.kt
            │               ├── ntd
            │               │   ├── Test Optimal n-TD Off-policy Sarsa.kt
            │               │   ├── Test Optimal n-TD Q(σ).kt
            │               │   ├── Test Optimal n-TD Sarsa.kt
            │               │   ├── Test Optimal n-TD Treebackup.kt
            │               │   └── Test Prediction n-TD.kt
            │               └── td
            │               │   ├── Test Optimal TD Doubel Q-Learning.kt
            │               │   ├── Test Optimal TD Expected sarsa.kt
            │               │   ├── Test Optimal TD Q-Learning.kt
            │               │   ├── Test Optimal TD Sarsa.kt
            │               │   └── Test Prediction Tabular TD(0).kt
            │           ├── problem
            │               ├── Test Mountain Car with Actor-Critic.kt
            │               ├── `Test FlyPlane Problem with Actor-Critic`.kt
            │               ├── `Test FlyPlane Problem with REINFORCE`.kt
            │               └── `Test FlyPlane Problem with TD(λ)`.kt
            │           └── util
            │               ├── TestBase.kt
            │               ├── TestCNSet.kt
            │               ├── TestIndex.kt
            │               ├── TestNSet.kt
            │               ├── extensions.kt
            │               └── range
            │                   └── DoubleProgression.kt
        └── resources
            ├── Figure 10.1.PNG
            ├── Figure 10.4.PNG
            ├── Figure 12.10.PNG
            ├── Figure 12.11.PNG
            ├── Figure 12.3.PNG
            ├── Figure 12.6.PNG
            ├── Figure 12.8.PNG
            ├── Figure 7.2.PNG
            └── logback-test.xml


/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/**
2 | .gradle/**
3 | logs/**
4 | **/build
5 | **/out


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2017 wumo
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/build.gradle:
--------------------------------------------------------------------------------
 1 | plugins {
 2 |     id 'org.jetbrains.kotlin.jvm' version '1.3.61'
 3 | }
 4 | 
 5 | group 'wumo'
 6 | version '2.0'
 7 | 
 8 | repositories {
 9 |     mavenCentral()
10 | }
11 | 
12 | dependencies {
13 |     implementation "org.jetbrains.kotlin:kotlin-stdlib-jdk8"
14 |     compile "org.jetbrains.kotlinx:kotlinx-coroutines-core:1.3.3"
15 |     compile "com.object-refinery:orson-charts-fx:1.0"
16 |     compile "no.tornado:tornadofx:1.7.20"
17 |     compile "org.apache.commons:commons-math3:3.6.1"
18 |     compile "org.slf4j:slf4j-api:1.7.30"
19 |     compile "ch.qos.logback:logback-classic:1.2.3"
20 |     testCompile "junit:junit:4.13.1"
21 | }
22 | 
23 | compileKotlin {
24 |     kotlinOptions.jvmTarget = "1.8"
25 | }
26 | compileTestKotlin {
27 |     kotlinOptions.jvmTarget = "1.8"
28 | }
29 | 


--------------------------------------------------------------------------------
/gradle.properties:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wumo/Reinforcement-Learning-An-Introduction/d419efccdca9a7f34b7805929b41651b172a3ee7/gradle.properties


--------------------------------------------------------------------------------
/gradle/wrapper/gradle-wrapper.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wumo/Reinforcement-Learning-An-Introduction/d419efccdca9a7f34b7805929b41651b172a3ee7/gradle/wrapper/gradle-wrapper.jar


--------------------------------------------------------------------------------
/gradle/wrapper/gradle-wrapper.properties:
--------------------------------------------------------------------------------
1 | distributionBase=GRADLE_USER_HOME
2 | distributionPath=wrapper/dists
3 | distributionUrl=https\://services.gradle.org/distributions/gradle-5.2.1-bin.zip
4 | zipStoreBase=GRADLE_USER_HOME
5 | zipStorePath=wrapper/dists
6 | 


--------------------------------------------------------------------------------
/gradlew.bat:
--------------------------------------------------------------------------------
 1 | @if "%DEBUG%" == "" @echo off
 2 | @rem ##########################################################################
 3 | @rem
 4 | @rem  Gradle startup script for Windows
 5 | @rem
 6 | @rem ##########################################################################
 7 | 
 8 | @rem Set local scope for the variables with windows NT shell
 9 | if "%OS%"=="Windows_NT" setlocal
10 | 
11 | set DIRNAME=%~dp0
12 | if "%DIRNAME%" == "" set DIRNAME=.
13 | set APP_BASE_NAME=%~n0
14 | set APP_HOME=%DIRNAME%
15 | 
16 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
17 | set DEFAULT_JVM_OPTS="-Xmx64m"
18 | 
19 | @rem Find java.exe
20 | if defined JAVA_HOME goto findJavaFromJavaHome
21 | 
22 | set JAVA_EXE=java.exe
23 | %JAVA_EXE% -version >NUL 2>&1
24 | if "%ERRORLEVEL%" == "0" goto init
25 | 
26 | echo.
27 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
28 | echo.
29 | echo Please set the JAVA_HOME variable in your environment to match the
30 | echo location of your Java installation.
31 | 
32 | goto fail
33 | 
34 | :findJavaFromJavaHome
35 | set JAVA_HOME=%JAVA_HOME:"=%
36 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe
37 | 
38 | if exist "%JAVA_EXE%" goto init
39 | 
40 | echo.
41 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
42 | echo.
43 | echo Please set the JAVA_HOME variable in your environment to match the
44 | echo location of your Java installation.
45 | 
46 | goto fail
47 | 
48 | :init
49 | @rem Get command-line arguments, handling Windows variants
50 | 
51 | if not "%OS%" == "Windows_NT" goto win9xME_args
52 | 
53 | :win9xME_args
54 | @rem Slurp the command line arguments.
55 | set CMD_LINE_ARGS=
56 | set _SKIP=2
57 | 
58 | :win9xME_args_slurp
59 | if "x%~1" == "x" goto execute
60 | 
61 | set CMD_LINE_ARGS=%*
62 | 
63 | :execute
64 | @rem Setup the command line
65 | 
66 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
67 | 
68 | @rem Execute Gradle
69 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
70 | 
71 | :end
72 | @rem End local scope for the variables with windows NT shell
73 | if "%ERRORLEVEL%"=="0" goto mainEnd
74 | 
75 | :fail
76 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
77 | rem the _cmd.exe /c_ return code!
78 | if  not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
79 | exit /b 1
80 | 
81 | :mainEnd
82 | if "%OS%"=="Windows_NT" endlocal
83 | 
84 | :omega
85 | 


--------------------------------------------------------------------------------
/settings.gradle:
--------------------------------------------------------------------------------
1 | rootProject.name = 'reinforcement-learning-model'
2 | 
3 | 


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/dp/PolicyIteration.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.dp
 2 | 
 3 | import lab.mars.rl.algo.Q_from_V
 4 | import lab.mars.rl.algo.V_from_Q
 5 | import lab.mars.rl.model.impl.mdp.IndexedMDP
 6 | import lab.mars.rl.model.impl.mdp.IndexedPolicy
 7 | import lab.mars.rl.model.impl.mdp.OptimalSolution
 8 | import lab.mars.rl.model.isNotTerminal
 9 | import lab.mars.rl.model.log
10 | import lab.mars.rl.util.collection.filter
11 | import lab.mars.rl.util.collection.fork
12 | import lab.mars.rl.util.log.debug
13 | import lab.mars.rl.util.math.argmax
14 | import lab.mars.rl.util.math.Σ
15 | import lab.mars.rl.util.tuples.tuple3
16 | import org.apache.commons.math3.util.FastMath.abs
17 | import org.apache.commons.math3.util.FastMath.max
18 | 
19 | /**
20 |  * <p>
21 |  * Created on 2017-09-05.
22 |  * </p>
23 |  *
24 |  * @author wumo
25 |  */
26 | 
27 | val θ = 1e-6
28 | 
29 | fun IndexedMDP.`Policy Iteration V`(): OptimalSolution {
30 |   val V = VFunc { 0.0 }
31 |   val π = IndexedPolicy(QFunc { 1.0 })
32 |   val Q = QFunc { 0.0 }
33 |   
34 |   do {
35 |     //Policy Evaluation
36 |     do {
37 |       var Δ = 0.0
38 |       for (s in states.filter { it.isNotTerminal }) {
39 |         val v = V[s]
40 |         V[s] = Σ(π(s).possibles) { probability * (reward + γ * V[next]) }
41 |         Δ = max(Δ, abs(v - V[s]))
42 |       }
43 |       log.debug { "Δ=$Δ" }
44 |     } while (Δ >= θ)
45 |     
46 |     //Policy Improvement
47 |     var `policy-stable` = true
48 |     for (s in states.filter { it.isNotTerminal }) {
49 |       val `old-action` = π(s)
50 |       val `new-action` = argmax(s.actions) { Σ(possibles) { probability * (reward + γ * V[next]) } }
51 |       π[s] = `new-action`
52 |       if (`old-action` !== `new-action`) `policy-stable` = false
53 |     }
54 |   } while (!`policy-stable`)
55 |   val result = tuple3(π, V, Q)
56 |   Q_from_V(γ, states, result)
57 |   return result
58 | }
59 | 
60 | fun IndexedMDP.`Policy Iteration Q`(): OptimalSolution {
61 |   val V = VFunc { 0.0 }
62 |   val π = IndexedPolicy(QFunc { 1.0 })
63 |   val Q = QFunc { 0.0 }
64 |   do {
65 |     //Policy Evaluation
66 |     do {
67 |       var Δ = 0.0
68 |       for ((s, a) in states.fork { it.actions }) {
69 |         val q = Q[s, a]
70 |         Q[s, a] = Σ(a.possibles) { probability * (reward + γ * if (next.actions.any()) Q[next, π(next)] else 0.0) }
71 |         Δ = max(Δ, abs(q - Q[s, a]))
72 |       }
73 |       log.debug { "Δ=$Δ" }
74 |     } while (Δ >= θ)
75 |     
76 |     //Policy Improvement
77 |     var `policy-stable` = true
78 |     for (s in states.filter { it.isNotTerminal }) {
79 |       val `old-action` = π(s)
80 |       val `new-action` = argmax(s.actions) { Q[s, it] }
81 |       π[s] = `new-action`
82 |       if (`old-action` !== `new-action`) `policy-stable` = false
83 |     }
84 |   } while (!`policy-stable`)
85 |   val result = tuple3(π, V, Q)
86 |   V_from_Q(states, result)
87 |   return result
88 | }
89 | 


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/dp/ValueIteration.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.dp
 2 | 
 3 | import lab.mars.rl.algo.Q_from_V
 4 | import lab.mars.rl.model.impl.mdp.IndexedMDP
 5 | import lab.mars.rl.model.impl.mdp.IndexedPolicy
 6 | import lab.mars.rl.model.impl.mdp.OptimalSolution
 7 | import lab.mars.rl.model.isNotTerminal
 8 | import lab.mars.rl.model.log
 9 | import lab.mars.rl.util.collection.filter
10 | import lab.mars.rl.util.log.debug
11 | import lab.mars.rl.util.math.argmax
12 | import lab.mars.rl.util.math.max
13 | import lab.mars.rl.util.math.Σ
14 | import lab.mars.rl.util.tuples.tuple3
15 | import org.apache.commons.math3.util.FastMath.abs
16 | import org.apache.commons.math3.util.FastMath.max
17 | 
18 | /**
19 |  * <p>
20 |  * Created on 2017-09-06.
21 |  * </p>
22 |  *
23 |  * @author wumo
24 |  */
25 | fun IndexedMDP.ValueIteration(): OptimalSolution {
26 |   val V = VFunc { 0.0 }
27 |   val π = IndexedPolicy(QFunc { 1.0 })
28 |   val Q = QFunc { 0.0 }
29 |   //value iteration
30 |   do {
31 |     var Δ = 0.0
32 |     for (s in states.filter { it.isNotTerminal }) {
33 |       val v = V[s]
34 |       V[s] = max(s.actions) { Σ(possibles) { probability * (reward + γ * V[next]) } }
35 |       Δ = max(Δ, abs(v - V[s]))
36 |     }
37 |     log.debug { "Δ=$Δ" }
38 |   } while (Δ >= θ)
39 |   //policy generation
40 |   for (s in states.filter { it.isNotTerminal })
41 |     π[s] = argmax(s.actions) { Σ(possibles) { probability * (reward + γ * V[next]) } }
42 |   val result = tuple3(π, V, Q)
43 |   Q_from_V(γ, states, result)
44 |   return result
45 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/dyna/Dyna-Q+.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.dyna
 2 | 
 3 | import lab.mars.rl.algo.V_from_Q
 4 | import lab.mars.rl.algo.`ε-greedy`
 5 | import lab.mars.rl.model.impl.mdp.*
 6 | import lab.mars.rl.model.isNotTerminal
 7 | import lab.mars.rl.model.log
 8 | import lab.mars.rl.model.null_state
 9 | import lab.mars.rl.util.buf.DefaultBuf
10 | import lab.mars.rl.util.log.debug
11 | import lab.mars.rl.util.math.max
12 | import lab.mars.rl.util.tuples.tuple2
13 | import lab.mars.rl.util.tuples.tuple3
14 | import org.apache.commons.math3.util.FastMath.sqrt
15 | 
16 | @Suppress("NAME_SHADOWING")
17 | fun IndexedMDP.`Dyna-Q+`(
18 |     α: (IndexedState, IndexedAction) -> Double,
19 |     ε: Double,
20 |     κ: Double,
21 |     n: Int,
22 |     episodes: Int,
23 |     stepListener: (StateValueFunction, IndexedState) -> Unit = { _, _ -> },
24 |     episodeListener: (StateValueFunction) -> Unit = {}): OptimalSolution {
25 |   val null_tuple3 = tuple3(null_state, Double.NaN, 0)
26 |   val π = IndexedPolicy(QFunc { 0.0 })
27 |   val Q = QFunc { 0.0 }
28 |   val cachedSA = DefaultBuf.new<tuple2<IndexedState, IndexedAction>>(Q.size)
29 |   val Model = QFunc { null_tuple3 }
30 |   val V = VFunc { 0.0 }
31 |   val result = tuple3(π, V, Q)
32 |   var time = 0
33 |   for (episode in 1..episodes) {
34 |     log.debug { "$episode/$episodes" }
35 |     var s = started()
36 |     while (s.isNotTerminal) {
37 |       V_from_Q(states, result)
38 |       stepListener(V, s)
39 |       time++
40 |       `ε-greedy`(s, Q, π, ε)
41 |       val a = π(s)
42 |       val (s_next, reward) = a.sample()
43 |       Q[s, a] += α(s, a) * (reward + γ * max(s_next.actions, 0.0) { Q[s_next, it] } - Q[s, a])
44 |       for (_a in s.actions) {
45 |         if (_a !== a && Model[s, _a] === null_tuple3) {
46 |           cachedSA.append(tuple2(s, _a))
47 |           Model[s, _a] = tuple3(s, 0.0, 1)
48 |         }
49 |       }
50 |       if (Model[s, a] === null_tuple3)
51 |         cachedSA.append(tuple2(s, a))
52 |       Model[s, a] = tuple3(s_next, reward, time)
53 |       repeat(n) {
54 |         val (s, a) = cachedSA.rand()
55 |         var (s_next, reward, t) = Model[s, a]
56 |         reward += κ * sqrt((time - t).toDouble())
57 |         Q[s, a] += α(s, a) * (reward + γ * max(s_next.actions, 0.0) { Q[s_next, it] } - Q[s, a])
58 |       }
59 |       s = s_next
60 |     }
61 |     episodeListener(V)
62 |     log.debug { "steps=$time" }
63 |   }
64 |   return result
65 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/dyna/Dyna-Q-OnPolicy.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.dyna
 2 | 
 3 | import lab.mars.rl.algo.V_from_Q
 4 | import lab.mars.rl.algo.`ε-greedy (tie broken randomly)`
 5 | import lab.mars.rl.model.impl.mdp.*
 6 | import lab.mars.rl.model.isNotTerminal
 7 | import lab.mars.rl.model.log
 8 | import lab.mars.rl.util.log.debug
 9 | import lab.mars.rl.util.math.Rand
10 | import lab.mars.rl.util.math.max
11 | import lab.mars.rl.util.math.repeat
12 | import lab.mars.rl.util.tuples.tuple2
13 | import lab.mars.rl.util.tuples.tuple3
14 | import java.util.*
15 | 
16 | @Suppress("NAME_SHADOWING")
17 | fun IndexedMDP.`Dyna-Q-OnPolicy`(
18 |     n: Int,
19 |     ε: Double,
20 |     α: (IndexedState, IndexedAction) -> Double,
21 |     episodes: Int,
22 |     stepListener: (StateValueFunction, IndexedState) -> Unit = { _, _ -> },
23 |     episodeListener: (StateValueFunction) -> Unit = {}): OptimalSolution {
24 |   val π = IndexedPolicy(QFunc { 0.0 })
25 |   val Q = QFunc { 0.0 }
26 |   val V = VFunc { 0.0 }
27 |   val result = tuple3(π, V, Q)
28 |   
29 |   val startedStates = hashMapOf<IndexedState, Int>()
30 |   val Model = QFunc { hashMapOf<tuple2<IndexedState, Double>, Int>() }
31 |   val N = QFunc { 0 }
32 |   
33 |   for (episode in 1..episodes) {
34 |     log.debug { "$episode/$episodes" }
35 |     var step = 0
36 |     var stat = 0
37 |     var s = started()
38 |     startedStates.compute(s) { _, v -> (v ?: 0) + 1 }//record the total visits of each state
39 |     while (s.isNotTerminal) {
40 |       V_from_Q(states, result)
41 |       stepListener(V, s)
42 |       step++
43 |       `ε-greedy (tie broken randomly)`(s, Q, π, ε)
44 |       val a = π(s)
45 |       val (s_next, reward) = a.sample()
46 |       Model[s, a].compute(tuple2(s_next, reward)) { _, v -> (v ?: 0) + 1 }
47 |       N[s, a]++
48 |       Q[s, a] += α(s, a) * (reward + γ * max(s_next.actions, 0.0) { Q[s_next, it] } - Q[s, a])
49 |       
50 |       var _s = startedStates.rand(episode)
51 |       repeat(n, { _s.isNotTerminal }) {
52 |         `ε-greedy (tie broken randomly)`(_s, Q, π, ε)//using on-policy to distribute computation
53 |         val a = π(_s)
54 |         if (Model[_s, a].isEmpty()) return@repeat
55 |         stat++
56 |         val (s_next, reward) = Model[_s, a].rand(N[_s, a])
57 |         Q[_s, a] += α(_s, a) * (reward + γ * max(s_next.actions, 0.0) { Q[s_next, it] } - Q[_s, a])
58 |         _s = s_next
59 |       }
60 |       s = s_next
61 |     }
62 |     episodeListener(V)
63 |     log.debug { "steps=$step, stat=$stat" }
64 |   }
65 |   return result
66 | }
67 | 
68 | private fun <K> HashMap<K, Int>.rand(N: Int): K {
69 |   val p = Rand().nextDouble()
70 |   var acc = 0.0
71 |   for ((k, v) in this) {
72 |     acc += v.toDouble() / N
73 |     if (p <= acc)
74 |       return k
75 |   }
76 |   throw IllegalArgumentException("random=$p, but accumulation=$acc")
77 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/dyna/Dyna-Q.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.dyna
 2 | 
 3 | import lab.mars.rl.algo.V_from_Q
 4 | import lab.mars.rl.algo.`ε-greedy (tie broken randomly)`
 5 | import lab.mars.rl.model.emptyPossibleSet
 6 | import lab.mars.rl.model.impl.mdp.*
 7 | import lab.mars.rl.model.isNotTerminal
 8 | import lab.mars.rl.model.log
 9 | import lab.mars.rl.util.buf.DefaultBuf
10 | import lab.mars.rl.util.collection.cnsetOf
11 | import lab.mars.rl.util.log.debug
12 | import lab.mars.rl.util.math.max
13 | import lab.mars.rl.util.tuples.tuple2
14 | import lab.mars.rl.util.tuples.tuple3
15 | 
16 | @Suppress("NAME_SHADOWING")
17 | fun IndexedMDP.DynaQ(
18 |     α: (IndexedState, IndexedAction) -> Double,
19 |     ε: Double,
20 |     n: Int,
21 |     episodes: Int,
22 |     stepListener: (StateValueFunction, IndexedState) -> Unit = { _, _ -> },
23 |     episodeListener: (StateValueFunction) -> Unit = {}): OptimalSolution {
24 |   val π = IndexedPolicy(QFunc { 0.0 })
25 |   val Q = QFunc { 0.0 }
26 |   val cachedSA = DefaultBuf.new<tuple2<IndexedState, IndexedAction>>(Q.size)
27 |   val Model = QFunc { emptyPossibleSet }
28 |   val V = VFunc { 0.0 }
29 |   val result = tuple3(π, V, Q)
30 |   for (episode in 1..episodes) {
31 |     log.debug { "$episode/$episodes" }
32 |     var step = 0
33 |     var s = started()
34 |     while (s.isNotTerminal) {
35 |       V_from_Q(states, result)
36 |       stepListener(V, s)
37 |       step++
38 |       `ε-greedy (tie broken randomly)`(s, Q, π, ε)
39 |       val a = π(s)
40 |       val (s_next, reward) = a.sample()
41 |       Q[s, a] += α(s, a) * (reward + γ * max(s_next.actions, 0.0) { Q[s_next, it] } - Q[s, a])
42 |       if (Model[s, a].isEmpty())
43 |         cachedSA.append(tuple2(s, a))
44 |       Model[s, a] = cnsetOf(IndexedPossible(s_next, reward, 1.0))
45 |       repeat(n) {
46 |         val (s, a) = cachedSA.rand()
47 |         val (s_next, reward) = Model[s, a].rand()
48 |         Q[s, a] += α(s, a) * (reward + γ * max(s_next.actions, 0.0) { Q[s_next, it] } - Q[s, a])
49 |       }
50 |       s = s_next
51 |     }
52 |     episodeListener(V)
53 |     log.debug { "steps=$step" }
54 |   }
55 |   return result
56 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/dyna/PrioritizedSweeping.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.dyna
 2 | 
 3 | import lab.mars.rl.algo.V_from_Q
 4 | import lab.mars.rl.algo.`ε-greedy (tie broken randomly)`
 5 | import lab.mars.rl.model.emptyPossibleSet
 6 | import lab.mars.rl.model.impl.mdp.*
 7 | import lab.mars.rl.model.isNotTerminal
 8 | import lab.mars.rl.model.log
 9 | import lab.mars.rl.util.collection.cnsetOf
10 | import lab.mars.rl.util.log.debug
11 | import lab.mars.rl.util.math.max
12 | import lab.mars.rl.util.math.repeat
13 | import lab.mars.rl.util.tuples.tuple2
14 | import lab.mars.rl.util.tuples.tuple3
15 | import org.apache.commons.math3.util.FastMath.abs
16 | import java.util.*
17 | 
18 | @Suppress("NAME_SHADOWING")
19 | fun IndexedMDP.PrioritizedSweeping(
20 |     n: Int,
21 |     θ: Double,
22 |     ε: Double,
23 |     α: (IndexedState, IndexedAction) -> Double,
24 |     episodes: Int,
25 |     stepListener: (StateValueFunction, IndexedState) -> Unit = { _, _ -> },
26 |     episodeListener: (StateValueFunction) -> Unit = {}): OptimalSolution {
27 |   val π = IndexedPolicy(QFunc { 0.0 })
28 |   val Q = QFunc { 0.0 }
29 |   val PQueue = PriorityQueue(Q.size, Comparator<tuple3<Double, IndexedState, IndexedAction>> { o1, o2 ->
30 |     o2._1.compareTo(o1._1)
31 |   })
32 |   val Model = QFunc { emptyPossibleSet }
33 |   val predecessor = VFunc { hashSetOf<tuple2<IndexedState, IndexedAction>>() }
34 |   val V = VFunc { 0.0 }
35 |   val result = tuple3(π, V, Q)
36 |   for (episode in 1..episodes) {
37 |     log.debug { "$episode/$episodes" }
38 |     var step = 0
39 |     var s = started()
40 |     while (s.isNotTerminal) {
41 |       V_from_Q(states, result)
42 |       stepListener(V, s)
43 |       step++
44 |       `ε-greedy (tie broken randomly)`(s, Q, π, ε)
45 |       val a = π(s)
46 |       val (s_next, reward) = a.sample()
47 |       Model[s, a] = cnsetOf(IndexedPossible(s_next, reward, 1.0))
48 |       predecessor[s_next] += tuple2(s, a)
49 |       val P = abs(reward + γ * max(s_next.actions, 0.0) { Q[s_next, it] } - Q[s, a])
50 |       if (P > θ) PQueue.add(tuple3(P, s, a))
51 |       repeat(n, { PQueue.isNotEmpty() }) {
52 |         val (_, s, a) = PQueue.poll()
53 |         val (s_next, reward) = Model[s, a].rand()
54 |         Q[s, a] += α(s, a) * (reward + γ * max(s_next.actions, 0.0) { Q[s_next, it] } - Q[s, a])
55 |         for ((s_pre, a_pre) in predecessor[s]) {
56 |           val (s_next, reward) = Model[s_pre, a_pre].rand()
57 |           assert(s_next === s)
58 |           val P = abs(reward + γ * max(s.actions, 0.0) { Q[s, it] } - Q[s_pre, a_pre])
59 |           if (P > θ) PQueue.add(tuple3(P, s_pre, a_pre))
60 |         }
61 |       }
62 |       s = s_next
63 |     }
64 |     episodeListener(V)
65 |     log.debug { "steps=$step" }
66 |   }
67 |   return result
68 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/dyna/RandomSampleOneStepTabularQLearning.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.dyna
 2 | 
 3 | import lab.mars.rl.algo.V_from_Q
 4 | import lab.mars.rl.algo.`ε-greedy`
 5 | import lab.mars.rl.model.impl.mdp.*
 6 | import lab.mars.rl.model.isNotTerminal
 7 | import lab.mars.rl.model.log
 8 | import lab.mars.rl.util.collection.filter
 9 | import lab.mars.rl.util.log.debug
10 | import lab.mars.rl.util.math.max
11 | import lab.mars.rl.util.tuples.tuple3
12 | 
13 | fun IndexedMDP.RandomSampleOneStepTabularQLearning(
14 |     ε: Double,
15 |     α: (IndexedState, IndexedAction) -> Double,
16 |     episodes: Int): OptimalSolution {
17 |   
18 |   val Q = QFunc { 0.0 }
19 |   for (episode in 1..episodes) {
20 |     log.debug { "$episode/$episodes" }
21 |     val s = started()
22 |     val a = s.actions.rand()//Exploring Starts
23 |     val (s_next, reward) = a.sample()
24 |     Q[s, a] += α(s, a) * (reward + γ * max(s_next.actions, 0.0) { Q[s_next, it] } - Q[s, a])
25 |   }
26 |   val π = IndexedPolicy(QFunc { 0.0 })
27 |   for (s in states.filter { it.isNotTerminal })
28 |     `ε-greedy`(s, Q, π, ε)
29 |   val V = VFunc { 0.0 }
30 |   val result = tuple3(π, V, Q)
31 |   V_from_Q(states, result)
32 |   return result
33 | }
34 | 
35 | 


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/eligibility_trace/control/True Online Sarsa(λ).kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.eligibility_trace.control
 2 | 
 3 | import lab.mars.rl.algo.EpisodeListener
 4 | import lab.mars.rl.algo.StepListener
 5 | import lab.mars.rl.model.MDP
 6 | import lab.mars.rl.model.Policy
 7 | import lab.mars.rl.model.impl.func.LinearFunc
 8 | import lab.mars.rl.model.isNotTerminal
 9 | import lab.mars.rl.model.log
10 | import lab.mars.rl.util.log.debug
11 | import lab.mars.rl.util.matrix.Matrix
12 | import lab.mars.rl.util.matrix.MatrixSpec
13 | import lab.mars.rl.util.matrix.minus
14 | import lab.mars.rl.util.matrix.times
15 | 
16 | fun <E> MDP.`True Online Sarsa(λ)`(
17 |     Qfunc: LinearFunc<E>,
18 |     π: Policy,
19 |     λ: Double,
20 |     α: Double,
21 |     episodes: Int,
22 |     z_maker: (Int, Int) -> MatrixSpec = { m, n -> Matrix(m, n) },
23 |     maxStep: Int = Int.MAX_VALUE,
24 |     episodeListener: EpisodeListener = { _, _, _, _ -> },
25 |     stepListener: StepListener = { _, _, _, _ -> }) {
26 |   val X = Qfunc.x
27 |   val w = Qfunc.w
28 |   val d = w.size
29 |   val z = z_maker(d, 1)
30 |   for (episode in 1..episodes) {
31 |     log.debug { "$episode/$episodes" }
32 |     var step = 0
33 |     var s = started()
34 |     var a = π(s)
35 |     var x = X(s, a)
36 |     z.zero()
37 |     var Q_old = 0.0
38 |     var G = 0.0
39 |     var γn = 1.0
40 |     while (true) {
41 |       z `=` (γ * λ * z + (1.0 - α * γ * λ * (z `T*` x)) * x)
42 |       val (s_next, reward) = a.sample()
43 |       γn *= γ
44 |       G += γn * reward
45 |       s = s_next
46 |       val Q = (w `T*` x).toScalar
47 |       var δ = reward - Q
48 |       if (s_next.isNotTerminal) {
49 |         val a_next = π(s_next)
50 |         val `x'` = X(s_next, a_next)
51 |         val `Q'` = (w `T*` `x'`).toScalar
52 |         δ += γ * `Q'`
53 |         w += α * (δ + Q - Q_old) * z - α * (Q - Q_old) * x
54 |         Q_old = `Q'`
55 |         x = `x'`
56 |         a = a_next
57 |       } else {
58 |         w += α * (δ + Q - Q_old) * z - α * (Q - Q_old) * x
59 |         break
60 |       }
61 |       step++
62 |       stepListener(episode, step, s_next, a)
63 |       if (step >= maxStep) break
64 |     }
65 |     episodeListener(episode, step, s, G)
66 |   }
67 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/eligibility_trace/prediction/Off-line λ-return.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.eligibility_trace.prediction
 2 | 
 3 | import lab.mars.rl.model.*
 4 | import lab.mars.rl.util.buf.newBuf
 5 | import lab.mars.rl.util.log.debug
 6 | import lab.mars.rl.util.math.Σ
 7 | import lab.mars.rl.util.matrix.times
 8 | import org.apache.commons.math3.util.FastMath.pow
 9 | 
10 | fun <E> MDP.`Off-line λ-return`(
11 |     V: ApproximateFunction<E>,
12 |     π: Policy,
13 |     λ: Double,
14 |     α: Double,
15 |     episodes: Int,
16 |     episodeListener: (Int, Int) -> Unit = { _, _ -> }) {
17 |   val R = newBuf<Double>()
18 |   val S = newBuf<State>()
19 |   for (episode in 1..episodes) {
20 |     log.debug { "$episode/$episodes" }
21 |     var s = started()
22 |     S.clear();S.append(s)
23 |     R.clear();R.append(0.0)
24 |     var T = 0
25 |     while (s.isNotTerminal) {
26 |       val a = π(s)
27 |       val (s_next, reward) = a.sample()
28 |       S.append(s_next)
29 |       R.append(reward)
30 |       s = s_next
31 |       T++
32 |     }
33 |     
34 |     fun Gt(t: Int, n: Int)
35 |         = Σ(1..n) { pow(γ, it - 1) * R[t + it] } +
36 |           (if (t + n < T) pow(γ, n) * V(S[t + n]) else 0.0)
37 |     
38 |     for (t in 0 until T) {
39 |       val Gtλ = (1 - λ) * Σ(1..T - t - 1) { pow(λ, it - 1) * Gt(t, it) } +
40 |                 pow(λ, T - t - 1) * Gt(t, T - t)
41 |       V.w += α * (Gtλ - V(S[t])) * V.`∇`(S[t])
42 |     }
43 |     episodeListener(episode, T)
44 |   }
45 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/eligibility_trace/prediction/Semi-gradient TD(λ) prediction.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.eligibility_trace.prediction
 2 | 
 3 | import lab.mars.rl.model.*
 4 | import lab.mars.rl.util.log.debug
 5 | import lab.mars.rl.util.matrix.Matrix
 6 | import lab.mars.rl.util.matrix.MatrixSpec
 7 | import lab.mars.rl.util.matrix.times
 8 | 
 9 | fun <E> MDP.`Semi-gradient TD(λ) prediction`(
10 |     V: ApproximateFunction<E>,
11 |     π: Policy,
12 |     λ: Double,
13 |     α: Double,
14 |     episodes: Int,
15 |     z_maker: (Int, Int) -> MatrixSpec = { m, n -> Matrix(m, n) },
16 |     episodeListener: (Int, Int) -> Unit = { _, _ -> }) {
17 |   val w = V.w
18 |   val d = w.size
19 |   for (episode in 1..episodes) {
20 |     log.debug { "$episode/$episodes" }
21 |     var step = 0
22 |     var s = started()
23 |     val z = z_maker(d, 1)
24 |     while (s.isNotTerminal) {
25 |       val a = π(s)
26 |       val (s_next, reward) = a.sample()
27 |       z `=` γ * λ * z + V.`∇`(s)
28 |       val δ = reward + γ * (if (s_next.isTerminal) 0.0 else V(s_next)) - V(s)
29 |       V.w += α * δ * z
30 |       s = s_next
31 |       step++
32 |     }
33 |     episodeListener(episode, step)
34 |   }
35 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/eligibility_trace/prediction/True Online TD(λ) prediction.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.eligibility_trace.prediction
 2 | 
 3 | import lab.mars.rl.model.*
 4 | import lab.mars.rl.model.impl.func.LinearFunc
 5 | import lab.mars.rl.util.log.debug
 6 | import lab.mars.rl.util.matrix.Matrix
 7 | import lab.mars.rl.util.matrix.MatrixSpec
 8 | import lab.mars.rl.util.matrix.minus
 9 | import lab.mars.rl.util.matrix.times
10 | 
11 | fun <E> MDP.`True Online TD(λ) prediction`(
12 |     Vfunc: LinearFunc<E>,
13 |     π: Policy,
14 |     λ: Double,
15 |     α: Double,
16 |     episodes: Int,
17 |     z_maker: (Int, Int) -> MatrixSpec = { m, n -> Matrix(m, n) },
18 |     episodeListener: (Int, Int) -> Unit = { _, _ -> }) {
19 |   val X = Vfunc.x
20 |   val w = Vfunc.w
21 |   val d = w.size
22 |   for (episode in 1..episodes) {
23 |     log.debug { "$episode/$episodes" }
24 |     var step = 0
25 |     var s = started()
26 |     var x = X(s)
27 |     val z = z_maker(d, 1)
28 |     var V_old = 0.0
29 |     while (s.isNotTerminal) {
30 |       val a = π(s)
31 |       val (s_next, reward) = a.sample()
32 |       val `x'` = X(s_next)
33 |       val V = (w `T*` x).toScalar
34 |       val `V'` = if (s_next.isTerminal) 0.0 else (w `T*` `x'`).toScalar
35 |       val δ = reward + γ * `V'` - V
36 |       z `=` γ * λ * z + (1.0 - α * γ * λ * (z `T*` x)) * x
37 |       w += α * (δ + V - V_old) * z - α * (V - V_old) * x
38 |       V_old = `V'`
39 |       x = `x'`
40 |       s = s_next
41 |       step++
42 |     }
43 |     episodeListener(episode, step)
44 |   }
45 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/func_approx/Play.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.func_approx
 2 | 
 3 | import lab.mars.rl.model.*
 4 | 
 5 | fun MDP.play(
 6 |     π: Policy,
 7 |     episodes: Int,
 8 |     maxStep: Int = Int.MAX_VALUE,
 9 |     episodeListener: (Int, Int, State, Double) -> Unit = { _, _, _, _ -> },
10 |     stepListener: (Int, Int, State, Action<State>) -> Unit = { _, _, _, _ -> }) {
11 |   for (episode in 1..episodes) {
12 |     var s = started()
13 |     var step = 0
14 |     var G = 0.0
15 |     var γn = 1.0
16 |     while (s.isNotTerminal) {
17 |       val a = π(s)
18 |       stepListener(episode, step, s, a)
19 |       val (s_next, reward) = a.sample()
20 |       γn *= γ
21 |       G += γn * reward
22 |       s = s_next
23 |       step++
24 |       if (step >= maxStep)
25 |         break
26 |     }
27 |     episodeListener(episode, step, s, G)
28 |   }
29 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/func_approx/off_policy/Semi-gradient Expected Sarsa.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.func_approx.off_policy
 2 | 
 3 | import lab.mars.rl.model.*
 4 | import lab.mars.rl.util.log.debug
 5 | import lab.mars.rl.util.math.Σ
 6 | import lab.mars.rl.util.matrix.times
 7 | 
 8 | fun <E> MDP.`Semi-gradient Expected Sarsa`(
 9 |     q: ApproximateFunction<E>, π: Policy,
10 |     α: Double,
11 |     episodes: Int,
12 |     episodeListener: (Int, Int) -> Unit = { _, _ -> }) {
13 |   for (episode in 1..episodes) {
14 |     log.debug { "$episode/$episodes" }
15 |     var step = 0
16 |     var s = started()
17 |     while (s.isNotTerminal) {
18 |       step++
19 |       val a = π(s)
20 |       val (s_next, reward) = a.sample()
21 |       val δ = reward + γ * Σ(s_next.actions) { π[s_next, it] * q(s_next, it) } - q(s, a)
22 |       q.w += α * δ * q.`∇`(s, a)
23 |       s = s_next
24 |     }
25 |     episodeListener(episode, step)
26 |   }
27 | }
28 | 
29 | fun <E> MDP.`Semi-gradient Expected Sarsa`(q: ApproximateFunction<E>, π: Policy,
30 |                                            α: Double, β: Double) {
31 |   var average_reward = 0.0
32 |   var s = started()
33 |   while (true) {
34 |     val a = π(s)
35 |     val (s_next, reward) = a.sample()
36 |     val δ = reward - average_reward + Σ(s_next.actions) { π[s_next, it] * q(s_next, it) } - q(s, a)
37 |     q.w += α * δ * q.`∇`(s, a)
38 |     average_reward += β * δ
39 |     s = s_next
40 |   }
41 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/func_approx/off_policy/Semi-gradient off-policy TD(0).kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.func_approx.off_policy
 2 | 
 3 | import lab.mars.rl.model.*
 4 | import lab.mars.rl.util.log.debug
 5 | import lab.mars.rl.util.matrix.times
 6 | 
 7 | fun <E> MDP.`Semi-gradient off-policy TD(0) episodic`(
 8 |     v: ApproximateFunction<E>, π: Policy, b: Policy,
 9 |     α: Double,
10 |     episodes: Int,
11 |     episodeListener: (Int, Int) -> Unit = { _, _ -> }) {
12 |   for (episode in 1..episodes) {
13 |     log.debug { "$episode/$episodes" }
14 |     var step = 0
15 |     var s = started()
16 |     while (s.isNotTerminal) {
17 |       step++
18 |       val a = b(s)
19 |       val (s_next, reward) = a.sample()
20 |       val ρ = π[s, a] / b[s, a]
21 |       val δ = reward + γ * v(s_next) - v(s)
22 |       v.w += α * ρ * δ * v.`∇`(s)
23 |       s = s_next
24 |     }
25 |     episodeListener(episode, step)
26 |   }
27 | }
28 | 
29 | fun <E> MDP.`Semi-gradient off-policy TD(0) continuing`(v: ApproximateFunction<E>, π: Policy, b: Policy,
30 |                                                         α: Double, β: Double) {
31 |   var average_reward = 0.0
32 |   var s = started()
33 |   while (true) {
34 |     val a = b(s)
35 |     val (s_next, reward) = a.sample()
36 |     val ρ = π[s, a] / b[s, a]
37 |     val δ = reward - average_reward + v(s_next) - v(s)
38 |     v.w += α * ρ * δ * v.`∇`(s)
39 |     average_reward += β * δ
40 |     s = s_next
41 |   }
42 | }
43 | 


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/func_approx/on_policy/Differential semi-gradient Sarsa.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.func_approx.on_policy
 2 | 
 3 | import lab.mars.rl.model.ApproximateFunction
 4 | import lab.mars.rl.model.MDP
 5 | import lab.mars.rl.model.Policy
 6 | import lab.mars.rl.util.matrix.times
 7 | 
 8 | fun <E> MDP.`Differential semi-gradient Sarsa`(
 9 |     q: ApproximateFunction<E>, π: Policy,
10 |     α: Double, β: Double, maxStep: Int) {
11 |   var average_reward = 0.0
12 |   var s = started()
13 |   var a = π(s)
14 |   var step = 0
15 |   while (true) {
16 |     val (s_next, reward) = a.sample()
17 |     val a_next = π(s_next)
18 |     val δ = reward - average_reward + q(s_next, a_next) - q(s, a)
19 |     average_reward += β * δ
20 |     q.w += α * δ * q.`∇`(s, a)
21 |     s = s_next
22 |     a = a_next
23 |     step++
24 |     if (step >= maxStep) break
25 |   }
26 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/func_approx/on_policy/Differential semi-gradient n-step Sarsa.kt:
--------------------------------------------------------------------------------
 1 | @file:Suppress("NAME_SHADOWING")
 2 | 
 3 | package lab.mars.rl.algo.func_approx.on_policy
 4 | 
 5 | import lab.mars.rl.algo.ntd.MAX_N
 6 | import lab.mars.rl.model.*
 7 | import lab.mars.rl.util.buf.newBuf
 8 | import lab.mars.rl.util.math.Σ
 9 | import lab.mars.rl.util.matrix.times
10 | import org.apache.commons.math3.util.FastMath.min
11 | 
12 | fun <E> MDP.`Differential semi-gradient n-step Sarsa`(
13 |     q: ApproximateFunction<E>, π: Policy,
14 |     n: Int,
15 |     α: Double, β: Double) {
16 |   var average_reward = 0.0
17 |   val _R = newBuf<Double>(min(n, MAX_N))
18 |   val _S = newBuf<State>(min(n, MAX_N))
19 |   val _A = newBuf<Action<State>>(min(n, MAX_N))
20 |   
21 |   var t = 0
22 |   val s = started()
23 |   var a = π(s)
24 |   _R.clear();_R.append(0.0)
25 |   _S.clear();_S.append(s)
26 |   _A.clear();_A.append(a)
27 |   while (true) {
28 |     if (t >= n) {
29 |       _R.removeFirst()
30 |       _S.removeFirst()
31 |       _A.removeFirst()
32 |     }
33 |     val (s_next, reward) = a.sample()
34 |     _R.append(reward)
35 |     _S.append(s_next)
36 |     a = π(s)
37 |     _A.append(a)
38 |     val τ = t - n + 1
39 |     if (τ >= 0) {
40 |       val δ = Σ(1..n) { _R[it] - average_reward } + q(_S[n], _A[n]) - q(_S[0], _A[0])
41 |       average_reward += β * δ
42 |       q.w += α * δ * q.`∇`(_S[0], _A[0])
43 |     }
44 |     t++
45 |   }
46 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/func_approx/on_policy/Episodic Semi-gradient QLearning control.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.func_approx.on_policy
 2 | 
 3 | import lab.mars.rl.model.*
 4 | import lab.mars.rl.util.log.debug
 5 | import lab.mars.rl.util.math.max
 6 | import lab.mars.rl.util.matrix.times
 7 | 
 8 | fun <E> MDP.`Episodic semi-gradient QLearning control`(
 9 |     Q: ApproximateFunction<E>,
10 |     π: Policy,
11 |     α: Double,
12 |     episodes: Int,
13 |     episodeListener: (Int, Int) -> Unit = { _, _ -> },
14 |     stepListener: (Int, Int, State, Action<State>) -> Unit = { _, _, _, _ -> }) {
15 |   for (episode in 1..episodes) {
16 |     log.debug { "$episode/$episodes" }
17 |     var step = 0
18 |     var s = started()
19 |     var a = π(s)
20 |     while (true) {
21 |       step++
22 |       stepListener(episode, step, s, a)
23 |       val (s_next, reward) = a.sample()
24 |       if (s_next.isNotTerminal) {
25 |         val a_next = π(s_next)
26 |         Q.w += α * (reward + γ * max(s_next.actions) { Q(s_next, it) } - Q(s, a)) * Q.`∇`(s, a)
27 |         s = s_next
28 |         a = a_next
29 |       } else {
30 |         Q.w += α * (reward - Q(s, a)) * Q.`∇`(s, a)
31 |         break
32 |       }
33 |     }
34 |     episodeListener(episode, step)
35 |   }
36 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/func_approx/on_policy/Episodic Semi-gradient Sarsa control.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.func_approx.on_policy
 2 | 
 3 | import lab.mars.rl.model.*
 4 | import lab.mars.rl.util.log.debug
 5 | import lab.mars.rl.util.matrix.times
 6 | 
 7 | fun <E> MDP.`Episodic semi-gradient Sarsa control`(
 8 |     Qfunc: ApproximateFunction<E>,
 9 |     π: Policy,
10 |     α: Double,
11 |     episodes: Int,
12 |     episodeListener: (Int, Int, State, Double) -> Unit = { _, _, _, _ -> },
13 |     stepListener: (Int, Int, State, Action<State>) -> Unit = { _, _, _, _ -> }) {
14 |   for (episode in 1..episodes) {
15 |     log.debug { "$episode/$episodes" }
16 |     var step = 0
17 |     var s = started()
18 |     var a = π(s)
19 |     var G = 0.0
20 |     var γn = 1.0
21 |     while (true) {
22 |       step++
23 |       stepListener(episode, step, s, a)
24 |       val (s_next, reward) = a.sample()
25 |       γn *= γ
26 |       G += γn * reward
27 |       if (s_next.isNotTerminal) {
28 |         val a_next = π(s_next)
29 |         Qfunc.w += α * (reward + γ * Qfunc(s_next, a_next) - Qfunc(s, a)) * Qfunc.`∇`(s, a)
30 |         s = s_next
31 |         a = a_next
32 |       } else {
33 |         Qfunc.w += α * (reward - Qfunc(s, a)) * Qfunc.`∇`(s, a)
34 |         s = s_next
35 |         break
36 |       }
37 |     }
38 |     episodeListener(episode, step, s, G)
39 |   }
40 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/func_approx/on_policy/Episodic semi-gradient n-step Sarsa.kt:
--------------------------------------------------------------------------------
 1 | @file:Suppress("NAME_SHADOWING")
 2 | 
 3 | package lab.mars.rl.algo.func_approx.on_policy
 4 | 
 5 | import lab.mars.rl.algo.ntd.MAX_N
 6 | import lab.mars.rl.model.*
 7 | import lab.mars.rl.util.buf.newBuf
 8 | import lab.mars.rl.util.log.debug
 9 | import lab.mars.rl.util.math.Σ
10 | import lab.mars.rl.util.matrix.times
11 | import org.apache.commons.math3.util.FastMath.min
12 | import org.apache.commons.math3.util.FastMath.pow
13 | 
14 | fun <E> MDP.`Episodic semi-gradient n-step Sarsa control`(
15 |     q: ApproximateFunction<E>, π: Policy,
16 |     n: Int,
17 |     α: Double,
18 |     episodes: Int,
19 |     maxStep: Int = Int.MAX_VALUE,
20 |     episodeListener: (Int, Int) -> Unit = { _, _ -> }) {
21 |   val _R = newBuf<Double>(min(n + 1, MAX_N))
22 |   val _S = newBuf<State>(min(n + 1, MAX_N))
23 |   val _A = newBuf<Action<State>>(min(n + 1, MAX_N))
24 |   
25 |   for (episode in 1..episodes) {
26 |     log.debug { "$episode/$episodes" }
27 |     var step = 0
28 |     var n = n
29 |     var T = Int.MAX_VALUE
30 |     var t = 0
31 |     var s = started()
32 |     var a = π(s)
33 |     _R.clear();_R.append(0.0)
34 |     _S.clear();_S.append(s)
35 |     _A.clear();_A.append(a)
36 |     do {
37 |       step++
38 |       if (t >= n) {
39 |         _R.removeFirst()
40 |         _S.removeFirst()
41 |         _A.removeFirst()
42 |       }
43 |       if (t < T) {
44 |         val (s_next, reward) = a.sample()
45 |         _R.append(reward)
46 |         _S.append(s_next)
47 |         s = s_next
48 |         if (s.isTerminal || step >= maxStep) {
49 |           T = t + 1
50 |           val τ = t - n + 1
51 |           if (τ < 0) n = T //n is too large
52 |         } else {
53 |           a = π(s)
54 |           _A.append(a)
55 |         }
56 |       }
57 |       val τ = t - n + 1
58 |       if (τ >= 0) {
59 |         var G = Σ(1..min(n, T - τ)) { pow(γ, it - 1) * _R[it] }
60 |         if (τ + n < T) G += pow(γ, n) * q(_S[n], _A[n])
61 |         q.w += α * (G - q(_S[0], _A[0])) * q.`∇`(_S[0], _A[0])
62 |       }
63 |       t++
64 |     } while (τ < T - 1)
65 |     log.debug { "n=$n,T=$T" }
66 |     episodeListener(episode, step)
67 |   }
68 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/func_approx/prediction/Gradient Monte Carlo algorithm.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.func_approx.prediction
 2 | 
 3 | import lab.mars.rl.model.*
 4 | import lab.mars.rl.util.buf.newBuf
 5 | import lab.mars.rl.util.log.debug
 6 | import lab.mars.rl.util.matrix.times
 7 | 
 8 | fun <E> MDP.`Gradient Monte Carlo algorithm`(
 9 |     v: ApproximateFunction<E>, π: Policy,
10 |     α: Double,
11 |     episodes: Int,
12 |     episodeListener: (Int, Int) -> Unit = { _, _ -> }) {
13 |   val _S = newBuf<State>()
14 |   val _R = newBuf<Double>()
15 |   
16 |   for (episode in 1..episodes) {
17 |     log.debug { "$episode/$episodes" }
18 |     var step = 0
19 |     _S.clear(); _R.clear()
20 |     var s = started()
21 |     _S.append(s); _R.append(0.0)
22 |     var T = 0
23 |     var accum = 0.0
24 |     while (s.isNotTerminal) {
25 |       step++
26 |       val a = π(s)
27 |       val (s_next, reward) = a.sample()
28 |       accum += reward
29 |       _S.append(s_next)
30 |       _R.append(reward)
31 |       s = s_next
32 |       T++
33 |     }
34 |     var pre = 0.0
35 |     for (t in 0 until T) {
36 |       pre += _R[t]
37 |       val Gt = accum - pre
38 |       v.w += α * (Gt - v(_S[t])) * v.`∇`(_S[t])
39 |     }
40 |     episodeListener(episode, step)
41 |   }
42 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/func_approx/prediction/LSTD.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.func_approx.prediction
 2 | 
 3 | import lab.mars.rl.model.*
 4 | import lab.mars.rl.model.impl.func.LinearFunc
 5 | import lab.mars.rl.util.log.debug
 6 | import lab.mars.rl.util.matrix.Matrix
 7 | import lab.mars.rl.util.matrix.plus
 8 | import lab.mars.rl.util.matrix.times
 9 | 
10 | fun <E> MDP.LSTD(vFunc: LinearFunc<E>, π: Policy, ε: Double,
11 |                  episodes: Int,
12 |                  episodeListener: (Int, Int) -> Unit = { _, _ -> }) {
13 |   val xFeature = vFunc.x
14 |   val d = xFeature.numOfComponents
15 |   val A_ = 1 / ε * Matrix.identity(d)
16 |   val b = Matrix.column(d)
17 |   for (episode in 1..episodes) {
18 |     log.debug { "$episode/$episodes" }
19 |     var step = 0
20 |     var s = started()
21 |     var x = xFeature(s)
22 |     while (s.isNotTerminal) {
23 |       step++
24 |       val a = π(s)
25 |       val (s_next, reward) = a.sample()
26 |       val _x = if (s_next.isTerminal) Matrix.column(d) else xFeature(s_next)
27 |       
28 |       val v = A_ `T*` (x - γ * _x)
29 |       A_ -= (A_ * x) * v.T / (1.0 + (v `T*` x))
30 |       b += reward * x
31 |       s = s_next
32 |       x = _x
33 |     }
34 |     episodeListener(episode, step)
35 |   }
36 |   vFunc.w `=` A_ * b
37 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/func_approx/prediction/Semi-gradient TD(0).kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.func_approx.prediction
 2 | 
 3 | import lab.mars.rl.model.*
 4 | import lab.mars.rl.util.log.debug
 5 | import lab.mars.rl.util.matrix.times
 6 | 
 7 | fun <E> MDP.`Semi-gradient TD(0)`(
 8 |     v: ApproximateFunction<E>, π: Policy,
 9 |     α: Double,
10 |     episodes: Int,
11 |     episodeListener: (Int, Int) -> Unit = { _, _ -> }) {
12 |   for (episode in 1..episodes) {
13 |     log.debug { "$episode/$episodes" }
14 |     var step = 0
15 |     var s = started()
16 |     while (s.isNotTerminal) {
17 |       step++
18 |       val a = π(s)
19 |       val (s_next, reward) = a.sample()
20 |       v.w += α * (reward + γ * (if (s_next.isTerminal) 0.0 else v(s_next)) - v(s)) * v.`∇`(s)
21 |       s = s_next
22 |     }
23 |     episodeListener(episode, step)
24 |   }
25 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/func_approx/prediction/n-step semi-gradient TD.kt:
--------------------------------------------------------------------------------
 1 | @file:Suppress("NAME_SHADOWING")
 2 | 
 3 | package lab.mars.rl.algo.func_approx.prediction
 4 | 
 5 | import lab.mars.rl.algo.ntd.MAX_N
 6 | import lab.mars.rl.model.*
 7 | import lab.mars.rl.util.buf.newBuf
 8 | import lab.mars.rl.util.log.debug
 9 | import lab.mars.rl.util.math.Σ
10 | import lab.mars.rl.util.matrix.times
11 | import org.apache.commons.math3.util.FastMath.min
12 | import org.apache.commons.math3.util.FastMath.pow
13 | 
14 | fun <E> MDP.`n-step semi-gradient TD`(
15 |     v: ApproximateFunction<E>, π: Policy,
16 |     n: Int,
17 |     α: Double,
18 |     episodes: Int,
19 |     episodeListener: (Int, Int) -> Unit = { _, _ -> }) {
20 |   val _R = newBuf<Double>(min(n, MAX_N))
21 |   val _S = newBuf<State>(min(n, MAX_N))
22 |   for (episode in 1..episodes) {
23 |     log.debug { "$episode/$episodes" }
24 |     var step = 0
25 |     var n = n
26 |     var T = Int.MAX_VALUE
27 |     var t = 0
28 |     var s = started()
29 |     var a = π(s)
30 |     _R.clear();_R.append(0.0)
31 |     _S.clear();_S.append(s)
32 |     do {
33 |       step++
34 |       if (t >= n) {
35 |         _R.removeFirst()
36 |         _S.removeFirst()
37 |       }
38 |       if (t < T) {
39 |         val (s_next, reward) = a.sample()
40 |         
41 |         _R.append(reward)
42 |         _S.append(s_next)
43 |         s = s_next
44 |         if (s.isTerminal) {
45 |           T = t + 1
46 |           val τ = t - n + 1
47 |           if (τ < 0) n = T //n is too large
48 |         } else
49 |           a = π(s)
50 |       }
51 |       val τ = t - n + 1
52 |       if (τ >= 0) {
53 |         var G = Σ(1..min(n, T - τ)) { pow(γ, it - 1) * _R[it] }
54 |         if (τ + n < T) G += pow(γ, n) * v(_S[n])
55 |         v.w += α * (G - v(_S[0])) * v.`∇`(_S[0])
56 |       }
57 |       t++
58 |     } while (τ < T - 1)
59 |     log.debug { "n=$n,T=$T" }
60 |     episodeListener(episode, step)
61 |   }
62 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/mc/ExploringStarts.kt:
--------------------------------------------------------------------------------
 1 | @file:Suppress("NAME_SHADOWING")
 2 | 
 3 | package lab.mars.rl.algo.mc
 4 | 
 5 | import lab.mars.rl.algo.V_from_Q
 6 | import lab.mars.rl.model.impl.mdp.*
 7 | import lab.mars.rl.model.isNotTerminal
 8 | import lab.mars.rl.model.log
 9 | import lab.mars.rl.util.buf.newBuf
10 | import lab.mars.rl.util.collection.fork
11 | import lab.mars.rl.util.log.debug
12 | import lab.mars.rl.util.math.argmax
13 | import lab.mars.rl.util.tuples.tuple3
14 | 
15 | fun IndexedMDP.`Monte Carlo Exploring Starts`(π: IndexedPolicy = null_policy, episodes: Int): OptimalSolution {
16 |   val π = if (π == null_policy) IndexedPolicy(QFunc { 1.0 }) else π
17 |   val Q = QFunc { 0.0 }
18 |   val tmpQ = QFunc { Double.NaN }
19 |   val count = QFunc { 0 }
20 |   val tmpS = newBuf<IndexedState>(states.size)
21 |   
22 |   for (episode in 1..episodes) {
23 |     log.debug { "$episode/$episodes" }
24 |     var s = started()
25 |     var a = s.actions.rand()//Exploring Starts
26 |     
27 |     var accumulate = 0.0
28 |     do {
29 |       val (s_next, reward) = a.sample()
30 |       if (tmpQ[s, a].isNaN())
31 |         tmpQ[s, a] = accumulate
32 |       accumulate += reward
33 |       s = s_next
34 |     } while (s.isNotTerminal.apply { if (this) a = π(s) })
35 |     
36 |     tmpS.clear()
37 |     for ((s, a) in states.fork { it.actions }) {
38 |       val value = tmpQ[s, a]
39 |       if (!value.isNaN()) {
40 |         Q[s, a] += accumulate - value
41 |         count[s, a] += 1
42 |         tmpS.append(s)
43 |         tmpQ[s, a] = Double.NaN
44 |       }
45 |     }
46 |     for (s in tmpS) {
47 |       val a_greedy = argmax(s.actions) {
48 |         val n = count[s, it]
49 |         if (n > 0)
50 |           Q[s, it] / n
51 |         else
52 |           Q[s, it]
53 |       }
54 |       for (a in s.actions)
55 |         π[s, a] = if (a === a_greedy) 1.0 else 0.0
56 |     }
57 |   }
58 |   
59 |   Q.set { idx, value ->
60 |     val n = count[idx]
61 |     if (n > 0)
62 |       value / n
63 |     else
64 |       value
65 |   }
66 |   val V = VFunc { 0.0 }
67 |   val result = tuple3(π, V, Q)
68 |   V_from_Q(states, result)
69 |   return result
70 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/mc/First visit Monte Carlo Prediction.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.mc
 2 | 
 3 | import lab.mars.rl.model.impl.mdp.IndexedMDP
 4 | import lab.mars.rl.model.impl.mdp.IndexedPolicy
 5 | import lab.mars.rl.model.impl.mdp.StateValueFunction
 6 | import lab.mars.rl.model.isNotTerminal
 7 | import lab.mars.rl.model.log
 8 | import lab.mars.rl.util.log.debug
 9 | 
10 | fun IndexedMDP.`First Visit Monte Carlo Prediction`(π: IndexedPolicy, episodes: Int): StateValueFunction {
11 |   val V = VFunc { 0.0 }
12 |   val preReturn = VFunc { Double.NaN }
13 |   val count = VFunc { 0 }
14 |   
15 |   for (episode in 1..episodes) {
16 |     log.debug { "$episode/$episodes" }
17 |     var s = started()
18 |     var accumulate = 0.0
19 |     while (s.isNotTerminal) {
20 |       val a = π(s)
21 |       val (s_next, reward) = a.sample()
22 |       if (preReturn[s].isNaN())
23 |         preReturn[s] = accumulate
24 |       accumulate += reward
25 |       s = s_next
26 |     }
27 |     preReturn.set { idx, value ->
28 |       if (!value.isNaN()) {
29 |         V[idx] += accumulate - value
30 |         count[idx] += 1
31 |       }
32 |       Double.NaN
33 |     }
34 |   }
35 |   for (s in states) {
36 |     val n = count[s]
37 |     if (n > 0)
38 |       V[s] = V[s] / n
39 |   }
40 |   return V
41 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/mc/Off-Policy Prediction.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.mc
 2 | 
 3 | import lab.mars.rl.algo.V_from_Q
 4 | import lab.mars.rl.model.impl.mdp.*
 5 | import lab.mars.rl.model.isNotTerminal
 6 | import lab.mars.rl.model.log
 7 | import lab.mars.rl.util.buf.newBuf
 8 | import lab.mars.rl.util.collection.filter
 9 | import lab.mars.rl.util.log.debug
10 | import lab.mars.rl.util.tuples.tuple3
11 | 
12 | fun IndexedMDP.`Off-policy MC prediction`(π: IndexedPolicy, episodes: Int): StateValueFunction {
13 |   val Q = QFunc { 0.0 }
14 |   val C = QFunc { 0.0 }
15 |   val b = IndexedPolicy(QFunc { 1.0 })
16 |   for (s in states.filter { it.isNotTerminal }) {
17 |     val prob = 1.0 / s.actions.size
18 |     for (a in s.actions)
19 |       b[s, a] = prob
20 |   }
21 |   
22 |   val R = newBuf<Double>()
23 |   val S = newBuf<IndexedState>()
24 |   val A = newBuf<IndexedAction>()
25 |   
26 |   for (episode in 1..episodes) {
27 |     log.debug { "$episode/$episodes" }
28 |     var s = started()
29 |     S.clear(); S.append(s)
30 |     R.clear();R.append(0.0)
31 |     A.clear()
32 |     var T = 0
33 |     while (s.isNotTerminal) {
34 |       val a = b(s)
35 |       A.append(a)
36 |       val (s_next, reward) = a.sample()
37 |       S.append(s_next)
38 |       R.append(reward)
39 |       s = s_next
40 |       T++
41 |     }
42 |     var G = 0.0
43 |     var W = 1.0
44 |     for (t in T - 1 downTo 0) {
45 |       val s_t = S[t]
46 |       val a_t = A[t]
47 |       G = γ * G + R[t + 1]
48 |       C[s_t, a_t] += W
49 |       Q[s_t, a_t] += W / C[s_t, a_t] * (G - Q[s_t, a_t])
50 |       W = W * π[s_t, a_t] / b[s_t, a_t]
51 |       if (W == 0.0) break
52 |     }
53 |   }
54 |   val V = VFunc { 0.0 }
55 |   val result = tuple3(π, V, Q)
56 |   V_from_Q(states, result)
57 |   return V
58 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/mc/Off-policy Optimal.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.mc
 2 | 
 3 | import lab.mars.rl.algo.V_from_Q
 4 | import lab.mars.rl.model.impl.mdp.*
 5 | import lab.mars.rl.model.isNotTerminal
 6 | import lab.mars.rl.model.isTerminal
 7 | import lab.mars.rl.model.log
 8 | import lab.mars.rl.util.buf.newBuf
 9 | import lab.mars.rl.util.log.debug
10 | import lab.mars.rl.util.math.argmax
11 | import lab.mars.rl.util.tuples.tuple3
12 | 
13 | fun IndexedMDP.`Off-policy MC Optimal`(episodes: Int): OptimalSolution {
14 |   val Q = QFunc { 0.0 }
15 |   val C = QFunc { 0.0 }
16 |   val b = IndexedPolicy(QFunc { 1.0 })
17 |   for (s in states) {
18 |     if (s.isTerminal) continue
19 |     val prob = 1.0 / s.actions.size
20 |     for (a in s.actions)
21 |       b[s, a] = prob
22 |   }
23 |   val π = QFunc { 1.0 }
24 |   
25 |   val R = newBuf<Double>()
26 |   val S = newBuf<IndexedState>()
27 |   val A = newBuf<IndexedAction>()
28 |   
29 |   for (episode in 1..episodes) {
30 |     log.debug { "$episode/$episodes" }
31 |     var s = started()
32 |     S.clear(); S.append(s)
33 |     R.clear();R.append(0.0)
34 |     A.clear()
35 |     var T = 0
36 |     while (s.isNotTerminal) {
37 |       val a = b(s)
38 |       A.append(a)
39 |       val (s_next, reward) = a.sample()
40 |       S.append(s_next)
41 |       R.append(reward)
42 |       s = s_next
43 |       T++
44 |     }
45 |     var G = 0.0
46 |     var W = 1.0
47 |     for (t in T - 1 downTo 0) {
48 |       val s_t = S[t]
49 |       val a_t = A[t]
50 |       G = γ * G + R[t + 1]
51 |       C[s_t, a_t] += W
52 |       Q[s_t, a_t] += W / C[s_t, a_t] * (G - Q[s_t, a_t])
53 |       
54 |       val a_opt = argmax(s_t.actions) { Q[s_t, it] }
55 |       for (a in s_t.actions) {
56 |         π[s_t, a] = when {
57 |           a === a_opt -> 1.0
58 |           else -> 0.0
59 |         }
60 |       }
61 |       if (a_t !== a_opt) break
62 |       W = W * 1 / b[s_t, a_t]
63 |     }
64 |   }
65 |   val V = VFunc { 0.0 }
66 |   val result = tuple3(IndexedPolicy(π), V, Q)
67 |   V_from_Q(states, result)
68 |   return result
69 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/mc/On-Policy Optimal.kt:
--------------------------------------------------------------------------------
 1 | @file:Suppress("NAME_SHADOWING")
 2 | 
 3 | package lab.mars.rl.algo.mc
 4 | 
 5 | import lab.mars.rl.algo.V_from_Q
 6 | import lab.mars.rl.model.impl.mdp.IndexedMDP
 7 | import lab.mars.rl.model.impl.mdp.IndexedState
 8 | import lab.mars.rl.model.impl.mdp.OptimalSolution
 9 | import lab.mars.rl.model.isNotTerminal
10 | import lab.mars.rl.model.log
11 | import lab.mars.rl.util.buf.newBuf
12 | import lab.mars.rl.util.collection.fork
13 | import lab.mars.rl.util.log.debug
14 | import lab.mars.rl.util.math.argmax
15 | import lab.mars.rl.util.tuples.tuple3
16 | 
17 | fun IndexedMDP.`On-policy first-visit MC control`(episodes: Int): OptimalSolution {
18 |   val ε = 0.1
19 |   val π = equiprobablePolicy()
20 |   val Q = QFunc { 0.0 }
21 |   val tmpQ = QFunc { Double.NaN }
22 |   val count = QFunc { 0 }
23 |   val tmpS = newBuf<IndexedState>(states.size)
24 |   
25 |   for (episode in 1..episodes) {
26 |     log.debug { "$episode/$episodes" }
27 |     var s = started()
28 |     var accumulate = 0.0
29 |     while (s.isNotTerminal) {
30 |       val a = π(s)
31 |       val (s_next, reward) = a.sample()
32 |       if (tmpQ[s, a].isNaN())
33 |         tmpQ[s, a] = accumulate
34 |       accumulate += reward
35 |       s = s_next
36 |     }
37 |     tmpS.clear()
38 |     for ((s, a) in states.fork { it.actions }) {
39 |       val value = tmpQ[s, a]
40 |       if (!value.isNaN()) {
41 |         Q[s, a] += accumulate - value
42 |         count[s, a] += 1
43 |         tmpS.append(s)
44 |         tmpQ[s, a] = Double.NaN
45 |       }
46 |     }
47 |     for (s in tmpS) {
48 |       val a_opt = argmax(s.actions) {
49 |         val n = count[s, it]
50 |         if (n > 0)
51 |           Q[s, it] / n
52 |         else
53 |           Q[s, it]
54 |       }
55 |       val size = s.actions.size
56 |       for (a in s.actions) {
57 |         π[s, a] = when {
58 |           a === a_opt -> 1 - ε + ε / size
59 |           else -> ε / size
60 |         }
61 |       }
62 |     }
63 |   }
64 |   
65 |   Q.set { idx, value ->
66 |     val n = count[idx]
67 |     if (n > 0)
68 |       value / n
69 |     else
70 |       value
71 |   }
72 |   val V = VFunc { 0.0 }
73 |   val result = tuple3(π, V, Q)
74 |   V_from_Q(states, result)
75 |   return result
76 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/ntd/N-step Off-policy Sarsa.kt:
--------------------------------------------------------------------------------
 1 | @file:Suppress("NAME_SHADOWING")
 2 | 
 3 | package lab.mars.rl.algo.ntd
 4 | 
 5 | import lab.mars.rl.algo.V_from_Q
 6 | import lab.mars.rl.algo.`ε-greedy`
 7 | import lab.mars.rl.model.impl.mdp.IndexedAction
 8 | import lab.mars.rl.model.impl.mdp.IndexedMDP
 9 | import lab.mars.rl.model.impl.mdp.IndexedState
10 | import lab.mars.rl.model.impl.mdp.OptimalSolution
11 | import lab.mars.rl.model.isTerminal
12 | import lab.mars.rl.model.log
13 | import lab.mars.rl.util.buf.newBuf
14 | import lab.mars.rl.util.log.debug
15 | import lab.mars.rl.util.math.Π
16 | import lab.mars.rl.util.math.Σ
17 | import lab.mars.rl.util.tuples.tuple3
18 | import org.apache.commons.math3.util.FastMath.min
19 | import org.apache.commons.math3.util.FastMath.pow
20 | 
21 | fun IndexedMDP.`N-step off-policy sarsa`(
22 |     n: Int,
23 |     ε: Double,
24 |     α: (IndexedState, IndexedAction) -> Double,
25 |     episodes: Int): OptimalSolution {
26 |   val b = equiprobablePolicy()
27 |   val π = equiprobablePolicy()
28 |   
29 |   val Q = QFunc { 0.0 }
30 |   val _R = newBuf<Double>(min(n, MAX_N))
31 |   val _S = newBuf<IndexedState>(min(n, MAX_N))
32 |   val _A = newBuf<IndexedAction>(min(n, MAX_N))
33 |   
34 |   for (episode in 1..episodes) {
35 |     log.debug { "$episode/$episodes" }
36 |     var n = n
37 |     var T = Int.MAX_VALUE
38 |     var t = 0
39 |     var s = started()
40 |     var a = b(s)
41 |     _R.clear();_R.append(0.0)
42 |     _S.clear();_S.append(s)
43 |     _A.clear();_A.append(a)
44 |     do {
45 |       if (t >= n) {
46 |         _R.removeFirst()
47 |         _S.removeFirst()
48 |         _A.removeFirst()
49 |       }
50 |       if (t < T) {
51 |         val (s_next, reward) = a.sample()
52 |         _R.append(reward)
53 |         _S.append(s_next)
54 |         s = s_next
55 |         if (s.isTerminal) {
56 |           T = t + 1
57 |           val τ = t - n + 1
58 |           if (τ < 0) n = T
59 |         } else {
60 |           a = b(s)
61 |           _A.append(a)
62 |         }
63 |       }
64 |       val τ = t - n + 1
65 |       if (τ >= 0) {
66 |         val ρ = Π(1..min(n - 1, T - 1 - τ)) { π[_S[it], _A[it]] / b[_S[it], _A[it]] }
67 |         var G = Σ(1..min(n, T - τ)) { pow(γ, it - 1) * _R[it] }
68 |         if (τ + n < T) G += pow(γ, n) * Q[_S[n], _A[n]]
69 |         Q[_S[0], _A[0]] += α(_S[0], _A[0]) * ρ * (G - Q[_S[0], _A[0]])
70 |         `ε-greedy`(_S[0], Q, π, ε)
71 |       }
72 |       t++
73 |     } while (τ < T - 1)
74 |     log.debug { "n=$n,T=$T" }
75 |   }
76 |   val V = VFunc { 0.0 }
77 |   val result = tuple3(π, V, Q)
78 |   V_from_Q(states, result)
79 |   return result
80 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/ntd/N-step Sarsa.kt:
--------------------------------------------------------------------------------
 1 | @file:Suppress("NAME_SHADOWING")
 2 | 
 3 | package lab.mars.rl.algo.ntd
 4 | 
 5 | import lab.mars.rl.algo.V_from_Q
 6 | import lab.mars.rl.algo.`ε-greedy`
 7 | import lab.mars.rl.model.impl.mdp.*
 8 | import lab.mars.rl.model.isTerminal
 9 | import lab.mars.rl.model.log
10 | import lab.mars.rl.util.buf.newBuf
11 | import lab.mars.rl.util.log.debug
12 | import lab.mars.rl.util.math.Σ
13 | import lab.mars.rl.util.tuples.tuple3
14 | import org.apache.commons.math3.util.FastMath.min
15 | import org.apache.commons.math3.util.FastMath.pow
16 | 
17 | fun IndexedMDP.`N-step Sarsa`(
18 |     n: Int,
19 |     ε: Double,
20 |     α: (IndexedState, IndexedAction) -> Double,
21 |     episodes: Int): OptimalSolution {
22 |   val π = IndexedPolicy(QFunc { 0.0 })
23 |   val Q = QFunc { 0.0 }
24 |   val _R = newBuf<Double>(min(n, MAX_N))
25 |   val _S = newBuf<IndexedState>(min(n, MAX_N))
26 |   val _A = newBuf<IndexedAction>(min(n, MAX_N))
27 |   
28 |   for (episode in 1..episodes) {
29 |     log.debug { "$episode/$episodes" }
30 |     var n = n
31 |     var T = Int.MAX_VALUE
32 |     var t = 0
33 |     var s = started()
34 |     
35 |     `ε-greedy`(s, Q, π, ε)
36 |     var a = π(s)
37 |     _R.clear();_R.append(0.0)
38 |     _S.clear();_S.append(s)
39 |     _A.clear();_A.append(a)
40 |     do {
41 |       if (t >= n) {
42 |         _R.removeFirst()
43 |         _S.removeFirst()
44 |         _A.removeFirst()
45 |       }
46 |       if (t < T) {
47 |         val (s_next, reward) = a.sample()
48 |         _R.append(reward)
49 |         _S.append(s_next)
50 |         s = s_next
51 |         if (s.isTerminal) {
52 |           T = t + 1
53 |           val τ = t - n + 1
54 |           if (τ < 0) n = T //n is too large
55 |         } else {
56 |           `ε-greedy`(s, Q, π, ε)
57 |           a = π(s)
58 |           _A.append(a)
59 |         }
60 |       }
61 |       val τ = t - n + 1
62 |       if (τ >= 0) {
63 |         var G = Σ(1..min(n, T - τ)) { pow(γ, it - 1) * _R[it] }
64 |         if (τ + n < T) G += pow(γ, n) * Q[_S[n], _A[n]]
65 |         Q[_S[0], _A[0]] += α(_S[0], _A[0]) * (G - Q[_S[0], _A[0]])
66 |         `ε-greedy`(_S[0], Q, π, ε)
67 |       }
68 |       t++
69 |     } while (τ < T - 1)
70 |     log.debug { "n=$n,T=$T" }
71 |   }
72 |   val V = VFunc { 0.0 }
73 |   val result = tuple3(π, V, Q)
74 |   V_from_Q(states, result)
75 |   return result
76 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/ntd/N-step TD prediction.kt:
--------------------------------------------------------------------------------
 1 | @file:Suppress("NAME_SHADOWING")
 2 | 
 3 | package lab.mars.rl.algo.ntd
 4 | 
 5 | import lab.mars.rl.model.impl.mdp.IndexedMDP
 6 | import lab.mars.rl.model.impl.mdp.IndexedPolicy
 7 | import lab.mars.rl.model.impl.mdp.IndexedState
 8 | import lab.mars.rl.model.impl.mdp.StateValueFunction
 9 | import lab.mars.rl.model.isTerminal
10 | import lab.mars.rl.model.log
11 | import lab.mars.rl.util.buf.newBuf
12 | import lab.mars.rl.util.collection.IndexedCollection
13 | import lab.mars.rl.util.log.debug
14 | import lab.mars.rl.util.math.Σ
15 | import org.apache.commons.math3.util.FastMath.min
16 | import org.apache.commons.math3.util.FastMath.pow
17 | 
18 | val MAX_N = 1024
19 | 
20 | fun IndexedMDP.`N-step TD prediction`(
21 |     n: Int, π: IndexedPolicy,
22 |     α: Double,
23 |     episodes: Int,
24 |     episodeListener: (Int, IndexedCollection<Double>) -> Unit = { _, _ -> }): StateValueFunction {
25 |   val V = VFunc { 0.0 }
26 |   val R = newBuf<Double>(min(n, MAX_N))
27 |   val S = newBuf<IndexedState>(min(n, MAX_N))
28 |   
29 |   for (episode in 1..episodes) {
30 |     log.debug { "$episode/$episodes" }
31 |     var T = Int.MAX_VALUE
32 |     var n = n
33 |     var t = 0
34 |     var s = started()
35 |     R.clear();R.append(0.0)
36 |     S.clear();S.append(s)
37 |     
38 |     do {
39 |       if (t >= n) {
40 |         R.removeFirst(1)
41 |         S.removeFirst(1)
42 |       }
43 |       if (t < T) {
44 |         val a = π(s)
45 |         val (s_next, reward) = a.sample()
46 |         S.append(s_next)
47 |         R.append(reward)
48 |         s = s_next
49 |         if (s.isTerminal) {
50 |           T = t + 1
51 |           val τ = t - n + 1
52 |           if (τ < 0) n = T //n is too large
53 |         }
54 |       }
55 |       val τ = t - n + 1
56 |       if (τ >= 0) {
57 |         var G = Σ(1..min(n, T - τ)) { pow(γ, it - 1) * R[it] }
58 |         if (τ + n < T) G += pow(γ, n) * V[S[n]]
59 |         V[S[0]] += α * (G - V[S[0]])
60 |       }
61 |       t++
62 |     } while (τ < T - 1)
63 |     log.debug { "n=$n,T=$T" }
64 |     episodeListener(episode, V)
65 |   }
66 |   return V
67 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/ntd/N-step Treebackup.kt:
--------------------------------------------------------------------------------
 1 | @file:Suppress("NAME_SHADOWING")
 2 | 
 3 | package lab.mars.rl.algo.ntd
 4 | 
 5 | import lab.mars.rl.algo.V_from_Q
 6 | import lab.mars.rl.algo.`ε-greedy`
 7 | import lab.mars.rl.model.impl.mdp.IndexedAction
 8 | import lab.mars.rl.model.impl.mdp.IndexedMDP
 9 | import lab.mars.rl.model.impl.mdp.IndexedState
10 | import lab.mars.rl.model.impl.mdp.OptimalSolution
11 | import lab.mars.rl.model.isTerminal
12 | import lab.mars.rl.model.log
13 | import lab.mars.rl.util.buf.newBuf
14 | import lab.mars.rl.util.log.debug
15 | import lab.mars.rl.util.math.Σ
16 | import lab.mars.rl.util.tuples.tuple3
17 | import org.apache.commons.math3.util.FastMath.min
18 | 
19 | fun IndexedMDP.`N-step Treebackup`(
20 |     n: Int, ε: Double,
21 |     α: (IndexedState, IndexedAction) -> Double,
22 |     episodes: Int): OptimalSolution {
23 |   val π = equiprobablePolicy()
24 |   val Q = QFunc { 0.0 }
25 |   
26 |   val _Q = newBuf<Double>(min(n, MAX_N))
27 |   val _π = newBuf<Double>(min(n, MAX_N))
28 |   val δ = newBuf<Double>(min(n, MAX_N))
29 |   val _S = newBuf<IndexedState>(min(n, MAX_N))
30 |   val _A = newBuf<IndexedAction>(min(n, MAX_N))
31 |   
32 |   for (episode in 1..episodes) {
33 |     var n = n
34 |     log.debug { "$episode/$episodes" }
35 |     var T = Int.MAX_VALUE
36 |     var t = 0
37 |     var s = started()
38 |     var a = π(s)
39 |     
40 |     _Q.clear(); _Q.append(0.0)
41 |     _π.clear();_π.append(π[s, a])
42 |     δ.clear()
43 |     _S.clear();_S.append(s)
44 |     _A.clear(); _A.append(a)
45 |     
46 |     do {
47 |       if (t >= n) {
48 |         _Q.removeFirst()
49 |         _π.removeFirst()
50 |         δ.removeFirst()
51 |         _S.removeFirst()
52 |         _A.removeFirst()
53 |       }
54 |       if (t < T) {
55 |         val (s_next, reward) = a.sample()
56 |         _S.append(s_next)
57 |         s = s_next
58 |         if (s.isTerminal) {
59 |           δ.append(reward - _Q.last)
60 |           T = t + 1
61 |           val τ = t - n + 1
62 |           if (τ < 0) n = T //n is too large
63 |         } else {
64 |           δ.append(reward + γ * Σ(s.actions) { π[s, it] * Q[s, it] } - _Q.last)
65 |           a = s.actions.rand()
66 |           _A.append(a)
67 |           _Q.append(Q[s, a])
68 |           _π.append(π[s, a])
69 |         }
70 |       }
71 |       val τ = t - n + 1
72 |       if (τ >= 0) {
73 |         var Z = 1.0
74 |         var G = _Q[0]
75 |         val end = min(n - 1, T - 1 - τ)
76 |         for (k in 0..end) {
77 |           G += Z * δ[k]
78 |           if (k < end) Z *= γ * _π[k + 1]
79 |         }
80 |         Q[_S[0], _A[0]] += α(_S[0], _A[0]) * (G - Q[_S[0], _A[0]])
81 |         `ε-greedy`(_S[0], Q, π, ε)
82 |       }
83 |       t++
84 |     } while (τ < T - 1)
85 |     log.debug { "n=$n,T=$T" }
86 |   }
87 |   val V = VFunc { 0.0 }
88 |   val result = tuple3(π, V, Q)
89 |   V_from_Q(states, result)
90 |   return result
91 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/package.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo
 2 | 
 3 | import lab.mars.rl.model.Action
 4 | import lab.mars.rl.model.ApproximateFunction
 5 | import lab.mars.rl.model.State
 6 | import lab.mars.rl.model.impl.mdp.*
 7 | import lab.mars.rl.model.isNotTerminal
 8 | import lab.mars.rl.util.collection.Gettable
 9 | import lab.mars.rl.util.collection.filter
10 | import lab.mars.rl.util.collection.fork
11 | import lab.mars.rl.util.math.argmax
12 | import lab.mars.rl.util.math.argmax_tie_random
13 | import lab.mars.rl.util.math.Σ
14 | 
15 | /**
16 |  * <p>
17 |  * Created on 2017-09-06.
18 |  * </p>
19 |  *
20 |  * @author wumo
21 |  */
22 | 
23 | typealias EpisodeListener = (Int, Int, State, Double) -> Unit
24 | 
25 | typealias StepListener = (Int, Int, State, Action<State>) -> Unit
26 | 
27 | fun V_from_Q(states: StateSet, pvq: OptimalSolution) {
28 |   val (π, V, Q) = pvq
29 |   for (s in states.filter { it.isNotTerminal }) {
30 |     V[s] = Σ(s.actions) {
31 |       π[s, it] * Q[s, it]
32 |     }
33 |   }
34 | }
35 | 
36 | fun Q_from_V(gamma: Double, states: StateSet, pvq: OptimalSolution) {
37 |   val (_, V, Q) = pvq
38 |   for ((s, a) in states.fork { it.actions })
39 |     Q[s, a] = Σ(a.possibles) { probability * (reward + gamma * V[next]) }
40 | }
41 | 
42 | fun average_α(indexedMdp: IndexedMDP): (IndexedState, IndexedAction) -> Double {
43 |   val N = indexedMdp.QFunc { 0 }
44 |   return { s, a ->
45 |     N[s, a]++
46 |     1.0 / N[s, a]
47 |   }
48 | }
49 | 
50 | fun `ε-greedy`(s: IndexedState, Q: ActionValueFunction, π: IndexedPolicy, ε: Double) {
51 |   val a_opt = argmax(s.actions) { Q[s, it] }
52 |   val size = s.actions.size
53 |   for (a in s.actions) {
54 |     π[s, a] = when {
55 |       a === a_opt -> 1 - ε + ε / size
56 |       else -> ε / size
57 |     }
58 |   }
59 | }
60 | 
61 | fun `ε-greedy`(s: IndexedState, evaluate: Gettable<Action<State>, Double>, π: IndexedPolicy, ε: Double) {
62 |   val a_opt = argmax(s.actions) { evaluate[it] }
63 |   val size = s.actions.size
64 |   for (a in s.actions) {
65 |     π[s, a] = when {
66 |       a === a_opt -> 1 - ε + ε / size
67 |       else -> ε / size
68 |     }
69 |   }
70 | }
71 | 
72 | fun <E> `ε-greedy`(s: IndexedState, Q: ApproximateFunction<E>, π: IndexedPolicy, ε: Double) {
73 |   val a_opt = argmax(s.actions) { Q(s, it) }
74 |   val size = s.actions.size
75 |   for (a in s.actions) {
76 |     π[s, a] = when {
77 |       a === a_opt -> 1 - ε + ε / size
78 |       else -> ε / size
79 |     }
80 |   }
81 | }
82 | 
83 | fun `ε-greedy (tie broken randomly)`(s: IndexedState, Q: ActionValueFunction, π: IndexedPolicy, ε: Double) {
84 |   val a_opt = argmax_tie_random(s.actions) { Q[s, it] }
85 |   val size = s.actions.size
86 |   for (a in s.actions) {
87 |     π[s, a] = when {
88 |       a === a_opt -> 1 - ε + ε / size
89 |       else -> ε / size
90 |     }
91 |   }
92 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/policy_gradient/Actor-Critic with Eligibility Traces (continuing).kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.policy_gradient
 2 | 
 3 | import lab.mars.rl.model.*
 4 | import lab.mars.rl.model.impl.func.LinearFunc
 5 | import lab.mars.rl.util.log.debug
 6 | import lab.mars.rl.util.math.rand
 7 | import lab.mars.rl.util.matrix.Matrix
 8 | import lab.mars.rl.util.matrix.MatrixSpec
 9 | import lab.mars.rl.util.matrix.times
10 | import lab.mars.rl.util.matrix.Σ
11 | import kotlin.math.exp
12 | 
13 | fun <E> MDP.`Actor-Critic with Eligibility Traces (continuing)`(
14 |     h: LinearFunc<E>, α_θ: Double, λ_θ: Double,
15 |     v: ApproximateFunction<E>, α_w: Double, λ_w: Double, η: Double,
16 |     episodes: Int,
17 |     z_maker: (Int, Int) -> MatrixSpec = { m, n -> Matrix(m, n) }) {
18 |   for (episode in 1..episodes) {
19 |     log.debug { "$episode/$episodes" }
20 |     var step = 0
21 |     var s = started()
22 |     val z_θ = z_maker(h.w.size, 1)
23 |     val z_w = z_maker(v.w.size, 1)
24 |     var averageR = 0.0
25 |     while (s.isNotTerminal) {
26 |       step++
27 |       val a = rand(s.actions) { exp(h(s, it)) }
28 |       val (s_next, reward) = a.sample()
29 |       val δ = reward - averageR + γ * if (s_next.isTerminal) 0.0 else v(s_next) - v(s)
30 |       averageR += η * δ
31 |       z_w `=` λ_w * z_w + v.`∇`(s)
32 |       val `∇` = h.x(s, a) - Σ(s.actions) { b ->
33 |         val tmp = h(s, b)
34 |         h.x(s, b) / s.actions.sumByDouble { exp(h(s, it) - tmp) }
35 |       }
36 |       z_θ `=` λ_θ * z_θ + `∇`
37 |       v.w += α_w * δ * z_w
38 |       h.w += α_θ * δ * z_θ
39 |       s = s_next
40 |     }
41 |   }
42 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/policy_gradient/Actor-Critic with Eligibility Traces (episodic).kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.policy_gradient
 2 | 
 3 | import lab.mars.rl.model.*
 4 | import lab.mars.rl.model.impl.func.LinearFunc
 5 | import lab.mars.rl.util.log.debug
 6 | import lab.mars.rl.util.math.rand
 7 | import lab.mars.rl.util.matrix.Matrix
 8 | import lab.mars.rl.util.matrix.MatrixSpec
 9 | import lab.mars.rl.util.matrix.times
10 | import lab.mars.rl.util.matrix.Σ
11 | import kotlin.math.exp
12 | 
13 | fun <E> MDP.`Actor-Critic with Eligibility Traces (episodic)`(
14 |     h: LinearFunc<E>, α_θ: Double, λ_θ: Double,
15 |     v: ApproximateFunction<E>, α_w: Double, λ_w: Double,
16 |     episodes: Int,
17 |     z_maker: (Int, Int) -> MatrixSpec = { m, n -> Matrix(m, n) },
18 |     maxStep: Int = Int.MAX_VALUE,
19 |     episodeListener: (Int, Int, State, Double) -> Unit = { _, _, _, _ -> },
20 |     stepListener: (Int, Int, State, Action<State>) -> Unit = { _, _, _, _ -> }) {
21 |   for (episode in 1..episodes) {
22 |     log.debug { "$episode/$episodes" }
23 |     var step = 0
24 |     var G = 0.0
25 |     var s = started()
26 |     val z_θ = z_maker(h.w.size, 1)
27 |     val z_w = z_maker(v.w.size, 1)
28 |     var γ_t = 1.0
29 |     while (s.isNotTerminal) {
30 |       step++
31 |       val a = rand(s.actions) { exp(h(s, it)) }
32 |       val (s_next, reward) = a.sample()
33 |       G += γ_t * reward
34 |       val δ = reward + γ * (if (s_next.isTerminal) 0.0 else v(s_next)) - v(s)
35 |       z_w *= γ * λ_w
36 |       z_w += γ_t * v.`∇`(s)
37 | //      z_w `=` γ * λ_w * z_w + γ_t * v.`∇`(s)
38 |       val `∇` = h.x(s, a) - Σ(s.actions) { b ->
39 |         val tmp = h(s, b)
40 |         h.x(s, b) / s.actions.sumByDouble { exp(h(s, it) - tmp) }
41 |       }
42 |       z_θ *= γ * λ_θ
43 |       z_θ += γ_t * `∇`
44 | //      z_θ `=` γ * λ_θ * z_θ + γ_t * `∇`
45 |       v.w += α_w * δ * z_w
46 |       h.w += α_θ * δ * z_θ
47 |       γ_t *= γ
48 |       stepListener(episode, step, s, a)
49 |       s = s_next
50 |       if (step >= maxStep) break
51 |     }
52 |     episodeListener(episode, step, s, G)
53 |   }
54 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/policy_gradient/One-step Actor-Critic (episodic).kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.policy_gradient
 2 | 
 3 | import lab.mars.rl.model.*
 4 | import lab.mars.rl.model.impl.func.LinearFunc
 5 | import lab.mars.rl.util.log.debug
 6 | import lab.mars.rl.util.math.rand
 7 | import lab.mars.rl.util.matrix.times
 8 | import lab.mars.rl.util.matrix.Σ
 9 | import kotlin.math.exp
10 | 
11 | fun <E> MDP.`One-step Actor-Critic (episodic)`(
12 |     h: LinearFunc<E>, α_θ: Double,
13 |     v: ApproximateFunction<E>, α_w: Double,
14 |     episodes: Int) {
15 |   for (episode in 1..episodes) {
16 |     log.debug { "$episode/$episodes" }
17 |     var step = 0
18 |     var s = started()
19 |     var γ_t = 1.0
20 |     while (s.isNotTerminal) {
21 |       step++
22 |       val a = rand(s.actions) { exp(h(s, it)) }
23 |       val (s_next, reward) = a.sample()
24 |       val δ = reward + γ * (if (s_next.isTerminal) 0.0 else v(s_next)) - v(s)
25 |       v.w += α_w * γ_t * δ * v.`∇`(s)
26 |       val `∇` = h.x(s, a) - Σ(s.actions) { b ->
27 |         val tmp = h(s, b)
28 |         h.x(s, b) / s.actions.sumByDouble { exp(h(s, it) - tmp) }
29 |       }
30 |       h.w += α_θ * γ_t * δ * `∇`
31 |       γ_t *= γ
32 |       s = s_next
33 |     }
34 |   }
35 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/policy_gradient/REINFORCE with Baseline (episodic).kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.policy_gradient
 2 | 
 3 | import lab.mars.rl.model.*
 4 | import lab.mars.rl.model.impl.func.LinearFunc
 5 | import lab.mars.rl.util.buf.newBuf
 6 | import lab.mars.rl.util.log.debug
 7 | import lab.mars.rl.util.math.rand
 8 | import lab.mars.rl.util.matrix.times
 9 | import lab.mars.rl.util.matrix.Σ
10 | import kotlin.math.exp
11 | 
12 | fun <E> MDP.`REINFORCE with Baseline (episodic)`(
13 |     h: LinearFunc<E>, α_θ: Double,
14 |     v: ApproximateFunction<E>, α_w: Double,
15 |     episodes: Int,
16 |     episodeListener: (Int, Int, State, Double) -> Unit = { _, _, _, _ -> },
17 |     stepListener: (Int, Int, State, Action<State>) -> Unit = { _, _, _, _ -> }) {
18 |   for (episode in 1..episodes) {
19 |     log.debug { "$episode/$episodes" }
20 |     var step = 0
21 |     var s = started()
22 |     var a = rand(s.actions) { exp(h(s, it)) }
23 |     val S = newBuf<State>()
24 |     val A = newBuf<Action<State>>()
25 |     val R = newBuf<Double>()
26 |     
27 |     S.append(s)
28 |     R.append(0.0)
29 |     var accu = 0.0
30 |     var T: Int
31 |     while (true) {
32 |       step++
33 |       A.append(a)
34 |       val (s_next, reward) = a.sample()
35 |       accu += reward
36 |       stepListener(episode, step, s, a)
37 |       R.append(accu)
38 |       S.append(s_next)
39 |       s = s_next
40 |       if (s_next.isTerminal) {
41 |         T = step
42 |         break
43 |       }
44 |       
45 |       a = rand(s.actions) { exp(h(s, it)) }
46 |     }
47 |     var γ_t = 1.0
48 |     for (t in 0 until T) {
49 |       val G = accu - R[t]
50 |       val δ = G - v(S[t])
51 |       v.w += α_w * γ_t * δ * v.`∇`(S[t])
52 |       val `∇` = h.x(S[t], A[t]) - Σ(S[t].actions) { b ->
53 |         val tmp = exp(h(S[t], b))
54 |         h.x(S[t], b) / S[t].actions.sumByDouble { exp(h(S[t], it) - tmp) }
55 |       }
56 |       h.w += α_θ * γ_t * δ * `∇`
57 |       γ_t *= γ
58 |     }
59 |     episodeListener(episode, T, s, accu)
60 |   }
61 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/policy_gradient/REINFORCE.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.policy_gradient
 2 | 
 3 | import lab.mars.rl.model.*
 4 | import lab.mars.rl.model.impl.func.LinearFunc
 5 | import lab.mars.rl.util.buf.newBuf
 6 | import lab.mars.rl.util.log.debug
 7 | import lab.mars.rl.util.math.rand
 8 | import lab.mars.rl.util.matrix.times
 9 | import lab.mars.rl.util.matrix.Σ
10 | import kotlin.math.exp
11 | 
12 | fun <E> MDP.REINFORCE(h: LinearFunc<E>, α: Double, episodes: Int) {
13 |   for (episode in 1..episodes) {
14 |     log.debug { "$episode/$episodes" }
15 |     var step = 0
16 |     var s = started()
17 |     var a = rand(s.actions) { exp(h(s, it)) }
18 |     val S = newBuf<State>()
19 |     val A = newBuf<Action<State>>()
20 |     val R = newBuf<Double>()
21 |     
22 |     S.append(s)
23 |     R.append(0.0)
24 |     var accu = 0.0
25 |     var T: Int
26 |     while (true) {
27 |       step++
28 |       A.append(a)
29 |       val (s_next, reward) = a.sample()
30 |       accu += reward
31 |       R.append(accu)
32 |       S.append(s_next)
33 |       s = s_next
34 |       if (s_next.isTerminal) {
35 |         T = step
36 |         break
37 |       }
38 |       a = rand(s.actions) { exp(h(s, it)) }
39 |     }
40 |     var γ_t = 1.0
41 |     for (t in 0 until T) {
42 |       val G = accu - R[t]
43 |       val `∇` = h.x(S[t], A[t]) - Σ(S[t].actions) { b ->
44 |         val tmp = exp(h(S[t], b))
45 |         h.x(S[t], b) / S[t].actions.sumByDouble { exp(h(S[t], it) - tmp) }
46 |       }
47 |       h.w += α * γ_t * G * `∇`
48 |       γ_t *= γ
49 |     }
50 |   }
51 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/td/DoubleQLearning.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.td
 2 | 
 3 | import lab.mars.rl.algo.V_from_Q
 4 | import lab.mars.rl.model.impl.mdp.*
 5 | import lab.mars.rl.model.isNotTerminal
 6 | import lab.mars.rl.model.log
 7 | import lab.mars.rl.util.log.debug
 8 | import lab.mars.rl.util.math.Rand
 9 | import lab.mars.rl.util.math.argmax
10 | import lab.mars.rl.util.tuples.tuple3
11 | 
12 | fun IndexedMDP.DoubleQLearning(
13 |     ε: Double,
14 |     α: (IndexedState, IndexedAction) -> Double,
15 |     episodes: Int): OptimalSolution {
16 |   fun `ε-greedy`(s: IndexedState, Q1: ActionValueFunction, Q2: ActionValueFunction, π: IndexedPolicy) {
17 |     val a_opt = argmax(s.actions) { Q1[s, it] + Q2[s, it] }
18 |     val size = s.actions.size
19 |     for (a in s.actions) {
20 |       π[s, a] = when {
21 |         a === a_opt -> 1 - ε + ε / size
22 |         else -> ε / size
23 |       }
24 |     }
25 |   }
26 |   
27 |   val π = IndexedPolicy(QFunc { 0.0 })
28 |   var Q1 = QFunc { 0.0 }
29 |   var Q2 = QFunc { 0.0 }
30 |   
31 |   for (episode in 1..episodes) {
32 |     log.debug { "$episode/$episodes" }
33 |     var s = started()
34 |     while (true) {
35 |       `ε-greedy`(s, Q1, Q2, π)
36 |       val a = π(s)
37 |       val (s_next, reward) = a.sample()
38 |       if (Rand().nextBoolean()) {
39 |         val tmp = Q1
40 |         Q1 = Q2
41 |         Q2 = tmp
42 |       }
43 |       if (s_next.isNotTerminal) {
44 |         Q1[s, a] += α(s, a) * (reward + γ * Q2[s_next, argmax(s_next.actions) { Q1[s_next, it] }] - Q1[s, a])
45 |         s = s_next
46 |       } else {
47 |         Q1[s, a] += α(s, a) * (reward + γ * 0.0 - Q1[s, a])//Q[terminalState,*]=0.0
48 |         break
49 |       }
50 |     }
51 |   }
52 |   val V = VFunc { 0.0 }
53 |   val result = tuple3(π, V, Q1)
54 |   V_from_Q(states, result)
55 |   return result
56 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/td/ExpectedSarsa.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.td
 2 | 
 3 | import lab.mars.rl.algo.V_from_Q
 4 | import lab.mars.rl.algo.`ε-greedy`
 5 | import lab.mars.rl.model.impl.mdp.*
 6 | import lab.mars.rl.model.isNotTerminal
 7 | import lab.mars.rl.model.log
 8 | import lab.mars.rl.util.log.debug
 9 | import lab.mars.rl.util.math.Σ
10 | import lab.mars.rl.util.tuples.tuple3
11 | 
12 | fun IndexedMDP.expectedSarsa(
13 |     ε: Double,
14 |     α: (IndexedState, IndexedAction) -> Double,
15 |     episodes: Int): OptimalSolution {
16 |   val π = IndexedPolicy(QFunc { 0.0 })
17 |   val Q = QFunc { 0.0 }
18 |   
19 |   for (episode in 1..episodes) {
20 |     log.debug { "$episode/$episodes" }
21 |     var s = started()
22 |     while (s.isNotTerminal) {
23 |       `ε-greedy`(s, Q, π, ε)
24 |       val a = π(s)
25 |       val (s_next, reward) = a.sample()
26 |       Q[s, a] += α(s, a) * (reward + γ * Σ(s_next.actions) { π[s_next, it] * Q[s_next, it] } - Q[s, a])
27 |       s = s_next
28 |     }
29 |   }
30 |   val V = VFunc { 0.0 }
31 |   val result = tuple3(π, V, Q)
32 |   V_from_Q(states, result)
33 |   return result
34 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/td/QLearning.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.td
 2 | 
 3 | import lab.mars.rl.algo.V_from_Q
 4 | import lab.mars.rl.algo.`ε-greedy`
 5 | import lab.mars.rl.model.impl.mdp.*
 6 | import lab.mars.rl.model.isNotTerminal
 7 | import lab.mars.rl.model.log
 8 | import lab.mars.rl.util.log.debug
 9 | import lab.mars.rl.util.math.max
10 | import lab.mars.rl.util.tuples.tuple3
11 | 
12 | fun IndexedMDP.QLearning(
13 |     ε: Double,
14 |     α: (IndexedState, IndexedAction) -> Double,
15 |     episodes: Int): OptimalSolution {
16 |   val π = IndexedPolicy(QFunc { 0.0 })
17 |   val Q = QFunc { 0.0 }
18 |   
19 |   for (episode in 1..episodes) {
20 |     log.debug { "$episode/$episodes" }
21 |     var s = started()
22 |     while (s.isNotTerminal) {
23 |       `ε-greedy`(s, Q, π, ε)
24 |       val a = π(s)
25 |       val (s_next, reward) = a.sample()
26 |       Q[s, a] += α(s, a) * (reward + γ * max(s_next.actions, 0.0) { Q[s_next, it] } - Q[s, a])
27 |       s = s_next
28 |     }
29 |   }
30 |   val V = VFunc { 0.0 }
31 |   val result = tuple3(π, V, Q)
32 |   V_from_Q(states, result)
33 |   return result
34 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/td/Sarsa.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.td
 2 | 
 3 | import lab.mars.rl.algo.V_from_Q
 4 | import lab.mars.rl.algo.`ε-greedy`
 5 | import lab.mars.rl.model.impl.mdp.*
 6 | import lab.mars.rl.model.isNotTerminal
 7 | import lab.mars.rl.model.log
 8 | import lab.mars.rl.util.log.debug
 9 | import lab.mars.rl.util.tuples.tuple3
10 | 
11 | fun IndexedMDP.sarsa(
12 |     ε: Double,
13 |     α: (IndexedState, IndexedAction) -> Double,
14 |     episodes: Int): OptimalSolution {
15 |   val π = IndexedPolicy(QFunc { 0.0 })
16 |   val Q = QFunc { 0.0 }
17 |   
18 |   for (episode in 1..episodes) {
19 |     log.debug { "$episode/$episodes" }
20 |     var s = started()
21 |     `ε-greedy`(s, Q, π, ε)
22 |     var a = π(s)
23 |     while (true) {
24 |       val (s_next, reward) = a.sample()
25 |       if (s_next.isNotTerminal) {
26 |         `ε-greedy`(s_next, Q, π, ε)
27 |         val a_next = π(s_next)
28 |         Q[s, a] += α(s, a) * (reward + γ * Q[s_next, a_next] - Q[s, a])
29 |         s = s_next
30 |         a = a_next
31 |       } else {
32 |         Q[s, a] += α(s, a) * (reward + γ * 0.0 - Q[s, a])//Q[terminalState,*]=0.0
33 |         break
34 |       }
35 |     }
36 |   }
37 |   val V = VFunc { 0.0 }
38 |   val result = tuple3(π, V, Q)
39 |   V_from_Q(states, result)
40 |   return result
41 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/td/Tabular TD(0).kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.td
 2 | 
 3 | import lab.mars.rl.model.impl.mdp.IndexedMDP
 4 | import lab.mars.rl.model.impl.mdp.IndexedPolicy
 5 | import lab.mars.rl.model.impl.mdp.StateValueFunction
 6 | import lab.mars.rl.model.isNotTerminal
 7 | import lab.mars.rl.model.log
 8 | import lab.mars.rl.util.log.debug
 9 | 
10 | fun IndexedMDP.`Tabular TD(0)`(π: IndexedPolicy, α: Double, episodes: Int): StateValueFunction {
11 |   val V = VFunc { 0.0 }
12 |   for (episode in 1..episodes) {
13 |     log.debug { "$episode/$episodes" }
14 |     var s = started()
15 |     while (s.isNotTerminal) {
16 |       val a = π(s)
17 |       val (s_next, reward) = a.sample()
18 |       V[s] += α * (reward + γ * V[s_next] - V[s])
19 |       s = s_next
20 |     }
21 |   }
22 |   return V
23 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/model/ApproximateFunction.kt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wumo/Reinforcement-Learning-An-Introduction/d419efccdca9a7f34b7805929b41651b172a3ee7/src/main/kotlin/lab/mars/rl/model/ApproximateFunction.kt


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/model/MDP.kt:
--------------------------------------------------------------------------------
 1 | @file:Suppress("NOTHING_TO_INLINE", "OVERRIDE_BY_INLINE", "UNCHECKED_CAST")
 2 | 
 3 | package lab.mars.rl.model
 4 | 
 5 | import lab.mars.rl.model.impl.mdp.IndexedAction
 6 | import lab.mars.rl.model.impl.mdp.IndexedPossible
 7 | import lab.mars.rl.model.impl.mdp.IndexedState
 8 | import lab.mars.rl.model.impl.mdp.PossibleSet
 9 | import lab.mars.rl.util.buf.DefaultIntBuf
10 | import lab.mars.rl.util.collection.emptyNSet
11 | import org.slf4j.LoggerFactory
12 | 
13 | /**
14 |  * <p>
15 |  * Created on 2017-08-31.
16 |  * </p>
17 |  *
18 |  * @author wumo
19 |  */
20 | 
21 | interface MDP {
22 |   val γ: Double
23 |   val started: () -> State
24 | }
25 | 
26 | interface Policy {
27 |   /**sample action when in state [s]*/
28 |   operator fun invoke(s: State): Action<State>
29 |   
30 |   /**probability of taking action [a] when in state [s]*/
31 |   operator fun get(s: State, a: Action<State>): Double
32 |   
33 |   fun greedy(s: State): Action<State>
34 | }
35 | 
36 | interface RandomIterable<out E>: Iterable<E> {
37 |   fun rand(): E
38 |   val size: Int
39 | }
40 | 
41 | interface State {
42 |   val actions: RandomIterable<Action<State>>
43 | }
44 | 
45 | inline val State.isTerminal
46 |   get() = !isNotTerminal
47 | 
48 | inline val State.isNotTerminal
49 |   get() = actions.any()
50 | 
51 | interface Action<out S: State> {
52 |   val sample: () -> Possible<S>
53 | }
54 | 
55 | open class Possible<out S: State>(val next: S, val reward: Double) {
56 |   open operator fun component1() = next
57 |   open operator fun component2() = reward
58 | }
59 | 
60 | val log = LoggerFactory.getLogger(MDP::class.java)!!
61 | val null_index = DefaultIntBuf.of(-1)
62 | val null_state = IndexedState(null_index)
63 | val null_action = IndexedAction(null_index)
64 | val null_possible = IndexedPossible(null_state, 0.0, 0.0)
65 | val emptyPossibleSet: PossibleSet = emptyNSet()
66 | 


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/model/impl/func/LinearFunc.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.model.impl.func
 2 | 
 3 | import lab.mars.rl.model.ApproximateFunction
 4 | import lab.mars.rl.util.matrix.Matrix
 5 | import lab.mars.rl.util.matrix.MatrixSpec
 6 | import org.apache.commons.math3.util.FastMath.*
 7 | 
 8 | abstract class Feature<E>(val conv: (Array<out Any>) -> E) {
 9 |   operator fun invoke(vararg args: Any): MatrixSpec = _invoke(conv(args))
10 |   
11 |   abstract fun _invoke(s: E): MatrixSpec
12 |   abstract val numOfComponents: Int
13 | }
14 | 
15 | operator fun DoubleArray.times(elements: DoubleArray): Double {
16 |   var result = 0.0
17 |   for (i in 0..lastIndex)
18 |     result += this[i] * elements[i]
19 |   return result
20 | }
21 | 
22 | class SimplePolynomial(override val numOfComponents: Int, conv: (Array<out Any>) -> Double) : Feature<Double>(conv) {
23 |   override fun _invoke(s: Double) = Matrix.column(numOfComponents) {
24 |     pow(s, it)
25 |   }
26 | }
27 | 
28 | class SimpleFourier(override val numOfComponents: Int, conv: (Array<out Any>) -> Double) : Feature<Double>(conv) {
29 |   override fun _invoke(s: Double) = Matrix.column(numOfComponents) {
30 |     cos(it * PI * s)
31 |   }
32 | }
33 | 
34 | class LinearFunc<E>(val x: Feature<E>) : ApproximateFunction<E>(x.conv) {
35 |   override fun `_∇`(input: E) = x._invoke(input)
36 |   
37 |   override val w = Matrix.column(x.numOfComponents)
38 |   
39 |   override fun _invoke(input: E) = (w `T*` x._invoke(input)).toScalar
40 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/model/impl/func/SimpleCoarseCoding.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.model.impl.func
 2 | 
 3 | import lab.mars.rl.util.matrix.Matrix
 4 | 
 5 | val ClosedRange<Double>.size: Double
 6 |   get() = endInclusive - start
 7 | 
 8 | class SimpleCoarseCoding(featureWidth: Double, domain: ClosedRange<Double>,
 9 |                          override val numOfComponents: Int, conv: (Array<out Any>) -> Double): Feature<Double>(conv) {
10 |   val features: Array<ClosedRange<Double>>
11 |   
12 |   init {
13 |     val step = (domain.size - featureWidth) / (numOfComponents - 1)
14 |     var left = domain.start
15 |     features = Array(numOfComponents) {
16 |       (left..(left + featureWidth)).apply { left += step }
17 |     }
18 |   }
19 |   
20 |   override fun _invoke(s: Double) = Matrix.column(numOfComponents) {
21 |     if (features[it].contains(s)) 1.0 //quantize the interval
22 |     else 0.0
23 |   }
24 |   
25 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/model/impl/func/SimpleTileCoding.kt:
--------------------------------------------------------------------------------
 1 | @file:Suppress("NAME_SHADOWING")
 2 | 
 3 | package lab.mars.rl.model.impl.func
 4 | 
 5 | import lab.mars.rl.util.matrix.Matrix
 6 | 
 7 | class SimpleTileCoding(numOfTilings: Int,
 8 |                        _tilingSize: Int,
 9 |                        val tileWidth: Int,
10 |                        val tilingOffset: Double, conv: (Array<out Any>) -> Double): Feature<Double>(conv) {
11 |   val tilingSize = _tilingSize + 1
12 |   override val numOfComponents = numOfTilings * tilingSize
13 |   
14 |   override fun _invoke(s: Double): Matrix {
15 |     return Matrix.column(numOfComponents) {
16 |       val tilingIdx = it / tilingSize
17 |       val tileIdx = it % tilingSize
18 |       val start = -tileWidth + tilingIdx * tilingOffset + tileIdx * tileWidth
19 |       if (start <= s && s < start + tileWidth) 1.0 else 0.0
20 |     }
21 |   }
22 |   
23 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/model/impl/func/StateAggregation.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.model.impl.func
 2 | 
 3 | import lab.mars.rl.model.ApproximateFunction
 4 | import lab.mars.rl.util.matrix.Matrix
 5 | import org.apache.commons.math3.util.FastMath.ceil
 6 | 
 7 | class StateAggregation(numStates: Int, val numOfGroups: Int, conv: (Array<out Any>) -> Int): ApproximateFunction<Int>(conv) {
 8 |   override fun `_∇`(input: Int): Matrix {
 9 |     val groupIdx = input / groupSize
10 |     return Matrix.column(numOfGroups) { if (it == groupIdx) 1.0 else 0.0 }
11 |   }
12 |   
13 |   override val w = Matrix.column(numOfGroups) { 0.0 }
14 |   val groupSize = ceil(numStates.toDouble() / numOfGroups).toInt()
15 |   
16 |   override fun _invoke(input: Int): Double {
17 |     val groupIdx = input / groupSize
18 |     return w[groupIdx]
19 |   }
20 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/model/impl/func/SuttonTileCoding.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.model.impl.func
 2 | 
 3 | import lab.mars.rl.util.matrix.MatrixSpec
 4 | import lab.mars.rl.util.matrix.SparseMatrix
 5 | import lab.mars.rl.util.tuples.tuple2
 6 | import org.apache.commons.math3.util.FastMath.*
 7 | 
 8 | val MAXIMUM_CAPACITY = 1 shl 30
 9 | 
10 | /**
11 |  * @param unit_scales scale input unit to tile coding unit so as to ensure expected resolution. Usually defined as #(grid tilings)/(range of data).
12 |  */
13 | private val emptyDoubleArray = DoubleArray(0)
14 | 
15 | class SuttonTileCoding(numTilesPerTiling: Int, _numTilings: Int, val unit_scales: DoubleArray = emptyDoubleArray, val allowCollisions: Boolean = false,
16 |                        conv: (Array<out Any>) -> tuple2<DoubleArray, IntArray>) : Feature<tuple2<DoubleArray, IntArray>>(conv) {
17 |   val numTilings = tableSizeFor(_numTilings)
18 |   override val numOfComponents = numTilings * (numTilesPerTiling + 1)
19 |   override fun _invoke(s: tuple2<DoubleArray, IntArray>): MatrixSpec {
20 |     val (floats, ints) = s
21 |     val activeTiles = tiles(floats, ints)
22 |     val x = SparseMatrix(numOfComponents, 1)
23 |     for (activeTile in activeTiles)
24 |       x[activeTile] = 1.0
25 |     return x
26 |   }
27 |   
28 |   val data = HashMap<ArrayList<Double>, Int>(ceil(numOfComponents / 0.75).toInt())
29 |   
30 |   private fun tiles(floats: DoubleArray, ints: IntArray): IntArray {
31 |     val qfloats = DoubleArray(floats.size) {
32 |       floor(floats[it] * (if (it <= unit_scales.lastIndex) unit_scales[it] else 1.0) * numTilings)
33 |     }
34 |     val result = IntArray(numTilings)
35 |     for (tiling in 0 until numTilings) {
36 |       val tilingX2 = tiling * 2
37 |       val coords = ArrayList<Double>(1 + floats.size + ints.size)
38 |       coords.add(tiling.toDouble())
39 |       var b = tiling
40 |       for (q in qfloats) {
41 |         coords.add(floor(((q + b) / numTilings)))
42 |         b += tilingX2
43 |       }
44 |       for (int in ints)
45 |         coords.add(int.toDouble())
46 |       if (data.size < numOfComponents)
47 |         result[tiling] = data.getOrPut(coords, { data.size })
48 |       else if (allowCollisions)
49 |         abs(coords.hashCode()) % numOfComponents
50 |     }
51 |     return result
52 |   }
53 |   
54 |   /** Returns a power of two size for the given target capacity.*/
55 |   fun tableSizeFor(cap: Int): Int {
56 |     var n = cap - 1
57 |     n = n or n.ushr(1)
58 |     n = n or n.ushr(2)
59 |     n = n or n.ushr(4)
60 |     n = n or n.ushr(8)
61 |     n = n or n.ushr(16)
62 |     return if (n < 0) 1 else if (n >= MAXIMUM_CAPACITY) MAXIMUM_CAPACITY else n + 1
63 |   }
64 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/model/impl/mdp/DefaultAction.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.model.impl.mdp
2 | 
3 | import lab.mars.rl.model.Action
4 | import lab.mars.rl.model.Possible
5 | import lab.mars.rl.model.State
6 | 
7 | class DefaultAction<out E, out S: State>(val value: E, override val sample: () -> Possible<S>): Action<S>


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/model/impl/mdp/DefaultMDP.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.model.impl.mdp
2 | 
3 | import lab.mars.rl.model.MDP
4 | import lab.mars.rl.model.State
5 | 
6 | class DefaultMDP(override val γ: Double, override val started: () -> State): MDP


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/model/impl/mdp/EpsilonGreedyFunctionPolicy.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.model.impl.mdp
 2 | 
 3 | import lab.mars.rl.model.Action
 4 | import lab.mars.rl.model.ApproximateFunction
 5 | import lab.mars.rl.model.Policy
 6 | import lab.mars.rl.model.State
 7 | import lab.mars.rl.util.math.Rand
 8 | import lab.mars.rl.util.math.argmax_tie_random
 9 | import lab.mars.rl.util.math.max_count
10 | 
11 | class EpsilonGreedyFunctionPolicy<E>(val q: ApproximateFunction<E>, val ε: Double = 0.1): Policy {
12 |   override fun invoke(s: State): Action<State> {
13 |     return if (Rand().nextDouble() < ε)
14 |       s.actions.rand()
15 |     else
16 |       argmax_tie_random(s.actions) { q(s, it) }
17 |   }
18 |   
19 |   override fun get(s: State, a: Action<State>): Double {
20 |     val (m, c) = max_count(s.actions) { q(s, it) }
21 |     return if (q(s, a) == m) (1.0 / c - ε / c - ε / s.actions.size) else ε / s.actions.size
22 |   }
23 |   
24 |   override fun greedy(s: State) = argmax_tie_random(s.actions) { q(s, it) }
25 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/model/impl/mdp/IndexedAction.kt:
--------------------------------------------------------------------------------
 1 | @file:Suppress("OVERRIDE_BY_INLINE", "NOTHING_TO_INLINE", "UNCHECKED_CAST")
 2 | 
 3 | package lab.mars.rl.model.impl.mdp
 4 | 
 5 | import lab.mars.rl.model.Action
 6 | import lab.mars.rl.util.buf.Index
 7 | import lab.mars.rl.util.buf.IntBuf
 8 | import lab.mars.rl.util.collection.emptyNSet
 9 | import lab.mars.rl.util.exception.NoMoreElementsException
10 | import lab.mars.rl.util.math.Rand
11 | 
12 | class IndexedAction(val index: IntBuf): Index(), Action<IndexedState> {
13 |   inline override val size: Int
14 |     get() = index.size
15 |   
16 |   inline override operator fun get(idx: Int) = index[idx]
17 |   
18 |   var possibles: PossibleSet = emptyNSet()
19 |   
20 |   override var sample = outer@ {
21 |     if (possibles.isEmpty()) throw NoMoreElementsException()
22 |     val p = Rand().nextDouble()
23 |     var acc = 0.0
24 |     for (possible in possibles) {
25 |       acc += possible.probability
26 |       if (p <= acc)
27 |         return@outer possible
28 |     }
29 |     throw IllegalArgumentException("random=$p, but accumulation=$acc")
30 |   }
31 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/model/impl/mdp/IndexedMDP.kt:
--------------------------------------------------------------------------------
 1 | @file:Suppress("UNCHECKED_CAST")
 2 | 
 3 | package lab.mars.rl.model.impl.mdp
 4 | 
 5 | import lab.mars.rl.model.MDP
 6 | import lab.mars.rl.model.isNotTerminal
 7 | import lab.mars.rl.util.buf.Index
 8 | import lab.mars.rl.util.collection.IndexedCollection
 9 | 
10 | /**
11 |  *
12 |  * @property states state set
13 |  * @property γ decay factor
14 |  * @property state_function [state_function] generator
15 |  * @property state_action_function [state_action_function] generator
16 |  */
17 | class IndexedMDP(
18 |     override val γ: Double,
19 |     val states: StateSet,
20 |     private val state_function: ((Index) -> Any) -> IndexedCollection<Any>,
21 |     private val state_action_function: ((Index) -> Any) -> IndexedCollection<Any>): MDP {
22 |   override var started = { states.rand() }
23 |   /**
24 |    *
25 |    * create state function indexed by [IndexedState]
26 |    */
27 |   fun <T: Any> VFunc(element_maker: (Index) -> T) =
28 |       state_function(element_maker) as IndexedCollection<T>
29 |   
30 |   /**
31 |    *
32 |    * create state action function indexed by [IndexedState] pair [IndexedAction]
33 |    */
34 |   fun <T: Any> QFunc(element_maker: (Index) -> T) =
35 |       state_action_function(element_maker) as IndexedCollection<T>
36 |   
37 |   /**
38 |    * equiprobable random policy
39 |    */
40 |   fun equiprobablePolicy(): IndexedPolicy {
41 |     val policy = QFunc { 0.0 }
42 |     for (s in states.filter { it.isNotTerminal }) {
43 |       val prob = 1.0 / s.actions.size
44 |       for (a in s.actions)
45 |         policy[s, a] = prob
46 |     }
47 |     return IndexedPolicy(policy)
48 |   }
49 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/model/impl/mdp/IndexedPolicy.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.model.impl.mdp
 2 | 
 3 | import lab.mars.rl.model.Action
 4 | import lab.mars.rl.model.Policy
 5 | import lab.mars.rl.model.State
 6 | import lab.mars.rl.util.collection.IndexedCollection
 7 | import lab.mars.rl.util.collection.emptyNSet
 8 | import lab.mars.rl.util.math.argmax
 9 | 
10 | class IndexedPolicy(val p: IndexedCollection<Double>, val ε: Double = 0.1): Policy {
11 |   
12 |   override fun invoke(s: State): IndexedAction {
13 |     val eval = p(s as IndexedState)
14 |     return s.actions.rand { eval[it] }
15 |   }
16 |   
17 |   override fun get(s: State, a: Action<State>)
18 |       = p[s as IndexedState, a as IndexedAction]
19 |   
20 |   operator fun set(s: IndexedState, a: IndexedAction, v: Double) {
21 |     p[s, a] = v
22 |   }
23 |   
24 |   operator fun set(s: IndexedState, newaction: IndexedAction) {
25 |     for (a in s.actions)
26 |       p[s, a] = 0.0
27 |     p[s, newaction] = 1.0
28 |   }
29 |   
30 |   override fun greedy(s: State): IndexedAction {
31 |     s as IndexedState
32 |     return argmax(s.actions) { get(s, it) }
33 |   }
34 | }
35 | 
36 | val null_policy = IndexedPolicy(emptyNSet())


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/model/impl/mdp/IndexedPossible.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.model.impl.mdp
2 | 
3 | import lab.mars.rl.model.Possible
4 | 
5 | class IndexedPossible(next: IndexedState, reward: Double, var probability: Double): Possible<IndexedState>(next, reward) {
6 |   override operator fun component1() = next
7 |   override operator fun component2() = reward
8 |   operator fun component3() = probability
9 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/model/impl/mdp/IndexedState.kt:
--------------------------------------------------------------------------------
 1 | @file:Suppress("OVERRIDE_BY_INLINE", "NOTHING_TO_INLINE", "UNCHECKED_CAST")
 2 | 
 3 | package lab.mars.rl.model.impl.mdp
 4 | 
 5 | import lab.mars.rl.model.State
 6 | import lab.mars.rl.util.buf.Index
 7 | import lab.mars.rl.util.buf.IntBuf
 8 | import lab.mars.rl.util.collection.IndexedCollection
 9 | import lab.mars.rl.util.collection.emptyNSet
10 | 
11 | class IndexedState(val index: IntBuf): Index(), State {
12 |   override inline val size: Int
13 |     get() = index.size
14 |   
15 |   override inline operator fun get(idx: Int) = index[idx]
16 |   
17 |   override var actions: IndexedCollection<IndexedAction> = emptyNSet()
18 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/model/impl/mdp/NSetMDP.kt:
--------------------------------------------------------------------------------
 1 | @file:Suppress("NOTHING_TO_INLINE")
 2 | 
 3 | package lab.mars.rl.model.impl.mdp
 4 | 
 5 | import lab.mars.rl.util.buf.IntBuf
 6 | import lab.mars.rl.util.dimension.*
 7 | 
 8 | /**
 9 |  * <p>
10 |  * Created on 2017-09-14.
11 |  * </p>
12 |  *
13 |  * @author wumo
14 |  */
15 | 
16 | /**
17 |  * @param gamma `γ` decay
18 |  * @param state_dim state dimension
19 |  * @param action_dim action dimension
20 |  * @return mdp with same state dimension pair same action dimension
21 |  */
22 | inline fun NSetMDP(gamma: Double, state_dim: Any, action_dim: Any): IndexedMDP {
23 |   val a_dim = action_dim.toDim()
24 |   return NSetMDP(gamma, state_dim.toDim(), { a_dim })
25 | }
26 | 
27 | /**
28 |  * @param gamma  `γ` decay factor
29 |  * @param state_dim state dimension
30 |  * @param action_dim different action dimension according to specific state
31 |  * @return mdp with same state dimension but different action dimension
32 |  */
33 | fun NSetMDP(gamma: Double, state_dim: Any, action_dim: (IntBuf) -> Any): IndexedMDP {
34 |   val s_dim = state_dim.toDim() as GeneralDimension
35 |   val s_a_dim = s_dim.copy() x action_dim
36 |   return IndexedMDP(
37 |       γ = gamma,
38 |       states = nsetFrom(s_dim) {
39 |         IndexedState(it.copy()).apply { actions = nsetFrom(action_dim(it).toDim()) { IndexedAction(it.copy()) } }
40 |       },
41 |       state_function = { element_maker -> nsetFrom(s_dim, element_maker) },
42 |       state_action_function = { element_maker -> nsetFrom(s_a_dim, element_maker) })
43 | }
44 | 
45 | /**
46 |  *  Note that: dimension shouldn't be 0. It it needs to be 0, then you can set `emptyNSet()` afterStartup the construction.
47 |  * @param gamma `γ` decay factor
48 |  * @param state_dim state dimension
49 |  * @param action_dim action dimension
50 |  * @return mdp with same state dimension pair same action dimension
51 |  */
52 | inline fun CNSetMDP(gamma: Double, state_dim: Any, action_dim: Any): IndexedMDP {
53 |   val a_dim = action_dim.toDim() as GeneralDimension
54 |   return CNSetMDP(gamma, state_dim.toDim(), { a_dim })
55 | }
56 | 
57 | /**
58 |  * Note that: dimension shouldn't be 0. It it needs to be 0, then you can set `emptyNSet()` afterStartup the construction.
59 |  * @param gamma  `γ`  decay factor
60 |  * @param state_dim state dimension
61 |  * @param action_dim different action dimension according to specific state
62 |  * @return mdp with same state dimension but different action dimension
63 |  */
64 | fun CNSetMDP(gamma: Double, state_dim: Any, action_dim: (IntBuf) -> Any): IndexedMDP {
65 |   val s_dim = state_dim.toDim() as GeneralDimension
66 |   val states = cnsetFrom(s_dim) {
67 |     IndexedState(it.copy()).apply { actions = cnsetFrom(action_dim(it).toDim()) { IndexedAction(it.copy()) } }
68 |   }
69 |   val s_a_dim = s_dim.copy() x action_dim
70 |   return IndexedMDP(
71 |       γ = gamma,
72 |       states = states,
73 |       state_function = { element_maker -> states.copycat(element_maker) },
74 |       state_action_function = { element_maker -> cnsetFrom(s_a_dim, element_maker) })
75 | }
76 | 
77 | inline fun mdpOf(gamma: Double, state_dim: Any, action_dim: Any)
78 |     = CNSetMDP(gamma, state_dim, action_dim)
79 | 
80 | inline fun mdpOf(gamma: Double, state_dim: Any, noinline action_dim: (IntBuf) -> Any)
81 |     = CNSetMDP(gamma, state_dim, action_dim)


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/model/impl/mdp/SoftmaxpPolicy.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.model.impl.mdp
 2 | 
 3 | import lab.mars.rl.model.Action
 4 | import lab.mars.rl.model.ApproximateFunction
 5 | import lab.mars.rl.model.Policy
 6 | import lab.mars.rl.model.State
 7 | import lab.mars.rl.util.math.rand
 8 | import kotlin.math.exp
 9 | 
10 | class SoftmaxpPolicy<E>(val π: ApproximateFunction<E>) : Policy {
11 |   override fun invoke(s: State): Action<State> {
12 |     return rand(s.actions) { exp(π(s, it)) }
13 |   }
14 |   
15 |   override fun get(s: State, a: Action<State>) = exp(π(s, a))
16 |   
17 |   override fun greedy(s: State) = rand(s.actions) { exp(π(s, it)) }
18 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/model/impl/mdp/package.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.model.impl.mdp
 2 | 
 3 | import lab.mars.rl.util.collection.IndexedCollection
 4 | import lab.mars.rl.util.tuples.tuple3
 5 | 
 6 | typealias StateSet = IndexedCollection<IndexedState>
 7 | typealias PossibleSet = IndexedCollection<IndexedPossible>
 8 | typealias StateValueFunction = IndexedCollection<Double>
 9 | typealias ActionValueFunction = IndexedCollection<Double>
10 | typealias OptimalSolution = tuple3<IndexedPolicy, StateValueFunction, ActionValueFunction>
11 | 


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/problem/1000-state RandomWalk.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.problem
 2 | 
 3 | import lab.mars.rl.model.impl.mdp.CNSetMDP
 4 | import lab.mars.rl.model.impl.mdp.IndexedMDP
 5 | import lab.mars.rl.model.impl.mdp.IndexedPolicy
 6 | import lab.mars.rl.model.impl.mdp.IndexedPossible
 7 | import lab.mars.rl.util.collection.emptyNSet
 8 | import lab.mars.rl.util.math.Rand
 9 | 
10 | /**
11 |  * <p>
12 |  * Created on 2017-10-10.
13 |  * </p>
14 |  *
15 |  * @author wumo
16 |  */
17 | object `1000-state RandomWalk` {
18 |   val num_states = 1000
19 |   val step_range = 100
20 |   fun make(): Pair<IndexedMDP, IndexedPolicy> {
21 |     val mdp = CNSetMDP(1.0, num_states + 2, 1)
22 |     mdp.apply {
23 |       val last = num_states + 1
24 |       states[0].actions = emptyNSet()
25 |       states[last].actions = emptyNSet()
26 |       started = { states(num_states / 2).rand() }
27 |       for (a in 1 until last)
28 |         states[a].actions[0].sample = {
29 |           val move = Rand().nextInt(1, step_range + 1) *
30 |                      (if (Rand().nextBoolean()) 1 else -1)
31 |           val next = (a + move).coerceIn(0, last)
32 |           IndexedPossible(states[next],
33 |                           when (next) {
34 |                             0 -> -1.0
35 |                             last -> 1.0
36 |                             else -> 0.0
37 |                           }, 1.0)
38 |         }
39 |     }
40 |     val policy = IndexedPolicy(mdp.QFunc { 1.0 })
41 |     return Pair(mdp, policy)
42 |   }
43 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/problem/19-state RandomWalk.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.problem
 2 | 
 3 | import lab.mars.rl.model.impl.mdp.CNSetMDP
 4 | import lab.mars.rl.model.impl.mdp.IndexedPolicy
 5 | import lab.mars.rl.model.impl.mdp.IndexedPossible
 6 | import lab.mars.rl.util.collection.cnsetOf
 7 | import lab.mars.rl.util.collection.emptyNSet
 8 | 
 9 | /**
10 |  * <p>
11 |  * Created on 2017-10-10.
12 |  * </p>
13 |  *
14 |  * @author wumo
15 |  */
16 | object `19-state RandomWalk` {
17 |   val num_states = 19
18 |   fun make(): IndexedProblem {
19 |     val mdp = CNSetMDP(1.0, num_states + 2, 1)
20 |     mdp.apply {
21 |       val last = num_states + 1
22 |       states[0].actions = emptyNSet()
23 |       states[last].actions = emptyNSet()
24 |       started = { states((num_states + 1) / 2).rand() }
25 |       for (a in 1 until last) {
26 |         states[a].actions[0].apply {
27 |           val left = a - 1
28 |           val right = a + 1
29 |           possibles = cnsetOf(IndexedPossible(states[left], if (left == 0) -1.0 else 0.0, 0.5),
30 |                               IndexedPossible(states[right], if (right == last) 1.0 else 0.0, 0.5))
31 |         }
32 |       }
33 |     }
34 |     val policy = mdp.QFunc { 1.0 }
35 |     return Pair(mdp, IndexedPolicy(policy))
36 |   }
37 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/problem/AccessControl.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.problem
 2 | 
 3 | import lab.mars.rl.model.impl.mdp.CNSetMDP
 4 | import lab.mars.rl.model.impl.mdp.IndexedMDP
 5 | import lab.mars.rl.model.impl.mdp.IndexedPossible
 6 | import lab.mars.rl.util.dimension.cnsetFrom
 7 | import lab.mars.rl.util.dimension.x
 8 | import lab.mars.rl.util.math.Rand
 9 | import lab.mars.rl.util.math.binomial
10 | import lab.mars.rl.util.math.pow
11 | 
12 | object AccessControl {
13 |   val k = 10
14 |   val p = 0.06
15 |   val priorities = 0..3
16 |   val rewards = pow(2.0, priorities)
17 |   val reject = 0
18 |   val accept = 1
19 |   fun make(): IndexedMDP {
20 |     val mdp = CNSetMDP(gamma = 0.9, state_dim = (k + 1) x 4, action_dim = { (fs) ->
21 |       if (fs == 0) 1 else 2
22 |     })
23 |     
24 |     return mdp.apply {
25 |       started = { states[k, Rand().nextInt(4)] }
26 |       for (s in states) {
27 |         var (freeServers, priority) = s
28 |         for (a in s.actions) {
29 |           var reward = 0.0
30 |           if (freeServers > 0 && a[0] == accept) {
31 |             freeServers--
32 |             reward = rewards[priority]
33 |           }
34 |           val busyServers = k - freeServers
35 |           a.possibles = cnsetFrom((busyServers + 1) x 4) { (released, pr) ->
36 |             IndexedPossible(states[freeServers + released, pr],
37 |                             reward,
38 |                             binomial(busyServers, released, p) * (1 / 4.0))
39 |           }
40 |         }
41 |       }
42 |     }
43 |   }
44 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/problem/CliffWalking.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.problem
 2 | 
 3 | import lab.mars.rl.model.impl.mdp.CNSetMDP
 4 | import lab.mars.rl.model.impl.mdp.IndexedMDP
 5 | import lab.mars.rl.model.impl.mdp.IndexedPossible
 6 | import lab.mars.rl.util.collection.cnsetOf
 7 | import lab.mars.rl.util.collection.emptyNSet
 8 | import lab.mars.rl.util.collection.fork
 9 | import lab.mars.rl.util.collection.map
10 | import lab.mars.rl.util.dimension.x
11 | 
12 | object CliffWalking {
13 |   val world_height = 4
14 |   val world_width = 12
15 |   val move = arrayOf(
16 |       intArrayOf(0, 1), //up
17 |       intArrayOf(0, -1), //down
18 |       intArrayOf(-1, 0), //left
19 |       intArrayOf(1, 0)//right
20 |   )
21 |   val desc_move = arrayOf("↑", "↓", "←", "→")
22 |   fun make(): IndexedMDP {
23 |     val mdp = CNSetMDP(gamma = 1.0,
24 |                        state_dim = world_width x world_height,
25 |                        action_dim = 4)
26 |     return mdp.apply {
27 |       val goal = states[11, 0]
28 |       goal.actions = emptyNSet()
29 |       started = { states(0, 0).rand() }
30 |       val startedState = states[0, 0]
31 |       
32 |       //cliff
33 |       for (x in 1 until world_width - 1)
34 |         states[x, 0].actions = emptyNSet()
35 |       
36 |       for ((s, a) in states.fork { it.actions }) {
37 |         val m = move[a[0]]
38 |         val _x = (s[0] + m[0]).coerceIn(0, world_width - 1)
39 |         val _y = (s[1] + +m[1]).coerceIn(0, world_height - 1)
40 |         val next = states[_x, _y]
41 |         a.possibles = cnsetOf(IndexedPossible(next, if (next === goal) 0.0 else -1.0, 1.0))
42 |       }
43 |       startedState.actions[3].possibles = cnsetOf(IndexedPossible(startedState, -100.0, 1.0))
44 |       for ((s, a) in (1 until world_width - 1).map { states[it, 1] }.fork { it.actions }) {
45 |         val m = move[a[0]]
46 |         var _x = s[0] + m[0]
47 |         var _y = s[1] + +m[1]
48 |         if (_y == 0) {
49 |           _x = 0
50 |           _y = 0
51 |         }
52 |         val next = states[_x, _y]
53 |         a.possibles = cnsetOf(IndexedPossible(next, if (next === startedState) -100.0 else -1.0, 1.0))
54 |       }
55 |     }
56 |   }
57 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/problem/DynaMaze.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.problem
 2 | 
 3 | import lab.mars.rl.model.impl.mdp.CNSetMDP
 4 | import lab.mars.rl.model.impl.mdp.IndexedMDP
 5 | import lab.mars.rl.model.impl.mdp.IndexedPossible
 6 | import lab.mars.rl.util.buf.DefaultIntBuf
 7 | import lab.mars.rl.util.buf.IntBuf
 8 | import lab.mars.rl.util.collection.cnsetOf
 9 | import lab.mars.rl.util.collection.emptyNSet
10 | import lab.mars.rl.util.dimension.x
11 | 
12 | object DynaMaze {
13 |   private val move = arrayOf(
14 |       intArrayOf(-1, 0), //left
15 |       intArrayOf(1, 0), //right
16 |       intArrayOf(0, 1), //up
17 |       intArrayOf(0, -1)//down
18 |   )
19 |   val desc_move = arrayOf("←", "→", " ↑", " ↓")
20 |   val wall = hashSetOf<IntBuf>()
21 |   val obstacle = hashSetOf<IntBuf>()
22 |   
23 |   init {
24 |     for (x in -1..9) {
25 |       wall += DefaultIntBuf.of(x, -1)
26 |       wall += DefaultIntBuf.of(x, 6)
27 |     }
28 |     for (y in -1..6) {
29 |       wall += DefaultIntBuf.of(-1, y)
30 |       wall += DefaultIntBuf.of(9, y)
31 |     }
32 |     obstacle += DefaultIntBuf.of(2, 2)
33 |     obstacle += DefaultIntBuf.of(2, 3)
34 |     obstacle += DefaultIntBuf.of(2, 4)
35 |     
36 |     obstacle += DefaultIntBuf.of(5, 1)
37 |     
38 |     obstacle += DefaultIntBuf.of(7, 3)
39 |     obstacle += DefaultIntBuf.of(7, 4)
40 |     obstacle += DefaultIntBuf.of(7, 5)
41 |     
42 |     wall += obstacle
43 |   }
44 |   
45 |   fun make(): IndexedMDP {
46 |     val mdp = CNSetMDP(gamma = 0.95,
47 |                        state_dim = 9 x 6,
48 |                        action_dim = 4)
49 |     return mdp.apply {
50 |       for (s in states)
51 |         for (action in s.actions) {
52 |           val tmp = DefaultIntBuf.of(0, 0)
53 |           tmp[0] = s[0] + move[action[0]][0]
54 |           tmp[1] = s[1] + move[action[0]][1]
55 |           if (tmp in wall) {
56 |             tmp[0] = s[0]
57 |             tmp[1] = s[1]
58 |           }
59 |           val reward = if (tmp[0] == 8 && tmp[1] == 5) 1.0 else 0.0
60 |           action.possibles = cnsetOf(IndexedPossible(states[tmp], reward, 1.0))
61 |           
62 |         }
63 |       states[8, 5].actions = emptyNSet()
64 |       for (o in obstacle)
65 |         states[o].actions = emptyNSet()
66 |       started = { states(0, 3).rand() }
67 |     }
68 |     
69 |   }
70 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/problem/Gambler.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.problem
 2 | 
 3 | import lab.mars.rl.model.impl.mdp.CNSetMDP
 4 | import lab.mars.rl.model.impl.mdp.IndexedMDP
 5 | import lab.mars.rl.model.impl.mdp.IndexedPossible
 6 | import lab.mars.rl.util.collection.cnsetOf
 7 | import org.apache.commons.math3.util.FastMath.min
 8 | 
 9 | /**
10 |  * <p>
11 |  * Created on 2017-09-13.
12 |  * </p>
13 |  *
14 |  * @author wumo
15 |  */
16 | object Gambler {
17 |   val goal_coin = 100
18 |   
19 |   fun make(p_head: Double): IndexedMDP {
20 |     val mdp = CNSetMDP(gamma = 1.0,
21 |                        state_dim = goal_coin + 1,
22 |                        action_dim = { min(it[0], goal_coin - it[0]) + 1 })
23 |     mdp.apply {
24 |       for (s in states) {
25 |         val capital = s[0]
26 |         val max_stake = min(capital, goal_coin - capital)
27 |         for (action in s.actions) {
28 |           val stake = action[0]
29 |           action.possibles = if (max_stake == 0)
30 |             cnsetOf(IndexedPossible(states[capital], 0.0, 1.0))
31 |           else
32 |             cnsetOf(IndexedPossible(states[capital - stake], 0.0, 1 - p_head), //lose
33 |                     IndexedPossible(states[capital + stake], if (capital + stake == goal_coin) 1.0 else 0.0, p_head))//win
34 |         }
35 |       }
36 |     }
37 |     return mdp
38 |   }
39 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/problem/GridWorld.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.problem
 2 | 
 3 | import lab.mars.rl.model.impl.mdp.CNSetMDP
 4 | import lab.mars.rl.model.impl.mdp.IndexedMDP
 5 | import lab.mars.rl.model.impl.mdp.IndexedPossible
 6 | import lab.mars.rl.model.isNotTerminal
 7 | import lab.mars.rl.util.collection.cnsetOf
 8 | import lab.mars.rl.util.collection.emptyNSet
 9 | import lab.mars.rl.util.collection.filter
10 | import lab.mars.rl.util.collection.fork
11 | import lab.mars.rl.util.dimension.x
12 | 
13 | /**
14 |  * <p>
15 |  * Created on 2017-09-05.
16 |  * </p>
17 |  *
18 |  * @author wumo
19 |  */
20 | object GridWorld {
21 |   private const val n = 4
22 |   private const val m = 4
23 |   private val move = arrayOf(
24 |       intArrayOf(-1, 0), //up
25 |       intArrayOf(1, 0), //down
26 |       intArrayOf(0, 1), //right
27 |       intArrayOf(0, -1)//left
28 |   )
29 |   val desc_move = arrayOf(" ↑", " ↓", "→", "←")
30 |   fun make(): IndexedMDP {
31 |     val mdp = CNSetMDP(gamma = 0.9,
32 |                        state_dim = n x n,
33 |                        action_dim = m)
34 |     mdp.apply {
35 |       for ((s, action) in states.filter { it.isNotTerminal }.fork { it.actions }) {
36 |         val (s0, s1) = s
37 |         val (a) = action
38 |         var x = s0 + move[a][0]
39 |         var y = s1 + move[a][1]
40 |         if (x < 0 || x >= n || y < 0 || y >= n) {
41 |           x = s0
42 |           y = s1
43 |         }
44 |         action.possibles = cnsetOf(IndexedPossible(states[x, y], -1.0, 1.0))
45 |       }
46 |       states[0, 0].actions = emptyNSet()
47 |       states[n - 1, n - 1].actions = emptyNSet()
48 |     }
49 |     
50 |     return mdp
51 |   }
52 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/problem/MaximizationBias.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.problem
 2 | 
 3 | import lab.mars.rl.model.impl.mdp.CNSetMDP
 4 | import lab.mars.rl.model.impl.mdp.IndexedMDP
 5 | import lab.mars.rl.model.impl.mdp.IndexedPossible
 6 | import lab.mars.rl.util.collection.emptyNSet
 7 | import lab.mars.rl.util.math.Rand
 8 | 
 9 | object MaximizationBias {
10 |   val mean = -0.1
11 |   val actionsOfB = 10
12 |   fun make(): IndexedMDP {
13 |     val mdp = CNSetMDP(gamma = 1.0,
14 |                        state_dim = 4,
15 |                        action_dim = {
16 |                          when (it[0]) {
17 |                            1 -> actionsOfB
18 |                            2 -> 2
19 |                            else -> 1
20 |                          }
21 |                        })
22 |     mdp.apply {
23 |       states[0].actions = emptyNSet()
24 |       states[3].actions = emptyNSet()
25 |       started = { states(2).rand() }
26 |       for (a in states[2].actions)
27 |         a.sample = {
28 |           val next = if (a[0] == 0) 1 else 3
29 |           IndexedPossible(states[next], 0.0, 1.0)
30 |         }
31 |       for (a in states[1].actions)
32 |         a.sample = {
33 |           IndexedPossible(states[0], Rand().nextGaussian() + mean, 1.0)
34 |         }
35 |     }
36 |     return mdp
37 |   }
38 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/problem/MountainCar.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.problem
 2 | 
 3 | import lab.mars.rl.model.Action
 4 | import lab.mars.rl.model.Possible
 5 | import lab.mars.rl.model.RandomIterable
 6 | import lab.mars.rl.model.State
 7 | import lab.mars.rl.model.impl.mdp.DefaultAction
 8 | import lab.mars.rl.model.impl.mdp.DefaultMDP
 9 | import lab.mars.rl.util.collection.emptyNSet
10 | import lab.mars.rl.util.dimension.cnsetFrom
11 | import lab.mars.rl.util.math.Rand
12 | import org.apache.commons.math3.util.FastMath.cos
13 | 
14 | object MountainCar {
15 |   class CarState(val position: Double, val velocity: Double) : State {
16 |     override val actions: RandomIterable<Action<CarState>> =
17 |         if (position == POSITION_MAX) emptyNSet()
18 |         else cnsetFrom(3) {
19 |           val a = it[0] - 1
20 |           DefaultAction(a) {
21 |             var newVelocity = (velocity + 0.001 * a - 0.0025 * cos(3 * position))
22 |                 .coerceIn(VELOCITY_MIN, VELOCITY_MAX)
23 |             val newPosition = (position + newVelocity).coerceIn(POSITION_MIN, POSITION_MAX)
24 |             if (newPosition == POSITION_MIN) newVelocity = 0.0
25 |             Possible(CarState(newPosition, newVelocity), -1.0)
26 |           }
27 |         }
28 |   }
29 |   
30 |   const val POSITION_MIN = -1.2
31 |   const val POSITION_MAX = 0.5
32 |   const val VELOCITY_MIN = -0.07
33 |   const val VELOCITY_MAX = 0.07
34 |   fun make() = DefaultMDP(1.0) {
35 |     CarState(Rand().nextDouble(-0.6, -0.4), 0.0)
36 |   }
37 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/problem/RandomWalk.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.problem
 2 | 
 3 | import lab.mars.rl.model.impl.mdp.CNSetMDP
 4 | import lab.mars.rl.model.impl.mdp.IndexedPolicy
 5 | import lab.mars.rl.model.impl.mdp.IndexedPossible
 6 | import lab.mars.rl.util.collection.cnsetOf
 7 | import lab.mars.rl.util.collection.emptyNSet
 8 | 
 9 | /**
10 |  * <p>
11 |  * Created on 2017-10-10.
12 |  * </p>
13 |  *
14 |  * @author wumo
15 |  */
16 | object RandomWalk {
17 |   fun make(): IndexedProblem {
18 |     val mdp = CNSetMDP(1.0, 7, 1)
19 |     mdp.apply {
20 |       states[0].actions = emptyNSet()
21 |       states[6].actions = emptyNSet()
22 |       started = { states(3).rand() }
23 |       for (a in 1..5) {
24 |         states[a].actions[0].apply {
25 |           possibles = cnsetOf(IndexedPossible(states[a - 1], 0.0, 0.5),
26 |                               IndexedPossible(states[a + 1], if (a == 5) 1.0 else 0.0, 0.5))
27 |         }
28 |       }
29 |     }
30 |     val policy = mdp.QFunc { 1.0 }
31 |     return Pair(mdp, IndexedPolicy(policy))
32 |   }
33 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/problem/SquareWave.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.problem
 2 | 
 3 | import lab.mars.rl.model.Action
 4 | import lab.mars.rl.model.RandomIterable
 5 | import lab.mars.rl.model.State
 6 | import lab.mars.rl.util.collection.emptyNSet
 7 | import lab.mars.rl.util.math.Rand
 8 | import lab.mars.rl.util.tuples.tuple2
 9 | 
10 | class WaveState(val x: Double): State {
11 |   override var actions: RandomIterable<Action<State>> = emptyNSet()
12 | }
13 | 
14 | object SquareWave {
15 |   val domain = 0.0..2.0
16 |   val maxResolution = 100
17 |   fun invoke(x: Double) = if (x in 0.5..1.5) 1.0 else 0.0
18 |   fun sample(): tuple2<State, Double> {
19 |     val x = Rand().nextDouble(domain.start, domain.endInclusive)
20 |     val y = invoke(x)
21 |     return tuple2(WaveState(x), y)
22 |   }
23 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/problem/WindyGridworld.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.problem
 2 | 
 3 | import lab.mars.rl.model.impl.mdp.CNSetMDP
 4 | import lab.mars.rl.model.impl.mdp.IndexedMDP
 5 | import lab.mars.rl.model.impl.mdp.IndexedPossible
 6 | import lab.mars.rl.util.collection.cnsetOf
 7 | import lab.mars.rl.util.collection.emptyNSet
 8 | import lab.mars.rl.util.collection.fork
 9 | import lab.mars.rl.util.dimension.x
10 | 
11 | object WindyGridworld {
12 |   val world_height = 7
13 |   val world_width = 10
14 |   val wind = intArrayOf(0, 0, 0, 1, 1, 1, 2, 2, 1, 0)//wind strength for each column
15 |   val move = arrayOf(
16 |       intArrayOf(0, 1), //up
17 |       intArrayOf(0, -1), //down
18 |       intArrayOf(-1, 0), //left
19 |       intArrayOf(1, 0)//right
20 |   )
21 |   val kingMove = arrayOf(
22 |       intArrayOf(0, 1), //up
23 |       intArrayOf(0, -1), //down
24 |       intArrayOf(-1, 0), //left
25 |       intArrayOf(1, 0),//right
26 |       intArrayOf(-1, 1), //up
27 |       intArrayOf(1, 1), //down
28 |       intArrayOf(1, -1), //left
29 |       intArrayOf(-1, -1)//right
30 |   )
31 |   val desc_move = arrayOf(" ↑", " ↓", "←", "→")
32 |   val desc_king_move = arrayOf(" ↑", " ↓", "←", "→", "↖", "↗", "↘", "↙")
33 |   fun make(KingMove: Boolean = false): IndexedMDP {
34 |     val mdp = CNSetMDP(gamma = 1.0,
35 |                        state_dim = world_width x world_height,
36 |                        action_dim = if (KingMove) 8 else 4)
37 |     return mdp.apply {
38 |       val goal = states[7, 3]
39 |       goal.actions = emptyNSet()
40 |       started = { states(0, 3).rand() }
41 |       for ((s, a) in states.fork { it.actions }) {
42 |         val m = (if (KingMove) kingMove else move)[a[0]]
43 |         val x = (s[0] + m[0]).coerceIn(0, world_width - 1)
44 |         val y = (s[1] + wind[s[0]] + m[1]).coerceIn(0, world_height - 1)
45 |         val next = states[x, y]
46 |         a.possibles = cnsetOf(IndexedPossible(next, if (next === goal) 0.0 else -1.0, 1.0))
47 |       }
48 |     }
49 |   }
50 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/problem/package.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.problem
2 | 
3 | import lab.mars.rl.model.impl.mdp.IndexedMDP
4 | import lab.mars.rl.model.impl.mdp.IndexedPolicy
5 | 
6 | typealias IndexedProblem = Pair<IndexedMDP, IndexedPolicy>


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/util/buf/Buf.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.util.buf
 2 | 
 3 | import lab.mars.rl.util.math.Rand
 4 | 
 5 | /**
 6 |  * <p>
 7 |  * Created on 2017-09-28.
 8 |  * </p>
 9 |  *
10 |  * @author wumo
11 |  */
12 | interface Buf<T: Any>: Iterable<T> {
13 |   /** [end]>=[start] */
14 |   operator fun get(start: Int, end: Int): Buf<T>
15 |   
16 |   fun toTypedArray(): Array<T>
17 |   fun copy(): Buf<T>
18 |   
19 |   val size: Int
20 |   val isEmpty: Boolean
21 |     get() = size == 0
22 |   val writePtr: Int
23 |     get() = size
24 |   val lastIndex: Int
25 |     get() = size - 1
26 |   val last: T
27 |     get() = get(lastIndex)
28 |   
29 |   /**
30 |    * get value at the specific [idx]
31 |    */
32 |   operator fun get(idx: Int): T
33 |   
34 |   fun forEach(start: Int = 0, end: Int = lastIndex, block: (Int, T) -> Unit) {
35 |     for (i in start..end)
36 |       block(i, get(i))
37 |   }
38 |   
39 |   fun equals(other: Buf<T>): Boolean {
40 |     if (this === other) return true
41 |     if (size != other.size) return false
42 |     for (i in 0..lastIndex)
43 |       if (get(i) != other[i]) return false
44 |     return true
45 |   }
46 |   
47 |   override fun iterator() = object: Iterator<T> {
48 |     var a = 0
49 |     override fun hasNext() = a < size
50 |     
51 |     override fun next() = get(a++)
52 |   }
53 |   
54 |   fun rand() = get(Rand().nextInt(size))
55 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/util/buf/IntBuf.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.util.buf
 2 | 
 3 | /**
 4 |  * <p>
 5 |  * Created on 2017-09-28.
 6 |  * </p>
 7 |  *
 8 |  * @author wumo
 9 |  */
10 | abstract class IntBuf: Index() {
11 |   /** [end]>=[start] */
12 |   abstract operator fun get(start: Int, end: Int): IntBuf
13 |   
14 |   abstract fun toIntArray(): IntArray
15 |   abstract fun copy(): IntBuf
16 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/util/buf/MutableBuf.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.util.buf
 2 | 
 3 | /**
 4 |  * <p>
 5 |  * Created on 2017-09-28.
 6 |  * </p>
 7 |  *
 8 |  * @author wumo
 9 |  */
10 | interface MutableBuf<T: Any>: Buf<T> {
11 |   val cap: Int
12 |   
13 |   override fun get(start: Int, end: Int): MutableBuf<T>
14 |   
15 |   operator fun set(idx: Int, s: T)
16 |   
17 |   /** [end]>=[start] */
18 |   operator fun set(start: Int, end: Int, s: T)
19 |   
20 |   fun unfold(num: Int)
21 |   
22 |   fun ensure(minCap: Int)
23 |   
24 |   fun prepend(s: T)
25 |   fun prepend(num: Int, s: T)
26 |   fun prepend(another: Buf<T>)
27 |   
28 |   fun append(s: T)
29 |   fun append(num: Int, s: T)
30 |   fun append(another: Buf<T>)
31 |   
32 |   fun remove(range: IntRange) {
33 |     remove(range.start, range.endInclusive)
34 |   }
35 |   
36 |   /** [end]>=[start] */
37 |   fun remove(start: Int, end: Int)
38 |   
39 |   fun remove(index: Int) = remove(index, index)
40 |   fun removeFirst(num: Int = 1) {
41 |     if (num == 0) return
42 |     remove(0, num - 1)
43 |   }
44 |   
45 |   fun removeLast(num: Int) {
46 |     if (num == 0) return
47 |     remove(lastIndex - num + 1, lastIndex)
48 |   }
49 |   
50 |   fun clear() {
51 |     removeLast(size)
52 |   }
53 |   
54 |   fun reuseBacked(): Buf<T>
55 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/util/buf/MutableIntBuf.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.util.buf
 2 | 
 3 | /**
 4 |  * <p>
 5 |  * Created on 2017-09-28.
 6 |  * </p>
 7 |  *
 8 |  * @author wumo
 9 |  */
10 | abstract class MutableIntBuf: IntBuf() {
11 |   abstract val cap: Int
12 |   abstract operator fun set(idx: Int, s: Int)
13 |   
14 |   /** [end]>=[start] */
15 |   abstract operator fun set(start: Int, end: Int, s: Int)
16 |   
17 |   abstract fun ensure(minCap: Int)
18 |   
19 |   abstract fun prepend(s: Int)
20 |   abstract fun prepend(num: Int, s: Int)
21 |   abstract fun prepend(another: Index)
22 |   
23 |   abstract fun append(s: Int)
24 |   abstract fun append(num: Int, s: Int)
25 |   abstract fun append(another: Index)
26 |   
27 |   fun remove(range: IntRange) {
28 |     remove(range.start, range.endInclusive)
29 |   }
30 |   
31 |   /** [end]>=[start] */
32 |   abstract fun remove(start: Int, end: Int)
33 |   
34 |   fun remove(index: Int) = remove(index, index)
35 |   fun removeFirst(num: Int) {
36 |     if (num == 0) return
37 |     remove(0, num - 1)
38 |   }
39 |   
40 |   fun removeLast(num: Int) {
41 |     if (num == 0) return
42 |     remove(lastIndex - num + 1, lastIndex)
43 |   }
44 |   
45 |   fun clear() {
46 |     removeLast(size)
47 |   }
48 |   
49 |   abstract fun reuseBacked(): IntBuf
50 |   abstract fun append(data: IntArray)
51 | }
52 | 


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/util/collection/Gettable.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.util.collection
2 | 
3 | interface Gettable<in K: Any, out V: Any> {
4 |   operator fun get(k: K): V
5 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/util/collection/HashMapRAC.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.util.collection
 2 | 
 3 | import lab.mars.rl.util.buf.DefaultBuf
 4 | import lab.mars.rl.util.buf.Index
 5 | import lab.mars.rl.util.math.Rand
 6 | import lab.mars.rl.util.tuples.tuple2
 7 | 
 8 | class HashMapRAC<E: Any>(): IndexedCollection<E> {
 9 |   private val raw = hashMapOf<Index, E>()
10 |   private val contigus = DefaultBuf.new<E>()
11 |   
12 |   override fun <T: Any> copycat(element_maker: (Index) -> T): IndexedCollection<T> {
13 |     TODO()
14 |   }
15 |   
16 |   override fun indices() = raw.keys.iterator()
17 |   
18 |   override fun withIndices(): Iterator<tuple2<out Index, E>> {
19 |     val iter = raw.entries.iterator()
20 |     return object: Iterator<tuple2<out Index, E>> {
21 |       override fun hasNext() = iter.hasNext()
22 |       
23 |       override fun next(): tuple2<out Index, E> {
24 |         val entry = iter.next()
25 |         return tuple2(entry.key, entry.value)
26 |       }
27 |     }
28 |   }
29 |   
30 |   override fun rand() = contigus[Rand().nextInt(contigus.size)]
31 |   
32 |   override fun get(dim: Index) = raw[dim]!!
33 |   
34 |   override fun invoke(subset_dim: Index): IndexedCollection<E> {
35 |     TODO()
36 |   }
37 |   
38 |   override fun set(dim: Index, s: E) {
39 |     raw.put(dim, s) ?: contigus.append(s)
40 |   }
41 |   
42 |   override fun iterator() = raw.values.iterator()
43 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/util/collection/extensions.kt:
--------------------------------------------------------------------------------
 1 | @file:Suppress("UNCHECKED_CAST", "NOTHING_TO_INLINE")
 2 | 
 3 | package lab.mars.rl.util.collection
 4 | 
 5 | inline fun <E, F> Iterable<E>.fork(crossinline subset: (E) -> Iterable<F>)
 6 |     = asSequence().flatMap { s -> subset(s).asSequence().map { s to it } }
 7 | 
 8 | inline fun <E, F> Sequence<E>.fork(crossinline subset: (E) -> Iterable<F>)
 9 |     = flatMap { s -> subset(s).asSequence().map { s to it } }
10 | 
11 | inline fun <E> Iterable<E>.filter(crossinline predicate: (E) -> Boolean)
12 |     = asSequence().filter { predicate(it) }
13 | 
14 | inline fun <T, R> Iterable<T>.map(crossinline transform: (T) -> R)
15 |     = asSequence().map { transform(it) }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/util/exception/IndexOutOfDimensionException.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.util.exception
2 | 
3 | class IndexOutOfDimensionException: Exception()


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/util/exception/NoMoreElementsException.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.util.exception
2 | 
3 | class NoMoreElementsException: Exception()


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/util/log/LoggerHelpers.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.util.log
 2 | 
 3 | import org.slf4j.Logger
 4 | 
 5 | inline fun Logger.info(block: () -> String) {
 6 |   if (isInfoEnabled) info(block())
 7 | }
 8 | 
 9 | inline fun Logger.debug(block: () -> String) {
10 |   if (isDebugEnabled) debug(block())
11 | }
12 | 
13 | inline fun Logger.warn(block: () -> String) {
14 |   if (isWarnEnabled) warn(block())
15 | }
16 | 
17 | inline fun Logger.error(block: () -> String) {
18 |   if (isErrorEnabled) error(block())
19 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/util/math/Binomial.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.util.math
2 | 
3 | import org.apache.commons.math3.util.FastMath.exp
4 | 
5 | fun binomial(trial: Int, x: Int, p: Double): Double {
6 |   if (trial == 0) return if (x == 0) 1.0 else 0.0
7 |   if (x < 0 || x > trial) return 0.0
8 |   return exp(logBinomialProbability(x, trial, p, 1.0 - p))
9 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/util/math/Vector.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.util.math
 2 | 
 3 | import kotlin.math.PI
 4 | import kotlin.math.sqrt
 5 | 
 6 | 
 7 | data class Vector2(var x: Double = 0.0, var y: Double = 0.0) {
 8 |   companion object {
 9 |     fun zero() = Vector2(0.0, 0.0)
10 |     val ZERO = zero()
11 |   }
12 |   
13 |   fun set(v: Vector2) {
14 |     x = v.x
15 |     y = v.y
16 |   }
17 |   
18 |   fun set(x: Double, y: Double) {
19 |     this.x = x
20 |     this.y = y
21 |   }
22 |   
23 |   operator fun plus(v: Vector2) = Vector2(x + v.x, y + v.y)
24 |   operator fun plusAssign(v: Vector2) {
25 |     x += v.x
26 |     y += v.y
27 |   }
28 |   
29 |   operator fun minus(v: Vector2) = Vector2(x - v.x, y - v.y)
30 |   operator fun minusAssign(v: Vector2) {
31 |     x -= v.x
32 |     y -= v.y
33 |   }
34 |   
35 |   operator fun times(s: Double) = Vector2(x * s, y * s)
36 |   operator fun timesAssign(s: Double) {
37 |     x *= s
38 |     y *= s
39 |   }
40 |   
41 |   operator fun div(s: Double) = Vector2(x / s, y / s)
42 |   operator fun divAssign(s: Double) {
43 |     x /= s
44 |     y /= s
45 |   }
46 |   
47 |   fun norm(): Vector2 {
48 |     val v = dist()
49 |     x /= v
50 |     y /= v
51 |     return this
52 |   }
53 |   
54 |   fun rot90L(): Vector2 {
55 |     val tmp = x
56 |     x = -y
57 |     y = tmp
58 |     return this
59 |   }
60 |   
61 |   fun rot90R(): Vector2 {
62 |     val tmp = x
63 |     x = y
64 |     y = -tmp
65 |     return this
66 |   }
67 |   
68 |   fun copy() = Vector2(x, y)
69 |   fun dist() = sqrt(x * x + y * y)
70 |   fun dist(v: Vector2) = sqrt((x - v.x) * (x - v.x) + (y - v.y) * (y - v.y))
71 |   
72 |   /** @return the angle in degrees of this vector (point) relative to the x-axis. Angles are towards the positive y-axis
73 |    *         (typically counter-clockwise) and between 0 and 360. */
74 |   fun angle(): Double {
75 |     var angle = Math.atan2(y, x).toFloat() * 180f / PI
76 |     if (angle < 0) angle += 360f
77 |     return angle
78 |   }
79 |   
80 |   fun outOf(_x: Double, _y: Double, width: Double, height: Double): Boolean {
81 |     return x < _x || x > _x + width || y < _y || y > _y + height
82 |   }
83 |   
84 |   fun rotate(degrees: Double): Vector2 {
85 |     val radians = degrees * 180f / PI
86 |     val cos = Math.cos(radians)
87 |     val sin = Math.sin(radians)
88 |     
89 |     val newX = x * cos - y * sin
90 |     val newY = x * sin + y * cos
91 |     
92 |     x = newX
93 |     y = newY
94 |     
95 |     return this
96 |   }
97 | }
98 | 


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/util/resource/ClasspathLocation.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.util.resource
 2 | 
 3 | import java.io.InputStream
 4 | import java.net.URL
 5 | 
 6 | /**
 7 |  * A resource location that searches the classpath
 8 |  *
 9 |  * @author kevin
10 |  */
11 | class ClasspathLocation : ResourceLocation {
12 |     override fun getResource(ref: String): URL {
13 |         val cpRef = ref.replace('\\', '/')
14 |         return ResourceLoader::class.java.classLoader.getResource(cpRef)
15 |     }
16 | 
17 |     override fun getResourceAsStream(ref: String): InputStream {
18 |         val cpRef = ref.replace('\\', '/')
19 |         return ResourceLoader::class.java.classLoader.getResourceAsStream(cpRef)
20 |     }
21 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/util/resource/FileSystemLocation.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.util.resource
 2 | 
 3 | import java.io.File
 4 | import java.io.FileInputStream
 5 | import java.io.IOException
 6 | import java.io.InputStream
 7 | import java.net.URL
 8 | 
 9 | /**
10 |  * A resource loading location that searches somewhere on the classpath
11 |  *
12 |  * @author kevin
13 |  */
14 | 
15 | /**
16 |  * Create a new resoruce location based on the file system
17 |  *
18 |  * @param root The root of the file system to search
19 |  */
20 | class FileSystemLocation(private val root: File) : ResourceLocation {
21 | 
22 |     /**
23 |      * @see ResourceLocation.getResource
24 |      */
25 |     override fun getResource(ref: String): URL? {
26 |         return try {
27 |             var file = File(root, ref)
28 |             if (!file.exists()) {
29 |                 file = File(ref)
30 |             }
31 |             if (!file.exists()) {
32 |                 null
33 |             } else file.toURI().toURL()
34 |         } catch (e: IOException) {
35 |             null
36 |         }
37 |     }
38 | 
39 |     /**
40 |      * @see ResourceLocation.getResourceAsStream
41 |      */
42 |     override fun getResourceAsStream(ref: String): InputStream? {
43 |         return try {
44 |             var file = File(root, ref)
45 |             if (!file.exists()) {
46 |                 file = File(ref)
47 |             }
48 |             FileInputStream(file)
49 |         } catch (e: IOException) {
50 |             null
51 |         }
52 |     }
53 | 
54 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/util/resource/ResourceLocation.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.util.resource
 2 | 
 3 | import java.io.InputStream
 4 | import java.net.URL
 5 | 
 6 | /**
 7 |  * A location from which resources can be loaded
 8 |  *
 9 |  * @author kevin
10 |  */
11 | interface ResourceLocation {
12 |     /**
13 |      * Get a resource as an input stream
14 |      *
15 |      * @param ref The reference to the resource to retrieve
16 |      * @return A stream from which the resource can be read or
17 |      * null if the resource can't be found in this location
18 |      */
19 |     fun getResourceAsStream(ref: String): InputStream?
20 | 
21 |     /**
22 |      * Get a resource as a URL
23 |      *
24 |      * @param ref The reference to the resource to retrieve
25 |      * @return A URL from which the resource can be read
26 |      */
27 |     fun getResource(ref: String): URL?
28 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/util/tuples/tuple2.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.util.tuples
 2 | 
 3 | data class tuple2<A, B>(var _1: A, var _2: B) {
 4 |   override fun toString(): String = "($_1,$_2)"
 5 |   
 6 |   operator fun invoke(a: A, b: B): tuple2<A, B> {
 7 |     _1 = a
 8 |     _2 = b
 9 |     return this
10 |   }
11 | }
12 | 


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/util/tuples/tuple3.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.util.tuples
2 | 
3 | data class tuple3<A, B, C>(var _1: A, var _2: B, var _3: C) {
4 |   override fun toString(): String {
5 |     return "($_1,$_2,$_3)"
6 |   }
7 | }
8 | 


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/util/tuples/tuple4.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.util.tuples
2 | 
3 | data class tuple4<A, B, C, D>(var _1: A, var _2: B, var _3: C, var _4: D) {
4 |   override fun toString(): String {
5 |     return "($_1,$_2,$_3,$_4)"
6 |   }
7 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/util/tuples/tuple5.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.util.tuples
2 | 
3 | data class tuple5<A, B, C, D, E>(var _1: A, var _2: B, var _3: C, var _4: D, var _5: E) {
4 |   override fun toString(): String {
5 |     return "($_1,$_2,$_3,$_4,$_5)"
6 |   }
7 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/util/tuples/tuple6.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.util.tuples
2 | 
3 | data class tuple6<A, B, C, D, E, F>(var _1: A, var _2: B, var _3: C, var _4: D, var _5: E, var _6: F) {
4 |   override fun toString(): String {
5 |     return "($_1,$_2,$_3,$_4,$_5,$_6)"
6 |   }
7 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/util/ui/ChartApp.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.util.ui
 2 | 
 3 | import javafx.scene.chart.NumberAxis
 4 | import tornadofx.*
 5 | import java.util.concurrent.ConcurrentLinkedQueue
 6 | 
 7 | class Line(val description: String, val data: MutableMap<Number, Number> = hashMapOf()) {
 8 |   operator fun set(x: Number, y: Number) = data.put(x, y)
 9 | }
10 | 
11 | class LineChart(val title: String, val xAxisLabel: String, val yAxisLabel: String,
12 |                 val lines: MutableCollection<Line> = ConcurrentLinkedQueue(),
13 |                 val xAxisConfig: NumberAxis.() -> Unit = {}, val yAxisConfig: NumberAxis.() -> Unit = {},
14 |                 val linesSortor: Array<Line>.() -> Unit = {}) {
15 |   operator fun plusAssign(line: Line) {
16 |     lines += line
17 |   }
18 | }
19 | 
20 | class D2DChart: View() {
21 |   companion object {
22 |     val charts = mutableListOf<LineChart>()
23 |   }
24 |   
25 |   override val root = stackpane {
26 |     flowpane {
27 |       for (chart in charts)
28 |         chart.apply {
29 |           linechart(title, NumberAxis(), NumberAxis()) {
30 |             (xAxis as NumberAxis).apply {
31 |               isForceZeroInRange = false
32 |               isAutoRanging = true
33 |               label = xAxisLabel
34 |               xAxisConfig(this)
35 |             }
36 |             (yAxis as NumberAxis).apply {
37 |               isForceZeroInRange = false
38 |               isAutoRanging = true
39 |               label = yAxisLabel
40 |               yAxisConfig(this)
41 |             }
42 |             val lines = chart.lines.toTypedArray()
43 |             linesSortor(lines)
44 |             for (line in lines)
45 |               series(line.description) {
46 |                 for ((k, v) in line.data)
47 |                   data(k, v)
48 |               }
49 |             createSymbols = false
50 |           }
51 |         }
52 |     }
53 |   }
54 | }
55 | 
56 | class ChartApp: App(D2DChart::class)
57 | 


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/util/ui/D2DGameUI.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.util.ui
 2 | 
 3 | import javafx.application.Application
 4 | import javafx.application.Platform
 5 | import javafx.collections.FXCollections
 6 | import javafx.geometry.Orientation
 7 | import javafx.scene.Scene
 8 | import javafx.scene.canvas.Canvas
 9 | import javafx.scene.canvas.GraphicsContext
10 | import javafx.scene.chart.LineChart
11 | import javafx.scene.chart.NumberAxis
12 | import javafx.scene.chart.XYChart
13 | import javafx.scene.layout.FlowPane
14 | import javafx.stage.Stage
15 | import lab.mars.rl.util.resource.ResourceLoader
16 | import java.util.concurrent.CyclicBarrier
17 | 
18 | class D2DGameUI : Application() {
19 |   class ChartDescription(val title: String,
20 |                          val xAxisLabel: String, val yAxisLabel: String,
21 |                          val numSeries: Int = 1,
22 |                          val xForceZeroInRange: Boolean = true,
23 |                          val yForceZeroInRange: Boolean = true) {
24 |     val data = Array(numSeries) { FXCollections.observableArrayList<XYChart.Data<Number, Number>>()!! }
25 |   }
26 |   
27 |   lateinit var canvas: Canvas
28 |   lateinit var primaryStage: Stage
29 |   
30 |   companion object {
31 |     var width = 1000.0
32 |     var height = 800.0
33 |     var canvas_width = 600.0
34 |     var canvas_height = 800.0
35 |     var title = ""
36 |     val charts = FXCollections.observableArrayList<ChartDescription>()!!
37 |     var afterStartup: (GraphicsContext) -> Unit = {}
38 |     lateinit var render: ((GraphicsContext) -> Unit) -> Unit
39 |     
40 |   }
41 |   
42 |   override fun start(ps: Stage?) {
43 |     primaryStage = ps!!
44 |     
45 |     primaryStage.title = title
46 |     val root = FlowPane(Orientation.HORIZONTAL)
47 |     canvas = Canvas(canvas_width, canvas_height)
48 |     root.children.add(canvas)
49 |     for (c in charts) {
50 |       val chart = LineChart(NumberAxis().apply { label = c.xAxisLabel;isForceZeroInRange = c.xForceZeroInRange },
51 |                             NumberAxis().apply { label = c.yAxisLabel;isForceZeroInRange = c.yForceZeroInRange },
52 |                             FXCollections.observableArrayList<XYChart.Series<Number, Number>>().apply {
53 |                               var i = 0
54 |                               for (d in c.data)
55 |                                 add(XYChart.Series("${i++}", d))
56 |                             }).apply {
57 |         title = c.title
58 |         createSymbols = false
59 | //        isLegendVisible = false
60 |         animated = false
61 |         stylesheets.add(ResourceLoader.getResource("StockLineChart.css").toExternalForm())
62 |       }
63 |       root.children.add(chart)
64 |     }
65 |     
66 |     primaryStage.scene = Scene(root, width, height)
67 |     primaryStage.show()
68 |     render = this::render
69 |     afterStartup(canvas.graphicsContext2D)
70 |   }
71 |   
72 |   val barrier = CyclicBarrier(2)
73 |   fun render(draw: (GraphicsContext) -> Unit = {}) {
74 |     barrier.reset()
75 |     Platform.runLater {
76 |       val gc = canvas.graphicsContext2D
77 |       draw(gc)
78 |       primaryStage.title = title
79 |       barrier.await()
80 |     }
81 |     barrier.await()
82 |   }
83 |   
84 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/util/ui/GridWorldUI.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.util.ui
 2 | 
 3 | import javafx.application.Application
 4 | import javafx.application.Platform
 5 | import javafx.scene.Group
 6 | import javafx.scene.Scene
 7 | import javafx.scene.canvas.Canvas
 8 | import javafx.scene.paint.Color
 9 | import javafx.stage.Stage
10 | import lab.mars.rl.model.impl.mdp.ActionValueFunction
11 | import lab.mars.rl.model.impl.mdp.IndexedState
12 | import lab.mars.rl.model.impl.mdp.StateValueFunction
13 | import java.util.concurrent.CyclicBarrier
14 | 
15 | class GridWorldUI: Application() {
16 |   lateinit var canvas: Canvas
17 |   
18 |   companion object {
19 |     var after: () -> Unit = {}
20 |     var render: (ActionValueFunction, IndexedState) -> Unit = { _, _ -> }
21 |     var width = 450.0
22 |     var height = 300.0
23 |     var grid_x = 9
24 |     var grid_y = 6
25 |   }
26 |   
27 |   override fun start(ps: Stage?) {
28 |     val primaryStage = ps!!
29 |     primaryStage.title = "Drawing Operations Test"
30 |     val root = Group()
31 |     canvas = Canvas(width, height)
32 |     root.children.add(canvas)
33 |     primaryStage.scene = Scene(root)
34 |     primaryStage.show()
35 |     render = this::render
36 |     after()
37 |   }
38 |   
39 |   val barrier = CyclicBarrier(2)
40 |   var max = 1.0
41 |   var min = 0.0
42 |   fun render(V: StateValueFunction, s: IndexedState) {
43 |     barrier.reset()
44 |     Platform.runLater {
45 |       val gc = canvas.graphicsContext2D
46 |       gc.clearRect(0.0, 0.0, width, height)
47 |       gc.stroke = Color.BLACK
48 |       val u_x = width / grid_x
49 |       val u_y = height / grid_y
50 |       for ((dim, value) in V.withIndices()) {
51 |         max = maxOf(max, value)
52 |         min = minOf(min, value)
53 |         val nx = dim[0]
54 |         val ny = dim[1]
55 |         gc.fill = Color.BLUE.interpolate(Color.RED, if (max == min) 0.5 else (value - min) / (max - min))
56 |         val x = u_x * nx
57 |         val y = u_y * ny
58 |         gc.fillRect(x, y, u_x, u_y)
59 |       }
60 |       gc.fill = Color.GREEN
61 |       gc.fillRect(s[0] * u_x, s[1] * u_y, u_x, u_y)
62 |       for ((dim, value) in V.withIndices()) {
63 |         max = maxOf(max, value)
64 |         val nx = dim[0]
65 |         val ny = dim[1]
66 |         val x = u_x * nx
67 |         val y = u_y * ny
68 |         gc.strokeRect(x, y, u_x, u_y)
69 |       }
70 |       barrier.await()
71 |     }
72 |     barrier.await()
73 |   }
74 |   
75 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/util/ui/MountainCarUI.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.util.ui
 2 | 
 3 | import javafx.application.Application
 4 | import javafx.application.Platform
 5 | import javafx.scene.Group
 6 | import javafx.scene.Scene
 7 | import javafx.scene.canvas.Canvas
 8 | import javafx.scene.paint.Color
 9 | import javafx.stage.Stage
10 | import lab.mars.rl.model.impl.mdp.DefaultAction
11 | import lab.mars.rl.problem.MountainCar
12 | import lab.mars.rl.problem.MountainCar.CarState
13 | import java.util.concurrent.CyclicBarrier
14 | import kotlin.math.PI
15 | import kotlin.math.sin
16 | 
17 | class MountainCarUI: Application() {
18 |   lateinit var canvas: Canvas
19 |   
20 |   companion object {
21 |     var render: (Int, Int, CarState, DefaultAction<Int, CarState>) -> Unit = { _, _, _, _ -> }
22 |     var after: () -> Unit = {}
23 |     var width = 450.0
24 |     var height = 300.0
25 |   }
26 |   
27 |   override fun start(ps: Stage?) {
28 |     val primaryStage = ps!!
29 |     primaryStage.title = "Mountain Car"
30 |     val root = Group()
31 |     canvas = Canvas(width, height)
32 |     root.children.add(canvas)
33 |     primaryStage.scene = Scene(root)
34 |     primaryStage.show()
35 |     render = this::render
36 |     after()
37 |   }
38 |   
39 |   val barrier = CyclicBarrier(2)
40 |   fun tx(x: Double) = (x + PI / 2) / (2 * PI / 3) * width
41 |   fun ty(y: Double) = (-y + 1) / 2 * height
42 |   fun render(episode: Int, step: Int, s: CarState, a: DefaultAction<Int, CarState>) {
43 |     barrier.reset()
44 |     Platform.runLater {
45 |       val gc = canvas.graphicsContext2D
46 |       gc.clearRect(0.0, 0.0, width, height)
47 |       gc.stroke = Color.BLACK
48 |       gc.strokeText("episode:$episode\nstep:$step", width / 2-50, height / 2)
49 |       for (i in 0..40) {
50 |         val x1 = i / 40.0 * 2 * PI / 3
51 |         val y1 = sin(3 * (x1 + PI / 6))
52 |         val x2 = (i + 1) / 40.0 * 2 * PI / 3
53 |         val y2 = sin(3 * (x2 + PI / 6))
54 |         gc.strokeLine(i / 40.0 * width, ty(y1), (i + 1) / 40.0 * width, ty(y2))
55 |       }
56 |       val min_x=tx(MountainCar.POSITION_MIN)
57 |       val min_y=ty(sin(3*MountainCar.POSITION_MIN))
58 |       gc.strokeLine(min_x,min_y,min_x+10,min_y)
59 |       val ball_x = tx(s.position)
60 |       val ball_y = ty(sin(3 * s.position))
61 |       gc.strokeOval(ball_x, ball_y, 10.0, 10.0)
62 |       gc.stroke = Color.RED
63 |       gc.strokeLine(ball_x, ball_y, ball_x + a.value * 40, ball_y)
64 |       barrier.await()
65 |     }
66 |     Thread.sleep(30)
67 |     barrier.await()
68 |   }
69 | }


--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/util/ui/RodManeuveringUI.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.util.ui
 2 | 
 3 | import javafx.application.Application
 4 | import javafx.application.Platform.runLater
 5 | import javafx.scene.Group
 6 | import javafx.scene.Scene
 7 | import javafx.scene.canvas.Canvas
 8 | import javafx.scene.paint.Color
 9 | import javafx.stage.Stage
10 | import lab.mars.rl.model.impl.mdp.IndexedState
11 | import lab.mars.rl.model.impl.mdp.StateValueFunction
12 | import lab.mars.rl.problem.RodManeuvering
13 | import lab.mars.rl.problem.RodManeuvering.currentStatus
14 | import lab.mars.rl.problem.RodManeuvering.height
15 | import lab.mars.rl.problem.RodManeuvering.resolution
16 | import lab.mars.rl.problem.RodManeuvering.rodEdges
17 | import lab.mars.rl.problem.RodManeuvering.rotate
18 | import lab.mars.rl.problem.RodManeuvering.rotation_resolution
19 | import lab.mars.rl.problem.RodManeuvering.unit_x
20 | import lab.mars.rl.problem.RodManeuvering.unit_y
21 | import lab.mars.rl.problem.RodManeuvering.width
22 | import lab.mars.rl.util.math.max
23 | import java.util.concurrent.CyclicBarrier
24 | 
25 | class RodManeuveringUI: Application() {
26 |   lateinit var canvas: Canvas
27 |   
28 |   companion object {
29 |     var after: () -> Unit = {}
30 |     var render: (StateValueFunction, IndexedState) -> Unit = { _, _ -> }
31 |   }
32 |   
33 |   override fun start(ps: Stage?) {
34 |     val primaryStage = ps!!
35 | //    primaryStage.title = "Drawing Operations Test"
36 |     val root = Group()
37 |     canvas = Canvas(width, height)
38 |     drawMap()
39 |     root.children.add(canvas)
40 |     primaryStage.scene = Scene(root)
41 |     primaryStage.show()
42 |     render = this::render
43 |     after()
44 |   }
45 |   
46 |   fun drawMap() {
47 |     val gc = canvas.graphicsContext2D
48 |     gc.stroke = Color.BLACK
49 |     for (o in RodManeuvering.obstacles) {
50 |       o.v.apply {
51 |         val xPoints = DoubleArray(size) { this[it].x }
52 |         val yPoints = DoubleArray(size) { this[it].y }
53 |         gc.strokePolygon(xPoints,
54 |                          yPoints, size)
55 |       }
56 |     }
57 |   }
58 |   
59 |   val barrier = CyclicBarrier(2)
60 |   var max = 1.0
61 |   var min = 0.0
62 |   fun render(V: StateValueFunction, s: IndexedState) {
63 |     barrier.reset()
64 |     runLater {
65 |       val (x, y, rotation) = currentStatus(s)
66 |       val gc = canvas.graphicsContext2D
67 |       gc.clearRect(0.0, 0.0, width, height)
68 |       gc.stroke = Color.BLACK
69 |       for (nx in 0 until resolution)
70 |         for (ny in 0 until resolution) {
71 |           val value = max(0 until rotation_resolution) { V[nx, ny, it] }
72 |           max = maxOf(max, value)
73 |           min = minOf(min, value)
74 |           gc.fill = Color.BLUE.interpolate(Color.RED, if (max == min) 0.5 else (value - min) / (max - min))
75 |           gc.fillRect(nx * unit_x, ny * unit_y, unit_x, unit_y)
76 |         }
77 |       gc.fill = Color.GREEN
78 |       for (edge in rodEdges) {
79 |         val p1 = edge._1.rotate(rotation).add(x, y)
80 |         val p2 = edge._2.rotate(rotation).add(x, y)
81 |         gc.strokeLine(p1.x, p1.y, p2.x, p2.y)
82 |       }
83 |       
84 |       drawMap()
85 |       barrier.await()
86 |     }
87 |     barrier.await()
88 |   }
89 |   
90 | }


--------------------------------------------------------------------------------
/src/main/resources/StockLineChart.css:
--------------------------------------------------------------------------------
1 | .chart-series-line {
2 |     -fx-stroke-width: 0.5px;
3 |     -fx-effect: null;
4 | }


--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/dp/Test Value Iteration.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.dp
 2 | 
 3 | import lab.mars.rl.problem.CarRental
 4 | import lab.mars.rl.problem.GridWorld
 5 | import lab.mars.rl.util.format
 6 | import org.junit.Assert
 7 | import org.junit.Test
 8 | 
 9 | class `Test Value Iteration` {
10 |   @Test
11 |   fun `GridWorld Problem`() {
12 |     val prob = GridWorld.make()
13 |     val (_, V, _) = prob.ValueIteration()
14 |     for (s in prob.states) {
15 |       println(V[s])
16 |     }
17 |   }
18 |   
19 |   @Test
20 |   fun `Car Rental  Value Iteration`() {
21 |     val prob = CarRental.make(false)
22 |     val (_, V, _) = prob.ValueIteration()
23 |     var i = 0
24 |     for (a in CarRental.max_car downTo 0)
25 |       for (b in 0..CarRental.max_car)
26 |         Assert.assertEquals(`Car Rental Result`[i++], V[prob.states[a, b]].format(2))
27 |   }
28 | }
29 | 


--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/dyna/Test Optimal Dyna-Q on-policy.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.dyna
 2 | 
 3 | import javafx.application.Application
 4 | import lab.mars.rl.algo.average_α
 5 | import lab.mars.rl.model.isNotTerminal
 6 | import lab.mars.rl.problem.Blackjack
 7 | import lab.mars.rl.problem.DynaMaze
 8 | import lab.mars.rl.problem.RodManeuvering
 9 | import lab.mars.rl.util.math.argmax
10 | import lab.mars.rl.util.printBlackjack
11 | import lab.mars.rl.util.ui.GridWorldUI
12 | import lab.mars.rl.util.ui.RodManeuveringUI
13 | import org.junit.Test
14 | import java.util.concurrent.CountDownLatch
15 | import kotlin.concurrent.thread
16 | 
17 | class `Test Optimal Dyna-Q on-policy` {
18 |   @Test
19 |   fun `Blackjack`() {
20 |     val (prob) = Blackjack.make()
21 |     val (π, V) = prob.`Dyna-Q-OnPolicy`(
22 |         n = 10,
23 |         ε = 0.1,
24 |         α = average_α(prob),
25 |         episodes = 1000000)
26 |     printBlackjack(prob, π, V)
27 |   }
28 |   
29 |   @Test
30 |   fun `Dyna Maze UI`() {
31 |     val prob = DynaMaze.make()
32 |     val latch = CountDownLatch(1)
33 |     
34 |     thread {
35 |       latch.await()
36 |       val (π) = prob.`Dyna-Q-OnPolicy`(
37 |           n = 20,
38 |           ε = 0.1,
39 |           α = average_α(prob),
40 |           episodes = 1000,
41 |           stepListener = { V, s ->
42 |             GridWorldUI.render(V, s)
43 |           })
44 |       var s = prob.started()
45 |       var count = 0
46 |       print(s)
47 |       while (s.isNotTerminal) {
48 |         val a = argmax(s.actions) { π[s, it] }
49 |         val possible = a.sample()
50 |         s = possible.next
51 |         count++
52 |         print("${DynaMaze.desc_move[a[0]]}$s")
53 |       }
54 |       println("\nsteps=$count")//optimal=14
55 |     }
56 |     GridWorldUI.after = { latch.countDown() }
57 |     Application.launch(GridWorldUI::class.java)
58 |   }
59 |   
60 |   @Test
61 |   fun `Rod Maneuvering UI`() {
62 |     val prob = RodManeuvering.make()
63 |     val latch = CountDownLatch(1)
64 |     
65 |     thread {
66 |       latch.await()
67 |       val (π) = prob.`Dyna-Q-OnPolicy`(
68 |           n = 20,
69 |           ε = 0.1,
70 |           α = average_α(prob),
71 |           episodes = 1000,
72 |           stepListener = { V, s ->
73 |             RodManeuveringUI.render(V, s)
74 |           })
75 |       var s = prob.started()
76 |       var count = 0
77 |       print(s)
78 |       while (s.isNotTerminal) {
79 |         val a = argmax(s.actions) { π[s, it] }
80 |         val possible = a.sample()
81 |         s = possible.next
82 |         count++
83 |         print("$a$s")
84 |       }
85 |       println("\nsteps=$count")//optimal=39
86 |     }
87 |     RodManeuveringUI.after = { latch.countDown() }
88 |     Application.launch(RodManeuveringUI::class.java)
89 |   }
90 | }


--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/dyna/Test Optimal Dyna-Q+.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.dyna
 2 | 
 3 | import javafx.application.Application
 4 | import lab.mars.rl.model.isNotTerminal
 5 | import lab.mars.rl.problem.DynaMaze
 6 | import lab.mars.rl.util.math.argmax
 7 | import lab.mars.rl.util.ui.GridWorldUI
 8 | import org.junit.Test
 9 | import java.util.concurrent.CountDownLatch
10 | import kotlin.concurrent.thread
11 | 
12 | class `Test Optimal Dyna-Q+` {
13 |   @Test
14 |   fun `Dyna Maze`() {
15 |     val prob = DynaMaze.make()
16 |     val latch = CountDownLatch(1)
17 |     
18 |     thread {
19 |       latch.await()
20 |       val (π) = prob.`Dyna-Q+`(
21 |           n = 10,
22 |           α = { _, _ -> 0.1 },
23 |           ε = 0.1,
24 |           κ = 1e-4,
25 |           episodes = 1000,
26 |           stepListener = { V, s ->
27 |             GridWorldUI.render(V, s)
28 |           })
29 |       var s = prob.started()
30 |       var count = 0
31 |       print(s)
32 |       while (s.isNotTerminal) {
33 |         val a = argmax(s.actions) { π[s, it] }
34 |         val possible = a.sample()
35 |         s = possible.next
36 |         count++
37 |         print("${DynaMaze.desc_move[a[0]]}$s")
38 |       }
39 |       println("\nsteps=$count")//optimal=14
40 |     }
41 |     GridWorldUI.after = { latch.countDown() }
42 |     Application.launch(GridWorldUI::class.java)
43 |   }
44 | }


--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/dyna/Test Optimal Dyna-Q.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.dyna
 2 | 
 3 | import javafx.application.Application
 4 | import lab.mars.rl.algo.average_α
 5 | import lab.mars.rl.model.isNotTerminal
 6 | import lab.mars.rl.problem.Blackjack
 7 | import lab.mars.rl.problem.DynaMaze
 8 | import lab.mars.rl.util.math.argmax
 9 | import lab.mars.rl.util.printBlackjack
10 | import lab.mars.rl.util.ui.GridWorldUI
11 | import org.junit.Test
12 | import java.util.concurrent.CountDownLatch
13 | import kotlin.concurrent.thread
14 | 
15 | class `Test Optimal Dyna-Q` {
16 |   @Test
17 |   fun `Blackjack`() {
18 |     val (prob) = Blackjack.make()
19 |     val (π, V) = prob.DynaQ(
20 |         n = 10,
21 |         ε = 0.1,
22 |         α = average_α(prob),
23 |         episodes = 100000)
24 |     printBlackjack(prob, π, V)
25 |   }
26 |   
27 |   @Test
28 |   fun `Dyna Maze`() {
29 |     val prob = DynaMaze.make()
30 |     val (π) = prob.DynaQ(
31 |         n = 10,
32 |         ε = 0.1,
33 |         α = average_α(prob),
34 |         episodes = 100000)
35 |     var s = prob.started()
36 |     var count = 0
37 |     print(s)
38 |     while (s.isNotTerminal) {
39 |       val a = argmax(s.actions) { π[s, it] }
40 |       val possible = a.sample()
41 |       s = possible.next
42 |       count++
43 |       print("${DynaMaze.desc_move[a[0]]}$s")
44 |     }
45 |     println("\nsteps=$count")//optimal=14
46 |   }
47 |   
48 |   @Test
49 |   fun `Dyna Maze UI`() {
50 |     val prob = DynaMaze.make()
51 |     
52 |     val latch = CountDownLatch(1)
53 |     
54 |     thread {
55 |       latch.await()
56 |       val (π) = prob.DynaQ(
57 |           n = 10,
58 |           ε = 0.1,
59 |           α = average_α(prob),
60 |           episodes = 100000,
61 |           stepListener = { V, s ->
62 |             GridWorldUI.render(V, s)
63 |           })
64 |       var s = prob.started()
65 |       var count = 0
66 |       print(s)
67 |       while (s.isNotTerminal) {
68 |         val a = argmax(s.actions) { π[s, it] }
69 |         val possible = a.sample()
70 |         s = possible.next
71 |         count++
72 |         print("${DynaMaze.desc_move[a[0]]}$s")
73 |       }
74 |       println("\nsteps=$count")//optimal=14
75 |     }
76 |     GridWorldUI.after = { latch.countDown() }
77 |     Application.launch(GridWorldUI::class.java)
78 |   }
79 | }


--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/dyna/Test Optimal Prioritized Sweeping Stochastic.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.dyna
 2 | 
 3 | import javafx.application.Application
 4 | import lab.mars.rl.algo.average_α
 5 | import lab.mars.rl.model.isNotTerminal
 6 | import lab.mars.rl.problem.Blackjack
 7 | import lab.mars.rl.problem.DynaMaze
 8 | import lab.mars.rl.util.math.argmax
 9 | import lab.mars.rl.util.printBlackjack
10 | import lab.mars.rl.util.ui.GridWorldUI
11 | import org.junit.Test
12 | import java.util.concurrent.CountDownLatch
13 | import kotlin.concurrent.thread
14 | 
15 | class `Test Optimal Prioritized Sweeping Stochastic` {
16 |   @Test
17 |   fun `Blackjack`() {
18 |     val (prob) = Blackjack.make()
19 |     val (π, V) = prob.PrioritizedSweepingStochasticEnv(
20 |         n = 10,
21 |         θ = 0.0,
22 |         ε = 0.1,
23 |         α = average_α(prob),
24 |         episodes = 100000)
25 |     printBlackjack(prob, π, V)
26 |   }
27 |   
28 |   @Test
29 |   fun `Dyna Maze UI`() {
30 |     val prob = DynaMaze.make()
31 |     val latch = CountDownLatch(1)
32 |     
33 |     thread {
34 |       latch.await()
35 |       val (π) = prob.PrioritizedSweepingStochasticEnv(
36 |           n = 10,
37 |           θ = 0.0,
38 |           ε = 0.1,
39 |           α = { _, _ -> 0.1 },
40 |           episodes = 1000,
41 |           stepListener = { V, s ->
42 |             GridWorldUI.render(V, s)
43 |           })
44 |       var s = prob.started()
45 |       var count = 0
46 |       print(s)
47 |       while (s.isNotTerminal) {
48 |         val a = argmax(s.actions) { π[s, it] }
49 |         val possible = a.sample()
50 |         s = possible.next
51 |         count++
52 |         print("${DynaMaze.desc_move[a[0]]}$s")
53 |       }
54 |       println("\nsteps=$count")//optimal=14
55 |     }
56 |     GridWorldUI.after = { latch.countDown() }
57 |     Application.launch(GridWorldUI::class.java)
58 |   }
59 | }


--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/dyna/Test Optimal RandomSampleOneStepTabularQLearning.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.dyna
 2 | 
 3 | import lab.mars.rl.algo.average_α
 4 | import lab.mars.rl.problem.Blackjack
 5 | import lab.mars.rl.util.printBlackjack
 6 | import org.junit.Test
 7 | 
 8 | class `Test Optimal RandomSampleOneStepTabularQLearning` {
 9 |   @Test
10 |   fun `Blackjack`() {
11 |     val (prob) = Blackjack.make()
12 |     val (π, V) = prob.RandomSampleOneStepTabularQLearning(
13 |         ε = 0.1,
14 |         α = average_α(prob),
15 |         episodes = 1000000)
16 |     printBlackjack(prob, π, V)
17 |   }
18 |   
19 | }


--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/eligibility_trace/prediction/Test Prediction Off-line λ-return.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.eligibility_trace.prediction
 2 | 
 3 | import ch.qos.logback.classic.Level
 4 | import javafx.application.Application
 5 | import kotlinx.coroutines.runBlocking
 6 | import lab.mars.rl.model.impl.func.LinearFunc
 7 | import lab.mars.rl.model.impl.func.SimpleTileCoding
 8 | import lab.mars.rl.model.impl.mdp.IndexedState
 9 | import lab.mars.rl.problem.`19-state RandomWalk`
10 | import lab.mars.rl.util.asyncs
11 | import lab.mars.rl.util.await
12 | import lab.mars.rl.util.listOf
13 | import lab.mars.rl.util.logLevel
14 | import lab.mars.rl.util.tuples.tuple2
15 | import lab.mars.rl.util.ui.ChartApp
16 | import lab.mars.rl.util.ui.D2DChart
17 | import lab.mars.rl.util.ui.Line
18 | import lab.mars.rl.util.ui.LineChart
19 | import org.apache.commons.math3.util.FastMath.pow
20 | import org.apache.commons.math3.util.FastMath.sqrt
21 | import org.junit.Test
22 | 
23 | class `Test Prediction Off-line λ-return` {
24 |   @Test
25 |   fun `Performance`() {
26 |     logLevel(Level.ERROR)
27 |     
28 |     val (prob, π) = `19-state RandomWalk`.make()
29 |     val realV = listOf(-20..20 step 2) { it / 20.0 }
30 |     realV[0] = 0.0
31 |     realV[20] = 0.0
32 |     
33 |     val λs = listOf(0.0, 0.4, 0.8, 0.9, 0.95, 0.975, 0.99, 1.0)
34 |     val αs = listOf(0.0, 0.01, 0.02, 0.04, 0.08, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0)
35 |     
36 |     val episodes = 10
37 |     val runs = 100
38 |     val truncateValue = 0.55
39 |     
40 |     val chart = LineChart("Off-line λ-return", "α", "Average RMS")
41 |     runBlocking {
42 |       for (λ in λs) {
43 |         val line = Line("λ=$λ")
44 |         chart += line
45 |         asyncs(αs) { α ->
46 |           var rms_sum = 0.0
47 |           asyncs(runs) { run ->
48 |             val func = LinearFunc(
49 |                 SimpleTileCoding(1,
50 |                                  prob.states.size,
51 |                                  1,
52 |                                  0.0) { (s) -> (s as IndexedState)[0].toDouble() })
53 |             var rms = 0.0
54 |             prob.`Off-line λ-return`(
55 |                 V = func, π = π,
56 |                 α = α, λ = λ,
57 |                 episodes = episodes,
58 |                 episodeListener = { _, _ ->
59 |                   var error = 0.0
60 |                   for (s in prob.states)
61 |                     error += pow(func(s) - realV[s[0]], 2)
62 |                   error /= prob.states.size
63 |                   rms += sqrt(error)
64 |                 })
65 |             println("finish λ=$λ α=$α run=$run")
66 |             rms
67 |           }.await { rms_sum += it }
68 |           println("finish λ=$λ α=$α")
69 |           tuple2(α, rms_sum / (episodes * runs))
70 |         }.await { (α, rms) ->
71 |           if (rms < truncateValue)
72 |             line[α] = rms
73 |         }
74 |         println("finish λ=$λ")
75 |       }
76 |     }
77 |     D2DChart.charts += chart
78 |     Application.launch(ChartApp::class.java)
79 |   }
80 |   
81 | }


--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/eligibility_trace/prediction/Test Prediction Semi-gradient TD(λ).kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.eligibility_trace.prediction
 2 | 
 3 | import ch.qos.logback.classic.Level
 4 | import javafx.application.Application
 5 | import kotlinx.coroutines.runBlocking
 6 | import lab.mars.rl.model.impl.func.LinearFunc
 7 | import lab.mars.rl.model.impl.func.SimpleTileCoding
 8 | import lab.mars.rl.model.impl.mdp.IndexedState
 9 | import lab.mars.rl.problem.`19-state RandomWalk`
10 | import lab.mars.rl.util.*
11 | import lab.mars.rl.util.tuples.tuple2
12 | import lab.mars.rl.util.ui.ChartApp
13 | import lab.mars.rl.util.ui.D2DChart
14 | import lab.mars.rl.util.ui.Line
15 | import lab.mars.rl.util.ui.LineChart
16 | import org.apache.commons.math3.util.FastMath
17 | import org.junit.Test
18 | 
19 | class `Test Prediction Semi-gradient TDλ` {
20 |   @Test
21 |   fun `Performance`() {
22 |     logLevel(Level.ERROR)
23 |     
24 |     val (prob, π) = `19-state RandomWalk`.make()
25 |     val realV = listOf(-20..20 step 2) { it / 20.0 }
26 |     realV[0] = 0.0
27 |     realV[20] = 0.0
28 |     
29 |     val λs = listOf(0.0, 0.4, 0.8, 0.9, 0.95, 0.975, 0.99, 1.0)
30 |     val αs = listOf(100) { it * 0.01 }
31 |     
32 |     val episodes = 10
33 |     val runs = 100
34 |     val truncateValue = 0.6
35 |     
36 |     val chart = LineChart("Semi-gradient TD(λ)", "α", "Average RMS")
37 |     runBlocking {
38 |       for (λ in λs) {
39 |         val line = Line("λ=$λ")
40 |         chart += line
41 |         asyncs(αs) { α ->
42 |           var rms_sum = 0.0
43 |           asyncs(runs) { run ->
44 |             val func = LinearFunc(
45 |                 SimpleTileCoding(1,
46 |                                  prob.states.size,
47 |                                  1,
48 |                                  0.0) { (s) -> (s as IndexedState)[0].toDouble() }
49 |             )
50 |             var rms = 0.0
51 |             prob.`Semi-gradient TD(λ) prediction`(
52 |                 V = func, π = π,
53 |                 α = α, λ = λ,
54 |                 episodes = episodes,
55 |                 episodeListener = { _, _ ->
56 |                   var error = 0.0
57 |                   for (s in prob.states)
58 |                     error += FastMath.pow(func(s) - realV[s[0]], 2)
59 |                   error /= prob.states.size
60 |                   rms += FastMath.sqrt(error)
61 |                 })
62 |             println("finish λ=${λ.format(2)} α=$α run=$run")
63 |             rms
64 |           }.await { rms_sum += it }
65 |           println("finish λ=$λ α=$α")
66 |           tuple2(α, rms_sum / (episodes * runs))
67 |         }.await { (α, rms) ->
68 |           if (rms < truncateValue)
69 |             line[α] = rms
70 |         }
71 |         println("finish λ=$λ")
72 |       }
73 |     }
74 |     D2DChart.charts += chart
75 |     Application.launch(ChartApp::class.java)
76 |   }
77 | }


--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/eligibility_trace/prediction/Test Prediction True Online TD(λ).kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.eligibility_trace.prediction
 2 | 
 3 | import ch.qos.logback.classic.Level
 4 | import javafx.application.Application
 5 | import kotlinx.coroutines.runBlocking
 6 | import lab.mars.rl.model.impl.func.LinearFunc
 7 | import lab.mars.rl.model.impl.func.SimpleTileCoding
 8 | import lab.mars.rl.model.impl.mdp.IndexedState
 9 | import lab.mars.rl.problem.`19-state RandomWalk`
10 | import lab.mars.rl.util.*
11 | import lab.mars.rl.util.tuples.tuple2
12 | import lab.mars.rl.util.ui.ChartApp
13 | import lab.mars.rl.util.ui.D2DChart
14 | import lab.mars.rl.util.ui.Line
15 | import lab.mars.rl.util.ui.LineChart
16 | import org.apache.commons.math3.util.FastMath.pow
17 | import org.apache.commons.math3.util.FastMath.sqrt
18 | import org.junit.Test
19 | 
20 | class `Test Prediction True Online TDλ` {
21 |   @Test
22 |   fun `Performance`() {
23 |     logLevel(Level.ERROR)
24 |     
25 |     val (prob, π) = `19-state RandomWalk`.make()
26 |     val realV = listOf(-20..20 step 2) { it / 20.0 }
27 |     realV[0] = 0.0
28 |     realV[20] = 0.0
29 |     
30 |     val λs = listOf(0.0, 0.4, 0.8, 0.9, 0.95, 0.975, 0.99, 1.0)
31 |     val αs = listOf(100) { it * 0.01 }
32 |     
33 |     val episodes = 10
34 |     val runs = 100
35 |     val truncateValue = 0.6
36 |     
37 |     val chart = LineChart("True Online TD(λ)", "α", "Average RMS")
38 |     runBlocking {
39 |       for (λ in λs) {
40 |         val line = Line("λ=$λ")
41 |         chart += line
42 |         asyncs(αs) { α ->
43 |           var rms_sum = 0.0
44 |           asyncs(runs) { run ->
45 |             val func = LinearFunc(
46 |                 SimpleTileCoding(1,
47 |                                  prob.states.size,
48 |                                  1,
49 |                                  0.0) { (s) -> (s as IndexedState)[0].toDouble() }
50 |             )
51 |             var rms = 0.0
52 |             prob.`True Online TD(λ) prediction`(
53 |                 Vfunc = func, π = π,
54 |                 α = α, λ = λ,
55 |                 episodes = episodes,
56 |                 episodeListener = { _, _ ->
57 |                   var error = 0.0
58 |                   for (s in prob.states)
59 |                     error += pow(func(s) - realV[s[0]], 2)
60 |                   error /= prob.states.size
61 |                   rms += sqrt(error)
62 |                 })
63 |             println("finish λ=$λ α=${α.format(2)} run=$run")
64 |             rms
65 |           }.await { rms_sum += it }
66 |           println("finish λ=$λ α=$α")
67 |           tuple2(α, rms_sum / (episodes * runs))
68 |         }.await { (α, rms) ->
69 |           if (rms < truncateValue)
70 |             line[α] = rms
71 |         }
72 |         println("finish λ=$λ")
73 |       }
74 |     }
75 |     D2DChart.charts += chart
76 |     Application.launch(ChartApp::class.java)
77 |   }
78 | }


--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/func_approx/on_policy/Test Optimal Episodic Semi-gradient QLearning control.kt:
--------------------------------------------------------------------------------
 1 | @file:Suppress("UNCHECKED_CAST", "NAME_SHADOWING")
 2 | 
 3 | package lab.mars.rl.algo.func_approx.on_policy
 4 | 
 5 | import javafx.application.Application
 6 | import lab.mars.rl.model.impl.func.LinearFunc
 7 | import lab.mars.rl.model.impl.func.SuttonTileCoding
 8 | import lab.mars.rl.model.impl.mdp.DefaultAction
 9 | import lab.mars.rl.model.impl.mdp.EpsilonGreedyFunctionPolicy
10 | import lab.mars.rl.problem.MountainCar
11 | import lab.mars.rl.problem.MountainCar.CarState
12 | import lab.mars.rl.util.tuples.tuple2
13 | import lab.mars.rl.util.ui.MountainCarUI
14 | import org.junit.Test
15 | import java.util.concurrent.CountDownLatch
16 | import kotlin.concurrent.thread
17 | 
18 | class `Test Optimal Episodic Semi-gradient QLearning control` {
19 |   
20 |   @Test
21 |   fun `Mountain Car UI`() {
22 |     val prob = MountainCar.make()
23 |     val feature = SuttonTileCoding(511, 8, doubleArrayOf(8 / (MountainCar.POSITION_MAX - MountainCar.POSITION_MIN),
24 |                                                          8 / (MountainCar.VELOCITY_MAX - MountainCar.VELOCITY_MIN))) { (s, a) ->
25 |       s as CarState
26 |       a as DefaultAction<Int, CarState>
27 |       tuple2(doubleArrayOf(s.position, s.velocity), intArrayOf(a.value))
28 |     }
29 |     val func = LinearFunc(feature)
30 |     
31 |     val episodes = intArrayOf(1, 12, 104, 1000, 9000)
32 |     val latch = CountDownLatch(1)
33 |     thread {
34 |       latch.await()
35 |       prob.`Episodic semi-gradient QLearning control`(
36 |           Q = func,
37 |           π = EpsilonGreedyFunctionPolicy(func, 0.0),
38 |           α = 0.3 / 8,
39 |           episodes = 9000,
40 |           stepListener = step@{ episode, step, s, a ->
41 |             if (episode !in episodes) return@step
42 |             MountainCarUI.render(episode, step, s as CarState, a as DefaultAction<Int, CarState>)
43 |           })
44 |     }
45 |     MountainCarUI.after = { latch.countDown() }
46 |     Application.launch(MountainCarUI::class.java)
47 |   }
48 |   
49 | }


--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/func_approx/prediction/Test Function Approximator Coarse Coding.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.func_approx.prediction
 2 | 
 3 | import javafx.application.Application
 4 | import lab.mars.rl.model.impl.func.LinearFunc
 5 | import lab.mars.rl.model.impl.func.SimpleCoarseCoding
 6 | import lab.mars.rl.problem.SquareWave
 7 | import lab.mars.rl.problem.WaveState
 8 | import lab.mars.rl.util.format
 9 | import lab.mars.rl.util.matrix.times
10 | import lab.mars.rl.util.ui.ChartApp
11 | import lab.mars.rl.util.ui.D2DChart
12 | import lab.mars.rl.util.ui.Line
13 | import lab.mars.rl.util.ui.LineChart
14 | import org.junit.Test
15 | 
16 | class `Coarse Coding` {
17 |   @Test
18 |   fun `Coarse Coding`() {
19 |     val alpha = 0.2
20 |     val numOfSamples = listOf(10, 40, 160, 2560, 10240)
21 |     val featureWidths = listOf(0.2, .4, 1.0)
22 |     for (numOfSample in numOfSamples) {
23 |       val chart = LineChart("$numOfSample samples", "state", "value")
24 |       for (featureWidth in featureWidths) {
25 |         val line = Line("feature width: ${featureWidth.format(1)}")
26 |         val feature = SimpleCoarseCoding(featureWidth,
27 |                                          SquareWave.domain, 50) { (s) -> (s as WaveState).x }
28 |         val func = LinearFunc(feature)
29 |         repeat(numOfSample) {
30 |           val (s, y) = SquareWave.sample()
31 |           func.w += alpha / feature.features.sumBy { if (it.contains(feature.conv(arrayOf(s)))) 1 else 0 } * (y - func(s)) * func.`∇`(s)
32 |         }
33 |         for (i in 0 until SquareWave.maxResolution) {
34 |           val s = WaveState(i * 2.0 / SquareWave.maxResolution)
35 |           val y = func(s)
36 |           line[i * 2.0 / SquareWave.maxResolution] = y
37 |         }
38 |         chart += line
39 |       }
40 |       D2DChart.charts += chart
41 |     }
42 |     Application.launch(ChartApp::class.java)
43 |   }
44 | }


--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/func_approx/prediction/Test Prediction Gradient MC.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.func_approx.prediction
 2 | 
 3 | import javafx.application.Application
 4 | import lab.mars.rl.algo.td.`Tabular TD(0)`
 5 | import lab.mars.rl.model.impl.func.StateAggregation
 6 | import lab.mars.rl.model.impl.mdp.IndexedState
 7 | import lab.mars.rl.problem.`1000-state RandomWalk`
 8 | import lab.mars.rl.util.format
 9 | import lab.mars.rl.util.ui.ChartApp
10 | import lab.mars.rl.util.ui.D2DChart
11 | import lab.mars.rl.util.ui.Line
12 | import lab.mars.rl.util.ui.LineChart
13 | import org.junit.Test
14 | 
15 | class `Test Prediction Gradient MC` {
16 |   @Test
17 |   fun `1000-state Random walk`() {
18 |     val chart = LineChart("V", "state", "value")
19 |     val (prob, π) = `1000-state RandomWalk`.make()
20 |     val V = prob.`Tabular TD(0)`(π = π, episodes = 100000, α = 0.1)
21 |     prob.apply {
22 |       val line = Line("TD")
23 |       for (s in states) {
24 |         println("${V[s].format(2)} ")
25 |         line[s[0]] = V[s]
26 |       }
27 |       chart += line
28 |     }
29 |     
30 |     val func = StateAggregation(`1000-state RandomWalk`.num_states + 2, 10) { (s) -> (s as IndexedState)[0] }
31 |     prob.`Gradient Monte Carlo algorithm`(
32 |         v = func, π = π,
33 |         α = 2e-5,
34 |         episodes = 100000
35 |     )
36 |     prob.apply {
37 |       val line = Line("gradient MC")
38 |       for (s in states) {
39 |         println("${func(s).format(2)} ")
40 |         line[s[0]] = func(s)
41 |       }
42 |       chart += line
43 |     }
44 |     D2DChart.charts += chart
45 |     Application.launch(ChartApp::class.java)
46 |   }
47 |   
48 | }


--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/func_approx/prediction/Test Prediction LSTD.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.func_approx.prediction
 2 | 
 3 | import javafx.application.Application
 4 | import lab.mars.rl.algo.td.`Tabular TD(0)`
 5 | import lab.mars.rl.model.impl.func.LinearFunc
 6 | import lab.mars.rl.model.impl.func.SimpleTileCoding
 7 | import lab.mars.rl.model.impl.mdp.IndexedState
 8 | import lab.mars.rl.problem.`1000-state RandomWalk`
 9 | import lab.mars.rl.util.format
10 | import lab.mars.rl.util.ui.ChartApp
11 | import lab.mars.rl.util.ui.D2DChart
12 | import lab.mars.rl.util.ui.Line
13 | import lab.mars.rl.util.ui.LineChart
14 | import org.apache.commons.math3.util.FastMath.ceil
15 | import org.junit.Test
16 | 
17 | class `Test Prediction LSTD` {
18 |   @Test
19 |   fun `1000-state RandomWalk`() {
20 |     val chart = LineChart("V", "state", "value")
21 |     val (prob, π) = `1000-state RandomWalk`.make()
22 |     val V = prob.`Tabular TD(0)`(π = π, episodes = 100000, α = 0.1)
23 |     prob.apply {
24 |       val line = Line("TD")
25 |       for (s in states) {
26 |         println("${V[s].format(2)} ")
27 |         line[s[0]] = V[s]
28 |       }
29 |       chart += line
30 |     }
31 |     
32 |     val numOfTilings = 50
33 |     val feature = SimpleTileCoding(numOfTilings,
34 |                                    5,
35 |                                    ceil(`1000-state RandomWalk`.num_states / 5.0).toInt(),
36 |                                    4.0) { (s) -> ((s as IndexedState)[0] - 1).toDouble() }
37 |     val func = LinearFunc(feature)
38 |     prob.LSTD(vFunc = func, π = π, ε = 1.0, episodes = 100)
39 |     prob.apply {
40 |       val line = Line("LSTD")
41 |       for (s in states) {
42 |         println("${func(s).format(2)} ")
43 |         line[s[0]] = func(s)
44 |       }
45 |       chart += line
46 |     }
47 |     D2DChart.charts += chart
48 |     Application.launch(ChartApp::class.java)
49 |   }
50 | }


--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/func_approx/prediction/Test Prediction Semi-gradient TD(0).kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.func_approx.prediction
 2 | 
 3 | import javafx.application.Application
 4 | import lab.mars.rl.algo.td.`Tabular TD(0)`
 5 | import lab.mars.rl.model.impl.func.StateAggregation
 6 | import lab.mars.rl.model.impl.mdp.IndexedState
 7 | import lab.mars.rl.problem.`1000-state RandomWalk`
 8 | import lab.mars.rl.util.format
 9 | import lab.mars.rl.util.ui.ChartApp
10 | import lab.mars.rl.util.ui.D2DChart
11 | import lab.mars.rl.util.ui.Line
12 | import lab.mars.rl.util.ui.LineChart
13 | import org.junit.Test
14 | 
15 | class `Test Prediction Semi-gradient TD(0)` {
16 |   @Test
17 |   fun `1000-state Random walk`() {
18 |     val chart = LineChart("V", "state", "value")
19 |     val (prob, π) = `1000-state RandomWalk`.make()
20 |     val V = prob.`Tabular TD(0)`(π = π, episodes = 100000, α = 0.1)
21 |     prob.apply {
22 |       val line = Line("TD")
23 |       for (s in states) {
24 |         println("${V[s].format(2)} ")
25 |         line[s[0]] = V[s]
26 |       }
27 |       chart += line
28 |     }
29 |     
30 |     val func = StateAggregation(`1000-state RandomWalk`.num_states + 2,
31 |                                 10) { (s) -> (s as IndexedState)[0] }
32 |     prob.`Semi-gradient TD(0)`(v = func, π = π, α = 2e-4, episodes = 100000)
33 |     prob.apply {
34 |       val line = Line("Semi-gradient TD(0)")
35 |       for (s in states) {
36 |         println("${func(s).format(2)} ")
37 |         line[s[0]] = func(s)
38 |       }
39 |       chart += line
40 |     }
41 |     D2DChart.charts += chart
42 |     Application.launch(ChartApp::class.java)
43 |   }
44 | }


--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/func_approx/prediction/Test Prediction n-step Semi-gradient TD.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.func_approx.prediction
 2 | 
 3 | import javafx.application.Application
 4 | import lab.mars.rl.algo.td.`Tabular TD(0)`
 5 | import lab.mars.rl.model.impl.func.StateAggregation
 6 | import lab.mars.rl.model.impl.mdp.IndexedState
 7 | import lab.mars.rl.problem.`1000-state RandomWalk`
 8 | import lab.mars.rl.util.format
 9 | import lab.mars.rl.util.ui.ChartApp
10 | import lab.mars.rl.util.ui.D2DChart
11 | import lab.mars.rl.util.ui.Line
12 | import lab.mars.rl.util.ui.LineChart
13 | import org.junit.Test
14 | 
15 | class `Test Prediction n-step Semi-gradient TD` {
16 |   @Test
17 |   fun `1000-state Random walk`() {
18 |     val chart = LineChart("V", "state", "value")
19 |     val (prob, π) = `1000-state RandomWalk`.make()
20 |     val V = prob.`Tabular TD(0)`(π = π, episodes = 100000, α = 0.1)
21 |     prob.apply {
22 |       val line = Line("TD")
23 |       for (s in states) {
24 |         println("${V[s].format(2)} ")
25 |         line[s[0]] = V[s]
26 |       }
27 |       chart += line
28 |     }
29 |     
30 |     val func = StateAggregation(`1000-state RandomWalk`.num_states + 2,
31 |                                 10) { (s) -> (s as IndexedState)[0] }
32 |     prob.`n-step semi-gradient TD`(
33 |         v = func, π = π, n = 10,
34 |         α = 2e-4,
35 |         episodes = 100000)
36 |     prob.apply {
37 |       val line = Line("n-step semi-gradient TD")
38 |       for (s in states) {
39 |         println("${func(s).format(2)} ")
40 |         line[s[0]] = func(s)
41 |       }
42 |       chart += line
43 |     }
44 |     D2DChart.charts += chart
45 |     Application.launch(ChartApp::class.java)
46 |   }
47 | }


--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/mc/Test Optimal MC Exploring Starts.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.mc
 2 | 
 3 | import lab.mars.rl.problem.Blackjack
 4 | import lab.mars.rl.util.printBlackjack
 5 | import org.junit.Test
 6 | 
 7 | class `Test Optimal MC Exploring Starts` {
 8 |   @Test
 9 |   fun `Blackjack`() {
10 |     val (prob, π) = Blackjack.make()
11 |     val (PI, V) = prob.`Monte Carlo Exploring Starts`(π, 1000_000)
12 |     printBlackjack(prob, PI, V)
13 |   }
14 | }


--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/mc/Test Optimal MC Off-policy.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.mc
 2 | 
 3 | import lab.mars.rl.problem.Blackjack
 4 | import lab.mars.rl.util.printBlackjack
 5 | import org.junit.Test
 6 | 
 7 | class `Test Optimal MC Off-policy` {
 8 |   @Test
 9 |   fun `Blackjack`() {
10 |     val (prob) = Blackjack.make()
11 |     val (π, V) = prob.`Off-policy MC Optimal`(1000_000)
12 |     printBlackjack(prob, π, V)
13 |   }
14 | }


--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/mc/Test Optimal MC On-policy first-visit.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.mc
 2 | 
 3 | import lab.mars.rl.problem.Blackjack
 4 | import lab.mars.rl.util.printBlackjack
 5 | import org.junit.Test
 6 | 
 7 | class `Test Optimal MC On-policy first-visit` {
 8 |   @Test
 9 |   fun `Blackjack`() {
10 |     val (prob) = Blackjack.make()
11 |     val (π, V) = prob.`On-policy first-visit MC control`(1000_000)
12 |     printBlackjack(prob, π, V)
13 |   }
14 | }


--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/mc/Test Prediction MC Off-policy.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.mc
 2 | 
 3 | import lab.mars.rl.problem.Blackjack
 4 | import lab.mars.rl.util.printBlackjack
 5 | import org.junit.Test
 6 | 
 7 | class `Test Monte Carlo Off-policy prediction` {
 8 |   @Test
 9 |   fun `Blackjack`() {
10 |     val (prob, π) = Blackjack.make()
11 |     val V = prob.`Off-policy MC prediction`(π, 500_000)
12 |     printBlackjack(prob, π, V)
13 |   }
14 | }


--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/mc/Test Prediction Monte Carlo Prediction.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.mc
 2 | 
 3 | import lab.mars.rl.problem.Blackjack
 4 | import lab.mars.rl.problem.RandomWalk
 5 | import lab.mars.rl.util.format
 6 | import lab.mars.rl.util.printBlackjack
 7 | import org.junit.Test
 8 | 
 9 | class `Test Prediction Monte Carlo Prediction` {
10 |   @Test
11 |   fun `Blackjack`() {
12 |     val (prob, π) = Blackjack.make()
13 |     val V = prob.`First Visit Monte Carlo Prediction`(π, 500_000)
14 |     printBlackjack(prob, π, V)
15 |   }
16 |   
17 |   @Test
18 |   fun `RandomWalk`() {
19 |     val (prob, π) = RandomWalk.make()
20 |     val V = prob.`First Visit Monte Carlo Prediction`(π, 1000)
21 |     prob.apply {
22 |       for (s in states) {
23 |         println("${V[s].format(2)} ")
24 |       }
25 |     }
26 |   }
27 | }


--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/ntd/Test Optimal n-TD Off-policy Sarsa.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.ntd
 2 | 
 3 | import lab.mars.rl.algo.average_α
 4 | import lab.mars.rl.model.isNotTerminal
 5 | import lab.mars.rl.problem.Blackjack
 6 | import lab.mars.rl.problem.CliffWalking
 7 | import lab.mars.rl.problem.WindyGridworld
 8 | import lab.mars.rl.util.math.argmax
 9 | import lab.mars.rl.util.printBlackjack
10 | import org.junit.Test
11 | 
12 | class `Test Optimal n-TD Off-policy Sarsa` {
13 |   
14 |   @Test
15 |   fun `Blackjack constant alpha`() {
16 |     val (prob) = Blackjack.make()
17 |     val (π, V) = prob.`N-step off-policy sarsa`(
18 |         n = Int.MAX_VALUE,
19 |         ε = 0.1,
20 |         α = { _, _ -> 0.1 },
21 |         episodes = 1000000)
22 |     printBlackjack(prob, π, V)
23 |   }
24 |   
25 |   @Test
26 |   fun `Blackjack average alpha`() {
27 |     val (prob) = Blackjack.make()
28 |     val (π, V) = prob.`N-step off-policy sarsa`(
29 |         n = Int.MAX_VALUE,
30 |         ε = 0.1,
31 |         α = average_α(prob),
32 |         episodes = 1000000)
33 |     printBlackjack(prob, π, V)
34 |   }
35 |   
36 |   @Test
37 |   fun `Cliff Walking`() {
38 |     val prob = CliffWalking.make()
39 |     val (π) = prob.`N-step off-policy sarsa`(
40 |         n = 10,
41 |         ε = 0.1,
42 |         α = { _, _ -> 0.5 },
43 |         episodes = 10000)
44 |     var s = prob.started()
45 |     var sum = 0.0
46 |     print(s)
47 |     while (s.isNotTerminal) {
48 |       val a = argmax(s.actions) { π[s, it] }
49 |       val possible = a.sample()
50 |       s = possible.next
51 |       sum += possible.reward
52 |       print("${WindyGridworld.desc_move[a[0]]}$s")
53 |     }
54 |     println("\nreturn=$sum")//optimal=-12
55 |   }
56 | }


--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/ntd/Test Optimal n-TD Q(σ).kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.ntd
 2 | 
 3 | import lab.mars.rl.algo.average_α
 4 | import lab.mars.rl.problem.Blackjack
 5 | import lab.mars.rl.util.math.Rand
 6 | import lab.mars.rl.util.printBlackjack
 7 | import org.junit.Test
 8 | 
 9 | class `Test Optimal n-TD Q(σ)` {
10 |   @Test
11 |   fun `Blackjack σ=0`() {
12 |     val (prob) = Blackjack.make()
13 |     val (π, V) = prob.`N-step off-policy n-step Q(σ)`(
14 |         n = Int.MAX_VALUE,
15 |         σ = { 0 },//same as treebackup
16 |         ε = 0.1,
17 |         α = average_α(prob),
18 |         episodes = 1000000)
19 |     printBlackjack(prob, π, V)
20 |   }
21 |   
22 |   @Test
23 |   fun `Blackjack σ=1`() {
24 |     val (prob) = Blackjack.make()
25 |     val (π, V) = prob.`N-step off-policy n-step Q(σ)`(
26 |         n = Int.MAX_VALUE,
27 |         σ = { 1 },//like off-policy sarsa
28 |         ε = 0.1,
29 |         α = average_α(prob),
30 |         episodes = 1000000)
31 |     printBlackjack(prob, π, V)
32 |   }
33 |   
34 |   @Test
35 |   fun `Blackjack σ=%2`() {
36 |     val (prob) = Blackjack.make()
37 |     val (π, V) = prob.`N-step off-policy n-step Q(σ)`(
38 |         n = Int.MAX_VALUE,
39 |         σ = { it % 2 },
40 |         ε = 0.1,
41 |         α = average_α(prob),
42 |         episodes = 1000000)
43 |     printBlackjack(prob, π, V)
44 |   }
45 |   
46 |   @Test
47 |   fun `Blackjack σ=random`() {
48 |     val (prob) = Blackjack.make()
49 |     val (π, V) = prob.`N-step off-policy n-step Q(σ)`(
50 |         n = Int.MAX_VALUE,
51 |         σ = { Rand().nextInt(2) },
52 |         ε = 0.1,
53 |         α = average_α(prob),
54 |         episodes = 1000000)
55 |     printBlackjack(prob, π, V)
56 |   }
57 | }


--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/ntd/Test Optimal n-TD Sarsa.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.ntd
 2 | 
 3 | import lab.mars.rl.algo.average_α
 4 | import lab.mars.rl.model.isNotTerminal
 5 | import lab.mars.rl.problem.Blackjack
 6 | import lab.mars.rl.problem.WindyGridworld
 7 | import lab.mars.rl.util.math.argmax
 8 | import lab.mars.rl.util.printBlackjack
 9 | import org.junit.Test
10 | 
11 | class `Test Optimal n-TD Sarsa` {
12 |   @Test
13 |   fun `Blackjack constant alpha`() {
14 |     val (prob) = Blackjack.make()
15 |     val (π, V) = prob.`N-step Sarsa`(
16 |         n = Int.MAX_VALUE,
17 |         ε = 0.1,
18 |         α = { _, _ -> 0.1 },
19 |         episodes = 1000000)
20 |     printBlackjack(prob, π, V)
21 |   }
22 |   
23 |   @Test
24 |   fun `Blackjack average alpha`() {
25 |     val (prob) = Blackjack.make()
26 |     val (π, V) = prob.`N-step Sarsa`(
27 |         n = Int.MAX_VALUE,
28 |         ε = 0.1,
29 |         α = { _, _ -> 0.1 },
30 |         episodes = 1000000)
31 |     printBlackjack(prob, π, V)
32 |   }
33 |   
34 |   @Test
35 |   fun `WindyGridworld`() {
36 |     val prob = WindyGridworld.make()
37 |     val (π) = prob.`N-step Sarsa`(
38 |         n = 10,
39 |         ε = 0.1,
40 |         α = average_α(prob),
41 |         episodes = 1000000)
42 |     var s = prob.started()
43 |     var sum = 0.0
44 |     print(s)
45 |     while (s.isNotTerminal) {
46 |       val a = argmax(s.actions) { π[s, it] }
47 |       val possible = a.sample()
48 |       s = possible.next
49 |       sum += possible.reward
50 |       print("${WindyGridworld.desc_move[a[0]]}$s")
51 |     }
52 |     println("\nreturn=$sum")//optimal=-14
53 |   }
54 | }


--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/ntd/Test Optimal n-TD Treebackup.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.ntd
 2 | 
 3 | import lab.mars.rl.algo.average_α
 4 | import lab.mars.rl.problem.Blackjack
 5 | import lab.mars.rl.util.printBlackjack
 6 | import org.junit.Test
 7 | 
 8 | class `Test Optimal n-TD Treebackup` {
 9 |   @Test
10 |   fun `Blackjack constant alpha`() {
11 |     val (prob) = Blackjack.make()
12 |     val (π, V) = prob.`N-step Treebackup`(
13 |         n = 4, ε = 0.1,
14 |         α = { _, _ -> 0.1 },
15 |         episodes = 1000000)
16 |     printBlackjack(prob, π, V)
17 |   }
18 |   
19 |   @Test
20 |   fun `Blackjack average alpha`() {
21 |     val (prob) = Blackjack.make()
22 |     val (π, V) = prob.`N-step Treebackup`(
23 |         n = Int.MAX_VALUE, ε = 0.1,
24 |         α = average_α(prob),
25 |         episodes = 1000000)
26 |     printBlackjack(prob, π, V)
27 |   }
28 |   
29 | }


--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/ntd/Test Prediction n-TD.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.ntd
 2 | 
 3 | import ch.qos.logback.classic.Level
 4 | import javafx.application.Application
 5 | import kotlinx.coroutines.runBlocking
 6 | import lab.mars.rl.problem.Blackjack
 7 | import lab.mars.rl.problem.`19-state RandomWalk`
 8 | import lab.mars.rl.util.*
 9 | import lab.mars.rl.util.tuples.tuple2
10 | import lab.mars.rl.util.ui.ChartApp
11 | import lab.mars.rl.util.ui.D2DChart
12 | import lab.mars.rl.util.ui.Line
13 | import lab.mars.rl.util.ui.LineChart
14 | import org.apache.commons.math3.util.FastMath.pow
15 | import org.apache.commons.math3.util.FastMath.sqrt
16 | import org.junit.Test
17 | 
18 | class `Test Prediction n-TD` {
19 |   @Test
20 |   fun `Blackjack`() {
21 |     val (prob, π) = Blackjack.make()
22 |     val V = prob.`N-step TD prediction`(
23 |         n = 102400, π = π,
24 |         α = 0.1, episodes = 500000)
25 |     printBlackjack(prob, π, V)
26 |   }
27 |   
28 |   @Test
29 |   fun `RandomWalk`() {
30 |     val (prob, π) = `19-state RandomWalk`.make()
31 |     val V = prob.`N-step TD prediction`(
32 |         n = 8, π = π,
33 |         α = 0.1,
34 |         episodes = 1000)
35 |     prob.apply {
36 |       for (s in states) {
37 |         println("${V[s].format(2)} ")
38 |       }
39 |     }
40 |   }
41 |   
42 |   @Test
43 |   fun `RandomWalk RMS`() {
44 |     logLevel(Level.ERROR)
45 |     
46 |     val (prob, π) = `19-state RandomWalk`.make()
47 |     val realV = listOf(-20..20 step 2) { it / 20.0 }
48 |     realV[0] = 0.0
49 |     realV[20] = 0.0
50 |     
51 |     val ns = listOf(10) { pow(2.0, it).toInt() }
52 |     val αs = listOf(110) { it * 0.01 }
53 |     
54 |     val episodes = 10
55 |     val runs = 100
56 |     val truncateValue = 0.55
57 |     
58 |     val chart = LineChart("RMS", "α", "Average RMS")
59 |     runBlocking {
60 |       for (n in ns) {
61 |         val line = Line("n=$n")
62 |         chart += line
63 |         asyncs(αs) { α ->
64 |           var rms_sum = 0.0
65 |           asyncs(runs) {
66 |             var rms = 0.0
67 |             prob.`N-step TD prediction`(
68 |                 n = n, π = π,
69 |                 α = α,
70 |                 episodes = episodes,
71 |                 episodeListener = { _, V ->
72 |                   var error = 0.0
73 |                   for (s in prob.states)
74 |                     error += pow(V[s] - realV[s[0]], 2)
75 |                   error /= prob.states.size
76 |                   rms += sqrt(error)
77 |                 })
78 |             rms
79 |           }.await { rms_sum += it }
80 |           tuple2(α, rms_sum / (episodes * runs))
81 |         }.await { (α, rms) ->
82 |           if (rms < truncateValue)
83 |             line[α] = rms
84 |         }
85 |         println("finish n=$n")
86 |       }
87 |     }
88 |     D2DChart.charts += chart
89 |     Application.launch(ChartApp::class.java)
90 |   }
91 | }


--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/td/Test Optimal TD Doubel Q-Learning.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.td
 2 | 
 3 | import lab.mars.rl.algo.average_α
 4 | import lab.mars.rl.model.isNotTerminal
 5 | import lab.mars.rl.problem.Blackjack
 6 | import lab.mars.rl.problem.CliffWalking
 7 | import lab.mars.rl.problem.MaximizationBias
 8 | import lab.mars.rl.problem.WindyGridworld
 9 | import lab.mars.rl.util.math.argmax
10 | import lab.mars.rl.util.printBlackjack
11 | import org.junit.Test
12 | 
13 | class `Test Optimal TD Doubel Q-Learning` {
14 |   @Test
15 |   fun `Blackjack constant alpha`() {
16 |     val (prob) = Blackjack.make()
17 |     val (π, V) = prob.DoubleQLearning(ε = 0.1, α = { _, _ -> 0.1 }, episodes = 1000000)
18 |     printBlackjack(prob, π, V)
19 |   }
20 |   
21 |   @Test
22 |   fun `Blackjack average alpha`() {
23 |     val (prob) = Blackjack.make()
24 |     val (π, V) = prob.DoubleQLearning(ε = 0.1, α = average_α(prob), episodes = 1000000)
25 |     printBlackjack(prob, π, V)
26 |   }
27 |   
28 |   @Test
29 |   fun `Cliff Walking`() {
30 |     val prob = CliffWalking.make()
31 |     val (π) = prob.DoubleQLearning(ε = 0.1, α = { _, _ -> 0.5 }, episodes = 10000)
32 |     var s = prob.started()
33 |     var sum = 0.0
34 |     print(s)
35 |     while (s.isNotTerminal) {
36 |       val a = argmax(s.actions) { π[s, it] }
37 |       val possible = a.sample()
38 |       s = possible.next
39 |       sum += possible.reward
40 |       print("${WindyGridworld.desc_move[a[0]]}$s")
41 |     }
42 |     println("\nreturn=$sum")//optimal=-12
43 |   }
44 |   
45 |   @Test
46 |   fun `Maximization Bias Double Q-Learning`() {
47 |     val prob = MaximizationBias.make()
48 |     val (π) = prob.QLearning(ε = 0.1, α = { _, _ -> 0.1 }, episodes = 10)
49 |     val A = prob.started()
50 |     println(π(A))
51 |     
52 |     val (π2) = prob.DoubleQLearning(ε = 0.1, α = { _, _ -> 0.1 }, episodes = 10)
53 |     println(π2(A))
54 |     
55 |   }
56 | }


--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/td/Test Optimal TD Expected sarsa.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.td
 2 | 
 3 | import lab.mars.rl.algo.average_α
 4 | import lab.mars.rl.model.isNotTerminal
 5 | import lab.mars.rl.problem.Blackjack
 6 | import lab.mars.rl.problem.CliffWalking
 7 | import lab.mars.rl.problem.WindyGridworld
 8 | import lab.mars.rl.util.math.argmax
 9 | import lab.mars.rl.util.printBlackjack
10 | import org.junit.Test
11 | 
12 | class `Test Optimal TD Expected sarsa` {
13 |   @Test
14 |   fun `Blackjack constant alpha`() {
15 |     val (prob) = Blackjack.make()
16 |     val (π, V) = prob.expectedSarsa(ε = 0.1, α = { _, _ -> 0.5 }, episodes = 1000_000)
17 |     printBlackjack(prob, π, V)
18 |   }
19 |   
20 |   @Test
21 |   fun `Blackjack average alpha`() {
22 |     val (prob) = Blackjack.make()
23 |     val (π, V) = prob.expectedSarsa(ε = 0.1, α = average_α(prob), episodes = 1000_000)
24 |     printBlackjack(prob, π, V)
25 |   }
26 |   
27 |   @Test
28 |   fun `Cliff Walking TD Expected Sarsa`() {
29 |     val prob = CliffWalking.make()
30 |     val (PI) = prob.expectedSarsa(ε = 0.1, α = { _, _ -> 0.5 }, episodes = 1000_000)
31 |     var s = prob.started()
32 |     var sum = 0.0
33 |     print(s)
34 |     while (s.isNotTerminal) {
35 |       val a = argmax(s.actions) { PI[s, it] }
36 |       val possible = a.sample()
37 |       s = possible.next
38 |       sum += possible.reward
39 |       print("${WindyGridworld.desc_move[a[0]]}$s")
40 |     }
41 |     println("\nreturn=$sum")//optimal=-12
42 |   }
43 | }


--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/td/Test Optimal TD Q-Learning.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.td
 2 | 
 3 | import lab.mars.rl.algo.average_α
 4 | import lab.mars.rl.model.isNotTerminal
 5 | import lab.mars.rl.problem.Blackjack
 6 | import lab.mars.rl.problem.CliffWalking
 7 | import lab.mars.rl.problem.WindyGridworld
 8 | import lab.mars.rl.util.math.argmax
 9 | import lab.mars.rl.util.printBlackjack
10 | import org.junit.Test
11 | 
12 | class `Test Optimal TD Q-Learning` {
13 |   @Test
14 |   fun `Blackjack constant alpha`() {
15 |     val (prob) = Blackjack.make()
16 |     val (π, V) = prob.QLearning(ε = 0.1, α = { _, _ -> 0.5 }, episodes = 100000)
17 |     printBlackjack(prob, π, V)
18 |   }
19 |   
20 |   @Test
21 |   fun `Blackjack average alpha`() {
22 |     val (prob) = Blackjack.make()
23 |     val (π, V) = prob.QLearning(ε = 0.1, α = average_α(prob), episodes = 1000000)
24 |     printBlackjack(prob, π, V)
25 |   }
26 |   
27 |   @Test
28 |   fun `WindyGridworld`() {
29 |     val prob = WindyGridworld.make()
30 |     val (π) = prob.QLearning(ε = 0.1, α = { _, _ -> 0.5 }, episodes = 1000)
31 |     var s = prob.started()
32 |     var sum = 0.0
33 |     print(s)
34 |     while (s.isNotTerminal) {
35 |       val a = argmax(s.actions) { π[s, it] }
36 |       val possible = a.sample()
37 |       s = possible.next
38 |       sum += possible.reward
39 |       print("${WindyGridworld.desc_move[a[0]]}$s")
40 |     }
41 |     println("\nreturn=$sum")//optimal=-14
42 |   }
43 |   
44 |   @Test
45 |   fun `WindyGridworld King's Move`() {
46 |     val prob = WindyGridworld.make(true)
47 |     val (π) = prob.QLearning(ε = 0.1, α = { _, _ -> 0.5 }, episodes = 1000)
48 |     var s = prob.started()
49 |     var sum = 0.0
50 |     print(s)
51 |     while (s.isNotTerminal) {
52 |       val a = argmax(s.actions) { π[s, it] }
53 |       val possible = a.sample()
54 |       s = possible.next
55 |       sum += possible.reward
56 |       print("${WindyGridworld.desc_king_move[a[0]]}$s")
57 |     }
58 |     println("\nreturn=$sum")//optimal=-6
59 |   }
60 |   
61 |   @Test
62 |   fun `Cliff Walking`() {
63 |     val prob = CliffWalking.make()
64 |     val (π) = prob.QLearning(ε = 0.1, α = { _, _ -> 0.5 }, episodes = 1000)
65 |     var s = prob.started()
66 |     var sum = 0.0
67 |     print(s)
68 |     while (s.isNotTerminal) {
69 |       val a = argmax(s.actions) { π[s, it] }
70 |       val possible = a.sample()
71 |       s = possible.next
72 |       sum += possible.reward
73 |       print("${WindyGridworld.desc_move[a[0]]}$s")
74 |     }
75 |     println("\nreturn=$sum")//optimal=-12
76 |   }
77 | }


--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/td/Test Optimal TD Sarsa.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.td
 2 | 
 3 | import lab.mars.rl.algo.average_α
 4 | import lab.mars.rl.model.isNotTerminal
 5 | import lab.mars.rl.problem.Blackjack
 6 | import lab.mars.rl.problem.CliffWalking
 7 | import lab.mars.rl.problem.WindyGridworld
 8 | import lab.mars.rl.util.math.argmax
 9 | import lab.mars.rl.util.printBlackjack
10 | import org.junit.Test
11 | 
12 | class `Test Optimal TD Sarsa` {
13 |   
14 |   @Test
15 |   fun `Blackjack constant alpha`() {
16 |     val (prob) = Blackjack.make()
17 |     val (π, V) = prob.sarsa(ε = 0.1, α = { _, _ -> 0.5 }, episodes = 100000)
18 |     printBlackjack(prob, π, V)
19 |   }
20 |   
21 |   @Test
22 |   fun `Blackjack average alpha`() {
23 |     val (prob) = Blackjack.make()
24 |     val (π, V) = prob.sarsa(ε = 0.1, α = average_α(prob), episodes = 100000)
25 |     printBlackjack(prob, π, V)
26 |   }
27 |   
28 |   @Test
29 |   fun `WindyGridworld`() {
30 |     val prob = WindyGridworld.make()
31 |     val (π) = prob.sarsa(ε = 0.1, α = { _, _ -> 0.5 }, episodes = 1000)
32 |     var s = prob.started()
33 |     var sum = 0.0
34 |     print(s)
35 |     while (s.isNotTerminal) {
36 |       val a = argmax(s.actions) { π[s, it] }
37 |       val possible = a.sample()
38 |       s = possible.next
39 |       sum += possible.reward
40 |       print("${WindyGridworld.desc_move[a[0]]}$s")
41 |     }
42 |     println("\nreturn=$sum")//optimal=-14
43 |   }
44 |   
45 |   @Test
46 |   fun `WindyGridworld King's Move`() {
47 |     val prob = WindyGridworld.make(true)
48 |     val (π) = prob.sarsa(ε = 0.1, α = { _, _ -> 0.5 }, episodes = 1000)
49 |     var s = prob.started()
50 |     var sum = 0.0
51 |     print(s)
52 |     while (s.isNotTerminal) {
53 |       val a = argmax(s.actions) { π[s, it] }
54 |       val possible = a.sample()
55 |       s = possible.next
56 |       sum += possible.reward
57 |       print("${WindyGridworld.desc_king_move[a[0]]}$s")
58 |     }
59 |     println("\nreturn=$sum")//optimal=-6
60 |   }
61 |   
62 |   @Test
63 |   fun `Cliff Walking`() {
64 |     val prob = CliffWalking.make()
65 |     val (π) = prob.sarsa(ε = 0.1, α = { _, _ -> 0.5 }, episodes = 100000)
66 |     var s = prob.started()
67 |     var sum = 0.0
68 |     print(s)
69 |     while (s.isNotTerminal) {
70 |       val a = argmax(s.actions) { π[s, it] }
71 |       val possible = a.sample()
72 |       s = possible.next
73 |       sum += possible.reward
74 |       print("${WindyGridworld.desc_move[a[0]]}$s")
75 |     }
76 |     println("\nreturn=$sum")//optimal=-12
77 |   }
78 | }


--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/td/Test Prediction Tabular TD(0).kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.algo.td
 2 | 
 3 | import lab.mars.rl.problem.Blackjack
 4 | import lab.mars.rl.problem.RandomWalk
 5 | import lab.mars.rl.problem.`1000-state RandomWalk`
 6 | import lab.mars.rl.util.format
 7 | import lab.mars.rl.util.printBlackjack
 8 | import org.junit.Test
 9 | 
10 | class `Test Prediction Tabular TD(0)` {
11 |   @Test
12 |   fun `Blackjack`() {
13 |     val (prob, π) = Blackjack.make()
14 |     val V = prob.`Tabular TD(0)`(π = π, α = 0.1, episodes = 500000)
15 |     printBlackjack(prob, π, V)
16 |   }
17 |   
18 |   @Test
19 |   fun `RandomWalk`() {
20 |     val (prob, π) = RandomWalk.make()
21 |     val V = prob.`Tabular TD(0)`(π = π, α = 0.1, episodes = 1000)
22 |     prob.apply {
23 |       for (s in states) {
24 |         println("${V[s].format(2)} ")
25 |       }
26 |     }
27 |   }
28 |   
29 |   @Test
30 |   fun `1000-state RandomWalk`() {
31 |     val (prob, π) = `1000-state RandomWalk`.make()
32 |     val V = prob.`Tabular TD(0)`(π = π, α = 0.1, episodes = 10000)
33 |     prob.apply {
34 |       for (s in states) {
35 |         println("${V[s].format(2)} ")
36 |       }
37 |     }
38 |   }
39 | }


--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/problem/Test Mountain Car with Actor-Critic.kt:
--------------------------------------------------------------------------------
 1 | @file:Suppress("UNCHECKED_CAST")
 2 | 
 3 | package lab.mars.rl.problem
 4 | 
 5 | import javafx.application.Application
 6 | import lab.mars.rl.algo.policy_gradient.`Actor-Critic with Eligibility Traces (episodic)`
 7 | import lab.mars.rl.model.impl.func.LinearFunc
 8 | import lab.mars.rl.model.impl.func.SuttonTileCoding
 9 | import lab.mars.rl.model.impl.mdp.DefaultAction
10 | import lab.mars.rl.util.matrix.SparseMatrix
11 | import lab.mars.rl.util.tuples.tuple2
12 | import lab.mars.rl.util.ui.MountainCarUI
13 | import org.junit.Test
14 | import java.util.concurrent.CountDownLatch
15 | import kotlin.concurrent.thread
16 | 
17 | class `Test Mountain Car with Actor-Critic` {
18 |   val numTilings = 8
19 |   val positionScale = numTilings / (MountainCar.POSITION_MAX - MountainCar.POSITION_MIN)
20 |   val velocityScale = numTilings / (MountainCar.VELOCITY_MAX - MountainCar.VELOCITY_MIN)
21 |   
22 |   fun func(): LinearFunc<tuple2<DoubleArray, IntArray>> {
23 |     val feature = SuttonTileCoding(511, numTilings) { (s, a) ->
24 |       s as MountainCar.CarState
25 |       a as DefaultAction<Int, MountainCar.CarState>
26 |       tuple2(doubleArrayOf(positionScale * s.position, velocityScale * s.velocity),
27 |              intArrayOf(a.value))
28 |     }
29 |     return LinearFunc(feature)
30 |   }
31 |   
32 |   @Test
33 |   fun `Mountain Car UI`() {
34 |     val prob = MountainCar.make()
35 |     
36 |     val policyFeature = SuttonTileCoding(511, numTilings) { (s, a) ->
37 |       s as MountainCar.CarState
38 |       a as DefaultAction<Int, MountainCar.CarState>
39 |       tuple2(doubleArrayOf(positionScale * s.position, velocityScale * s.velocity),
40 |              intArrayOf(a.value))
41 |     }
42 |     val h = LinearFunc(policyFeature)
43 |     val emptyIntArray = IntArray(0)
44 |     val valueFeature = SuttonTileCoding(511, numTilings) { (s) ->
45 |       s as MountainCar.CarState
46 |       tuple2(doubleArrayOf(positionScale * s.position, velocityScale * s.velocity), emptyIntArray)
47 |     }
48 |     val v = LinearFunc(valueFeature)
49 |     
50 |     val episodes = intArrayOf(1, 12, 104, 1000, 9000)
51 |     val latch = CountDownLatch(1)
52 |     thread {
53 |       latch.await()
54 |       prob.`Actor-Critic with Eligibility Traces (episodic)`(
55 |           h = h, α_θ = 2e-9 / numTilings, λ_θ = 0.96,
56 |           v = v, α_w = 0.6 / numTilings, λ_w = 0.96,
57 |           episodes = 9000,
58 |           z_maker = { m, n -> SparseMatrix(m, n) },
59 |           stepListener = step@{ episode, step, s, a ->
60 |             if (episode !in episodes) return@step
61 |             MountainCarUI.render(episode, step, s as MountainCar.CarState, a as DefaultAction<Int, MountainCar.CarState>)
62 |           })
63 |     }
64 |     MountainCarUI.after = { latch.countDown() }
65 |     Application.launch(MountainCarUI::class.java)
66 |   }
67 | }


--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/problem/`Test FlyPlane Problem with REINFORCE`.kt:
--------------------------------------------------------------------------------
 1 | @file:Suppress("UNCHECKED_CAST")
 2 | 
 3 | package lab.mars.rl.problem
 4 | 
 5 | import javafx.application.Application
 6 | import lab.mars.rl.algo.func_approx.play
 7 | import lab.mars.rl.algo.policy_gradient.`REINFORCE with Baseline (episodic)`
 8 | import lab.mars.rl.model.impl.func.LinearFunc
 9 | import lab.mars.rl.model.impl.func.SuttonTileCoding
10 | import lab.mars.rl.model.impl.mdp.DefaultAction
11 | import lab.mars.rl.model.impl.mdp.SoftmaxpPolicy
12 | import lab.mars.rl.util.tuples.tuple2
13 | import lab.mars.rl.util.ui.D2DGameUI
14 | import org.junit.Test
15 | import java.util.concurrent.CountDownLatch
16 | import kotlin.concurrent.thread
17 | 
18 | class `Test FlyPlane Problem with REINFORCE` {
19 |   @Test
20 |   fun `Fly Plane UI`() {
21 |     val numTilings = 10
22 |     val emptyIntArray = IntArray(0)
23 |     val valueFeature = SuttonTileCoding(1000, numTilings, doubleArrayOf(1 / 100.0, 1 / 100.0, 1 / 10.0, 1 / 10.0)) { (s) ->
24 |       s as FlyPlane.PlaneState
25 |       tuple2(doubleArrayOf(s.loc.x, s.loc.y, s.vel.x, s.vel.y), emptyIntArray)
26 |     }
27 |     val v = LinearFunc(valueFeature)
28 |     val policyFeature = SuttonTileCoding(1000, numTilings, doubleArrayOf(1 / 100.0, 1 / 100.0, 1 / 10.0, 1 / 10.0)) { (s, a) ->
29 |       s as FlyPlane.PlaneState
30 |       a as DefaultAction<Int, FlyPlane.PlaneState>
31 |       tuple2(doubleArrayOf(s.loc.x, s.loc.y, s.vel.x, s.vel.y), intArrayOf(a.value))
32 |     }
33 |     val h = LinearFunc(policyFeature)
34 |     val resolution = 100
35 |     val unit = FlyPlane.fieldWidth / resolution
36 |     val qvalue = Array(resolution) { Array(resolution + 1) { Double.NEGATIVE_INFINITY } }
37 |     var accuG = 0.0
38 |     var wins = 0.0
39 |     var win_step = 0.0
40 |     val episode_round = 100
41 |     val step_round = 10
42 |     val max_episode = 100000
43 |     var episode_base = 0
44 |     var animate = false
45 |     val latch = CountDownLatch(1)
46 |     thread {
47 |       latch.await()
48 |       
49 |       while (true) {
50 |         val prob = FlyPlane.makeRand()
51 |         animate = false
52 |         prob.`REINFORCE with Baseline (episodic)`(
53 |             h = h, α_θ = 1e-12,
54 |             v = v, α_w = 1e-3,
55 |             episodes = max_episode
56 |             )
57 |         animate = true
58 |         prob.play(
59 |             π = SoftmaxpPolicy(h),
60 |             episodes = 10,
61 |             stepListener = { _, _, s, a ->
62 |               Thread.sleep(Math.floor(1000 / 60.0).toLong())
63 |             }
64 |         )
65 |         episode_base += max_episode
66 |       }
67 |     }
68 |     D2DGameUI.apply {
69 |       canvas_width = FlyPlane.fieldWidth
70 |       canvas_height = FlyPlane.fieldWidth
71 |       width = 1200.0
72 |       height = 800.0
73 |       charts.addAll(D2DGameUI.ChartDescription("average return per $episode_round episodes", "episode", "average return"),
74 |                     D2DGameUI.ChartDescription("win rate per $episode_round episodes", "episode", "win rate"),
75 |                     D2DGameUI.ChartDescription("average win step per $episode_round episodes", "episode", "average win step",
76 |                                                yForceZeroInRange = false))
77 |       afterStartup = { gc ->
78 |         latch.countDown()
79 |       }
80 |     }
81 |     Application.launch(D2DGameUI::class.java)
82 |   }
83 | }


--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/util/TestBase.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.util
 2 | 
 3 | import ch.qos.logback.classic.Level
 4 | import ch.qos.logback.classic.LoggerContext
 5 | import lab.mars.rl.model.impl.mdp.IndexedMDP
 6 | import lab.mars.rl.model.impl.mdp.IndexedPolicy
 7 | import lab.mars.rl.model.impl.mdp.StateValueFunction
 8 | import org.slf4j.Logger
 9 | import org.slf4j.LoggerFactory
10 | 
11 | /**
12 |  * <p>
13 |  * Created on 2017-09-06.
14 |  * </p>
15 |  *
16 |  * @author wumo
17 |  */
18 | val ANSI_BLACK = "\u001B[30m"
19 | val ANSI_RED = "\u001B[31m"
20 | val ANSI_GREEN = "\u001B[32m"
21 | val ANSI_YELLOW = "\u001B[33m"
22 | val ANSI_BLUE = "\u001B[34m"
23 | val ANSI_PURPLE = "\u001B[35m"
24 | val ANSI_CYAN = "\u001B[36m"
25 | val ANSI_WHITE = "\u001B[37m"
26 | val ANSI_RESET = "\u001B[0m"
27 | val ANSI_BLACK_BACKGROUND = "\u001B[40m"
28 | val ANSI_RED_BACKGROUND = "\u001B[41m"
29 | val ANSI_GREEN_BACKGROUND = "\u001B[42m"
30 | val ANSI_YELLOW_BACKGROUND = "\u001B[43m"
31 | val ANSI_BLUE_BACKGROUND = "\u001B[44m"
32 | val ANSI_PURPLE_BACKGROUND = "\u001B[45m"
33 | val ANSI_CYAN_BACKGROUND = "\u001B[46m"
34 | val ANSI_WHITE_BACKGROUND = "\u001B[47m"
35 | 
36 | val colors = arrayOf(
37 |     ANSI_WHITE_BACKGROUND + ANSI_WHITE,
38 |     ANSI_BLACK_BACKGROUND + ANSI_BLACK,
39 |     ANSI_RED_BACKGROUND + ANSI_RED,
40 |     ANSI_GREEN_BACKGROUND + ANSI_GREEN,
41 |     ANSI_YELLOW_BACKGROUND + ANSI_YELLOW,
42 |     ANSI_BLUE_BACKGROUND + ANSI_BLUE,
43 |     ANSI_PURPLE_BACKGROUND + ANSI_PURPLE,
44 |     ANSI_CYAN_BACKGROUND + ANSI_CYAN)
45 | 
46 | fun color(idx: Int): String {
47 |   if (idx in 0..colors.lastIndex)
48 |     return colors[idx]
49 |   return idx.toString()
50 | }
51 | 
52 | fun reset() = ANSI_RESET
53 | fun Double.format(digits: Int) = String.format("%.${digits}f", this)
54 | 
55 | fun logLevel(level: Level) {
56 |   val loggerContext: LoggerContext = LoggerFactory.getILoggerFactory() as LoggerContext
57 |   val rootLogger = loggerContext.getLogger(Logger.ROOT_LOGGER_NAME)
58 |   rootLogger.level = level
59 | }
60 | 
61 | fun printBlackjack(prob: IndexedMDP, π: IndexedPolicy, V: StateValueFunction) {
62 |   println("---------------------Usable Ace--------------------------")
63 |   for (a in 9 downTo 0) {
64 |     for (b in 0 until 10) {
65 |       val s = prob.states[1, 1, b, a]
66 |       print("${color(π.greedy(s)[0])}  ${reset()}")
67 |     }
68 |     println()
69 |   }
70 |   println("---------------------No Usable Ace--------------------------")
71 |   for (a in 9 downTo 0) {
72 |     for (b in 0 until 10) {
73 |       val s = prob.states[1, 0, b, a]
74 |       print("${color(π.greedy(s)[0])}  ${reset()}")
75 |     }
76 |     println()
77 |   }
78 |   for (a in 0 until 10) {
79 |     for (b in 0 until 10)
80 |       print("${V[1, 1, a, b].format(2)} ")
81 |     println()
82 |   }
83 |   println("------------------------------------------------------------")
84 |   for (a in 0 until 10) {
85 |     for (b in 0 until 10)
86 |       print("${V[1, 0, a, b].format(2)} ")
87 |     println()
88 |   }
89 | }


--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/util/TestIndex.kt:
--------------------------------------------------------------------------------
 1 | @file:Suppress("UNCHECKED_CAST")
 2 | 
 3 | package lab.mars.rl.util
 4 | 
 5 | import lab.mars.rl.util.buf.DefaultIntBuf
 6 | import lab.mars.rl.util.buf.Index
 7 | import lab.mars.rl.util.buf.MultiIndex
 8 | import org.junit.Assert.assertEquals
 9 | import org.junit.Test
10 | 
11 | /**
12 |  * <p>
13 |  * Created on 2017-09-18.
14 |  * </p>
15 |  *
16 |  * @author wumo
17 |  */
18 | class TestIndex {
19 |   @Test
20 |   fun `range forEach`() {
21 |     val indices = arrayOf(DefaultIntBuf.of(0),
22 |                           DefaultIntBuf.of(1, 2, 3),
23 |                           DefaultIntBuf.of(4, 5, 6, 7))
24 |     val _idx = MultiIndex(indices as Array<Index>)
25 |     val expected = IntArray(8) { it }
26 |     _idx.forEach(0, 0) { idx, value ->
27 |       assertEquals(expected[idx], value)
28 |     }
29 |     _idx.forEach(4, 7) { idx, value ->
30 |       assertEquals(expected[idx], value)
31 |     }
32 |     _idx.forEach { idx, value ->
33 |       assertEquals(expected[idx], value)
34 |     }
35 |     _idx.forEach(2, 5) { idx, value ->
36 |       assertEquals(expected[idx], value)
37 |     }
38 |     _idx.forEach(0, 5) { idx, value ->
39 |       assertEquals(expected[idx], value)
40 |     }
41 |   }
42 | }


--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/util/extensions.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.util
 2 | 
 3 | import kotlinx.coroutines.Deferred
 4 | import kotlinx.coroutines.GlobalScope
 5 | import kotlinx.coroutines.async
 6 | 
 7 | inline fun <R> listOf(size: Int, init: (Int) -> R): ArrayList<R> {
 8 |   val list = ArrayList<R>()
 9 |   for (i in 0 until size)
10 |     list += init(i)
11 |   return list
12 | }
13 | 
14 | inline fun <I, R> listOf(iter: Iterable<I>, init: (I) -> R): ArrayList<R> {
15 |   val list = ArrayList<R>()
16 |   for (i in iter)
17 |     list += init(i)
18 |   return list
19 | }
20 | 
21 | inline fun <I1, I2, R> listOf(iter1: Iterable<I1>, iter2: Iterable<I2>, init: (I1, I2) -> R): List<R> {
22 |   val list = mutableListOf<R>()
23 |   for (i in iter1)
24 |     for (j in iter2)
25 |       list += init(i, j)
26 |   return list
27 | }
28 | 
29 | fun <R> asyncs(size: Int, init: suspend (Int) -> R): ArrayList<Deferred<R>> {
30 |   val list = ArrayList<Deferred<R>>()
31 |   for (i in 0 until size)
32 |     list += GlobalScope.async {
33 |       init(i)
34 |     }
35 |   
36 |   return list
37 | }
38 | 
39 | fun <I, R> asyncs(iter: Iterable<I>, init: suspend (I) -> R): ArrayList<Deferred<R>> {
40 |   val list = ArrayList<Deferred<R>>()
41 |   for (i in iter)
42 |     list += GlobalScope.async {
43 |       init(i)
44 |     }
45 |   
46 |   return list
47 | }
48 | 
49 | fun <I1, I2, R> asyncs(iter1: Iterable<I1>, iter2: Iterable<I2>, init: suspend (I1, I2) -> R): ArrayList<Deferred<R>> {
50 |   val list = ArrayList<Deferred<R>>()
51 |   for (i in iter1)
52 |     for (j in iter2)
53 |       list += GlobalScope.async { init(i, j) }
54 |   return list
55 | }
56 | 
57 | suspend fun <R> ArrayList<Deferred<R>>.await(process: suspend (R) -> Unit = {}) {
58 |   forEach { process(it.await()) }
59 | }


--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/util/range/DoubleProgression.kt:
--------------------------------------------------------------------------------
 1 | package lab.mars.rl.util.range
 2 | 
 3 | import kotlin.math.sign
 4 | 
 5 | class DoubleProgression(val start: Double,
 6 |                         val endInclusive: Double,
 7 |                         val step: Double): Iterable<Double> {
 8 |   override fun iterator() = object: Iterator<Double> {
 9 |     var current = start
10 |     override fun hasNext() = step.sign * (endInclusive - current) >= 0
11 |     
12 |     override fun next() = current.apply { current += step }
13 |   }
14 | }
15 | 
16 | operator fun Double.rangeTo(that: Double)
17 |     = DoubleProgression(this, that, 0.1)
18 | 
19 | infix fun DoubleProgression.step(step: Double)
20 |     = DoubleProgression(start, endInclusive, step)


--------------------------------------------------------------------------------
/src/test/resources/Figure 10.1.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wumo/Reinforcement-Learning-An-Introduction/d419efccdca9a7f34b7805929b41651b172a3ee7/src/test/resources/Figure 10.1.PNG


--------------------------------------------------------------------------------
/src/test/resources/Figure 10.4.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wumo/Reinforcement-Learning-An-Introduction/d419efccdca9a7f34b7805929b41651b172a3ee7/src/test/resources/Figure 10.4.PNG


--------------------------------------------------------------------------------
/src/test/resources/Figure 12.10.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wumo/Reinforcement-Learning-An-Introduction/d419efccdca9a7f34b7805929b41651b172a3ee7/src/test/resources/Figure 12.10.PNG


--------------------------------------------------------------------------------
/src/test/resources/Figure 12.11.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wumo/Reinforcement-Learning-An-Introduction/d419efccdca9a7f34b7805929b41651b172a3ee7/src/test/resources/Figure 12.11.PNG


--------------------------------------------------------------------------------
/src/test/resources/Figure 12.3.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wumo/Reinforcement-Learning-An-Introduction/d419efccdca9a7f34b7805929b41651b172a3ee7/src/test/resources/Figure 12.3.PNG


--------------------------------------------------------------------------------
/src/test/resources/Figure 12.6.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wumo/Reinforcement-Learning-An-Introduction/d419efccdca9a7f34b7805929b41651b172a3ee7/src/test/resources/Figure 12.6.PNG


--------------------------------------------------------------------------------
/src/test/resources/Figure 12.8.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wumo/Reinforcement-Learning-An-Introduction/d419efccdca9a7f34b7805929b41651b172a3ee7/src/test/resources/Figure 12.8.PNG


--------------------------------------------------------------------------------
/src/test/resources/Figure 7.2.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wumo/Reinforcement-Learning-An-Introduction/d419efccdca9a7f34b7805929b41651b172a3ee7/src/test/resources/Figure 7.2.PNG


--------------------------------------------------------------------------------
/src/test/resources/logback-test.xml:
--------------------------------------------------------------------------------
 1 | <configuration>
 2 | 
 3 |     <appender name="stdout_method" class="ch.qos.logback.core.ConsoleAppender">
 4 |         <encoder>
 5 |             <pattern>
 6 |                 %d{HH:mm:ss} [%thread] %-5level %logger{36} %n \t %M - %msg%n
 7 |             </pattern>
 8 |         </encoder>
 9 |     </appender>
10 | 
11 |     <appender name="stdout_qualified" class="ch.qos.logback.core.ConsoleAppender">
12 |         <encoder>
13 |             <pattern>
14 |                 %d{HH:mm:ss} [%thread] %-5level %logger{36} %n \t - %msg%n
15 |             </pattern>
16 |         </encoder>
17 |     </appender>
18 | 
19 |     <appender name="stdout_simple" class="ch.qos.logback.core.ConsoleAppender">
20 |         <encoder>
21 |             <pattern>
22 |                 [%d{HH:mm:ss.SSS}] &lt;%logger{0}@%thread&gt; - %msg%n
23 |             </pattern>
24 |         </encoder>
25 |     </appender>
26 | 
27 |     <appender name="stdout_simple_nothread" class="ch.qos.logback.core.ConsoleAppender">
28 |         <encoder>
29 |             <pattern>
30 |                 [%d{HH:mm:ss.SSS}] &lt;%logger{0}&gt; - %msg%n
31 |             </pattern>
32 |         </encoder>
33 |     </appender>
34 | 
35 |     <root level="debug">
36 |         <appender-ref ref="stdout_simple"/>
37 |     </root>
38 | 
39 | </configuration>


--------------------------------------------------------------------------------